Add --ignore-non-html option to skip pages that weren't HTML (which might trigger Chromium's 'sha256-4Su6mBWzEIFnH4pAGMOuaeBrstwJN4Z3pq/s1Kn4/KQ=' hash)
This commit is contained in:
parent
09aa2ded5e
commit
bfa16a145a
3 changed files with 32 additions and 4 deletions
8
CHANGELOG.md
Normal file
8
CHANGELOG.md
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
## 0.1.1
|
||||||
|
|
||||||
|
* Fix prog name
|
||||||
|
* Add --ignore-non-html option to skip crawled pages that are not HTML (these might trigger Chromium's word-wrap hash 'sha256-4Su6mBWzEIFnH4pAGMOuaeBrstwJN4Z3pq/s1Kn4/KQ='; see https://stackoverflow.com/a/69838710)
|
||||||
|
|
||||||
|
## 0.1.0
|
||||||
|
|
||||||
|
* Initial release
|
||||||
|
|
@ -81,8 +81,8 @@ poetry run cspresso https://example.com --json
|
||||||
## Full usage info
|
## Full usage info
|
||||||
|
|
||||||
```
|
```
|
||||||
usage: csp-crawl [-h] [--max-pages MAX_PAGES] [--timeout-ms TIMEOUT_MS] [--settle-ms SETTLE_MS] [--headed] [--no-install] [--with-deps] [--browsers-path BROWSERS_PATH] [--allow-blob] [--unsafe-eval]
|
usage: cspresso [-h] [--max-pages MAX_PAGES] [--timeout-ms TIMEOUT_MS] [--settle-ms SETTLE_MS] [--headed] [--no-install] [--with-deps] [--browsers-path BROWSERS_PATH] [--allow-blob] [--unsafe-eval]
|
||||||
[--upgrade-insecure-requests] [--include-sourcemaps] [--json]
|
[--upgrade-insecure-requests] [--include-sourcemaps] [--ignore-non-html] [--json]
|
||||||
url
|
url
|
||||||
|
|
||||||
Crawl up to N pages (same-origin) with Playwright and generate a draft CSP.
|
Crawl up to N pages (same-origin) with Playwright and generate a draft CSP.
|
||||||
|
|
@ -108,5 +108,6 @@ options:
|
||||||
--upgrade-insecure-requests
|
--upgrade-insecure-requests
|
||||||
Add upgrade-insecure-requests directive
|
Add upgrade-insecure-requests directive
|
||||||
--include-sourcemaps Analyze JS/CSS for sourceMappingURL and add map origins to connect-src
|
--include-sourcemaps Analyze JS/CSS for sourceMappingURL and add map origins to connect-src
|
||||||
|
--ignore-non-html Ignore non-HTML pages that get crawled (which might trigger Chromium's word-wrap hash: https://stackoverflow.com/a/69838710)
|
||||||
--json Output JSON instead of a header line
|
--json Output JSON instead of a header line
|
||||||
```
|
```
|
||||||
|
|
|
||||||
|
|
@ -307,6 +307,7 @@ async def crawl_and_generate_csp(
|
||||||
allow_unsafe_eval: bool = False,
|
allow_unsafe_eval: bool = False,
|
||||||
upgrade_insecure_requests: bool = False,
|
upgrade_insecure_requests: bool = False,
|
||||||
include_sourcemaps: bool = False,
|
include_sourcemaps: bool = False,
|
||||||
|
ignore_non_html: bool = False,
|
||||||
) -> CrawlResult:
|
) -> CrawlResult:
|
||||||
start_url, _ = urldefrag(start_url)
|
start_url, _ = urldefrag(start_url)
|
||||||
base_origin = origin_of(start_url)
|
base_origin = origin_of(start_url)
|
||||||
|
|
@ -413,7 +414,18 @@ async def crawl_and_generate_csp(
|
||||||
page.on("response", on_response)
|
page.on("response", on_response)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
await page.goto(url, wait_until="networkidle", timeout=timeout_ms)
|
resp = await page.goto(
|
||||||
|
url, wait_until="networkidle", timeout=timeout_ms
|
||||||
|
)
|
||||||
|
|
||||||
|
ct = ""
|
||||||
|
if resp is not None:
|
||||||
|
ct = (await resp.header_value("content-type") or "").lower()
|
||||||
|
|
||||||
|
is_html = ("text/html" in ct) or ("application/xhtml+xml" in ct)
|
||||||
|
if not is_html and ignore_non_html:
|
||||||
|
# Still count as visited, but don't hash inline attrs / don't extract links.
|
||||||
|
continue
|
||||||
|
|
||||||
# Give the page a moment to run hydration / delayed fetches.
|
# Give the page a moment to run hydration / delayed fetches.
|
||||||
if settle_ms > 0:
|
if settle_ms > 0:
|
||||||
|
|
@ -565,6 +577,12 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
|
||||||
default=False,
|
default=False,
|
||||||
help="Analyze JS/CSS for sourceMappingURL and add map origins to connect-src",
|
help="Analyze JS/CSS for sourceMappingURL and add map origins to connect-src",
|
||||||
)
|
)
|
||||||
|
ap.add_argument(
|
||||||
|
"--ignore-non-html",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Ignore non-HTML pages that get crawled (which might trigger Chromium's word-wrap hash: https://stackoverflow.com/a/69838710)",
|
||||||
|
)
|
||||||
ap.add_argument(
|
ap.add_argument(
|
||||||
"--json", action="store_true", help="Output JSON instead of a header line"
|
"--json", action="store_true", help="Output JSON instead of a header line"
|
||||||
)
|
)
|
||||||
|
|
@ -589,6 +607,7 @@ def main(argv: list[str] | None = None) -> None:
|
||||||
allow_unsafe_eval=args.unsafe_eval,
|
allow_unsafe_eval=args.unsafe_eval,
|
||||||
upgrade_insecure_requests=args.upgrade_insecure_requests,
|
upgrade_insecure_requests=args.upgrade_insecure_requests,
|
||||||
include_sourcemaps=args.include_sourcemaps,
|
include_sourcemaps=args.include_sourcemaps,
|
||||||
|
ignore_non_html=args.ignore_non_html,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue