From bfa16a145abc40edc3f2a9342f0132c38007fd3c Mon Sep 17 00:00:00 2001 From: Miguel Jacq Date: Fri, 2 Jan 2026 10:41:57 +1100 Subject: [PATCH] Add --ignore-non-html option to skip pages that weren't HTML (which might trigger Chromium's 'sha256-4Su6mBWzEIFnH4pAGMOuaeBrstwJN4Z3pq/s1Kn4/KQ=' hash) --- CHANGELOG.md | 8 ++++++++ README.md | 7 ++++--- src/cspresso/crawl.py | 21 ++++++++++++++++++++- 3 files changed, 32 insertions(+), 4 deletions(-) create mode 100644 CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..319821a --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,8 @@ +## 0.1.1 + + * Fix prog name + * Add --ignore-non-html option to skip pages that weren't HTML (which might trigger Chromium's 'sha256-4Su6mBWzEIFnH4pAGMOuaeBrstwJN4Z3pq/s1Kn4/KQ=' hash) + +## 0.1.0 + + * Initial release diff --git a/README.md b/README.md index dafe7aa..97d066b 100644 --- a/README.md +++ b/README.md @@ -81,9 +81,9 @@ poetry run cspresso https://example.com --json ## Full usage info ``` -usage: csp-crawl [-h] [--max-pages MAX_PAGES] [--timeout-ms TIMEOUT_MS] [--settle-ms SETTLE_MS] [--headed] [--no-install] [--with-deps] [--browsers-path BROWSERS_PATH] [--allow-blob] [--unsafe-eval] - [--upgrade-insecure-requests] [--include-sourcemaps] [--json] - url +usage: cspresso [-h] [--max-pages MAX_PAGES] [--timeout-ms TIMEOUT_MS] [--settle-ms SETTLE_MS] [--headed] [--no-install] [--with-deps] [--browsers-path BROWSERS_PATH] [--allow-blob] [--unsafe-eval] + [--upgrade-insecure-requests] [--include-sourcemaps] [--ignore-non-html] [--json] + url Crawl up to N pages (same-origin) with Playwright and generate a draft CSP. @@ -108,5 +108,6 @@ options: --upgrade-insecure-requests Add upgrade-insecure-requests directive --include-sourcemaps Analyze JS/CSS for sourceMappingURL and add map origins to connect-src + --ignore-non-html Ignore non-HTML pages that get crawled (which might trigger Chromium's word-wrap hash: https://stackoverflow.com/a/69838710) --json Output JSON instead of a header line ``` diff --git a/src/cspresso/crawl.py b/src/cspresso/crawl.py index e24539c..6f9c8bc 100644 --- a/src/cspresso/crawl.py +++ b/src/cspresso/crawl.py @@ -307,6 +307,7 @@ async def crawl_and_generate_csp( allow_unsafe_eval: bool = False, upgrade_insecure_requests: bool = False, include_sourcemaps: bool = False, + ignore_non_html: bool = False, ) -> CrawlResult: start_url, _ = urldefrag(start_url) base_origin = origin_of(start_url) @@ -413,7 +414,18 @@ async def crawl_and_generate_csp( page.on("response", on_response) try: - await page.goto(url, wait_until="networkidle", timeout=timeout_ms) + resp = await page.goto( + url, wait_until="networkidle", timeout=timeout_ms + ) + + ct = "" + if resp is not None: + ct = (await resp.header_value("content-type") or "").lower() + + is_html = ("text/html" in ct) or ("application/xhtml+xml" in ct) + if not is_html and ignore_non_html: + # Still count as visited, but don't hash inline attrs / don't extract links. + continue # Give the page a moment to run hydration / delayed fetches. if settle_ms > 0: @@ -565,6 +577,12 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace: default=False, help="Analyze JS/CSS for sourceMappingURL and add map origins to connect-src", ) + ap.add_argument( + "--ignore-non-html", + action="store_true", + default=False, + help="Ignore non-HTML pages that get crawled (which might trigger Chromium's word-wrap hash: https://stackoverflow.com/a/69838710)", + ) ap.add_argument( "--json", action="store_true", help="Output JSON instead of a header line" ) @@ -589,6 +607,7 @@ def main(argv: list[str] | None = None) -> None: allow_unsafe_eval=args.unsafe_eval, upgrade_insecure_requests=args.upgrade_insecure_requests, include_sourcemaps=args.include_sourcemaps, + ignore_non_html=args.ignore_non_html, ) )