From bfa16a145abc40edc3f2a9342f0132c38007fd3c Mon Sep 17 00:00:00 2001
From: Miguel Jacq <mig@mig5.net>
Date: Fri, 2 Jan 2026 10:41:57 +1100
Subject: [PATCH] Add --ignore-non-html option to skip pages that weren't HTML
 (which might trigger Chromium's
 'sha256-4Su6mBWzEIFnH4pAGMOuaeBrstwJN4Z3pq/s1Kn4/KQ=' hash)

---
 CHANGELOG.md          |  8 ++++++++
 README.md             |  7 ++++---
 src/cspresso/crawl.py | 21 ++++++++++++++++++++-
 3 files changed, 32 insertions(+), 4 deletions(-)
 create mode 100644 CHANGELOG.md

diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..319821a
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,8 @@
+## 0.1.1
+
+ * Fix the program name shown in the usage/help output (`cspresso` instead of `csp-crawl`)
+ * Add `--ignore-non-html` option to skip crawled pages that aren't HTML (these might trigger Chromium's word-wrap hash 'sha256-4Su6mBWzEIFnH4pAGMOuaeBrstwJN4Z3pq/s1Kn4/KQ=', see https://stackoverflow.com/a/69838710)
+
+## 0.1.0
+
+ * Initial release
diff --git a/README.md b/README.md
index dafe7aa..97d066b 100644
--- a/README.md
+++ b/README.md
@@ -81,9 +81,9 @@ poetry run cspresso https://example.com --json
 ## Full usage info
 
 ```
-usage: csp-crawl [-h] [--max-pages MAX_PAGES] [--timeout-ms TIMEOUT_MS] [--settle-ms SETTLE_MS] [--headed] [--no-install] [--with-deps] [--browsers-path BROWSERS_PATH] [--allow-blob] [--unsafe-eval]
-                 [--upgrade-insecure-requests] [--include-sourcemaps] [--json]
-                 url
+usage: cspresso [-h] [--max-pages MAX_PAGES] [--timeout-ms TIMEOUT_MS] [--settle-ms SETTLE_MS] [--headed] [--no-install] [--with-deps] [--browsers-path BROWSERS_PATH] [--allow-blob] [--unsafe-eval]
+                [--upgrade-insecure-requests] [--include-sourcemaps] [--ignore-non-html] [--json]
+                url
 
 Crawl up to N pages (same-origin) with Playwright and generate a draft CSP.
 
@@ -108,5 +108,6 @@ options:
   --upgrade-insecure-requests
                         Add upgrade-insecure-requests directive
   --include-sourcemaps  Analyze JS/CSS for sourceMappingURL and add map origins to connect-src
+  --ignore-non-html     Ignore non-HTML pages that get crawled (which might trigger Chromium's word-wrap hash: https://stackoverflow.com/a/69838710)
   --json                Output JSON instead of a header line
 ```
diff --git a/src/cspresso/crawl.py b/src/cspresso/crawl.py
index e24539c..6f9c8bc 100644
--- a/src/cspresso/crawl.py
+++ b/src/cspresso/crawl.py
@@ -307,6 +307,7 @@ async def crawl_and_generate_csp(
     allow_unsafe_eval: bool = False,
     upgrade_insecure_requests: bool = False,
     include_sourcemaps: bool = False,
+    ignore_non_html: bool = False,
 ) -> CrawlResult:
     start_url, _ = urldefrag(start_url)
     base_origin = origin_of(start_url)
@@ -413,7 +414,18 @@ async def crawl_and_generate_csp(
                 page.on("response", on_response)
 
             try:
-                await page.goto(url, wait_until="networkidle", timeout=timeout_ms)
+                resp = await page.goto(
+                    url, wait_until="networkidle", timeout=timeout_ms
+                )
+
+                ct = ""
+                if resp is not None:
+                    ct = (await resp.header_value("content-type") or "").lower()
+
+                is_html = ("text/html" in ct) or ("application/xhtml+xml" in ct)
+                if not is_html and ignore_non_html:
+                    # Still count as visited, but don't hash inline attrs / don't extract links.
+                    continue
 
                 # Give the page a moment to run hydration / delayed fetches.
                 if settle_ms > 0:
@@ -565,6 +577,12 @@ def _parse_args(argv: list[str] | None = None) -> argparse.Namespace:
         default=False,
         help="Analyze JS/CSS for sourceMappingURL and add map origins to connect-src",
     )
+    ap.add_argument(
+        "--ignore-non-html",
+        action="store_true",
+        default=False,
+        help="Ignore non-HTML pages that get crawled (which might trigger Chromium's word-wrap hash: https://stackoverflow.com/a/69838710)",
+    )
     ap.add_argument(
         "--json", action="store_true", help="Output JSON instead of a header line"
     )
@@ -589,6 +607,7 @@ def main(argv: list[str] | None = None) -> None:
             allow_unsafe_eval=args.unsafe_eval,
             upgrade_insecure_requests=args.upgrade_insecure_requests,
             include_sourcemaps=args.include_sourcemaps,
+            ignore_non_html=args.ignore_non_html,
         )
     )