Browsertrix normalization handling, take 3 (#912)

Mr0grog · web-flow · commit 712ca9c2c3be · 2026-02-24T20:25:07.000-08:00
A couple more fixes following on from #909 and #910 that came up when I did a more exhaustive check of *all* our URLs in current Browsertrix, and of the source for the normalizer Browsertrix uses. Specifically, when building matchable URLs we now ignore `utm_*` query parameters and collapse repeated slashes in the path into a single slash. This should cover everything until Browsertrix releases an update that changes how it normalizes.
diff --git a/web_monitoring/utils.py b/web_monitoring/utils.py
@@ -197,6 +197,9 @@ def matchable_querystring(querystring: str) -> str:
     URLs are still matchable, even though they are not strictly correct.
     """
     parsed = parse_qsl(querystring, keep_blank_values=True)
+    # TODO: consider bringing in some more ignorable params from our custom
+    # SURT implementation in web-monitoring-db.
+    parsed = [(k, v) for k, v in parsed if not k.lower().startswith('utm_')]
     result = urlencode(sorted(parsed))
     if '=' not in querystring:
         result = re.sub(r'=', '', result)
@@ -231,7 +234,7 @@ def matchable_url(url: str) -> str:
     parsed = urlsplit(url)
     return parsed._replace(
         netloc=normalize_netloc(parsed),
-        path=(parsed.path or '/').rstrip('/'),
+        path=re.sub(r'//+', '/', (parsed.path or '/').rstrip('/')),
         query=matchable_querystring(parsed.query),
         fragment=''
     ).geturl()