feat: add canonicalizeUrl utility for consistent URL comparison (#1334)

prithipalpatwal · web-flow · commit eb1513228e5d · 2026-02-12T15:01:33.000+05:30
Add canonicalizeUrl function to normalize URLs by lowercasing them and
removing the protocol, www prefixes, trailing slashes, and optionally
query parameters and fragments. This enables accurate URL matching and
deduplication across different URL variations.

Please ensure your pull request adheres to the following guidelines:
- [ ] make sure to link the related issues in this description
- [ ] when merging / squashing, make sure the fixed issue references are
visible in the commits, for easy compilation of release notes

## Related Issues


Thanks for contributing!
diff --git a/packages/spacecat-shared-utils/src/index.d.ts b/packages/spacecat-shared-utils/src/index.d.ts
@@ -133,6 +133,20 @@ export declare function stripTrailingSlash(url: string): string;
  */
 export declare function stripWWW(url: string): string;
 
+/**
+ * Canonicalizes a URL for comparison and matching: lowercases and trims it, then
+ * removes a leading http:// or https://, a leading www / wwwN label, and a
+ * trailing slash. Returns '' for empty or non-string input.
+ * @param url - URL to canonicalize
+ * @param options - Canonicalization options
+ * @param options.stripQuery - Strip query parameters and fragments (default: false)
+ * @returns Canonicalized URL
+ */
+export declare function canonicalizeUrl(
+  url: string,
+  options?: { stripQuery?: boolean }
+): string;
+
 /**
  * Composes a base URL by applying a series of transformations to the given domain.
  * @param domain - The domain to compose the base URL from.
diff --git a/packages/spacecat-shared-utils/src/index.js b/packages/spacecat-shared-utils/src/index.js
@@ -55,6 +55,7 @@ export { logWrapper } from './log-wrapper.js';
 export { instrumentAWSClient, getTraceId, addTraceIdHeader } from './xray.js';
 
 export {
+  canonicalizeUrl,
   composeBaseURL,
   composeAuditURL,
   prependSchema,
diff --git a/packages/spacecat-shared-utils/src/url-helpers.js b/packages/spacecat-shared-utils/src/url-helpers.js
@@ -318,6 +318,40 @@ async function wwwUrlResolver(site, rumApiClient, log) {
   return fallback;
 }
 
+/**
+ * Canonicalizes a URL for comparison: lowercases and trims it, then removes the
+ * http(s) protocol, a leading www/wwwN label, and the path's trailing slashes.
+ * Slashes are removed from the path only, so query/fragment text is untouched.
+ * @param {string} url - URL to canonicalize
+ * @param {object} [options] - Canonicalization options
+ * @param {boolean} [options.stripQuery=false] - Drop the query string and fragment
+ * @returns {string} Canonicalized URL, or '' when url is empty or not a string
+ */
+export function canonicalizeUrl(url, { stripQuery = false } = {}) {
+  if (!url || typeof url !== 'string') {
+    return '';
+  }
+
+  const normalized = url
+    .toLowerCase() // Case insensitive
+    .trim()
+    .replace(/^https?:\/\//, '') // Remove protocol
+    .replace(/^www\d*\./, ''); // Remove www, www2, www3, etc.
+
+  // Split at the first '?' or '#' so trailing slashes are judged on the path
+  // alone: 'a.com/p/?x=1' and 'a.com/p?x=1' must canonicalize identically, and
+  // a slash that merely ends a query or fragment value ('?next=/home/') must
+  // be left untouched.
+  const suffixIndex = normalized.search(/[?#]/);
+  const path = suffixIndex === -1 ? normalized : normalized.substring(0, suffixIndex);
+  const suffix = suffixIndex === -1 ? '' : normalized.substring(suffixIndex);
+
+  const canonicalPath = path.replace(/\/+$/, ''); // Remove trailing slashes
+
+  // Optionally strip query parameters and fragments
+  return stripQuery ? canonicalPath : `${canonicalPath}${suffix}`;
+}
+
 export {
   ensureHttps,
   getSpacecatRequestHeaders,
diff --git a/packages/spacecat-shared-utils/test/index.test.js b/packages/spacecat-shared-utils/test/index.test.js
@@ -27,6 +27,7 @@ describe('Index Exports', () => {
     'buildKey',
     'buildSuggestionKey',
     'calculateCPCValue',
+    'canonicalizeUrl',
     'composeAuditURL',
     'composeBaseURL',
     'dateAfterDays',
diff --git a/packages/spacecat-shared-utils/test/url-helpers.test.js b/packages/spacecat-shared-utils/test/url-helpers.test.js
@@ -16,6 +16,7 @@ import { expect } from 'chai';
 import nock from 'nock';
 import sinon from 'sinon';
 import {
+  canonicalizeUrl,
   composeAuditURL,
   composeBaseURL,
   prependSchema,
@@ -868,4 +869,65 @@ describe('URL Utility Functions', () => {
       expect(result).to.equal('www.example.com');
     });
   });
+
+  describe('canonicalizeUrl', () => { // expected values are the comparison-ready form
+    it('removes protocol and converts to lowercase', () => { // path is lowercased too
+      expect(canonicalizeUrl('HTTPS://Example.com/Page')).to.equal('example.com/page');
+      expect(canonicalizeUrl('http://EXAMPLE.COM/Path')).to.equal('example.com/path');
+    });
+
+    it('removes www prefixes', () => { // numbered hosts (www2, www123) count as www
+      expect(canonicalizeUrl('https://www.example.com/path')).to.equal('example.com/path');
+      expect(canonicalizeUrl('https://www2.example.com/path')).to.equal('example.com/path');
+      expect(canonicalizeUrl('https://www123.example.com/path')).to.equal('example.com/path');
+    });
+
+    it('removes trailing slashes', () => { // a root-only path collapses to the bare host
+      expect(canonicalizeUrl('https://example.com/path/')).to.equal('example.com/path');
+      expect(canonicalizeUrl('https://example.com/')).to.equal('example.com');
+      expect(canonicalizeUrl('https://example.com/path/subpath/')).to.equal('example.com/path/subpath');
+    });
+
+    it('handles combined variations', () => {
+      expect(canonicalizeUrl('HTTPS://WWW.Example.COM/Path/')).to.equal('example.com/path');
+      expect(canonicalizeUrl('http://www2.EXAMPLE.com/PAGE/SubPage/')).to.equal('example.com/page/subpage');
+    });
+
+    it('strips query parameters when stripQuery is true', () => { // fragments go too
+      expect(canonicalizeUrl('https://example.com/path?param=value', { stripQuery: true }))
+        .to.equal('example.com/path');
+      expect(canonicalizeUrl('https://example.com/path#anchor', { stripQuery: true }))
+        .to.equal('example.com/path');
+      expect(canonicalizeUrl('https://example.com/path?a=b&c=d#hash', { stripQuery: true }))
+        .to.equal('example.com/path');
+      expect(canonicalizeUrl('https://www2.google.com/?query=hello&params=world', { stripQuery: true }))
+        .to.equal('google.com'); // slash exposed by removing the query is dropped as well
+      expect(canonicalizeUrl('https://example.com/?param=value', { stripQuery: true }))
+        .to.equal('example.com');
+    });
+
+    it('preserves query parameters by default', () => { // stripQuery defaults to false
+      expect(canonicalizeUrl('https://example.com/path?param=value'))
+        .to.equal('example.com/path?param=value');
+      expect(canonicalizeUrl('https://example.com/path#anchor'))
+        .to.equal('example.com/path#anchor');
+    });
+
+    it('handles empty and invalid inputs', () => { // falsy or non-string input yields ''
+      expect(canonicalizeUrl('')).to.equal('');
+      expect(canonicalizeUrl(null)).to.equal('');
+      expect(canonicalizeUrl(undefined)).to.equal('');
+      expect(canonicalizeUrl(123)).to.equal('');
+    });
+
+    it('trims whitespace', () => { // spaces, tabs and newlines on either side
+      expect(canonicalizeUrl('  https://example.com/path  ')).to.equal('example.com/path');
+      expect(canonicalizeUrl('\t\nhttps://example.com/path\n\t')).to.equal('example.com/path');
+    });
+
+    it('handles URLs without protocol', () => { // www removal does not need a protocol
+      expect(canonicalizeUrl('example.com/path')).to.equal('example.com/path');
+      expect(canonicalizeUrl('www.example.com/path')).to.equal('example.com/path');
+    });
+  });
 });