Skip to content

Commit eb15132

Browse files
feat: add canonicalizeUrl utility for consistent URL comparison (#1334)
Add canonicalizeUrl function to normalize URLs by removing protocol, www prefixes, trailing slashes, and optionally query parameters. This enables accurate URL matching and deduplication across different URL variations. Please ensure your pull request adheres to the following guidelines: - [ ] make sure to link the related issues in this description - [ ] when merging / squashing, make sure the fixed issue references are visible in the commits, for easy compilation of release notes ## Related Issues Thanks for contributing!
1 parent 67e805a commit eb15132

File tree

5 files changed

+112
-0
lines changed

5 files changed

+112
-0
lines changed

packages/spacecat-shared-utils/src/index.d.ts

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,20 @@ export declare function stripTrailingSlash(url: string): string;
133133
*/
134134
export declare function stripWWW(url: string): string;
135135

136+
/**
137+
* Canonicalizes a URL for comparison and matching purposes: lowercases and trims it,
138+
* then removes the http(s) protocol, a leading www/wwwN prefix, and a trailing slash.
139+
* Optionally strips query parameters and fragments.
140+
* @param url - URL to canonicalize; empty or non-string values yield an empty string
141+
* @param options - Canonicalization options
142+
* @param options.stripQuery - Whether to strip query parameters and fragments (default: false)
143+
* @returns Canonicalized URL, or an empty string for invalid input
144+
*/
145+
export declare function canonicalizeUrl(
146+
url: string,
147+
options?: { stripQuery?: boolean }
148+
): string;
149+
136150
/**
137151
* Composes a base URL by applying a series of transformations to the given domain.
138152
* @param domain - The domain to compose the base URL from.

packages/spacecat-shared-utils/src/index.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ export { logWrapper } from './log-wrapper.js';
5555
export { instrumentAWSClient, getTraceId, addTraceIdHeader } from './xray.js';
5656

5757
export {
58+
canonicalizeUrl,
5859
composeBaseURL,
5960
composeAuditURL,
6061
prependSchema,

packages/spacecat-shared-utils/src/url-helpers.js

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,40 @@ async function wwwUrlResolver(site, rumApiClient, log) {
318318
return fallback;
319319
}
320320

321+
/**
 * Canonicalizes a URL for comparison and matching purposes.
 *
 * The whole input is lowercased (path and query included) and trimmed, then the
 * http(s) protocol and a leading www/wwwN prefix are removed. Trailing slashes
 * are removed from the end of the path, even when a query string or fragment
 * follows it, so `example.com/path/?a=1` and `example.com/path?a=1` canonicalize
 * identically. A slash that merely ends a query or fragment value is preserved.
 * Optionally strips query parameters and fragments.
 *
 * The result is idempotent: canonicalizing an already canonical value returns
 * it unchanged.
 *
 * @param {string} url - URL to canonicalize
 * @param {object} [options] - Canonicalization options
 * @param {boolean} [options.stripQuery=false] - Whether to strip query parameters and fragments
 * @returns {string} Canonicalized URL, or an empty string if url is empty or not a string
 */
export function canonicalizeUrl(url, { stripQuery = false } = {}) {
  if (!url || typeof url !== 'string') {
    return '';
  }

  const normalized = url
    .toLowerCase() // Case insensitive
    .trim()
    .replace(/^https?:\/\//, '') // Remove protocol
    .replace(/^www\d*\./, ''); // Remove www, www2, www3, etc.

  // Separate host+path from the query/fragment suffix so that trailing-slash
  // removal only ever touches the path, never the query or fragment value.
  const suffixIndex = normalized.search(/[?#]/);
  const hasSuffix = suffixIndex !== -1;
  const hostAndPath = hasSuffix ? normalized.slice(0, suffixIndex) : normalized;
  const suffix = hasSuffix && !stripQuery ? normalized.slice(suffixIndex) : '';

  // Strip every trailing slash (not just one) so the outcome does not depend on
  // stripQuery and re-canonicalizing the result is a no-op.
  return `${hostAndPath.replace(/\/+$/, '')}${suffix}`;
}
354+
321355
export {
322356
ensureHttps,
323357
getSpacecatRequestHeaders,

packages/spacecat-shared-utils/test/index.test.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ describe('Index Exports', () => {
2727
'buildKey',
2828
'buildSuggestionKey',
2929
'calculateCPCValue',
30+
'canonicalizeUrl',
3031
'composeAuditURL',
3132
'composeBaseURL',
3233
'dateAfterDays',

packages/spacecat-shared-utils/test/url-helpers.test.js

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import { expect } from 'chai';
1616
import nock from 'nock';
1717
import sinon from 'sinon';
1818
import {
19+
canonicalizeUrl,
1920
composeAuditURL,
2021
composeBaseURL,
2122
prependSchema,
@@ -868,4 +869,65 @@ describe('URL Utility Functions', () => {
868869
expect(result).to.equal('www.example.com');
869870
});
870871
});
872+
873+
// Unit tests for canonicalizeUrl. Each case pins one normalization step
// (protocol, www prefix, trailing slash, letter case, surrounding whitespace),
// plus the optional stripQuery behavior and the empty-string result for
// inputs that are empty or not strings.
describe('canonicalizeUrl', () => {
874+
it('removes protocol and converts to lowercase', () => {
875+
expect(canonicalizeUrl('HTTPS://Example.com/Page')).to.equal('example.com/page');
876+
expect(canonicalizeUrl('http://EXAMPLE.COM/Path')).to.equal('example.com/path');
877+
});
878+
879+
// Numbered variants such as www2 and www123 are treated like a plain www prefix.
it('removes www prefixes', () => {
880+
expect(canonicalizeUrl('https://www.example.com/path')).to.equal('example.com/path');
881+
expect(canonicalizeUrl('https://www2.example.com/path')).to.equal('example.com/path');
882+
expect(canonicalizeUrl('https://www123.example.com/path')).to.equal('example.com/path');
883+
});
884+
885+
it('removes trailing slashes', () => {
886+
expect(canonicalizeUrl('https://example.com/path/')).to.equal('example.com/path');
887+
expect(canonicalizeUrl('https://example.com/')).to.equal('example.com');
888+
expect(canonicalizeUrl('https://example.com/path/subpath/')).to.equal('example.com/path/subpath');
889+
});
890+
891+
it('handles combined variations', () => {
892+
expect(canonicalizeUrl('HTTPS://WWW.Example.COM/Path/')).to.equal('example.com/path');
893+
expect(canonicalizeUrl('http://www2.EXAMPLE.com/PAGE/SubPage/')).to.equal('example.com/page/subpage');
894+
});
895+
896+
// With stripQuery both the ?query and the #fragment are dropped, and a slash
// that sat directly in front of them is removed as well.
it('strips query parameters when stripQuery is true', () => {
897+
expect(canonicalizeUrl('https://example.com/path?param=value', { stripQuery: true }))
898+
.to.equal('example.com/path');
899+
expect(canonicalizeUrl('https://example.com/path#anchor', { stripQuery: true }))
900+
.to.equal('example.com/path');
901+
expect(canonicalizeUrl('https://example.com/path?a=b&c=d#hash', { stripQuery: true }))
902+
.to.equal('example.com/path');
903+
expect(canonicalizeUrl('https://www2.google.com/?query=hello&params=world', { stripQuery: true }))
904+
.to.equal('google.com');
905+
expect(canonicalizeUrl('https://example.com/?param=value', { stripQuery: true }))
906+
.to.equal('example.com');
907+
});
908+
909+
// Without options the query string and fragment are kept as part of the result.
it('preserves query parameters by default', () => {
910+
expect(canonicalizeUrl('https://example.com/path?param=value'))
911+
.to.equal('example.com/path?param=value');
912+
expect(canonicalizeUrl('https://example.com/path#anchor'))
913+
.to.equal('example.com/path#anchor');
914+
});
915+
916+
// Falsy and non-string inputs produce an empty string instead of throwing.
it('handles empty and invalid inputs', () => {
917+
expect(canonicalizeUrl('')).to.equal('');
918+
expect(canonicalizeUrl(null)).to.equal('');
919+
expect(canonicalizeUrl(undefined)).to.equal('');
920+
expect(canonicalizeUrl(123)).to.equal('');
921+
});
922+
923+
it('trims whitespace', () => {
924+
expect(canonicalizeUrl(' https://example.com/path ')).to.equal('example.com/path');
925+
expect(canonicalizeUrl('\t\nhttps://example.com/path\n\t')).to.equal('example.com/path');
926+
});
927+
928+
it('handles URLs without protocol', () => {
929+
expect(canonicalizeUrl('example.com/path')).to.equal('example.com/path');
930+
expect(canonicalizeUrl('www.example.com/path')).to.equal('example.com/path');
931+
});
932+
});
871933
});

0 commit comments

Comments
 (0)