diff --git a/src/util/normalizeurlopts.ts b/src/util/normalizeurlopts.ts
new file mode 100644
index 000000000..252477848
--- /dev/null
+++ b/src/util/normalizeurlopts.ts
@@ -0,0 +1,16 @@
+import { Options as NormalizeUrlOptions } from "normalize-url";
+
+// URL normalization options for consistent URL handling across the crawler
+// Query parameters are sorted alphabetically by the normalize-url library
+export const normalizeUrlOpts: NormalizeUrlOptions = {
+  defaultProtocol: "https",
+  stripAuthentication: false,
+  stripTextFragment: false,
+  stripWWW: false,
+  stripHash: false,
+  removeTrailingSlash: false,
+  removeSingleSlash: false,
+  removeExplicitPort: false,
+  sortQueryParameters: true,
+  removePath: false,
+};
diff --git a/src/util/seeds.ts b/src/util/seeds.ts
index a06ac807d..a8f82b769 100644
--- a/src/util/seeds.ts
+++ b/src/util/seeds.ts
@@ -4,6 +4,8 @@ import { MAX_DEPTH } from "./constants.js";
 import { collectOnlineSeedFile } from "./file_reader.js";
 import { logger } from "./logger.js";
 import { type CrawlerArgs } from "./argParser.js";
+import normalizeUrlLib from "normalize-url";
+import { normalizeUrlOpts } from "./normalizeurlopts.js";
 
 type ScopeType =
   | "prefix"
@@ -63,7 +65,8 @@ export class ScopedSeed {
     parsedUrl.username = "";
     parsedUrl.password = "";
 
-    this.url = parsedUrl.href;
+    // Normalize URL with sorted query parameters for consistent matching
+    this.url = normalizeUrlLib(parsedUrl.href, normalizeUrlOpts);
 
     this.include = parseRx(include);
     this.exclude = parseRx(exclude);
@@ -250,7 +253,8 @@ export class ScopedSeed {
       urlParsed.hash = "";
     }
 
-    url = urlParsed.href;
+    // Normalize URL with sorted query parameters for consistent matching
+    url = normalizeUrlLib(urlParsed.href, normalizeUrlOpts);
 
     if (url === this.url) {
       return { url, isOOS: false };
diff --git a/src/util/state.ts b/src/util/state.ts
index ab9810eae..b8bd38ba7 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -11,7 +11,8 @@ import {
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 import { interpolateFilename, UploadResult } from "./storage.js";
-import normalizeUrl, { Options as NormamlizeUrlOptions } from "normalize-url";
+import normalizeUrl from "normalize-url";
+import { normalizeUrlOpts } from "./normalizeurlopts.js";
 
 // ============================================================================
 export enum LoadState {
@@ -29,20 +30,6 @@ export enum QueueState {
   DUPE_URL = 2,
 }
 
-// ============================================================================
-const normalizeUrlOpts: NormamlizeUrlOptions = {
-  defaultProtocol: "https",
-  stripAuthentication: false,
-  stripTextFragment: false,
-  stripWWW: false,
-  stripHash: false,
-  removeTrailingSlash: false,
-  removeSingleSlash: false,
-  removeExplicitPort: false,
-  sortQueryParameters: true,
-  removePath: false,
-};
-
 // ============================================================================
 // treat 0 or 206 as 200 for purposes of dedup
 function normalizeDedupStatus(status: number): number {
diff --git a/tests/recorder.test.js b/tests/recorder.test.js
new file mode 100644
index 000000000..083c97747
--- /dev/null
+++ b/tests/recorder.test.js
@@ -0,0 +1,31 @@
+import util from "util";
+import { exec as execCallback } from "child_process";
+
+const exec = util.promisify(execCallback);
+
+test("ensure URLs with array-valued headers succeed without multiValueHeader errors", async () => {
+  // Facebook and other sites return array-valued headers that aren't in the
+  // multiValueHeader allowed list (set-cookie, warc-concurrent-to, warc-protocol).
+  // This test verifies that such headers are properly joined with ", " instead of
+  // throwing a "not a valid multi value header" error
+  let passed = true;
+  let output = "";
+  try {
+    const result = await exec(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url \"https://www.facebook.com/permalink.php?story_fbid=pfbid0BqNZHQaQfqTAKzVaaeeYNuyPXFJhkPmzwWT7mZPZJLFnHNEvsdbnLJRPkHJDMcqFl&id=100082135548177\" --scopeType page --limit 1 --collection multivalueheadertest",
+    );
+    output = result.stdout + result.stderr;
+  } catch (error) {
+    console.log(error);
+    output = error.stdout + error.stderr;
+    passed = false;
+  }
+
+  // Should not contain the multiValueHeader error
+  expect(output).not.toContain("not a valid multi value header");
+
+  // Should successfully crawl at least one page
+  expect(output).toMatch(/crawled:\s+1/);
+
+  expect(passed).toBe(true);
+});
diff --git a/tests/scopes.test.js b/tests/scopes.test.js
index 84824f0fb..2e3af1b0a 100644
--- a/tests/scopes.test.js
+++ b/tests/scopes.test.js
@@ -326,3 +326,35 @@ seeds:
   expect(seeds[8].exclude).toEqual([/false/]);
   expect(seeds[9].exclude).toEqual([/true/]);
 });
+
+test("scopeType page should match URLs with reordered query parameters", async () => {
+  const seeds = await getSeeds(`
+seeds:
+  - url: https://example.com/page?foo=bar&baz=qux
+    scopeType: page
+`);
+
+  expect(seeds.length).toEqual(1);
+  expect(seeds[0].scopeType).toEqual("page");
+  expect(seeds[0].include).toEqual([]);
+  expect(seeds[0].maxDepth).toEqual(0);
+  expect(seeds[0].maxExtraHops).toEqual(0);
+
+  // Test with the same URL (should match)
+  const result1 = seeds[0].isIncluded(
+    "https://example.com/page?foo=bar&baz=qux",
+    0,
+    0,
+  );
+  expect(result1).not.toBe(false);
+  expect(result1.isOOS).toBe(false);
+
+  // Test with reordered query parameters (should still match)
+  const result2 = seeds[0].isIncluded(
+    "https://example.com/page?baz=qux&foo=bar",
+    0,
+    0,
+  );
+  expect(result2).not.toBe(false);
+  expect(result2.isOOS).toBe(false);
+});
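
For reference, a minimal sketch (not part of the diff) of how the shared options behave, assuming normalize-url's default export and the normalizeUrlOpts module added above; the example URLs are illustrative:

import normalizeUrl from "normalize-url";
import { normalizeUrlOpts } from "./normalizeurlopts.js";

// With sortQueryParameters: true, both spellings normalize to the same string,
// so seed matching in seeds.ts and queue dedup in state.ts treat them as one page.
console.log(normalizeUrl("https://example.com/page?foo=bar&baz=qux", normalizeUrlOpts));
// "https://example.com/page?baz=qux&foo=bar"
console.log(normalizeUrl("https://example.com/page?baz=qux&foo=bar", normalizeUrlOpts));
// "https://example.com/page?baz=qux&foo=bar"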