16 changes: 16 additions & 0 deletions src/util/normalizeurlopts.ts
@@ -0,0 +1,16 @@
+import { Options as NormalizeUrlOptions } from "normalize-url";
+
+// URL normalization options for consistent URL handling across the crawler
+// Query parameters are sorted alphabetically by the normalize-url library
+export const normalizeUrlOpts: NormalizeUrlOptions = {
+  defaultProtocol: "https",
+  stripAuthentication: false,
+  stripTextFragment: false,
+  stripWWW: false,
+  stripHash: false,
+  removeTrailingSlash: false,
+  removeSingleSlash: false,
+  removeExplicitPort: false,
+  sortQueryParameters: true,
+  removePath: false,
+};
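
A quick illustration of what these options do in practice (a sketch for orientation, not part of the PR; it assumes normalize-url's default export):

import normalizeUrl from "normalize-url";
import { normalizeUrlOpts } from "./normalizeurlopts.js";

// sortQueryParameters is the one behavioral change: params are reordered alphabetically
normalizeUrl("https://example.com/page?foo=bar&baz=qux", normalizeUrlOpts);
// => "https://example.com/page?baz=qux&foo=bar"

// Everything else is deliberately preserved: www, explicit port, trailing slash, hash
normalizeUrl("https://www.example.com:8443/path/?b=2&a=1#frag", normalizeUrlOpts);
// => "https://www.example.com:8443/path/?a=1&b=2#frag"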
8 changes: 6 additions & 2 deletions src/util/seeds.ts
@@ -4,6 +4,8 @@ import { MAX_DEPTH } from "./constants.js";
 import { collectOnlineSeedFile } from "./file_reader.js";
 import { logger } from "./logger.js";
 import { type CrawlerArgs } from "./argParser.js";
+import normalizeUrlLib from "normalize-url";
+import { normalizeUrlOpts } from "./normalizeurlopts.js";
 
 type ScopeType =
   | "prefix"
@@ -63,7 +65,8 @@ export class ScopedSeed
     parsedUrl.username = "";
     parsedUrl.password = "";
 
-    this.url = parsedUrl.href;
+    // Normalize URL with sorted query parameters for consistent matching
+    this.url = normalizeUrlLib(parsedUrl.href, normalizeUrlOpts);
     this.include = parseRx(include);
     this.exclude = parseRx(exclude);
 
@@ -250,7 +253,8 @@ export class ScopedSeed
       urlParsed.hash = "";
     }
 
-    url = urlParsed.href;
+    // Normalize URL with sorted query parameters for consistent matching
+    url = normalizeUrlLib(urlParsed.href, normalizeUrlOpts);
 
     if (url === this.url) {
       return { url, isOOS: false };
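
Because the seed URL (in the constructor above) and every candidate URL (in isIncluded) now pass through the same normalization, parameter order can no longer defeat the exact-match comparison. A minimal sketch of the effect:

const seed = normalizeUrlLib("https://example.com/page?foo=bar&baz=qux", normalizeUrlOpts);
const candidate = normalizeUrlLib("https://example.com/page?baz=qux&foo=bar", normalizeUrlOpts);
console.log(seed === candidate); // true — both sort to "?baz=qux&foo=bar"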
17 changes: 2 additions & 15 deletions src/util/state.ts
@@ -11,7 +11,8 @@ import {
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 import { interpolateFilename, UploadResult } from "./storage.js";
-import normalizeUrl, { Options as NormamlizeUrlOptions } from "normalize-url";
+import normalizeUrl from "normalize-url";
+import { normalizeUrlOpts } from "./normalizeurlopts.js";
 
 // ============================================================================
 export enum LoadState {
@@ -29,20 +30,6 @@ export enum QueueState {
   DUPE_URL = 2,
 }
 
-// ============================================================================
-const normalizeUrlOpts: NormamlizeUrlOptions = {
-  defaultProtocol: "https",
-  stripAuthentication: false,
-  stripTextFragment: false,
-  stripWWW: false,
-  stripHash: false,
-  removeTrailingSlash: false,
-  removeSingleSlash: false,
-  removeExplicitPort: false,
-  sortQueryParameters: true,
-  removePath: false,
-};
-
 // ============================================================================
 // treat 0 or 206 as 200 for purposes of dedup
 function normalizeDedupStatus(status: number): number {
31 changes: 31 additions & 0 deletions tests/recorder.test.js
@@ -0,0 +1,31 @@
+import util from "util";
+import { exec as execCallback } from "child_process";
+
+const exec = util.promisify(execCallback);
+
+test("ensure URLs with array-valued headers succeed without multiValueHeader errors", async () => {
+  // Facebook and other sites return array-valued headers that aren't in the
+  // allowed multiValueHeader list (set-cookie, warc-concurrent-to, warc-protocol).
+  // This test verifies that such headers are joined with ", " instead of
+  // throwing a "not a valid multi value header" error.
+  let passed = true;
+  let output = "";
+  try {
+    const result = await exec(
+      "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url 'https://www.facebook.com/permalink.php?story_fbid=pfbid0BqNZHQaQfqTAKzVaaeeYNuyPXFJhkPmzwWT7mZPZJLFnHNEvsdbnLJRPkHJDMcqFl&id=100082135548177' --scopeType page --limit 1 --collection multivalueheadertest",
+    );
+    output = result.stdout + result.stderr;
+  } catch (error) {
+    console.log(error);
+    output = error.stdout + error.stderr;
+    passed = false;
+  }
+
+  // Should not contain the multiValueHeader error
+  expect(output).not.toContain("not a valid multi value header");
+
+  // Should successfully crawl one page
+  expect(output).toMatch(/crawled:\s+1/);
+
+  expect(passed).toBe(true);
+});
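
The joining behavior this test exercises lives in the recorder, not in this diff; a rough sketch of the idea (the names below are hypothetical, not the crawler's actual API):

// Hypothetical sketch: headers that may legitimately repeat are emitted once
// per value; any other array-valued header is collapsed into a single line.
const MULTI_VALUE_ALLOWED = new Set(["set-cookie", "warc-concurrent-to", "warc-protocol"]);

function serializeHeader(name: string, value: string | string[]): string[] {
  if (!Array.isArray(value)) {
    return [`${name}: ${value}`];
  }
  if (MULTI_VALUE_ALLOWED.has(name.toLowerCase())) {
    return value.map((v) => `${name}: ${v}`);
  }
  // Join with ", " instead of throwing "not a valid multi value header"
  return [`${name}: ${value.join(", ")}`];
}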
32 changes: 32 additions & 0 deletions tests/scopes.test.js
@@ -326,3 +326,35 @@ seeds:
   expect(seeds[8].exclude).toEqual([/false/]);
   expect(seeds[9].exclude).toEqual([/true/]);
 });
+
test("scopeType page should match URLs with reordered query parameters", async () => {
const seeds = await getSeeds(`
seeds:
- url: https://example.com/page?foo=bar&baz=qux
scopeType: page
`);

expect(seeds.length).toEqual(1);
expect(seeds[0].scopeType).toEqual("page");
expect(seeds[0].include).toEqual([]);
expect(seeds[0].maxDepth).toEqual(0);
expect(seeds[0].maxExtraHops).toEqual(0);

// Test with the same URL (should match)
const result1 = seeds[0].isIncluded(
"https://example.com/page?foo=bar&baz=qux",
0,
0
);
expect(result1).not.toBe(false);
expect(result1.isOOS).toBe(false);

// Test with reordered query parameters (should still match)
const result2 = seeds[0].isIncluded(
"https://example.com/page?baz=qux&foo=bar",
0,
0
);
expect(result2).not.toBe(false);
expect(result2.isOOS).toBe(false);
});