Skip to content

Commit a2742df

Browse files
authored
seed urls list: check for quoted URLs and remove quotes (#883)
- Check for URLs that are wrapped in quotes, e.g. 'https://example.com/' or "https://example.com/", and trim and remove the quotes before adding the seed
- tests: add quoted URL to tests, fix old.webrecorder.net test
- deps: update wabac.js, RWP to latest
- logging: reduce error logging for seed lists; only log once that there are duplicates or that the page limit is reached
- fix for #882
1 parent 705bc0c commit a2742df

File tree

8 files changed

+68
-33
lines changed

8 files changed

+68
-33
lines changed

Dockerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ ADD config/ /app/
3939

4040
ADD html/ /app/html/
4141

42-
ARG RWP_VERSION=2.3.15
42+
ARG RWP_VERSION=2.3.17
4343
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/
4444
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
4545
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz

package.json

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
"dependencies": {
1919
"@novnc/novnc": "1.4.0",
2020
"@puppeteer/replay": "^3.1.1",
21-
"@webrecorder/wabac": "^2.23.8",
21+
"@webrecorder/wabac": "^2.23.11",
2222
"browsertrix-behaviors": "^0.9.2",
2323
"client-zip": "^2.4.5",
2424
"css-selector-parser": "^3.0.5",
@@ -39,7 +39,7 @@
3939
"tsc": "^2.0.4",
4040
"undici": "^6.18.2",
4141
"uuid": "8.3.2",
42-
"warcio": "^2.4.4",
42+
"warcio": "^2.4.5",
4343
"ws": "^7.4.4",
4444
"yargs": "^17.7.2"
4545
},
@@ -71,7 +71,7 @@
7171
},
7272
"resolutions": {
7373
"wrap-ansi": "7.0.0",
74-
"warcio": "^2.4.4",
74+
"warcio": "^2.4.5",
7575
"@novnc/novnc": "1.4.0"
7676
}
7777
}

src/crawler.ts

Lines changed: 16 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,8 @@ export class Crawler {
129129
limitHit = false;
130130
pageLimit: number;
131131

132+
dupeSeedsFound = false;
133+
132134
saveStateFiles: string[] = [];
133135
lastSaveTime: number;
134136

@@ -2465,30 +2467,34 @@ self.__bx_behaviors.selectMainBehavior();
24652467
this.pageLimit,
24662468
);
24672469

2468-
const logContext = depth === 0 ? "scope" : "links";
2469-
const logLevel = depth === 0 ? "error" : "debug";
2470-
24712470
switch (result) {
24722471
case QueueState.ADDED:
2473-
logger.debug("Queued new page URL", { url, ...logDetails }, logContext);
2472+
logger.debug("Queued new page URL", { url, ...logDetails }, "links");
24742473
return true;
24752474

24762475
case QueueState.LIMIT_HIT:
2477-
logger.logAsJSON(
2476+
logger.debug(
24782477
"Page URL not queued, at page limit",
24792478
{ url, ...logDetails },
2480-
logContext,
2481-
logLevel,
2479+
"links",
24822480
);
2481+
if (!this.limitHit && depth === 0) {
2482+
logger.error(
2483+
"Page limit reached when adding URL list, some URLs not crawled.",
2484+
);
2485+
}
24832486
this.limitHit = true;
24842487
return false;
24852488

24862489
case QueueState.DUPE_URL:
2487-
logger.logAsJSON(
2490+
if (!this.dupeSeedsFound && depth === 0) {
2491+
logger.error("Duplicate seed URLs found and skipped");
2492+
this.dupeSeedsFound = true;
2493+
}
2494+
logger.debug(
24882495
"Page URL not queued, already seen",
24892496
{ url, ...logDetails },
2490-
logContext,
2491-
logLevel,
2497+
"links",
24922498
);
24932499
return false;
24942500
}

src/util/seeds.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,7 @@ export async function parseSeeds(params: CrawlerArgs): Promise<ScopedSeed[]> {
342342

343343
for (const seed of seeds) {
344344
const newSeed = typeof seed === "string" ? { url: seed } : seed;
345+
newSeed.url = removeQuotes(newSeed.url);
345346

346347
try {
347348
scopedSeeds.push(new ScopedSeed({ ...scopeOpts, ...newSeed }));
@@ -389,3 +390,14 @@ export function parseRx(
389390
return value.map((e) => (e instanceof RegExp ? e : new RegExp(e)));
390391
}
391392
}
393+
394+
export function removeQuotes(url: string) {
395+
url = url.trim();
396+
if (
397+
(url.startsWith(`"`) && url.endsWith(`"`)) ||
398+
(url.startsWith(`'`) && url.endsWith(`'`))
399+
) {
400+
url = url.slice(1, -1);
401+
}
402+
return url;
403+
}

tests/fixtures/urlSeedFile.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,3 @@
1-
https://webrecorder.net/about/
1+
https://old.webrecorder.net/about/
22
https://specs.webrecorder.net/wacz/1.1.1/
3+
"https://old.webrecorder.net/faq"

tests/pageinfo-records.test.js

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,9 @@ function validateResourcesIndex(json) {
118118
{ status: 200, mime: "text/css", type: "stylesheet" },
119119
"https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap":
120120
{ status: 200, mime: "text/css", type: "stylesheet" },
121-
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
121+
"https://fonts.gstatic.com/s/sourcesanspro/v23/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
122122
{ status: 200, mime: "font/woff2", type: "font" },
123-
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
123+
"https://fonts.gstatic.com/s/sourcesanspro/v23/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
124124
{ status: 200, mime: "font/woff2", type: "font" },
125125
"https://old.webrecorder.net/assets/favicon.ico": {
126126
status: 200,
@@ -161,9 +161,9 @@ function validateResourcesAbout(json) {
161161
mime: "image/svg+xml",
162162
type: "image",
163163
},
164-
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
164+
"https://fonts.gstatic.com/s/sourcesanspro/v23/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2":
165165
{ status: 200, mime: "font/woff2", type: "font" },
166-
"https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
166+
"https://fonts.gstatic.com/s/sourcesanspro/v23/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2":
167167
{ status: 200, mime: "font/woff2", type: "font" },
168168
});
169169
}

tests/url_file_list.test.js

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,30 @@
11
import util from "util";
2-
import { exec as execCallback } from "child_process";
2+
import { spawn, exec as execCallback } from "child_process";
33
import fs from "fs";
44

55
const exec = util.promisify(execCallback);
66

7+
let proc = null;
8+
9+
const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
10+
const TEST_HOST = `http://${DOCKER_HOST_NAME}:31502`;
11+
12+
beforeAll(() => {
13+
proc = spawn("../../node_modules/.bin/http-server", ["-p", "31502"], {cwd: "tests/fixtures/"});
14+
});
15+
16+
afterAll(() => {
17+
if (proc) {
18+
proc.kill();
19+
}
20+
});
21+
22+
23+
724
test("check that URLs in seed-list are crawled", async () => {
825
try {
926
await exec(
10-
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000",
27+
"docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000 --scopeType page",
1128
);
1229
} catch (error) {
1330
console.log(error);
@@ -43,7 +60,7 @@ test("check that URLs in seed-list are crawled", async () => {
4360
test("check that URLs in seed-list hosted at URL are crawled", async () => {
4461
try {
4562
await exec(
46-
'docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/refs/heads/main/tests/fixtures/urlSeedFile.txt" --timeout 90000',
63+
`docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection onlinefilelisttest --urlFile "${TEST_HOST}/urlSeedFile.txt" --timeout 90000 --scopeType page`,
4764
);
4865
} catch (error) {
4966
console.log(error);

yarn.lock

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,10 +1134,10 @@
11341134
resolved "https://registry.yarnpkg.com/@ungap/structured-clone/-/structured-clone-1.2.0.tgz#756641adb587851b5ccb3e095daf27ae581c8406"
11351135
integrity sha512-zuVdFrMJiuCDQUMCzQaD6KL28MjnqqN8XnAqiEq9PNm/hCPTSGfrXCOfwj1ow4LFb/tNymJPwsNbVePc1xFqrQ==
11361136

1137-
"@webrecorder/wabac@^2.23.8":
1138-
version "2.23.8"
1139-
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.8.tgz#a3eb1e605acb706b6f043ec9e7fae9ff412ccc8a"
1140-
integrity sha512-+ShHsaBHwFC0SPFTpMWrwJHd47MzT6o1Rg12FSfGfpycrcmrBV447+JR28NitLJIsfcIif8xAth9Vh5Z7tHWlQ==
1137+
"@webrecorder/wabac@^2.23.11":
1138+
version "2.23.11"
1139+
resolved "https://registry.yarnpkg.com/@webrecorder/wabac/-/wabac-2.23.11.tgz#945da06e08b6d093b525e6e5bfd6a8f17beb995b"
1140+
integrity sha512-rsBAkcYvgX+0HgwhgvSb3cBCBp0rVnHGQS/K5A9aJwOmfymHt0C2vInH/lmKV/5H38rJu29c2cvRX962h+lUiw==
11411141
dependencies:
11421142
"@peculiar/asn1-ecc" "^2.3.4"
11431143
"@peculiar/asn1-schema" "^2.3.3"
@@ -1151,7 +1151,6 @@
11511151
buffer "^6.0.3"
11521152
fast-xml-parser "^4.4.1"
11531153
hash-wasm "^4.9.0"
1154-
http-link-header "^1.1.3"
11551154
http-status-codes "^2.1.4"
11561155
idb "^7.1.1"
11571156
js-levenshtein "^1.1.6"
@@ -1162,7 +1161,7 @@
11621161
path-parser "^6.1.0"
11631162
process "^0.11.10"
11641163
stream-browserify "^3.0.0"
1165-
warcio "^2.4.3"
1164+
warcio "^2.4.5"
11661165

11671166
"@webrecorder/wombat@^3.8.14":
11681167
version "3.8.14"
@@ -2834,7 +2833,7 @@ html-escaper@^2.0.0:
28342833
resolved "https://registry.yarnpkg.com/html-escaper/-/html-escaper-2.0.2.tgz#dfd60027da36a36dfcbe236262c00a5822681453"
28352834
integrity sha512-H2iMtd0I4Mt5eYiapRdIDjp+XzelXQ0tFE4JS7YFwFevXXMmOp9myNrUvCg0D6ws8iqkRPBfKHgbwig1SmlLfg==
28362835

2837-
http-link-header@^1.1.1, http-link-header@^1.1.3:
2836+
http-link-header@^1.1.1:
28382837
version "1.1.3"
28392838
resolved "https://registry.yarnpkg.com/http-link-header/-/http-link-header-1.1.3.tgz#b367b7a0ad1cf14027953f31aa1df40bb433da2a"
28402839
integrity sha512-3cZ0SRL8fb9MUlU3mKM61FcQvPfXx2dBrZW3Vbg5CXa8jFlK8OaEpePenLe1oEXQduhz8b0QjsqfS59QP4AJDQ==
@@ -5527,10 +5526,10 @@ walker@^1.0.8:
55275526
dependencies:
55285527
makeerror "1.0.12"
55295528

5530-
warcio@^2.4.0, warcio@^2.4.3, warcio@^2.4.4:
5531-
version "2.4.4"
5532-
resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.4.tgz#6c0c030bb55c0f0b824f854fa9e6718ca25d333d"
5533-
integrity sha512-FrWOhv1qLNhPBPGEMm24Yo+DtkipK5DxK3ckVGbOf0OJ/UqaxAhiiby74q+GW70dsJV0wF+RA1ToK6CKseTshA==
5529+
warcio@^2.4.0, warcio@^2.4.5:
5530+
version "2.4.5"
5531+
resolved "https://registry.yarnpkg.com/warcio/-/warcio-2.4.5.tgz#ba39c38e433491ab9016282813b9cf6539c3d808"
5532+
integrity sha512-b6R/aIsR4fXzrpY/Zud7LqHFi2Bt8Ov5VLOnruHQ10rk129e9d0KOCZlyRmPD6ENTcV7yze5rXvJ5WSNS8R1zw==
55345533
dependencies:
55355534
"@types/pako" "^1.0.7"
55365535
"@types/stream-buffers" "^3.0.7"

0 commit comments

Comments
 (0)