Draft

Changes from all commits (24 commits)
8691625  dedup work: (ikreymer, Aug 30, 2025)
cf2d766  args: add separate --dedupIndexUrl to support separate redis for dedup (ikreymer, Sep 17, 2025)
c5f84fe  add indexer entrypoint: (ikreymer, Sep 18, 2025)
35d43a5  keep skipping dupe URLs as before (ikreymer, Sep 18, 2025)
0ab43db  warc writing: (ikreymer, Sep 18, 2025)
ab4b19f  rename --dedupStoreUrl -> redisDedupUrl (ikreymer, Sep 18, 2025)
b24b2f2  update to latest warcio (2.4.7) to fix issus when returning payload o… (ikreymer, Sep 18, 2025)
1213ed0  bump to 2.4.7 (ikreymer, Sep 18, 2025)
e059949  tests: add dedup-basic.test for simple dedup, ensure number of revisi… (ikreymer, Sep 18, 2025)
03bbf69  deps update (ikreymer, Sep 20, 2025)
fc3f9b4  Merge branch 'main' into hash-based-dedup (ikreymer, Sep 20, 2025)
aa7b8a1  dedup indexing: strip hash prefix from digest, as cdx does not have it (ikreymer, Sep 23, 2025)
3428b16  use dedup redis for queue up wacz files that need to be updated (ikreymer, Sep 23, 2025)
6ca191b  dedup post requests and non-404s as well! (ikreymer, Sep 25, 2025)
79c9327  Merge branch 'main' into hash-based-dedup (tw4l, Oct 1, 2025)
a39eea0  Merge branch 'main' into hash-based-dedup (ikreymer, Oct 14, 2025)
3397eb1  Merge branch 'hash-based-dedup' of github.com:webrecorder/browsertrix… (ikreymer, Oct 14, 2025)
b9db2ef  - track source index for each hash, so entry becomes '<source index> … (ikreymer, Oct 18, 2025)
f506113  update to new data model: (ikreymer, Oct 24, 2025)
48f781c  cleanup, keep compatibility with redis 6 still (ikreymer, Oct 24, 2025)
99a49d5  always return wacz, store wacz depends only for current wacz (ikreymer, Oct 24, 2025)
6f00a2e  rename 'dedup' -> 'dedupe' for consistency (ikreymer, Oct 25, 2025)
532fbe3  indexer optimize: commit only if added (ikreymer, Oct 25, 2025)
01930ee  add removing option to also remove unused crawls if doing a full sync… (ikreymer, Oct 25, 2025)
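
Taken together, these commits add hash-based deduplication backed by Redis: each captured payload's digest is looked up in a shared dedupe index, duplicates are recorded as references to the WACZ that first stored the payload, and each new WACZ lists the earlier WACZ files it depends on. A minimal illustrative sketch of the core lookup in TypeScript; the key name, entry format, and return shape below are invented for illustration and are not this PR's actual data model:

import { Redis } from "ioredis";

// Illustrative sketch only: claim a payload digest for the current WACZ, or
// report that it duplicates an earlier capture.
async function checkAndAddDigest(
  redis: Redis,
  digest: string, // payload hash with the "sha256:" prefix stripped, as in the CDX
  url: string,
  sourceIndex: number, // index of the WACZ currently being written
): Promise<{ duplicate: boolean; originalEntry?: string }> {
  // HSETNX writes only if the digest has not been seen before, so the first
  // crawl to store a payload "owns" it.
  const added = await redis.hsetnx(
    "dedupe:digests",
    digest,
    `${sourceIndex} ${url}`,
  );
  if (added) {
    return { duplicate: false };
  }
  // Already present: the caller can emit a revisit record that references the
  // original entry instead of storing the payload body again.
  const originalEntry =
    (await redis.hget("dedupe:digests", digest)) || undefined;
  return { duplicate: true, originalEntry };
}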
5 changes: 3 additions & 2 deletions Dockerfile
@@ -44,11 +44,12 @@ ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rw
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz

RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js && chmod a+r /app/html/rwp/*
RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js /app/dist/indexer.js && chmod a+r /app/html/rwp/*

RUN ln -s /app/dist/main.js /usr/bin/crawl; \
ln -s /app/dist/main.js /usr/bin/qa; \
ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile; \
ln -s /app/dist/indexer.js /usr/bin/indexer;

RUN mkdir -p /app/behaviors

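The Dockerfile change registers the new indexer as a third container entrypoint, symlinked at /usr/bin/indexer next to the existing crawl, qa, and create-login-profile commands. A hypothetical invocation follows; the flag name is inferred from the renamed --redisDedupeUrl crawler option and is an assumption, not taken from this diff:

# Hypothetical usage sketch: run the dedupe indexer against a shared Redis.
docker run -it webrecorder/browsertrix-crawler indexer \
  --redisDedupeUrl redis://dedupe-redis:6379/0
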
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.8.1",
"version": "1.9.0-beta.0",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
74 changes: 45 additions & 29 deletions src/crawler.ts
@@ -31,7 +31,7 @@ import {
} from "./util/storage.js";
import { ScreenCaster, WSTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
import { initRedis } from "./util/redis.js";
import { initRedisWaitForSuccess } from "./util/redis.js";
import { logger, formatErr, LogDetails, LogContext } from "./util/logger.js";
import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
@@ -199,6 +199,7 @@ export class Crawler {
| null = null;

recording: boolean;
deduping = false;

constructor() {
const args = this.parseArgs();
@@ -335,32 +336,30 @@

async initCrawlState() {
const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
const dedupeRedisUrl = this.params.redisDedupeUrl || redisUrl;

this.deduping = dedupeRedisUrl !== redisUrl;

if (!redisUrl.startsWith("redis://")) {
logger.fatal(
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
);
}

let redis;

while (true) {
try {
redis = await initRedis(redisUrl);
break;
} catch (e) {
//logger.fatal("Unable to connect to state store Redis: " + redisUrl);
logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
await sleep(1);
}
}
const redis = await initRedisWaitForSuccess(redisUrl);

logger.debug(
`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`,
{},
"state",
);

let dedupeRedis = redis;

if (redisUrl !== dedupeRedisUrl) {
dedupeRedis = await initRedisWaitForSuccess(dedupeRedisUrl);
}

logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");

this.crawlState = new RedisCrawlState(
@@ -369,6 +368,7 @@
this.maxPageTime,
os.hostname(),
this.params.maxPageRetries,
dedupeRedis,
);

if (this.params.logErrorsToRedis) {
@@ -1047,7 +1047,7 @@ self.__bx_behaviors.selectMainBehavior();
const { page, cdp, data, workerid, callbacks, recorder } = opts;
data.callbacks = callbacks;

const { url, seedId } = data;
const { url, seedId, depth } = data;

const auth = this.seeds[seedId].authHeader();

@@ -1120,6 +1120,7 @@ self.__bx_behaviors.selectMainBehavior();

if (recorder) {
recorder.pageSeed = seed;
recorder.pageSeedDepth = depth;
}

// run custom driver here, if any
@@ -1293,6 +1294,7 @@ self.__bx_behaviors.selectMainBehavior();
} else {
if (pageSkipped) {
await this.crawlState.markExcluded(url);
this.limitHit = false;
} else {
const retry = await this.crawlState.markFailed(url);

@@ -1654,9 +1656,7 @@ self.__bx_behaviors.selectMainBehavior();
if (this.params.generateWACZ) {
this.storage = initStorage();

if (this.storage) {
await this.crawlState.setWACZFilename();
}
await this.crawlState.setWACZFilename();
}

if (POST_CRAWL_STATES.includes(initState)) {
@@ -1843,9 +1843,20 @@ self.__bx_behaviors.selectMainBehavior();
}

if (this.params.generateWACZ && generateFiles) {
const uploaded = await this.generateWACZ();
const wacz = await this.generateWACZ();

if (wacz) {
if (this.deduping) {
await this.crawlState.setStatus("post-crawl");
await this.crawlState.updateDedupeSource(wacz);

await this.crawlState.clearDupeFileRef();
}

if (uploaded && this.uploadAndDeleteLocal) {
await this.crawlState.clearWACZFilename();
}

if (wacz && this.storage && this.uploadAndDeleteLocal) {
logger.info(
`Uploaded WACZ, deleting local data to free up space: ${this.collDir}`,
);
@@ -1883,7 +1894,7 @@ self.__bx_behaviors.selectMainBehavior();
await streamFinish(logFH);
}

async generateWACZ() {
async generateWACZ(): Promise<WACZ | null> {
logger.info("Generating WACZ");
await this.crawlState.setStatus("generate-wacz");

@@ -1897,11 +1908,11 @@ self.__bx_behaviors.selectMainBehavior();
if (!warcFileList.length) {
// if finished, just return
if (isFinished || (await this.crawlState.isCrawlCanceled())) {
return;
return null;
}
// possibly restarted after committing, so assume done here!
if ((await this.crawlState.numDone()) > 0) {
return;
return null;
}
// fail crawl otherwise
logger.fatal("No WARC Files, assuming crawl failed");
@@ -1921,6 +1932,8 @@

await this.closeLog();

const requires = await this.crawlState.getDupeDependentSources();

const waczOpts: WACZInitOpts = {
input: warcFileList.map((x) => path.join(this.archivesDir, x)),
output: waczPath,
Expand All @@ -1929,6 +1942,7 @@ self.__bx_behaviors.selectMainBehavior();
warcCdxDir: this.warcCdxDir,
indexesDir: this.indexesDir,
softwareString: this.infoString,
requires,
};

if (process.env.WACZ_SIGN_URL) {
Expand Down Expand Up @@ -1958,13 +1972,8 @@ self.__bx_behaviors.selectMainBehavior();
const targetFilename = await this.crawlState.getWACZFilename();

await this.storage.uploadCollWACZ(wacz, targetFilename, isFinished);

await this.crawlState.clearWACZFilename();

return true;
}

return false;
return wacz;
} catch (e) {
logger.error("Error creating WACZ", e);
if (!streaming) {
Expand All @@ -1973,6 +1982,8 @@ self.__bx_behaviors.selectMainBehavior();
await this.setStatusAndExit(ExitCodes.UploadFailed, "interrupted");
}
}

return null;
}

logMemory() {
@@ -2138,7 +2149,12 @@ self.__bx_behaviors.selectMainBehavior();
// excluded in recorder
if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) {
data.pageSkipped = true;
logger.warn("Page Load Blocked, skipping", { msg, loadState });
logger.warn(
"Page Load Blocked, skipping",
{ msg, loadState },
"pageStatus",
);
throw new Error("logged");
} else {
return this.pageFailed("Page Load Failed", retry, {
msg,
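
In initCrawlState(), the inline connect-and-retry loop is replaced by a new initRedisWaitForSuccess() helper, which is then reused to open a second connection when a separate dedupe Redis URL is configured. A minimal sketch of what that helper presumably does, reconstructed from the loop this PR removes; the actual implementation in util/redis.js may differ:

import { initRedis } from "./util/redis.js";
import { logger } from "./util/logger.js";
import { sleep } from "./util/timing.js";

// Sketch: retry until the Redis instance accepts the connection, then return
// the connected client, mirroring the loop previously inlined in crawler.ts.
export async function initRedisWaitForSuccess(redisUrl: string) {
  while (true) {
    try {
      return await initRedis(redisUrl);
    } catch (e) {
      logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
      await sleep(1);
    }
  }
}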