Draft
Changes from 15 commits
Commits
24 commits
8691625
dedup work:
ikreymer Aug 30, 2025
cf2d766
args: add separate --dedupIndexUrl to support separate redis for dedup
ikreymer Sep 17, 2025
c5f84fe
add indexer entrypoint:
ikreymer Sep 18, 2025
35d43a5
keep skipping dupe URLs as before
ikreymer Sep 18, 2025
0ab43db
warc writing:
ikreymer Sep 18, 2025
ab4b19f
rename --dedupStoreUrl -> redisDedupUrl
ikreymer Sep 18, 2025
b24b2f2
update to latest warcio (2.4.7) to fix issues when returning payload o…
ikreymer Sep 18, 2025
1213ed0
bump to 2.4.7
ikreymer Sep 18, 2025
e059949
tests: add dedup-basic.test for simple dedup, ensure number of revisi…
ikreymer Sep 18, 2025
03bbf69
deps update
ikreymer Sep 20, 2025
fc3f9b4
Merge branch 'main' into hash-based-dedup
ikreymer Sep 20, 2025
aa7b8a1
dedup indexing: strip hash prefix from digest, as cdx does not have it
ikreymer Sep 23, 2025
3428b16
use dedup redis to queue up wacz files that need to be updated
ikreymer Sep 23, 2025
6ca191b
dedup post requests and non-404s as well!
ikreymer Sep 25, 2025
79c9327
Merge branch 'main' into hash-based-dedup
tw4l Oct 1, 2025
a39eea0
Merge branch 'main' into hash-based-dedup
ikreymer Oct 14, 2025
3397eb1
Merge branch 'hash-based-dedup' of github.com:webrecorder/browsertrix…
ikreymer Oct 14, 2025
b9db2ef
- track source index for each hash, so entry becomes '<source index> …
ikreymer Oct 18, 2025
f506113
update to new data model:
ikreymer Oct 24, 2025
48f781c
cleanup, keep compatibility with redis 6 still
ikreymer Oct 24, 2025
99a49d5
always return wacz, store wacz depends only for current wacz
ikreymer Oct 24, 2025
6f00a2e
rename 'dedup' -> 'dedupe' for consistency
ikreymer Oct 25, 2025
532fbe3
indexer optimize: commit only if added
ikreymer Oct 25, 2025
01930ee
add removing option to also remove unused crawls if doing a full sync…
ikreymer Oct 25, 2025
5 changes: 3 additions & 2 deletions Dockerfile
@@ -44,11 +44,12 @@ ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rw
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/
ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz

RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js && chmod a+r /app/html/rwp/*
RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js /app/dist/indexer.js && chmod a+r /app/html/rwp/*

RUN ln -s /app/dist/main.js /usr/bin/crawl; \
ln -s /app/dist/main.js /usr/bin/qa; \
ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile
ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile; \
ln -s /app/dist/indexer.js /usr/bin/indexer;

RUN mkdir -p /app/behaviors

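Note: the new indexer symlink exposes the dedupe indexer entrypoint (added as src/indexer.ts below) alongside crawl and qa. A hypothetical invocation inside the container, with placeholder WACZ path and Redis URL (flags per src/indexer.ts):

indexer --sourceUrl /crawls/collections/my-coll/my-coll.wacz --redisDedupUrl redis://dedup-redis:6379/0
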
2 changes: 1 addition & 1 deletion package.json
@@ -1,6 +1,6 @@
{
"name": "browsertrix-crawler",
"version": "1.8.0",
"version": "1.9.0-beta.0",
"main": "browsertrix-crawler",
"type": "module",
"repository": "https://github.com/webrecorder/browsertrix-crawler",
34 changes: 19 additions & 15 deletions src/crawler.ts
@@ -31,7 +31,7 @@ import {
} from "./util/storage.js";
import { ScreenCaster, WSTransport } from "./util/screencaster.js";
import { Screenshots } from "./util/screenshots.js";
import { initRedis } from "./util/redis.js";
import { initRedisWaitForSuccess } from "./util/redis.js";
import { logger, formatErr, LogDetails, LogContext } from "./util/logger.js";
import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js";
import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
@@ -337,32 +337,28 @@ export class Crawler {

async initCrawlState() {
const redisUrl = this.params.redisStoreUrl || "redis://localhost:6379/0";
const dedupRedisUrl = this.params.redisDedupUrl || redisUrl;

if (!redisUrl.startsWith("redis://")) {
logger.fatal(
"stateStoreUrl must start with redis:// -- Only redis-based store currently supported",
);
}

let redis;

while (true) {
try {
redis = await initRedis(redisUrl);
break;
} catch (e) {
//logger.fatal("Unable to connect to state store Redis: " + redisUrl);
logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
await sleep(1);
}
}
const redis = await initRedisWaitForSuccess(redisUrl);

logger.debug(
`Storing state via Redis ${redisUrl} @ key prefix "${this.crawlId}"`,
{},
"state",
);

let dedupRedis = redis;

if (redisUrl !== dedupRedisUrl) {
dedupRedis = await initRedisWaitForSuccess(dedupRedisUrl);
}

logger.debug(`Max Page Time: ${this.maxPageTime} seconds`, {}, "state");

this.crawlState = new RedisCrawlState(
@@ -371,6 +367,7 @@
this.maxPageTime,
os.hostname(),
this.params.maxPageRetries,
dedupRedis,
);

if (this.params.logErrorsToRedis) {
@@ -1046,7 +1043,7 @@ self.__bx_behaviors.selectMainBehavior();
const { page, cdp, data, workerid, callbacks, recorder } = opts;
data.callbacks = callbacks;

const { url, seedId } = data;
const { url, seedId, depth } = data;

const auth = this.seeds[seedId].authHeader();

@@ -1119,6 +1116,7 @@

if (recorder) {
recorder.pageSeed = seed;
recorder.pageSeedDepth = depth;
}

// run custom driver here, if any
@@ -1292,6 +1290,7 @@ self.__bx_behaviors.selectMainBehavior();
} else {
if (pageSkipped) {
await this.crawlState.markExcluded(url);
this.limitHit = false;
} else {
const retry = await this.crawlState.markFailed(url);

@@ -2137,7 +2136,12 @@ self.__bx_behaviors.selectMainBehavior();
// excluded in recorder
if (msg.startsWith("net::ERR_BLOCKED_BY_RESPONSE")) {
data.pageSkipped = true;
logger.warn("Page Load Blocked, skipping", { msg, loadState });
logger.warn(
"Page Load Blocked, skipping",
{ msg, loadState },
"pageStatus",
);
throw new Error("logged");
} else {
return this.pageFailed("Page Load Failed", retry, {
msg,
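
Note: the inlined retry loop removed above is now factored into initRedisWaitForSuccess in util/redis.js. A minimal sketch of what such a helper could look like, reusing the removed loop's logic; the actual implementation may differ:

import { initRedis } from "./redis.js";
import { logger } from "./logger.js";
import { sleep } from "./timing.js";

// Retry initRedis until a connection succeeds, warning and sleeping between attempts.
export async function initRedisWaitForSuccess(redisUrl: string) {
  while (true) {
    try {
      return await initRedis(redisUrl);
    } catch (e) {
      logger.warn(`Waiting for redis at ${redisUrl}`, {}, "state");
      await sleep(1);
    }
  }
}
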
181 changes: 181 additions & 0 deletions src/indexer.ts
@@ -0,0 +1,181 @@
#!/usr/bin/env node

import yargs from "yargs";
import { logger } from "./util/logger.js";
import { getInfoString } from "./util/file_reader.js";
import { openAsBlob } from "node:fs";
import { WACZLoader } from "./util/wacz.js";
import { ExitCodes } from "./util/constants.js";
import { initRedisWaitForSuccess } from "./util/redis.js";
import { AsyncIterReader } from "warcio";
import { RedisDedupIndex } from "./util/state.js";

export class CrawlIndexer {
constructor() {}

initArgs() {
return yargs(process.argv)
.usage("indexer [options]")
.options({
redisDedupUrl: {
describe: "URL for remote redis instance to index into",
type: "string",
required: true,
},

sourceUrl: {
describe: "Source WACZ or Multi WACZ or Multi WACZ JSON to index",
type: "string",
required: true,
},
})
.parseSync();
}

async run() {
logger.setDebugLogging(true);

process.on("SIGINT", () => this.handleTerminate("SIGINT"));

process.on("SIGTERM", () => this.handleTerminate("SIGTERM"));

logger.info(await getInfoString());

const params = this.initArgs();

const redis = await initRedisWaitForSuccess(params.redisDedupUrl);
const dedupIndex = new RedisDedupIndex(redis);

for await (const [name, waczfile] of this.iterWACZ(params.sourceUrl)) {
await dedupIndex.addHashSource(name, waczfile);
}

let count = 0;
let res;

while ((res = await dedupIndex.nextQueuedHashSource())) {
const { id, url, total } = res;
count += 1;
const loader = new WACZLoader(url);
logger.debug(`Processing WACZ ${count} of ${total}`, { waczfile: url });
for await (const file of loader.iterFiles("indexes/")) {
const filename = file.filename;
if (filename.endsWith(".cdx.gz")) {
logger.debug("Processing CDX GZ Index", { filename });
await this.ingestCDXJ(dedupIndex, loader, filename, "gzip");
} else if (filename.endsWith(".cdx") || filename.endsWith(".cdxj")) {
logger.debug("Processing CDX Index", { filename });
await this.ingestCDXJ(dedupIndex, loader, filename);
}
}
await dedupIndex.addDoneSource(id);
}

logger.info("Done!");
await dedupIndex.markDoneImport();
process.exit(ExitCodes.Success);
}

async ingestCDXJ(
dedupIndex: RedisDedupIndex,
loader: WACZLoader,
filename: string,
compression?: string,
) {
let reader = await loader.loadFile(filename);

if (!reader) {
logger.error("File not found, skipping!");
return;
}

if (compression === "gzip") {
reader = new AsyncIterReader(reader, "gzip", false);
}

let count = 0;

for await (const line of reader.iterLines()) {
const inx = line.indexOf(" {");
if (inx < 0) {
logger.error("Skipping invalid CDXJ, no JSON", { line });
continue;
}

// eslint-disable-next-line @typescript-eslint/no-explicit-any
let cdx: Record<string, any>;

try {
cdx = JSON.parse(line.slice(inx));
} catch (e) {
logger.error("Skipping invalid CDXJ, JSON invalid", { line });
continue;
}

const date = line.split(" ", 2)[1];
const url = cdx.url;
const hash = cdx.digest;

if (url.startsWith("urn:")) {
continue;
}

// only adding originals to dedup against, don't want to dedup against existing revisits
if (cdx.mime === "warc/revisit") {
continue;
}

if (url && date && hash) {
await dedupIndex.addHashDupe(hash, url, date);
} else {
logger.warn("Skipping invalid CDXJ, data missing", {
url,
date,
digest: hash,
});
continue;
}

count += 1;
}

logger.debug("Processed", { count });
}

async *iterWACZ(url: string, name?: string): AsyncIterable<[string, string]> {
let path: string = url;

try {
path = new URL(url).pathname;
} catch (e) {
// ignore
}

if (path.endsWith(".wacz")) {
yield [name || url, url];
} else if (path.endsWith(".json")) {
if (!url.startsWith("http://") && !url.startsWith("https://")) {
const blob = await openAsBlob(url);
url = URL.createObjectURL(blob);
}

const resp = await fetch(url);
const json = await resp.json();

for (const entry of json.resources) {
if (entry.path) {
yield* this.iterWACZ(entry.path, entry.name);
}
}
} else {
logger.warn("Unknown source", { url }, "replay");
}
}

handleTerminate(signame: string) {
logger.info(`Got signal ${signame}, exiting`);
process.exit(ExitCodes.SignalInterrupted);
}
}

await new CrawlIndexer().run();
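
Note: ingestCDXJ above expects CDXJ lines of the form "urlkey timestamp {json}": the timestamp is read from the second space-delimited field, the JSON block starts at the first " {", and an entry is indexed only if url, date, and digest are all present, while urn: records and warc/revisit entries are skipped. A hypothetical line it would index (all values are placeholders):

com,example)/page 20250918120000 {"url": "https://example.com/page", "mime": "text/html", "status": "200", "digest": "EXAMPLEDIGEST"}
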
39 changes: 1 addition & 38 deletions src/replaycrawler.ts
@@ -10,9 +10,6 @@ import { PageInfoRecord, PageInfoValue, Recorder } from "./util/recorder.js";
import fsp from "fs/promises";
import path from "path";

import { ZipRangeReader, createLoader } from "@webrecorder/wabac";

import { AsyncIterReader } from "warcio";
import { parseArgs } from "./util/argParser.js";

import { PNG } from "pngjs";
@@ -23,6 +20,7 @@ import { MAX_URL_LENGTH } from "./util/reqresp.js";
import { openAsBlob } from "fs";
import { WARCWriter } from "./util/warcwriter.js";
import { parseRx } from "./util/seeds.js";
import { WACZLoader } from "./util/wacz.js";

// RWP Replay Prefix
const REPLAY_PREFIX = "http://localhost:9990/replay/w/replay/";
@@ -784,38 +782,3 @@ export class ReplayCrawler extends Crawler {
return null;
}
}

class WACZLoader {
url: string;
zipreader: ZipRangeReader | null;

constructor(url: string) {
this.url = url;
this.zipreader = null;
}

async init() {
if (!this.url.startsWith("http://") && !this.url.startsWith("https://")) {
const blob = await openAsBlob(this.url);
this.url = URL.createObjectURL(blob);
}

const loader = await createLoader({ url: this.url });

this.zipreader = new ZipRangeReader(loader);
}

async loadFile(fileInZip: string) {
const { reader } = await this.zipreader!.loadFile(fileInZip);

if (!reader) {
return null;
}

if (!reader.iterLines) {
return new AsyncIterReader(reader);
}

return reader;
}
}
13 changes: 13 additions & 0 deletions src/util/argParser.ts
@@ -437,6 +437,19 @@ class ArgParser {
default: "redis://localhost:6379/0",
},

redisDedupUrl: {
describe:
"If set, url for remote redis server to store state. Otherwise, using local redis instance",
type: "string",
},

minPageDedupDepth: {
describe:
"If set >= 0, minimum depth at which duplicate pages can be skipped. -1 means never skip duplicate pages",
type: "number",
default: -1,
},

saveState: {
describe:
"If the crawl state should be serialized to the crawls/ directory. Defaults to 'partial', only saved when crawl is interrupted",
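
Note: a hypothetical crawl invocation exercising the new options (seed and Redis URLs are placeholders). With --minPageDedupDepth 1, duplicate pages at depth 1 or deeper can be skipped, while depth-0 seed pages are always captured; with the default of -1, duplicate pages are never skipped:

crawl --url https://example.com/ --redisDedupUrl redis://dedup-redis:6379/0 --minPageDedupDepth 1
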
2 changes: 2 additions & 0 deletions src/util/constants.ts
@@ -22,6 +22,8 @@ export const DETECT_SITEMAP = "<detect>";

export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];

export const HASH_DUPE_KEY = "dupe";

export enum BxFunctionBindings {
BehaviorLogFunc = "__bx_log",
AddLinkFunc = "__bx_addLink",