diff --git a/package.json b/package.json
index 76285477..1e7a00dc 100644
--- a/package.json
+++ b/package.json
@@ -34,6 +34,7 @@
     "pixelmatch": "^5.3.0",
     "pngjs": "^7.0.0",
     "puppeteer-core": "^24.22.0",
+    "robots-parser": "^3.0.1",
     "sax": "^1.3.0",
     "sharp": "^0.32.6",
     "tsc": "^2.0.4",
diff --git a/src/crawler.ts b/src/crawler.ts
index 39b1b2f5..84e1e7ba 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -3,6 +3,8 @@
 import path from "path";
 import fs, { WriteStream } from "fs";
 import os from "os";
 import fsp from "fs/promises";
+import { fetch as undiciFetch } from "undici";
+import robotsParser, { Robot } from "robots-parser";
 import {
   RedisCrawlState,
@@ -36,6 +38,7 @@
 import { logger, formatErr, LogDetails, LogContext } from "./util/logger.js";
 import { WorkerState, closeWorkers, runWorkers } from "./util/worker.js";
 import { sleep, timedRun, secondsElapsed } from "./util/timing.js";
 import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
+import { getProxyDispatcher } from "./util/proxy.js";
 
 import { Browser } from "./util/browser.js";
@@ -1249,6 +1252,96 @@ self.__bx_behaviors.selectMainBehavior();
     }
   }
 
+  async _fetchRobots(url: string) {
+    while (true) {
+      const resp = await undiciFetch(url, {
+        headers: this.headers,
+        dispatcher: getProxyDispatcher(url),
+      });
+
+      if (resp.ok) {
+        return resp;
+      }
+
+      const retry = resp.headers.get("retry-after");
+
+      if (retry) {
+        logger.debug(
+          "Robots.txt fetch: Retry after",
+          { url, retrySeconds: retry },
+          "robots",
+        );
+        await sleep(parseInt(retry));
+        continue;
+      }
+
+      logger.debug(
+        "Robots.txt not fetched",
+        { url, status: resp.status },
+        "robots",
+      );
+      return null;
+    }
+    return null;
+  }
+
+  async fetchAndParseRobots(
+    url: string,
+    logDetails: LogDetails,
+  ): Promise<Robot | null> {
+    // Fetch robots.txt for url's host and return parser.
+    // Results are cached by robots.txt URL in Redis using an LRU cache
+    // implementation that retains the 100 most recently used values.
+    const urlParser = new URL(url);
+    const robotsUrl = `${urlParser.origin}/robots.txt`;
+
+    const cachedRobots = await this.crawlState.getCachedRobots(robotsUrl);
+    if (cachedRobots) {
+      logger.debug(
+        "Using cached robots.txt body",
+        {
+          url: robotsUrl,
+          ...logDetails,
+        },
+        "robots",
+      );
+      return robotsParser(robotsUrl, cachedRobots);
+    }
+
+    try {
+      logger.debug(
+        "Fetching robots.txt",
+        { url: robotsUrl, ...logDetails },
+        "robots",
+      );
+      const resp = await this._fetchRobots(robotsUrl);
+      if (!resp) {
+        return null;
+      }
+      const content = await resp.text();
+
+      logger.debug(
+        "Caching robots.txt body",
+        { url: robotsUrl, ...logDetails },
+        "robots",
+      );
+      await this.crawlState.setCachedRobots(robotsUrl, content);
+
+      return robotsParser(robotsUrl, content);
+    } catch (e) {
+      // ignore
+    }
+    logger.warn(
+      "Failed to fetch robots.txt",
+      {
+        url: robotsUrl,
+        ...logDetails,
+      },
+      "robots",
+    );
+    return null;
+  }
+
   async awaitPageExtraDelay(opts: WorkerState) {
     if (this.params.pageExtraDelay) {
       const {
@@ -2462,6 +2555,18 @@ self.__bx_behaviors.selectMainBehavior();
       return false;
     }
 
+    if (this.params.robots) {
+      const robots = await this.fetchAndParseRobots(url, logDetails);
+      if (robots && robots.isDisallowed(url, "Browsertrix/1.0")) {
+        logger.debug(
+          "Page URL not queued, disallowed by robots.txt",
+          { url, ...logDetails },
+          "links",
+        );
+        return false;
+      }
+    }
+
     const result = await this.crawlState.addToQueue(
       { url, seedId, depth, extraHops, ts, pageid },
       this.pageLimit,
diff --git a/src/util/argParser.ts b/src/util/argParser.ts
index cd64e8fd..0ea4a898 100644
--- a/src/util/argParser.ts
+++ b/src/util/argParser.ts
@@ -683,6 +683,13 @@ class ArgParser {
           "path to SSH known hosts file for SOCKS5 over SSH proxy connection",
         type: "string",
       },
+
+      robots: {
+        describe:
+          "If set, fetch and respect page disallows specified in per-host robots.txt",
+        type: "boolean",
+        default: false,
+      },
     });
   }
 
diff --git a/src/util/constants.ts b/src/util/constants.ts
index 15b00bd7..ebf83c57 100644
--- a/src/util/constants.ts
+++ b/src/util/constants.ts
@@ -41,6 +41,8 @@ export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
 export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
 
+export const ROBOTS_CACHE_LIMIT = 100;
+
 export type ExtractSelector = {
   selector: string;
   extract: string;
diff --git a/src/util/logger.ts b/src/util/logger.ts
index 7d10939e..4842aa22 100644
--- a/src/util/logger.ts
+++ b/src/util/logger.ts
@@ -57,6 +57,7 @@ export const LOG_CONTEXT_TYPES = [
   "replay",
   "proxy",
   "scope",
+  "robots",
 ] as const;
 
 export type LogContext = (typeof LOG_CONTEXT_TYPES)[number];
diff --git a/src/util/state.ts b/src/util/state.ts
index 9309116a..43978de6 100644
--- a/src/util/state.ts
+++ b/src/util/state.ts
@@ -3,7 +3,11 @@
 import { v4 as uuidv4 } from "uuid";
 
 import { logger } from "./logger.js";
-import { MAX_DEPTH, DEFAULT_MAX_RETRIES } from "./constants.js";
+import {
+  MAX_DEPTH,
+  DEFAULT_MAX_RETRIES,
+  ROBOTS_CACHE_LIMIT,
+} from "./constants.js";
 import { ScopedSeed } from "./seeds.js";
 import { Frame } from "puppeteer-core";
 import { interpolateFilename } from "./storage.js";
@@ -200,7 +204,10 @@ export class RedisCrawlState {
   fkey: string;
   ekey: string;
   bkey: string;
+  rkey: string;
+  lkey: string;
   pageskey: string;
+
   esKey: string;
   esMap: string;
 
@@ -233,6 +240,10 @@ export class RedisCrawlState {
     this.ekey = this.key + ":e";
     // crawler behavior script messages
     this.bkey = this.key + ":b";
+    // cached robots.txt bodies (per-origin)
this.key + ":r"; + // LRU cache of robots.txt keys + this.lkey = this.key + ":l"; // pages this.pageskey = this.key + ":pages"; @@ -1025,6 +1036,38 @@ return inx; return await this.redis.lpush(this.bkey, behaviorLog); } + async _updateRobotsAccessTime(robotsUrl: string) { + const accessTime = Date.now(); + await this.redis.zadd(this.lkey, accessTime, robotsUrl); + } + + async setCachedRobots(robotsUrl: string, body: string) { + await this._updateRobotsAccessTime(robotsUrl); + await this.redis.set(`${this.rkey}:${robotsUrl}`, body); + + // prune least-recently used items in zset and robots cache if over limit + const cacheCount = await this.redis.zcard(this.lkey); + if (cacheCount > ROBOTS_CACHE_LIMIT) { + const diff = cacheCount - ROBOTS_CACHE_LIMIT; + const keysToDelete = await this.redis.zrange(this.lkey, 0, diff - 1); + + for (const keyToDelete of keysToDelete) { + logger.debug( + "Deleting cached robots.txt, over cache limit", + { url: keyToDelete }, + "robots", + ); + await this.redis.del(`${this.rkey}:${keyToDelete}`); + await this.redis.zrem(this.lkey, keyToDelete); + } + } + } + + async getCachedRobots(robotsUrl: string) { + await this._updateRobotsAccessTime(robotsUrl); + return await this.redis.get(`${this.rkey}:${robotsUrl}`); + } + async writeToPagesQueue( data: Record, ) { diff --git a/tests/robots_txt.test.js b/tests/robots_txt.test.js new file mode 100644 index 00000000..43ffe197 --- /dev/null +++ b/tests/robots_txt.test.js @@ -0,0 +1,35 @@ +import child_process from "child_process"; + +test("test robots.txt is fetched and cached", async () => { + const res = child_process.execSync( + "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net/ --url https://webrecorder.net/ --scopeType page --robots --logging debug", + ); + + const log = res.toString(); + + // robots.txt not found + expect( + log.indexOf( + '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://specs.webrecorder.net/robots.txt"}}', + ) > 0, + ).toBe(true); + + expect( + log.indexOf( + '"logLevel":"debug","context":"robots","message":"Robots.txt not fetched","details":{"url":"https://specs.webrecorder.net/robots.txt","status":404}}', + ) > 0, + ).toBe(true); + + // robots.txt found and cached + expect( + log.indexOf( + '"logLevel":"debug","context":"robots","message":"Fetching robots.txt","details":{"url":"https://webrecorder.net/robots.txt"}}', + ) > 0, + ).toBe(true); + + expect( + log.indexOf( + '"logLevel":"debug","context":"robots","message":"Caching robots.txt body","details":{"url":"https://webrecorder.net/robots.txt"}}', + ) > 0, + ).toBe(true); +});