diff --git a/src/crawler.ts b/src/crawler.ts
index 61c7516f3..d992e17d8 100644
--- a/src/crawler.ts
+++ b/src/crawler.ts
@@ -47,6 +47,7 @@ import {
   ExitCodes,
   InterruptReason,
   BxFunctionBindings,
+  SEED_REDIRECT_ADD_DELAY,
 } from "./util/constants.js";
 
 import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js";
@@ -592,7 +593,14 @@ export class Crawler {
   extraChromeArgs() {
     const args = [];
     if (this.params.lang) {
-      args.push(`--accept-lang=${this.params.lang}`);
+      if (this.params.profile) {
+        logger.warn(
+          "Ignoring --lang option with profile, using language configured in the profile",
+          { lang: this.params.lang },
+        );
+      } else {
+        args.push(`--accept-lang=${this.params.lang}`);
+      }
     }
     return args;
   }
@@ -2123,6 +2131,8 @@ self.__bx_behaviors.selectMainBehavior();
     const respUrl = resp.url().split("#")[0];
     const isChromeError = page.url().startsWith("chrome-error://");
 
+    let thisPageDelay = 0;
+    let originalSeedId = null;
 
     if (
       depth === 0 &&
@@ -2131,6 +2141,7 @@ respUrl !== url &&
       respUrl + "/" !== url &&
       !downloadResponse
     ) {
+      originalSeedId = data.seedId;
       data.seedId = await this.crawlState.addExtraSeed(
         this.seeds,
         this.numOriginalSeeds,
@@ -2142,6 +2153,7 @@
         newUrl: respUrl,
         seedId: data.seedId,
       });
+      thisPageDelay = SEED_REDIRECT_ADD_DELAY;
     }
 
     const status = resp.status();
@@ -2228,7 +2240,7 @@ self.__bx_behaviors.selectMainBehavior();
 
     await this.netIdle(page, logDetails);
 
-    await this.awaitPageLoad(page.mainFrame(), logDetails);
+    await this.awaitPageLoad(page.mainFrame(), thisPageDelay, logDetails);
 
     // skip extraction if at max depth
     if (seed.isAtMaxDepth(depth, extraHops)) {
@@ -2242,6 +2254,27 @@ self.__bx_behaviors.selectMainBehavior();
       "links",
     );
 
+    const pageUrl = page.url().split("#")[0];
+
+    if (depth === 0 && respUrl !== urlNoHash) {
+      if (pageUrl === urlNoHash && originalSeedId !== null) {
+        logger.info("Seed page redirected back to original seed", { pageUrl });
+        data.seedId = originalSeedId;
+      } else {
+        data.seedId = await this.crawlState.addExtraSeed(
+          this.seeds,
+          this.numOriginalSeeds,
+          data.seedId,
+          pageUrl,
+        );
+        logger.info("Seed page redirected, adding redirected seed", {
+          origUrl: respUrl,
+          newUrl: pageUrl,
+          seedId: data.seedId,
+        });
+      }
+    }
+
     await this.extractLinks(page, data, this.params.selectLinks, logDetails);
   }
 
@@ -2263,7 +2296,7 @@
     }
   }
 
-  async awaitPageLoad(frame: Frame, logDetails: LogDetails) {
+  async awaitPageLoad(frame: Frame, tempDelay: number, logDetails: LogDetails) {
     if (this.params.behaviorOpts) {
       try {
         await timedRun(
@@ -2279,11 +2312,13 @@
       }
     }
 
-    if (this.params.postLoadDelay) {
+    const delay = tempDelay + this.params.postLoadDelay;
+
+    if (delay) {
       logger.info("Awaiting post load delay", {
-        seconds: this.params.postLoadDelay,
+        seconds: delay,
       });
-      await sleep(this.params.postLoadDelay);
+      await sleep(delay);
     }
   }
 
diff --git a/src/replaycrawler.ts b/src/replaycrawler.ts
index 75abfc4ef..aa50a6cfd 100644
--- a/src/replaycrawler.ts
+++ b/src/replaycrawler.ts
@@ -450,7 +450,7 @@ export class ReplayCrawler extends Crawler {
     // optionally reload (todo: reevaluate if this is needed)
     // await page.reload();
 
-    await this.awaitPageLoad(replayFrame, logDetails);
+    await this.awaitPageLoad(replayFrame, 0, logDetails);
 
     data.isHTMLPage = true;
 
diff --git a/src/util/constants.ts b/src/util/constants.ts
index 0f75df739..a2d4eab78 100644
--- a/src/util/constants.ts
+++ b/src/util/constants.ts
@@ -38,6 +38,7 @@ export const DEFAULT_MAX_RETRIES = 2;
 export const FETCH_HEADERS_TIMEOUT_SECS = 30;
 export const PAGE_OP_TIMEOUT_SECS = 5;
 export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30;
+export const SEED_REDIRECT_ADD_DELAY = 20;
 
 export type ExtractSelector = {
   selector: string;
diff --git a/src/util/recorder.ts b/src/util/recorder.ts
index bbaffa436..699db6a8f 100644
--- a/src/util/recorder.ts
+++ b/src/util/recorder.ts
@@ -507,7 +507,7 @@ export class Recorder extends EventEmitter {
       return;
     }
 
-    this.serializeToWARC(reqresp).catch((e) =>
+    this.serializeToWARC(reqresp, true).catch((e) =>
       logger.warn("Error Serializing to WARC", e, "recorder"),
     );
   }
@@ -1327,7 +1327,7 @@
     return reqresp;
   }
 
-  async serializeToWARC(reqresp: RequestResponseInfo) {
+  async serializeToWARC(reqresp: RequestResponseInfo, fromFinished = false) {
     // always include in pageinfo record if going to serialize to WARC
     // even if serialization does not happen
     this.addPageRecord(reqresp);
@@ -1371,6 +1371,15 @@
 
     const requestRecord = createRequest(reqresp, responseRecord, this.pageid);
     this.writer.writeRecordPair(responseRecord, requestRecord);
+
+    // edge case: from a finished response load, if the page response has no
+    // mime type or a non-200 status (possibly a captcha/sso page), allow it to be captured again
+    if (
+      fromFinished && url === this.pageUrl &&
+      (!reqresp.getMimeType() || status !== 200)
+    ) {
+      await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status);
+    }
   }
 
   async directFetchCapture({
@@ -1404,7 +1413,7 @@
       mime = ct.split(";")[0];
     }
 
-    const result = !isHTMLMime(mime);
+    const result = !!mime && !isHTMLMime(mime);
 
     if (result) {
       logger.info(
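To make the delay change above easier to follow, here is a minimal standalone sketch of how the new `tempDelay` parameter of `awaitPageLoad` composes with the configured post-load delay. The `applyPostLoadDelay` helper and inline `sleep` are hypothetical stand-ins for the crawler's own method and its timing helper; the constant mirrors the one added to `src/util/constants.ts`.

```ts
import { setTimeout as sleepMs } from "node:timers/promises";

// Mirrors the constant added in src/util/constants.ts (in seconds).
const SEED_REDIRECT_ADD_DELAY = 20;

// Hypothetical stand-in for the crawler's sleep(seconds) helper.
async function sleep(seconds: number): Promise<void> {
  await sleepMs(seconds * 1000);
}

// Sketch of the tail of awaitPageLoad(): the temporary per-page delay
// (non-zero only for a seed page that was just redirected) is added on top
// of the user-configured --postLoadDelay, so redirected seeds get extra
// settle time while ordinary pages keep the usual behavior.
async function applyPostLoadDelay(tempDelay: number, postLoadDelay: number) {
  const delay = tempDelay + postLoadDelay;
  if (delay) {
    console.log("Awaiting post load delay", { seconds: delay });
    await sleep(delay);
  }
}

// A redirected seed page waits SEED_REDIRECT_ADD_DELAY plus any configured
// delay; every other page (tempDelay = 0) waits only the configured delay.
await applyPostLoadDelay(SEED_REDIRECT_ADD_DELAY, 5); // waits 25s
await applyPostLoadDelay(0, 5); // waits 5s
```

In the patch itself, `thisPageDelay` is what feeds `tempDelay`: it is set to `SEED_REDIRECT_ADD_DELAY` only when a seed redirect is recorded in `crawlPage`, giving a slow redirect chain (or an interstitial such as a captcha/SSO page) time to settle before link extraction, while `ReplayCrawler` passes `0` to keep replay behavior unchanged.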