Skip to content

Commit 340252a

Browse files
authored
fix: Fix skipNavigation behavior in HttpCrawler subclasses (#3381)
closes #3304
1 parent 7c3ba07 commit 340252a

File tree

9 files changed

+274
-108
lines changed

9 files changed

+274
-108
lines changed

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ import {
4444
KeyValueStore,
4545
LogLevel,
4646
mergeCookies,
47+
NavigationSkippedError,
4748
NonRetryableError,
4849
purgeDefaultStorages,
4950
RequestHandlerError,
@@ -914,13 +915,13 @@ export class BasicCrawler<
914915
try {
915916
await this.basicContextPipeline
916917
.chain(this.contextPipeline)
917-
.call(crawlingContext, (ctx) => this.handleRequest(ctx, source));
918+
.call(crawlingContext, (ctx) => this.handleRequest(ctx, source, request));
918919
} catch (error) {
919920
// ContextPipelineInterruptedError means the request was intentionally skipped
920921
// (e.g., doesn't match enqueue strategy after redirect). Just return gracefully.
921922
if (error instanceof ContextPipelineInterruptedError) {
922923
await this._timeoutAndRetry(
923-
async () => this.requestManager?.markRequestHandled(crawlingContext.request!),
924+
async () => this.requestManager?.markRequestHandled(request),
924925
this.internalTimeoutMillis,
925926
`Marking request ${crawlingContext.request.url} (${crawlingContext.request.id}) as handled timed out after ${
926927
this.internalTimeoutMillis / 1e3
@@ -939,6 +940,7 @@ export class BasicCrawler<
939940
await this._requestFunctionErrorHandler(
940941
unwrappedError,
941942
crawlingContext as CrawlingContext,
943+
request,
942944
this.requestManager!,
943945
);
944946
crawlingContext.session?.markBad();
@@ -1838,9 +1840,7 @@ export class BasicCrawler<
18381840
}
18391841

18401842
/** Handles a single request - runs the request handler with retries, error handling, and lifecycle management. */
1841-
protected async handleRequest(crawlingContext: ExtendedContext, requestSource: IRequestManager) {
1842-
const { request } = crawlingContext;
1843-
1843+
protected async handleRequest(crawlingContext: ExtendedContext, requestSource: IRequestManager, request: Request) {
18441844
const statisticsId = request.id || request.uniqueKey;
18451845
this.stats.startJob(statisticsId);
18461846

@@ -1871,7 +1871,7 @@ export class BasicCrawler<
18711871
try {
18721872
request.state = RequestState.ERROR_HANDLER;
18731873
await addTimeoutToPromise(
1874-
async () => this._requestFunctionErrorHandler(err, crawlingContext, requestSource),
1874+
async () => this._requestFunctionErrorHandler(err, crawlingContext, request, requestSource),
18751875
this.internalTimeoutMillis,
18761876
`Handling request failure of ${request.url} (${request.id}) timed out after ${
18771877
this.internalTimeoutMillis / 1e3
@@ -2052,13 +2052,15 @@ export class BasicCrawler<
20522052

20532053
/**
20542054
* Handles errors thrown by user provided requestHandler()
2055+
*
2056+
* @param request The request object, passed separately to circumvent potential dynamic logic in crawlingContext.request
20552057
*/
20562058
protected async _requestFunctionErrorHandler(
20572059
error: Error,
20582060
crawlingContext: CrawlingContext,
2061+
request: Request,
20592062
source: IRequestList | IRequestManager,
20602063
): Promise<void> {
2061-
const { request } = crawlingContext;
20622064
request.pushErrorMessage(error);
20632065

20642066
if (error instanceof CriticalError) {
@@ -2256,6 +2258,18 @@ export class BasicCrawler<
22562258
}
22572259

22582260
private requestMatchesEnqueueStrategy(request: Request) {
2261+
// If `skipNavigation` was used, just return `true`
2262+
try {
2263+
// eslint-disable-next-line @typescript-eslint/no-unused-expressions
2264+
request.loadedUrl;
2265+
} catch (err) {
2266+
if (err instanceof NavigationSkippedError) {
2267+
return true;
2268+
}
2269+
2270+
throw err;
2271+
}
2272+
22592273
const { url, loadedUrl } = request;
22602274

22612275
// eslint-disable-next-line dot-notation -- private access

packages/browser-crawler/src/internals/browser-crawler.ts

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import {
2020
enqueueLinks,
2121
EVENT_SESSION_RETIRED,
2222
handleRequestTimeout,
23+
NavigationSkippedError,
2324
RequestState,
2425
resolveBaseUrlForEnqueueLinksFiltering,
2526
SessionError,
@@ -508,15 +509,17 @@ export abstract class BrowserCrawler<
508509
request: new Proxy(crawlingContext.request, {
509510
get(target, propertyName, receiver) {
510511
if (propertyName === 'loadedUrl') {
511-
throw new Error(
512+
throw new NavigationSkippedError(
512513
'The `request.loadedUrl` property is not available - `skipNavigation` was used',
513514
);
514515
}
515516
return Reflect.get(target, propertyName, receiver);
516517
},
517518
}) as LoadedRequest<Request>,
518519
get response(): Response {
519-
throw new Error('The `response` property is not available - `skipNavigation` was used');
520+
throw new NavigationSkippedError(
521+
'The `response` property is not available - `skipNavigation` was used',
522+
);
520523
},
521524
};
522525
}

packages/cheerio-crawler/src/internals/cheerio-crawler.ts

Lines changed: 40 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,13 @@ import type {
1111
RouterRoutes,
1212
SkippedRequestCallback,
1313
} from '@crawlee/http';
14-
import { enqueueLinks, HttpCrawler, resolveBaseUrlForEnqueueLinksFiltering, Router } from '@crawlee/http';
14+
import {
15+
enqueueLinks,
16+
HttpCrawler,
17+
NavigationSkippedError,
18+
resolveBaseUrlForEnqueueLinksFiltering,
19+
Router,
20+
} from '@crawlee/http';
1521
import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
1622
import { type CheerioRoot, extractUrlsFromCheerio, type RobotsTxtFile } from '@crawlee/utils';
1723
import type { CheerioAPI, CheerioOptions } from 'cheerio';
@@ -194,19 +200,40 @@ export class CheerioCrawler<
194200
}
195201

196202
private async parseContent(crawlingContext: InternalHttpCrawlingContext) {
197-
const isXml = crawlingContext.contentType.type.includes('xml');
198-
const body = Buffer.isBuffer(crawlingContext.body)
199-
? crawlingContext.body.toString(crawlingContext.contentType.encoding)
200-
: crawlingContext.body;
201-
const dom = parseDocument(body, { decodeEntities: true, xmlMode: isXml });
202-
const $ = cheerio.load(dom, {
203-
xml: { decodeEntities: true, xmlMode: isXml },
204-
} as CheerioOptions);
203+
try {
204+
const isXml = crawlingContext.contentType.type.includes('xml');
205+
const body = Buffer.isBuffer(crawlingContext.body)
206+
? crawlingContext.body.toString(crawlingContext.contentType.encoding)
207+
: crawlingContext.body;
208+
const dom = parseDocument(body, { decodeEntities: true, xmlMode: isXml });
209+
const $ = cheerio.load(dom, {
210+
xml: { decodeEntities: true, xmlMode: isXml },
211+
} as CheerioOptions);
205212

206-
return {
207-
$,
208-
body,
209-
};
213+
return {
214+
$,
215+
body,
216+
};
217+
} catch (err) {
218+
if (err instanceof NavigationSkippedError) {
219+
return {
220+
get body(): string {
221+
throw new NavigationSkippedError(
222+
'The `body` property is not available - `skipNavigation` was used',
223+
{ cause: err },
224+
);
225+
},
226+
get $(): CheerioAPI {
227+
throw new NavigationSkippedError(
228+
'The `$` property is not available - `skipNavigation` was used',
229+
{ cause: err },
230+
);
231+
},
232+
};
233+
}
234+
235+
throw err;
236+
}
210237
}
211238

212239
private async addHelpers(crawlingContext: InternalHttpCrawlingContext & { $: CheerioAPI }) {

packages/core/src/errors.ts

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,3 +78,9 @@ export class ServiceConflictError extends Error {
7878
);
7979
}
8080
}
81+
82+
/**
83+
* Thrown by crawlers when `skipNavigation` is used on a request.
84+
* Subclasses can catch this error to skip their own navigation-dependent logic.
85+
*/
86+
export class NavigationSkippedError extends NonRetryableError {}

packages/core/src/request.ts

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -275,12 +275,24 @@ class CrawleeRequest<UserData extends Dictionary = Dictionary> {
275275
});
276276
}
277277

278-
/** Tells the crawler processing this request to skip the navigation and process the request directly. */
278+
/**
279+
* Tells the crawler processing this request to skip the navigation and process the request directly.
280+
*
281+
* When this is set to `true`, the crawling context will not contain the results of the navigation
282+
* (e.g. `response`, `body`, `contentType`, `$` or `request.loadedUrl`).
283+
* Accessing these properties will throw a {@apilink NavigationSkippedError} at runtime.
284+
*/
279285
get skipNavigation(): boolean {
280286
return this.userData.__crawlee?.skipNavigation ?? false;
281287
}
282288

283-
/** Tells the crawler processing this request to skip the navigation and process the request directly. */
289+
/**
290+
* Tells the crawler processing this request to skip the navigation and process the request directly.
291+
*
292+
* When this is set to `true`, the crawling context will not contain the results of the navigation
293+
* (e.g. `response`, `body`, `contentType`, `$` or `request.loadedUrl`).
294+
* Accessing these properties will throw a {@apilink NavigationSkippedError} at runtime.
295+
*/
284296
set skipNavigation(value: boolean) {
285297
if (!this.userData.__crawlee) {
286298
(this.userData as Dictionary).__crawlee = { skipNavigation: value };
@@ -548,6 +560,10 @@ export interface RequestOptions<UserData extends Dictionary = Dictionary> {
548560
/**
549561
* If set to `true` then the crawler processing this request evaluates
550562
* the `requestHandler` immediately without prior browser navigation.
563+
*
564+
* When enabled, the crawling context will not contain the results of the navigation
565+
* (e.g. `response`, `body`, `contentType`, `$` or `request.loadedUrl`).
566+
* Accessing these properties will throw a {@apilink NavigationSkippedError} at runtime.
551567
* @default false
552568
*/
553569
skipNavigation?: boolean;

packages/http-crawler/src/internals/http-crawler.ts

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,15 @@ import type {
1313
RouterRoutes,
1414
Session,
1515
} from '@crawlee/basic';
16-
import { BasicCrawler, ContextPipeline, mergeCookies, RequestState, Router, SessionError } from '@crawlee/basic';
16+
import {
17+
BasicCrawler,
18+
ContextPipeline,
19+
mergeCookies,
20+
NavigationSkippedError,
21+
RequestState,
22+
Router,
23+
SessionError,
24+
} from '@crawlee/basic';
1725
import type { LoadedRequest } from '@crawlee/core';
1826
import { ResponseWithUrl } from '@crawlee/http-client';
1927
import type { Awaitable, Dictionary } from '@crawlee/types';
@@ -392,15 +400,17 @@ export class HttpCrawler<
392400
request: new Proxy(request, {
393401
get(target, propertyName, receiver) {
394402
if (propertyName === 'loadedUrl') {
395-
throw new Error(
403+
throw new NavigationSkippedError(
396404
'The `request.loadedUrl` property is not available - `skipNavigation` was used',
397405
);
398406
}
399407
return Reflect.get(target, propertyName, receiver);
400408
},
401409
}) as LoadedRequest<CrawleeRequest>,
402410
get response(): InternalHttpCrawlingContext['response'] {
403-
throw new Error('The `response` property is not available - `skipNavigation` was used');
411+
throw new NavigationSkippedError(
412+
'The `response` property is not available - `skipNavigation` was used',
413+
);
404414
},
405415
};
406416
}
@@ -440,19 +450,29 @@ export class HttpCrawler<
440450
if (crawlingContext.request.skipNavigation) {
441451
return {
442452
get contentType(): InternalHttpCrawlingContext['contentType'] {
443-
throw new Error('The `contentType` property is not available - `skipNavigation` was used');
453+
throw new NavigationSkippedError(
454+
'The `contentType` property is not available - `skipNavigation` was used',
455+
);
444456
},
445457
get body(): InternalHttpCrawlingContext['body'] {
446-
throw new Error('The `body` property is not available - `skipNavigation` was used');
458+
throw new NavigationSkippedError(
459+
'The `body` property is not available - `skipNavigation` was used',
460+
);
447461
},
448462
get json(): InternalHttpCrawlingContext['json'] {
449-
throw new Error('The `json` property is not available - `skipNavigation` was used');
463+
throw new NavigationSkippedError(
464+
'The `json` property is not available - `skipNavigation` was used',
465+
);
450466
},
451467
get waitForSelector(): InternalHttpCrawlingContext['waitForSelector'] {
452-
throw new Error('The `waitForSelector` method is not available - `skipNavigation` was used');
468+
throw new NavigationSkippedError(
469+
'The `waitForSelector` method is not available - `skipNavigation` was used',
470+
);
453471
},
454472
get parseWithCheerio(): InternalHttpCrawlingContext['parseWithCheerio'] {
455-
throw new Error('The `parseWithCheerio` method is not available - `skipNavigation` was used');
473+
throw new NavigationSkippedError(
474+
'The `parseWithCheerio` method is not available - `skipNavigation` was used',
475+
);
456476
},
457477
};
458478
}

0 commit comments

Comments (0)