
Commit 5a867bc

janbuchar and barjin authored
feat: Report links skipped because of various filter conditions (#3026)
Closes #3016.

- [x] consider also reporting links skipped due to after-redirect checks

Co-authored-by: Jindřich Bär <jindrichbar@gmail.com>
1 parent 6cbdb8a commit 5a867bc


6 files changed, +101 -34 lines changed

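To illustrate the user-facing effect of this change, here is a minimal sketch of the crawler-level option; the crawler class, globs, and handler body are illustrative and not part of this commit:

```ts
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    async requestHandler({ enqueueLinks }) {
        await enqueueLinks({ globs: ['https://example.com/docs/**'] });
    },
    // After this change, `reason` can also be 'filters' or 'redirect',
    // in addition to the previously reported 'robotsTxt' and 'limit'.
    async onSkippedRequest({ url, reason }) {
        console.log(`Skipped ${url} (reason: ${reason})`);
    },
});

await crawler.run(['https://example.com/']);
```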

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 5 additions & 1 deletion
@@ -354,7 +354,11 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw

     /**
      * When a request is skipped for some reason, you can use this callback to act on it.
-     * This is currently fired only for requests skipped based on robots.txt file.
+     * This is currently fired for requests skipped
+     * 1. based on robots.txt file,
+     * 2. because they don't match enqueueLinks filters,
+     * 3. because they are redirected to a URL that doesn't match the enqueueLinks strategy,
+     * 4. or because the {@apilink BasicCrawlerOptions.maxRequestsPerCrawl|`maxRequestsPerCrawl`} limit has been reached
      */
     onSkippedRequest?: SkippedRequestCallback;
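A hedged sketch of a handler that branches on the four reasons enumerated in the doc comment above; the counter object is made up for illustration, and the reason union mirrors `SkippedRequestReason` added in `shared.ts` further down:

```ts
// Local copy of the union added in shared.ts below, so the example stands alone.
type SkippedRequestReason = 'robotsTxt' | 'limit' | 'filters' | 'redirect';

// Hypothetical bookkeeping, purely illustrative.
const skipCounts: Partial<Record<SkippedRequestReason, number>> = {};

async function onSkippedRequest({ url, reason }: { url: string; reason: SkippedRequestReason }) {
    skipCounts[reason] = (skipCounts[reason] ?? 0) + 1;
    if (reason === 'redirect') {
        console.warn(`Skipped after redirecting outside the enqueue strategy: ${url}`);
    }
}
```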

packages/browser-crawler/src/internals/browser-crawler.ts

Lines changed: 2 additions & 0 deletions
@@ -573,6 +573,8 @@ export abstract class BrowserCrawler<
            request.noRetry = true;
            request.state = RequestState.SKIPPED;

+           await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
+
            return;
        }

packages/core/src/enqueue_links/enqueue_links.ts

Lines changed: 60 additions & 24 deletions
@@ -1,4 +1,4 @@
-import type { Awaitable, BatchAddRequestsResult, Dictionary } from '@crawlee/types';
+import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
 import { type RobotsTxtFile } from '@crawlee/utils';
 import ow from 'ow';
 import { getDomain } from 'tldts';

@@ -13,7 +13,15 @@ import type {
     RequestProvider,
     RequestQueueOperationOptions,
 } from '../storages';
-import type { GlobInput, PseudoUrlInput, RegExpInput, RequestTransform, UrlPatternObject } from './shared';
+import type {
+    GlobInput,
+    PseudoUrlInput,
+    RegExpInput,
+    RequestTransform,
+    SkippedRequestCallback,
+    SkippedRequestReason,
+    UrlPatternObject,
+} from './shared';
 import {
     constructGlobObjectsFromGlobs,
     constructRegExpObjectsFromPseudoUrls,

@@ -23,8 +31,6 @@ import {
     filterRequestsByPatterns,
 } from './shared';

-export type SkippedRequestCallback = (args: { url: string; reason: 'robotsTxt' | 'limit' }) => Awaitable<void>;
-
 export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
     /** Limit the amount of actually enqueued URLs to this number. Useful for testing across the entire crawling scope. */
     limit?: number;

@@ -175,7 +181,10 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {

     /**
      * When a request is skipped for some reason, you can use this callback to act on it.
-     * This is currently fired only for requests skipped based on robots.txt file.
+     * This is currently fired for requests skipped
+     * 1. based on robots.txt file,
+     * 2. because they don't match enqueueLinks filters,
+     * 3. or because the maxRequestsPerCrawl limit has been reached
      */
     onSkippedRequest?: SkippedRequestCallback;
 }

@@ -392,6 +401,16 @@ export async function enqueueLinks(
        }
    }

+    async function reportSkippedRequests(skippedRequests: { url: string }[], reason: SkippedRequestReason) {
+        if (onSkippedRequest && skippedRequests.length > 0) {
+            await Promise.all(
+                skippedRequests.map((request) => {
+                    return onSkippedRequest({ url: request.url, reason });
+                }),
+            );
+        }
+    }
+
    let requestOptions = createRequestOptions(urls, options);

    if (robotsTxtFile) {

@@ -406,25 +425,37 @@ export async function enqueueLinks(
            return false;
        });

-        if (onSkippedRequest && skippedRequests.length > 0) {
-            await Promise.all(
-                skippedRequests.map((request) => {
-                    return onSkippedRequest({ url: request.url, reason: 'robotsTxt' });
-                }),
-            );
-        }
+        await reportSkippedRequests(skippedRequests, 'robotsTxt');
    }

    if (transformRequestFunction) {
+        const skippedRequests: RequestOptions[] = [];
+
        requestOptions = requestOptions
-            .map((request) => transformRequestFunction(request))
-            .filter((r) => !!r) as RequestOptions[];
+            .map((request) => {
+                const transformedRequest = transformRequestFunction(request);
+                if (!transformedRequest) {
+                    skippedRequests.push(request);
+                }
+                return transformedRequest;
+            })
+            .filter((r) => Boolean(r)) as RequestOptions[];
+
+        await reportSkippedRequests(skippedRequests, 'filters');
    }

-    function createFilteredRequests() {
+    async function createFilteredRequests() {
+        const skippedRequests: string[] = [];
+
        // No user provided patterns means we can skip an extra filtering step
        if (urlPatternObjects.length === 0) {
-            return createRequests(requestOptions, enqueueStrategyPatterns, urlExcludePatternObjects, options.strategy);
+            return createRequests(
+                requestOptions,
+                enqueueStrategyPatterns,
+                urlExcludePatternObjects,
+                options.strategy,
+                (url) => skippedRequests.push(url),
+            );
        }

        // Generate requests based on the user patterns first

@@ -433,19 +464,24 @@ export async function enqueueLinks(
            urlPatternObjects,
            urlExcludePatternObjects,
            options.strategy,
+            (url) => skippedRequests.push(url),
        );
        // ...then filter them by the enqueue links strategy (making this an AND check)
-        return filterRequestsByPatterns(generatedRequestsFromUserFilters, enqueueStrategyPatterns);
+        const filtered = filterRequestsByPatterns(generatedRequestsFromUserFilters, enqueueStrategyPatterns, (url) =>
+            skippedRequests.push(url),
+        );
+
+        await reportSkippedRequests(
+            skippedRequests.map((url) => ({ url })),
+            'filters',
+        );
+
+        return filtered;
    }

-    let requests = createFilteredRequests();
+    let requests = await createFilteredRequests();
    if (limit && limit < requests.length) {
-        if (onSkippedRequest) {
-            for (const request of requests.slice(limit)) {
-                await onSkippedRequest({ url: request.url, reason: 'limit' });
-            }
-        }
-
+        await reportSkippedRequests(requests.slice(limit), 'limit');
        requests = requests.slice(0, limit);
    }
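For context, a sketch of the per-call form that this file's new `reportSkippedRequests()` helper feeds: `onSkippedRequest` passed directly in the `enqueueLinks()` options. The crawler class, globs, and limit below are illustrative only:

```ts
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    async requestHandler({ enqueueLinks, log }) {
        await enqueueLinks({
            globs: ['https://example.com/blog/**'],
            limit: 50,
            // Fired for links dropped by robots.txt, by the glob filters above,
            // or by the `limit` cap, as wired through reportSkippedRequests() in this diff.
            async onSkippedRequest({ url, reason }) {
                log.info(`enqueueLinks skipped ${url}: ${reason}`);
            },
        });
    },
});
```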

packages/core/src/enqueue_links/shared.ts

Lines changed: 30 additions & 9 deletions
@@ -1,5 +1,6 @@
 import { URL } from 'node:url';

+import type { Awaitable } from '@crawlee/types';
 import { minimatch } from 'minimatch';

 import { purlToRegExp } from '@apify/pseudo_url';

@@ -46,6 +47,13 @@ export type RegExpObject = { regexp: RegExp } & Pick<

 export type RegExpInput = RegExp | RegExpObject;

+export type SkippedRequestReason = 'robotsTxt' | 'limit' | 'filters' | 'redirect';
+
+export type SkippedRequestCallback = (args: {
+    url: string;
+    reason: SkippedRequestReason;
+}) => Awaitable<void>;
+
 /**
  * @ignore
  */

@@ -166,14 +174,21 @@ export function createRequests(
    urlPatternObjects?: UrlPatternObject[],
    excludePatternObjects: UrlPatternObject[] = [],
    strategy?: EnqueueLinksOptions['strategy'],
+    onSkippedUrl?: (url: string) => void,
 ): Request[] {
    return requestOptions
        .map((opts) => ({ url: typeof opts === 'string' ? opts : opts.url, opts }))
        .filter(({ url }) => {
-            return !excludePatternObjects.some((excludePatternObject) => {
+            const matchesExcludePatterns = excludePatternObjects.some((excludePatternObject) => {
                const { regexp, glob } = excludePatternObject;
                return (regexp && url.match(regexp)) || (glob && minimatch(url, glob, { nocase: true }));
            });
+
+            if (matchesExcludePatterns) {
+                onSkippedUrl?.(url);
+            }
+
+            return !matchesExcludePatterns;
        })
        .map(({ url, opts }) => {
            if (!urlPatternObjects || !urlPatternObjects.length) {

@@ -193,27 +208,33 @@ export function createRequests(
            }

            // didn't match any positive pattern
+            onSkippedUrl?.(url);
            return null;
        })
        .filter((request) => request) as Request[];
}

-export function filterRequestsByPatterns(requests: Request[], patterns?: UrlPatternObject[]): Request[] {
+export function filterRequestsByPatterns(
+    requests: Request[],
+    patterns?: UrlPatternObject[],
+    onSkippedUrl?: (url: string) => void,
+): Request[] {
    if (!patterns?.length) {
        return requests;
    }

    const filtered: Request[] = [];

    for (const request of requests) {
-        for (const urlPatternObject of patterns) {
-            const { regexp, glob } = urlPatternObject;
+        const matchingPattern = patterns.find(
+            ({ regexp, glob }) =>
+                (regexp && request.url.match(regexp)) || (glob && minimatch(request.url, glob, { nocase: true })),
+        );

-            if ((regexp && request.url.match(regexp)) || (glob && minimatch(request.url, glob, { nocase: true }))) {
-                filtered.push(request);
-                // Break the pattern loop, as we already matched this request once
-                break;
-            }
+        if (matchingPattern !== undefined) {
+            filtered.push(request);
+        } else {
+            onSkippedUrl?.(request.url);
        }
    }

packages/http-crawler/src/internals/http-crawler.ts

Lines changed: 2 additions & 0 deletions
@@ -557,6 +557,8 @@ export class HttpCrawler<
            request.noRetry = true;
            request.state = RequestState.SKIPPED;

+           await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
+
            return;
        }

packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts

Lines changed: 2 additions & 0 deletions
@@ -600,6 +600,8 @@ export class AdaptivePlaywrightCrawler extends PlaywrightCrawler {
            request.noRetry = true;
            request.state = RequestState.SKIPPED;

+           await this.handleSkippedRequest({ url: request.url, reason: 'redirect' });
+
            return;
        }
