1- import type { Awaitable , BatchAddRequestsResult , Dictionary } from '@crawlee/types' ;
1+ import type { BatchAddRequestsResult , Dictionary } from '@crawlee/types' ;
22import { type RobotsTxtFile } from '@crawlee/utils' ;
33import ow from 'ow' ;
44import { getDomain } from 'tldts' ;
@@ -13,7 +13,15 @@ import type {
1313 RequestProvider ,
1414 RequestQueueOperationOptions ,
1515} from '../storages' ;
16- import type { GlobInput , PseudoUrlInput , RegExpInput , RequestTransform , UrlPatternObject } from './shared' ;
16+ import type {
17+ GlobInput ,
18+ PseudoUrlInput ,
19+ RegExpInput ,
20+ RequestTransform ,
21+ SkippedRequestCallback ,
22+ SkippedRequestReason ,
23+ UrlPatternObject ,
24+ } from './shared' ;
1725import {
1826 constructGlobObjectsFromGlobs ,
1927 constructRegExpObjectsFromPseudoUrls ,
@@ -23,8 +31,6 @@ import {
2331 filterRequestsByPatterns ,
2432} from './shared' ;
2533
26- export type SkippedRequestCallback = ( args : { url : string ; reason : 'robotsTxt' | 'limit' } ) => Awaitable < void > ;
27-
2834export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
2935 /** Limit the amount of actually enqueued URLs to this number. Useful for testing across the entire crawling scope. */
3036 limit ?: number ;
@@ -175,7 +181,10 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
175181
176182 /**
177183 * When a request is skipped for some reason, you can use this callback to act on it.
178- * This is currently fired only for requests skipped based on robots.txt file.
184+ * This is currently fired for requests skipped
185+ * 1. based on robots.txt file,
186+ * 2. because they don't match enqueueLinks filters,
187+ * 3. or because the maxRequestsPerCrawl limit has been reached
179188 */
180189 onSkippedRequest ?: SkippedRequestCallback ;
181190}
@@ -392,6 +401,16 @@ export async function enqueueLinks(
392401 }
393402 }
394403
/**
 * Notifies the `onSkippedRequest` callback (taken from the enclosing scope) about skipped requests.
 *
 * Does nothing when no callback is set or when the list is empty. Otherwise the callback is
 * invoked once per entry with that entry's `url` and the shared `reason`, and everything the
 * callback returns is awaited together through `Promise.all`.
 *
 * @param skippedRequests Objects carrying the `url` of each skipped request.
 * @param reason Why the requests were skipped; forwarded unchanged to the callback.
 */
404+ async function reportSkippedRequests ( skippedRequests : { url : string } [ ] , reason : SkippedRequestReason ) {
405+ if ( onSkippedRequest && skippedRequests . length > 0 ) {
// The callbacks are started for all entries first and only then awaited as a group.
406+ await Promise . all (
407+ skippedRequests . map ( ( request ) => {
408+ return onSkippedRequest ( { url : request . url , reason } ) ;
409+ } ) ,
410+ ) ;
411+ }
412+ }
413+
395414 let requestOptions = createRequestOptions ( urls , options ) ;
396415
397416 if ( robotsTxtFile ) {
@@ -406,25 +425,37 @@ export async function enqueueLinks(
406425 return false ;
407426 } ) ;
408427
409- if ( onSkippedRequest && skippedRequests . length > 0 ) {
410- await Promise . all (
411- skippedRequests . map ( ( request ) => {
412- return onSkippedRequest ( { url : request . url , reason : 'robotsTxt' } ) ;
413- } ) ,
414- ) ;
415- }
428+ await reportSkippedRequests ( skippedRequests , 'robotsTxt' ) ;
416429 }
417430
418431 if ( transformRequestFunction ) {
432+ const skippedRequests : RequestOptions [ ] = [ ] ;
433+
419434 requestOptions = requestOptions
420- . map ( ( request ) => transformRequestFunction ( request ) )
421- . filter ( ( r ) => ! ! r ) as RequestOptions [ ] ;
435+ . map ( ( request ) => {
436+ const transformedRequest = transformRequestFunction ( request ) ;
437+ if ( ! transformedRequest ) {
438+ skippedRequests . push ( request ) ;
439+ }
440+ return transformedRequest ;
441+ } )
442+ . filter ( ( r ) => Boolean ( r ) ) as RequestOptions [ ] ;
443+
444+ await reportSkippedRequests ( skippedRequests , 'filters' ) ;
422445 }
423446
// Builds the requests to enqueue, collecting the URLs that the pattern filters reject along the way.
// NOTE(review): this span is diff output. Lines marked with a minus are the previous version, lines
// marked with a plus are the new one, and the hunk header in the middle hides part of the body.
424- function createFilteredRequests ( ) {
447+ async function createFilteredRequests ( ) {
// URLs handed to the skip callbacks passed to createRequests and filterRequestsByPatterns below.
448+ const skippedRequests : string [ ] = [ ] ;
449+
425450 // No user provided patterns means we can skip an extra filtering step
426451 if ( urlPatternObjects . length === 0 ) {
// NOTE(review): this early return exits before the reportSkippedRequests call further down, so URLs
// pushed into skippedRequests on this path do not appear to be reported to onSkippedRequest.
// Confirm whether that is intended; if not, report them before returning.
427- return createRequests ( requestOptions , enqueueStrategyPatterns , urlExcludePatternObjects , options . strategy ) ;
452+ return createRequests (
453+ requestOptions ,
454+ enqueueStrategyPatterns ,
455+ urlExcludePatternObjects ,
456+ options . strategy ,
457+ ( url ) => skippedRequests . push ( url ) ,
458+ ) ;
428459 }
429460
430461 // Generate requests based on the user patterns first
@@ -433,19 +464,24 @@ export async function enqueueLinks(
433464 urlPatternObjects ,
434465 urlExcludePatternObjects ,
435466 options . strategy ,
467+ ( url ) => skippedRequests . push ( url ) ,
436468 ) ;
437469 // ...then filter them by the enqueue links strategy (making this an AND check)
438- return filterRequestsByPatterns ( generatedRequestsFromUserFilters , enqueueStrategyPatterns ) ;
470+ const filtered = filterRequestsByPatterns ( generatedRequestsFromUserFilters , enqueueStrategyPatterns , ( url ) =>
471+ skippedRequests . push ( url ) ,
472+ ) ;
473+
// Report every URL collected above in a single batch, using the filters reason.
474+ await reportSkippedRequests (
475+ skippedRequests . map ( ( url ) => ( { url } ) ) ,
476+ 'filters' ,
477+ ) ;
478+
479+ return filtered ;
439480 }
440481
441- let requests = createFilteredRequests ( ) ;
482+ let requests = await createFilteredRequests ( ) ;
442483 if ( limit && limit < requests . length ) {
443- if ( onSkippedRequest ) {
444- for ( const request of requests . slice ( limit ) ) {
445- await onSkippedRequest ( { url : request . url , reason : 'limit' } ) ;
446- }
447- }
448-
484+ await reportSkippedRequests ( requests . slice ( limit ) , 'limit' ) ;
449485 requests = requests . slice ( 0 , limit ) ;
450486 }
451487
0 commit comments