Skip to content

Commit 9c0580b

Browse files
authored
fix: handle multiple BasicCrawler.stop() calls correctly (#3324)
`BasicCrawler.stop()` calls asynchronous functions without awaiting them, which can cause unexpected race conditions. This PR ensures that multiple `.stop()` calls only result in one `AutoscaledPool.abort()` call and that the `.stop()`-induced promises are resolved before the main `BasicCrawler.run()` call resolves. Closes #3257
1 parent 2037ea9 commit 9c0580b

File tree

1 file changed

+32
-14
lines changed

1 file changed

+32
-14
lines changed

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 32 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -536,6 +536,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
536536

537537
running = false;
538538
hasFinishedBefore = false;
539+
protected unexpectedStop = false;
539540

540541
readonly log: Log;
541542
protected requestHandler!: RequestHandler<Context>;
@@ -562,8 +563,8 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
562563
protected respectRobotsTxtFile: boolean | { userAgent?: string };
563564
protected onSkippedRequest?: SkippedRequestCallback;
564565
private _closeEvents?: boolean;
565-
private shouldLogMaxProcessedRequestsExceeded = true;
566566
private shouldLogMaxEnqueuedRequestsExceeded = true;
567+
private shouldLogShuttingDown = true;
567568
private experiments: CrawlerExperiments;
568569
private readonly robotsTxtFileCache: LruCache<RobotsTxtFile>;
569570
private _experimentWarnings: Partial<Record<keyof CrawlerExperiments, boolean>> = {};
@@ -810,12 +811,23 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
810811
runTaskFunction: this._runTaskFunction.bind(this),
811812
isTaskReadyFunction: async () => {
812813
if (isMaxPagesExceeded()) {
813-
if (this.shouldLogMaxProcessedRequestsExceeded) {
814+
if (this.shouldLogShuttingDown) {
814815
log.info(
815816
'Crawler reached the maxRequestsPerCrawl limit of ' +
816817
`${this.maxRequestsPerCrawl} requests and will shut down soon. Requests that are in progress will be allowed to finish.`,
817818
);
818-
this.shouldLogMaxProcessedRequestsExceeded = false;
819+
this.shouldLogShuttingDown = false;
820+
}
821+
return false;
822+
}
823+
824+
if (this.unexpectedStop) {
825+
if (this.shouldLogShuttingDown) {
826+
this.log.info(
827+
'No new requests are allowed because the `stop()` method has been called. ' +
828+
'Ongoing requests will be allowed to complete.',
829+
);
830+
this.shouldLogShuttingDown = false;
819831
}
820832
return false;
821833
}
@@ -829,6 +841,15 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
829841
'and all requests that were in progress at that time have now finished. ' +
830842
`In total, the crawler processed ${this.handledRequestsCount} requests and will shut down.`,
831843
);
844+
this.shouldLogShuttingDown = false;
845+
return true;
846+
}
847+
848+
if (this.unexpectedStop) {
849+
this.log.info(
850+
'The crawler has finished all the remaining ongoing requests and will shut down now.',
851+
);
852+
this.shouldLogShuttingDown = false;
832853
return true;
833854
}
834855

@@ -977,9 +998,10 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
977998
await this.sessionPool?.resetStore();
978999
}
9791000

1001+
this.unexpectedStop = false;
9801002
this.running = true;
981-
this.shouldLogMaxProcessedRequestsExceeded = true;
9821003
this.shouldLogMaxEnqueuedRequestsExceeded = true;
1004+
this.shouldLogShuttingDown = true;
9831005

9841006
await purgeDefaultStorages({
9851007
onlyPurgeOnce: true,
@@ -1079,16 +1101,12 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
10791101
*
10801102
* To stop the crawler immediately, use {@apilink BasicCrawler.teardown|`crawler.teardown()`} instead.
10811103
*/
1082-
stop(message = 'The crawler has been gracefully stopped.'): void {
1083-
// Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
1084-
this.autoscaledPool
1085-
?.pause()
1086-
// Resolves the `autoscaledPool.run()` promise in the `BasicCrawler.run()` method. Since the pool is already paused, it resolves immediately and doesn't kill any tasks.
1087-
.then(async () => this.autoscaledPool?.abort())
1088-
.then(() => this.log.info(message))
1089-
.catch((err) => {
1090-
this.log.error('An error occurred when stopping the crawler:', err);
1091-
});
1104+
stop(reason = 'The crawler has been gracefully stopped.'): void {
1105+
if (this.unexpectedStop) {
1106+
return;
1107+
}
1108+
this.log.info(reason);
1109+
this.unexpectedStop = true;
10921110
}
10931111

10941112
async getRequestQueue(): Promise<RequestProvider> {

0 commit comments

Comments (0)