Skip to content
Merged
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 16 additions & 9 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
protected retryOnBlocked: boolean;
protected respectRobotsTxtFile: boolean | { userAgent?: string };
protected onSkippedRequest?: SkippedRequestCallback;
protected stoppingPromise?: Promise<void>;
private _closeEvents?: boolean;
private shouldLogMaxProcessedRequestsExceeded = true;
private shouldLogMaxEnqueuedRequestsExceeded = true;
Expand Down Expand Up @@ -978,6 +979,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
}

this.running = true;
this.stoppingPromise = undefined as any;
this.shouldLogMaxProcessedRequestsExceeded = true;
this.shouldLogMaxEnqueuedRequestsExceeded = true;

Expand Down Expand Up @@ -1016,6 +1018,9 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
try {
await this.autoscaledPool!.run();
} finally {
if (this?.stoppingPromise) {
await this.stoppingPromise;
}
await this.teardown();
await this.stats.stopCapturing();

Expand Down Expand Up @@ -1080,15 +1085,17 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
* To stop the crawler immediately, use {@apilink BasicCrawler.teardown|`crawler.teardown()`} instead.
*/
stop(message = 'The crawler has been gracefully stopped.'): void {
// Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
this.autoscaledPool
?.pause()
// Resolves the `autoscaledPool.run()` promise in the `BasicCrawler.run()` method. Since the pool is already paused, it resolves immediately and doesn't kill any tasks.
.then(async () => this.autoscaledPool?.abort())
.then(() => this.log.info(message))
.catch((err) => {
this.log.error('An error occurred when stopping the crawler:', err);
});
if (!this.stoppingPromise) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems much more complex than what we have in the Python version (stop() sets a boolean flag and is_finished_function picks it up and terminates the AutoscaledPool). Makes me think... what is the advantage of the approach that we use here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe both these solutions were developed around the same time, independently. I agree the Python implementation is considerably simpler, let's go with that (I'll edit this PR).

// Gracefully starve the this.autoscaledPool, so it doesn't start new tasks. Resolves once the pool is cleared.
this.stoppingPromise = this.autoscaledPool
?.pause()
// Resolves the `autoscaledPool.run()` promise in the `BasicCrawler.run()` method. Since the pool is already paused, it resolves immediately and doesn't kill any tasks.
.then(async () => this.autoscaledPool?.abort())
.then(() => this.log.info(message))
.catch((err) => {
this.log.error('An error occurred when stopping the crawler:', err);
});
}
}

async getRequestQueue(): Promise<RequestProvider> {
Expand Down