It seems that when a site errors out, self.options.skipDuplicates is set to false but never set back to true in the current version. This lets duplicate URLs end up being crawled by the system.
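For context, here is a minimal sketch of how the problem can show up from the outside. It assumes the standard crawler package API (the Crawler constructor, queue, and the skipDuplicates/retries/retryTimeout options); the URL is hypothetical:

    const Crawler = require("crawler");

    const crawler = new Crawler({
        skipDuplicates: true, // duplicate URLs should normally be skipped
        retries: 1,
        retryTimeout: 1000,
        callback: function (error, res, done) {
            if (error) {
                console.error("error:", error.message);
            } else {
                console.log("crawled:", res.options.uri);
            }
            done();
        }
    });

    // If the first request errors out and is retried, the retry path sets
    // self.options.skipDuplicates = false and never restores it, so the
    // second queue() of the same (hypothetical) URL is crawled again
    // instead of being skipped as a duplicate.
    crawler.queue("http://example.com/flaky");
    crawler.queue("http://example.com/flaky");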
A better solution could be something like the following (note that the unconditional skipDuplicates = false before the setTimeout is dropped, since the flag is now saved and restored inside the callback):
    if (options.retries) {
        setTimeout(function () {
            options.retries--;
            // Remember the current setting, disable duplicate checking just
            // for the re-queued request, then restore the original value.
            const skipDuplicates = self.options.skipDuplicates;
            self.options.skipDuplicates = false;
            self.queue(options);
            self.options.skipDuplicates = skipDuplicates;
            options.release();
        }, options.retryTimeout);
    } else {
        options.callback(error, { options: options }, options.release);
    }
If the .queue method can throw, a try/finally block guarantees the flag is restored even on an exception:
    if (options.retries) {
        setTimeout(function () {
            options.retries--;
            const skipDuplicates = self.options.skipDuplicates;
            try {
                self.options.skipDuplicates = false;
                self.queue(options);
            } finally {
                // Restore the original setting even if .queue() throws.
                self.options.skipDuplicates = skipDuplicates;
            }
            options.release();
        }, options.retryTimeout);
    } else {
        options.callback(error, { options: options }, options.release);
    }