From 81495f62393b952cc35c30e4953edc18fbfe8a23 Mon Sep 17 00:00:00 2001 From: Hugh Francis Date: Fri, 23 Oct 2020 16:33:23 -0400 Subject: [PATCH 1/3] Adds fastFail setting for flaky pages that aren't important --- index.js | 2 ++ src/puppeteer_utils.js | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/index.js b/index.js index c0d9f91a..51e73a89 100644 --- a/index.js +++ b/index.js @@ -19,6 +19,8 @@ const defaultOptions = { destination: null, concurrency: 4, include: ["/"], + // When true, a page-load error (e.g. a Timeout) is rethrown; set to false to log it and move on + fastFail: true, userAgent: "ReactSnap", // 4 params below will be refactored to one: `puppeteer: {}` // https://github.com/stereobooster/react-snap/issues/120 diff --git a/src/puppeteer_utils.js b/src/puppeteer_utils.js index 820cded0..1df42511 100644 --- a/src/puppeteer_utils.js +++ b/src/puppeteer_utils.js @@ -241,7 +241,11 @@ const crawl = async opt => { await page.goto(pageUrl, { waitUntil: "networkidle0" }); } catch (e) { e.message = augmentTimeoutError(e.message, tracker); - throw e; + if (opt.fastFail) { + throw e; + } else { + console.log(`🔥 failed to crawl page: ${pageUrl}`, e); + } } finally { tracker.dispose(); } From ef485ba09543db8e3a3e922ff88a5e0c57ca6fa2 Mon Sep 17 00:00:00 2001 From: Hugh Francis Date: Tue, 15 Dec 2020 18:07:32 -0500 Subject: [PATCH 2/3] Don't fastfail on an UnhandledPromiseRejectionWarning --- src/puppeteer_utils.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/puppeteer_utils.js b/src/puppeteer_utils.js index 1df42511..77c812da 100644 --- a/src/puppeteer_utils.js +++ b/src/puppeteer_utils.js @@ -157,7 +157,9 @@ const crawl = async opt => { const onUnhandledRejection = error => { console.log("🔥 UnhandledPromiseRejectionWarning", error); - shuttingDown = true; + if (options.fastFail) { + shuttingDown = true; + } }; process.on("unhandledRejection", onUnhandledRejection); From 512e3b95696ebd5d394a55c26d8be24f2b269c20 Mon Sep 17 00:00:00 2001 From: Hugh Francis Date: 
Mon, 28 Mar 2022 08:13:56 -0400 Subject: [PATCH 3/3] Properly skip post-load steps for pages that fail to load --- src/puppeteer_utils.js | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/puppeteer_utils.js b/src/puppeteer_utils.js index 77c812da..734f142a 100644 --- a/src/puppeteer_utils.js +++ b/src/puppeteer_utils.js @@ -239,6 +239,7 @@ const crawl = async opt => { beforeFetch && beforeFetch({ page, route }); await page.setUserAgent(options.userAgent); const tracker = createTracker(page); + let skipPage = false; try { await page.goto(pageUrl, { waitUntil: "networkidle0" }); } catch (e) { @@ -247,16 +248,19 @@ const crawl = async opt => { throw e; } else { console.log(`🔥 failed to crawl page: ${pageUrl}`, e); + skipPage = true; } } finally { tracker.dispose(); } - if (options.waitFor) await page.waitFor(options.waitFor); - if (options.crawl) { - const links = await getLinks({ page }); - links.forEach(addToQueue); + if (skipPage == false) { + if (options.waitFor) await page.waitFor(options.waitFor); + if (options.crawl) { + const links = await getLinks({ page }); + links.forEach(addToQueue); + } + afterFetch && (await afterFetch({ page, route, browser, addToQueue })); } - afterFetch && (await afterFetch({ page, route, browser, addToQueue })); await page.close(); console.log(`✅ crawled ${processed + 1} out of ${enqued} (${route})`); } catch (e) {