From 2a3ae238a3cf1fd96dd896c0f34b111c4230fd29 Mon Sep 17 00:00:00 2001 From: Lucas Verney Date: Fri, 11 Jul 2025 14:48:32 +0200 Subject: [PATCH 1/5] Add support for extra options in the fullDomFetcher Add support for proxy, disabling the sandboxing (which is required by some Docker setups) and disabling headless mode in the fullDomFetcher. --- src/archivist/fetcher/fullDomFetcher.js | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index bb5d22c45..be8bffcb8 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -86,7 +86,21 @@ export async function launchHeadlessBrowser() { return browser; } - browser = await puppeteer.launch({ headless: true }); + const options = { + args: [], + headless: true, + }; + if (process.env.http_proxy) { + options.args = [].concat(options.args, `--proxy-server=${process.env.http_proxy}`); + } + if (process.env.FETCHER_NO_SANDBOX) { + options.args = [].concat(options.args, [ '--no-sandbox', '--disable-setuid-sandbox' ]); + } + if (process.env.FETCHER_NO_HEADLESS) { + options.headless = false; + } + + browser = await puppeteer.launch(options); return browser; } From 7ab613c4452b026e0ff6020054c51a72fd1a95f5 Mon Sep 17 00:00:00 2001 From: LVerneyEC Date: Fri, 5 Sep 2025 16:15:03 +0200 Subject: [PATCH 2/5] Update src/archivist/fetcher/fullDomFetcher.js Co-authored-by: Nicolas Dupont --- src/archivist/fetcher/fullDomFetcher.js | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index be8bffcb8..746aa76eb 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -93,9 +93,11 @@ export async function launchHeadlessBrowser() { if (process.env.http_proxy) { options.args = [].concat(options.args, `--proxy-server=${process.env.http_proxy}`); } + if (process.env.FETCHER_NO_SANDBOX) { options.args = [].concat(options.args, [ '--no-sandbox', '--disable-setuid-sandbox' ]); } + if (process.env.FETCHER_NO_HEADLESS) { options.headless = false; } From cb8dd27e59af97100ced07c27eddb204e77faa89 Mon Sep 17 00:00:00 2001 From: Lucas Verney Date: Fri, 5 Sep 2025 16:43:21 +0200 Subject: [PATCH 3/5] Review http_proxy/https_proxy logic --- src/archivist/fetcher/fullDomFetcher.js | 47 +++++++++++++++++++++---- 1 file changed, 40 insertions(+), 7 deletions(-) diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index 746aa76eb..fbbfa5577 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -4,6 +4,7 @@ import stealthPlugin from 'puppeteer-extra-plugin-stealth'; puppeteer.use(stealthPlugin()); let browser; +let proxyCredentials = {}; export default async function fetch(url, cssSelectors, config) { let page; @@ -25,6 +26,10 @@ export default async function fetch(url, cssSelectors, config) { await client.send('Network.clearBrowserCookies'); // Clear cookies to ensure clean state between fetches and prevent session persistence across different URLs + if (proxyCredentials.username && proxyCredentials.password) { + await page.authenticate(proxyCredentials); + } + response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout. if (!response) { @@ -88,19 +93,47 @@ export async function launchHeadlessBrowser() { const options = { args: [], - headless: true, + headless: !process.env.FETCHER_NO_HEADLESS, }; + + // Handle http_proxy/https_proxy environment variables precedence + let http_proxy = null; + let https_proxy = null; + if (process.env.http_proxy) { - options.args = [].concat(options.args, `--proxy-server=${process.env.http_proxy}`); + http_proxy = process.env.http_proxy; + } + else if (process.env.HTTP_PROXY) { + http_proxy = process.env.HTTP_PROXY; + } + + if (process.env.https_proxy) { + https_proxy = process.env.https_proxy; + } + else if (process.env.HTTPS_PROXY) { + https_proxy = process.env.HTTPS_PROXY; } - + else if (http_proxy) { + https_proxy = http_proxy; + } + + // Set proxy in Puppeteer and eventually store credentials + if (http_proxy) { + const httpProxyUrl = new URL(http_proxy); + const httpsProxyUrl = new URL(https_proxy); + proxyCredentials.username = httpProxyUrl.username; + proxyCredentials.password = httpProxyUrl.password; + + if (httpProxyUrl.username != httpsProxyUrl.username || httpProxyUrl.password != httpsProxyUrl.password) { + throw new Error('Unsupported proxies specified, http and https proxy should have the same credentials.'); + } + + options.args = [].concat(options.args, `--proxy-server=http=${httpProxyUrl.host};https=${httpsProxyUrl.host}`); + } + if (process.env.FETCHER_NO_SANDBOX) { options.args = [].concat(options.args, [ '--no-sandbox', '--disable-setuid-sandbox' ]); } - - if (process.env.FETCHER_NO_HEADLESS) { - options.headless = false; - } browser = await puppeteer.launch(options); From 7654f602f0f4a4f328908faa7fb84275e92c28ed Mon Sep 17 00:00:00 2001 From: Lucas Verney Date: Fri, 5 Sep 2025 16:45:13 +0200 Subject: [PATCH 4/5] Replicate http_proxy/https_proxy logic into htmlOnlyFetcher --- src/archivist/fetcher/htmlOnlyFetcher.js | 29 ++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/src/archivist/fetcher/htmlOnlyFetcher.js b/src/archivist/fetcher/htmlOnlyFetcher.js index 65c37a5a3..461ff33ed 100644 --- a/src/archivist/fetcher/htmlOnlyFetcher.js +++ b/src/archivist/fetcher/htmlOnlyFetcher.js @@ -14,10 +14,31 @@ export default async function fetch(url, config) { headers: { 'Accept-Language': config.language }, }; - if (url.startsWith('https:') && process.env.HTTPS_PROXY) { - nodeFetchOptions.agent = new HttpsProxyAgent(process.env.HTTPS_PROXY); - } else if (url.startsWith('http:') && process.env.HTTP_PROXY) { - nodeFetchOptions.agent = new HttpProxyAgent(process.env.HTTP_PROXY); + // Handle http_proxy/https_proxy environment variables precedence + let http_proxy = null; + let https_proxy = null; + + if (process.env.http_proxy) { + http_proxy = process.env.http_proxy; + } + else if (process.env.HTTP_PROXY) { + http_proxy = process.env.HTTP_PROXY; + } + + if (process.env.https_proxy) { + https_proxy = process.env.https_proxy; + } + else if (process.env.HTTPS_PROXY) { + https_proxy = process.env.HTTPS_PROXY; + } + else if (http_proxy) { + https_proxy = http_proxy; + } + + if (url.startsWith('https:') && https_proxy) { + nodeFetchOptions.agent = new HttpsProxyAgent(https_proxy); + } else if (url.startsWith('http:') && http_proxy) { + nodeFetchOptions.agent = new HttpProxyAgent(http_proxy); } let response; From 2e1ce9b47f432a944577bf0b18e75c6680975739 Mon Sep 17 00:00:00 2001 From: Lucas Verney Date: Fri, 5 Sep 2025 16:45:13 +0200 Subject: [PATCH 5/5] Factorize proxy logic & store creds in browser --- src/archivist/fetcher/fullDomFetcher.js | 45 ++++++++---------------- src/archivist/fetcher/htmlOnlyFetcher.js | 31 ++++------------ src/archivist/fetcher/proxyUtils.js | 41 +++++++++++++++++++++ 3 files changed, 62 insertions(+), 55 deletions(-) create mode 100644 src/archivist/fetcher/proxyUtils.js diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js index fbbfa5577..16c6f6ff9 100644 --- a/src/archivist/fetcher/fullDomFetcher.js +++ b/src/archivist/fetcher/fullDomFetcher.js @@ -1,10 +1,11 @@ import puppeteer from 'puppeteer-extra'; import stealthPlugin from 'puppeteer-extra-plugin-stealth'; +import { resolveProxyConfiguration, extractProxyCredentials } from './proxyUtils.js'; + puppeteer.use(stealthPlugin()); let browser; -let proxyCredentials = {}; export default async function fetch(url, cssSelectors, config) { let page; @@ -26,8 +27,8 @@ export default async function fetch(url, cssSelectors, config) { await client.send('Network.clearBrowserCookies'); // Clear cookies to ensure clean state between fetches and prevent session persistence across different URLs - if (proxyCredentials.username && proxyCredentials.password) { - await page.authenticate(proxyCredentials); + if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) { + await page.authenticate(browser.proxyCredentials); } response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout. @@ -96,37 +97,15 @@ export async function launchHeadlessBrowser() { headless: !process.env.FETCHER_NO_HEADLESS, }; - // Handle http_proxy/https_proxy environment variables precedence - let http_proxy = null; - let https_proxy = null; + const { httpProxy, httpsProxy } = resolveProxyConfiguration(); - if (process.env.http_proxy) { - http_proxy = process.env.http_proxy; - } - else if (process.env.HTTP_PROXY) { - http_proxy = process.env.HTTP_PROXY; - } - - if (process.env.https_proxy) { - https_proxy = process.env.https_proxy; - } - else if (process.env.HTTPS_PROXY) { - https_proxy = process.env.HTTPS_PROXY; - } - else if (http_proxy) { - https_proxy = http_proxy; - } + let proxyCredentials = null; - // Set proxy in Puppeteer and eventually store credentials - if (http_proxy) { - const httpProxyUrl = new URL(http_proxy); - const httpsProxyUrl = new URL(https_proxy); - proxyCredentials.username = httpProxyUrl.username; - proxyCredentials.password = httpProxyUrl.password; + if (httpProxy) { + const httpProxyUrl = new URL(httpProxy); + const httpsProxyUrl = new URL(httpsProxy); - if (httpProxyUrl.username != httpsProxyUrl.username || httpProxyUrl.password != httpsProxyUrl.password) { - throw new Error('Unsupported proxies specified, http and https proxy should have the same credentials.'); - } + proxyCredentials = extractProxyCredentials(httpProxy, httpsProxy); options.args = [].concat(options.args, `--proxy-server=http=${httpProxyUrl.host};https=${httpsProxyUrl.host}`); } @@ -137,6 +116,10 @@ export async function launchHeadlessBrowser() { browser = await puppeteer.launch(options); + if (proxyCredentials) { + browser.proxyCredentials = proxyCredentials; + } + return browser; } diff --git a/src/archivist/fetcher/htmlOnlyFetcher.js b/src/archivist/fetcher/htmlOnlyFetcher.js index 461ff33ed..e3e581c44 100644 --- a/src/archivist/fetcher/htmlOnlyFetcher.js +++ b/src/archivist/fetcher/htmlOnlyFetcher.js @@ -4,6 +4,8 @@ import HttpProxyAgent from 'http-proxy-agent'; import HttpsProxyAgent from 'https-proxy-agent'; import nodeFetch, { AbortError } from 'node-fetch'; +import { resolveProxyConfiguration } from './proxyUtils.js'; + export default async function fetch(url, config) { const controller = new AbortController(); const timeout = setTimeout(() => controller.abort(), config.navigationTimeout); @@ -14,31 +16,12 @@ export default async function fetch(url, config) { headers: { 'Accept-Language': config.language }, }; - // Handle http_proxy/https_proxy environment variables precedence - let http_proxy = null; - let https_proxy = null; - - if (process.env.http_proxy) { - http_proxy = process.env.http_proxy; - } - else if (process.env.HTTP_PROXY) { - http_proxy = process.env.HTTP_PROXY; - } - - if (process.env.https_proxy) { - https_proxy = process.env.https_proxy; - } - else if (process.env.HTTPS_PROXY) { - https_proxy = process.env.HTTPS_PROXY; - } - else if (http_proxy) { - https_proxy = http_proxy; - } + const { httpProxy, httpsProxy } = resolveProxyConfiguration(); - if (url.startsWith('https:') && https_proxy) { - nodeFetchOptions.agent = new HttpsProxyAgent(https_proxy); - } else if (url.startsWith('http:') && http_proxy) { - nodeFetchOptions.agent = new HttpProxyAgent(http_proxy); + if (url.startsWith('https:') && httpsProxy) { + nodeFetchOptions.agent = new HttpsProxyAgent(httpsProxy); + } else if (url.startsWith('http:') && httpProxy) { + nodeFetchOptions.agent = new HttpProxyAgent(httpProxy); } let response; diff --git a/src/archivist/fetcher/proxyUtils.js b/src/archivist/fetcher/proxyUtils.js new file mode 100644 index 000000000..0214e957c --- /dev/null +++ b/src/archivist/fetcher/proxyUtils.js @@ -0,0 +1,41 @@ +export function resolveProxyConfiguration() { + let httpProxy = null; + let httpsProxy = null; + + if (process.env.http_proxy) { + httpProxy = process.env.http_proxy; + } else if (process.env.HTTP_PROXY) { + httpProxy = process.env.HTTP_PROXY; + } + + if (process.env.https_proxy) { + httpsProxy = process.env.https_proxy; + } else if (process.env.HTTPS_PROXY) { + httpsProxy = process.env.HTTPS_PROXY; + } else if (httpProxy) { + httpsProxy = httpProxy; + } + + return { httpProxy, httpsProxy }; +} + +export function extractProxyCredentials(httpProxy, httpsProxy) { + if (!httpProxy) { + return null; + } + + const httpProxyUrl = new URL(httpProxy); + const httpsProxyUrl = new URL(httpsProxy); + + const { username, password } = httpProxyUrl; + + if (!username || !password) { + return null; + } + + if (httpProxyUrl.username !== httpsProxyUrl.username || httpProxyUrl.password !== httpsProxyUrl.password) { + throw new Error('Unsupported proxies specified, http and https proxy should have the same credentials.'); + } + + return { username, password }; +}