diff --git a/src/archivist/fetcher/fullDomFetcher.js b/src/archivist/fetcher/fullDomFetcher.js
index bb5d22c45..16c6f6ff9 100644
--- a/src/archivist/fetcher/fullDomFetcher.js
+++ b/src/archivist/fetcher/fullDomFetcher.js
@@ -1,6 +1,8 @@
 import puppeteer from 'puppeteer-extra';
 import stealthPlugin from 'puppeteer-extra-plugin-stealth';
 
+import { resolveProxyConfiguration, extractProxyCredentials } from './proxyUtils.js';
+
 puppeteer.use(stealthPlugin());
 
 let browser;
@@ -25,6 +27,11 @@ export default async function fetch(url, cssSelectors, config) {
 
     await client.send('Network.clearBrowserCookies'); // Clear cookies to ensure clean state between fetches and prevent session persistence across different URLs
 
+    // Only the proxy host is passed in the `--proxy-server` launch argument, so the proxy credentials are supplied on each page
+    if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) {
+      await page.authenticate(browser.proxyCredentials);
+    }
+
     response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
 
     if (!response) {
@@ -86,7 +93,33 @@ export async function launchHeadlessBrowser() {
     return browser;
   }
 
-  browser = await puppeteer.launch({ headless: true });
+  const options = {
+    args: [],
+    headless: !process.env.FETCHER_NO_HEADLESS,
+  };
+
+  const { httpProxy, httpsProxy } = resolveProxyConfiguration();
+  const proxyCredentials = extractProxyCredentials(httpProxy, httpsProxy);
+
+  // Build one rule per configured scheme, so that a proxy declared only for HTTPS is not silently ignored
+  const proxyRules = [
+    httpProxy && `http=${new URL(httpProxy).host}`,
+    httpsProxy && `https=${new URL(httpsProxy).host}`,
+  ].filter(Boolean);
+
+  if (proxyRules.length) {
+    options.args.push(`--proxy-server=${proxyRules.join(';')}`);
+  }
+
+  if (process.env.FETCHER_NO_SANDBOX) {
+    options.args.push('--no-sandbox', '--disable-setuid-sandbox');
+  }
+
+  browser = await puppeteer.launch(options);
+
+  if (proxyCredentials) {
+    browser.proxyCredentials = proxyCredentials; // Read back by `fetch` to authenticate each page against the proxy
+  }
 
   return browser;
 }
diff --git a/src/archivist/fetcher/htmlOnlyFetcher.js b/src/archivist/fetcher/htmlOnlyFetcher.js
index 65c37a5a3..e3e581c44 100644
--- a/src/archivist/fetcher/htmlOnlyFetcher.js
+++ b/src/archivist/fetcher/htmlOnlyFetcher.js
@@ -4,6 +4,8 @@ import HttpProxyAgent from 'http-proxy-agent';
 import HttpsProxyAgent from 'https-proxy-agent';
 import nodeFetch, { AbortError } from 'node-fetch';
 
+import { resolveProxyConfiguration } from './proxyUtils.js';
+
 export default async function fetch(url, config) {
   const controller = new AbortController();
   const timeout = setTimeout(() => controller.abort(), config.navigationTimeout);
@@ -14,10 +16,12 @@ export default async function fetch(url, config) {
     headers: { 'Accept-Language': config.language },
   };
 
-  if (url.startsWith('https:') && process.env.HTTPS_PROXY) {
-    nodeFetchOptions.agent = new HttpsProxyAgent(process.env.HTTPS_PROXY);
-  } else if (url.startsWith('http:') && process.env.HTTP_PROXY) {
-    nodeFetchOptions.agent = new HttpProxyAgent(process.env.HTTP_PROXY);
+  const { httpProxy, httpsProxy } = resolveProxyConfiguration();
+
+  if (url.startsWith('https:') && httpsProxy) {
+    nodeFetchOptions.agent = new HttpsProxyAgent(httpsProxy);
+  } else if (url.startsWith('http:') && httpProxy) {
+    nodeFetchOptions.agent = new HttpProxyAgent(httpProxy);
   }
 
   let response;
diff --git a/src/archivist/fetcher/proxyUtils.js b/src/archivist/fetcher/proxyUtils.js
new file mode 100644
index 000000000..0214e957c
--- /dev/null
+++ b/src/archivist/fetcher/proxyUtils.js
@@ -0,0 +1,60 @@
+/**
+ * Resolves the HTTP and HTTPS proxy URLs from the environment.
+ *
+ * Lowercase variables (`http_proxy`, `https_proxy`) take precedence over their uppercase counterparts, and the HTTPS proxy falls back to the HTTP proxy when none is set.
+ *
+ * @returns {{ httpProxy: (string|null), httpsProxy: (string|null) }} The proxy URLs, `null` when not configured
+ */
+export function resolveProxyConfiguration() {
+  const httpProxy = process.env.http_proxy || process.env.HTTP_PROXY || null;
+  const httpsProxy = process.env.https_proxy || process.env.HTTPS_PROXY || httpProxy;
+
+  return { httpProxy, httpsProxy };
+}
+
+// The URL API exposes credentials percent-encoded; decode them, keeping the raw value when it is not valid percent-encoding (e.g. a lone `%`)
+function decodeCredential(value) {
+  try {
+    return decodeURIComponent(value);
+  } catch {
+    return value;
+  }
+}
+
+// Returns the decoded credentials embedded in a proxy URL, or `null` when the proxy is unset or lacks a username or a password
+function parseProxyCredentials(proxy) {
+  if (!proxy) {
+    return null;
+  }
+
+  const { username, password } = new URL(proxy);
+
+  if (!username || !password) {
+    return null;
+  }
+
+  return { username: decodeCredential(username), password: decodeCredential(password) };
+}
+
+/**
+ * Extracts the credentials used to authenticate against the configured proxies.
+ *
+ * A single username/password pair is returned for both proxies, so when both are set they must carry the same credentials.
+ *
+ * @param {string|null} httpProxy - URL of the HTTP proxy, if any
+ * @param {string|null} httpsProxy - URL of the HTTPS proxy, if any
+ * @returns {{ username: string, password: string }|null} The decoded credentials, or `null` when the proxies carry none
+ * @throws {Error} If both proxies are set with different credentials
+ * @throws {TypeError} If a proxy is not a valid URL
+ */
+export function extractProxyCredentials(httpProxy, httpsProxy) {
+  const httpCredentials = parseProxyCredentials(httpProxy);
+  const httpsCredentials = parseProxyCredentials(httpsProxy);
+  const credentialsDiffer = httpCredentials?.username !== httpsCredentials?.username || httpCredentials?.password !== httpsCredentials?.password;
+
+  if (httpProxy && httpsProxy && credentialsDiffer) {
+    throw new Error('Unsupported proxies specified, http and https proxy should have the same credentials.');
+  }
+
+  return httpCredentials ?? httpsCredentials;
+}