Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion src/archivist/fetcher/fullDomFetcher.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import puppeteer from 'puppeteer-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';

import { resolveProxyConfiguration, extractProxyCredentials } from './proxyUtils.js';

puppeteer.use(stealthPlugin());

let browser;
Expand All @@ -25,6 +27,10 @@ export default async function fetch(url, cssSelectors, config) {

await client.send('Network.clearBrowserCookies'); // Clear cookies to ensure clean state between fetches and prevent session persistence across different URLs

if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) {
await page.authenticate(browser.proxyCredentials);
}

response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.

if (!response) {
Expand Down Expand Up @@ -86,7 +92,33 @@ export async function launchHeadlessBrowser() {
return browser;
}

browser = await puppeteer.launch({ headless: true });
const options = {
args: [],
headless: !process.env.FETCHER_NO_HEADLESS,
};

const { httpProxy, httpsProxy } = resolveProxyConfiguration();

let proxyCredentials = null;

if (httpProxy) {
const httpProxyUrl = new URL(httpProxy);
const httpsProxyUrl = new URL(httpsProxy);

proxyCredentials = extractProxyCredentials(httpProxy, httpsProxy);

options.args = [].concat(options.args, `--proxy-server=http=${httpProxyUrl.host};https=${httpsProxyUrl.host}`);
}

if (process.env.FETCHER_NO_SANDBOX) {
options.args = [].concat(options.args, [ '--no-sandbox', '--disable-setuid-sandbox' ]);
}

browser = await puppeteer.launch(options);

if (proxyCredentials) {
browser.proxyCredentials = proxyCredentials;
}

return browser;
}
Expand Down
12 changes: 8 additions & 4 deletions src/archivist/fetcher/htmlOnlyFetcher.js
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ import HttpProxyAgent from 'http-proxy-agent';
import HttpsProxyAgent from 'https-proxy-agent';
import nodeFetch, { AbortError } from 'node-fetch';

import { resolveProxyConfiguration } from './proxyUtils.js';

export default async function fetch(url, config) {
const controller = new AbortController();
const timeout = setTimeout(() => controller.abort(), config.navigationTimeout);
Expand All @@ -14,10 +16,12 @@ export default async function fetch(url, config) {
headers: { 'Accept-Language': config.language },
};

if (url.startsWith('https:') && process.env.HTTPS_PROXY) {
nodeFetchOptions.agent = new HttpsProxyAgent(process.env.HTTPS_PROXY);
} else if (url.startsWith('http:') && process.env.HTTP_PROXY) {
nodeFetchOptions.agent = new HttpProxyAgent(process.env.HTTP_PROXY);
const { httpProxy, httpsProxy } = resolveProxyConfiguration();

if (url.startsWith('https:') && httpsProxy) {
nodeFetchOptions.agent = new HttpsProxyAgent(httpsProxy);
} else if (url.startsWith('http:') && httpProxy) {
nodeFetchOptions.agent = new HttpProxyAgent(httpProxy);
}

let response;
Expand Down
41 changes: 41 additions & 0 deletions src/archivist/fetcher/proxyUtils.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/**
 * Reads the HTTP and HTTPS proxy URLs from the process environment.
 *
 * For each protocol the lowercase variable (`http_proxy`, `https_proxy`) is
 * preferred over its uppercase counterpart (`HTTP_PROXY`, `HTTPS_PROXY`).
 * Unset or empty variables are ignored. When no HTTPS proxy is configured,
 * the HTTP proxy (if any) is reused for HTTPS.
 *
 * @returns {{ httpProxy: string|null, httpsProxy: string|null }} The resolved
 *   proxy URLs, each `null` when nothing applies.
 */
export function resolveProxyConfiguration() {
  const { env } = process;

  // `||` rather than `??` on purpose: an empty string must fall through to the next candidate.
  const httpProxy = env.http_proxy || env.HTTP_PROXY || null;
  const httpsProxy = env.https_proxy || env.HTTPS_PROXY || httpProxy;

  return { httpProxy, httpsProxy };
}

/**
 * Percent-decodes a credential component taken from a parsed URL.
 * Falls back to the raw value when it contains a malformed escape sequence
 * (e.g. a literal `%`), for which `decodeURIComponent` would throw.
 */
function decodeProxyCredential(value) {
  try {
    return decodeURIComponent(value);
  } catch {
    return value;
  }
}

/**
 * Extracts the username and password embedded in the proxy URLs.
 *
 * `URL#username` and `URL#password` expose the percent-encoded form, so the
 * values are decoded before being returned: consumers such as Puppeteer's
 * `page.authenticate` expect the actual credentials, not their URL encoding.
 *
 * @param {string|null} httpProxy - HTTP proxy URL; when falsy, `null` is returned.
 * @param {string|null} [httpsProxy] - HTTPS proxy URL; defaults to `httpProxy` when falsy.
 * @returns {{ username: string, password: string }|null} The decoded credentials,
 *   or `null` when the HTTP proxy URL lacks a username or a password.
 * @throws {TypeError} If a provided proxy value is not a valid URL.
 * @throws {Error} If the HTTP proxy has credentials and the HTTPS proxy's differ.
 */
export function extractProxyCredentials(httpProxy, httpsProxy) {
  if (!httpProxy) {
    return null;
  }

  const httpProxyUrl = new URL(httpProxy);
  // Without an explicit HTTPS proxy, reuse the HTTP one instead of failing on `new URL(undefined)`.
  const httpsProxyUrl = new URL(httpsProxy || httpProxy);

  const username = decodeProxyCredential(httpProxyUrl.username);
  const password = decodeProxyCredential(httpProxyUrl.password);

  if (!username || !password) {
    return null;
  }

  const credentialsMatch = username === decodeProxyCredential(httpsProxyUrl.username)
    && password === decodeProxyCredential(httpsProxyUrl.password);

  if (!credentialsMatch) {
    throw new Error('Unsupported proxies specified, http and https proxy should have the same credentials.');
  }

  return { username, password };
}
Loading