Skip to content

Commit d889a5a

Browse files
committed
Add support for extra options in the fetchers
Add support for proxy, disabling the sandboxing (which is required by some Docker setups) and disabling headless mode in the fullDomFetcher.
1 parent ba211e3 commit d889a5a

File tree

4 files changed

+89
-5
lines changed

4 files changed

+89
-5
lines changed

CHANGELOG.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,13 @@
22

33
All changes that impact users of this module are documented in this file, in the [Common Changelog](https://common-changelog.org) format with some additional specifications defined in the CONTRIBUTING file. This codebase adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
44

5+
## Unreleased
6+
7+
### Changed
8+
9+
- Add extra options for fetchers, including support for (authenticating) proxies and switching on/off headless mode and
10+
sandboxing.
11+
512
## 9.1.0 - 2025-10-01
613

714
_Full changeset and discussions: [#1197](https://github.com/OpenTermsArchive/engine/pull/1197)._

src/archivist/fetcher/fullDomFetcher.js

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
import puppeteer from 'puppeteer-extra';
22
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
33

4+
import { resolveProxyConfiguration, extractProxyCredentials } from './proxyUtils.js';
5+
46
puppeteer.use(stealthPlugin());
57

68
let browser;
@@ -25,6 +27,10 @@ export default async function fetch(url, cssSelectors, config) {
2527

2628
await client.send('Network.clearBrowserCookies'); // Clear cookies to ensure clean state between fetches and prevent session persistence across different URLs
2729

30+
if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) {
31+
await page.authenticate(browser.proxyCredentials);
32+
}
33+
2834
response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
2935

3036
if (!response) {
@@ -86,7 +92,33 @@ export async function launchHeadlessBrowser() {
8692
return browser;
8793
}
8894

89-
browser = await puppeteer.launch({ headless: true });
95+
const options = {
96+
args: [],
97+
headless: !process.env.FETCHER_NO_HEADLESS,
98+
};
99+
100+
const { httpProxy, httpsProxy } = resolveProxyConfiguration();
101+
102+
let proxyCredentials = null;
103+
104+
if (httpProxy) {
105+
const httpProxyUrl = new URL(httpProxy);
106+
const httpsProxyUrl = new URL(httpsProxy);
107+
108+
proxyCredentials = extractProxyCredentials(httpProxy, httpsProxy);
109+
110+
options.args = [].concat(options.args, `--proxy-server=http=${httpProxyUrl.host};https=${httpsProxyUrl.host}`);
111+
}
112+
113+
if (process.env.FETCHER_NO_SANDBOX) {
114+
options.args = [].concat(options.args, [ '--no-sandbox', '--disable-setuid-sandbox' ]);
115+
}
116+
117+
browser = await puppeteer.launch(options);
118+
119+
if (proxyCredentials) {
120+
browser.proxyCredentials = proxyCredentials;
121+
}
90122

91123
return browser;
92124
}

src/archivist/fetcher/htmlOnlyFetcher.js

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ import HttpProxyAgent from 'http-proxy-agent';
44
import HttpsProxyAgent from 'https-proxy-agent';
55
import nodeFetch, { AbortError } from 'node-fetch';
66

7+
import { resolveProxyConfiguration } from './proxyUtils.js';
8+
79
export default async function fetch(url, config) {
810
const controller = new AbortController();
911
const timeout = setTimeout(() => controller.abort(), config.navigationTimeout);
@@ -14,10 +16,12 @@ export default async function fetch(url, config) {
1416
headers: { 'Accept-Language': config.language },
1517
};
1618

17-
if (url.startsWith('https:') && process.env.HTTPS_PROXY) {
18-
nodeFetchOptions.agent = new HttpsProxyAgent(process.env.HTTPS_PROXY);
19-
} else if (url.startsWith('http:') && process.env.HTTP_PROXY) {
20-
nodeFetchOptions.agent = new HttpProxyAgent(process.env.HTTP_PROXY);
19+
const { httpProxy, httpsProxy } = resolveProxyConfiguration();
20+
21+
if (url.startsWith('https:') && httpsProxy) {
22+
nodeFetchOptions.agent = new HttpsProxyAgent(httpsProxy);
23+
} else if (url.startsWith('http:') && httpProxy) {
24+
nodeFetchOptions.agent = new HttpProxyAgent(httpProxy);
2125
}
2226

2327
let response;
Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/**
 * Resolves the proxy URLs to use from the process environment.
 *
 * Lowercase variables (`http_proxy`, `https_proxy`) take precedence over their
 * uppercase counterparts (`HTTP_PROXY`, `HTTPS_PROXY`). When no HTTPS proxy is
 * defined, the HTTP proxy (if any) is reused for HTTPS.
 *
 * @returns {{ httpProxy: (string|null), httpsProxy: (string|null) }} Resolved proxy URLs; `null` when no proxy is configured
 */
export function resolveProxyConfiguration() {
  const { env } = process;

  // Unset and empty variables are both falsy, so `||` walks through the candidates in priority order
  const httpProxy = env.http_proxy || env.HTTP_PROXY || null;
  const httpsProxy = env.https_proxy || env.HTTPS_PROXY || httpProxy;

  return { httpProxy, httpsProxy };
}
21+
22+
/**
 * Extracts the credentials embedded in the proxy URLs.
 *
 * A single set of credentials is returned, so both proxies must share the same
 * username and password.
 *
 * @param {string|null} httpProxy - HTTP proxy URL, possibly containing `username:password@`
 * @param {string|null} [httpsProxy] - HTTPS proxy URL; defaults to `httpProxy` when not provided
 * @returns {{ username: string, password: string }|null} Decoded credentials, or `null` when there is no HTTP proxy or it does not carry both a username and a password
 * @throws {TypeError} When a proxy is not a valid URL
 * @throws {Error} When the HTTP and HTTPS proxies carry different credentials
 */
export function extractProxyCredentials(httpProxy, httpsProxy) {
  if (!httpProxy) {
    return null;
  }

  const httpProxyUrl = new URL(httpProxy);
  const httpsProxyUrl = new URL(httpsProxy || httpProxy); // Without a dedicated HTTPS proxy, the HTTP one is used for both protocols

  if (!httpProxyUrl.username || !httpProxyUrl.password) {
    return null;
  }

  // `URL` exposes the user info percent-encoded, whereas proxy authentication expects the literal values
  const username = decodeProxyCredential(httpProxyUrl.username);
  const password = decodeProxyCredential(httpProxyUrl.password);

  if (username !== decodeProxyCredential(httpsProxyUrl.username) || password !== decodeProxyCredential(httpsProxyUrl.password)) {
    throw new Error('Unsupported proxies specified, http and https proxy should have the same credentials.');
  }

  return { username, password };
}

/**
 * Decodes a percent-encoded URL user info component.
 *
 * @param {string} value - Username or password as exposed by `URL`
 * @returns {string} Decoded value, or the raw value when it contains a malformed escape sequence
 */
function decodeProxyCredential(value) {
  try {
    return decodeURIComponent(value);
  } catch {
    return value; // A stray `%` is not a valid escape sequence: keep the value as written rather than failing
  }
}

0 commit comments

Comments
 (0)