Skip to content

Commit 2e1ce9b

Browse files
LVerneyECNdpnt
authored and committed
Factorize proxy logic & store creds in browser
1 parent 7654f60 commit 2e1ce9b

File tree

3 files changed

+62
-55
lines changed

3 files changed

+62
-55
lines changed

src/archivist/fetcher/fullDomFetcher.js

Lines changed: 14 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
11
import puppeteer from 'puppeteer-extra';
22
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
33

4+
import { resolveProxyConfiguration, extractProxyCredentials } from './proxyUtils.js';
5+
46
puppeteer.use(stealthPlugin());
57

68
let browser;
7-
let proxyCredentials = {};
89

910
export default async function fetch(url, cssSelectors, config) {
1011
let page;
@@ -26,8 +27,8 @@ export default async function fetch(url, cssSelectors, config) {
2627

2728
await client.send('Network.clearBrowserCookies'); // Clear cookies to ensure clean state between fetches and prevent session persistence across different URLs
2829

29-
if (proxyCredentials.username && proxyCredentials.password) {
30-
await page.authenticate(proxyCredentials);
30+
if (browser.proxyCredentials?.username && browser.proxyCredentials?.password) {
31+
await page.authenticate(browser.proxyCredentials);
3132
}
3233

3334
response = await page.goto(url, { waitUntil: 'load' }); // Using `load` instead of `networkidle0` as it's more reliable and faster. The 'load' event fires when the page and all its resources (stylesheets, scripts, images) have finished loading. `networkidle0` can be problematic as it waits for 500ms of network inactivity, which may never occur on dynamic pages and then triggers a navigation timeout.
@@ -96,37 +97,15 @@ export async function launchHeadlessBrowser() {
9697
headless: !process.env.FETCHER_NO_HEADLESS,
9798
};
9899

99-
// Handle http_proxy/https_proxy environment variables precedence
100-
let http_proxy = null;
101-
let https_proxy = null;
100+
const { httpProxy, httpsProxy } = resolveProxyConfiguration();
102101

103-
if (process.env.http_proxy) {
104-
http_proxy = process.env.http_proxy;
105-
}
106-
else if (process.env.HTTP_PROXY) {
107-
http_proxy = process.env.HTTP_PROXY;
108-
}
109-
110-
if (process.env.https_proxy) {
111-
https_proxy = process.env.https_proxy;
112-
}
113-
else if (process.env.HTTPS_PROXY) {
114-
https_proxy = process.env.HTTPS_PROXY;
115-
}
116-
else if (http_proxy) {
117-
https_proxy = http_proxy;
118-
}
102+
let proxyCredentials = null;
119103

120-
// Set proxy in Puppeteer and eventually store credentials
121-
if (http_proxy) {
122-
const httpProxyUrl = new URL(http_proxy);
123-
const httpsProxyUrl = new URL(https_proxy);
124-
proxyCredentials.username = httpProxyUrl.username;
125-
proxyCredentials.password = httpProxyUrl.password;
104+
if (httpProxy) {
105+
const httpProxyUrl = new URL(httpProxy);
106+
const httpsProxyUrl = new URL(httpsProxy);
126107

127-
if (httpProxyUrl.username != httpsProxyUrl.username || httpProxyUrl.password != httpsProxyUrl.password) {
128-
throw new Error('Unsupported proxies specified, http and https proxy should have the same credentials.');
129-
}
108+
proxyCredentials = extractProxyCredentials(httpProxy, httpsProxy);
130109

131110
options.args = [].concat(options.args, `--proxy-server=http=${httpProxyUrl.host};https=${httpsProxyUrl.host}`);
132111
}
@@ -137,6 +116,10 @@ export async function launchHeadlessBrowser() {
137116

138117
browser = await puppeteer.launch(options);
139118

119+
if (proxyCredentials) {
120+
browser.proxyCredentials = proxyCredentials;
121+
}
122+
140123
return browser;
141124
}
142125

src/archivist/fetcher/htmlOnlyFetcher.js

Lines changed: 7 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ import HttpProxyAgent from 'http-proxy-agent';
44
import HttpsProxyAgent from 'https-proxy-agent';
55
import nodeFetch, { AbortError } from 'node-fetch';
66

7+
import { resolveProxyConfiguration } from './proxyUtils.js';
8+
79
export default async function fetch(url, config) {
810
const controller = new AbortController();
911
const timeout = setTimeout(() => controller.abort(), config.navigationTimeout);
@@ -14,31 +16,12 @@ export default async function fetch(url, config) {
1416
headers: { 'Accept-Language': config.language },
1517
};
1618

17-
// Handle http_proxy/https_proxy environment variables precedence
18-
let http_proxy = null;
19-
let https_proxy = null;
20-
21-
if (process.env.http_proxy) {
22-
http_proxy = process.env.http_proxy;
23-
}
24-
else if (process.env.HTTP_PROXY) {
25-
http_proxy = process.env.HTTP_PROXY;
26-
}
27-
28-
if (process.env.https_proxy) {
29-
https_proxy = process.env.https_proxy;
30-
}
31-
else if (process.env.HTTPS_PROXY) {
32-
https_proxy = process.env.HTTPS_PROXY;
33-
}
34-
else if (http_proxy) {
35-
https_proxy = http_proxy;
36-
}
19+
const { httpProxy, httpsProxy } = resolveProxyConfiguration();
3720

38-
if (url.startsWith('https:') && https_proxy) {
39-
nodeFetchOptions.agent = new HttpsProxyAgent(https_proxy);
40-
} else if (url.startsWith('http:') && http_proxy) {
41-
nodeFetchOptions.agent = new HttpProxyAgent(http_proxy);
21+
if (url.startsWith('https:') && httpsProxy) {
22+
nodeFetchOptions.agent = new HttpsProxyAgent(httpsProxy);
23+
} else if (url.startsWith('http:') && httpProxy) {
24+
nodeFetchOptions.agent = new HttpProxyAgent(httpProxy);
4225
}
4326

4427
let response;
src/archivist/fetcher/proxyUtils.js

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/**
 * Resolve the HTTP and HTTPS proxy URLs from proxy environment variables.
 *
 * Precedence, for each protocol, is: lowercase variable (`http_proxy`,
 * `https_proxy`), then uppercase variable (`HTTP_PROXY`, `HTTPS_PROXY`).
 * When no HTTPS proxy is defined, the HTTP proxy is used for HTTPS as well.
 * Unset and empty variables are both treated as "not defined".
 *
 * @param {Object<string, string|undefined>} [env=process.env] - Environment to read the proxy variables from.
 * @returns {{ httpProxy: (string|null), httpsProxy: (string|null) }} Resolved proxy URLs, `null` when none applies.
 */
export function resolveProxyConfiguration(env = process.env) {
  // `||` rather than `??` on purpose: an empty string must fall through to the next candidate.
  const httpProxy = env.http_proxy || env.HTTP_PROXY || null;
  const httpsProxy = env.https_proxy || env.HTTPS_PROXY || httpProxy;

  return { httpProxy, httpsProxy };
}
21+
22+
/**
 * Extract the credentials embedded in the proxy URLs.
 *
 * Both proxies must carry the same credentials, since a single username and
 * password pair is returned for both.
 *
 * @param {string|null} httpProxy - HTTP proxy URL, possibly containing `user:password@`.
 * @param {string|null} [httpsProxy] - HTTPS proxy URL; falls back to `httpProxy` when omitted.
 * @returns {{ username: string, password: string }|null} Decoded credentials, or `null` when
 *   there is no HTTP proxy or it does not carry both a username and a password.
 * @throws {Error} When the HTTP and HTTPS proxies do not share the same credentials.
 * @throws {TypeError} When a proxy URL cannot be parsed by `URL`.
 */
export function extractProxyCredentials(httpProxy, httpsProxy) {
  if (!httpProxy) {
    return null;
  }

  const httpProxyUrl = new URL(httpProxy);
  const httpsProxyUrl = new URL(httpsProxy || httpProxy);

  // Check for a mismatch before the "no credentials" early return, so an HTTPS proxy
  // with credentials next to an HTTP proxy without them is reported instead of ignored.
  if (httpProxyUrl.username !== httpsProxyUrl.username || httpProxyUrl.password !== httpsProxyUrl.password) {
    throw new Error('Unsupported proxies specified, http and https proxy should have the same credentials.');
  }

  // `URL` exposes username and password percent-encoded; consumers need the raw values.
  const username = decodeProxyCredential(httpProxyUrl.username);
  const password = decodeProxyCredential(httpProxyUrl.password);

  if (!username || !password) {
    return null;
  }

  return { username, password };
}

/**
 * Percent-decode a URL credential component, keeping it unchanged when it
 * contains a malformed escape sequence.
 *
 * @param {string} value - Username or password as returned by `URL`.
 * @returns {string} Decoded value, or `value` itself if it cannot be decoded.
 */
function decodeProxyCredential(value) {
  try {
    return decodeURIComponent(value);
  } catch (error) {
    if (error instanceof URIError) {
      return value;
    }

    throw error;
  }
}

0 commit comments

Comments
 (0)