Skip to content

Commit 8f92235

Browse files
committed
Move fullDomFetcher to Playwright
1 parent 79f72d6 commit 8f92235

File tree

2 files changed

+83
-76
lines changed

2 files changed

+83
-76
lines changed

package.json

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,10 +94,8 @@
9494
"morgan": "^1.10.0",
9595
"node-fetch": "^3.1.0",
9696
"octokit": "2.0.2",
97+
"patchright": "1.50.1",
9798
"pdfjs-dist": "^2.9.359",
98-
"puppeteer": "^22.8.1",
99-
"puppeteer-extra": "^3.3.6",
100-
"puppeteer-extra-plugin-stealth": "^2.11.2",
10199
"sib-api-v3-sdk": "^8.2.1",
102100
"simple-git": "^3.27.0",
103101
"swagger-jsdoc": "^6.2.8",
Lines changed: 82 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -1,94 +1,103 @@
1-
import { TimeoutError } from 'puppeteer';
2-
import puppeteer from 'puppeteer-extra';
3-
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
1+
import fs from 'fs';
2+
import os from 'os';
3+
import path from 'path';
4+
5+
import { chromium, errors } from 'patchright';
46

5-
puppeteer.use(stealthPlugin());
67

78
let browser;
89

910
export default async function fetch(url, cssSelectors, config) {
10-
let page;
11-
let response;
12-
const selectors = [].concat(cssSelectors);
13-
14-
if (!browser) {
15-
throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".');
16-
}
17-
18-
try {
19-
page = await browser.newPage();
20-
21-
await page.setDefaultNavigationTimeout(config.navigationTimeout);
22-
await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });
23-
24-
response = await page.goto(url, { waitUntil: 'networkidle0' });
11+
let page;
12+
let response;
13+
const selectors = [].concat(cssSelectors);
2514

26-
if (!response) {
27-
throw new Error(`Response is empty when trying to fetch '${url}'`);
15+
if (!browser) {
16+
throw new Error('The headless browser should be controlled manually with "launchHeadlessBrowser" and "stopHeadlessBrowser".');
2817
}
2918

30-
const statusCode = response.status();
31-
32-
if (statusCode < 200 || (statusCode >= 300 && statusCode !== 304)) {
33-
throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
19+
try {
20+
page = await browser.newPage();
21+
22+
await page.setDefaultNavigationTimeout(config.navigationTimeout);
23+
await page.setExtraHTTPHeaders({ 'Accept-Language': config.language });
24+
25+
response = await page.goto(url);
26+
27+
if (!response) {
28+
throw new Error(`Response is empty when trying to fetch '${url}'`);
29+
}
30+
31+
const statusCode = response.status();
32+
33+
if (statusCode < 200 || (statusCode >= 300 && statusCode !== 304)) {
34+
throw new Error(`Received HTTP code ${statusCode} when trying to fetch '${url}'`);
35+
}
36+
37+
const waitForSelectorsPromises = selectors.map(selector => page.waitForSelector(selector, { timeout: config.waitForElementsTimeout }));
38+
39+
// We expect all elements to be present on the page…
40+
await Promise.all(waitForSelectorsPromises).catch(error => {
41+
if (error.name == 'TimeoutError') {
42+
// however, if they are not, this is not considered as an error since selectors may be out of date
43+
// and the whole content of the page should still be returned.
44+
return;
45+
}
46+
47+
throw error;
48+
});
49+
50+
return {
51+
mimeType: 'text/html',
52+
content: await page.content(),
53+
};
54+
} catch (error) {
55+
if (error instanceof errors.TimeoutError) {
56+
throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
57+
}
58+
throw new Error(error.message);
59+
} finally {
60+
if (page) {
61+
await page.close();
62+
}
3463
}
64+
}
3565

36-
const waitForSelectorsPromises = selectors.map(selector => page.waitForSelector(selector, { timeout: config.waitForElementsTimeout }));
37-
38-
// We expect all elements to be present on the page…
39-
await Promise.all(waitForSelectorsPromises).catch(error => {
40-
if (error.name == 'TimeoutError') {
41-
// however, if they are not, this is not considered as an error since selectors may be out of date
42-
// and the whole content of the page should still be returned.
43-
return;
44-
}
45-
46-
throw error;
47-
});
66+
export async function launchHeadlessBrowser() {
67+
if (browser) {
68+
return browser;
69+
}
4870

49-
return {
50-
mimeType: 'text/html',
51-
content: await page.content(),
71+
let options = {
72+
channel: "chrome",
73+
viewport: null,
5274
};
53-
} catch (error) {
54-
if (error instanceof TimeoutError) {
55-
throw new Error(`Timed out after ${config.navigationTimeout / 1000} seconds when trying to fetch '${url}'`);
75+
if (process.env.http_proxy) {
76+
let proxyUrl = new URL(process.env.http_proxy);
77+
options['proxy'] = {
78+
server: proxyUrl.origin,
79+
username: proxyUrl.username,
80+
password: proxyUrl.password
81+
}
5682
}
57-
throw new Error(error.message);
58-
} finally {
59-
if (page) {
60-
await page.close();
83+
if (process.env.PLAYWRIGHT_NO_SANDBOX) {
84+
options['args'] = ["--no-sandbox", "--disable-setuid-sandbox"];
6185
}
62-
}
63-
}
86+
if (process.env.PLAYWRIGHT_NO_HEADLESS) {
87+
options['headless'] = false;
88+
}
89+
const userDataDir = fs.mkdtempSync(path.join(os.tmpdir(), 'ota'));
90+
browser = await chromium.launchPersistentContext(userDataDir, options);
6491

65-
/**
66-
* Launches a headless browser instance using Puppeteer if one is not already running. Returns the existing browser instance if one is already running, otherwise creates and returns a new instance.
67-
* @function launchHeadlessBrowser
68-
* @returns {Promise<puppeteer.Browser>} The Puppeteer browser instance.
69-
* @async
70-
*/
71-
export async function launchHeadlessBrowser() {
72-
if (browser) {
7392
return browser;
74-
}
75-
76-
browser = await puppeteer.launch({ headless: true });
77-
78-
return browser;
7993
}
8094

81-
/**
82-
* Stops the headless browser instance if one is running. If no instance exists, it does nothing.
83-
* @function stopHeadlessBrowser
84-
* @returns {Promise<void>}
85-
* @async
86-
*/
8795
export async function stopHeadlessBrowser() {
88-
if (!browser) {
89-
return;
90-
}
96+
if (!browser) {
97+
return;
98+
}
9199

92-
await browser.close();
93-
browser = null;
100+
await browser.close();
101+
browser = null;
94102
}
103+

0 commit comments

Comments
 (0)