Skip to content

Commit 00d6a12

Browse files
authored
Puppeteer improvements (#270)
* Improve Puppeteer handling: only one Puppeteer instance is now used, which is far more efficient
* Remove package-lock
* Reduce logging
* Remove problematic Docker command
* Remove Immonet; it now belongs to Immowelt
1 parent 0521880 commit 00d6a12

File tree

10 files changed

+105
-13728
lines changed

10 files changed

+105
-13728
lines changed

lib/FredyPipelineExecutioner.js

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,15 @@ class FredyPipelineExecutioner {
6363
* @param {string} providerId The ID of the provider currently in use.
6464
* @param {string} jobKey Key of the job that is currently running (from within the config).
6565
* @param {SimilarityCache} similarityCache Cache instance for checking similar entries.
66+
* @param {object} [browser] Optional shared Puppeteer browser instance; forwarded to the Extractor via its options.
6667
*/
67-
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache) {
68+
constructor(providerConfig, notificationConfig, providerId, jobKey, similarityCache, browser) {
6869
this._providerConfig = providerConfig;
6970
this._notificationConfig = notificationConfig;
7071
this._providerId = providerId;
7172
this._jobKey = jobKey;
7273
this._similarityCache = similarityCache;
74+
this._browser = browser;
7375
}
7476

7577
/**
@@ -119,7 +121,7 @@ class FredyPipelineExecutioner {
119121
* @returns {Promise<Listing[]>} Resolves with an array of listings (empty when none found).
120122
*/
121123
_getListings(url) {
122-
const extractor = new Extractor();
124+
const extractor = new Extractor({ ...this._providerConfig.puppeteerOptions, browser: this._browser });
123125
return new Promise((resolve, reject) => {
124126
extractor
125127
.execute(url, this._providerConfig.waitForSelector)

lib/provider/immonet.js

Lines changed: 0 additions & 53 deletions
This file was deleted.

lib/services/extractor/puppeteerExtractor.js

Lines changed: 68 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -19,52 +19,80 @@ import path from 'path';
1919

2020
puppeteer.use(StealthPlugin());
2121

22+
export async function launchBrowser(url, options) {
23+
const preCfg = getPreLaunchConfig(url, options || {});
24+
const launchArgs = [
25+
'--no-sandbox',
26+
'--disable-gpu',
27+
'--disable-setuid-sandbox',
28+
'--disable-dev-shm-usage',
29+
'--disable-crash-reporter',
30+
'--no-first-run',
31+
'--no-default-browser-check',
32+
preCfg.langArg,
33+
preCfg.windowSizeArg,
34+
...preCfg.extraArgs,
35+
];
36+
if (options?.proxyUrl) {
37+
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
38+
}
39+
40+
let userDataDir;
41+
let removeUserDataDir = false;
42+
if (options && options.userDataDir) {
43+
userDataDir = options.userDataDir;
44+
} else {
45+
const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
46+
userDataDir = fs.mkdtempSync(prefix);
47+
removeUserDataDir = true;
48+
}
49+
50+
const browser = await puppeteer.launch({
51+
headless: options?.puppeteerHeadless ?? true,
52+
args: launchArgs,
53+
timeout: options?.puppeteerTimeout || 30_000,
54+
userDataDir,
55+
executablePath: options?.executablePath,
56+
});
57+
58+
browser.__fredy_userDataDir = userDataDir;
59+
browser.__fredy_removeUserDataDir = removeUserDataDir;
60+
61+
return browser;
62+
}
63+
64+
export async function closeBrowser(browser) {
65+
if (!browser) return;
66+
const userDataDir = browser.__fredy_userDataDir;
67+
const removeUserDataDir = browser.__fredy_removeUserDataDir;
68+
try {
69+
await browser.close();
70+
} catch {
71+
// ignore
72+
}
73+
if (removeUserDataDir && userDataDir) {
74+
try {
75+
await fs.promises.rm(userDataDir, { recursive: true, force: true });
76+
} catch {
77+
// ignore
78+
}
79+
}
80+
}
81+
2282
export default async function execute(url, waitForSelector, options) {
23-
let browser;
83+
let browser = options?.browser;
84+
let isExternalBrowser = !!browser;
2485
let page;
2586
let result;
26-
let userDataDir;
27-
let removeUserDataDir = false;
2887
try {
2988
debug(`Sending request to ${url} using Puppeteer.`);
3089

31-
// Prepare a dedicated temporary userDataDir to avoid leaking /tmp/.org.chromium.* dirs
32-
if (options && options.userDataDir) {
33-
userDataDir = options.userDataDir;
34-
removeUserDataDir = !!options.cleanupUserDataDir;
35-
} else {
36-
const prefix = path.join(os.tmpdir(), 'puppeteer-fredy-');
37-
userDataDir = fs.mkdtempSync(prefix);
38-
removeUserDataDir = true;
90+
if (!isExternalBrowser) {
91+
browser = await launchBrowser(url, options);
3992
}
4093

41-
const launchArgs = [
42-
'--no-sandbox',
43-
'--disable-gpu',
44-
'--disable-setuid-sandbox',
45-
'--disable-dev-shm-usage',
46-
'--disable-crash-reporter',
47-
'--no-first-run',
48-
'--no-default-browser-check',
49-
];
50-
if (options?.proxyUrl) {
51-
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
52-
}
53-
// Prepare bot prevention pre-launch config
54-
const preCfg = getPreLaunchConfig(url, options || {});
55-
launchArgs.push(preCfg.langArg);
56-
launchArgs.push(preCfg.windowSizeArg);
57-
launchArgs.push(...preCfg.extraArgs);
58-
59-
browser = await puppeteer.launch({
60-
headless: options?.puppeteerHeadless ?? true,
61-
args: launchArgs,
62-
timeout: options?.puppeteerTimeout || 30_000,
63-
userDataDir,
64-
executablePath: options?.executablePath, // allow using system Chrome
65-
});
66-
6794
page = await browser.newPage();
95+
const preCfg = getPreLaunchConfig(url, options || {});
6896
await applyBotPreventionToPage(page, preCfg);
6997
// Provide languages value before navigation
7098
await applyLanguagePersistence(page, preCfg);
@@ -104,7 +132,7 @@ export default async function execute(url, waitForSelector, options) {
104132
result = pageSource || (await page.content());
105133
}
106134
} catch (error) {
107-
if (error?.message?.includes('Timeout')) {
135+
if (error?.name?.includes('Timeout')) {
108136
logger.debug('Error executing with puppeteer executor', error);
109137
} else {
110138
logger.warn('Error executing with puppeteer executor', error);
@@ -118,19 +146,8 @@ export default async function execute(url, waitForSelector, options) {
118146
} catch {
119147
// ignore
120148
}
121-
try {
122-
if (browser != null) {
123-
await browser.close();
124-
}
125-
} catch {
126-
// ignore
127-
}
128-
try {
129-
if (removeUserDataDir && userDataDir) {
130-
await fs.promises.rm(userDataDir, { recursive: true, force: true });
131-
}
132-
} catch {
133-
// ignore
149+
if (browser != null && !isExternalBrowser) {
150+
await closeBrowser(browser);
134151
}
135152
}
136153
return result;

lib/services/jobs/jobExecutionService.js

Lines changed: 31 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import FredyPipelineExecutioner from '../../FredyPipelineExecutioner.js';
1313
import * as similarityCache from '../similarity-check/similarityCache.js';
1414
import { isRunning, markFinished, markRunning } from './run-state.js';
1515
import { sendToUsers } from '../sse/sse-broker.js';
16+
import * as puppeteerExtractor from '../extractor/puppeteerExtractor.js';
1617

1718
/**
1819
* Initializes the job execution service.
@@ -94,7 +95,7 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
9495
* @param {{userId?: string, isAdmin?: boolean}} [context] - Who requested the run; determines job filtering.
9596
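* @returns {Promise<void>} Resolves after the selected jobs have been executed one after another (or immediately when the run is skipped).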
9697
*/
97-
function runAll(respectWorkingHours = true, context = undefined) {
98+
async function runAll(respectWorkingHours = true, context = undefined) {
9899
if (settings.demoMode) return;
99100
const now = Date.now();
100101
const withinHours = duringWorkingHoursOrNotSet(settings, now);
@@ -103,15 +104,18 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
103104
return;
104105
}
105106
settings.lastRun = now;
106-
jobStorage
107+
const jobs = jobStorage
107108
.getJobs()
108109
.filter((job) => job.enabled)
109110
.filter((job) => {
110111
if (!context) return true; // startup/cron → all
111112
if (context.isAdmin) return true; // admin → all
112113
return context.userId ? job.userId === context.userId : false; // user → own
113-
})
114-
.forEach((job) => executeJob(job));
114+
});
115+
116+
for (const job of jobs) {
117+
await executeJob(job);
118+
}
115119
}
116120

117121
/**
@@ -154,28 +158,36 @@ export function initJobExecutionService({ providers, settings, intervalMs }) {
154158
} catch (err) {
155159
logger.warn('Failed to emit start status for job', job.id, err);
156160
}
161+
let browser;
157162
try {
158163
const jobProviders = job.provider.filter(
159164
(p) => providers.find((loaded) => loaded.metaInformation.id === p.id) != null,
160165
);
161-
const executions = jobProviders.map(async (prov) => {
162-
const matchedProvider = providers.find((loaded) => loaded.metaInformation.id === prov.id);
163-
matchedProvider.init(prov, job.blacklist);
164-
await new FredyPipelineExecutioner(
165-
matchedProvider.config,
166-
job.notificationAdapter,
167-
prov.id,
168-
job.id,
169-
similarityCache,
170-
).execute();
171-
});
172-
const results = await Promise.allSettled(executions);
173-
for (const r of results) {
174-
if (r.status === 'rejected') {
175-
logger.error(r.reason);
166+
for (const prov of jobProviders) {
167+
try {
168+
const matchedProvider = providers.find((loaded) => loaded.metaInformation.id === prov.id);
169+
matchedProvider.init(prov, job.blacklist);
170+
171+
if (!browser && matchedProvider.config.getListings == null) {
172+
browser = await puppeteerExtractor.launchBrowser(matchedProvider.config.url, {});
173+
}
174+
175+
await new FredyPipelineExecutioner(
176+
matchedProvider.config,
177+
job.notificationAdapter,
178+
prov.id,
179+
job.id,
180+
similarityCache,
181+
browser,
182+
).execute();
183+
} catch (err) {
184+
logger.error(err);
176185
}
177186
}
178187
} finally {
188+
if (browser) {
189+
await puppeteerExtractor.closeBrowser(browser);
190+
}
179191
markFinished(job.id);
180192
try {
181193
bus.emit('jobs:status', { jobId: job.id, running: false });

0 commit comments

Comments
 (0)