Skip to content

Commit 22df683

Browse files
committed
more efficient bot protection
1 parent 4aab850 commit 22df683

File tree

3 files changed

+389
-58
lines changed

3 files changed

+389
-58
lines changed
Lines changed: 274 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,274 @@
1+
import { DEFAULT_HEADER } from './utils.js';
2+
3+
/**
 * Coerce a value to an integer, falling back to a default when that is not possible.
 *
 * Numbers are truncated directly: `parseInt` stringifies its argument first, so very
 * small or very large numbers in exponent notation (e.g. `5e-7` -> "5e-7") would
 * otherwise be parsed from their leading digit only.
 *
 * @param {unknown} v - Value to coerce (number, numeric string, ...).
 * @param {number} d - Fallback returned when `v` does not yield a finite integer.
 * @returns {number}
 */
const toInt = (v, d) => {
  const n = typeof v === 'number' ? Math.trunc(v) : Number.parseInt(v, 10);
  return Number.isFinite(n) ? n : d;
};
8+
9+
/**
 * Compute pre-launch configuration and flags for Puppeteer with bot prevention in mind.
 *
 * The result bundles everything derived from `url` and `options` in one place:
 * Accept-Language value and the language tag used for `--lang`, user agent, viewport
 * (with optional jitter), Chromium launch arguments, extra HTTP headers, timezone and
 * the `humanDelay` switch.
 *
 * @param {string} url - Target URL; its hostname is used for the default Referer.
 * @param {object} [options]
 * @param {string} [options.acceptLanguage] - Accept-Language header value.
 * @param {string} [options.userAgent] - User agent override.
 * @param {{width?: number|string, height?: number|string, deviceScaleFactor?: number|string}} [options.viewport]
 * @param {boolean} [options.viewportJitter] - Pass `false` to disable the 0..5 px jitter.
 * @param {string} [options.referer] - Referer header override.
 * @param {string} [options.timezone] - Timezone override (defaults to 'Europe/Berlin').
 * @param {boolean} [options.humanDelay] - Pass `false` to disable post-navigation signals.
 * @returns {object} Configuration consumed by the other helpers in this module.
 * @throws {TypeError} When `url` cannot be parsed by `new URL`.
 */
export function getPreLaunchConfig(url, options = {}) {
  // The default parameter only covers `undefined`; tolerate an explicit `null` as well.
  const opts = options ?? {};
  const { hostname } = new URL(url);

  const acceptLanguage = opts.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
  // Only the first language tag goes into the flag; drop a ";q=" weight if it has one.
  const langForFlag = acceptLanguage.split(',')[0].split(';')[0].trim();

  const baseViewport = { width: 1366, height: 768, deviceScaleFactor: 1 };
  // The same offset is applied to width and height.
  const jitter = opts.viewportJitter !== false ? Math.floor(Math.random() * 6) : 0; // 0..5 px
  const width = toInt(opts.viewport?.width, baseViewport.width) + jitter;
  const height = toInt(opts.viewport?.height, baseViewport.height) + jitter;

  // Scale factors may be fractional (e.g. 1.5), so they must not go through an integer parser.
  const requestedScale = Number.parseFloat(opts.viewport?.deviceScaleFactor);
  const deviceScaleFactor =
    Number.isFinite(requestedScale) && requestedScale >= 0 ? requestedScale : baseViewport.deviceScaleFactor;
  const viewport = { width, height, deviceScaleFactor };

  const userAgent =
    opts.userAgent ||
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';

  const windowSizeArg = `--window-size=${viewport.width},${viewport.height}`;
  const langArg = `--lang=${langForFlag}`;

  const extraArgs = [
    '--disable-blink-features=AutomationControlled',
    '--force-webrtc-ip-handling-policy=disable_non_proxied_udp',
    '--webrtc-ip-handling-policy=default_public_interface_only',
    '--proxy-bypass-list=<-loopback>',
  ];

  const headers = {
    ...DEFAULT_HEADER,
    'Accept-Language': acceptLanguage,
    'User-Agent': userAgent,
    Referer: opts.referer || `https://${hostname}/`,
    Connection: 'keep-alive',
    DNT: '1',
  };

  const timezone = opts.timezone || 'Europe/Berlin';

  return {
    acceptLanguage,
    langForFlag,
    userAgent,
    viewport,
    windowSizeArg,
    langArg,
    extraArgs,
    headers,
    timezone,
    humanDelay: opts.humanDelay !== false,
  };
}
67+
68+
/**
 * Apply bot-prevention hardening to a Puppeteer page.
 *
 * Sets user agent, viewport, JavaScript enabled and extra HTTP headers from `cfg`,
 * tries to emulate the configured timezone (failures are ignored), and registers a
 * script through `page.evaluateOnNewDocument` that overrides a set of navigator and
 * window properties.
 *
 * @param {import('puppeteer').Page} page
 * @param {ReturnType<typeof getPreLaunchConfig>} cfg
 * @returns {Promise<void>}
 */
export async function applyBotPreventionToPage(page, cfg) {
  await page.setUserAgent(cfg.userAgent);
  await page.setViewport(cfg.viewport);
  await page.setJavaScriptEnabled(true);
  await page.setExtraHTTPHeaders(cfg.headers);
  try {
    if (cfg.timezone) await page.emulateTimezone(cfg.timezone);
  } catch {
    // ignore timezone failures
  }

  // Inject patches as early as possible.
  // The callback is handed to Puppeteer, so it only uses names it defines itself
  // or browser globals - nothing from this module's scope.
  await page.evaluateOnNewDocument(() => {
    // Run each patch in isolation so that one failing override cannot skip the rest.
    const attempt = (patch) => {
      try {
        patch();
      } catch {
        //noop
      }
    };
    const defineGetter = (target, key, get) => Object.defineProperty(target, key, { get });

    // webdriver
    attempt(() => defineGetter(navigator, 'webdriver', () => undefined));

    // chrome runtime
    attempt(() => {
      // @ts-ignore
      if (!window.chrome) {
        // @ts-ignore
        window.chrome = { runtime: {} };
      }
    });

    // languages
    attempt(() =>
      defineGetter(navigator, 'languages', () => {
        let stored = null;
        try {
          stored = window.localStorage.getItem('__LANGS__');
        } catch {
          // Storage access can throw; the getter itself must never throw into page code.
        }
        return (stored || 'de-DE,de').split(',');
      }),
    );

    // plugins
    attempt(() => defineGetter(navigator, 'plugins', () => [{}, {}, {}]));

    // platform and concurrency hints
    attempt(() => defineGetter(navigator, 'platform', () => 'Win32'));
    attempt(() => {
      if (typeof navigator.hardwareConcurrency === 'number' && navigator.hardwareConcurrency < 2) {
        defineGetter(navigator, 'hardwareConcurrency', () => 4);
      }
    });
    attempt(() => {
      // @ts-ignore
      if (typeof navigator.deviceMemory === 'number' && navigator.deviceMemory < 2) {
        defineGetter(navigator, 'deviceMemory', () => 8);
      }
    });

    // userAgentData (Client Hints)
    attempt(() => {
      if (!('userAgentData' in navigator)) return;
      defineGetter(navigator, 'userAgentData', () => ({
        brands: [
          { brand: 'Chromium', version: '126' },
          { brand: 'Google Chrome', version: '126' },
        ],
        mobile: false,
        platform: 'Windows',
        getHighEntropyValues: async (hints) => {
          const values = {
            platform: 'Windows',
            platformVersion: '15.0.0',
            architecture: 'x86',
            model: '',
            uaFullVersion: '126.0.0.0',
            bitness: '64',
          };
          const out = {};
          // Only hints that were asked for and that we have a value for are returned.
          for (const k of hints || []) if (k in values) out[k] = values[k];
          return out;
        },
      }));
    });

    // Permissions API
    attempt(() => {
      const permissions = navigator.permissions;
      const origQuery = permissions && permissions.query;
      if (!origQuery) return;
      // @ts-ignore
      permissions.query = (parameters) =>
        origQuery.call(permissions, parameters).then((result) => {
          if (parameters && parameters.name === 'notifications' && typeof Notification !== 'undefined') {
            // Notification.permission uses 'default' where a permission state uses 'prompt'.
            defineGetter(result, 'state', () =>
              Notification.permission === 'default' ? 'prompt' : Notification.permission,
            );
          }
          return result;
        });
    });

    // WebGL vendor/renderer
    const patchWebGL = (proto) => {
      if (!proto || !proto.getParameter) return;
      const getParameter = proto.getParameter;
      // @ts-ignore
      proto.getParameter = function (param) {
        const UNMASKED_VENDOR_WEBGL = 0x9245;
        const UNMASKED_RENDERER_WEBGL = 0x9246;
        if (param === UNMASKED_VENDOR_WEBGL) return 'Google Inc.';
        if (param === UNMASKED_RENDERER_WEBGL) {
          return 'ANGLE (NVIDIA, NVIDIA GeForce GTX 1660 Ti Direct3D11 vs_5_0 ps_5_0)';
        }
        return getParameter.call(this, param);
      };
    };
    // Looked up on `window`: a bare identifier would throw a ReferenceError when the
    // constructor is not declared, which optional chaining does not guard against.
    // @ts-ignore
    attempt(() => patchWebGL(window.WebGLRenderingContext?.prototype));
    // @ts-ignore
    attempt(() => patchWebGL(window.WebGL2RenderingContext?.prototype));

    // AudioContext: oscillators created through a context get a `start` that passes 0
    // when no start time is given.
    const patchAudio = (Ctx) => {
      if (!Ctx) return;
      const proto = Ctx.prototype;
      const createOsc = proto.createOscillator;
      proto.createOscillator = function () {
        const osc = createOsc.call(this);
        const start = osc.start;
        osc.start = function (when) {
          return start.call(this, when || 0);
        };
        return osc;
      };
    };
    // @ts-ignore
    attempt(() => patchAudio(window.AudioContext));
    // @ts-ignore
    attempt(() => patchAudio(window.OfflineAudioContext));

    // Navigator.connection
    attempt(() => defineGetter(navigator, 'connection', () => undefined));

    // Consistent outer sizes: derived from the inner size on every read so the values
    // stay in step with the inner dimensions if those change later.
    attempt(() => defineGetter(window, 'outerWidth', () => window.innerWidth + 16));
    attempt(() => defineGetter(window, 'outerHeight', () => window.innerHeight + 88));
  });
}
239+
240+
/**
 * Persist the language list before navigation via localStorage.
 *
 * Registers a script with `page.evaluateOnNewDocument` that stores a comma-separated
 * list of language tags under the `__LANGS__` key. The list is derived from
 * `cfg.acceptLanguage`: the header is split into its comma-separated entries and each
 * entry's optional `;q=` weight is removed, so every language in the header is kept.
 *
 * @param {import('puppeteer').Page} page
 * @param {ReturnType<typeof getPreLaunchConfig>} cfg
 * @returns {Promise<void>}
 */
export async function applyLanguagePersistence(page, cfg) {
  const langs = String(cfg.acceptLanguage || '')
    .split(',')
    .map((entry) => entry.split(';')[0].trim())
    .filter(Boolean)
    .join(',');

  await page.evaluateOnNewDocument((value) => {
    try {
      window.localStorage.setItem('__LANGS__', value);
    } catch {
      // noop - storage may be unavailable for this document
    }
  }, langs);
}
254+
255+
/**
 * Perform subtle human-like interactions post navigation.
 *
 * Does nothing when `cfg.humanDelay` is falsy. Otherwise waits a random 200-599 ms,
 * then moves the mouse to a random point in the central 40% of the viewport (10-19
 * steps) and scrolls the wheel down by 100-299 px. Errors from the mouse interaction
 * are ignored.
 *
 * @param {import('puppeteer').Page} page
 * @param {ReturnType<typeof getPreLaunchConfig>} cfg
 * @returns {Promise<void>}
 */
export async function applyPostNavigationHumanSignals(page, cfg) {
  if (!cfg.humanDelay) return;

  const delay = 200 + Math.floor(Math.random() * 400);
  await new Promise((res) => setTimeout(res, delay));

  try {
    const { width: vw, height: vh } = cfg.viewport;
    // Target somewhere between 30% and 70% of each dimension.
    const mx = Math.floor(vw * (0.3 + Math.random() * 0.4));
    const my = Math.floor(vh * (0.3 + Math.random() * 0.4));
    await page.mouse.move(mx, my, { steps: 10 + Math.floor(Math.random() * 10) });
    await page.mouse.wheel({ deltaY: 100 + Math.floor(Math.random() * 200) });
  } catch {
    // ignore if mouse is unavailable
  }
}

lib/services/extractor/puppeteerExtractor.js

Lines changed: 16 additions & 58 deletions
Original file line number | Diff line number | Diff line change
@@ -1,11 +1,16 @@
11
import puppeteer from 'puppeteer-extra';
22
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
3-
import { debug, DEFAULT_HEADER, botDetected } from './utils.js';
3+
import { debug, botDetected } from './utils.js';
4+
import {
5+
getPreLaunchConfig,
6+
applyBotPreventionToPage,
7+
applyLanguagePersistence,
8+
applyPostNavigationHumanSignals,
9+
} from './botPrevention.js';
410
import logger from '../logger.js';
511
import fs from 'fs';
612
import os from 'os';
713
import path from 'path';
8-
import { URL } from 'url';
914

1015
puppeteer.use(StealthPlugin());
1116

@@ -40,6 +45,11 @@ export default async function execute(url, waitForSelector, options) {
4045
if (options?.proxyUrl) {
4146
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
4247
}
48+
// Prepare bot prevention pre-launch config
49+
const preCfg = getPreLaunchConfig(url, options || {});
50+
launchArgs.push(preCfg.langArg);
51+
launchArgs.push(preCfg.windowSizeArg);
52+
launchArgs.push(...preCfg.extraArgs);
4353

4454
browser = await puppeteer.launch({
4555
headless: options?.puppeteerHeadless ?? true,
@@ -50,58 +60,9 @@ export default async function execute(url, waitForSelector, options) {
5060
});
5161

5262
page = await browser.newPage();
53-
54-
// Derive domain-specific defaults
55-
const { hostname } = new URL(url);
56-
57-
// Set a realistic modern user agent unless provided
58-
const userAgent =
59-
options?.userAgent ||
60-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';
61-
await page.setUserAgent(userAgent);
62-
63-
// Viewport and device scale for typical desktop
64-
await page.setViewport({ width: 1366, height: 768, deviceScaleFactor: 1 });
65-
66-
// Extra HTTP headers with localized Accept-Language
67-
const acceptLanguage = options?.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
68-
const headers = {
69-
...DEFAULT_HEADER,
70-
'Accept-Language': acceptLanguage,
71-
'User-Agent': userAgent,
72-
Referer: options?.referer || `https://${hostname}/`,
73-
Connection: 'keep-alive',
74-
DNT: '1',
75-
};
76-
await page.setExtraHTTPHeaders(headers);
77-
78-
// Timezone and locale tweaks to look German when needed
79-
try {
80-
const tz = options?.timezone || 'Europe/Berlin';
81-
if (tz) await page.emulateTimezone(tz);
82-
} catch {
83-
//noop
84-
}
85-
86-
// Harden navigator properties (stealth already covers many, but we ensure critical ones)
87-
await page.evaluateOnNewDocument(() => {
88-
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
89-
// Plugins and mimeTypes
90-
// @ts-ignore
91-
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
92-
// @ts-ignore
93-
Object.defineProperty(navigator, 'languages', {
94-
get: () => (window.localStorage.getItem('__LANGS__') || 'de-DE,de').split(','),
95-
});
96-
});
63+
await applyBotPreventionToPage(page, preCfg);
9764
// Provide languages value before navigation
98-
await page.evaluateOnNewDocument((langs) => {
99-
try {
100-
window.localStorage.setItem('__LANGS__', langs);
101-
} catch {
102-
//noop
103-
}
104-
}, acceptLanguage.split(';')[0]);
65+
await applyLanguagePersistence(page, preCfg);
10566

10667
// Optional cookies
10768
if (Array.isArray(options?.cookies) && options.cookies.length > 0) {
@@ -113,11 +74,8 @@ export default async function execute(url, waitForSelector, options) {
11374
waitUntil: options?.waitUntil || 'domcontentloaded',
11475
});
11576

116-
// Optionally wait a random small delay to mimic human rendering time
117-
if (options?.humanDelay !== false) {
118-
const delay = 200 + Math.floor(Math.random() * 400);
119-
await new Promise((res) => setTimeout(res, delay));
120-
}
77+
// Optionally wait and add subtle human-like interactions
78+
await applyPostNavigationHumanSignals(page, preCfg);
12179

12280
let pageSource;
12381
// if we're extracting data from a SPA, we must wait for the selector

0 commit comments

Comments
 (0)