Skip to content

Commit 0606122

Browse files
committed
improving bot detection prevention
1 parent 53d5098 commit 0606122

File tree

3 files changed

+663
-271
lines changed

3 files changed

+663
-271
lines changed

lib/services/extractor/puppeteerExtractor.js

Lines changed: 87 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import logger from '../logger.js';
55
import fs from 'fs';
66
import os from 'os';
77
import path from 'path';
8+
import { URL } from 'url';
89

910
puppeteer.use(StealthPlugin());
1011

@@ -27,23 +28,97 @@ export default async function execute(url, waitForSelector, options) {
2728
removeUserDataDir = true;
2829
}
2930

31+
const launchArgs = [
32+
'--no-sandbox',
33+
'--disable-gpu',
34+
'--disable-setuid-sandbox',
35+
'--disable-dev-shm-usage',
36+
'--disable-crash-reporter',
37+
'--no-first-run',
38+
'--no-default-browser-check',
39+
];
40+
if (options?.proxyUrl) {
41+
launchArgs.push(`--proxy-server=${options.proxyUrl}`);
42+
}
43+
3044
browser = await puppeteer.launch({
31-
headless: options.puppeteerHeadless ?? true,
32-
args: [
33-
'--no-sandbox',
34-
'--disable-gpu',
35-
'--disable-setuid-sandbox',
36-
'--disable-dev-shm-usage',
37-
'--disable-crash-reporter',
38-
],
39-
timeout: options.puppeteerTimeout || 30_000,
45+
headless: options?.puppeteerHeadless ?? true,
46+
args: launchArgs,
47+
timeout: options?.puppeteerTimeout || 30_000,
4048
userDataDir,
49+
executablePath: options?.executablePath, // allow using system Chrome
4150
});
51+
4252
page = await browser.newPage();
43-
await page.setExtraHTTPHeaders(DEFAULT_HEADER);
53+
54+
// Derive domain-specific defaults
55+
const { hostname } = new URL(url);
56+
57+
// Set a realistic modern user agent unless provided
58+
const userAgent =
59+
options?.userAgent ||
60+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36';
61+
await page.setUserAgent(userAgent);
62+
63+
// Viewport and device scale for typical desktop
64+
await page.setViewport({ width: 1366, height: 768, deviceScaleFactor: 1 });
65+
66+
// Extra HTTP headers with localized Accept-Language
67+
const acceptLanguage = options?.acceptLanguage || 'de-DE,de;q=0.9,en-US;q=0.7,en;q=0.5';
68+
const headers = {
69+
...DEFAULT_HEADER,
70+
'Accept-Language': acceptLanguage,
71+
'User-Agent': userAgent,
72+
Referer: options?.referer || `https://${hostname}/`,
73+
Connection: 'keep-alive',
74+
DNT: '1',
75+
};
76+
await page.setExtraHTTPHeaders(headers);
77+
78+
// Emulate a timezone (default: Europe/Berlin) so the browser appears German; emulation failures are ignored
79+
try {
80+
const tz = options?.timezone || 'Europe/Berlin';
81+
if (tz) await page.emulateTimezone(tz);
82+
} catch {
83+
//noop
84+
}
85+
86+
// Harden navigator properties (stealth already covers many, but we ensure critical ones)
87+
await page.evaluateOnNewDocument(() => {
88+
Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
89+
// Plugins and mimeTypes
90+
// @ts-ignore
91+
Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3] });
92+
// @ts-ignore
93+
Object.defineProperty(navigator, 'languages', {
94+
get: () => (window.localStorage.getItem('__LANGS__') || 'de-DE,de').split(','),
95+
});
96+
});
97+
// Store the languages value in localStorage ('__LANGS__') on every new document so the navigator.languages getter above can read it
98+
await page.evaluateOnNewDocument((langs) => {
99+
try {
100+
window.localStorage.setItem('__LANGS__', langs);
101+
} catch {
102+
//noop
103+
}
104+
}, acceptLanguage.split(';')[0]);
105+
106+
// Optional cookies
107+
if (Array.isArray(options?.cookies) && options.cookies.length > 0) {
108+
await page.setCookie(...options.cookies);
109+
}
110+
111+
// Navigation
44112
const response = await page.goto(url, {
45-
waitUntil: 'domcontentloaded',
113+
waitUntil: options?.waitUntil || 'domcontentloaded',
46114
});
115+
116+
// Wait a small random delay (200-599 ms) to mimic human rendering time, unless options.humanDelay === false
117+
if (options?.humanDelay !== false) {
118+
const delay = 200 + Math.floor(Math.random() * 400);
119+
await new Promise((res) => setTimeout(res, delay));
120+
}
121+
47122
let pageSource;
48123
// if we're extracting data from a SPA, we must wait for the selector
49124
if (waitForSelector != null) {
@@ -57,7 +132,7 @@ export default async function execute(url, waitForSelector, options) {
57132
pageSource = await page.content();
58133
}
59134

60-
const statusCode = response.status();
135+
const statusCode = response?.status?.() ?? 200;
61136

62137
if (botDetected(pageSource, statusCode)) {
63138
logger.warn('We have been detected as a bot :-/ Tried url: => ', url);

package.json

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "fredy",
3-
"version": "14.3.3",
3+
"version": "14.3.4",
44
"description": "[F]ind [R]eal [E]states [d]amn eas[y].",
55
"scripts": {
66
"prepare": "husky",
@@ -56,13 +56,13 @@
5656
"Firefox ESR"
5757
],
5858
"dependencies": {
59-
"@douyinfe/semi-icons": "^2.87.1",
60-
"@douyinfe/semi-ui": "2.87.1",
59+
"@douyinfe/semi-icons": "^2.88.0",
60+
"@douyinfe/semi-ui": "2.88.0",
6161
"@sendgrid/mail": "8.1.6",
62-
"@visactor/react-vchart": "^2.0.5",
63-
"@visactor/vchart": "^2.0.5",
62+
"@visactor/react-vchart": "^2.0.8",
63+
"@visactor/vchart": "^2.0.8",
6464
"@visactor/vchart-semi-theme": "^1.12.2",
65-
"@vitejs/plugin-react": "5.1.0",
65+
"@vitejs/plugin-react": "5.1.1",
6666
"better-sqlite3": "^12.4.1",
6767
"body-parser": "2.2.0",
6868
"cheerio": "^1.1.2",
@@ -73,21 +73,21 @@
7373
"node-cron": "^4.2.1",
7474
"node-fetch": "3.3.2",
7575
"node-mailjet": "6.0.11",
76-
"p-throttle": "^8.0.0",
76+
"p-throttle": "^8.1.0",
7777
"package-up": "^5.0.0",
78-
"puppeteer": "^24.27.0",
78+
"puppeteer": "^24.30.0",
7979
"puppeteer-extra": "^3.3.6",
8080
"puppeteer-extra-plugin-stealth": "^2.11.2",
8181
"query-string": "9.3.1",
8282
"react": "18.3.1",
8383
"react-dom": "18.3.1",
84-
"react-router": "7.9.5",
85-
"react-router-dom": "7.9.5",
84+
"react-router": "7.9.6",
85+
"react-router-dom": "7.9.6",
8686
"restana": "5.1.0",
8787
"semver": "^7.7.3",
8888
"serve-static": "2.2.0",
8989
"slack": "11.0.2",
90-
"vite": "7.1.12",
90+
"vite": "7.2.2",
9191
"x-var": "^3.0.1",
9292
"zustand": "^5.0.8"
9393
},
@@ -96,17 +96,17 @@
9696
"@babel/eslint-parser": "7.28.5",
9797
"@babel/preset-env": "7.28.5",
9898
"@babel/preset-react": "7.28.5",
99-
"chai": "6.2.0",
100-
"eslint": "9.39.0",
99+
"chai": "6.2.1",
100+
"eslint": "9.39.1",
101101
"eslint-config-prettier": "10.1.8",
102102
"eslint-plugin-react": "7.37.5",
103103
"esmock": "2.7.3",
104104
"history": "5.3.0",
105105
"husky": "9.1.7",
106106
"less": "4.4.2",
107107
"lint-staged": "16.2.6",
108-
"mocha": "11.7.4",
109-
"nodemon": "^3.1.10",
108+
"mocha": "11.7.5",
109+
"nodemon": "^3.1.11",
110110
"prettier": "3.6.2"
111111
}
112112
}

0 commit comments

Comments
 (0)