Skip to content

Commit acaf52c

Browse files
committed
Feat: Add device fingerprint
1 parent 4484101 commit acaf52c

File tree

5 files changed

+228
-26
lines changed

5 files changed

+228
-26
lines changed

src/api.ts

Lines changed: 109 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ import {
1414
logError,
1515
logSuccess,
1616
logWarn,
17-
mkdirDirSync
17+
mkdirDirSync,
18+
random
1819
} from './utils'
1920

2021
import {
@@ -216,7 +217,8 @@ function loaderCommonConfig(
216217
// 1.detailTargets
217218
crawlConfig.detailTargets.forEach((detail) => {
218219
// detail > advanced > app
219-
const { url, timeout, proxy, maxRetry, priority, headers } = detail
220+
const { url, timeout, proxy, maxRetry, priority, headers, fingerprint } =
221+
detail
220222

221223
// 1.1.baseUrl
222224
if (!isUndefined(xCrawlConfig.baseUrl)) {
@@ -259,6 +261,75 @@ function loaderCommonConfig(
259261
if (isUndefined(headers)) {
260262
detail.headers = advancedConfig.headers
261263
}
264+
265+
// 1.7.fingerprint(公共部分)
266+
if (fingerprint) {
267+
const { userAgent, ua, platform, mobile, acceptLanguage } = fingerprint
268+
let headers = detail.headers
269+
270+
if (!headers) {
271+
detail.headers = headers = {}
272+
}
273+
274+
// 1.user-agent
275+
if (userAgent) {
276+
headers['user-agent'] = userAgent
277+
}
278+
279+
// 2.sec-ch-ua
280+
if (ua) {
281+
headers['sec-ch-ua'] = ua
282+
}
283+
284+
// 3.sec-ch-platform
285+
if (platform) {
286+
headers['sec-ch-platform'] = platform
287+
}
288+
289+
// 4.sec-ch-mobile
290+
if (mobile) {
291+
headers['sec-ch-mobile'] = mobile
292+
}
293+
294+
// 4.accept-language
295+
if (acceptLanguage) {
296+
headers['accept-language'] = acceptLanguage
297+
}
298+
} else if (isUndefined(fingerprint) && advancedConfig.fingerprint) {
299+
const { userAgents, uas, platforms, mobiles, acceptLanguages } =
300+
advancedConfig.fingerprint
301+
let headers = detail.headers
302+
303+
if (!headers) {
304+
detail.headers = headers = {}
305+
}
306+
307+
// 1.user-agent
308+
if (userAgents) {
309+
headers['user-agent'] = userAgents[random(userAgents.length)]
310+
}
311+
312+
// 2.sec-ch-ua
313+
if (uas) {
314+
headers['sec-ch-ua'] = uas[random(uas.length)]
315+
}
316+
317+
// 3.sec-ch-platform
318+
if (platforms) {
319+
headers['sec-ch-platform'] = platforms[random(platforms.length)]
320+
}
321+
322+
// 4.sec-ch-mobile
323+
if (mobiles) {
324+
headers['sec-ch-mobile'] = mobiles[random(mobiles.length)]
325+
}
326+
327+
// 4.accept-language
328+
if (acceptLanguages) {
329+
headers['accept-language'] =
330+
acceptLanguages[random(acceptLanguages.length)]
331+
}
332+
}
262333
})
263334

264335
// 2.intervalTime
@@ -274,6 +345,30 @@ function loaderCommonConfig(
274345
crawlConfig.onCrawlItemComplete = advancedConfig.onCrawlItemComplete
275346
}
276347

348+
/**
 * Gives a page crawl target a viewport whose size is taken from a fingerprint.
 *
 * A fresh viewport object is always assigned to `detail.viewport`; the
 * previous object is never written to. The viewport that was on `detail` may
 * be the single `advancedConfig.viewport` object shared by every target, so
 * mutating it in place would leak one target's size to all of the others
 * (and into the caller's config).
 *
 * @param detail - Page target to update. Fields of an existing viewport other
 *   than `width`/`height` are copied into the new viewport.
 * @param fingerprint - Size bounds. `minHidth` is the lower height bound; the
 *   spelling matches the fingerprint config types and is kept for
 *   compatibility.
 */
function loaderPageDetailFingerprint(
  detail: CrawlPageDetailConfig,
  fingerprint: {
    maxWidth: number
    minWidth?: number
    maxHeight: number
    minHidth?: number
  }
) {
  const { maxWidth, minWidth, maxHeight, minHidth } = fingerprint

  // 1.width / height
  // When both bounds are equal there is nothing to randomize. Otherwise defer
  // to the shared `random(max, min)` helper.
  // NOTE(review): presumably `random` falls back to a lower bound of 0 when
  // `min` is undefined — confirm against ./utils.
  const width = maxWidth === minWidth ? maxWidth : random(maxWidth, minWidth)
  const height =
    maxHeight === minHidth ? maxHeight : random(maxHeight, minHidth)

  // Spreading null/undefined yields no properties, so this also covers the
  // "no viewport yet" case without a separate branch.
  detail.viewport = { ...detail.viewport, width, height }
}
371+
277372
/* Create Config */
278373
/*
279374
每个创建配置函数的返回值都是类似于对应的进阶版(类似 CrawlAdvancedConfig)配置
@@ -318,21 +413,26 @@ function createCrawlPageConfig(
318413
loaderCommonConfig(xCrawlConfig, advancedConfig, crawlPageConfig)
319414

320415
// 装载单独配置
321-
const haveAdvancedCookies = !isUndefined(advancedConfig.cookies)
322-
const haveAdvancedViewport = !isUndefined(advancedConfig.viewport)
323416
crawlPageConfig.detailTargets.forEach((detail) => {
324417
// detail > advanced > xCrawl
325-
const { cookies, viewport } = detail
418+
const { cookies, viewport, fingerprint } = detail
326419

327420
// 1.cookies
328-
if (isUndefined(cookies) && haveAdvancedCookies) {
421+
if (isUndefined(cookies) && advancedConfig.cookies) {
329422
detail.cookies = advancedConfig.cookies
330423
}
331424

332425
// 2.viewport
333-
if (isUndefined(viewport) && haveAdvancedViewport) {
426+
if (isUndefined(viewport) && advancedConfig.viewport) {
334427
detail.viewport = advancedConfig.viewport
335428
}
429+
430+
// 3.fingerprint
431+
if (fingerprint) {
432+
loaderPageDetailFingerprint(detail, fingerprint)
433+
} else if (isUndefined(fingerprint) && advancedConfig.fingerprint) {
434+
loaderPageDetailFingerprint(detail, advancedConfig.fingerprint)
435+
}
336436
})
337437

338438
return crawlPageConfig as CrawlPageConfig
@@ -657,6 +757,8 @@ export function createCrawlPage(xCrawlConfig: LoaderXCrawlConfig) {
657757
const { detailTargets, intervalTime, onCrawlItemComplete } =
658758
createCrawlPageConfig(xCrawlConfig, config)
659759

760+
log(detailTargets)
761+
660762
const extraConfig: ExtraPageConfig = {
661763
errorPageMap: new Map(),
662764
browser: browser!,

src/request.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ function parseHeaders(
4141
) {
4242
const rawHeaders = rawConfig.headers ?? {}
4343
const headers: AnyObject = {
44-
'User-Agent':
45-
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
44+
'user-agent':
45+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
4646
...rawHeaders
4747
}
4848

src/types/api.ts

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,18 @@ export type PageCookies =
3535
| Protocol.Network.CookieParam
3636
| Protocol.Network.CookieParam[]
3737

38+
export type Platform =
39+
| 'Android'
40+
| 'Chrome OS'
41+
| 'Chromium OS'
42+
| 'iOS'
43+
| 'Linux'
44+
| 'macOS'
45+
| 'Windows'
46+
| 'Unknown'
47+
48+
export type Mobile = '?0' | '?1'
49+
3850
// API crawl config
3951
// Common
4052
export interface CrawlCommonConfig {
@@ -50,6 +62,17 @@ export interface CrawlPageDetailConfig extends CrawlCommonConfig {
5062
cookies?: PageCookies | null
5163
priority?: number
5264
viewport?: Viewport | null
65+
fingerprint?: {
66+
maxWidth: number
67+
minWidth: number
68+
maxHeight: number
69+
minHidth: number
70+
userAgent?: string
71+
ua?: string
72+
platform?: Platform
73+
mobile?: Mobile
74+
acceptLanguage?: string
75+
} | null
5376
}
5477

5578
export interface CrawlDataDetailConfig extends CrawlCommonConfig {
@@ -59,6 +82,13 @@ export interface CrawlDataDetailConfig extends CrawlCommonConfig {
5982
params?: AnyObject
6083
data?: any
6184
priority?: number
85+
fingerprint?: {
86+
userAgent?: string
87+
ua?: string
88+
platform?: Platform
89+
mobile?: Mobile
90+
acceptLanguage?: string
91+
} | null
6292
}
6393

6494
export interface CrawlFileDetailConfig extends CrawlCommonConfig {
@@ -68,12 +98,30 @@ export interface CrawlFileDetailConfig extends CrawlCommonConfig {
6898
storeDir?: string | null
6999
fileName?: string
70100
extension?: string | null
101+
fingerprint?: {
102+
userAgent?: string
103+
ua?: string
104+
platform?: Platform
105+
mobile?: Mobile
106+
acceptLanguage?: string
107+
} | null
71108
}
72109

73110
// 2.Advanced
74111
export interface CrawlPageAdvancedConfig extends CrawlCommonConfig {
75112
targets: (string | CrawlPageDetailConfig)[]
76113
intervalTime?: IntervalTime
114+
fingerprint?: {
115+
maxWidth: number
116+
minWidth?: number
117+
maxHeight: number
118+
minHidth?: number
119+
userAgents?: string[]
120+
uas?: string[]
121+
platforms?: Platform[]
122+
mobiles?: Mobile[]
123+
acceptLanguages?: string[]
124+
}
77125

78126
headers?: AnyObject
79127
cookies?: PageCookies
@@ -85,6 +133,13 @@ export interface CrawlPageAdvancedConfig extends CrawlCommonConfig {
85133
export interface CrawlDataAdvancedConfig<T> extends CrawlCommonConfig {
86134
targets: (string | CrawlDataDetailConfig)[]
87135
intervalTime?: IntervalTime
136+
fingerprint?: {
137+
userAgents?: string[]
138+
uas?: string[]
139+
platforms?: Platform[]
140+
mobiles?: Mobile[]
141+
acceptLanguages?: string[]
142+
}
88143

89144
headers?: AnyObject
90145

@@ -94,6 +149,13 @@ export interface CrawlDataAdvancedConfig<T> extends CrawlCommonConfig {
94149
export interface CrawlFileAdvancedConfig extends CrawlCommonConfig {
95150
targets: (string | CrawlFileDetailConfig)[]
96151
intervalTime?: IntervalTime
152+
fingerprint?: {
153+
userAgents?: string[]
154+
uas?: string[]
155+
platforms?: Platform[]
156+
mobiles?: Mobile[]
157+
acceptLanguages?: string[]
158+
}
97159

98160
headers?: AnyObject
99161
storeDir?: string

0 commit comments

Comments
 (0)