Skip to content

Commit 1dacd25

Browse files
committed
Feat: Default device fingerprint
1 parent 51f2813 commit 1dacd25

File tree

8 files changed

+253
-188
lines changed

8 files changed

+253
-188
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
### 🚀 特征
1212

13-
- 新增设备指纹,避免浏览器识别并跟踪我们的在线行为。可在进阶用法中设置,也可以通过详细用法指定设置
13+
- 新增设备指纹,避免浏览器识别并跟踪我们的在线行为。可以通过一个开关(enableRandomFingerprint)启用默认的随机设备指纹;如果需要指定,则可在进阶用法中为所有爬取目标统一设置,也可以在详细目标用法中为单个目标单独设置
1414
- 在创建爬虫应用的配置新增 crawlPage ,可以在 crawlPage.launchBrowser 选项中设置创建浏览器的配置(类型为 PuppeteerLaunchOptions 来自 Puppeteer)。
1515
- CrawlPageAdvancedConfig、CrawlDataAdvancedConfig 以及 CrawlFileAdvancedConfig 进阶用法里面的每个爬取请求 header 可以在进阶方式配置对象统一设置,不必为每个爬取配置重复设置一遍。
1616
- crawlPage 新增 viewport 选项,用于设置页面的视口。

src/api.ts

Lines changed: 141 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,14 @@ import {
3030
CrawlFileSingleRes,
3131
CrawlFileAdvancedConfig,
3232
CrawlDataAdvancedConfig,
33-
IntervalTime
33+
IntervalTime,
34+
DetailTargetFingerprintCommon,
35+
Platform,
36+
Mobile
3437
} from './types/api'
3538
import { LoaderXCrawlConfig } from './types'
3639
import { AnyObject } from './types/common'
40+
import { randomFingerprint } from './default'
3741

3842
/* Types */
3943

@@ -212,7 +216,78 @@ function transformToDetailTargets(config: any) {
212216
: [isObject(config) ? config : { url: config }]
213217
}
214218

215-
function loaderCommonConfig(
219+
/**
 * Writes the header-based part of a fingerprint onto a detail target.
 *
 * The target's `headers` object is created when it is missing and is mutated
 * in place. Only truthy fingerprint fields are written; a header that already
 * exists under the same name is overwritten.
 *
 * @param detail - Detail target whose request headers receive the fingerprint.
 * @param fingerprint - Fingerprint values to translate into request headers.
 */
function loaderCommonFingerprintToDetailTarget(
  detail:
    | CrawlPageDetailTargetConfig
    | CrawlDataDetailTargetConfig
    | CrawlFileDetailTargetConfig,
  fingerprint: DetailTargetFingerprintCommon
) {
  const { userAgent, ua, platform, platformVersion, mobile, acceptLanguage } =
    fingerprint

  let headers = detail.headers

  if (!headers) {
    detail.headers = headers = {}
  }

  // 1.user-agent
  if (userAgent) {
    headers['user-agent'] = userAgent
  }

  // 2.sec-ch-ua
  if (ua) {
    headers['sec-ch-ua'] = ua
  }

  // 3.sec-ch-ua-platform
  // Every User-Agent Client Hint header carries the `sec-ch-ua-` prefix;
  // `sec-ch-platform` is not a header that servers read.
  if (platform) {
    headers['sec-ch-ua-platform'] = platform
  }

  // 4.sec-ch-ua-platform-version
  if (platformVersion) {
    headers['sec-ch-ua-platform-version'] = platformVersion
  }

  // 5.sec-ch-ua-mobile (same prefix rule as the platform header)
  if (mobile) {
    headers['sec-ch-ua-mobile'] = mobile
  }

  // 6.accept-language
  if (acceptLanguage) {
    headers['accept-language'] = acceptLanguage
  }
}
265+
266+
function loaderPageFingerprintToDetailTarget(
267+
detail: CrawlPageDetailTargetConfig,
268+
fingerprint: {
269+
maxWidth: number
270+
minWidth?: number
271+
maxHeight: number
272+
minHidth?: number
273+
}
274+
) {
275+
const { maxWidth, minWidth, maxHeight, minHidth } = fingerprint
276+
277+
// 1.width / height
278+
const width = maxWidth === minWidth ? maxWidth : random(maxWidth, minWidth)
279+
const height =
280+
maxHeight === minHidth ? maxHeight : random(maxHeight, minHidth)
281+
const viewport = detail.viewport
282+
if (!viewport) {
283+
detail.viewport = { width, height }
284+
} else {
285+
viewport.width = width
286+
viewport.height = height
287+
}
288+
}
289+
290+
function loaderCommonConfigToCrawlConfig(
216291
xCrawlConfig: LoaderXCrawlConfig,
217292
advancedConfig:
218293
| CrawlPageAdvancedConfig
@@ -267,77 +342,82 @@ function loaderCommonConfig(
267342
}
268343

269344
// 1.6.header
270-
if (isUndefined(headers)) {
271-
detail.headers = advancedConfig.headers
345+
if (isUndefined(headers) && advancedConfig.headers) {
346+
detail.headers = { ...advancedConfig.headers }
272347
}
273348

274349
// 1.7.fingerprint(公共部分)
275350
if (fingerprint) {
276-
const { userAgent, ua, platform, mobile, acceptLanguage } = fingerprint
277-
let headers = detail.headers
351+
// detaileTarget
278352

279-
if (!headers) {
280-
detail.headers = headers = {}
281-
}
353+
loaderCommonFingerprintToDetailTarget(detail, fingerprint)
354+
} else if (isUndefined(fingerprint) && advancedConfig.fingerprint) {
355+
// advancedConfig
356+
357+
const {
358+
userAgents,
359+
uas,
360+
platforms,
361+
platformVersions,
362+
mobiles,
363+
acceptLanguages
364+
} = advancedConfig.fingerprint
282365

283366
// 1.user-agent
284-
if (userAgent) {
285-
headers['user-agent'] = userAgent
286-
}
367+
const userAgent = userAgents
368+
? userAgents[random(userAgents.length)]
369+
: undefined
287370

288371
// 2.sec-ch-ua
289-
if (ua) {
290-
headers['sec-ch-ua'] = ua
291-
}
372+
const ua = uas ? uas[random(uas.length)] : undefined
292373

293374
// 3.sec-ch-platform
294-
if (platform) {
295-
headers['sec-ch-platform'] = platform
296-
}
297-
298-
// 4.sec-ch-mobile
299-
if (mobile) {
300-
headers['sec-ch-mobile'] = mobile
301-
}
302-
303-
// 4.accept-language
304-
if (acceptLanguage) {
305-
headers['accept-language'] = acceptLanguage
306-
}
307-
} else if (isUndefined(fingerprint) && advancedConfig.fingerprint) {
308-
const { userAgents, uas, platforms, mobiles, acceptLanguages } =
309-
advancedConfig.fingerprint
310-
let headers = detail.headers
375+
const platform = platforms
376+
? platforms[random(platforms.length)]
377+
: undefined
378+
379+
// 4.sec-ch-platform-version
380+
const platformVersion = platformVersions
381+
? platformVersions[random(platformVersions.length)]
382+
: undefined
383+
384+
// 5.sec-ch-mobile
385+
const mobile = mobiles ? mobiles[random(mobiles.length)] : undefined
386+
387+
// 6.accept-language
388+
const acceptLanguage = acceptLanguages
389+
? acceptLanguages[random(acceptLanguages.length)]
390+
: undefined
391+
392+
loaderCommonFingerprintToDetailTarget(detail, {
393+
userAgent,
394+
ua,
395+
platform,
396+
platformVersion,
397+
mobile,
398+
acceptLanguage
399+
})
400+
} else if (xCrawlConfig.enableRandomFingerprint) {
401+
// xCrawlConfig
311402

312-
if (!headers) {
313-
detail.headers = headers = {}
314-
}
403+
const { platforms, mobiles } = randomFingerprint
315404

316405
// 1.user-agent
317-
if (userAgents) {
318-
headers['user-agent'] = userAgents[random(userAgents.length)]
319-
}
406+
const userAgent = `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.${random(
407+
10
408+
)}.${random(10000)}.${random(1000)} Safari/537.36`
320409

321-
// 2.sec-ch-ua
322-
if (uas) {
323-
headers['sec-ch-ua'] = uas[random(uas.length)]
324-
}
410+
// 2.sec-ch-platform
411+
const platform = platforms[random(platforms.length)] as Platform
325412

326-
// 3.sec-ch-platform
327-
if (platforms) {
328-
headers['sec-ch-platform'] = platforms[random(platforms.length)]
329-
}
413+
// 3.sec-ch-mobile
414+
const mobile = mobiles[random(mobiles.length)] as Mobile
330415

331-
// 4.sec-ch-mobile
332-
if (mobiles) {
333-
headers['sec-ch-mobile'] = mobiles[random(mobiles.length)]
334-
}
335-
336-
// 4.accept-language
337-
if (acceptLanguages) {
338-
headers['accept-language'] =
339-
acceptLanguages[random(acceptLanguages.length)]
340-
}
416+
loaderCommonFingerprintToDetailTarget(detail, {
417+
userAgent,
418+
platform,
419+
mobile
420+
})
341421
}
342422
})
343423

@@ -354,30 +434,6 @@ function loaderCommonConfig(
354434
crawlConfig.onCrawlItemComplete = advancedConfig.onCrawlItemComplete
355435
}
356436

357-
function loaderPageDetailTargetFingerprint(
358-
detail: CrawlPageDetailTargetConfig,
359-
fingerprint: {
360-
maxWidth: number
361-
minWidth?: number
362-
maxHeight: number
363-
minHidth?: number
364-
}
365-
) {
366-
const { maxWidth, minWidth, maxHeight, minHidth } = fingerprint
367-
368-
// 1.width / height
369-
const width = maxWidth === minWidth ? maxWidth : random(maxWidth, minWidth)
370-
const height =
371-
maxHeight === minHidth ? maxHeight : random(maxHeight, minHidth)
372-
const viewport = detail.viewport
373-
if (!viewport) {
374-
detail.viewport = { width, height }
375-
} else {
376-
viewport.width = width
377-
viewport.height = height
378-
}
379-
}
380-
381437
/* Create Config */
382438
/*
383439
每个创建配置函数的返回值都是类似于进阶版配置
@@ -419,7 +475,7 @@ function createCrawlPageConfig(
419475
}
420476

421477
// 装载公共配置
422-
loaderCommonConfig(xCrawlConfig, advancedConfig, crawlPageConfig)
478+
loaderCommonConfigToCrawlConfig(xCrawlConfig, advancedConfig, crawlPageConfig)
423479

424480
// 装载单独配置
425481
crawlPageConfig.detailTargets.forEach((detail) => {
@@ -438,9 +494,9 @@ function createCrawlPageConfig(
438494

439495
// 3.fingerprint
440496
if (fingerprint) {
441-
loaderPageDetailTargetFingerprint(detail, fingerprint)
497+
loaderPageFingerprintToDetailTarget(detail, fingerprint)
442498
} else if (isUndefined(fingerprint) && advancedConfig.fingerprint) {
443-
loaderPageDetailTargetFingerprint(detail, advancedConfig.fingerprint)
499+
loaderPageFingerprintToDetailTarget(detail, advancedConfig.fingerprint)
444500
}
445501
})
446502

@@ -477,7 +533,7 @@ function createCrawlDataConfig<T>(
477533
crawlDataConfig.detailTargets.push(...detaileTargets)
478534
}
479535

480-
loaderCommonConfig(xCrawlConfig, advancedConfig, crawlDataConfig)
536+
loaderCommonConfigToCrawlConfig(xCrawlConfig, advancedConfig, crawlDataConfig)
481537

482538
return crawlDataConfig as CrawlDataConfig
483539
}
@@ -510,7 +566,7 @@ function createCrawlFileConfig(
510566
)
511567
}
512568

513-
loaderCommonConfig(xCrawlConfig, advancedConfig, crawlFileConfig)
569+
loaderCommonConfigToCrawlConfig(xCrawlConfig, advancedConfig, crawlFileConfig)
514570

515571
const haveAdvancedStoreDir = !isUndefined(advancedConfig?.storeDir)
516572
const haveAdvancedExtension = !isUndefined(advancedConfig?.extension)

src/default.ts

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// Platform names that the default random fingerprint can choose from.
const defaultFingerprintPlatforms = [
  'Android',
  'Chrome OS',
  'Chromium OS',
  'iOS',
  'Linux',
  'macOS',
  'Windows'
]

// Mobile flags that the default random fingerprint can choose from
// ('?0' / '?1').
const defaultFingerprintMobiles = ['?0', '?1']

/**
 * Candidate values for the default random fingerprint. Consumers pick one
 * entry from each list at random.
 */
export const randomFingerprint = {
  platforms: defaultFingerprintPlatforms,
  mobiles: defaultFingerprintMobiles
}

src/index.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,14 @@ function loaderBaseConfig(
1313
): LoaderXCrawlConfig {
1414
const loaderBaseConfig = baseConfig ? baseConfig : {}
1515

16-
if (!loaderBaseConfig.mode) {
16+
if (isUndefined(loaderBaseConfig.mode)) {
1717
loaderBaseConfig.mode = 'async'
1818
}
1919

20+
if (isUndefined(loaderBaseConfig.enableRandomFingerprint)) {
21+
loaderBaseConfig.enableRandomFingerprint = true
22+
}
23+
2024
if (isUndefined(baseConfig?.timeout)) {
2125
loaderBaseConfig.timeout = 10000
2226
}

0 commit comments

Comments
 (0)