Skip to content

Commit 2393934

Browse files
committed
Feat: Adjust when crawl results are processed / add onCrawlItemComplete lifecycle function / type adjustments
1 parent 03ebd7d commit 2393934

File tree

10 files changed

+552
-453
lines changed

10 files changed

+552
-453
lines changed

src/api.ts

Lines changed: 339 additions & 280 deletions
Large diffs are not rendered by default.

src/batchCrawlHandle.ts

Lines changed: 60 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,7 @@
11
import { isNumber, isUndefined, log, logNumber, random, sleep } from './utils'
22

3-
import {
4-
LoaderCrawlDataDetail,
5-
LoaderCrawlFileDetail,
6-
LoaderCrawlPageDetail
7-
} from './api'
8-
9-
import type { IntervalTime } from './types/api'
10-
import type { ControllerConfig, CrawlDetail } from './controller'
3+
import type { ExtraCommonConfig } from './api'
4+
import type { DetailInfo, CrawlDetail } from './controller'
115

126
async function useSleepByBatch(
137
isHaventervalTime: boolean,
@@ -32,21 +26,27 @@ async function useSleepByBatch(
3226
}
3327
}
3428

35-
export async function asyncBatchCrawl<T extends CrawlDetail, V, C>(
36-
controllerConfigs: ControllerConfig<T, V>[],
37-
crawlSingleFnExtra: C,
38-
intervalTime: IntervalTime | undefined,
39-
crawlSingleFn: (
40-
controllerConfig: ControllerConfig<T, V>,
41-
crawlSingleFnExtra: C
42-
) => Promise<V>
29+
export async function asyncBatchCrawl<
30+
T extends CrawlDetail,
31+
E extends ExtraCommonConfig,
32+
R
33+
>(
34+
detailInfos: DetailInfo<T, R>[],
35+
extraConfig: E,
36+
singleCrawlHandle: (
37+
detailInfo: DetailInfo<T, R>,
38+
extraConfig: E
39+
) => Promise<R>,
40+
singleResultHandle: (detailInfo: DetailInfo<T, R>, extraConfig: E) => void
4341
) {
42+
const { intervalTime } = extraConfig
43+
4444
const isHaventervalTime = !isUndefined(intervalTime)
4545
const isNumberIntervalTime = isNumber(intervalTime)
4646

47-
const crawlQueue: Promise<any>[] = []
48-
for (const controllerConfig of controllerConfigs) {
49-
const { id } = controllerConfig
47+
const crawlPendingQueue: Promise<any>[] = []
48+
for (const detaileInfo of detailInfos) {
49+
const { id } = detaileInfo
5050

5151
await useSleepByBatch(
5252
isHaventervalTime,
@@ -55,41 +55,53 @@ export async function asyncBatchCrawl<T extends CrawlDetail, V, C>(
5555
id
5656
)
5757

58-
controllerConfig.crawlCount++
59-
60-
const crawlSingle = crawlSingleFn(controllerConfig, crawlSingleFnExtra)
58+
const crawlSinglePending = singleCrawlHandle(detaileInfo, extraConfig)
6159
.catch((error) => {
62-
controllerConfig.errorQueue.push(error)
60+
detaileInfo.crawlErrorQueue.push(error)
6361
return false
6462
})
65-
.then((crawlSingleRes) => {
66-
if (crawlSingleRes === false) return
63+
.then((detailTargetRes) => {
64+
if (typeof detailTargetRes === 'boolean') {
65+
if (detaileInfo.retryCount === detaileInfo.maxRetry) {
66+
singleResultHandle(detaileInfo, extraConfig)
67+
}
68+
69+
return
70+
}
71+
72+
detaileInfo.isSuccess = true
73+
detaileInfo.detailTargetRes = detailTargetRes
6774

68-
controllerConfig.isSuccess = true
69-
controllerConfig.crawlSingleRes = crawlSingleRes as V
75+
singleResultHandle(detaileInfo, extraConfig)
7076
})
7177

72-
crawlQueue.push(crawlSingle)
78+
crawlPendingQueue.push(crawlSinglePending)
7379
}
7480

7581
// Wait for all crawls to finish
76-
await Promise.all(crawlQueue)
82+
await Promise.all(crawlPendingQueue)
7783
}
7884

79-
export async function syncBatchCrawl<T extends CrawlDetail, V, C>(
80-
controllerConfigs: ControllerConfig<T, V>[],
81-
crawlSingleFnExtra: C,
82-
intervalTime: IntervalTime | undefined,
83-
crawlSingleFn: (
84-
controllerConfig: ControllerConfig<T, V>,
85-
crawlSingleFnExtra: C
86-
) => Promise<V>
85+
export async function syncBatchCrawl<
86+
T extends CrawlDetail,
87+
E extends ExtraCommonConfig,
88+
R
89+
>(
90+
detailInfos: DetailInfo<T, R>[],
91+
extraConfig: E,
92+
singleCrawlHandle: (
93+
detaileInfo: DetailInfo<T, R>,
94+
extraConfig: E
95+
) => Promise<R>,
96+
singleResultHandle: (detaileInfo: DetailInfo<T, R>, extraConfig: E) => void
8797
) {
98+
const { intervalTime } = extraConfig
99+
88100
const isHaventervalTime = !isUndefined(intervalTime)
89101
const isNumberIntervalTime = isNumber(intervalTime)
90102

91-
for (const controllerConfig of controllerConfigs) {
92-
const { id } = controllerConfig
103+
for (const detailInfo of detailInfos) {
104+
const { id } = detailInfo
93105

94106
await useSleepByBatch(
95107
isHaventervalTime,
@@ -98,16 +110,18 @@ export async function syncBatchCrawl<T extends CrawlDetail, V, C>(
98110
id
99111
)
100112

101-
controllerConfig.crawlCount++
102-
103113
try {
104-
controllerConfig.crawlSingleRes = await crawlSingleFn(
105-
controllerConfig,
106-
crawlSingleFnExtra
114+
detailInfo.detailTargetRes = await singleCrawlHandle(
115+
detailInfo,
116+
extraConfig
107117
)
108-
controllerConfig.isSuccess = true
118+
detailInfo.isSuccess = true
109119
} catch (error: any) {
110-
controllerConfig.errorQueue.push(error)
120+
detailInfo.crawlErrorQueue.push(error)
121+
}
122+
123+
if (detailInfo.isSuccess || detailInfo.retryCount === detailInfo.maxRetry) {
124+
singleResultHandle(detailInfo, extraConfig)
111125
}
112126
}
113127
}

src/controller.ts

Lines changed: 52 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -2,94 +2,111 @@ import { asyncBatchCrawl, syncBatchCrawl } from './batchCrawlHandle'
22
import { priorityQueueMergeSort } from './sort'
33

44
import {
5+
ExtraCommonConfig,
56
LoaderCrawlDataDetail,
67
LoaderCrawlFileDetail,
78
LoaderCrawlPageDetail
89
} from './api'
910

10-
import { IntervalTime } from './types/api'
1111
import { log, logError, logNumber, logSuccess, logWarn } from './utils'
1212

1313
export type CrawlDetail =
1414
| LoaderCrawlPageDetail
1515
| LoaderCrawlDataDetail
1616
| LoaderCrawlFileDetail
1717

18-
export interface ControllerConfig<T extends CrawlDetail, V> {
18+
export interface DetailInfo<T extends CrawlDetail, R> {
1919
id: number
2020
isSuccess: boolean
21-
crawlCount: number
2221
maxRetry: number
23-
errorQueue: Error[]
24-
crawlDetailConfig: T
25-
crawlSingleRes: V | null
22+
retryCount: number
23+
crawlErrorQueue: Error[]
24+
data: any | null
25+
26+
detailTarget: T
27+
detailTargetRes: R | null
2628
}
2729

28-
export async function controller<T extends CrawlDetail, V, C>(
30+
type TargetSingleRes = Omit<
31+
DetailInfo<any, any>,
32+
'detailTarget' | 'detailTargetRes'
33+
>
34+
35+
export async function controller<
36+
T extends CrawlDetail,
37+
E extends ExtraCommonConfig,
38+
R
39+
>(
2940
name: 'page' | 'data' | 'file',
3041
mode: 'async' | 'sync',
31-
crawlDetails: T[],
32-
crawlSingleFnExtra: C,
33-
intervalTime: IntervalTime | undefined,
34-
crawlSingleFn: (
35-
controllerConfig: ControllerConfig<T, V>,
36-
crawlSingleFnExtra: C
37-
) => Promise<V>
38-
): Promise<ControllerConfig<T, V>[]> {
42+
detailTargets: T[],
43+
extraConfig: E,
44+
singleCrawlHandle: (
45+
detailInfo: DetailInfo<T, R>,
46+
extraConfig: E
47+
) => Promise<R>,
48+
singleResultHandle: (detailInfo: DetailInfo<T, R>, extraConfig: E) => void
49+
): Promise<TargetSingleRes[]> {
3950
// Whether to use priority crawling
40-
const isPriorityCrawl = !crawlDetails.every(
41-
(item) => item.priority === crawlDetails[0].priority
51+
const isPriorityCrawl = !detailTargets.every(
52+
(item) => item.priority === detailTargets[0].priority
4253
)
4354
const targetRequestConfigs = isPriorityCrawl
4455
? priorityQueueMergeSort(
45-
crawlDetails.map((item) => ({
56+
detailTargets.map((item) => ({
4657
...item,
4758
valueOf: () => item.priority
4859
}))
4960
)
50-
: crawlDetails
61+
: detailTargets
5162

5263
// Build a new array of config objects by mapping over the targets
53-
const controllerConfigs: ControllerConfig<T, V>[] = targetRequestConfigs.map(
54-
(crawlDetailConfig, index) => ({
64+
const detailInfos: DetailInfo<T, R>[] = targetRequestConfigs.map(
65+
(detailTarget, index) => ({
5566
id: index + 1,
5667
isSuccess: false,
57-
maxRetry: crawlDetailConfig.maxRetry,
58-
crawlCount: 0,
59-
errorQueue: [],
60-
crawlDetailConfig,
61-
crawlSingleRes: null
68+
maxRetry: detailTarget.maxRetry,
69+
retryCount: 0,
70+
crawlErrorQueue: [],
71+
data: null,
72+
73+
detailTarget,
74+
detailTargetRes: null
6275
})
6376
)
6477

6578
log(
6679
`${logSuccess(`Start crawling`)} - name: ${logWarn(name)}, mode: ${logWarn(
6780
mode
68-
)}, total: ${logNumber(controllerConfigs.length)} `
81+
)}, total: ${logNumber(detailInfos.length)} `
6982
)
7083

7184
// Select the crawl mode
7285
const batchCrawl = mode === 'async' ? asyncBatchCrawl : syncBatchCrawl
7386

7487
let i = 0
75-
let crawlQueue: ControllerConfig<T, V>[] = controllerConfigs
88+
let crawlQueue: DetailInfo<T, R>[] = detailInfos
7689
while (crawlQueue.length) {
7790
await batchCrawl(
7891
crawlQueue,
79-
crawlSingleFnExtra,
80-
intervalTime,
81-
crawlSingleFn
92+
extraConfig,
93+
singleCrawlHandle,
94+
singleResultHandle
8295
)
8396

8497
crawlQueue = crawlQueue.filter(
8598
(config) =>
8699
config.maxRetry &&
87100
!config.isSuccess &&
88-
config.crawlCount <= config.maxRetry
101+
config.retryCount < config.maxRetry
89102
)
90103

91104
if (crawlQueue.length) {
92-
const retriedIds = crawlQueue.map((item) => item.id)
105+
const retriedIds = crawlQueue.map((item) => {
106+
item.retryCount++
107+
108+
return item.id
109+
})
93110
log(
94111
logWarn(`Retry: ${++i} - Ids to retry: [ ${retriedIds.join(' - ')} ]`)
95112
)
@@ -99,7 +116,7 @@ export async function controller<T extends CrawlDetail, V, C>(
99116
// Tally the results
100117
const succssIds: number[] = []
101118
const errorIds: number[] = []
102-
controllerConfigs.forEach((item) => {
119+
detailInfos.forEach((item) => {
103120
if (item.isSuccess) {
104121
succssIds.push(item.id)
105122
} else {
@@ -121,5 +138,5 @@ export async function controller<T extends CrawlDetail, V, C>(
121138
)
122139
)
123140

124-
return controllerConfigs
141+
return detailInfos as TargetSingleRes[]
125142
}

0 commit comments

Comments
 (0)