Skip to content

Commit daf6eda

Browse files
committed
feat: crawlFile API adds new parameter types, onBeforeSaveItemFile is no longer restricted
1 parent ee6f971 commit daf6eda

File tree

6 files changed

+52
-19
lines changed

6 files changed

+52
-19
lines changed

packages/ai/context.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ coder 用户: { message: string, pathMode: string }
5252
发来了一个 JavaScript 对象转换为 JSON 字符串的值。
5353
- message:
5454
- 类型: string,
55-
- 作用: 用户的需求求
55+
- 作用: 用户的需求
5656
- pathMode:
5757
- 类型: string, default 或者 strict
5858
- 作用: default 则可以不从 HTML 片段的根部开始的 selectors , 为 strict 则说明必需从 HTML 片段的根部开始的 selectors 。

packages/crawl/api.ts

Lines changed: 34 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,9 @@ import {
1818
logSuccess,
1919
logWarn,
2020
random,
21-
whiteBold
21+
whiteBold,
22+
isPromise,
23+
isBuffer
2224
} from './utils'
2325

2426
import {
@@ -92,7 +94,7 @@ interface InfoFileConfig extends InfoCommonConfig {
9294
fileName: string
9395
filePath: string
9496
data: Buffer
95-
}) => Promise<Buffer>)
97+
}) => Promise<Buffer> | Buffer | void)
9698
| undefined
9799
}
98100

@@ -197,7 +199,7 @@ interface CrawlFileConfig {
197199
fileName: string
198200
filePath: string
199201
data: Buffer
200-
}) => Promise<Buffer>)
202+
}) => Promise<Buffer> | Buffer | void)
201203
| undefined
202204
onCrawlItemComplete:
203205
| ((crawlDataSingleResult: CrawlDataSingleResult<any>) => void)
@@ -224,8 +226,9 @@ type UniteCrawlDataConfig<T> =
224226
| CrawlDataAdvancedConfig<T>
225227

226228
type UniteCrawlFileConfig =
229+
| string
227230
| CrawlFileDetailTargetConfig
228-
| CrawlFileDetailTargetConfig[]
231+
| (string | CrawlFileDetailTargetConfig)[]
229232
| CrawlFileAdvancedConfig
230233

231234
/* Function */
@@ -273,7 +276,10 @@ function transformTargetToDetailTargets(
273276
| (string | CrawlDataDetailTargetConfig)[]
274277
): CrawlDataDetailTargetConfig[]
275278
function transformTargetToDetailTargets(
276-
config: (string | CrawlFileDetailTargetConfig)[]
279+
config:
280+
| string
281+
| CrawlFileDetailTargetConfig
282+
| (string | CrawlFileDetailTargetConfig)[]
277283
): CrawlFileDetailTargetConfig[]
278284
function transformTargetToDetailTargets(config: any) {
279285
return isArray(config)
@@ -741,10 +747,13 @@ function createCrawlFileConfig(
741747
advancedDetailTargetsConfig.detailTargets =
742748
transformTargetToDetailTargets(targets)
743749
} else {
744-
// CrawlFileDetailTargetConfig | CrawlFileDetailTargetConfig[] 处理
745-
advancedDetailTargetsConfig.detailTargets = isArray(originalConfig)
746-
? originalConfig
747-
: [originalConfig as CrawlFileDetailTargetConfig]
750+
// string | CrawlFileDetailTargetConfig | (string | CrawlFileDetailTargetConfig)[] 处理
751+
advancedDetailTargetsConfig.detailTargets = transformTargetToDetailTargets(
752+
originalConfig as
753+
| string
754+
| CrawlFileDetailTargetConfig
755+
| (string | CrawlFileDetailTargetConfig)[]
756+
)
748757
}
749758

750759
loaderCommonConfigToCrawlConfig(
@@ -1022,17 +1031,17 @@ function fileSingleResultHandle(
10221031

10231032
// 在保存前的回调
10241033
const data = detailTargetResult.data
1025-
let dataPromise = Promise.resolve(data)
1034+
let onBeforeSaveItemFileResult
10261035
if (onBeforeSaveItemFile) {
1027-
dataPromise = onBeforeSaveItemFile({
1036+
onBeforeSaveItemFileResult = onBeforeSaveItemFile({
10281037
id,
10291038
fileName,
10301039
filePath,
10311040
data
10321041
})
10331042
}
10341043

1035-
const saveFileItemPending = dataPromise.then(async (newData) => {
1044+
async function saveFile(newData: Buffer) {
10361045
let isSuccess = true
10371046
try {
10381047
await writeFile(filePath, newData)
@@ -1061,7 +1070,16 @@ function fileSingleResultHandle(
10611070
if (onCrawlItemComplete) {
10621071
onCrawlItemComplete(device.result as CrawlFileSingleResult)
10631072
}
1064-
})
1073+
}
1074+
1075+
let saveFileItemPending
1076+
if (isPromise(onBeforeSaveItemFileResult)) {
1077+
saveFileItemPending = onBeforeSaveItemFileResult!.then(saveFile)
1078+
} else if (isBuffer(onBeforeSaveItemFileResult)) {
1079+
saveFileItemPending = saveFile(onBeforeSaveItemFileResult)
1080+
} else {
1081+
saveFileItemPending = saveFile(data)
1082+
}
10651083

10661084
// 存放保存文件 Promise , 后续等待即可回到 crawlFile 函数内部等待完成即可
10671085
saveFilePendingQueue.push(saveFileItemPending)
@@ -1263,12 +1281,14 @@ export function createCrawlFile(crawlBaseConfig: CrawlBaseConfig) {
12631281
let id = 0
12641282
const type = 'file'
12651283

1284+
function crawlFile(config: string): Promise<CrawlFileSingleResult>
1285+
12661286
function crawlFile(
12671287
config: CrawlFileDetailTargetConfig
12681288
): Promise<CrawlFileSingleResult>
12691289

12701290
function crawlFile(
1271-
config: CrawlFileDetailTargetConfig[]
1291+
config: (string | CrawlFileDetailTargetConfig)[]
12721292
): Promise<CrawlFileSingleResult[]>
12731293

12741294
function crawlFile(

packages/crawl/index.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ import { isBoolean, isObject } from './utils'
1010

1111
let id = 0
1212

13-
function createInstanceConfig(config: CreateCrawlConfig): CrawlBaseConfig {
13+
function createBaseConfig(config: CreateCrawlConfig): CrawlBaseConfig {
1414
const {
1515
mode,
1616
enableRandomFingerprint,
@@ -69,7 +69,7 @@ function createnApp(crawlBaseConfig: CrawlBaseConfig): CrawlApp {
6969
}
7070

7171
export function createCrawl(config: CreateCrawlConfig = {}): CrawlApp {
72-
const crawlBaseConfig = createInstanceConfig(config)
72+
const crawlBaseConfig = createBaseConfig(config)
7373

7474
const app = createnApp(crawlBaseConfig)
7575

packages/crawl/types/api.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ export interface CrawlFileAdvancedConfig extends CrawlCommonConfig {
178178
fileName: string
179179
filePath: string
180180
data: Buffer
181-
}) => Promise<Buffer>
181+
}) => Promise<Buffer> | Buffer | void
182182
}
183183

184184
/* API Result */

packages/crawl/types/index.ts

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,9 +87,13 @@ export interface CrawlApp {
8787
}
8888

8989
crawlFile: {
90+
(config: string): Promise<CrawlFileSingleResult>
91+
9092
(config: CrawlFileDetailTargetConfig): Promise<CrawlFileSingleResult>
9193

92-
(config: CrawlFileDetailTargetConfig[]): Promise<CrawlFileSingleResult[]>
94+
(
95+
config: (string | CrawlFileDetailTargetConfig)[]
96+
): Promise<CrawlFileSingleResult[]>
9397

9498
(config: CrawlFileAdvancedConfig): Promise<CrawlFileSingleResult[]>
9599
}

packages/crawl/utils.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import { Buffer } from 'node:buffer'
12
import chalk from 'chalk'
23

34
// Log
@@ -34,6 +35,14 @@ export function isArray(value: any): value is any[] {
3435
return Array.isArray(value)
3536
}
3637

38+
/**
 * Type guard that reports whether `value` is a Promise or any other thenable.
 *
 * A native Promise is an object, not a function, so the check accepts both
 * objects and functions as long as they expose a callable `then`. Requiring
 * `then` to be a function (rather than merely defined) means a caller can
 * invoke `value.then(...)` right after the guard without it throwing.
 *
 * @param value - Any value, including `null` / `undefined`.
 * @returns `true` when `value` is a non-null object or function whose `then`
 *   property is a function; `false` otherwise.
 */
export function isPromise(value: any): value is Promise<any> {
  // `typeof null === 'object'`, so null has to be excluded explicitly.
  if (value === null) {
    return false
  }

  const valueType = typeof value
  if (valueType !== 'object' && valueType !== 'function') {
    return false
  }

  return typeof value.then === 'function'
}
41+
42+
/**
 * Type guard that reports whether `value` is a Node.js `Buffer`.
 *
 * Delegates to `Buffer.isBuffer`.
 *
 * @param value - Any value.
 * @returns The result of `Buffer.isBuffer(value)`.
 */
export function isBuffer(value: any): value is Buffer {
  const matchesBuffer: boolean = Buffer.isBuffer(value)
  return matchesBuffer
}
45+
3746
/**
 * Creates a promise that is resolved by a `setTimeout` callback.
 *
 * @param timeout - Delay passed to `setTimeout`, in milliseconds.
 * @returns A promise that resolves once the timer fires.
 */
export function sleep(timeout: number) {
  return new Promise((wake) => {
    setTimeout(wake, timeout)
  })
}

0 commit comments

Comments
 (0)