Skip to content

Commit 4704d4b

Browse files
committed
Change: crawlFile configuration and uniform header setting
1 parent ab4a0ad commit 4704d4b

File tree

5 files changed

+77
-73
lines changed

5 files changed

+77
-73
lines changed

src/api.ts

Lines changed: 49 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -82,16 +82,16 @@ function parsePageCookies(
8282
return cookiesArr
8383
}
8484

85-
function transformToCrawlObjects(
85+
function transformToCrawlDetails(
8686
config: string | CrawlPageDetailConfig | (string | CrawlPageDetailConfig)[]
8787
): CrawlPageDetailConfig[]
88-
function transformToCrawlObjects(
88+
function transformToCrawlDetails(
8989
config: string | CrawlDataDetailConfig | (string | CrawlDataDetailConfig)[]
9090
): CrawlDataDetailConfig[]
91-
function transformToCrawlObjects(
91+
function transformToCrawlDetails(
9292
config: (string | CrawlFileDetailConfig)[]
9393
): CrawlFileDetailConfig[]
94-
function transformToCrawlObjects(config: any) {
94+
function transformToCrawlDetails(config: any) {
9595
return isArray(config)
9696
? config.map((item) => (isObject(item) ? item : { url: item }))
9797
: [isObject(config) ? config : { url: config }]
@@ -117,7 +117,7 @@ function loaderCommonConfig(
117117
// 1.rawCrawlDetails
118118
rawCrawlDetails.forEach((detail) => {
119119
// detail > API > xCrawl
120-
let { url, timeout, proxy, maxRetry, priority } = detail
120+
let { url, timeout, proxy, maxRetry, priority, headers } = detail
121121

122122
// 1.1.baseUrl
123123
if (!isUndefined(xCrawlConfig.baseUrl)) {
@@ -156,13 +156,19 @@ function loaderCommonConfig(
156156
priority = 0
157157
}
158158

159+
// 1.6.header
160+
if (isUndefined(headers)) {
161+
headers = crawlAPIConfig.headers
162+
}
163+
159164
crawlDetails.push({
160165
...detail,
161166
url,
162167
timeout,
163168
proxy,
164169
maxRetry,
165-
priority
170+
priority,
171+
headers
166172
})
167173
})
168174

@@ -187,17 +193,17 @@ function loaderPageConfig(
187193
// 统一转成 CrawlPageDetailConfig 类型
188194
if (isObject(rawConfig) && Object.hasOwn(rawConfig, 'crawlPages')) {
189195
// CrawlPageEnhanceConfig 处理
190-
const { crawlPages, proxy, timeout, cookies, intervalTime, maxRetry } =
191-
rawConfig as CrawlPageEnhanceConfig
196+
const { crawlPages } = rawConfig as CrawlPageEnhanceConfig
192197

193198
// 给 crawlPageConfig 装载 CrawlPageEnhanceConfig
194-
crawlPageConfig.proxy = proxy
195-
crawlPageConfig.cookies = cookies
196-
crawlPageConfig.intervalTime = intervalTime
197-
crawlPageConfig.maxRetry = maxRetry
198-
crawlPageConfig.timeout = timeout
199+
const rawConfigMap: any = rawConfig
200+
const crawlPageConfigMap: any = crawlPageConfig
201+
Object.keys(rawConfig as CrawlPageEnhanceConfig).forEach((key) => {
202+
if (key === 'crawlPages') return
203+
crawlPageConfigMap[key] = rawConfigMap[key]
204+
})
199205

200-
rawCrawlPageDetails.push(...transformToCrawlObjects(crawlPages))
206+
rawCrawlPageDetails.push(...transformToCrawlDetails(crawlPages))
201207
} else {
202208
// string | CrawlPageDetailConfig | (string | CrawlPageDetailConfig)[] 处理
203209
const transformRes = transformToCrawlObjects(
@@ -246,26 +252,27 @@ function loaderDataConfig(
246252
// 统一转成 CrawlDataDetailConfig 类型
247253
if (isObject(rawConfig) && Object.hasOwn(rawConfig, 'crawlDatas')) {
248254
// CrawlDataEnhanceConfig 处理
249-
const { crawlDatas, proxy, timeout, intervalTime, maxRetry } =
250-
rawConfig as CrawlDataEnhanceConfig
255+
const { crawlDatas } = rawConfig as CrawlDataEnhanceConfig
251256

252257
// 给 crawlDataConfig 装载 crawlDataEnhanceConfig
253-
crawlDataConfig.proxy = proxy
254-
crawlDataConfig.intervalTime = intervalTime
255-
crawlDataConfig.maxRetry = maxRetry
256-
crawlDataConfig.timeout = timeout
258+
const rawConfigMap: any = rawConfig
259+
const crawlDataConfigMap: any = crawlDataConfig
260+
Object.keys(rawConfig as CrawlDataEnhanceConfig).forEach((key) => {
261+
if (key === 'crawlDatas') return
262+
crawlDataConfigMap[key] = rawConfigMap[key]
263+
})
257264

258-
rawCrawlDataDetails.push(...transformToCrawlObjects(crawlDatas))
265+
rawCrawlDataDetails.push(...transformToCrawlDetails(crawlDatas))
259266
} else {
260267
// string | CrawlDataDetailConfig | (string | CrawlDataDetailConfig)[] 处理
261-
const transformRes = transformToCrawlObjects(
268+
const transformRes = transformToCrawlDetails(
262269
rawConfig as
263270
| string
264271
| CrawlDataDetailConfig
265272
| (string | CrawlDataDetailConfig)[]
266273
)
267274

268-
rawCrawlDataDetails.push(...transformToCrawlObjects(transformRes))
275+
rawCrawlDataDetails.push(...transformToCrawlDetails(transformRes))
269276
}
270277

271278
// 装载公共配置
@@ -291,17 +298,17 @@ function loaderFileConfig(
291298
// 统一转成 CrawlFileDetailConfig 类型
292299
if (isObject(rawConfig) && Object.hasOwn(rawConfig, 'crawlFiles')) {
293300
// CrawlFileMoreConfig 处理
294-
const { crawlFiles, proxy, timeout, intervalTime, maxRetry, fileConfig } =
295-
rawConfig as CrawlFileEnhanceConfig
301+
const { crawlFiles } = rawConfig as CrawlFileEnhanceConfig
296302

297303
// 给 crawlFileConfig 装载 crawlFileMoreConfig
298-
crawlFileConfig.proxy = proxy
299-
crawlFileConfig.intervalTime = intervalTime
300-
crawlFileConfig.maxRetry = maxRetry
301-
crawlFileConfig.timeout = timeout
302-
crawlFileConfig.fileConfig = fileConfig
304+
const rawConfigMap: any = rawConfig
305+
const crawlFileConfigMap: any = crawlFileConfig
306+
Object.keys(rawConfig as CrawlFileEnhanceConfig).forEach((key) => {
307+
if (key === 'crawlFiles') return
308+
crawlFileConfigMap[key] = rawConfigMap[key]
309+
})
303310

304-
rawCrawlFileDetails.push(...transformToCrawlObjects(crawlFiles))
311+
rawCrawlFileDetails.push(...transformToCrawlDetails(crawlFiles))
305312
} else {
306313
// CrawlFileDetailConfig | CrawlFileDetailConfig[] 处理
307314
rawCrawlFileDetails.push(
@@ -317,26 +324,26 @@ function loaderFileConfig(
317324
crawlFileConfig.crawlFileDetails
318325
)
319326

320-
// 装载单独的配置
327+
// 装载单独配置
321328
if (
322-
!isUndefined(crawlFileConfig.fileConfig?.storeDir) ||
323-
!isUndefined(crawlFileConfig.fileConfig?.extension)
329+
!isUndefined(crawlFileConfig?.storeDir) ||
330+
!isUndefined(crawlFileConfig?.extension)
324331
) {
325332
crawlFileConfig.crawlFileDetails.forEach((fileSingleConfig) => {
326333
// 1.storeDir
327334
if (
328335
isUndefined(fileSingleConfig.storeDir) &&
329-
!isUndefined(crawlFileConfig.fileConfig?.storeDir)
336+
!isUndefined(crawlFileConfig?.storeDir)
330337
) {
331-
fileSingleConfig.storeDir = crawlFileConfig.fileConfig!.storeDir
338+
fileSingleConfig.storeDir = crawlFileConfig!.storeDir
332339
}
333340

334341
// 2.extension
335342
if (
336343
isUndefined(fileSingleConfig.extension) &&
337-
!isUndefined(crawlFileConfig.fileConfig?.extension)
344+
!isUndefined(crawlFileConfig?.extension)
338345
) {
339-
fileSingleConfig.extension = crawlFileConfig.fileConfig!.extension
346+
fileSingleConfig.extension = crawlFileConfig!.extension
340347
}
341348
})
342349
}
@@ -632,10 +639,8 @@ export function createCrawlFile(xCrawlConfig: LoaderXCrawlConfig) {
632639
config: UniteCrawlFileConfig,
633640
callback?: (res: CrawlFileSingleRes) => void
634641
): Promise<CrawlFileSingleRes | CrawlFileSingleRes[]> {
635-
const { crawlFileDetails, intervalTime, fileConfig } = loaderFileConfig(
636-
xCrawlConfig,
637-
config
638-
)
642+
const { crawlFileDetails, intervalTime, onBeforeSaveFile } =
643+
loaderFileConfig(xCrawlConfig, config)
639644

640645
const controllerRes = await controller(
641646
'file',
@@ -691,8 +696,8 @@ export function createCrawlFile(xCrawlConfig: LoaderXCrawlConfig) {
691696
// 在保存前的回调
692697
const data = crawlSingleRes.data
693698
let dataPromise = Promise.resolve(data)
694-
if (fileConfig?.beforeSave) {
695-
dataPromise = fileConfig.beforeSave({
699+
if (onBeforeSaveFile) {
700+
dataPromise = onBeforeSaveFile({
696701
id,
697702
fileName,
698703
filePath,

src/types/api.ts

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -17,17 +17,17 @@ export type LoaderCrawlDataDetail = CrawlDataDetailConfig & LoaderHasConfig
1717
export type LoaderCrawlFileDetail = CrawlFileDetailConfig & LoaderHasConfig
1818

1919
export interface LoaderCrawlPageConfig
20-
extends MapTypeObject<CrawlPageEnhanceConfig, 'crawlPages'> {
20+
extends Omit<CrawlPageEnhanceConfig, 'crawlPages'> {
2121
crawlPageDetails: LoaderCrawlPageDetail[]
2222
}
2323

2424
export interface LoaderCrawlDataConfig
25-
extends MapTypeObject<CrawlDataEnhanceConfig, 'crawlDatas'> {
25+
extends Omit<CrawlDataEnhanceConfig, 'crawlDatas'> {
2626
crawlDataDetails: LoaderCrawlDataDetail[]
2727
}
2828

2929
export interface LoaderCrawlFileConfig
30-
extends MapTypeObject<CrawlFileEnhanceConfig, 'crawlFiles'> {
30+
extends Omit<CrawlFileEnhanceConfig, 'crawlFiles'> {
3131
crawlFileDetails: LoaderCrawlFileDetail[]
3232
}
3333

@@ -100,7 +100,7 @@ export interface CrawlPageEnhanceConfig extends CrawlCommonConfig {
100100
crawlPages: (string | CrawlPageDetailConfig)[]
101101
intervalTime?: IntervalTime
102102

103-
// page common attribute
103+
headers?: AnyObject
104104
cookies?: PageCookies
105105
}
106106

@@ -117,6 +117,8 @@ export interface CrawlDataDetailConfig extends CrawlCommonConfig {
117117
export interface CrawlDataEnhanceConfig extends CrawlCommonConfig {
118118
crawlDatas: (string | CrawlDataDetailConfig)[]
119119
intervalTime?: IntervalTime
120+
121+
headers?: AnyObject
120122
}
121123

122124
// 3.Crawl file config
@@ -133,15 +135,16 @@ export interface CrawlFileEnhanceConfig extends CrawlCommonConfig {
133135
crawlFiles: (string | CrawlFileDetailConfig)[]
134136
intervalTime?: IntervalTime
135137

136-
fileConfig?: {
137-
storeDir?: string
138-
extension?: string
139-
beforeSave?: (info: {
140-
id: number
141-
fileName: string
142-
filePath: string
143-
data: Buffer
144-
}) => Promise<Buffer>
138+
headers?: AnyObject
139+
storeDir?: string
140+
extension?: string
141+
142+
onBeforeSaveFile?: (info: {
143+
id: number
144+
fileName: string
145+
filePath: string
146+
data: Buffer
147+
}) => Promise<Buffer>
145148
}
146149
}
147150

test/environment/crawlFile.test.ts

Lines changed: 8 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ async function writtenCrawlFileConfigObject() {
5252

5353
const res = await testXCrawl.crawlFile({
5454
crawlFiles: urls,
55-
fileConfig: { storeDir }
55+
storeDir
5656
})
5757

5858
return res.reduce(
@@ -75,7 +75,7 @@ async function loaderBaseConfig() {
7575

7676
const res = await testXCrawl.crawlFile({
7777
crawlFiles: ['/4401.jpg', '/4403.jpg'],
78-
fileConfig: { storeDir }
78+
storeDir
7979
})
8080

8181
return res.reduce((prev, item) => prev && item.isSuccess, true)
@@ -92,7 +92,7 @@ async function loaderAPIConfig() {
9292
crawlFiles: ['/4401.jpg', '/4403.jpg'],
9393
proxy: 'http://localhost:14892',
9494
timeout: 10000,
95-
fileConfig: { storeDir },
95+
storeDir,
9696
intervalTime: { max: 1000 },
9797
maxRetry: 0
9898
})
@@ -114,13 +114,11 @@ async function storeConfig() {
114114
{ url: '/4401.jpg', fileName: '4401' },
115115
{ url: '/4403.jpg', fileName: '4403' }
116116
],
117-
fileConfig: {
118-
storeDir: path.resolve(__dirname, './upload'),
119-
extension: '.jpg',
120-
async beforeSave(info) {
121-
record.push(info.fileName)
122-
return info.data
123-
}
117+
storeDir: path.resolve(__dirname, './upload'),
118+
extension: '.jpg',
119+
async onBeforeSaveFile(info) {
120+
record.push(info.fileName)
121+
return info.data
124122
}
125123
})
126124

0 commit comments

Comments
 (0)