Skip to content

Commit ae34e91

Browse files
committed
其他
1 parent 9519fb2 commit ae34e91

File tree

10 files changed

+54
-71
lines changed

10 files changed

+54
-71
lines changed

.eslintignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
publish

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ npm install x-crawl
1414

1515
## example
1616

17-
Get the title of [https://docs.github.com/zh/get-started as an example:](https://docs.github.com/zh/get-started)
17+
Get the title of https://docs.github.com/zh/get-started as an example:
1818

1919
```js
2020
// Import module ES/CJS
@@ -287,7 +287,7 @@ class XCrawl {
287287
}
288288
```
289289

290-
- <div id="myXCrawl" style="text-decoration: none">示例</div>
290+
- <div id="cn-myXCrawl" style="text-decoration: none">示例</div>
291291

292292
myXCrawl 为后面示例的爬虫实例。
293293

@@ -305,7 +305,7 @@ const myXCrawl = new XCrawl({
305305

306306
### fetch
307307

308-
fetch 是上面 <a href="#myXCrawl" style="text-decoration: none">myXCrawl</a> 实例的方法,通常用于爬取 API ,可获取 JSON 数据等等。
308+
fetch 是上面 <a href="#cn-myXCrawl" style="text-decoration: none">myXCrawl</a> 实例的方法,通常用于爬取 API ,可获取 JSON 数据等等。
309309

310310
- 类型
311311

@@ -332,7 +332,7 @@ myXCrawl.fetch({
332332

333333
### fetchFile
334334

335-
fetchFile 是上面 <a href="#myXCrawl" style="text-decoration: none">myXCrawl</a> 实例的方法,通常用于爬取文件,可获取图片、pdf 文件等等。
335+
fetchFile 是上面 <a href="#cn-myXCrawl" style="text-decoration: none">myXCrawl</a> 实例的方法,通常用于爬取文件,可获取图片、pdf 文件等等。
336336

337337
- 类型
338338

@@ -361,7 +361,7 @@ myXCrawl.fetchFile({
361361

362362
### fetchHTML
363363

364-
fetchHTML 是上面 <a href="#myXCrawl" style="text-decoration: none">myXCrawl</a> 实例的方法,通常用于爬取 HTML
364+
fetchHTML 是上面 <a href="#cn-myXCrawl" style="text-decoration: none">myXCrawl</a> 实例的方法,通常用于爬取 HTML
365365

366366
- 类型
367367

package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"name": "x-crawl",
44
"version": "0.0.1",
55
"author": "coderhxl",
6-
"description": "XCrawl is a Nodejs crawl library, providing configurations to help you crawl data or files in batches.",
6+
"description": "XCrawl is a Nodejs multifunctional crawler library.",
77
"license": "MIT",
88
"main": "src/index.ts",
99
"scripts": {

publish/README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ npm install x-crawl
1414

1515
## example
1616

17-
Get the title of [https://docs.github.com/zh/get-started as an example:](https://docs.github.com/zh/get-started)
17+
Get the title of https://docs.github.com/zh/get-started as an example:
1818

1919
```js
2020
// Import module ES/CJS
@@ -287,7 +287,7 @@ class XCrawl {
287287
}
288288
```
289289

290-
- <div id="myXCrawl" style="text-decoration: none">示例</div>
290+
- <div id="cn-myXCrawl" style="text-decoration: none">示例</div>
291291

292292
myXCrawl 为后面示例的爬虫实例。
293293

@@ -305,7 +305,7 @@ const myXCrawl = new XCrawl({
305305

306306
### fetch
307307

308-
fetch 是上面 <a href="#myXCrawl" style="text-decoration: none">myXCrawl</a> 实例的方法,通常用于爬取 API ,可获取 JSON 数据等等。
308+
fetch 是上面 <a href="#cn-myXCrawl" style="text-decoration: none">myXCrawl</a> 实例的方法,通常用于爬取 API ,可获取 JSON 数据等等。
309309

310310
- 类型
311311

@@ -332,7 +332,7 @@ myXCrawl.fetch({
332332

333333
### fetchFile
334334

335-
fetchFile 是上面 <a href="#myXCrawl" style="text-decoration: none">myXCrawl</a> 实例的方法,通常用于爬取文件,可获取图片、pdf 文件等等。
335+
fetchFile 是上面 <a href="#cn-myXCrawl" style="text-decoration: none">myXCrawl</a> 实例的方法,通常用于爬取文件,可获取图片、pdf 文件等等。
336336

337337
- 类型
338338

@@ -361,7 +361,7 @@ myXCrawl.fetchFile({
361361

362362
### fetchHTML
363363

364-
fetchHTML 是上面 <a href="#myXCrawl" style="text-decoration: none">myXCrawl</a> 实例的方法,通常用于爬取 HTML
364+
fetchHTML 是上面 <a href="#cn-myXCrawl" style="text-decoration: none">myXCrawl</a> 实例的方法,通常用于爬取 HTML
365365

366366
- 类型
367367

publish/package.json

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,10 +2,17 @@
22
"name": "x-crawl",
33
"version": "0.0.1",
44
"author": "coderhxl",
5-
"description": "XCrawl is a Nodejs crawl library, providing configurations to help you crawl data or files in batches.",
5+
"description": "XCrawl is a Nodejs multifunctional crawler library.",
66
"license": "MIT",
7+
"keywords": [
8+
"nodejs",
9+
"typescript",
10+
"crawl",
11+
"crawler",
12+
"spider"
13+
],
714
"main": "dist/index.js",
8-
"types": "",
15+
"types": "dist/index.d.ts",
916
"homepage": "https://github.com/coder-hxl/x-crawl/tree/main/publish",
1017
"repository": {
1118
"type": "git",
@@ -18,5 +25,7 @@
1825
"require": "./dist/index.js"
1926
}
2027
},
21-
"keywords": []
28+
"dependencies": {
29+
"jsdom": "^21.1.0"
30+
}
2231
}

src/index.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import fs from 'node:fs'
2+
import path from 'node:path'
23
import { JSDOM } from 'jsdom'
34

45
import { batchRequest, request } from './request'
@@ -11,7 +12,6 @@ import {
1112
IRequest,
1213
IXCrawlBaseConifg
1314
} from './types'
14-
import path from 'node:path'
1515

1616
export default class XCrawl {
1717
private readonly baseConfig: IXCrawlBaseConifg

src/types.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import { IncomingHttpHeaders, OutgoingHttpHeaders } from 'node:http'
1+
import { IncomingHttpHeaders } from 'node:http'
22

33
export interface IAnyObject extends Object {
44
[key: string | number | symbol]: any
@@ -42,7 +42,7 @@ export type IMethod =
4242
export interface IRequestConfig {
4343
url: string
4444
method?: IMethod
45-
headers?: OutgoingHttpHeaders
45+
headers?: IAnyObject
4646
params?: IAnyObject
4747
data?: any
4848
timeout?: number

test/start/index.js

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

test/start/index.ts

Lines changed: 23 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ import XCrawl from '../../src'
55
import { IRequestConfig } from '../../src/types'
66
import { IAreaRoom, IBRecommend } from './types'
77

8-
// 1. 爬取 房间数据
8+
// 1. 房间数据
99
// https://github.com/coder-hxl/airbnb-api
1010

1111
const roomXCrawl = new XCrawl({
@@ -46,66 +46,37 @@ function areaRoomData() {
4646
}
4747
// areaRoomData()
4848

49-
// ==================================================
50-
51-
// 2. 爬取 b站 数据
52-
53-
// 2.1 JSON: b站首页推荐视频的封面图
54-
/*
55-
https://api.bilibili.com/x/web-interface/wbi/index/top/feed/rcmd?y_num=5&fresh_type=3&feed_version=V8&fresh_idx_1h=1&fetch_row=1&fresh_idx=1&brush=0&homepage_ver=1&ps=10&outside_trigger=&w_rid=921db33671365ec8b9f7cab1971a3834&wts=1674553870
56-
*/
49+
// 2. HTML: GitHub Docs
50+
// 采用 jsdom 对 HTML String 解析
5751

58-
const bilibiliXCrawl = new XCrawl({
52+
const githubDocsXCrawl = new XCrawl({
5953
timeout: 10000,
6054
intervalTime: {
61-
max: 1500,
55+
max: 3000,
6256
min: 1000
6357
}
6458
})
6559

66-
async function bilibiliRecommendData() {
67-
const recommend = await bilibiliXCrawl.fetch<IBRecommend>({
68-
requestConifg: {
69-
url: 'https://api.bilibili.com/x/web-interface/wbi/index/top/feed/rcmd',
70-
method: 'GET',
71-
params: {
72-
y_num: 5,
73-
fresh_type: 3,
74-
feed_version: 'V8',
75-
fresh_idx_1h: 1,
76-
fetch_row: 1,
77-
fresh_idx: 1,
78-
brush: 0,
79-
homepage_ver: 1,
80-
ps: 10,
81-
outside_trigger: '',
82-
w_rid: '2e4be8e9830ecd780c5b0ff2bef805c9',
83-
wts: 1674556002
84-
}
85-
}
86-
})
87-
88-
const pictureUrls: IRequestConfig[] = recommend.data.item.map((item) => ({
89-
url: item.pic,
90-
method: 'GET'
91-
}))
60+
async function githubDocs() {
61+
const dom = await githubDocsXCrawl.fetchHTML('https://docs.github.com/zh')
9262

93-
const storeFile = await bilibiliXCrawl.fetchFile({
94-
requestConifg: pictureUrls,
95-
intervalTime: { max: 3000, min: 2000 },
96-
fileConfig: { storeDir: path.resolve(__dirname, './upload') }
97-
})
98-
99-
console.log(storeFile)
63+
console.log(dom.window.document.querySelector('title')?.textContent)
10064
}
101-
bilibiliRecommendData()
10265

103-
// 2.2 HTML: b站首页标题
104-
// 采用 jsdom 对 HTML String 解析
66+
// githubDocs()
67+
68+
// https://docs.github.com/assets/cb-262/images/octicons/search-16.svg
10569

106-
function bilibiliHTMLData() {
107-
bilibiliXCrawl.fetchHTML('https://www.bilibili.com/').then((dom) => {
108-
console.log(dom.window.document.querySelector('title')?.textContent)
70+
githubDocsXCrawl
71+
.fetchFile({
72+
requestConifg: {
73+
url: 'https://docs.github.com/assets/cb-262/images/octicons/search-16.svg',
74+
method: 'GET'
75+
},
76+
fileConfig: {
77+
storeDir: path.resolve(__dirname, './upload')
78+
}
79+
})
80+
.then((fileInfos) => {
81+
console.log(fileInfos)
10982
})
110-
}
111-
// bilibiliHTMLData()

tsconfig.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
"allowSyntheticDefaultImports": true,
99
"forceConsistentCasingInFileNames": true,
1010
"strict": true,
11-
"skipLibCheck": true
11+
"skipLibCheck": true,
12+
"composite": true,
13+
"outDir": "./publish"
1214
},
1315
"include": ["src/**/*", "test/**/*"]
1416
}

0 commit comments

Comments
 (0)