Skip to content

Commit 42ba951

Browse files
committed
test: test
1 parent 615466a commit 42ba951

File tree

1 file changed

+21
-24
lines changed

1 file changed

+21
-24
lines changed

test/dev/index.ts

Lines changed: 21 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -1,44 +1,41 @@
1-
import path from 'node:path'
21
import { createCrawl, createCrawlOpenAI } from 'x-crawl'
3-
import { fileURLToPath } from 'node:url'
42

53
import { BASE_URL, API_KEY } from './envConfig'
4+
import { fileURLToPath } from 'url'
65

7-
const pathResolve = (dirPath: string) =>
8-
fileURLToPath(new URL(dirPath, import.meta.url))
6+
const pathResolve = (dir: string) =>
7+
fileURLToPath(new URL(dir, import.meta.url))
98

10-
const crawlOpenAIApp = createCrawlOpenAI({
11-
clientOptions: { baseURL: BASE_URL, apiKey: API_KEY }
9+
const crawlApp = createCrawl({
10+
maxRetry: 3,
11+
intervalTime: { max: 2000, min: 1000 }
1212
})
1313

14-
const crawlApp = createCrawl({
15-
crawlPage: { puppeteerLaunchOptions: { headless: true } }
14+
const crawlOpenAIApp = createCrawlOpenAI({
15+
clientOptions: { baseURL: BASE_URL, apiKey: API_KEY },
16+
defaultModel: { chatModel: 'gpt-4-turbo-preview' }
1617
})
1718

19+
// crawlPage 用于爬取页面
1820
crawlApp.crawlPage('https://www.airbnb.cn/s/select_homes').then(async (res) => {
1921
const { page, browser } = res.data
2022

21-
// await page.waitForSelector('.g1nr81q6')
22-
// const sectionHTML = await page.$eval('.g1nr81q6 ', (el) => el.innerHTML)
23-
await page.waitForSelector(
24-
'.g1nr81q6 > a:nth-child(1), .g1nr81q6 > a:nth-child(2)'
25-
)
26-
const sectionHTML = await page.$$eval(
27-
'.g1nr81q6 > a:nth-child(1), .g1nr81q6 > a:nth-child(2) ',
28-
(els) => els.reduce((p, v) => p + v.innerHTML, '')
29-
)
23+
// 等待元素出现在页面中, 并获取 HTML
24+
const targetSelector = '[data-tracking-id="TOP_REVIEWED_LISTINGS"]'
25+
await page.waitForSelector(targetSelector)
26+
const highlyHTML = await page.$eval(targetSelector, (el) => el.innerHTML)
3027

31-
const srcResult = await crawlOpenAIApp.parseElements<{ src: string }>(
32-
sectionHTML,
33-
`获取 img 的 src`
28+
// 让 AI 获取图片链接, 并去重
29+
const srcResult = await crawlOpenAIApp.parseElements(
30+
highlyHTML,
31+
'获取图片链接, 不要source里面的, 并去重'
3432
)
3533

36-
console.log(srcResult)
34+
browser.close()
3735

36+
// crawlFile 用于爬取文件资源
3837
crawlApp.crawlFile({
3938
targets: srcResult.elements.map((item) => item.src),
40-
storeDirs: pathResolve('upload')
39+
storeDirs: pathResolve('./upload')
4140
})
42-
43-
browser.close()
4441
})

0 commit comments

Comments
 (0)