|
1 |
| -import path from 'node:path' |
2 | 1 | import { createCrawl, createCrawlOpenAI } from 'x-crawl'
|
3 |
| -import { fileURLToPath } from 'node:url' |
4 | 2 |
|
5 | 3 | import { BASE_URL, API_KEY } from './envConfig'
|
| 4 | +import { fileURLToPath } from 'url' |
6 | 5 |
|
7 |
| -const pathResolve = (dirPath: string) => |
8 |
| - fileURLToPath(new URL(dirPath, import.meta.url)) |
| 6 | +const pathResolve = (dir: string) => |
| 7 | + fileURLToPath(new URL(dir, import.meta.url)) |
9 | 8 |
|
10 |
| -const crawlOpenAIApp = createCrawlOpenAI({ |
11 |
| - clientOptions: { baseURL: BASE_URL, apiKey: API_KEY } |
| 9 | +const crawlApp = createCrawl({ |
| 10 | + maxRetry: 3, |
| 11 | + intervalTime: { max: 2000, min: 1000 } |
12 | 12 | })
|
13 | 13 |
|
14 |
| -const crawlApp = createCrawl({ |
15 |
| - crawlPage: { puppeteerLaunchOptions: { headless: true } } |
| 14 | +const crawlOpenAIApp = createCrawlOpenAI({ |
| 15 | + clientOptions: { baseURL: BASE_URL, apiKey: API_KEY }, |
| 16 | + defaultModel: { chatModel: 'gpt-4-turbo-preview' } |
16 | 17 | })
|
17 | 18 |
|
| 19 | +// crawlPage 用于爬取页面 |
18 | 20 | crawlApp.crawlPage('https://www.airbnb.cn/s/select_homes').then(async (res) => {
|
19 | 21 | const { page, browser } = res.data
|
20 | 22 |
|
21 |
| - // await page.waitForSelector('.g1nr81q6') |
22 |
| - // const sectionHTML = await page.$eval('.g1nr81q6 ', (el) => el.innerHTML) |
23 |
| - await page.waitForSelector( |
24 |
| - '.g1nr81q6 > a:nth-child(1), .g1nr81q6 > a:nth-child(2)' |
25 |
| - ) |
26 |
| - const sectionHTML = await page.$$eval( |
27 |
| - '.g1nr81q6 > a:nth-child(1), .g1nr81q6 > a:nth-child(2) ', |
28 |
| - (els) => els.reduce((p, v) => p + v.innerHTML, '') |
29 |
| - ) |
| 23 | + // 等待元素出现在页面中, 并获取 HTML |
| 24 | + const targetSelector = '[data-tracking-id="TOP_REVIEWED_LISTINGS"]' |
| 25 | + await page.waitForSelector(targetSelector) |
| 26 | + const highlyHTML = await page.$eval(targetSelector, (el) => el.innerHTML) |
30 | 27 |
|
31 |
| - const srcResult = await crawlOpenAIApp.parseElements<{ src: string }>( |
32 |
| - sectionHTML, |
33 |
| - `获取 img 的 src` |
| 28 | + // 让 AI 获取图片链接, 并去重 |
| 29 | + const srcResult = await crawlOpenAIApp.parseElements( |
| 30 | + highlyHTML, |
| 31 | + '获取图片链接, 不要source里面的, 并去重' |
34 | 32 | )
|
35 | 33 |
|
36 |
| - console.log(srcResult) |
| 34 | + browser.close() |
37 | 35 |
|
| 36 | + // crawlFile 用于爬取文件资源 |
38 | 37 | crawlApp.crawlFile({
|
39 | 38 | targets: srcResult.elements.map((item) => item.src),
|
40 |
| - storeDirs: pathResolve('upload') |
| 39 | + storeDirs: pathResolve('./upload') |
41 | 40 | })
|
42 |
| - |
43 |
| - browser.close() |
44 | 41 | })
|
0 commit comments