Skip to content

Commit 8e40bad

Browse files
committed
feat: 优化百度的结果抓取
1 parent a945f56 commit 8e40bad

File tree

7 files changed

+50
-10
lines changed

7 files changed

+50
-10
lines changed

.github/workflows/publish-npm.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ jobs:
2020
registry-url: 'https://registry.npmjs.org'
2121
- run: |
2222
pnpm i
23+
cp README.md packages/browser/README.md
24+
cp README.md packages/mcp/README.md
2325
cd packages/browser && pnpm build && pnpm publish
2426
cd ../mcp && pnpm build && pnpm publish
2527
env:

packages/browser/package.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@tiny-tool/browser",
3-
"version": "1.0.0-beta.6",
3+
"version": "1.0.0-beta.7",
44
"repository": {
55
"url": "https://github.com/tiny-tool/browser"
66
},
@@ -43,7 +43,8 @@
4343
"remark-gfm": "^4.0.1",
4444
"remark-stringify": "^11.0.0",
4545
"unified": "^11.0.5",
46-
"unist-util-visit": "^5.0.0"
46+
"unist-util-visit": "^5.0.0",
47+
"uuid": "^11.1.0"
4748
},
4849
"devDependencies": {
4950
"@types/hast": "^3.0.4",

packages/browser/src/app.ts

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import express from 'express';
2+
import { v4 as uuidv4 } from 'uuid';
23
import { SearchAgent } from './search-agent';
34

45
const app = express();
@@ -32,12 +33,15 @@ app.get('/api/v1/search', async (req, res) => {
3233
console.log('req.query', req.query);
3334
res.send(
3435
await agent.search(engine as string, decodeURIComponent(q as string), payload, {
35-
sessionId: req.query.sessionId as string,
36+
sessionId: (req.query.sessionId as string) || uuidv4(),
3637
}),
3738
);
3839
});
3940

4041
app.listen(port, () => {
4142
console.log(`app listening on port ${port}`);
42-
agent = new SearchAgent();
43+
agent = new SearchAgent({
44+
headless: false,
45+
log: './logs',
46+
});
4347
});

packages/browser/src/baidu/index.ts

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,19 @@
1-
import { Page } from 'puppeteer-core';
1+
import { Page, ElementHandle } from 'puppeteer-core';
22
import { OrganicResult, Result, SearchOptions } from '../define';
33
import { getRedirectUrl } from '../common/get-redirect-url';
44
import { EventType, LogContext } from '../logs';
55
import { resolveUrlAndContent } from '../common/get-page-content';
66

7+
/**
 * Looks up the first descendant of `el` that matches any of the given selectors.
 *
 * Selectors are tried one at a time, in array order, so earlier entries take
 * priority over later ones. The search stops at the first selector for which
 * `el.$` yields a handle.
 *
 * @param selectorList - Candidate CSS selectors, ordered from most to least preferred.
 * @param el - Element handle whose subtree is queried.
 * @returns The handle for the first matching selector, or `null` when no
 *   selector matches (including when `selectorList` is empty).
 */
async function findElement(
  selectorList: readonly string[],
  el: ElementHandle<Element>,
): Promise<ElementHandle | null> {
  // Queries are awaited sequentially on purpose: the array order is the
  // priority order, and later selectors are only tried if earlier ones miss.
  for (const selector of selectorList) {
    const element = await el.$(selector);
    if (element) {
      return element;
    }
  }
  return null;
}
16+
717
async function extractOrganicResults(page: Page): Promise<OrganicResult[]> {
818
const resultList = await page.$$('.result.c-container.new-pmd');
919

@@ -13,7 +23,12 @@ async function extractOrganicResults(page: Page): Promise<OrganicResult[]> {
1323
for (const el of resultList) {
1424
pos++;
1525
let currentPosition = pos;
16-
const title = await el.$('.c-title');
26+
27+
// 提取标题
28+
// 百度可能会调整结构,这里是可能的几个选择器
29+
const titleSelectorList = ['.c-title', 'a[data-module="title"]', 'h3[class^="struct-title_"]', 'div[class^="title-box_"]', 'div[class^="title-wrapper_"]'];
30+
31+
const title = await findElement(titleSelectorList, el);
1732
if (!title) {
1833
continue;
1934
}
@@ -23,13 +38,19 @@ async function extractOrganicResults(page: Page): Promise<OrganicResult[]> {
2338
}
2439

2540
// const site = await el.$('div[class^="source_"] a[class^="siteLink_"]');
26-
const site = await el.$('a[class^="siteLink_"]');
41+
// 提取站点名称
42+
const siteSelectorList = ['a[class^="siteLink_"]', '.cosc-source-link', '.cosc-source-text'];
43+
const site = await findElement(siteSelectorList, el);
2744
const siteContent = site ? await site.evaluate((el) => el.textContent) : '';
2845

29-
const link = (await el.$('.c-title a'))!;
46+
// 结果对应的跳转链接
47+
const titleIsLink = await title.evaluate((el) => el.tagName.toLowerCase() === 'a');
48+
const link = titleIsLink ? (title as ElementHandle<HTMLAnchorElement>) : (await title.$('a'))!;
3049
const linkContent = await link.evaluate((el) => el.href);
3150

32-
const content = (await el.$('span[class^="content-right"]'))!;
51+
// 页面展示的内容
52+
const contentSelectorList = ['span[class^="content-right"]', 'div[data-module="abstract"]', 'span[class^="summary-text_"]'];
53+
const content = (await findElement(contentSelectorList, el))!;
3354
const contentContent = content ? await content.evaluate((el) => el.textContent) : '';
3455

3556
ret.push({

packages/browser/src/util.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
/**
 * Returns a promise that resolves (with no value) once a timer of `ms`
 * milliseconds, scheduled via `setTimeout`, has fired.
 *
 * @param ms - Delay passed to `setTimeout`, in milliseconds.
 * @returns A promise that settles after the timer callback runs.
 */
export function sleep(ms: number): Promise<void> {
  return new Promise((resolve) => setTimeout(resolve, ms));
}

packages/mcp/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@tiny-tool/browser-mcp",
3-
"version": "1.0.0-beta.6",
3+
"version": "1.0.0-beta.7",
44
"bin": "dist/index.js",
55
"repository": {
66
"url": "https://github.com/tiny-tool/browser"

pnpm-lock.yaml

Lines changed: 9 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)