Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 54 additions & 4 deletions clis/xiaohongshu/search.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ import { ArgumentError, AuthRequiredError, CommandExecutionError } from '@jackwe
/**
* Wait for search results or login wall using MutationObserver (max 5s).
* Returns 'content' if note items appeared, 'login_wall' if login gate
* detected, or 'timeout' if neither appeared within the deadline.
* detected, 'security_block' for Xiaohongshu's account/security restriction
* page, or 'timeout' if none appeared within the deadline.
*
* Note-item detection tries the legacy `section.note-item` class first
* (still observed in many sessions, including rednote) and falls back to
Expand All @@ -24,7 +25,12 @@ const WAIT_FOR_CONTENT_JS = `
);
const detect = () => {
if (findNoteCard()) return 'content';
if (/登录后查看搜索结果/.test(document.body?.innerText || '')) return 'login_wall';
const text = document.body?.innerText || '';
if (/登录后查看搜索结果/.test(text)) return 'login_wall';
if (
/安全限制|Account abnormal|300011/.test(text) ||
/\\/website-login\\/error/.test(location.pathname)
) return 'security_block';
return null;
};
const found = detect();
Expand All @@ -37,6 +43,33 @@ const WAIT_FOR_CONTENT_JS = `
setTimeout(() => { observer.disconnect(); resolve('timeout'); }, 5000);
})
`;
/**
 * Browser-side expression that waits for the page to land on a search
 * results route after the query has been submitted.
 *
 * Resolves with:
 * - 'search_route' once `location.pathname` contains `/search_result`
 *   (this also covers `/search_result_ai`),
 * - 'security_block' when the pathname contains `/website-login/error` or the
 *   page text contains one of the restriction markers
 *   (安全限制 / Account abnormal / 300011),
 * - 'timeout' when neither condition is met within 8 seconds.
 *
 * The state is checked once immediately and then polled every 100ms. The
 * route check runs before the security check, so a search route always wins.
 * Both timers are cleared as soon as the promise settles so no stray timer is
 * left running in the page.
 *
 * The string must remain a single expression (no leading statements) because
 * it is handed to `page.evaluate` as-is.
 */
const WAIT_FOR_SEARCH_ROUTE_JS = `
new Promise((resolve) => {
  const isSearchRoute = () => /\\/search_result(?:_ai)?/.test(location.pathname);
  const isSecurityBlock = () => {
    if (/\\/website-login\\/error/.test(location.pathname)) return true;
    return /安全限制|Account abnormal|300011/.test(document.body?.innerText || '');
  };
  const detect = () => {
    if (isSearchRoute()) return 'search_route';
    if (isSecurityBlock()) return 'security_block';
    return null;
  };
  const found = detect();
  if (found) return resolve(found);
  let pollTimer;
  let deadlineTimer;
  const finish = (result) => {
    clearInterval(pollTimer);
    clearTimeout(deadlineTimer);
    resolve(result);
  };
  pollTimer = setInterval(() => {
    const result = detect();
    if (result) finish(result);
  }, 100);
  deadlineTimer = setTimeout(() => finish('timeout'), 8000);
})
`;
/**
* Extract approximate publish date from a Xiaohongshu note URL.
* XHS note IDs follow MongoDB ObjectID format where the first 8 hex
Expand Down Expand Up @@ -80,6 +113,23 @@ function requireSearchRows(payload, phase) {
}
return rows;
}
/**
 * Validate the outcome of the search-route wait and fail loudly when the
 * browser did not end up on a search results page.
 *
 * The payload is first passed through `unwrapEvaluateResult` (presumably to
 * strip a Browser Bridge `{ session, data }` envelope — confirm against that
 * helper). Only the exact value 'search_route' is accepted.
 *
 * @param {unknown} payload - Raw result of evaluating the route-wait script.
 * @returns {void}
 * @throws {CommandExecutionError} When the result is 'security_block', or any
 *   other value (e.g. 'timeout').
 */
function assertSearchRoute(payload) {
    const outcome = unwrapEvaluateResult(payload);
    switch (outcome) {
        case 'search_route':
            return;
        case 'security_block':
            throw new CommandExecutionError('Xiaohongshu search hit a security restriction page instead of search results.');
        default:
            throw new CommandExecutionError('Xiaohongshu search did not reach a search results route after submitting the query.');
    }
}
/**
 * Run a search the way a user would: open the Xiaohongshu home page, focus
 * the feed search box, type the query and press Enter.
 *
 * @param {object} page - Browser page handle exposing `goto`, `click`,
 *   `typeText`, `pressKey` and, optionally, `wait`.
 * @param {string} query - Raw (un-encoded) search keyword to type.
 * @returns {Promise<void>}
 */
async function submitSearchFromHome(page, query) {
    const homeUrl = 'https://www.xiaohongshu.com/';
    const searchBox = '#search-input-in-feeds';

    await page.goto(homeUrl);
    // `wait` is optional on the page handle; skip the pause when it is absent.
    await page.wait?.(1);

    await page.click(searchBox);
    await page.typeText(searchBox, query);
    await page.pressKey('Enter');
}
export function parseLimit(raw) {
const parsed = Number(raw ?? 20);
if (!Number.isFinite(parsed) || !Number.isInteger(parsed)) {
Expand Down Expand Up @@ -282,8 +332,8 @@ export const command = cli({
columns: ['rank', 'title', 'author', 'likes', 'published_at', 'url'],
func: async (page, kwargs) => {
const limit = parseLimit(kwargs.limit);
const keyword = encodeURIComponent(kwargs.query);
await page.goto(`https://www.xiaohongshu.com/search_result?keyword=${keyword}&source=web_search_result_notes`);
await submitSearchFromHome(page, kwargs.query);
assertSearchRoute(await page.evaluate(WAIT_FOR_SEARCH_ROUTE_JS));
// Wait for search results to render (or login wall to appear).
// Uses MutationObserver to resolve as soon as content appears,
// instead of a fixed delay + blind retry.
Expand Down
95 changes: 82 additions & 13 deletions clis/xiaohongshu/search.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -45,28 +45,88 @@ describe('xiaohongshu search', () => {
});
expect(page.goto).not.toHaveBeenCalled();
});
// Verifies that the command drives the home-page search box (goto home,
// click, type, Enter) rather than navigating straight to a search_result URL.
it('submits the keyword through the Xiaohongshu search box instead of direct-linking search results', async () => {
const cmd = getRegistry().get('xiaohongshu/search');
const page = createPageMock([
// First evaluate: search route reached
'search_route',
// Second evaluate: content wait (content appeared)
'content',
// Third evaluate: initial DOM extraction (already enough results for limit 1)
[
{
title: 'Result A',
author: 'UserA',
likes: '10',
url: 'https://www.xiaohongshu.com/search_result/aaa',
author_url: '',
},
],
]);

await cmd.func(page, { query: '增额寿险靠谱吗', limit: 1 });

// The raw (un-encoded) query must be typed into the feed search input.
expect(page.goto).toHaveBeenCalledWith('https://www.xiaohongshu.com/');
expect(page.click).toHaveBeenCalledWith('#search-input-in-feeds');
expect(page.typeText).toHaveBeenCalledWith('#search-input-in-feeds', '增额寿险靠谱吗');
expect(page.pressKey).toHaveBeenCalledWith('Enter');
});
it('throws a clear error when the search page is blocked by a login wall', async () => {
const cmd = getRegistry().get('xiaohongshu/search');
expect(cmd?.func).toBeTypeOf('function');
const page = createPageMock([
// First evaluate: MutationObserver wait (login wall detected)
// First evaluate: search route reached
'search_route',
// Second evaluate: MutationObserver wait (login wall detected)
'login_wall',
]);
await expect(cmd.func(page, { query: '特斯拉', limit: 5 })).rejects.toThrow('Xiaohongshu search results are blocked behind a login wall');
// No scroll-until / autoScroll call when a login wall is detected early.
expect(page.evaluate).toHaveBeenCalledTimes(1);
expect(page.evaluate).toHaveBeenCalledTimes(2);
expect(page.autoScroll).not.toHaveBeenCalled();
});
// Verifies that a login-wall result wrapped in a `{ session, data }` envelope
// is still recognised and reported as an AUTH_REQUIRED error.
it('unwraps a browser-bridge envelope before handling login-wall wait result', async () => {
const cmd = getRegistry().get('xiaohongshu/search');
const page = createPageMock([
// First evaluate: search route reached
'search_route',
// Second evaluate: content wait result, delivered inside an envelope
{ session: 'site:xiaohongshu', data: 'login_wall' },
]);

await expect(cmd.func(page, { query: '特斯拉', limit: 5 })).rejects.toMatchObject({
code: 'AUTH_REQUIRED',
message: expect.stringContaining('blocked behind a login wall'),
});
// Route wait + content wait only; no extraction is attempted afterwards.
expect(page.evaluate).toHaveBeenCalledTimes(2);
});
// Verifies that a 'security_block' result from the very first evaluate
// (the route wait) aborts the command with a COMMAND_EXEC error.
it('throws a clear error when Xiaohongshu redirects search to a security restriction page', async () => {
const cmd = getRegistry().get('xiaohongshu/search');
const page = createPageMock([
// First evaluate: route wait result, delivered inside an envelope
{ session: 'site:xiaohongshu', data: 'security_block' },
]);

await expect(cmd.func(page, { query: '增额寿险靠谱吗', limit: 5 })).rejects.toMatchObject({
code: 'COMMAND_EXEC',
message: expect.stringContaining('security restriction'),
});
// The command must stop after the route wait.
expect(page.evaluate).toHaveBeenCalledTimes(1);
});
// Verifies that when the route wait times out, the command fails instead of
// going on to scrape whatever cards happen to be on the current page.
it('does not extract homepage feed cards when search submission never reaches a search results route', async () => {
const cmd = getRegistry().get('xiaohongshu/search');
const page = createPageMock([
// First evaluate: route wait never saw a search results route
'timeout',
// The entries below are queued only to prove they are never consumed:
// the evaluate call count asserted at the end must stay at 1.
'content',
[
{
title: 'Homepage feed card',
author: 'FeedUser',
likes: '1',
url: 'https://www.xiaohongshu.com/explore/homefeed',
author_url: '',
},
],
]);

await expect(cmd.func(page, { query: '增额寿险靠谱吗', limit: 5 })).rejects.toMatchObject({
code: 'COMMAND_EXEC',
message: expect.stringContaining('did not reach a search results route'),
});
expect(page.evaluate).toHaveBeenCalledTimes(1);
});
it('returns ranked results with search_result url and author_url preserved', async () => {
Expand All @@ -84,9 +144,11 @@ describe('xiaohongshu search', () => {
},
];
const page = createPageMock([
// First evaluate: MutationObserver wait (content appeared)
// First evaluate: search route reached
'search_route',
// Second evaluate: MutationObserver wait (content appeared)
'content',
// Second evaluate: initial DOM extraction (already enough results) through Browser Bridge envelope.
// Third evaluate: initial DOM extraction (already enough results) through Browser Bridge envelope.
{ session: 'site:xiaohongshu', data: rows },
]);
const result = await cmd.func(page, { query: '特斯拉', limit: 1 });
Expand All @@ -107,6 +169,7 @@ describe('xiaohongshu search', () => {
it('fails typed instead of silently returning [] for malformed extraction payloads', async () => {
const cmd = getRegistry().get('xiaohongshu/search');
const page = createPageMock([
'search_route',
'content',
{ session: 'site:xiaohongshu', data: { rows: [] } },
]);
Expand All @@ -120,9 +183,11 @@ describe('xiaohongshu search', () => {
const cmd = getRegistry().get('xiaohongshu/search');
expect(cmd?.func).toBeTypeOf('function');
const page = createPageMock([
// First evaluate: MutationObserver wait (content appeared)
// First evaluate: search route reached
'search_route',
// Second evaluate: MutationObserver wait (content appeared)
'content',
// Second evaluate: initial DOM extraction (already enough valid rows)
// Third evaluate: initial DOM extraction (already enough valid rows)
[
{
title: 'Result A',
Expand Down Expand Up @@ -156,26 +221,29 @@ describe('xiaohongshu search', () => {
const cmd = getRegistry().get('xiaohongshu/search');
expect(cmd?.func).toBeTypeOf('function');
const page = createPageMock([
// First evaluate: MutationObserver wait (content appeared)
// First evaluate: search route reached
'search_route',
// Second evaluate: MutationObserver wait (content appeared)
'content',
// Second evaluate: initial extraction (no rows rendered)
// Third evaluate: initial extraction (no rows rendered)
[],
// Third evaluate: scroll-until row count
// Fourth evaluate: scroll-until row count
0,
// Fourth evaluate: post-scroll extraction (still no rows)
// Fifth evaluate: post-scroll extraction (still no rows)
[],
]);
const result = (await cmd.func(page, { query: '测试等待', limit: 5 }));
expect(result).toHaveLength(0);
// Only one navigation, no retry
expect(page.goto).toHaveBeenCalledTimes(1);
// Four evaluate calls: wait, initial extraction, scroll-until, post-scroll extraction.
expect(page.evaluate).toHaveBeenCalledTimes(4);
// Five evaluate calls: route wait, content wait, initial extraction, scroll-until, post-scroll extraction.
expect(page.evaluate).toHaveBeenCalledTimes(5);
});
it('scrolls only when the initial extraction has fewer rows than requested', async () => {
const cmd = getRegistry().get('xiaohongshu/search');
expect(cmd?.func).toBeTypeOf('function');
const page = createPageMock([
'search_route',
'content',
[
{ title: 'Result A', author: 'UserA', likes: '10', url: 'https://www.xiaohongshu.com/search_result/aaa', author_url: '' },
Expand All @@ -191,7 +259,7 @@ describe('xiaohongshu search', () => {

expect(result).toHaveLength(2);
expect(result.map((item) => item.title)).toEqual(['Result A', 'Result B']);
expect(page.evaluate).toHaveBeenCalledTimes(4);
expect(page.evaluate).toHaveBeenCalledTimes(5);
});
it('separates fallback author text from appended relative date', async () => {
const cmd = getRegistry().get('xiaohongshu/search');
Expand All @@ -208,6 +276,7 @@ describe('xiaohongshu search', () => {
`, { url: 'https://www.xiaohongshu.com/search_result?keyword=test' });
markVisible(dom.window.document.querySelector('section.note-item'));
const page = createPageMock([]);
page.evaluate.mockImplementationOnce(async () => 'search_route');
page.evaluate.mockImplementationOnce(async () => 'content');
page.evaluate.mockImplementationOnce(async (script) => Function('document', 'getComputedStyle', `return (${script})`)(dom.window.document, dom.window.getComputedStyle.bind(dom.window)));

Expand Down
Loading