From 65a99283e083d6ccc27f3514fbc55ccce38be6fc Mon Sep 17 00:00:00 2001 From: wangyan Date: Wed, 24 Jun 2026 11:26:50 +0800 Subject: [PATCH] fix xiaohongshu search navigation --- clis/xiaohongshu/search.js | 58 ++++++++++++++++++-- clis/xiaohongshu/search.test.js | 95 ++++++++++++++++++++++++++++----- 2 files changed, 136 insertions(+), 17 deletions(-) diff --git a/clis/xiaohongshu/search.js b/clis/xiaohongshu/search.js index a56475bad..a9ec1a1c4 100644 --- a/clis/xiaohongshu/search.js +++ b/clis/xiaohongshu/search.js @@ -10,7 +10,8 @@ import { ArgumentError, AuthRequiredError, CommandExecutionError } from '@jackwe /** * Wait for search results or login wall using MutationObserver (max 5s). * Returns 'content' if note items appeared, 'login_wall' if login gate - * detected, or 'timeout' if neither appeared within the deadline. + * detected, 'security_block' for Xiaohongshu's account/security restriction + * page, or 'timeout' if none appeared within the deadline. * * Note-item detection tries the legacy `section.note-item` class first * (still observed in many sessions, including rednote) and falls back to @@ -24,7 +25,12 @@ const WAIT_FOR_CONTENT_JS = ` ); const detect = () => { if (findNoteCard()) return 'content'; - if (/登录后查看搜索结果/.test(document.body?.innerText || '')) return 'login_wall'; + const text = document.body?.innerText || ''; + if (/登录后查看搜索结果/.test(text)) return 'login_wall'; + if ( + /安全限制|Account abnormal|300011/.test(text) || + /\\/website-login\\/error/.test(location.pathname) + ) return 'security_block'; return null; }; const found = detect(); @@ -37,6 +43,33 @@ const WAIT_FOR_CONTENT_JS = ` setTimeout(() => { observer.disconnect(); resolve('timeout'); }, 5000); }) `; +const WAIT_FOR_SEARCH_ROUTE_JS = ` + new Promise((resolve) => { + const isSearchRoute = () => /\\/search_result(?:_ai)?/.test(location.pathname); + const isSecurityBlock = () => { + const text = document.body?.innerText || ''; + return /安全限制|Account abnormal|300011/.test(text) || /\\/website-login\\/error/.test(location.pathname); + }; + const detect = () => { + if (isSearchRoute()) return 'search_route'; + if (isSecurityBlock()) return 'security_block'; + return null; + }; + const found = detect(); + if (found) return resolve(found); + const timer = setInterval(() => { + const result = detect(); + if (result) { + clearInterval(timer); + resolve(result); + } + }, 100); + setTimeout(() => { + clearInterval(timer); + resolve('timeout'); + }, 8000); + }) +`; /** * Extract approximate publish date from a Xiaohongshu note URL. * XHS note IDs follow MongoDB ObjectID format where the first 8 hex @@ -80,6 +113,23 @@ function requireSearchRows(payload, phase) { } return rows; } +function assertSearchRoute(payload) { + const result = unwrapEvaluateResult(payload); + if (result === 'search_route') { + return; + } + if (result === 'security_block') { + throw new CommandExecutionError('Xiaohongshu search hit a security restriction page instead of search results.'); + } + throw new CommandExecutionError('Xiaohongshu search did not reach a search results route after submitting the query.'); +} +async function submitSearchFromHome(page, query) { + await page.goto('https://www.xiaohongshu.com/'); + await page.wait?.(1); + await page.click('#search-input-in-feeds'); + await page.typeText('#search-input-in-feeds', query); + await page.pressKey('Enter'); +} export function parseLimit(raw) { const parsed = Number(raw ?? 20); if (!Number.isFinite(parsed) || !Number.isInteger(parsed)) { @@ -282,8 +332,8 @@ export const command = cli({ columns: ['rank', 'title', 'author', 'likes', 'published_at', 'url'], func: async (page, kwargs) => { const limit = parseLimit(kwargs.limit); - const keyword = encodeURIComponent(kwargs.query); - await page.goto(`https://www.xiaohongshu.com/search_result?keyword=${keyword}&source=web_search_result_notes`); + await submitSearchFromHome(page, kwargs.query); + assertSearchRoute(await page.evaluate(WAIT_FOR_SEARCH_ROUTE_JS)); // Wait for search results to render (or login wall to appear). // Uses MutationObserver to resolve as soon as content appears, // instead of a fixed delay + blind retry. diff --git a/clis/xiaohongshu/search.test.js b/clis/xiaohongshu/search.test.js index 88b96ab67..7975d2a69 100644 --- a/clis/xiaohongshu/search.test.js +++ b/clis/xiaohongshu/search.test.js @@ -45,21 +45,47 @@ describe('xiaohongshu search', () => { }); expect(page.goto).not.toHaveBeenCalled(); }); + it('submits the keyword through the Xiaohongshu search box instead of direct-linking search results', async () => { + const cmd = getRegistry().get('xiaohongshu/search'); + const page = createPageMock([ + 'search_route', + 'content', + [ + { + title: 'Result A', + author: 'UserA', + likes: '10', + url: 'https://www.xiaohongshu.com/search_result/aaa', + author_url: '', + }, + ], + ]); + + await cmd.func(page, { query: '增额寿险靠谱吗', limit: 1 }); + + expect(page.goto).toHaveBeenCalledWith('https://www.xiaohongshu.com/'); + expect(page.click).toHaveBeenCalledWith('#search-input-in-feeds'); + expect(page.typeText).toHaveBeenCalledWith('#search-input-in-feeds', '增额寿险靠谱吗'); + expect(page.pressKey).toHaveBeenCalledWith('Enter'); + }); it('throws a clear error when the search page is blocked by a login wall', async () => { const cmd = getRegistry().get('xiaohongshu/search'); expect(cmd?.func).toBeTypeOf('function'); const page = createPageMock([ - // First evaluate: MutationObserver wait (login wall detected) + // First evaluate: search route reached + 'search_route', + // Second evaluate: MutationObserver wait (login wall detected) 'login_wall', ]); await expect(cmd.func(page, { query: '特斯拉', limit: 5 })).rejects.toThrow('Xiaohongshu search results are blocked behind a login wall'); // No scroll-until / autoScroll call when a login wall is detected early. - expect(page.evaluate).toHaveBeenCalledTimes(1); + expect(page.evaluate).toHaveBeenCalledTimes(2); expect(page.autoScroll).not.toHaveBeenCalled(); }); it('unwraps a browser-bridge envelope before handling login-wall wait result', async () => { const cmd = getRegistry().get('xiaohongshu/search'); const page = createPageMock([ + 'search_route', { session: 'site:xiaohongshu', data: 'login_wall' }, ]); @@ -67,6 +93,40 @@ describe('xiaohongshu search', () => { code: 'AUTH_REQUIRED', message: expect.stringContaining('blocked behind a login wall'), }); + expect(page.evaluate).toHaveBeenCalledTimes(2); + }); + it('throws a clear error when Xiaohongshu redirects search to a security restriction page', async () => { + const cmd = getRegistry().get('xiaohongshu/search'); + const page = createPageMock([ + { session: 'site:xiaohongshu', data: 'security_block' }, + ]); + + await expect(cmd.func(page, { query: '增额寿险靠谱吗', limit: 5 })).rejects.toMatchObject({ + code: 'COMMAND_EXEC', + message: expect.stringContaining('security restriction'), + }); + expect(page.evaluate).toHaveBeenCalledTimes(1); + }); + it('does not extract homepage feed cards when search submission never reaches a search results route', async () => { + const cmd = getRegistry().get('xiaohongshu/search'); + const page = createPageMock([ + 'timeout', + 'content', + [ + { + title: 'Homepage feed card', + author: 'FeedUser', + likes: '1', + url: 'https://www.xiaohongshu.com/explore/homefeed', + author_url: '', + }, + ], + ]); + + await expect(cmd.func(page, { query: '增额寿险靠谱吗', limit: 5 })).rejects.toMatchObject({ + code: 'COMMAND_EXEC', + message: expect.stringContaining('did not reach a search results route'), + }); expect(page.evaluate).toHaveBeenCalledTimes(1); }); it('returns ranked results with search_result url and author_url preserved', async () => { @@ -84,9 +144,11 @@ describe('xiaohongshu search', () => { }, ]; const page = createPageMock([ - // First evaluate: MutationObserver wait (content appeared) + // First evaluate: search route reached + 'search_route', + // Second evaluate: MutationObserver wait (content appeared) 'content', - // Second evaluate: initial DOM extraction (already enough results) through Browser Bridge envelope. + // Third evaluate: initial DOM extraction (already enough results) through Browser Bridge envelope. { session: 'site:xiaohongshu', data: rows }, ]); const result = await cmd.func(page, { query: '特斯拉', limit: 1 }); @@ -107,6 +169,7 @@ describe('xiaohongshu search', () => { it('fails typed instead of silently returning [] for malformed extraction payloads', async () => { const cmd = getRegistry().get('xiaohongshu/search'); const page = createPageMock([ + 'search_route', 'content', { session: 'site:xiaohongshu', data: { rows: [] } }, ]); @@ -120,9 +183,11 @@ describe('xiaohongshu search', () => { const cmd = getRegistry().get('xiaohongshu/search'); expect(cmd?.func).toBeTypeOf('function'); const page = createPageMock([ - // First evaluate: MutationObserver wait (content appeared) + // First evaluate: search route reached + 'search_route', + // Second evaluate: MutationObserver wait (content appeared) 'content', - // Second evaluate: initial DOM extraction (already enough valid rows) + // Third evaluate: initial DOM extraction (already enough valid rows) [ { title: 'Result A', @@ -156,26 +221,29 @@ describe('xiaohongshu search', () => { const cmd = getRegistry().get('xiaohongshu/search'); expect(cmd?.func).toBeTypeOf('function'); const page = createPageMock([ - // First evaluate: MutationObserver wait (content appeared) + // First evaluate: search route reached + 'search_route', + // Second evaluate: MutationObserver wait (content appeared) 'content', - // Second evaluate: initial extraction (no rows rendered) + // Third evaluate: initial extraction (no rows rendered) [], - // Third evaluate: scroll-until row count + // Fourth evaluate: scroll-until row count 0, - // Fourth evaluate: post-scroll extraction (still no rows) + // Fifth evaluate: post-scroll extraction (still no rows) [], ]); const result = (await cmd.func(page, { query: '测试等待', limit: 5 })); expect(result).toHaveLength(0); // Only one navigation, no retry expect(page.goto).toHaveBeenCalledTimes(1); - // Four evaluate calls: wait, initial extraction, scroll-until, post-scroll extraction. - expect(page.evaluate).toHaveBeenCalledTimes(4); + // Five evaluate calls: route wait, content wait, initial extraction, scroll-until, post-scroll extraction. + expect(page.evaluate).toHaveBeenCalledTimes(5); }); it('scrolls only when the initial extraction has fewer rows than requested', async () => { const cmd = getRegistry().get('xiaohongshu/search'); expect(cmd?.func).toBeTypeOf('function'); const page = createPageMock([ + 'search_route', 'content', [ { title: 'Result A', author: 'UserA', likes: '10', url: 'https://www.xiaohongshu.com/search_result/aaa', author_url: '' }, @@ -191,7 +259,7 @@ describe('xiaohongshu search', () => { expect(result).toHaveLength(2); expect(result.map((item) => item.title)).toEqual(['Result A', 'Result B']); - expect(page.evaluate).toHaveBeenCalledTimes(4); + expect(page.evaluate).toHaveBeenCalledTimes(5); }); it('separates fallback author text from appended relative date', async () => { const cmd = getRegistry().get('xiaohongshu/search'); @@ -208,6 +276,7 @@ describe('xiaohongshu search', () => { `, { url: 'https://www.xiaohongshu.com/search_result?keyword=test' }); markVisible(dom.window.document.querySelector('section.note-item')); const page = createPageMock([]); + page.evaluate.mockImplementationOnce(async () => 'search_route'); page.evaluate.mockImplementationOnce(async () => 'content'); page.evaluate.mockImplementationOnce(async (script) => Function('document', 'getComputedStyle', `return (${script})`)(dom.window.document, dom.window.getComputedStyle.bind(dom.window)));