Commit a4d9aaa

refactor: xhs update

1 parent 26a4335

5 files changed: 44 additions & 78 deletions

config/base_config.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@
 # Whether to enable CDP mode - crawl with the user's existing Chrome/Edge browser for better anti-detection
 # When enabled, the user's Chrome/Edge browser is detected and launched automatically and controlled via the CDP protocol
 # This uses a real browser environment, including the user's extensions, cookies, and settings, greatly reducing the risk of detection
-ENABLE_CDP_MODE = False
+ENABLE_CDP_MODE = True

 # CDP debug port, used to communicate with the browser
 # If the port is in use, the system automatically tries the next available port
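Flipping ENABLE_CDP_MODE to True makes CDP the default: instead of launching a fresh Playwright-bundled browser, the crawler attaches to the user's own Chrome/Edge over the DevTools protocol. A minimal sketch of that idea, assuming Chrome was started with --remote-debugging-port=9222 (the port, URL, and function name are illustrative; the project's CDP manager handles detection and port selection itself):

import asyncio
from playwright.async_api import async_playwright

async def attach_over_cdp():
    async with async_playwright() as p:
        # Attach to an already-running browser instead of launching one.
        browser = await p.chromium.connect_over_cdp("http://localhost:9222")
        # Reusing the default context keeps the real profile: extensions,
        # cookies, and settings, which is what lowers detection risk.
        context = browser.contexts[0]
        page = await context.new_page()
        await page.goto("https://www.xiaohongshu.com")

asyncio.run(attach_over_cdp())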

config/xhs_config.py

Lines changed: 0 additions & 3 deletions
@@ -15,9 +15,6 @@
 # Sort order; the concrete enum values live in media_platform/xhs/field.py
 SORT_TYPE = "popularity_descending"

-# Custom User-Agent for xhs
-UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
-
 # List of specified note URLs; each must carry the xsec_token parameter
 XHS_SPECIFIED_NOTE_URL_LIST = [
     "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"

main.py

Lines changed: 2 additions & 1 deletion
@@ -67,7 +67,8 @@ async def main():

 def cleanup():
     if crawler:
-        asyncio.run(crawler.close())
+        # asyncio.run(crawler.close())
+        pass
     if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
         asyncio.run(db.close())
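The close call is commented out rather than fixed; a likely motivation (my assumption, not stated in the commit) is that cleanup() runs at interpreter shutdown, where asyncio.run() raises RuntimeError if an event loop is still running or already closed. A guarded sketch that keeps the close:

def cleanup():
    if crawler:
        try:
            asyncio.run(crawler.close())
        except RuntimeError:
            # Loop already running or closed at shutdown; skip the
            # async close instead of crashing on exit.
            pass
    if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
        asyncio.run(db.close())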

media_platform/xhs/client.py

Lines changed: 3 additions & 4 deletions
@@ -95,17 +95,16 @@ async def request(self, method, url, **kwargs) -> Union[str, Any]:
         """
         # return response.text
         return_response = kwargs.pop("return_response", False)
-
         async with httpx.AsyncClient(proxies=self.proxies) as client:
             response = await client.request(method, url, timeout=self.timeout, **kwargs)

         if response.status_code == 471 or response.status_code == 461:
             # someday someone maybe will bypass captcha
             verify_type = response.headers["Verifytype"]
             verify_uuid = response.headers["Verifyuuid"]
-            raise Exception(
-                f"Captcha appeared, request failed, Verifytype: {verify_type}, Verifyuuid: {verify_uuid}, Response: {response}"
-            )
+            msg = f"Captcha appeared, request failed, Verifytype: {verify_type}, Verifyuuid: {verify_uuid}, Response: {response}"
+            utils.logger.error(msg)
+            raise Exception(msg)

         if return_response:
             return response.text
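Logging before raising preserves the Verifytype/Verifyuuid details in the crawler log even if a retry wrapper later swallows the exception. The core.py hunk below catches tenacity's RetryError, which implies client calls like get_note_by_id sit behind a tenacity @retry decorator; a runnable sketch of that pattern (fetch_note and the retry parameters are assumptions, not the project's actual settings):

import asyncio
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed

@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def fetch_note(note_id: str) -> dict:
    # Stand-in for xhs_client.get_note_by_id: any exception triggers a
    # retry; after the last attempt tenacity raises RetryError instead.
    raise ConnectionError("simulated API failure")

async def main():
    try:
        note = await fetch_note("66fad51c000000001b0224b8")
    except RetryError:
        note = None  # fall through to the HTML fallback, as core.py now does

asyncio.run(main())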

media_platform/xhs/core.py

Lines changed: 38 additions & 69 deletions
@@ -51,11 +51,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
     def __init__(self) -> None:
         self.index_url = "https://www.xiaohongshu.com"
         # self.user_agent = utils.get_user_agent()
-        self.user_agent = (
-            config.UA
-            if config.UA
-            else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
-        )
+        self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
         self.cdp_manager = None

     async def start(self) -> None:

@@ -91,17 +87,6 @@ async def start(self) -> None:
             )
         # stealth.min.js is a js script to prevent the website from detecting the crawler.
         await self.browser_context.add_init_script(path="libs/stealth.min.js")
-        # add a cookie attribute webId to avoid the appearance of a sliding captcha on the webpage
-        await self.browser_context.add_cookies(
-            [
-                {
-                    "name": "webId",
-                    "value": "xxx123",  # any value
-                    "domain": ".xiaohongshu.com",
-                    "path": "/",
-                }
-            ]
-        )
         self.context_page = await self.browser_context.new_page()
         await self.context_page.goto(self.index_url)

@@ -152,7 +137,7 @@ async def search(self) -> None:
             page = 1
             search_id = get_search_id()
             while (
-                page - start_page + 1
+                    page - start_page + 1
             ) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 if page < start_page:
                     utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
@@ -294,11 +279,11 @@ async def get_specified_notes(self):
         await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)

     async def get_note_detail_async_task(
-        self,
-        note_id: str,
-        xsec_source: str,
-        xsec_token: str,
-        semaphore: asyncio.Semaphore,
+            self,
+            note_id: str,
+            xsec_source: str,
+            xsec_token: str,
+            semaphore: asyncio.Semaphore,
     ) -> Optional[Dict]:
         """Get note detail
@@ -311,47 +296,31 @@ async def get_note_detail_async_task(
         Returns:
             Dict: note detail
         """
-        note_detail_from_html, note_detail_from_api = None, None
+        note_detail = None
         async with semaphore:
-            # When proxy is not enabled, increase the crawling interval
-            if config.ENABLE_IP_PROXY:
-                crawl_interval = random.random()
-            else:
-                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
             try:
                 utils.logger.info(
                     f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}"
                 )
-                # Try the web-page version of the note detail first, carrying the cookie
-                note_detail_from_html: Optional[Dict] = (
-                    await self.xhs_client.get_note_by_id_from_html(
-                        note_id, xsec_source, xsec_token, enable_cookie=True
+
+                try:
+                    note_detail = await self.xhs_client.get_note_by_id(
+                        note_id, xsec_source, xsec_token
                     )
+                except RetryError as e:
+                    pass
+
+                if not note_detail:
+                    note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token,
+                                                                                 enable_cookie=False)
+                if not note_detail:
+                    raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
+
+                note_detail.update(
+                    {"xsec_token": xsec_token, "xsec_source": xsec_source}
                 )
-                time.sleep(crawl_interval)
-                if not note_detail_from_html:
-                    # If the web-page version failed, retry without the cookie
-                    note_detail_from_html = (
-                        await self.xhs_client.get_note_by_id_from_html(
-                            note_id, xsec_source, xsec_token, enable_cookie=False
-                        )
-                    )
-                    utils.logger.error(
-                        f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}"
-                    )
-                if not note_detail_from_html:
-                    # If the web-page version failed as well, fall back to the API
-                    note_detail_from_api: Optional[Dict] = (
-                        await self.xhs_client.get_note_by_id(
-                            note_id, xsec_source, xsec_token
-                        )
-                    )
-                note_detail = note_detail_from_html or note_detail_from_api
-                if note_detail:
-                    note_detail.update(
-                        {"xsec_token": xsec_token, "xsec_source": xsec_source}
-                    )
-                return note_detail
+                return note_detail
+
             except DataFetchError as ex:
                 utils.logger.error(
                     f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}"
@@ -364,7 +333,7 @@ async def get_note_detail_async_task(
             return None

     async def batch_get_note_comments(
-        self, note_list: List[str], xsec_tokens: List[str]
+            self, note_list: List[str], xsec_tokens: List[str]
     ):
         """Batch get note comments"""
         if not config.ENABLE_GET_COMMENTS:
@@ -389,7 +358,7 @@ async def batch_get_note_comments(
         await asyncio.gather(*task_list)

     async def get_comments(
-        self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
+            self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
     ):
         """Get note comments with keyword filtering and quantity limitation"""
         async with semaphore:
@@ -411,7 +380,7 @@ async def get_comments(

     @staticmethod
     def format_proxy_info(
-        ip_proxy_info: IpInfoModel,
+            ip_proxy_info: IpInfoModel,
     ) -> Tuple[Optional[Dict], Optional[Dict]]:
         """format proxy info for playwright and httpx"""
         playwright_proxy = {
@@ -447,11 +416,11 @@ async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
         return xhs_client_obj

     async def launch_browser(
-        self,
-        chromium: BrowserType,
-        playwright_proxy: Optional[Dict],
-        user_agent: Optional[str],
-        headless: bool = True,
+            self,
+            chromium: BrowserType,
+            playwright_proxy: Optional[Dict],
+            user_agent: Optional[str],
+            headless: bool = True,
     ) -> BrowserContext:
         """Launch browser and create browser context"""
         utils.logger.info(
@@ -480,11 +449,11 @@ async def launch_browser(
         return browser_context

     async def launch_browser_with_cdp(
-        self,
-        playwright: Playwright,
-        playwright_proxy: Optional[Dict],
-        user_agent: Optional[str],
-        headless: bool = True,
+            self,
+            playwright: Playwright,
+            playwright_proxy: Optional[Dict],
+            user_agent: Optional[str],
+            headless: bool = True,
     ) -> BrowserContext:
         """
         Launch the browser in CDP mode
