@@ -51,11 +51,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
5151 def __init__ (self ) -> None :
5252 self .index_url = "https://www.xiaohongshu.com"
5353 # self.user_agent = utils.get_user_agent()
54- self .user_agent = (
55- config .UA
56- if config .UA
57- else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
58- )
54+ self .user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
5955 self .cdp_manager = None
6056
6157 async def start (self ) -> None :
@@ -91,17 +87,6 @@ async def start(self) -> None:
9187 )
9288 # stealth.min.js is a js script to prevent the website from detecting the crawler.
9389 await self .browser_context .add_init_script (path = "libs/stealth.min.js" )
94- # add a cookie attribute webId to avoid the appearance of a sliding captcha on the webpage
95- await self .browser_context .add_cookies (
96- [
97- {
98- "name" : "webId" ,
99- "value" : "xxx123" , # any value
100- "domain" : ".xiaohongshu.com" ,
101- "path" : "/" ,
102- }
103- ]
104- )
10590 self .context_page = await self .browser_context .new_page ()
10691 await self .context_page .goto (self .index_url )
10792
@@ -152,7 +137,7 @@ async def search(self) -> None:
152137 page = 1
153138 search_id = get_search_id ()
154139 while (
155- page - start_page + 1
140+ page - start_page + 1
156141 ) * xhs_limit_count <= config .CRAWLER_MAX_NOTES_COUNT :
157142 if page < start_page :
158143 utils .logger .info (f"[XiaoHongShuCrawler.search] Skip page { page } " )
@@ -294,11 +279,11 @@ async def get_specified_notes(self):
294279 await self .batch_get_note_comments (need_get_comment_note_ids , xsec_tokens )
295280
296281 async def get_note_detail_async_task (
297- self ,
298- note_id : str ,
299- xsec_source : str ,
300- xsec_token : str ,
301- semaphore : asyncio .Semaphore ,
282+ self ,
283+ note_id : str ,
284+ xsec_source : str ,
285+ xsec_token : str ,
286+ semaphore : asyncio .Semaphore ,
302287 ) -> Optional [Dict ]:
303288 """Get note detail
304289
@@ -311,47 +296,31 @@ async def get_note_detail_async_task(
311296 Returns:
312297 Dict: note detail
313298 """
314- note_detail_from_html , note_detail_from_api = None , None
299+ note_detail = None
315300 async with semaphore :
316- # When proxy is not enabled, increase the crawling interval
317- if config .ENABLE_IP_PROXY :
318- crawl_interval = random .random ()
319- else :
320- crawl_interval = random .uniform (1 , config .CRAWLER_MAX_SLEEP_SEC )
321301 try :
322302 utils .logger .info (
323303 f"[get_note_detail_async_task] Begin get note detail, note_id: { note_id } "
324304 )
325- # 尝试直接获取网页版笔记详情,携带cookie
326- note_detail_from_html : Optional [ Dict ] = (
327- await self .xhs_client .get_note_by_id_from_html (
328- note_id , xsec_source , xsec_token , enable_cookie = True
305+
306+ try :
307+ note_detail = await self .xhs_client .get_note_by_id (
308+ note_id , xsec_source , xsec_token
329309 )
310+ except RetryError as e :
311+ pass
312+
313+ if not note_detail :
314+ note_detail = await self .xhs_client .get_note_by_id_from_html (note_id , xsec_source , xsec_token ,
315+ enable_cookie = False )
316+ if not note_detail :
317+ raise Exception (f"[get_note_detail_async_task] Failed to get note detail, Id: { note_id } " )
318+
319+ note_detail .update (
320+ {"xsec_token" : xsec_token , "xsec_source" : xsec_source }
330321 )
331- time .sleep (crawl_interval )
332- if not note_detail_from_html :
333- # 如果网页版笔记详情获取失败,则尝试不使用cookie获取
334- note_detail_from_html = (
335- await self .xhs_client .get_note_by_id_from_html (
336- note_id , xsec_source , xsec_token , enable_cookie = False
337- )
338- )
339- utils .logger .error (
340- f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: { note_id } "
341- )
342- if not note_detail_from_html :
343- # 如果网页版笔记详情获取失败,则尝试API获取
344- note_detail_from_api : Optional [Dict ] = (
345- await self .xhs_client .get_note_by_id (
346- note_id , xsec_source , xsec_token
347- )
348- )
349- note_detail = note_detail_from_html or note_detail_from_api
350- if note_detail :
351- note_detail .update (
352- {"xsec_token" : xsec_token , "xsec_source" : xsec_source }
353- )
354- return note_detail
322+ return note_detail
323+
355324 except DataFetchError as ex :
356325 utils .logger .error (
357326 f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: { ex } "
@@ -364,7 +333,7 @@ async def get_note_detail_async_task(
364333 return None
365334
366335 async def batch_get_note_comments (
367- self , note_list : List [str ], xsec_tokens : List [str ]
336+ self , note_list : List [str ], xsec_tokens : List [str ]
368337 ):
369338 """Batch get note comments"""
370339 if not config .ENABLE_GET_COMMENTS :
@@ -389,7 +358,7 @@ async def batch_get_note_comments(
389358 await asyncio .gather (* task_list )
390359
391360 async def get_comments (
392- self , note_id : str , xsec_token : str , semaphore : asyncio .Semaphore
361+ self , note_id : str , xsec_token : str , semaphore : asyncio .Semaphore
393362 ):
394363 """Get note comments with keyword filtering and quantity limitation"""
395364 async with semaphore :
@@ -411,7 +380,7 @@ async def get_comments(
411380
412381 @staticmethod
413382 def format_proxy_info (
414- ip_proxy_info : IpInfoModel ,
383+ ip_proxy_info : IpInfoModel ,
415384 ) -> Tuple [Optional [Dict ], Optional [Dict ]]:
416385 """format proxy info for playwright and httpx"""
417386 playwright_proxy = {
@@ -447,11 +416,11 @@ async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClie
447416 return xhs_client_obj
448417
449418 async def launch_browser (
450- self ,
451- chromium : BrowserType ,
452- playwright_proxy : Optional [Dict ],
453- user_agent : Optional [str ],
454- headless : bool = True ,
419+ self ,
420+ chromium : BrowserType ,
421+ playwright_proxy : Optional [Dict ],
422+ user_agent : Optional [str ],
423+ headless : bool = True ,
455424 ) -> BrowserContext :
456425 """Launch browser and create browser context"""
457426 utils .logger .info (
@@ -480,11 +449,11 @@ async def launch_browser(
480449 return browser_context
481450
482451 async def launch_browser_with_cdp (
483- self ,
484- playwright : Playwright ,
485- playwright_proxy : Optional [Dict ],
486- user_agent : Optional [str ],
487- headless : bool = True ,
452+ self ,
453+ playwright : Playwright ,
454+ playwright_proxy : Optional [Dict ],
455+ user_agent : Optional [str ],
456+ headless : bool = True ,
488457 ) -> BrowserContext :
489458 """
490459 使用CDP模式启动浏览器
0 commit comments