@@ -170,6 +170,8 @@ async def search(self):
170170 search_res = await self .wb_client .get_note_by_keyword (keyword = keyword , page = page , search_type = search_type )
171171 note_id_list : List [str ] = []
172172 note_list = filter_search_result_card (search_res .get ("cards" ))
173+ # 如果开启了全文获取功能,则批量获取帖子全文
174+ note_list = await self .batch_get_notes_full_text (note_list )
173175 for note_item in note_list :
174176 if note_item :
175177 mblog : Dict = note_item .get ("mblog" )
@@ -313,12 +315,18 @@ async def get_creators_and_notes(self) -> None:
313315 raise DataFetchError ("Get creator info error" )
314316 await weibo_store .save_creator (user_id , user_info = createor_info )
315317
# Wrapper callback: resolve full text for the notes before they are saved.
async def save_notes_with_full_text(note_list: List[Dict]):
    """
    Run the notes through the full-text lookup, then store the result.

    :param note_list: notes handed to the callback
    """
    # batch_get_notes_full_text is awaited first; whatever it returns is
    # what gets passed to the store.
    await weibo_store.batch_update_weibo_notes(
        await self.batch_get_notes_full_text(note_list)
    )
323+
316324 # Get all note information of the creator
317325 all_notes_list = await self .wb_client .get_all_notes_by_creator_id (
318326 creator_id = user_id ,
319327 container_id = f"107603{ user_id } " ,
320328 crawl_interval = 0 ,
321- callback = weibo_store . batch_update_weibo_notes ,
329+ callback = save_notes_with_full_text ,
322330 )
323331
324332 note_ids = [note_item .get ("mblog" , {}).get ("id" ) for note_item in all_notes_list if note_item .get ("mblog" , {}).get ("id" )]
@@ -406,6 +414,61 @@ async def launch_browser_with_cdp(
406414 chromium = playwright .chromium
407415 return await self .launch_browser (chromium , playwright_proxy , user_agent , headless )
408416
async def get_note_full_text(self, note_item: Dict) -> Dict:
    """
    Fetch the complete content of a note whose text was truncated.

    When the note's ``mblog`` has ``isLongText`` set, the note is requested
    again through ``self.wb_client.get_note_info_by_id``. If that response
    carries an ``mblog``, it replaces ``note_item["mblog"]`` in place.
    Errors raised by the request are logged and not re-raised, so the
    original (truncated) note is returned in that case.

    :param note_item: note data containing an ``mblog`` field; a falsy value
        (``None`` or an empty dict) is returned untouched
    :return: the same ``note_item`` object, possibly with ``mblog`` replaced
    """
    # Callers may pass empty/None entries (the search loop checks
    # ``if note_item`` only after this lookup has run), so bail out before
    # calling ``.get`` on them.
    if not note_item:
        return note_item

    if not config.ENABLE_WEIBO_FULL_TEXT:
        return note_item

    mblog = note_item.get("mblog", {})
    if not mblog:
        return note_item

    # Only notes flagged as long text need the extra detail request.
    if not mblog.get("isLongText", False):
        return note_item

    note_id = mblog.get("id")
    if not note_id:
        return note_item

    try:
        utils.logger.info(f"[WeiboCrawler.get_note_full_text] Fetching full text for note: { note_id } ")
        full_note = await self.wb_client.get_note_info_by_id(note_id)
        if full_note and full_note.get("mblog"):
            # Replace the truncated content with the complete one.
            note_item["mblog"] = full_note["mblog"]
            utils.logger.info(f"[WeiboCrawler.get_note_full_text] Successfully fetched full text for note: { note_id } ")
    except DataFetchError as ex:
        utils.logger.error(f"[WeiboCrawler.get_note_full_text] Failed to fetch full text for note { note_id } : { ex } ")
    except Exception as ex:
        # Best effort: any other failure is logged and the original note kept.
        utils.logger.error(f"[WeiboCrawler.get_note_full_text] Unexpected error for note { note_id } : { ex } ")

    # Sleep after every attempted request, failed ones included, so that a
    # run of errors is not followed by back-to-back requests.
    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
    return note_item
456+
async def batch_get_notes_full_text(self, note_list: List[Dict]) -> List[Dict]:
    """
    Resolve the full text for every note in a list.

    :param note_list: notes to process
    :return: ``note_list`` itself when ``config.ENABLE_WEIBO_FULL_TEXT`` is
        off; otherwise a new list holding the result of
        ``get_note_full_text`` for each note, in the original order
    """
    if config.ENABLE_WEIBO_FULL_TEXT:
        # Each lookup is awaited before the next one starts, so the notes
        # are processed one at a time, in order.
        return [await self.get_note_full_text(entry) for entry in note_list]
    return note_list
471+
409472 async def close (self ):
410473 """Close browser context"""
411474 # 如果使用CDP模式,需要特殊处理
0 commit comments