Skip to content

Commit 55d8c77

Browse files
committed
feat: weibo full text support
1 parent ff1b681 commit 55d8c77

File tree

2 files changed

+69
-2
lines changed

2 files changed

+69
-2
lines changed

config/weibo_config.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@
3131

3232
# 指定微博用户ID列表
3333
WEIBO_CREATOR_ID_LIST = [
34-
"5533390220",
34+
"5756404150",
3535
# ........................
3636
]
37+
38+
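# Whether to fetch the full text of Weibo posts (enabled by default).
39+
# Enabling this raises the risk of triggering anti-crawling controls: while iterating over the posts of a keyword search, one extra post-detail request is sent for every post whose text is truncated.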
40+
ENABLE_WEIBO_FULL_TEXT = True

media_platform/weibo/core.py

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,8 @@ async def search(self):
170170
search_res = await self.wb_client.get_note_by_keyword(keyword=keyword, page=page, search_type=search_type)
171171
note_id_list: List[str] = []
172172
note_list = filter_search_result_card(search_res.get("cards"))
173+
# 如果开启了全文获取功能,则批量获取帖子全文
174+
note_list = await self.batch_get_notes_full_text(note_list)
173175
for note_item in note_list:
174176
if note_item:
175177
mblog: Dict = note_item.get("mblog")
@@ -313,12 +315,18 @@ async def get_creators_and_notes(self) -> None:
313315
raise DataFetchError("Get creator info error")
314316
await weibo_store.save_creator(user_id, user_info=createor_info)
315317

318+
# 创建一个包装 callback,在保存数据前获取全文
319+
async def save_notes_with_full_text(note_list: List[Dict]):
320+
# 如果开启了全文获取功能,先批量获取全文
321+
updated_note_list = await self.batch_get_notes_full_text(note_list)
322+
await weibo_store.batch_update_weibo_notes(updated_note_list)
323+
316324
# Get all note information of the creator
317325
all_notes_list = await self.wb_client.get_all_notes_by_creator_id(
318326
creator_id=user_id,
319327
container_id=f"107603{user_id}",
320328
crawl_interval=0,
321-
callback=weibo_store.batch_update_weibo_notes,
329+
callback=save_notes_with_full_text,
322330
)
323331

324332
note_ids = [note_item.get("mblog", {}).get("id") for note_item in all_notes_list if note_item.get("mblog", {}).get("id")]
@@ -406,6 +414,61 @@ async def launch_browser_with_cdp(
406414
chromium = playwright.chromium
407415
return await self.launch_browser(chromium, playwright_proxy, user_agent, headless)
408416

417+
async def get_note_full_text(self, note_item: Dict) -> Dict:
    """
    Fetch the full text of a single post.

    When the post's ``mblog`` is flagged ``isLongText``, the detail endpoint
    (``self.wb_client.get_note_info_by_id``) is requested and the ``mblog``
    payload of ``note_item`` is replaced in place with the one from the
    detail response. Fetch errors are logged and the original item is
    returned unchanged.

    :param note_item: post data containing an ``mblog`` field; may be empty or None
    :return: the same ``note_item`` object, possibly with ``mblog`` replaced
    """
    # Callers iterate the list with an `if note_item:` guard, so entries can
    # be empty; pass them through instead of failing on `.get`.
    if not note_item:
        return note_item

    if not config.ENABLE_WEIBO_FULL_TEXT:
        return note_item

    mblog = note_item.get("mblog", {})
    if not mblog:
        return note_item

    # Only posts flagged as long text need the extra detail request.
    is_long_text = mblog.get("isLongText", False)
    if not is_long_text:
        return note_item

    note_id = mblog.get("id")
    if not note_id:
        return note_item

    try:
        utils.logger.info(f"[WeiboCrawler.get_note_full_text] Fetching full text for note: {note_id}")
        full_note = await self.wb_client.get_note_info_by_id(note_id)
        if full_note and full_note.get("mblog"):
            # Replace the truncated content with the complete content.
            note_item["mblog"] = full_note["mblog"]
            utils.logger.info(f"[WeiboCrawler.get_note_full_text] Successfully fetched full text for note: {note_id}")
    except DataFetchError as ex:
        utils.logger.error(f"[WeiboCrawler.get_note_full_text] Failed to fetch full text for note {note_id}: {ex}")
    except Exception as ex:
        utils.logger.error(f"[WeiboCrawler.get_note_full_text] Unexpected error for note {note_id}: {ex}")

    # Throttle after every attempted request, including failed ones, so that
    # a failing request is not immediately followed by the next one.
    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

    return note_item
456+
457+
async def batch_get_notes_full_text(self, note_list: List[Dict]) -> List[Dict]:
    """
    Fetch the full text for every post in a list.

    When ``config.ENABLE_WEIBO_FULL_TEXT`` is falsy the input list is returned
    as is; otherwise each item is passed, one after another, through
    ``self.get_note_full_text`` and the results are collected in a new list.

    :param note_list: list of post items
    :return: list of post items after the full-text lookup
    """
    if config.ENABLE_WEIBO_FULL_TEXT:
        # Awaiting inside the comprehension keeps the requests sequential.
        return [await self.get_note_full_text(item) for item in note_list]
    return note_list
471+
409472
async def close(self):
410473
"""Close browser context"""
411474
# 如果使用CDP模式,需要特殊处理

0 commit comments

Comments
 (0)