fix: bili词云图修复

NanmiCoder · NanmiCoder · commit 889fa014664f · 2025-11-02T13:25:31.000+08:00
diff --git a/main.py b/main.py
@@ -24,6 +24,8 @@
 from media_platform.weibo import WeiboCrawler
 from media_platform.xhs import XiaoHongShuCrawler
 from media_platform.zhihu import ZhihuCrawler
+from tools.async_file_writer import AsyncFileWriter
+from var import crawler_type_var
 
 
 class CrawlerFactory:
@@ -72,6 +74,18 @@ async def main():
     crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
     await crawler.start()
 
+    # Generate wordcloud after crawling is complete
+    # Only for JSON save mode
+    if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
+        try:
+            file_writer = AsyncFileWriter(
+                platform=config.PLATFORM,
+                crawler_type=crawler_type_var.get()
+            )
+            await file_writer.generate_wordcloud_from_comments()
+        except Exception as e:
+            print(f"Error generating wordcloud: {e}")
+
 
 def cleanup():
     if crawler:
diff --git a/store/bilibili/_store_impl.py b/store/bilibili/_store_impl.py
@@ -37,7 +37,7 @@ class BiliCsvStoreImplement(AbstractStore):
     def __init__(self):
         self.file_writer = AsyncFileWriter(
             crawler_type=crawler_type_var.get(),
-            platform="bilibili"
+            platform="bili"
         )
 
     async def store_content(self, content_item: Dict):
@@ -220,7 +220,7 @@ class BiliJsonStoreImplement(AbstractStore):
     def __init__(self):
         self.file_writer = AsyncFileWriter(
             crawler_type=crawler_type_var.get(),
-            platform="bilibili"
+            platform="bili"
         )
 
     async def store_content(self, content_item: Dict):
diff --git a/store/bilibili/bilibilli_store_media.py b/store/bilibili/bilibilli_store_media.py
@@ -22,7 +22,7 @@
 
 
 class BilibiliVideo(AbstractStoreVideo):
-    video_store_path: str = "data/bilibili/videos"
+    video_store_path: str = "data/bili/videos"
 
     async def store_video(self, video_content_item: Dict):
         """
diff --git a/tools/async_file_writer.py b/tools/async_file_writer.py
@@ -5,13 +5,16 @@
 import pathlib
 from typing import Dict, List
 import aiofiles
+import config
 from tools.utils import utils
+from tools.words import AsyncWordCloudGenerator
 
 class AsyncFileWriter:
     def __init__(self, platform: str, crawler_type: str):
         self.lock = asyncio.Lock()
         self.platform = platform
         self.crawler_type = crawler_type
+        self.wordcloud_generator = AsyncWordCloudGenerator() if config.ENABLE_GET_WORDCLOUD else None
 
     def _get_file_path(self, file_type: str, item_type: str) -> str:
         base_path = f"data/{self.platform}/{file_type}"
@@ -47,4 +50,58 @@ async def write_single_item_to_json(self, item: Dict, item_type: str):
             existing_data.append(item)
 
             async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
-                await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
+                await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
+
+    async def generate_wordcloud_from_comments(self):
+        """
+        Generate wordcloud from comments data
+        Only works when ENABLE_GET_WORDCLOUD and ENABLE_GET_COMMENTS are True
+        """
+        if not config.ENABLE_GET_WORDCLOUD or not config.ENABLE_GET_COMMENTS:
+            return
+
+        if not self.wordcloud_generator:
+            return
+
+        try:
+            # Read comments from JSON file
+            comments_file_path = self._get_file_path('json', 'comments')
+            if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0:
+                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
+                return
+
+            async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
+                content = await f.read()
+                if not content:
+                    utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
+                    return
+
+                comments_data = json.loads(content)
+                if not isinstance(comments_data, list):
+                    comments_data = [comments_data]
+
+            # Filter comments data to only include 'content' field
+            # Handle different comment data structures across platforms
+            filtered_data = []
+            for comment in comments_data:
+                if isinstance(comment, dict):
+                    # Try different possible content field names
+                    content_text = comment.get('content') or comment.get('comment_text') or comment.get('text') or ''
+                    if content_text:
+                        filtered_data.append({'content': content_text})
+
+            if not filtered_data:
+                utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No valid comment content found")
+                return
+
+            # Generate wordcloud
+            words_base_path = f"data/{self.platform}/words"
+            pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
+            words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"
+
+            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Generating wordcloud from {len(filtered_data)} comments")
+            await self.wordcloud_generator.generate_word_frequency_and_cloud(filtered_data, words_file_prefix)
+            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Wordcloud generated successfully at {words_file_prefix}")
+
+        except Exception as e:
+            utils.logger.error(f"[AsyncFileWriter.generate_wordcloud_from_comments] Error generating wordcloud: {e}")
diff --git a/tools/utils.py b/tools/utils.py
@@ -26,6 +26,10 @@ def init_loging_config():
     )
     _logger = logging.getLogger("MediaCrawler")
     _logger.setLevel(level)
+
+    # 关闭 httpx 的 INFO 日志
+    logging.getLogger("httpx").setLevel(logging.WARNING)
+
     return _logger
 
 

Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,10 @@ def init_loging_config():`
`26`	`26`	`)`
`27`	`27`	`_logger = logging.getLogger("MediaCrawler")`
`28`	`28`	`_logger.setLevel(level)`
	`29`	`+`
	`30`	`+ # 关闭 httpx 的 INFO 日志`
	`31`	`+ logging.getLogger("httpx").setLevel(logging.WARNING)`
	`32`	`+`
`29`	`33`	`return _logger`
`30`	`34`
`31`	`35`