Skip to content

Commit 889fa01

Browse files
committed
fix: bili词云图修复
1 parent 3f5925e commit 889fa01

File tree

5 files changed

+79
-4
lines changed

5 files changed

+79
-4
lines changed

main.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@
2424
from media_platform.weibo import WeiboCrawler
2525
from media_platform.xhs import XiaoHongShuCrawler
2626
from media_platform.zhihu import ZhihuCrawler
27+
from tools.async_file_writer import AsyncFileWriter
28+
from var import crawler_type_var
2729

2830

2931
class CrawlerFactory:
@@ -72,6 +74,18 @@ async def main():
7274
crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
7375
await crawler.start()
7476

77+
# Generate wordcloud after crawling is complete
78+
# Only for JSON save mode
79+
if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
80+
try:
81+
file_writer = AsyncFileWriter(
82+
platform=config.PLATFORM,
83+
crawler_type=crawler_type_var.get()
84+
)
85+
await file_writer.generate_wordcloud_from_comments()
86+
except Exception as e:
87+
print(f"Error generating wordcloud: {e}")
88+
7589

7690
def cleanup():
7791
if crawler:

store/bilibili/_store_impl.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ class BiliCsvStoreImplement(AbstractStore):
3737
def __init__(self):
3838
self.file_writer = AsyncFileWriter(
3939
crawler_type=crawler_type_var.get(),
40-
platform="bilibili"
40+
platform="bili"
4141
)
4242

4343
async def store_content(self, content_item: Dict):
@@ -220,7 +220,7 @@ class BiliJsonStoreImplement(AbstractStore):
220220
def __init__(self):
221221
self.file_writer = AsyncFileWriter(
222222
crawler_type=crawler_type_var.get(),
223-
platform="bilibili"
223+
platform="bili"
224224
)
225225

226226
async def store_content(self, content_item: Dict):

store/bilibili/bilibilli_store_media.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222

2323

2424
class BilibiliVideo(AbstractStoreVideo):
25-
video_store_path: str = "data/bilibili/videos"
25+
video_store_path: str = "data/bili/videos"
2626

2727
async def store_video(self, video_content_item: Dict):
2828
"""

tools/async_file_writer.py

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,16 @@
55
import pathlib
66
from typing import Dict, List
77
import aiofiles
8+
import config
89
from tools.utils import utils
10+
from tools.words import AsyncWordCloudGenerator
911

1012
class AsyncFileWriter:
1113
def __init__(self, platform: str, crawler_type: str):
    """
    Set up a writer for one platform / crawler-type pair.

    Args:
        platform: Platform name; used as the directory component in
            ``data/{platform}/...`` paths built by this class.
        crawler_type: Crawler type label stored on the instance.
    """
    self.platform = platform
    self.crawler_type = crawler_type
    self.lock = asyncio.Lock()
    # The word-cloud generator is only constructed when the feature is
    # switched on in config; otherwise the attribute is None.
    if config.ENABLE_GET_WORDCLOUD:
        self.wordcloud_generator = AsyncWordCloudGenerator()
    else:
        self.wordcloud_generator = None
1518

1619
def _get_file_path(self, file_type: str, item_type: str) -> str:
1720
base_path = f"data/{self.platform}/{file_type}"
@@ -47,4 +50,58 @@ async def write_single_item_to_json(self, item: Dict, item_type: str):
4750
existing_data.append(item)
4851

4952
async with aiofiles.open(file_path, 'w', encoding='utf-8') as f:
50-
await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
53+
await f.write(json.dumps(existing_data, ensure_ascii=False, indent=4))
54+
55+
async def generate_wordcloud_from_comments(self):
    """
    Generate a word cloud from this writer's comments JSON file.

    Reads the file at ``self._get_file_path('json', 'comments')``, collects
    the text of every comment, and passes the result to
    ``self.wordcloud_generator.generate_word_frequency_and_cloud`` together
    with an output prefix of the form
    ``data/{platform}/words/{crawler_type}_comments_{date}``. The ``words``
    directory is created here if it does not exist.

    The method returns without doing anything unless both
    ``config.ENABLE_GET_WORDCLOUD`` and ``config.ENABLE_GET_COMMENTS`` are
    truthy and ``self.wordcloud_generator`` is set.

    This is a best-effort step: every exception is logged through
    ``utils.logger.error`` and never propagated to the caller.
    """
    if not (config.ENABLE_GET_WORDCLOUD and config.ENABLE_GET_COMMENTS):
        return

    if not self.wordcloud_generator:
        return

    try:
        # Read comments from JSON file
        comments_file_path = self._get_file_path('json', 'comments')
        if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0:
            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
            return

        async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
            content = await f.read()

        # A file holding only whitespace is "empty" too; without strip()
        # it would reach json.loads and be reported as an error instead.
        if not content.strip():
            utils.logger.info("[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
            return

        comments_data = json.loads(content)
        if not isinstance(comments_data, list):
            # A single top-level object is treated as a one-item list.
            comments_data = [comments_data]

        # Keep only the comment text, normalised to a 'content' key.
        # Field names differ between platforms, so try each known one.
        filtered_data = []
        for comment in comments_data:
            if not isinstance(comment, dict):
                continue
            content_text = comment.get('content') or comment.get('comment_text') or comment.get('text') or ''
            # Skip blank (whitespace-only) and non-string values: they carry
            # no words and would only inflate the logged comment count.
            if isinstance(content_text, str) and content_text.strip():
                filtered_data.append({'content': content_text})

        if not filtered_data:
            utils.logger.info("[AsyncFileWriter.generate_wordcloud_from_comments] No valid comment content found")
            return

        # Generate wordcloud
        words_base_path = f"data/{self.platform}/words"
        pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
        words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"

        utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Generating wordcloud from {len(filtered_data)} comments")
        await self.wordcloud_generator.generate_word_frequency_and_cloud(filtered_data, words_file_prefix)
        utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Wordcloud generated successfully at {words_file_prefix}")

    except Exception as e:
        # Deliberately broad: a word-cloud failure must not abort the run.
        utils.logger.error(f"[AsyncFileWriter.generate_wordcloud_from_comments] Error generating wordcloud: {e}")

tools/utils.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,10 @@ def init_loging_config():
2626
)
2727
_logger = logging.getLogger("MediaCrawler")
2828
_logger.setLevel(level)
29+
30+
# 关闭 httpx 的 INFO 日志
31+
logging.getLogger("httpx").setLevel(logging.WARNING)
32+
2933
return _logger
3034

3135

0 commit comments

Comments
 (0)