55import pathlib
66from typing import Dict , List
77import aiofiles
8+ import config
89from tools .utils import utils
10+ from tools .words import AsyncWordCloudGenerator
911
1012class AsyncFileWriter :
def __init__(self, platform: str, crawler_type: str):
    """
    Set up a writer for one platform / crawler type.

    Creates the asyncio lock held on ``self.lock`` and remembers the
    platform and crawler type. A word-cloud generator is instantiated
    only when ``config.ENABLE_GET_WORDCLOUD`` is truthy; otherwise
    ``self.wordcloud_generator`` is ``None``.
    """
    self.lock = asyncio.Lock()
    self.platform = platform
    self.crawler_type = crawler_type

    # Word-cloud support is optional and driven purely by configuration.
    if config.ENABLE_GET_WORDCLOUD:
        self.wordcloud_generator = AsyncWordCloudGenerator()
    else:
        self.wordcloud_generator = None
1518
1619 def _get_file_path (self , file_type : str , item_type : str ) -> str :
1720 base_path = f"data/{ self .platform } /{ file_type } "
@@ -47,4 +50,58 @@ async def write_single_item_to_json(self, item: Dict, item_type: str):
4750 existing_data .append (item )
4851
4952 async with aiofiles .open (file_path , 'w' , encoding = 'utf-8' ) as f :
50- await f .write (json .dumps (existing_data , ensure_ascii = False , indent = 4 ))
53+ await f .write (json .dumps (existing_data , ensure_ascii = False , indent = 4 ))
54+
async def generate_wordcloud_from_comments(self):
    """
    Generate a word cloud from the comments JSON file.

    Does nothing unless both ``config.ENABLE_GET_WORDCLOUD`` and
    ``config.ENABLE_GET_COMMENTS`` are truthy and
    ``self.wordcloud_generator`` is set. The comments file located via
    ``self._get_file_path('json', 'comments')`` is read and parsed; for
    every dict entry the first non-empty value among the ``content``,
    ``comment_text`` and ``text`` keys is kept, and the resulting list is
    handed to the generator together with an output prefix under
    ``data/<platform>/words``.

    This is a best-effort step: a missing, empty or whitespace-only file
    is logged at info level, and any exception is logged at error level
    and not re-raised.
    """
    if not config.ENABLE_GET_WORDCLOUD or not config.ENABLE_GET_COMMENTS:
        return

    if not self.wordcloud_generator:
        return

    # Different platforms store the comment text under different keys;
    # the first key with a truthy value wins.
    content_keys = ('content', 'comment_text', 'text')

    try:
        # Read comments from JSON file
        comments_file_path = self._get_file_path('json', 'comments')
        if not os.path.exists(comments_file_path) or os.path.getsize(comments_file_path) == 0:
            utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] No comments file found at {comments_file_path}")
            return

        async with aiofiles.open(comments_file_path, 'r', encoding='utf-8') as f:
            content = await f.read()

        # A file holding only whitespace/newlines has a non-zero size but is
        # not valid JSON; treat it as empty instead of letting json.loads
        # raise and reporting it as an error.
        if not content.strip():
            utils.logger.info("[AsyncFileWriter.generate_wordcloud_from_comments] Comments file is empty")
            return

        comments_data = json.loads(content)
        if not isinstance(comments_data, list):
            comments_data = [comments_data]

        # Keep only the comment text, normalised under the 'content' key.
        filtered_data = []
        for comment in comments_data:
            if not isinstance(comment, dict):
                continue
            content_text = next((comment[key] for key in content_keys if comment.get(key)), '')
            if content_text:
                filtered_data.append({'content': content_text})

        if not filtered_data:
            utils.logger.info("[AsyncFileWriter.generate_wordcloud_from_comments] No valid comment content found")
            return

        # Generate wordcloud
        words_base_path = f"data/{self.platform}/words"
        pathlib.Path(words_base_path).mkdir(parents=True, exist_ok=True)
        words_file_prefix = f"{words_base_path}/{self.crawler_type}_comments_{utils.get_current_date()}"

        utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Generating wordcloud from {len(filtered_data)} comments")
        await self.wordcloud_generator.generate_word_frequency_and_cloud(filtered_data, words_file_prefix)
        utils.logger.info(f"[AsyncFileWriter.generate_wordcloud_from_comments] Wordcloud generated successfully at {words_file_prefix}")

    except Exception as e:
        # Deliberately broad: word-cloud generation must never break the caller.
        utils.logger.error(f"[AsyncFileWriter.generate_wordcloud_from_comments] Error generating wordcloud: {e}")
0 commit comments