

import asyncio
-import sys
-import signal
-from typing import Optional
+from typing import Optional, Type

import cmd_arg
import config


class CrawlerFactory:
-    CRAWLERS = {
+    CRAWLERS: dict[str, Type[AbstractCrawler]] = {
        "xhs": XiaoHongShuCrawler,
        "dy": DouYinCrawler,
        "ks": KuaishouCrawler,
@@ -53,115 +51,96 @@ class CrawlerFactory:
    def create_crawler(platform: str) -> AbstractCrawler:
        crawler_class = CrawlerFactory.CRAWLERS.get(platform)
        if not crawler_class:
-            raise ValueError(
-                "Invalid Media Platform Currently only supported xhs or dy or ks or bili ..."
-            )
+            supported = ", ".join(sorted(CrawlerFactory.CRAWLERS))
+            raise ValueError(f"Invalid media platform: {platform!r}. Supported: {supported}")
        return crawler_class()


crawler: Optional[AbstractCrawler] = None


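A quick illustration of the new factory error path (a sketch, not part of the patch; the full platform list lives in the collapsed portion of the CRAWLERS dict, so the message below is abbreviated):

    >>> CrawlerFactory.create_crawler("foo")
    Traceback (most recent call last):
        ...
    ValueError: Invalid media platform: 'foo'. Supported: bili, dy, ks, xhs, ...
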
-
-# Reason: add an --init_db option for database initialization.
-# Side effects: none
-# Rollback strategy: revert this file.
-async def main():
-    # Init crawler
+def _flush_excel_if_needed() -> None:
+    if config.SAVE_DATA_OPTION != "excel":
+        return
+
+    try:
+        from store.excel_store_base import ExcelStoreBase
+
+        ExcelStoreBase.flush_all()
+        print("[Main] Excel files saved successfully")
+    except Exception as e:
+        print(f"[Main] Error flushing Excel data: {e}")
+
+
+async def _generate_wordcloud_if_needed() -> None:
+    if config.SAVE_DATA_OPTION != "json" or not config.ENABLE_GET_WORDCLOUD:
+        return
+
+    try:
+        file_writer = AsyncFileWriter(
+            platform=config.PLATFORM,
+            crawler_type=crawler_type_var.get(),
+        )
+        await file_writer.generate_wordcloud_from_comments()
+    except Exception as e:
+        print(f"[Main] Error generating wordcloud: {e}")
+
+
+async def main() -> None:
    global crawler

-    # parse cmd
    args = await cmd_arg.parse_cmd()
-
-    # init db
    if args.init_db:
        await db.init_db(args.init_db)
        print(f"Database {args.init_db} initialized successfully.")
-        return  # Exit the main function cleanly
-
-
+        return

    crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
    await crawler.start()

-    # Flush Excel data if using Excel export
-    if config.SAVE_DATA_OPTION == "excel":
-        try:
-            from store.excel_store_base import ExcelStoreBase
-            ExcelStoreBase.flush_all()
-            print("[Main] Excel files saved successfully")
-        except Exception as e:
-            print(f"[Main] Error flushing Excel data: {e}")
+    _flush_excel_if_needed()

    # Generate wordcloud after crawling is complete
    # Only for JSON save mode
-    if config.SAVE_DATA_OPTION == "json" and config.ENABLE_GET_WORDCLOUD:
-        try:
-            file_writer = AsyncFileWriter(
-                platform=config.PLATFORM,
-                crawler_type=crawler_type_var.get()
-            )
-            await file_writer.generate_wordcloud_from_comments()
-        except Exception as e:
-            print(f"Error generating wordcloud: {e}")
+    await _generate_wordcloud_if_needed()


-async def async_cleanup():
-    """Async cleanup for the CDP browser and other async resources"""
+async def async_cleanup() -> None:
    global crawler
    if crawler:
-        # Check for and clean up the CDP browser
-        if hasattr(crawler, 'cdp_manager') and crawler.cdp_manager:
+        if getattr(crawler, "cdp_manager", None):
            try:
-                await crawler.cdp_manager.cleanup(force=True)  # Force-clean the browser process
+                await crawler.cdp_manager.cleanup(force=True)
            except Exception as e:
-                # Only print unexpected errors
                error_msg = str(e).lower()
                if "closed" not in error_msg and "disconnected" not in error_msg:
                    print(f"[Main] Error cleaning up CDP browser: {e}")

-        # Check and close the standard browser context (non-CDP mode only)
-        elif hasattr(crawler, 'browser_context') and crawler.browser_context:
+        elif getattr(crawler, "browser_context", None):
            try:
-                # Check whether the context is still open
-                if hasattr(crawler.browser_context, 'pages'):
-                    await crawler.browser_context.close()
+                await crawler.browser_context.close()
            except Exception as e:
-                # Only print unexpected errors
                error_msg = str(e).lower()
                if "closed" not in error_msg and "disconnected" not in error_msg:
                    print(f"[Main] Error closing browser context: {e}")

-    # Close the database connection
-    if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
+    if config.SAVE_DATA_OPTION in ("db", "sqlite"):
        await db.close()

-def cleanup():
-    """Synchronous cleanup entry point"""
-    try:
-        # Create a new event loop to run the async cleanup
-        loop = asyncio.new_event_loop()
-        asyncio.set_event_loop(loop)
-        loop.run_until_complete(async_cleanup())
-        loop.close()
-    except Exception as e:
-        print(f"[Main] Error during cleanup: {e}")
-
-
-def signal_handler(signum, _frame):
-    """Signal handler for Ctrl+C and other interrupt signals"""
-    print(f"\n[Main] Received interrupt signal {signum}, cleaning up resources...")
-    cleanup()
-    sys.exit(0)
-
if __name__ == "__main__":
-    # Register signal handlers
-    signal.signal(signal.SIGINT, signal_handler)  # Ctrl+C
-    signal.signal(signal.SIGTERM, signal_handler)  # Termination signal
+    from tools.app_runner import run
+
+    def _force_stop() -> None:
+        c = crawler
+        if not c:
+            return
+        cdp_manager = getattr(c, "cdp_manager", None)
+        launcher = getattr(cdp_manager, "launcher", None)
+        if not launcher:
+            return
+        try:
+            launcher.cleanup()
+        except Exception:
+            pass

-    try:
-        asyncio.get_event_loop().run_until_complete(main())
-    except KeyboardInterrupt:
-        print("\n[Main] Received keyboard interrupt, cleaning up resources...")
-    finally:
-        cleanup()
+    run(main, async_cleanup, cleanup_timeout_seconds=15.0, on_first_interrupt=_force_stop)
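The inline signal handlers and the synchronous cleanup()/signal_handler pair are removed in favor of tools.app_runner.run, which is not shown in this diff. Below is a minimal sketch of a runner compatible with the call above, assuming it drives main() with asyncio.run, always awaits the async cleanup with a timeout, and invokes the on_first_interrupt hook once on the first SIGINT/SIGTERM; the parameter names and internals are illustrative and the real module may differ.

import asyncio
import signal
from typing import Awaitable, Callable, Optional


def run(
    main_factory: Callable[[], Awaitable[None]],
    cleanup_factory: Callable[[], Awaitable[None]],
    cleanup_timeout_seconds: float = 15.0,
    on_first_interrupt: Optional[Callable[[], None]] = None,
) -> None:
    """Run the crawler coroutine, then always run async cleanup (bounded by a timeout)."""

    async def _runner() -> None:
        loop = asyncio.get_running_loop()
        main_task = asyncio.ensure_future(main_factory())
        interrupted = False

        def _handle_signal() -> None:
            nonlocal interrupted
            if not interrupted:
                interrupted = True
                if on_first_interrupt:
                    on_first_interrupt()  # e.g. force-stop a spawned browser
            main_task.cancel()

        for sig in (signal.SIGINT, signal.SIGTERM):
            try:
                loop.add_signal_handler(sig, _handle_signal)
            except NotImplementedError:  # add_signal_handler is unavailable on Windows
                pass

        try:
            await main_task
        except asyncio.CancelledError:
            print("\n[Main] Interrupted, cleaning up resources...")
        finally:
            try:
                await asyncio.wait_for(cleanup_factory(), timeout=cleanup_timeout_seconds)
            except asyncio.TimeoutError:
                print("[Main] Cleanup timed out")

    asyncio.run(_runner())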