
Commit 2bce359

feat: support time delay for all platforms

1 parent eb799e1 commit 2bce359

8 files changed
+151 -48 lines changed
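Every diff below swaps a `random`-based delay for the single fixed interval `config.CRAWLER_MAX_SLEEP_SEC`. As a rough sketch of the knob being relied on (the field name is taken from the diffs; the file location and default shown here are assumptions, not part of this commit):

```python
# config/base_config.py (hypothetical location and default)
# Fixed delay, in seconds, inserted between crawler requests on all platforms.
CRAWLER_MAX_SLEEP_SEC = 2
```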

README.md

Lines changed: 0 additions & 11 deletions
```diff
@@ -317,14 +317,3 @@ Thordata is a global proxy IP solution provider, supporting large-scale collection of public web
 ## 6. Final Interpretation Rights
 The final right to interpret this project rests with the developers, who reserve the right to change or update this disclaimer at any time without notice.
 </div>
-
-
-## 🙏 Acknowledgements
-
-### JetBrains Open Source License Support
-
-Thanks to JetBrains for providing a free open source license for this project!
-
-<a href="https://www.jetbrains.com/?from=MediaCrawler">
-<img src="https://www.jetbrains.com/company/brand/img/jetbrains_logo.png" width="100" alt="JetBrains" />
-</a>
```

media_platform/bilibili/core.py

Lines changed: 26 additions & 8 deletions
```diff
@@ -15,7 +15,7 @@
 
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime, timedelta
@@ -208,6 +208,11 @@ async def search_by_keywords(self):
                     await bilibili_store.update_up_info(video_item)
                     await self.get_bilibili_video(video_item, semaphore)
                 page += 1
+
+                # Sleep after page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
             await self.batch_get_video_comments(video_id_list)
 
     async def search_by_keywords_in_time_range(self, daily_limit: bool):
@@ -284,6 +289,11 @@ async def search_by_keywords_in_time_range(self, daily_limit: bool):
                     await self.get_bilibili_video(video_item, semaphore)
 
                 page += 1
+
+                # Sleep after page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
             await self.batch_get_video_comments(video_id_list)
 
         except Exception as e:
@@ -318,10 +328,11 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
         async with semaphore:
             try:
                 utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
-                await asyncio.sleep(random.uniform(0.5, 1.5))
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[BilibiliCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching comments for video {video_id}")
                 await self.bili_client.get_video_all_comments(
                     video_id=video_id,
-                    crawl_interval=random.random(),
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                     is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
                     callback=bilibili_store.batch_update_bilibili_video_comments,
                     max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
@@ -347,7 +358,8 @@ async def get_creator_videos(self, creator_id: int):
            await self.get_specified_videos(video_bvids_list)
            if int(result["page"]["count"]) <= pn * ps:
                break
-           await asyncio.sleep(random.random())
+           await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+           utils.logger.info(f"[BilibiliCrawler.get_creator_videos] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {pn}")
            pn += 1
 
     async def get_specified_videos(self, bvids_list: List[str]):
@@ -381,6 +393,11 @@ async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore):
         async with semaphore:
             try:
                 result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
+
+                # Sleep after fetching video details
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[BilibiliCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {bvid or aid}")
+
                 return result
             except DataFetchError as ex:
                 utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
@@ -544,7 +561,8 @@ async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphore):
                 return
 
             content = await self.bili_client.get_video_media(video_url)
-            await asyncio.sleep(random.random())
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video {aid}")
            if content is None:
                 return
             extension_file_name = f"video.mp4"
@@ -600,7 +618,7 @@ async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore):
                 utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
                 await self.bili_client.get_creator_all_fans(
                     creator_info=creator_info,
-                    crawl_interval=random.random(),
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                     callback=bilibili_store.batch_update_bilibili_creator_fans,
                     max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
                 )
@@ -623,7 +641,7 @@ async def get_followings(self, creator_info: Dict, semaphore: asyncio.Semaphore):
                 utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
                 await self.bili_client.get_creator_all_followings(
                     creator_info=creator_info,
-                    crawl_interval=random.random(),
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                     callback=bilibili_store.batch_update_bilibili_creator_followings,
                     max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
                 )
@@ -646,7 +664,7 @@ async def get_dynamics(self, creator_info: Dict, semaphore: asyncio.Semaphore):
                 utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
                 await self.bili_client.get_creator_all_dynamics(
                     creator_info=creator_info,
-                    crawl_interval=random.random(),
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                     callback=bilibili_store.batch_update_bilibili_creator_dynamics,
                     max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
                 )
```
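The same two-line pattern, a fixed sleep followed by an info log, recurs after page fetches, detail fetches, and media downloads above. A minimal self-contained sketch of that pattern, with a stand-in `config` class since the project's real config module isn't shown here:

```python
import asyncio
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("crawler")


class config:  # stand-in for the project's config module
    CRAWLER_MAX_SLEEP_SEC = 2


async def pause_after(step: str) -> None:
    # Fixed delay plus an info log, mirroring the lines this commit adds
    # after each page/detail/media fetch.
    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
    logger.info("Sleeping for %s seconds after %s", config.CRAWLER_MAX_SLEEP_SEC, step)


async def demo() -> None:
    for page in range(1, 3):
        # ... fetch a page of results here ...
        await pause_after(f"page {page}")


asyncio.run(demo())
```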

media_platform/douyin/core.py

Lines changed: 14 additions & 2 deletions
```diff
@@ -147,6 +147,9 @@ async def search(self) -> None:
                     aweme_list.append(aweme_info.get("aweme_id", ""))
                     await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
                     await self.get_aweme_media(aweme_item=aweme_info)
+                # Sleep after each page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
             utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)
 
@@ -165,7 +168,11 @@ async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) ->
         """Get note detail"""
         async with semaphore:
             try:
-                return await self.dy_client.get_video_by_id(aweme_id)
+                result = await self.dy_client.get_video_by_id(aweme_id)
+                # Sleep after fetching aweme detail
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[DouYinCrawler.get_aweme_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching aweme {aweme_id}")
+                return result
             except DataFetchError as ex:
                 utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
                 return None
@@ -193,13 +200,18 @@ async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
         async with semaphore:
             try:
                 # Pass the keyword list on to the get_aweme_all_comments method
+                # Use fixed crawling interval
+                crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
                 await self.dy_client.get_aweme_all_comments(
                     aweme_id=aweme_id,
-                    crawl_interval=random.random(),
+                    crawl_interval=crawl_interval,
                     is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
                     callback=douyin_store.batch_update_dy_aweme_comments,
                     max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                 )
+                # Sleep after fetching comments
+                await asyncio.sleep(crawl_interval)
+                utils.logger.info(f"[DouYinCrawler.get_comments] Sleeping for {crawl_interval} seconds after fetching comments for aweme {aweme_id}")
                 utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
             except DataFetchError as e:
                 utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
```

media_platform/kuaishou/core.py

Lines changed: 17 additions & 2 deletions
```diff
@@ -11,7 +11,7 @@
 
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 import time
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
@@ -159,6 +159,11 @@ async def search(self):
 
             # batch fetch video comments
             page += 1
+
+            # Sleep after page navigation
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[KuaishouCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
         await self.batch_get_video_comments(video_id_list)
 
     async def get_specified_videos(self):
@@ -181,6 +186,11 @@ async def get_video_info_task(
         async with semaphore:
             try:
                 result = await self.ks_client.get_video_info(video_id)
+
+                # Sleep after fetching video details
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[KuaishouCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {video_id}")
+
                 utils.logger.info(
                     f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ..."
                 )
@@ -234,9 +244,14 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
                 utils.logger.info(
                     f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..."
                 )
+
+                # Sleep before fetching comments
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[KuaishouCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for video {video_id}")
+
                 await self.ks_client.get_video_all_comments(
                     photo_id=video_id,
-                    crawl_interval=random.random(),
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                     callback=kuaishou_store.batch_update_ks_video_comments,
                     max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                 )
```
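Note that in `get_video_info_task` the sleep runs while the semaphore is still held, so it caps overall request throughput rather than merely delaying one task. A small sketch of that interaction (the values here are illustrative, not the project's defaults):

```python
import asyncio

MAX_CONCURRENCY = 2  # illustrative; the project derives its own limit
SLEEP_SEC = 1.0      # plays the role of config.CRAWLER_MAX_SLEEP_SEC


async def get_video_info_task(video_id: str, semaphore: asyncio.Semaphore) -> str:
    async with semaphore:
        # ... fetch video info here ...
        await asyncio.sleep(SLEEP_SEC)  # sleeping with the slot held throttles the whole pool
        print(f"fetched {video_id}")
        return video_id


async def main() -> None:
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    await asyncio.gather(*(get_video_info_task(f"video_{i}", sem) for i in range(6)))


asyncio.run(main())
```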

media_platform/tieba/core.py

Lines changed: 22 additions & 2 deletions
```diff
@@ -11,7 +11,7 @@
 
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
 
@@ -141,6 +141,11 @@ async def search(self) -> None:
                 await self.get_specified_notes(
                     note_id_list=[note_detail.note_id for note_detail in notes_list]
                 )
+
+                # Sleep after page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[TieBaCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page}")
+
                 page += 1
         except Exception as ex:
             utils.logger.error(
@@ -178,6 +183,11 @@ async def get_specified_tieba_notes(self):
                 f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}"
             )
             await self.get_specified_notes([note.note_id for note in note_list])
+
+            # Sleep after processing notes
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[TieBaCrawler.get_specified_tieba_notes] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after processing notes from page {page_number}")
+
             page_number += tieba_limit_count
 
     async def get_specified_notes(
@@ -222,6 +232,11 @@ async def get_note_detail_async_task(
                 f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}"
             )
             note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)
+
+            # Sleep after fetching note details
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[TieBaCrawler.get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
+
             if not note_detail:
                 utils.logger.error(
                     f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}"
@@ -277,9 +292,14 @@ async def get_comments_async_task(
             utils.logger.info(
                 f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}"
             )
+
+            # Sleep before fetching comments
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[TieBaCrawler.get_comments_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_detail.note_id}")
+
             await self.tieba_client.get_note_all_comments(
                 note_detail=note_detail,
-                crawl_interval=random.random(),
+                crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                 callback=tieba_store.batch_update_tieba_note_comments,
                 max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
            )
```
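For a sense of the behavioral change across all of these files: the old `random.random()` waits averaged about half a second, while the new fixed wait always spends the full configured value, trading crawl speed for predictable pacing. A quick comparison, assuming a 2-second setting:

```python
import random

CRAWLER_MAX_SLEEP_SEC = 2  # assumed value for illustration

# Before: every wait was uniform in [0, 1) seconds, ~0.5 s on average.
old_waits = [random.random() for _ in range(1000)]

# After: every wait is exactly the configured maximum.
new_wait = CRAWLER_MAX_SLEEP_SEC

print(f"old mean wait ~{sum(old_waits) / len(old_waits):.2f}s, new wait {new_wait}s")
```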

media_platform/weibo/core.py

Lines changed: 19 additions & 3 deletions
```diff
@@ -15,7 +15,7 @@
 
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
 
@@ -160,6 +160,11 @@ async def search(self):
                     await self.get_note_images(mblog)
 
             page += 1
+
+            # Sleep after page navigation
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[WeiboCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
         await self.batch_get_notes_comments(note_id_list)
 
     async def get_specified_notes(self):
@@ -185,6 +190,11 @@ async def get_note_info_task(self, note_id: str, semaphore: asyncio.Semaphore) ->
         async with semaphore:
             try:
                 result = await self.wb_client.get_note_info_by_id(note_id)
+
+                # Sleep after fetching note details
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[WeiboCrawler.get_note_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
+
                 return result
             except DataFetchError as ex:
                 utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
@@ -221,9 +231,14 @@ async def get_note_comments(self, note_id: str, semaphore: asyncio.Semaphore):
         async with semaphore:
             try:
                 utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
+
+                # Sleep before fetching comments
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[WeiboCrawler.get_note_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_id}")
+
                 await self.wb_client.get_note_all_comments(
                     note_id=note_id,
-                    crawl_interval=random.randint(1, 3),  # Weibo rate-limits its API heavily, so use a longer delay
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,  # Use fixed interval instead of random
                     callback=weibo_store.batch_update_weibo_note_comments,
                     max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                 )
@@ -250,7 +265,8 @@ async def get_note_images(self, mblog: Dict):
             if not url:
                 continue
             content = await self.wb_client.get_note_image(url)
-            await asyncio.sleep(random.random())
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[WeiboCrawler.get_note_images] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching image")
             if content != None:
                 extension_file_name = url.split(".")[-1]
                 await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
```
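The replaced weibo line originally used `random.randint(1, 3)` precisely because Weibo rate-limits aggressively; a fixed interval is more predictable but also more uniform. One middle ground, which is not what this commit does, would keep jitter under the configured ceiling:

```python
import random


def jittered_interval(max_sec: float) -> float:
    # Alternative (not used by this commit): keep some randomness while
    # never exceeding the configured ceiling.
    return random.uniform(max_sec / 2, max_sec)


print(jittered_interval(3.0))
```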
