
Commit 2bce359

feat: support time delay for all platforms

1 parent eb799e1 commit 2bce359

8 files changed
+151 -48 lines changed
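Every diff below swaps a `random`-based delay for the single fixed interval `config.CRAWLER_MAX_SLEEP_SEC`. As a rough sketch of the knob being relied on (the field name is taken from the diffs; the file location and default shown here are assumptions, not part of this commit):

```python
# config/base_config.py (hypothetical location and default)
# Fixed delay, in seconds, inserted between crawler requests on all platforms.
CRAWLER_MAX_SLEEP_SEC = 2
```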

README.md

Lines changed: 0 additions & 11 deletions
```diff
@@ -317,14 +317,3 @@ Thordata is a global proxy IP solution provider, supporting large-scale collection of public web
 ## 6. Final Interpretation Rights
 The final right to interpret this project rests with the developers, who reserve the right to change or update this disclaimer at any time without notice.
 </div>
-
-
-## 🙏 Acknowledgements
-
-### JetBrains Open Source License Support
-
-Thanks to JetBrains for providing a free open source license for this project!
-
-<a href="https://www.jetbrains.com/?from=MediaCrawler">
-<img src="https://www.jetbrains.com/company/brand/img/jetbrains_logo.png" width="100" alt="JetBrains" />
-</a>
```

media_platform/bilibili/core.py

Lines changed: 26 additions & 8 deletions
```diff
@@ -15,7 +15,7 @@
 
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple, Union
 from datetime import datetime, timedelta
@@ -208,6 +208,11 @@ async def search_by_keywords(self):
                     await bilibili_store.update_up_info(video_item)
                     await self.get_bilibili_video(video_item, semaphore)
                 page += 1
+
+                # Sleep after page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[BilibiliCrawler.search_by_keywords] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
             await self.batch_get_video_comments(video_id_list)
 
     async def search_by_keywords_in_time_range(self, daily_limit: bool):
@@ -284,6 +289,11 @@ async def search_by_keywords_in_time_range(self, daily_limit: bool):
                     await self.get_bilibili_video(video_item, semaphore)
 
                 page += 1
+
+                # Sleep after page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[BilibiliCrawler.search_by_keywords_in_time_range] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
             await self.batch_get_video_comments(video_id_list)
 
         except Exception as e:
@@ -318,10 +328,11 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
         async with semaphore:
             try:
                 utils.logger.info(f"[BilibiliCrawler.get_comments] begin get video_id: {video_id} comments ...")
-                await asyncio.sleep(random.uniform(0.5, 1.5))
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[BilibiliCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching comments for video {video_id}")
                 await self.bili_client.get_video_all_comments(
                     video_id=video_id,
-                    crawl_interval=random.random(),
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                     is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
                     callback=bilibili_store.batch_update_bilibili_video_comments,
                     max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
@@ -347,7 +358,8 @@ async def get_creator_videos(self, creator_id: int):
            await self.get_specified_videos(video_bvids_list)
            if int(result["page"]["count"]) <= pn * ps:
                break
-           await asyncio.sleep(random.random())
+           await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+           utils.logger.info(f"[BilibiliCrawler.get_creator_videos] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {pn}")
            pn += 1
 
     async def get_specified_videos(self, bvids_list: List[str]):
@@ -381,6 +393,11 @@ async def get_video_info_task(self, aid: int, bvid: str, semaphore: asyncio.Semaphore):
         async with semaphore:
             try:
                 result = await self.bili_client.get_video_info(aid=aid, bvid=bvid)
+
+                # Sleep after fetching video details
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[BilibiliCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {bvid or aid}")
+
                 return result
             except DataFetchError as ex:
                 utils.logger.error(f"[BilibiliCrawler.get_video_info_task] Get video detail error: {ex}")
@@ -544,7 +561,8 @@ async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphore):
                 return
 
             content = await self.bili_client.get_video_media(video_url)
-            await asyncio.sleep(random.random())
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[BilibiliCrawler.get_bilibili_video] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video {aid}")
            if content is None:
                 return
             extension_file_name = f"video.mp4"
@@ -600,7 +618,7 @@ async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore):
                 utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
                 await self.bili_client.get_creator_all_fans(
                     creator_info=creator_info,
-                    crawl_interval=random.random(),
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                     callback=bilibili_store.batch_update_bilibili_creator_fans,
                     max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
                 )
@@ -623,7 +641,7 @@ async def get_followings(self, creator_info: Dict, semaphore: asyncio.Semaphore):
                 utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
                 await self.bili_client.get_creator_all_followings(
                     creator_info=creator_info,
-                    crawl_interval=random.random(),
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                     callback=bilibili_store.batch_update_bilibili_creator_followings,
                     max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
                 )
@@ -646,7 +664,7 @@ async def get_dynamics(self, creator_info: Dict, semaphore: asyncio.Semaphore):
                 utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
                 await self.bili_client.get_creator_all_dynamics(
                     creator_info=creator_info,
-                    crawl_interval=random.random(),
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                     callback=bilibili_store.batch_update_bilibili_creator_dynamics,
                     max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
                 )
```
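The same two-line pattern, a fixed sleep followed by an info log, recurs after page fetches, detail fetches, and media downloads above. A minimal self-contained sketch of that pattern, with a stand-in `config` class since the project's real config module isn't shown here:

```python
import asyncio
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("crawler")


class config:  # stand-in for the project's config module
    CRAWLER_MAX_SLEEP_SEC = 2


async def pause_after(step: str) -> None:
    # Fixed delay plus an info log, mirroring the lines this commit adds
    # after each page/detail/media fetch.
    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
    logger.info("Sleeping for %s seconds after %s", config.CRAWLER_MAX_SLEEP_SEC, step)


async def demo() -> None:
    for page in range(1, 3):
        # ... fetch a page of results here ...
        await pause_after(f"page {page}")


asyncio.run(demo())
```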

media_platform/douyin/core.py

Lines changed: 14 additions & 2 deletions
```diff
@@ -147,6 +147,9 @@ async def search(self) -> None:
                     aweme_list.append(aweme_info.get("aweme_id", ""))
                     await douyin_store.update_douyin_aweme(aweme_item=aweme_info)
                     await self.get_aweme_media(aweme_item=aweme_info)
+                # Sleep after each page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[DouYinCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
             utils.logger.info(f"[DouYinCrawler.search] keyword:{keyword}, aweme_list:{aweme_list}")
             await self.batch_get_note_comments(aweme_list)
 
@@ -165,7 +168,11 @@ async def get_aweme_detail(self, aweme_id: str, semaphore: asyncio.Semaphore) ->
         """Get note detail"""
         async with semaphore:
             try:
-                return await self.dy_client.get_video_by_id(aweme_id)
+                result = await self.dy_client.get_video_by_id(aweme_id)
+                # Sleep after fetching aweme detail
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[DouYinCrawler.get_aweme_detail] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching aweme {aweme_id}")
+                return result
             except DataFetchError as ex:
                 utils.logger.error(f"[DouYinCrawler.get_aweme_detail] Get aweme detail error: {ex}")
                 return None
@@ -193,13 +200,18 @@ async def get_comments(self, aweme_id: str, semaphore: asyncio.Semaphore) -> None:
         async with semaphore:
             try:
                 # Pass the keyword list on to the get_aweme_all_comments method
+                # Use fixed crawling interval
+                crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
                 await self.dy_client.get_aweme_all_comments(
                     aweme_id=aweme_id,
-                    crawl_interval=random.random(),
+                    crawl_interval=crawl_interval,
                     is_fetch_sub_comments=config.ENABLE_GET_SUB_COMMENTS,
                     callback=douyin_store.batch_update_dy_aweme_comments,
                     max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                 )
+                # Sleep after fetching comments
+                await asyncio.sleep(crawl_interval)
+                utils.logger.info(f"[DouYinCrawler.get_comments] Sleeping for {crawl_interval} seconds after fetching comments for aweme {aweme_id}")
                 utils.logger.info(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} comments have all been obtained and filtered ...")
             except DataFetchError as e:
                 utils.logger.error(f"[DouYinCrawler.get_comments] aweme_id: {aweme_id} get comments failed, error: {e}")
```

media_platform/kuaishou/core.py

Lines changed: 17 additions & 2 deletions
```diff
@@ -11,7 +11,7 @@
 
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 import time
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
@@ -159,6 +159,11 @@ async def search(self):
 
             # batch fetch video comments
             page += 1
+
+            # Sleep after page navigation
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[KuaishouCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
         await self.batch_get_video_comments(video_id_list)
 
     async def get_specified_videos(self):
@@ -181,6 +186,11 @@ async def get_video_info_task(
         async with semaphore:
             try:
                 result = await self.ks_client.get_video_info(video_id)
+
+                # Sleep after fetching video details
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[KuaishouCrawler.get_video_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching video details {video_id}")
+
                 utils.logger.info(
                     f"[KuaishouCrawler.get_video_info_task] Get video_id:{video_id} info result: {result} ..."
                 )
@@ -234,9 +244,14 @@ async def get_comments(self, video_id: str, semaphore: asyncio.Semaphore):
                 utils.logger.info(
                     f"[KuaishouCrawler.get_comments] begin get video_id: {video_id} comments ..."
                 )
+
+                # Sleep before fetching comments
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[KuaishouCrawler.get_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for video {video_id}")
+
                 await self.ks_client.get_video_all_comments(
                     photo_id=video_id,
-                    crawl_interval=random.random(),
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                     callback=kuaishou_store.batch_update_ks_video_comments,
                     max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                 )
```
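Note that in `get_video_info_task` the sleep runs while the semaphore is still held, so it caps overall request throughput rather than merely delaying one task. A small sketch of that interaction (the values here are illustrative, not the project's defaults):

```python
import asyncio

MAX_CONCURRENCY = 2  # illustrative; the project derives its own limit
SLEEP_SEC = 1.0      # plays the role of config.CRAWLER_MAX_SLEEP_SEC


async def get_video_info_task(video_id: str, semaphore: asyncio.Semaphore) -> str:
    async with semaphore:
        # ... fetch video info here ...
        await asyncio.sleep(SLEEP_SEC)  # sleeping with the slot held throttles the whole pool
        print(f"fetched {video_id}")
        return video_id


async def main() -> None:
    sem = asyncio.Semaphore(MAX_CONCURRENCY)
    await asyncio.gather(*(get_video_info_task(f"video_{i}", sem) for i in range(6)))


asyncio.run(main())
```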

media_platform/tieba/core.py

Lines changed: 22 additions & 2 deletions
```diff
@@ -11,7 +11,7 @@
 
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
 
@@ -141,6 +141,11 @@ async def search(self) -> None:
                 await self.get_specified_notes(
                     note_id_list=[note_detail.note_id for note_detail in notes_list]
                 )
+
+                # Sleep after page navigation
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[TieBaCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page}")
+
                 page += 1
         except Exception as ex:
             utils.logger.error(
@@ -178,6 +183,11 @@ async def get_specified_tieba_notes(self):
                 f"[BaiduTieBaCrawler.get_specified_tieba_notes] tieba name: {tieba_name} note list len: {len(note_list)}"
             )
             await self.get_specified_notes([note.note_id for note in note_list])
+
+            # Sleep after processing notes
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[TieBaCrawler.get_specified_tieba_notes] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after processing notes from page {page_number}")
+
             page_number += tieba_limit_count
 
     async def get_specified_notes(
@@ -222,6 +232,11 @@ async def get_note_detail_async_task(
                 f"[BaiduTieBaCrawler.get_note_detail] Begin get note detail, note_id: {note_id}"
             )
             note_detail: TiebaNote = await self.tieba_client.get_note_by_id(note_id)
+
+            # Sleep after fetching note details
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[TieBaCrawler.get_note_detail_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
+
             if not note_detail:
                 utils.logger.error(
                     f"[BaiduTieBaCrawler.get_note_detail] Get note detail error, note_id: {note_id}"
@@ -277,9 +292,14 @@ async def get_comments_async_task(
             utils.logger.info(
                 f"[BaiduTieBaCrawler.get_comments] Begin get note id comments {note_detail.note_id}"
             )
+
+            # Sleep before fetching comments
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[TieBaCrawler.get_comments_async_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_detail.note_id}")
+
             await self.tieba_client.get_note_all_comments(
                 note_detail=note_detail,
-                crawl_interval=random.random(),
+                crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,
                 callback=tieba_store.batch_update_tieba_note_comments,
                 max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
            )
```
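For a sense of the behavioral change across all of these files: the old `random.random()` waits averaged about half a second, while the new fixed wait always spends the full configured value, trading crawl speed for predictable pacing. A quick comparison, assuming a 2-second setting:

```python
import random

CRAWLER_MAX_SLEEP_SEC = 2  # assumed value for illustration

# Before: every wait was uniform in [0, 1) seconds, ~0.5 s on average.
old_waits = [random.random() for _ in range(1000)]

# After: every wait is exactly the configured maximum.
new_wait = CRAWLER_MAX_SLEEP_SEC

print(f"old mean wait ~{sum(old_waits) / len(old_waits):.2f}s, new wait {new_wait}s")
```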

media_platform/weibo/core.py

Lines changed: 19 additions & 3 deletions
```diff
@@ -15,7 +15,7 @@
 
 import asyncio
 import os
-import random
+# import random # Removed as we now use fixed config.CRAWLER_MAX_SLEEP_SEC intervals
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple
 
@@ -160,6 +160,11 @@ async def search(self):
                     await self.get_note_images(mblog)
 
             page += 1
+
+            # Sleep after page navigation
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[WeiboCrawler.search] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {page-1}")
+
         await self.batch_get_notes_comments(note_id_list)
 
     async def get_specified_notes(self):
@@ -185,6 +190,11 @@ async def get_note_info_task(self, note_id: str, semaphore: asyncio.Semaphore) ->
         async with semaphore:
             try:
                 result = await self.wb_client.get_note_info_by_id(note_id)
+
+                # Sleep after fetching note details
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[WeiboCrawler.get_note_info_task] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching note details {note_id}")
+
                 return result
             except DataFetchError as ex:
                 utils.logger.error(f"[WeiboCrawler.get_note_info_task] Get note detail error: {ex}")
@@ -221,9 +231,14 @@ async def get_note_comments(self, note_id: str, semaphore: asyncio.Semaphore):
         async with semaphore:
             try:
                 utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
+
+                # Sleep before fetching comments
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+                utils.logger.info(f"[WeiboCrawler.get_note_comments] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds before fetching comments for note {note_id}")
+
                 await self.wb_client.get_note_all_comments(
                     note_id=note_id,
-                    crawl_interval=random.randint(1, 3),  # Weibo rate-limits its API heavily, so use a longer delay
+                    crawl_interval=config.CRAWLER_MAX_SLEEP_SEC,  # Use fixed interval instead of random
                     callback=weibo_store.batch_update_weibo_note_comments,
                     max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
                 )
@@ -250,7 +265,8 @@ async def get_note_images(self, mblog: Dict):
             if not url:
                 continue
             content = await self.wb_client.get_note_image(url)
-            await asyncio.sleep(random.random())
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+            utils.logger.info(f"[WeiboCrawler.get_note_images] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after fetching image")
             if content != None:
                 extension_file_name = url.split(".")[-1]
                 await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)
```
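The replaced weibo line originally used `random.randint(1, 3)` precisely because Weibo rate-limits aggressively; a fixed interval is more predictable but also more uniform. One middle ground, which is not what this commit does, would keep jitter under the configured ceiling:

```python
import random


def jittered_interval(max_sec: float) -> float:
    # Alternative (not used by this commit): keep some randomness while
    # never exceeding the configured ceiling.
    return random.uniform(max_sec / 2, max_sec)


print(jittered_interval(3.0))
```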
