
Commit 4b63ea6

Merge pull request #538 from 2513502304/main
feat: bilibli support date range filter
2 parents 30d0e73 + 2d93ec5

File tree

4 files changed: +123 -45 lines changed

config/base_config.py
media_platform/bilibili/client.py
media_platform/bilibili/core.py
requirements.txt

config/base_config.py

Lines changed: 12 additions & 2 deletions
@@ -57,6 +57,17 @@
 # Limit on the number of videos/posts to crawl
 CRAWLER_MAX_NOTES_COUNT = 200

+# Crawl start date; only supported by the bilibili keyword search, YYYY-MM-DD format. If None, no date range is set and the default cap of at most 1000 videos per keyword applies.
+START_DAY = '2024-01-01'
+
+# Crawl end date; only supported by the bilibili keyword search, YYYY-MM-DD format. If None, no date range is set and the default cap of at most 1000 videos per keyword applies.
+END_DAY = '2024-01-01'
+
+# Whether to enable crawling day by day; only supported by the bilibili keyword search
+# If False, the values of START_DAY and END_DAY are ignored
+# If True, results are filtered for each day from START_DAY to END_DAY, which bypasses the 1000-video limit and crawls as many of the keyword's videos as possible
+ALL_DAY = True
+
 # Limit on the number of concurrent crawlers
 MAX_CONCURRENCY_NUM = 1

@@ -69,7 +80,6 @@
 # Limit on the number of first-level comments to crawl (per video/post)
 CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = 10

-
 # Whether to enable crawling second-level comments (disabled by default)
 # If an older version of the project used the db, add the table column per schema/tables.sql line 287
 ENABLE_GET_SUB_COMMENTS = False

@@ -87,7 +97,6 @@
     # ........................
 ]

-
 # List of Douyin IDs to crawl
 DY_SPECIFIED_ID_LIST = [
     "7280854932641664319",

@@ -126,6 +135,7 @@
     # "盗墓笔记"
 ]

+# List of Tieba creator URLs to crawl
 TIEBA_CREATOR_URL_LIST = [
     "https://tieba.baidu.com/home/main/?id=tb.1.7f139e2e.6CyEwxu3VJruH_-QqpCi6g&fr=frs",
     # ........................
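
As a quick illustration (hypothetical values, not part of the commit), a run that sweeps the first week of 2024 one day at a time for every keyword would use:

    START_DAY = '2024-01-01'  # first day included in the search window
    END_DAY = '2024-01-07'    # last day included in the search window
    ALL_DAY = True            # iterate day by day to bypass the 1000-result cap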

media_platform/bilibili/client.py

Lines changed: 2 additions & 2 deletions
@@ -147,8 +147,8 @@ async def search_video_by_keyword(self, keyword: str, page: int = 1, page_size:
             "page": page,
             "page_size": page_size,
             "order": order.value,
-            "pubtime_begin": pubtime_begin_s,
-            "pubtime_end": pubtime_end_s
+            "pubtime_begin_s": pubtime_begin_s,
+            "pubtime_end_s": pubtime_end_s
         }
         return await self.get(uri, post_data)
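
The rename suggests the bilibili search endpoint only recognizes the keys pubtime_begin_s and pubtime_end_s, so with the old names the date filter would have been silently ignored. A minimal usage sketch of the corrected method, assuming an initialized client; the keyword and epoch values are illustrative, and (as in the core.py docstring below) the timestamps assume UTC+8:

    videos_res = await self.bili_client.search_video_by_keyword(
        keyword="编程",                 # hypothetical keyword
        page=1,
        page_size=20,
        order=SearchOrderType.DEFAULT,
        pubtime_begin_s="1704384000",  # 2024-01-05 00:00:00 (UTC+8)
        pubtime_end_s="1704470399",    # 2024-01-05 23:59:59 (UTC+8)
    )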

media_platform/bilibili/core.py

Lines changed: 107 additions & 40 deletions
@@ -19,9 +19,10 @@
 import random
 from asyncio import Task
 from typing import Dict, List, Optional, Tuple, Union
+from datetime import datetime, timedelta
+import pandas as pd

-from playwright.async_api import (BrowserContext, BrowserType, Page,
-                                  async_playwright)
+from playwright.async_api import (BrowserContext, BrowserType, Page, async_playwright)

 import config
 from base.base_crawler import AbstractCrawler
@@ -95,56 +96,122 @@ async def start(self):
         utils.logger.info(
             "[BilibiliCrawler.start] Bilibili Crawler finished ...")

+    async def get_pubtime_datetime(self, start: str = config.START_DAY, end: str = config.END_DAY) -> tuple[str, str]:
+        """
+        Get the publish-date start timestamp pubtime_begin_s and publish-date end timestamp pubtime_end_s for bilibili works
+        ---
+        :param start: start of the publish-date range, YYYY-MM-DD
+        :param end: end of the publish-date range, YYYY-MM-DD
+
+        Note
+        ---
+        - The search range runs from start to end and includes both start and end
+        - When searching a single day, pubtime_end_s should be pubtime_begin_s plus one day minus one second, i.e. the last second of the start day, so that the start day's content is included
+          e.g. searching only 2024-01-05 gives pubtime_begin_s = 1704384000 and pubtime_end_s = 1704470399,
+          or as readable datetime objects: pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime.datetime(2024, 1, 5, 23, 59, 59)
+        - When searching from start to end, pubtime_end_s should be the end day plus one day minus one second, i.e. the last second of the end day, so that the end day's content is included
+          e.g. searching 2024-01-05 - 2024-01-06 gives pubtime_begin_s = 1704384000 and pubtime_end_s = 1704556799,
+          or as readable datetime objects: pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59)
+        """
+        # convert start and end to datetime objects
+        start_day: datetime = datetime.strptime(start, '%Y-%m-%d')
+        end_day: datetime = datetime.strptime(end, '%Y-%m-%d')
+        if start_day > end_day:
+            raise ValueError('Wrong time range: start must not be later than end')
+        elif start_day == end_day:  # searching a single day
+            end_day = start_day + timedelta(days=1) - timedelta(seconds=1)  # set end_day to start_day + 1 day - 1 second
+        else:  # searching from start to end
+            end_day = end_day + timedelta(days=1) - timedelta(seconds=1)  # set end_day to end_day + 1 day - 1 second
+        # convert the datetimes back to timestamps
+        return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
+
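
One caveat worth noting: datetime.timestamp() interprets a naive datetime in the machine's local timezone, so the sample values in the docstring above hold when the crawler runs in China Standard Time (UTC+8). A standalone sketch of the same arithmetic:

    from datetime import datetime, timedelta

    start_day = datetime.strptime('2024-01-05', '%Y-%m-%d')
    end_day = start_day + timedelta(days=1) - timedelta(seconds=1)  # last second of 2024-01-05
    # on a UTC+8 machine this prints: 1704384000 1704470399
    print(int(start_day.timestamp()), int(end_day.timestamp()))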
     async def search(self):
         """
         search bilibili video with keywords
         :return:
         """
-        utils.logger.info(
-            "[BilibiliCrawler.search] Begin search bilibli keywords")
+        utils.logger.info("[BilibiliCrawler.search] Begin search bilibli keywords")
         bili_limit_count = 20  # bilibili limit page fixed value
         if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count
         start_page = config.START_PAGE  # start page number
         for keyword in config.KEYWORDS.split(","):
             source_keyword_var.set(keyword)
-            utils.logger.info(
-                f"[BilibiliCrawler.search] Current search keyword: {keyword}")
-            page = 1
-            while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
-                if page < start_page:
-                    utils.logger.info(
-                        f"[BilibiliCrawler.search] Skip page: {page}")
+            utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}")
+            # each keyword returns at most 1000 results
+            if not config.ALL_DAY:
+                page = 1
+                while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                    if page < start_page:
+                        utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
+                        page += 1
+                        continue
+
+                    utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, page: {page}")
+                    video_id_list: List[str] = []
+                    videos_res = await self.bili_client.search_video_by_keyword(
+                        keyword=keyword,
+                        page=page,
+                        page_size=bili_limit_count,
+                        order=SearchOrderType.DEFAULT,
+                        pubtime_begin_s=0,  # publish-date start timestamp
+                        pubtime_end_s=0  # publish-date end timestamp
+                    )
+                    video_list: List[Dict] = videos_res.get("result")
+
+                    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                    task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
+                    video_items = await asyncio.gather(*task_list)
+                    for video_item in video_items:
+                        if video_item:
+                            video_id_list.append(video_item.get("View").get("aid"))
+                            await bilibili_store.update_bilibili_video(video_item)
+                            await bilibili_store.update_up_info(video_item)
+                            await self.get_bilibili_video(video_item, semaphore)
                     page += 1
-                    continue
-
-                utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, page: {page}")
-                video_id_list: List[str] = []
-                videos_res = await self.bili_client.search_video_by_keyword(
-                    keyword=keyword,
-                    page=page,
-                    page_size=bili_limit_count,
-                    order=SearchOrderType.DEFAULT,
-                    pubtime_begin_s=0,  # publish-date start timestamp
-                    pubtime_end_s=0  # publish-date end timestamp
-                )
-                video_list: List[Dict] = videos_res.get("result")
-
-                semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
-                task_list = [
-                    self.get_video_info_task(aid=video_item.get(
-                        "aid"), bvid="", semaphore=semaphore)
-                    for video_item in video_list
-                ]
-                video_items = await asyncio.gather(*task_list)
-                for video_item in video_items:
-                    if video_item:
-                        video_id_list.append(video_item.get("View").get("aid"))
-                        await bilibili_store.update_bilibili_video(video_item)
-                        await bilibili_store.update_up_info(video_item)
-                        await self.get_bilibili_video(video_item, semaphore)
-                page += 1
-            await self.batch_get_video_comments(video_id_list)
+                await self.batch_get_video_comments(video_id_list)
+            # filter day by day from START_DAY to END_DAY; this bypasses the 1000-video limit and crawls as many of the keyword's videos as possible
+            else:
+                for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
+                    # timestamp arguments for crawling one day at a time
+                    pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
+                    page = 1
+                    while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
+                        # ! Catch any error; if the response returns nothing, go to the next day
+                        try:
+                            # ! Don't skip any page, to make sure all videos in one day are gathered
+                            # if page < start_page:
+                            #     utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
+                            #     page += 1
+                            #     continue
+
+                            utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
+                            video_id_list: List[str] = []
+                            videos_res = await self.bili_client.search_video_by_keyword(
+                                keyword=keyword,
+                                page=page,
+                                page_size=bili_limit_count,
+                                order=SearchOrderType.DEFAULT,
+                                pubtime_begin_s=pubtime_begin_s,  # publish-date start timestamp
+                                pubtime_end_s=pubtime_end_s  # publish-date end timestamp
+                            )
+                            video_list: List[Dict] = videos_res.get("result")
+
+                            semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
+                            task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
+                            video_items = await asyncio.gather(*task_list)
+                            for video_item in video_items:
+                                if video_item:
+                                    video_id_list.append(video_item.get("View").get("aid"))
+                                    await bilibili_store.update_bilibili_video(video_item)
+                                    await bilibili_store.update_up_info(video_item)
+                                    await self.get_bilibili_video(video_item, semaphore)
+                            page += 1
+                            await self.batch_get_video_comments(video_id_list)
+                        # go to the next day
+                        except Exception as e:
+                            print(e)
+                            break

     async def batch_get_video_comments(self, video_id_list: List[str]):
         """

requirements.txt

Lines changed: 2 additions & 1 deletion
@@ -15,4 +15,5 @@ wordcloud==1.9.3
 matplotlib==3.9.0
 requests==2.32.3
 parsel==1.9.1
-pyexecjs==1.5.1
+pyexecjs==1.5.1
+pandas==2.2.3
