Commit 122978b

Merge pull request #652 from gaoxiaobei/dev
feat(bilibili): Add flexible search modes and fix limit logic
2 parents 2753e76 + 8105b05 commit 122978b

18 files changed: +387 −247 lines changed

config/__init__.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -10,4 +10,4 @@
 
 
 from .base_config import *
-from .db_config import *
+from .db_config import *
```

config/base_config.py

Lines changed: 1 addition & 132 deletions
```diff
@@ -10,28 +10,16 @@
 
 
 # Base configuration
-PLATFORM = "xhs"
+PLATFORM = "xhs"  # Platform: xhs | dy | ks | bili | wb | tieba | zhihu
 KEYWORDS = "编程副业,编程兼职"  # Keyword search configuration, comma-separated
 LOGIN_TYPE = "qrcode"  # qrcode or phone or cookie
 COOKIES = ""
-# For concrete values, see the enums under media_platform.xxx.field; currently only supported for XHS
-SORT_TYPE = "popularity_descending"
-# For concrete values, see the enums under media_platform.xxx.field; currently only supported for Douyin
-PUBLISH_TIME_TYPE = 0
 CRAWLER_TYPE = (
     "search"  # Crawl type: search (keyword search) | detail (post detail) | creator (creator profile data)
 )
-# Weibo search type: default (comprehensive) | real_time | popular | video
-WEIBO_SEARCH_TYPE = "popular"
-# Custom User Agent (currently only effective for XHS)
-UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
-
 # Whether to enable IP proxy
 ENABLE_IP_PROXY = False
 
-# Maximum crawl interval in seconds when no proxy is enabled (currently only effective for XHS)
-CRAWLER_MAX_SLEEP_SEC = 2
-
 # Proxy IP pool size
 IP_PROXY_POOL_COUNT = 2
 
@@ -102,101 +90,6 @@
 # If an older version of the project used db, add the table fields per schema/tables.sql line 287
 ENABLE_GET_SUB_COMMENTS = False
 
-# Deprecated ⚠️⚠️⚠️ List of XHS note IDs to crawl
-# Deprecated ⚠️⚠️⚠️ Crawling by note ID alone fails because the xsec_token and xsec_source parameters are missing
-# XHS_SPECIFIED_ID_LIST = [
-#     "66fad51c000000001b0224b8",
-#     # ........................
-# ]
-
-# List of XHS note URLs to crawl; they must currently carry the xsec_token and xsec_source parameters
-XHS_SPECIFIED_NOTE_URL_LIST = [
-    "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"
-    # ........................
-]
-
-# List of Douyin IDs to crawl
-DY_SPECIFIED_ID_LIST = [
-    "7280854932641664319",
-    "7202432992642387233",
-    # ........................
-]
-
-# List of Kuaishou IDs to crawl
-KS_SPECIFIED_ID_LIST = ["3xf8enb8dbj6uig", "3x6zz972bchmvqe"]
-
-# List of Bilibili video bvids to crawl
-BILI_SPECIFIED_ID_LIST = [
-    "BV1d54y1g7db",
-    "BV1Sz4y1U77N",
-    "BV14Q4y1n7jz",
-    # ........................
-]
-
-# List of Weibo posts to crawl
-WEIBO_SPECIFIED_ID_LIST = [
-    "4982041758140155",
-    # ........................
-]
-
-# List of Weibo creator IDs
-WEIBO_CREATOR_ID_LIST = [
-    "5533390220",
-    # ........................
-]
-
-# List of Tieba posts to crawl
-TIEBA_SPECIFIED_ID_LIST = []
-
-# List of Tieba names; posts under these tiebas are crawled
-TIEBA_NAME_LIST = [
-    # "盗墓笔记"
-]
-
-# List of Tieba creator URLs
-TIEBA_CREATOR_URL_LIST = [
-    "https://tieba.baidu.com/home/main/?id=tb.1.7f139e2e.6CyEwxu3VJruH_-QqpCi6g&fr=frs",
-    # ........................
-]
-
-# List of XHS creator IDs
-XHS_CREATOR_ID_LIST = [
-    "63e36c9a000000002703502b",
-    # ........................
-]
-
-# List of Douyin creator IDs (sec_id)
-DY_CREATOR_ID_LIST = [
-    "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
-    # ........................
-]
-
-# List of Bilibili creator IDs (sec_id)
-BILI_CREATOR_ID_LIST = [
-    "20813884",
-    # ........................
-]
-
-# List of Kuaishou creator IDs
-KS_CREATOR_ID_LIST = [
-    "3x4sm73aye7jq7i",
-    # ........................
-]
-
-
-# List of Zhihu creator profile URLs
-ZHIHU_CREATOR_URL_LIST = [
-    "https://www.zhihu.com/people/yd1234567",
-    # ........................
-]
-
-# List of Zhihu post IDs to crawl
-ZHIHU_SPECIFIED_ID_LIST = [
-    "https://www.zhihu.com/question/826896610/answer/4885821440",  # answer
-    "https://zhuanlan.zhihu.com/p/673461588",  # article
-    "https://www.zhihu.com/zvideo/1539542068422144000",  # video
-]
-
 # Word cloud settings
 # Whether to generate a word cloud image from comments
 ENABLE_GET_WORDCLOUD = False
@@ -212,27 +105,3 @@
 
 # Path to the Chinese font file
 FONT_PATH = "./docs/STZHONGS.TTF"
-
-# Start date for crawling; only supported for Bilibili keyword search, YYYY-MM-DD format. If None, no time range is applied and a keyword search returns at most 1000 videos by default
-START_DAY = "2024-01-01"
-
-# End date for crawling; only supported for Bilibili keyword search, YYYY-MM-DD format. If None, no time range is applied and a keyword search returns at most 1000 videos by default
-END_DAY = "2024-01-01"
-
-# Whether to crawl day by day; only supported for Bilibili keyword search
-# If False, the values of START_DAY and END_DAY are ignored
-# If True, filter day by day from START_DAY to END_DAY; this bypasses the 1000-video limit and crawls as many videos under the keyword as possible
-ALL_DAY = False
-
-#!!! The settings below only apply to Bilibili creator search
-# Whether to crawl the creator's homepage comments, or the creator's dynamics and relation lists (True means the former)
-CREATOR_MODE = True
-
-# Starting page when crawling a creator's fan list
-START_CONTACTS_PAGE = 1
-
-# Limit on the number of fans and followings crawled (per creator)
-CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100
-
-# Limit on the number of dynamics crawled (per creator)
-CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50
```
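The day-range settings removed here (START_DAY, END_DAY, ALL_DAY) move into the new per-platform config below. The technique their comments describe deserves a sketch: a single Bilibili keyword search returns at most about 1000 videos, so crawling one day-sized window at a time sidesteps the cap. A minimal sketch of that windowing, with the project-specific per-day search call left out:

```python
# Hedged sketch of the day-by-day windowing that ALL_DAY enables; the actual
# per-day search request is project-specific and omitted here.
from datetime import date, timedelta
from typing import Iterator


def day_windows(start: str, end: str) -> Iterator[str]:
    """Yield every date from start to end inclusive (YYYY-MM-DD strings)."""
    cur, last = date.fromisoformat(start), date.fromisoformat(end)
    while cur <= last:
        yield cur.isoformat()
        cur += timedelta(days=1)


if __name__ == "__main__":
    for day in day_windows("2024-01-01", "2024-01-03"):
        print(day)  # each day would get its own search, each with its own result cap
```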

main.py

Lines changed: 16 additions & 11 deletions
```diff
@@ -45,25 +45,30 @@ def create_crawler(platform: str) -> AbstractCrawler:
         return crawler_class()
 
 async def main():
+    # Init crawler
+    crawler: Optional[AbstractCrawler] = None
+    try:
+        # parse cmd
+        await cmd_arg.parse_cmd()
 
-    # parse cmd
-    await cmd_arg.parse_cmd()
-
-    # init db
-    if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
-        await db.init_db()
+        # init db
+        if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
+            await db.init_db()
 
-    crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
-    await crawler.start()
+        crawler = CrawlerFactory.create_crawler(platform=config.PLATFORM)
+        await crawler.start()
 
-    if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
-        await db.close()
+    finally:
+        if crawler:
+            await crawler.close()
 
-
+        if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
+            await db.close()
 
 if __name__ == '__main__':
     try:
         # asyncio.run(main())
         asyncio.get_event_loop().run_until_complete(main())
     except KeyboardInterrupt:
+        print("\n[main] Caught keyboard interrupt, exiting.")
         sys.exit()
```
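The restructured main() is a standard async resource-lifecycle pattern: the work goes inside try, and the finally block guarantees crawler.close() and db.close() run whether the crawl succeeds, raises, or is interrupted. A self-contained sketch of the same pattern (Resource and run are illustrative stand-ins, not the project's classes):

```python
import asyncio
from typing import Optional


class Resource:
    """Illustrative stand-in for the crawler created in main()."""

    async def start(self) -> None:
        print("started")
        raise RuntimeError("simulated crash mid-run")

    async def close(self) -> None:
        print("closed")  # reached even though start() raised


async def run() -> None:
    resource: Optional[Resource] = None
    try:
        resource = Resource()
        await resource.start()
    finally:
        # Runs on success, on exceptions, and on cancellation alike.
        if resource:
            await resource.close()


if __name__ == "__main__":
    try:
        asyncio.run(run())
    except RuntimeError as exc:
        print(f"run failed: {exc}")
```

The `if resource:` guard matters because the factory call itself can raise, leaving the variable as None when the finally block runs.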

media_platform/bilibili/client.py

Lines changed: 50 additions & 7 deletions
```diff
@@ -15,6 +15,7 @@
 # @Desc : bilibili request client
 import asyncio
 import json
+import random
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 from urllib.parse import urlencode
 
@@ -53,7 +54,11 @@ async def request(self, method, url, **kwargs) -> Any:
             method, url, timeout=self.timeout,
             **kwargs
         )
-        data: Dict = response.json()
+        try:
+            data: Dict = response.json()
+        except json.JSONDecodeError:
+            utils.logger.error(f"[BilibiliClient.request] Failed to decode JSON from response. status_code: {response.status_code}, response_text: {response.text}")
+            raise DataFetchError(f"Failed to decode JSON, content: {response.text}")
         if data.get("code") != 0:
             raise DataFetchError(data.get("message", "unkonw error"))
         else:
@@ -78,8 +83,12 @@ async def get_wbi_keys(self) -> Tuple[str, str]:
         :return:
         """
         local_storage = await self.playwright_page.evaluate("() => window.localStorage")
-        wbi_img_urls = local_storage.get("wbi_img_urls", "") or local_storage.get(
-            "wbi_img_url") + "-" + local_storage.get("wbi_sub_url")
+        wbi_img_urls = local_storage.get("wbi_img_urls", "")
+        if not wbi_img_urls:
+            img_url_from_storage = local_storage.get("wbi_img_url")
+            sub_url_from_storage = local_storage.get("wbi_sub_url")
+            if img_url_from_storage and sub_url_from_storage:
+                wbi_img_urls = f"{img_url_from_storage}-{sub_url_from_storage}"
         if wbi_img_urls and "-" in wbi_img_urls:
             img_url, sub_url = wbi_img_urls.split("-")
         else:
@@ -235,16 +244,50 @@ async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.
 
         :return:
         """
-
         result = []
         is_end = False
         next_page = 0
+        max_retries = 3
         while not is_end and len(result) < max_count:
-            comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
+            comments_res = None
+            for attempt in range(max_retries):
+                try:
+                    comments_res = await self.get_video_comments(video_id, CommentOrderType.DEFAULT, next_page)
+                    break  # Success
+                except DataFetchError as e:
+                    if attempt < max_retries - 1:
+                        delay = 5 * (2 ** attempt) + random.uniform(0, 1)
+                        utils.logger.warning(
+                            f"[BilibiliClient.get_video_all_comments] Retrying video_id {video_id} in {delay:.2f}s... (Attempt {attempt + 1}/{max_retries})"
+                        )
+                        await asyncio.sleep(delay)
+                    else:
+                        utils.logger.error(
+                            f"[BilibiliClient.get_video_all_comments] Max retries reached for video_id: {video_id}. Skipping comments. Error: {e}"
+                        )
+                        is_end = True
+                        break
+            if not comments_res:
+                break
+
             cursor_info: Dict = comments_res.get("cursor")
+            if not cursor_info:
+                utils.logger.warning(f"[BilibiliClient.get_video_all_comments] Could not find 'cursor' in response for video_id: {video_id}. Skipping.")
+                break
+
             comment_list: List[Dict] = comments_res.get("replies", [])
-            is_end = cursor_info.get("is_end")
-            next_page = cursor_info.get("next")
+
+            # Check whether is_end and next are present
+            if "is_end" not in cursor_info or "next" not in cursor_info:
+                utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' or 'next' not in cursor for video_id: {video_id}. Assuming end of comments.")
+                is_end = True
+            else:
+                is_end = cursor_info.get("is_end")
+                next_page = cursor_info.get("next")
+
+            if not isinstance(is_end, bool):
+                utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' is not a boolean for video_id: {video_id}. Assuming end of comments.")
+                is_end = True
         if is_fetch_sub_comments:
            for comment in comment_list:
                 comment_id = comment['rpid']
```
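The new comment loop retries each failed page with exponential backoff plus jitter (roughly 5 s, 10 s, 20 s) before giving up on that video's comments. The same pattern boiled down to a standalone sketch, where fetch_page and FetchError are hypothetical stand-ins for get_video_comments and DataFetchError:

```python
import asyncio
import random
from typing import Optional


class FetchError(Exception):
    """Stand-in for the client's DataFetchError."""


async def fetch_page(page: int) -> dict:
    # Stand-in for a real API call that fails transiently about half the time.
    if random.random() < 0.5:
        raise FetchError(f"transient failure on page {page}")
    return {"page": page, "replies": []}


async def fetch_with_retries(page: int, max_retries: int = 3) -> Optional[dict]:
    for attempt in range(max_retries):
        try:
            return await fetch_page(page)
        except FetchError:
            if attempt == max_retries - 1:
                return None  # caller treats None as "give up on this page"
            # 5 s, 10 s, 20 s... plus jitter so concurrent tasks don't retry in lockstep.
            delay = 5 * (2 ** attempt) + random.uniform(0, 1)
            await asyncio.sleep(delay)
    return None


if __name__ == "__main__":
    print(asyncio.run(fetch_with_retries(0)))
```

Returning None instead of re-raising lets the caller skip one video's comments without aborting the whole crawl, which matches the diff's behavior of setting is_end and breaking.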

media_platform/bilibili/config.py

Lines changed: 34 additions & 0 deletions
```diff
@@ -0,0 +1,34 @@
+# Disclaimer: this code is for learning and research purposes only. Users must observe the following principles:
+# 1. Do not use it for any commercial purpose.
+# 2. Comply with the target platform's terms of use and robots.txt rules.
+# 3. Do not crawl at large scale or disrupt the platform's operation.
+# 4. Keep the request rate reasonable and avoid placing unnecessary load on the platform.
+# 5. Do not use it for any illegal or improper purpose.
+#
+# See the LICENSE file in the project root for the detailed license terms.
+# Using this code indicates your agreement to the above principles and all terms of the LICENSE.
+
+
+from config import *
+
+# Limit on the number of videos/posts crawled per day
+MAX_NOTES_PER_DAY = 1
+
+# Bilibili platform configuration
+BILI_SPECIFIED_ID_LIST = [
+    "BV1d54y1g7db",
+    "BV1Sz4y1U77N",
+    "BV14Q4y1n7jz",
+    # ........................
+]
+START_DAY = "2024-01-01"
+END_DAY = "2024-01-01"
+BILI_SEARCH_MODE = "normal"
+CREATOR_MODE = True
+START_CONTACTS_PAGE = 1
+CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100
+CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50
+BILI_CREATOR_ID_LIST = [
+    "20813884",
+    # ........................
+]
```
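This new file relies on Python's star-import shadowing: `from config import *` pulls in every base setting, and each assignment after it overrides the inherited value for anything that imports this module instead of the base package. A runnable sketch of the mechanism, using hypothetical module names base_cfg and plat_cfg:

```python
import sys
import types

# Fake "base" config module, registered so it can be imported by name.
base = types.ModuleType("base_cfg")
base.TIMEOUT = 10
base.RETRIES = 3
sys.modules["base_cfg"] = base

# Fake "platform" config module: star-import the base, then override one value.
plat = types.ModuleType("plat_cfg")
exec("from base_cfg import *\nRETRIES = 5", plat.__dict__)

print(plat.TIMEOUT, plat.RETRIES)  # -> 10 5: inherited setting kept, override wins
```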
