Skip to content

Commit 56bf5d2

Browse files
authored
The configuration file supports URL crawling
Feature/config refactor 20251018
2 parents 3b6fae8 + ae79557 commit 56bf5d2

File tree

19 files changed

+614
-76
lines changed

19 files changed

+614
-76
lines changed

config/base_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
3939
# 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
4040
# 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
41-
ENABLE_CDP_MODE = False
41+
ENABLE_CDP_MODE = True
4242

4343
# CDP调试端口,用于与浏览器通信
4444
# 如果端口被占用,系统会自动尝试下一个可用端口

config/bilibili_config.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,16 +13,23 @@
1313
# 每天爬取视频/帖子的数量控制
1414
MAX_NOTES_PER_DAY = 1
1515

16-
# 指定B站视频ID列表
16+
# 指定B站视频URL列表 (支持完整URL或BV号)
17+
# 示例:
18+
# - 完整URL: "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
19+
# - BV号: "BV1d54y1g7db"
1720
BILI_SPECIFIED_ID_LIST = [
18-
"BV1d54y1g7db",
21+
"https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click",
1922
"BV1Sz4y1U77N",
2023
"BV14Q4y1n7jz",
2124
# ........................
2225
]
2326

24-
# 指定B站用户ID列表
27+
# 指定B站创作者URL列表 (支持完整URL或UID)
28+
# 示例:
29+
# - 完整URL: "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
30+
# - UID: "20813884"
2531
BILI_CREATOR_ID_LIST = [
32+
"https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0",
2633
"20813884",
2734
# ........................
2835
]

config/dy_config.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,15 +11,27 @@
1111
# 抖音平台配置
1212
PUBLISH_TIME_TYPE = 0
1313

14-
# 指定DY视频ID列表
14+
# 指定DY视频URL列表 (支持多种格式)
15+
# 支持格式:
16+
# 1. 完整视频URL: "https://www.douyin.com/video/7525538910311632128"
17+
# 2. 带modal_id的URL: "https://www.douyin.com/user/xxx?modal_id=7525538910311632128"
18+
# 3. 搜索页带modal_id: "https://www.douyin.com/root/search/python?modal_id=7525538910311632128"
19+
# 4. 短链接: "https://v.douyin.com/drIPtQ_WPWY/"
20+
# 5. 纯视频ID: "7280854932641664319"
1521
DY_SPECIFIED_ID_LIST = [
16-
"7280854932641664319",
17-
"7202432992642387233",
22+
"https://www.douyin.com/video/7525538910311632128",
23+
"https://v.douyin.com/drIPtQ_WPWY/",
24+
"https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main&modal_id=7525538910311632128",
25+
"7202432992642387233",
1826
# ........................
1927
]
2028

21-
# 指定DY用户ID列表
29+
# 指定DY创作者URL列表 (支持完整URL或sec_user_id)
30+
# 支持格式:
31+
# 1. 完整创作者主页URL: "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main"
32+
# 2. sec_user_id: "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE"
2233
DY_CREATOR_ID_LIST = [
23-
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE",
34+
"https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main",
35+
"MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE"
2436
# ........................
2537
]

config/ks_config.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,22 @@
1010

1111
# 快手平台配置
1212

13-
# 指定快手视频ID列表
14-
KS_SPECIFIED_ID_LIST = ["3xf8enb8dbj6uig", "3x6zz972bchmvqe"]
13+
# 指定快手视频URL列表 (支持完整URL或纯ID)
14+
# 支持格式:
15+
# 1. 完整视频URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
16+
# 2. 纯视频ID: "3xf8enb8dbj6uig"
17+
KS_SPECIFIED_ID_LIST = [
18+
"https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
19+
"3xf8enb8dbj6uig",
20+
# ........................
21+
]
1522

16-
# 指定快手用户ID列表
23+
# 指定快手创作者URL列表 (支持完整URL或纯ID)
24+
# 支持格式:
25+
# 1. 创作者主页URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
26+
# 2. 纯user_id: "3x4sm73aye7jq7i"
1727
KS_CREATOR_ID_LIST = [
28+
"https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
1829
"3x4sm73aye7jq7i",
1930
# ........................
2031
]

config/xhs_config.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,12 @@
2121
# ........................
2222
]
2323

24-
# 指定用户ID列表
24+
# 指定创作者URL列表 (支持完整URL或纯ID)
25+
# 支持格式:
26+
# 1. 完整创作者主页URL (带xsec_token和xsec_source参数): "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
27+
# 2. 纯user_id: "63e36c9a000000002703502b"
2528
XHS_CREATOR_ID_LIST = [
26-
"63e36c9a000000002703502b",
29+
"https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",
30+
"63e36c9a000000002703502b",
2731
# ........................
2832
]

media_platform/bilibili/core.py

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
from .client import BilibiliClient
4242
from .exception import DataFetchError
4343
from .field import SearchOrderType
44+
from .help import parse_video_info_from_url, parse_creator_info_from_url
4445
from .login import BilibiliLogin
4546

4647

@@ -103,8 +104,14 @@ async def start(self):
103104
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
104105
elif config.CRAWLER_TYPE == "creator":
105106
if config.CREATOR_MODE:
106-
for creator_id in config.BILI_CREATOR_ID_LIST:
107-
await self.get_creator_videos(int(creator_id))
107+
for creator_url in config.BILI_CREATOR_ID_LIST:
108+
try:
109+
creator_info = parse_creator_info_from_url(creator_url)
110+
utils.logger.info(f"[BilibiliCrawler.start] Parsed creator ID: {creator_info.creator_id} from {creator_url}")
111+
await self.get_creator_videos(int(creator_info.creator_id))
112+
except ValueError as e:
113+
utils.logger.error(f"[BilibiliCrawler.start] Failed to parse creator URL: {e}")
114+
continue
108115
else:
109116
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
110117
else:
@@ -362,11 +369,23 @@ async def get_creator_videos(self, creator_id: int):
362369
utils.logger.info(f"[BilibiliCrawler.get_creator_videos] Sleeping for {config.CRAWLER_MAX_SLEEP_SEC} seconds after page {pn}")
363370
pn += 1
364371

365-
async def get_specified_videos(self, bvids_list: List[str]):
372+
async def get_specified_videos(self, video_url_list: List[str]):
366373
"""
367-
get specified videos info
374+
get specified videos info from URLs or BV IDs
375+
:param video_url_list: List of video URLs or BV IDs
368376
:return:
369377
"""
378+
utils.logger.info("[BilibiliCrawler.get_specified_videos] Parsing video URLs...")
379+
bvids_list = []
380+
for video_url in video_url_list:
381+
try:
382+
video_info = parse_video_info_from_url(video_url)
383+
bvids_list.append(video_info.video_id)
384+
utils.logger.info(f"[BilibiliCrawler.get_specified_videos] Parsed video ID: {video_info.video_id} from {video_url}")
385+
except ValueError as e:
386+
utils.logger.error(f"[BilibiliCrawler.get_specified_videos] Failed to parse video URL: {e}")
387+
continue
388+
370389
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
371390
task_list = [self.get_video_info_task(aid=0, bvid=video_id, semaphore=semaphore) for video_id in bvids_list]
372391
video_details = await asyncio.gather(*task_list)
@@ -568,18 +587,30 @@ async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphor
568587
extension_file_name = f"video.mp4"
569588
await bilibili_store.store_video(aid, content, extension_file_name)
570589

571-
async def get_all_creator_details(self, creator_id_list: List[int]):
590+
async def get_all_creator_details(self, creator_url_list: List[str]):
572591
"""
573-
creator_id_list: get details for creator from creator_id_list
592+
creator_url_list: get details for creator from creator URL list
574593
"""
575-
utils.logger.info(f"[BilibiliCrawler.get_creator_details] Crawling the detalis of creator")
576-
utils.logger.info(f"[BilibiliCrawler.get_creator_details] creator ids:{creator_id_list}")
594+
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Crawling the details of creators")
595+
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Parsing creator URLs...")
596+
597+
creator_id_list = []
598+
for creator_url in creator_url_list:
599+
try:
600+
creator_info = parse_creator_info_from_url(creator_url)
601+
creator_id_list.append(int(creator_info.creator_id))
602+
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] Parsed creator ID: {creator_info.creator_id} from {creator_url}")
603+
except ValueError as e:
604+
utils.logger.error(f"[BilibiliCrawler.get_all_creator_details] Failed to parse creator URL: {e}")
605+
continue
606+
607+
utils.logger.info(f"[BilibiliCrawler.get_all_creator_details] creator ids:{creator_id_list}")
577608

578609
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
579610
task_list: List[Task] = []
580611
try:
581612
for creator_id in creator_id_list:
582-
task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=creator_id)
613+
task = asyncio.create_task(self.get_creator_details(creator_id, semaphore), name=str(creator_id))
583614
task_list.append(task)
584615
except Exception as e:
585616
utils.logger.warning(f"[BilibiliCrawler.get_all_creator_details] error in the task list. The creator will not be included. {e}")

media_platform/bilibili/help.py

Lines changed: 70 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,17 @@
99
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
1010

1111

12-
# -*- coding: utf-8 -*-
12+
# -*- coding: utf-8 -*-
1313
# @Author : [email protected]
1414
# @Time : 2023/12/2 23:26
1515
# @Desc : bilibili 请求参数签名
1616
# 逆向实现参考:https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95
17+
import re
1718
import urllib.parse
1819
from hashlib import md5
1920
from typing import Dict
2021

22+
from model.m_bilibili import VideoUrlInfo, CreatorUrlInfo
2123
from tools import utils
2224

2325

@@ -66,16 +68,71 @@ def sign(self, req_data: Dict) -> Dict:
6668
return req_data
6769

6870

71+
def parse_video_info_from_url(url: str) -> VideoUrlInfo:
    """
    Extract the Bilibili video ID (BV number) from a video URL or a bare BV ID.

    Args:
        url: A Bilibili video link or a bare BV ID, for example:
            - https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click
            - https://www.bilibili.com/video/BV1d54y1g7db
            - BV1d54y1g7db

    Returns:
        VideoUrlInfo: object constructed with ``video_id`` set to the BV ID.

    Raises:
        ValueError: if no BV ID can be found in ``url``.
    """
    # Entries come from a hand-edited config list, so tolerate stray
    # leading/trailing whitespace instead of rejecting the value.
    candidate = url.strip()

    # The BV ID is accepted either at the very start of the value (bare ID)
    # or right after "/video/" in a full URL. Only the alphanumeric ID is
    # captured, so a trailing slash or query string never leaks into the ID.
    # Legacy "/video/av..." links are NOT matched by this pattern.
    match = re.search(r'(?:^|/video/)(BV[a-zA-Z0-9]+)', candidate)
    if match:
        return VideoUrlInfo(video_id=match.group(1))

    raise ValueError(f"无法从URL中解析出视频ID: {url}")
96+
97+
98+
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
    """
    Extract the Bilibili creator ID (UID) from a space URL or a bare UID.

    Args:
        url: A Bilibili creator space link or a bare numeric UID, for example:
            - https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0
            - https://space.bilibili.com/20813884
            - 434377496

    Returns:
        CreatorUrlInfo: object constructed with ``creator_id`` set to the UID
        as a string of ASCII digits.

    Raises:
        ValueError: if no UID can be found in ``url``.
    """
    # Entries come from a hand-edited config list, so tolerate stray
    # leading/trailing whitespace instead of rejecting the value.
    candidate = url.strip()

    # Bare UID: require ASCII digits only. str.isdigit() would also accept
    # characters such as superscript digits, which int() cannot convert.
    if re.fullmatch(r'[0-9]+', candidate):
        return CreatorUrlInfo(creator_id=candidate)

    # Full URL: the UID is the digit run right after the space.bilibili.com host.
    match = re.search(r'space\.bilibili\.com/([0-9]+)', candidate)
    if match:
        return CreatorUrlInfo(creator_id=match.group(1))

    raise ValueError(f"无法从URL中解析出创作者ID: {url}")
123+
124+
69125
if __name__ == '__main__':
70-
_img_key = "7cd084941338484aae1ad9425b84077c"
71-
_sub_key = "4932caff0ff746eab6f01bf08b70ac45"
72-
_search_url = "__refresh__=true&_extra=&ad_resource=5654&category_id=&context=&dynamic_offset=0&from_source=&from_spmid=333.337&gaia_vtoken=&highlight=1&keyword=python&order=click&page=1&page_size=20&platform=pc&qv_id=OQ8f2qtgYdBV1UoEnqXUNUl8LEDAdzsD&search_type=video&single_column=0&source_tag=3&web_location=1430654"
73-
_req_data = dict()
74-
for params in _search_url.split("&"):
75-
kvalues = params.split("=")
76-
key = kvalues[0]
77-
value = kvalues[1]
78-
_req_data[key] = value
79-
print("pre req_data", _req_data)
80-
_req_data = BilibiliSign(img_key=_img_key, sub_key=_sub_key).sign(req_data={"aid":170001})
81-
print(_req_data)
126+
# 测试视频URL解析
127+
video_url1 = "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click"
128+
video_url2 = "BV1d54y1g7db"
129+
print("视频URL解析测试:")
130+
print(f"URL1: {video_url1} -> {parse_video_info_from_url(video_url1)}")
131+
print(f"URL2: {video_url2} -> {parse_video_info_from_url(video_url2)}")
132+
133+
# 测试创作者URL解析
134+
creator_url1 = "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0"
135+
creator_url2 = "20813884"
136+
print("\n创作者URL解析测试:")
137+
print(f"URL1: {creator_url1} -> {parse_creator_info_from_url(creator_url1)}")
138+
print(f"URL2: {creator_url2} -> {parse_creator_info_from_url(creator_url2)}")

media_platform/douyin/client.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -324,3 +324,28 @@ async def get_aweme_media(self, url: str) -> Union[bytes, None]:
324324
except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx
325325
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试
326326
return None
327+
328+
async def resolve_short_url(self, short_url: str) -> str:
    """
    Resolve a Douyin short link to the URL it redirects to.

    Only the first redirect hop is inspected: the request is sent with
    ``follow_redirects=False`` and the ``Location`` header of the response
    is returned.

    Args:
        short_url: Short link, e.g. https://v.douyin.com/iF12345ABC/

    Returns:
        The absolute redirect target, or an empty string when the response
        is not a redirect, carries no ``Location`` header, or the request
        fails for any reason (the failure is logged, never raised).
    """
    # Local import keeps this method self-contained regardless of which
    # urllib names the module already imports.
    from urllib.parse import urljoin

    async with httpx.AsyncClient(proxy=self.proxy, follow_redirects=False) as client:
        try:
            utils.logger.info(f"[DouYinClient.resolve_short_url] Resolving short URL: {short_url}")
            response = await client.get(short_url, timeout=10)

            # Short links are expected to answer with a redirect status.
            if response.status_code not in (301, 302, 303, 307, 308):
                utils.logger.warning(f"[DouYinClient.resolve_short_url] Unexpected status code: {response.status_code}")
                return ""

            location = response.headers.get("Location", "")
            if not location:
                # A redirect without a target cannot be resolved; report it
                # instead of logging a successful resolution to "".
                utils.logger.warning(f"[DouYinClient.resolve_short_url] Redirect response has no Location header: {short_url}")
                return ""

            # Location may be a relative reference; resolve it against the
            # requested URL so callers always receive an absolute URL.
            redirect_url = urljoin(short_url, location)
            utils.logger.info(f"[DouYinClient.resolve_short_url] Resolved to: {redirect_url}")
            return redirect_url
        except Exception as e:
            # Deliberate best-effort: any failure is logged and reported to
            # the caller as an empty string.
            utils.logger.error(f"[DouYinClient.resolve_short_url] Failed to resolve short URL: {e}")
            return ""

0 commit comments

Comments
 (0)