Skip to content

Commit ae79557

Browse files
committed
feat: kuaishou support url link
1 parent a9dd086 commit ae79557

File tree

5 files changed

+164
-19
lines changed

5 files changed

+164
-19
lines changed

config/base_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
3939
# 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
4040
# 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
41-
ENABLE_CDP_MODE = False
41+
ENABLE_CDP_MODE = True
4242

4343
# CDP调试端口,用于与浏览器通信
4444
# 如果端口被占用,系统会自动尝试下一个可用端口

config/ks_config.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,22 @@
1010

1111
# 快手平台配置
1212

13-
# 指定快手视频ID列表
14-
KS_SPECIFIED_ID_LIST = ["3xf8enb8dbj6uig", "3x6zz972bchmvqe"]
13+
# 指定快手视频URL列表 (支持完整URL或纯ID)
14+
# 支持格式:
15+
# 1. 完整视频URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
16+
# 2. 纯视频ID: "3xf8enb8dbj6uig"
17+
KS_SPECIFIED_ID_LIST = [
18+
"https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
19+
"3xf8enb8dbj6uig",
20+
# ........................
21+
]
1522

16-
# 指定快手用户ID列表
23+
# 指定快手创作者URL列表 (支持完整URL或纯ID)
24+
# 支持格式:
25+
# 1. 创作者主页URL: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
26+
# 2. 纯user_id: "3x4sm73aye7jq7i"
1727
KS_CREATOR_ID_LIST = [
28+
"https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
1829
"3x4sm73aye7jq7i",
1930
# ........................
2031
]

media_platform/kuaishou/core.py

Lines changed: 29 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
import config
2828
from base.base_crawler import AbstractCrawler
29+
from model.m_kuaishou import VideoUrlInfo, CreatorUrlInfo
2930
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
3031
from store import kuaishou as kuaishou_store
3132
from tools import utils
@@ -34,6 +35,7 @@
3435

3536
from .client import KuaiShouClient
3637
from .exception import DataFetchError
38+
from .help import parse_video_info_from_url, parse_creator_info_from_url
3739
from .login import KuaishouLogin
3840

3941

@@ -168,16 +170,27 @@ async def search(self):
168170

169171
async def get_specified_videos(self):
170172
"""Get the information and comments of the specified post"""
173+
utils.logger.info("[KuaishouCrawler.get_specified_videos] Parsing video URLs...")
174+
video_ids = []
175+
for video_url in config.KS_SPECIFIED_ID_LIST:
176+
try:
177+
video_info = parse_video_info_from_url(video_url)
178+
video_ids.append(video_info.video_id)
179+
utils.logger.info(f"Parsed video ID: {video_info.video_id} from {video_url}")
180+
except ValueError as e:
181+
utils.logger.error(f"Failed to parse video URL: {e}")
182+
continue
183+
171184
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
172185
task_list = [
173186
self.get_video_info_task(video_id=video_id, semaphore=semaphore)
174-
for video_id in config.KS_SPECIFIED_ID_LIST
187+
for video_id in video_ids
175188
]
176189
video_details = await asyncio.gather(*task_list)
177190
for video_detail in video_details:
178191
if video_detail is not None:
179192
await kuaishou_store.update_kuaishou_video(video_detail)
180-
await self.batch_get_video_comments(config.KS_SPECIFIED_ID_LIST)
193+
await self.batch_get_video_comments(video_ids)
181194

182195
async def get_video_info_task(
183196
self, video_id: str, semaphore: asyncio.Semaphore
@@ -367,11 +380,20 @@ async def get_creators_and_videos(self) -> None:
367380
utils.logger.info(
368381
"[KuaiShouCrawler.get_creators_and_videos] Begin get kuaishou creators"
369382
)
370-
for user_id in config.KS_CREATOR_ID_LIST:
371-
# get creator detail info from web html content
372-
createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
373-
if createor_info:
374-
await kuaishou_store.save_creator(user_id, creator=createor_info)
383+
for creator_url in config.KS_CREATOR_ID_LIST:
384+
try:
385+
# Parse creator URL to get user_id
386+
creator_info: CreatorUrlInfo = parse_creator_info_from_url(creator_url)
387+
utils.logger.info(f"[KuaiShouCrawler.get_creators_and_videos] Parse creator URL info: {creator_info}")
388+
user_id = creator_info.user_id
389+
390+
# get creator detail info from web html content
391+
createor_info: Dict = await self.ks_client.get_creator_info(user_id=user_id)
392+
if createor_info:
393+
await kuaishou_store.save_creator(user_id, creator=createor_info)
394+
except ValueError as e:
395+
utils.logger.error(f"[KuaiShouCrawler.get_creators_and_videos] Failed to parse creator URL: {e}")
396+
continue
375397

376398
# Get all video information of the creator
377399
all_video_list = await self.ks_client.get_all_videos_by_creator(

media_platform/kuaishou/help.py

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,99 @@
1+
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
2+
# 1. 不得用于任何商业用途。
3+
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
4+
# 3. 不得进行大规模爬取或对平台造成运营干扰。
5+
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
6+
# 5. 不得用于任何非法或不当的用途。
7+
#
8+
# 详细许可条款请参阅项目根目录下的LICENSE文件。
9+
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
10+
11+
12+
# -*- coding: utf-8 -*-
13+
14+
import re
15+
from model.m_kuaishou import VideoUrlInfo, CreatorUrlInfo
16+
17+
18+
def parse_video_info_from_url(url: str) -> "VideoUrlInfo":
    """Extract the Kuaishou video ID from a video URL or a bare ID.

    Supported inputs:
        1. Full video URL, e.g.
           "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search"
        2. Bare video ID, e.g. "3x3zxz4mjrsc8ke"

    Args:
        url: Kuaishou video link or video ID. Leading and trailing
            whitespace is ignored.

    Returns:
        VideoUrlInfo: object carrying the parsed video ID.

    Raises:
        ValueError: if ``url`` is empty/blank, or looks like a URL but
            contains no ``/short-video/<id>`` segment.
    """
    # Config entries are hand-edited; tolerate stray whitespace/newlines.
    candidate = url.strip()

    # A blank entry must not be passed downstream as an "ID".
    if not candidate:
        raise ValueError(f"无法从URL中解析出视频ID: {url}")

    # Neither an http(s) prefix nor the kuaishou.com host: treat as a bare ID.
    if not candidate.startswith("http") and "kuaishou.com" not in candidate:
        return VideoUrlInfo(video_id=candidate, url_type="normal")

    # Standard video URL layout: /short-video/<video_id>
    match = re.search(r'/short-video/([a-zA-Z0-9_-]+)', candidate)
    if match is None:
        raise ValueError(f"无法从URL中解析出视频ID: {url}")

    return VideoUrlInfo(video_id=match.group(1), url_type="normal")
42+
43+
44+
def parse_creator_info_from_url(url: str) -> "CreatorUrlInfo":
    """Extract the Kuaishou creator ID from a profile URL or a bare ID.

    Supported inputs:
        1. Creator profile URL, e.g.
           "https://www.kuaishou.com/profile/3x84qugg4ch9zhs"
        2. Bare user ID, e.g. "3x4sm73aye7jq7i"

    Args:
        url: Kuaishou creator profile link or user_id. Leading and
            trailing whitespace is ignored.

    Returns:
        CreatorUrlInfo: object carrying the parsed creator ID.

    Raises:
        ValueError: if ``url`` is empty/blank, or looks like a URL but
            contains no ``/profile/<id>`` segment.
    """
    # Config entries are hand-edited; tolerate stray whitespace/newlines.
    candidate = url.strip()

    # A blank entry must not be passed downstream as a "user_id".
    if not candidate:
        raise ValueError(f"无法从URL中解析出创作者ID: {url}")

    # Neither an http(s) prefix nor the kuaishou.com host: treat as a bare ID.
    if not candidate.startswith("http") and "kuaishou.com" not in candidate:
        return CreatorUrlInfo(user_id=candidate)

    # Creator profile URL layout: /profile/<user_id>
    match = re.search(r'/profile/([a-zA-Z0-9_-]+)', candidate)
    if match is None:
        raise ValueError(f"无法从URL中解析出创作者ID: {url}")

    return CreatorUrlInfo(user_id=match.group(1))
68+
69+
70+
if __name__ == '__main__':
    # Manual smoke test: run each parser over a few sample inputs and print
    # either the parsed result or the error it raised.
    demo_groups = (
        (
            "=== 视频URL解析测试 ===",
            parse_video_info_from_url,
            [
                "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python",
                "3xf8enb8dbj6uig",
            ],
        ),
        (
            "=== 创作者URL解析测试 ===",
            parse_creator_info_from_url,
            [
                "https://www.kuaishou.com/profile/3x84qugg4ch9zhs",
                "3x4sm73aye7jq7i",
            ],
        ),
    )
    for banner, parser, samples in demo_groups:
        print(banner)
        for sample in samples:
            try:
                parsed = parser(sample)
                # Only the first 80 characters of the input are echoed.
                print(f"✓ URL: {sample[:80]}...")
                print(f" 结果: {parsed}\n")
            except Exception as err:
                print(f"✗ URL: {sample}")
                print(f" 错误: {err}\n")

model/m_kuaishou.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,25 @@
1-
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
2-
# 1. 不得用于任何商业用途。
3-
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
4-
# 3. 不得进行大规模爬取或对平台造成运营干扰。
5-
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
1+
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
2+
# 1. 不得用于任何商业用途。
3+
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
4+
# 3. 不得进行大规模爬取或对平台造成运营干扰。
5+
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
66
# 5. 不得用于任何非法或不当的用途。
7-
#
8-
# 详细许可条款请参阅项目根目录下的LICENSE文件。
9-
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
7+
#
8+
# 详细许可条款请参阅项目根目录下的LICENSE文件。
9+
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
1010

1111

1212
# -*- coding: utf-8 -*-
13+
14+
from pydantic import BaseModel, Field
15+
16+
17+
class VideoUrlInfo(BaseModel):
    """Parsed information about a Kuaishou video URL."""

    # Video identifier (titled "photo id" in the field metadata); required.
    video_id: str = Field(title="video id (photo id)")
    # Kind of URL the ID was parsed from; defaults to "normal".
    url_type: str = Field(title="url type: normal", default="normal")
21+
22+
23+
class CreatorUrlInfo(BaseModel):
    """Parsed information about a Kuaishou creator URL."""

    # Creator (user) identifier; required.
    user_id: str = Field(title="user id (creator id)")

0 commit comments

Comments
 (0)