Skip to content

Commit dbbc33a

Browse files
authored
Merge pull request #674 from persist-1/chore
chore: 增加--help参数中文显示支持;增加"douyin_aweme"表"music_download_url"字段与功能实现
2 parents fc06c78 + 19df173 commit dbbc33a

File tree

7 files changed: 39 additions and 12 deletions.

.gitignore

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,4 +169,8 @@ cython_debug/
169169
*/.DS_Store
170170
.vscode
171171
/node_modules
172-
docs/.vitepress/cache
172+
docs/.vitepress/cache
173+
174+
# other gitignore
175+
.venv
176+
.refer

cmd_arg/arg.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,25 +17,29 @@
1717

1818
async def parse_cmd():
1919
# 读取command arg
20-
parser = argparse.ArgumentParser(description='Media crawler program.')
21-
parser.add_argument('--platform', type=str, help='Media platform select (xhs | dy | ks | bili | wb | tieba | zhihu)',
20+
parser = argparse.ArgumentParser(description='Media crawler program. / 媒体爬虫程序')
21+
parser.add_argument('--platform', type=str,
22+
help='Media platform select / 选择媒体平台 (xhs=小红书 | dy=抖音 | ks=快手 | bili=哔哩哔哩 | wb=微博 | tieba=百度贴吧 | zhihu=知乎)',
2223
choices=["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"], default=config.PLATFORM)
23-
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)',
24+
parser.add_argument('--lt', type=str,
25+
help='Login type / 登录方式 (qrcode=二维码 | phone=手机号 | cookie=Cookie)',
2426
choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE)
25-
parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)',
27+
parser.add_argument('--type', type=str,
28+
help='Crawler type / 爬取类型 (search=搜索 | detail=详情 | creator=创作者)',
2629
choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE)
2730
parser.add_argument('--start', type=int,
28-
help='number of start page', default=config.START_PAGE)
31+
help='Number of start page / 起始页码', default=config.START_PAGE)
2932
parser.add_argument('--keywords', type=str,
30-
help='please input keywords', default=config.KEYWORDS)
33+
help='Please input keywords / 请输入关键词', default=config.KEYWORDS)
3134
parser.add_argument('--get_comment', type=str2bool,
32-
help='''whether to crawl level one comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
35+
help='''Whether to crawl level one comment / 是否爬取一级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_COMMENTS)
3336
parser.add_argument('--get_sub_comment', type=str2bool,
34-
help=''''whether to crawl level two comment, supported values case insensitive ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
37+
help=''''Whether to crawl level two comment / 是否爬取二级评论, supported values case insensitive / 支持的值(不区分大小写) ('yes', 'true', 't', 'y', '1', 'no', 'false', 'f', 'n', '0')''', default=config.ENABLE_GET_SUB_COMMENTS)
3538
parser.add_argument('--save_data_option', type=str,
36-
help='where to save the data (csv or db or json or sqlite)', choices=['csv', 'db', 'json', 'sqlite'], default=config.SAVE_DATA_OPTION)
39+
help='Where to save the data / 数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库)',
40+
choices=['csv', 'db', 'json', 'sqlite'], default=config.SAVE_DATA_OPTION)
3741
parser.add_argument('--cookies', type=str,
38-
help='cookies used for cookie login type', default=config.COOKIES)
42+
help='Cookies used for cookie login type / Cookie登录方式使用的Cookie值', default=config.COOKIES)
3943

4044
args = parser.parse_args()
4145

config/base_config.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@
3939
# 是否启用CDP模式 - 使用用户现有的Chrome/Edge浏览器进行爬取,提供更好的反检测能力
4040
# 启用后将自动检测并启动用户的Chrome/Edge浏览器,通过CDP协议进行控制
4141
# 这种方式使用真实的浏览器环境,包括用户的扩展、Cookie和设置,大大降低被检测的风险
42-
ENABLE_CDP_MODE = True
42+
ENABLE_CDP_MODE = False
4343

4444
# CDP调试端口,用于与浏览器通信
4545
# 如果端口被占用,系统会自动尝试下一个可用端口

schema/sqlite_tables.db

-68 KB
Binary file not shown.

schema/sqlite_tables.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ CREATE TABLE douyin_aweme (
149149
aweme_url TEXT DEFAULT NULL,
150150
cover_url TEXT DEFAULT NULL,
151151
video_download_url TEXT DEFAULT NULL,
152+
music_download_url TEXT DEFAULT NULL,
152153
source_keyword TEXT DEFAULT ''
153154
);
154155

schema/tables.sql

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ CREATE TABLE `douyin_aweme`
149149
`aweme_url` varchar(255) DEFAULT NULL COMMENT '视频详情页URL',
150150
`cover_url` varchar(500) DEFAULT NULL COMMENT '视频封面图URL',
151151
`video_download_url` varchar(1024) DEFAULT NULL COMMENT '视频下载地址',
152+
`music_download_url` varchar(1024) DEFAULT NULL COMMENT '音乐下载地址',
152153
PRIMARY KEY (`id`),
153154
KEY `idx_douyin_awem_aweme_i_6f7bc6` (`aweme_id`),
154155
KEY `idx_douyin_awem_create__299dfe` (`create_time`)

store/douyin/__init__.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,22 @@ def _extract_video_download_url(aweme_detail: Dict) -> str:
105105
return actual_url_list[-1]
106106

107107

108+
def _extract_music_download_url(aweme_detail: Dict) -> str:
109+
"""
110+
提取音乐下载地址
111+
112+
Args:
113+
aweme_detail (Dict): 抖音视频
114+
115+
Returns:
116+
str: 音乐下载地址
117+
"""
118+
music_item = aweme_detail.get("music", {})
119+
play_url = music_item.get("play_url", {})
120+
music_url = play_url.get("uri", "")
121+
return music_url
122+
123+
108124
async def update_douyin_aweme(aweme_item: Dict):
109125
aweme_id = aweme_item.get("aweme_id")
110126
user_info = aweme_item.get("author", {})
@@ -131,6 +147,7 @@ async def update_douyin_aweme(aweme_item: Dict):
131147
"aweme_url": f"https://www.douyin.com/video/{aweme_id}",
132148
"cover_url": _extract_content_cover_url(aweme_item),
133149
"video_download_url": _extract_video_download_url(aweme_item),
150+
"music_download_url": _extract_music_download_url(aweme_item),
134151
"source_keyword": source_keyword_var.get(),
135152
}
136153
utils.logger.info(

0 commit comments

Comments
 (0)