Skip to content

Commit 66843f2

Browse files
committed
finish_all_for_expand_bili
1 parent 59619ff commit 66843f2

File tree

12 files changed

+51
-63
lines changed

12 files changed

+51
-63
lines changed

config/base_config.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,16 @@
1010

1111

1212
# 基础配置
13-
PLATFORM = "bili"
13+
PLATFORM = "xhs"
1414
KEYWORDS = "编程副业,编程兼职" # 关键词搜索配置,以英文逗号分隔
15-
LOGIN_TYPE = "phone" # qrcode or phone or cookie
15+
LOGIN_TYPE = "qrcode" # qrcode or phone or cookie
1616
COOKIES = ""
1717
# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持小红书
1818
SORT_TYPE = "popularity_descending"
1919
# 具体值参见media_platform.xxx.field下的枚举值,暂时只支持抖音
2020
PUBLISH_TIME_TYPE = 0
2121
CRAWLER_TYPE = (
22-
"creator" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
22+
"search" # 爬取类型,search(关键词搜索) | detail(帖子详情)| creator(创作者主页数据)
2323
)
2424
# 自定义User Agent(暂时仅对XHS有效)
2525
UA = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0'
@@ -54,9 +54,6 @@
5454
# 爬取开始页数 默认从第一页开始
5555
START_PAGE = 1
5656

57-
# 爬取粉丝列表开始页数 默认从第一页开始
58-
START_CONTACTS_PAGE = 1
59-
6057
# 爬取视频/帖子的数量控制
6158
CRAWLER_MAX_NOTES_COUNT = 200
6259

@@ -147,11 +144,7 @@
147144

148145
# 指定bili创作者ID列表(sec_id)
149146
BILI_CREATOR_ID_LIST = [
150-
# "20813884",
151-
"520819684",
152-
# "472747194",
153-
# "519872016",
154-
# "372201438",
147+
"20813884",
155148
# ........................
156149
]
157150

@@ -202,8 +195,15 @@
202195
# 若为 True,则按照 START_DAY 至 END_DAY 按照每一天进行筛选,这样能够突破 1000 条视频的限制,最大程度爬取该关键词下的所有视频
203196
ALL_DAY = False
204197

198+
#!!! 以下配置仅在 bilibili 的 creator 爬取模式下生效
199+
# True:爬取creator主页的视频及其评论;False:爬取creator的动态和关系列表(粉丝/关注)
200+
CREATOR_MODE = True
201+
202+
# 爬取creator粉丝列表时起始爬取页数
203+
START_CONTACTS_PAGE = 1
204+
205205
# 爬取作者粉丝和关注列表数量控制(单作者)
206206
CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100
207207

208-
# 爬取作者动态粉丝和关注列表数量控制(单作者)
208+
# 爬取作者动态数量控制(单作者)
209209
CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50

config/db_config.py

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,11 @@
1212
import os
1313

1414
# mysql config
15-
# RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456")
16-
# RELATION_DB_USER = os.getenv("RELATION_DB_USER", "root")
17-
# RELATION_DB_HOST = os.getenv("RELATION_DB_HOST", "localhost")
18-
# RELATION_DB_PORT = os.getenv("RELATION_DB_PORT", 3306)
19-
# RELATION_DB_NAME = os.getenv("RELATION_DB_NAME", "media_crawler")
20-
RELATION_DB_HOST = "47.94.233.47" # 替换为你的数据库域名/公网IP
21-
RELATION_DB_PORT = 3306 # 替换为你的数据库端口(通常3306)
22-
RELATION_DB_USER = "remote_user" # 替换为你的数据库用户名
23-
RELATION_DB_PWD = "314159" # 替换为你的数据库密码
24-
RELATION_DB_NAME = "Test" # 替换为你的数据库名称
15+
RELATION_DB_PWD = os.getenv("RELATION_DB_PWD", "123456")
16+
RELATION_DB_USER = os.getenv("RELATION_DB_USER", "root")
17+
RELATION_DB_HOST = os.getenv("RELATION_DB_HOST", "localhost")
18+
RELATION_DB_PORT = os.getenv("RELATION_DB_PORT", 3306)
19+
RELATION_DB_NAME = os.getenv("RELATION_DB_NAME", "media_crawler")
2520

2621

2722
# redis config

media_platform/bilibili/client.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ async def get_video_comments(self,
224224

225225
async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
226226
callback: Optional[Callable] = None,
227-
max_count: int = 10, ):
227+
max_count: int = 10,):
228228
"""
229229
get video all comments include sub comments
230230
:param video_id:
@@ -251,7 +251,7 @@ async def get_video_all_comments(self, video_id: str, crawl_interval: float = 1.
251251
if (comment.get("rcount", 0) > 0):
252252
{
253253
await self.get_video_all_level_two_comments(
254-
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
254+
video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)
255255
}
256256
if len(result) + len(comment_list) > max_count:
257257
comment_list = comment_list[:max_count - len(result)]
@@ -321,8 +321,7 @@ async def get_video_level_two_comments(self,
321321
result = await self.get(uri, post_data)
322322
return result
323323

324-
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30,
325-
order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
324+
async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict:
326325
"""get all videos for a creator
327326
:param creator_id: 创作者 ID
328327
:param pn: 页数

media_platform/bilibili/core.py

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,11 @@ async def start(self):
8989
# Get the information and comments of the specified post
9090
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
9191
elif config.CRAWLER_TYPE == "creator":
92-
# for creator_id in config.BILI_CREATOR_ID_LIST:
93-
# await self.get_creator_videos(int(creator_id))
94-
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
92+
if config.CREATOR_MODE:
93+
for creator_id in config.BILI_CREATOR_ID_LIST:
94+
await self.get_creator_videos(int(creator_id))
95+
else:
96+
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
9597
else:
9698
pass
9799
utils.logger.info(
@@ -119,11 +121,9 @@ async def get_pubtime_datetime(start: str = config.START_DAY, end: str = config.
119121
start_day: datetime = datetime.strptime(start, '%Y-%m-%d')
120122
end_day: datetime = datetime.strptime(end, '%Y-%m-%d')
121123
if start_day > end_day:
122-
raise ValueError(
123-
'Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
124+
raise ValueError('Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end')
124125
elif start_day == end_day: # 搜索同一天的内容
125-
end_day = start_day + timedelta(days=1) - timedelta(
126-
seconds=1) # 则将 end_day 设置为 start_day + 1 day - 1 second
126+
end_day = start_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 start_day + 1 day - 1 second
127127
else: # 搜索 start 至 end
128128
end_day = end_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 end_day + 1 day - 1 second
129129
# 将其重新转换为时间戳
@@ -166,11 +166,9 @@ async def search(self):
166166
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
167167
task_list = []
168168
try:
169-
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore)
170-
for video_item in video_list]
169+
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
171170
except Exception as e:
172-
utils.logger.warning(
173-
f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
171+
utils.logger.warning(f"[BilibiliCrawler.search] error in the task list. The video for this page will not be included. {e}")
174172
video_items = await asyncio.gather(*task_list)
175173
for video_item in video_items:
176174
if video_item:
@@ -184,23 +182,21 @@ async def search(self):
184182
else:
185183
for day in pd.date_range(start=config.START_DAY, end=config.END_DAY, freq='D'):
186184
# 按照每一天进行爬取的时间戳参数
187-
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'),
188-
end=day.strftime('%Y-%m-%d'))
185+
pubtime_begin_s, pubtime_end_s = await self.get_pubtime_datetime(start=day.strftime('%Y-%m-%d'), end=day.strftime('%Y-%m-%d'))
189186
page = 1
190-
# !该段 while 语句在发生异常时(通常情况下为当天数据为空时)会自动跳转到下一天,以实现最大程度爬取该关键词下当天的所有视频
191-
# !除了仅保留现在原有的 try, except Exception 语句外,不要再添加其他的异常处理!!!否则将使该段代码失效,使其仅能爬取当天一天数据而无法跳转到下一天
192-
# !除非将该段代码的逻辑进行重构以实现相同的功能,否则不要进行修改!!!
187+
#!该段 while 语句在发生异常时(通常情况下为当天数据为空时)会自动跳转到下一天,以实现最大程度爬取该关键词下当天的所有视频
188+
#!除了仅保留现在原有的 try, except Exception 语句外,不要再添加其他的异常处理!!!否则将使该段代码失效,使其仅能爬取当天一天数据而无法跳转到下一天
189+
#!除非将该段代码的逻辑进行重构以实现相同的功能,否则不要进行修改!!!
193190
while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
194-
# ! Catch any error if response return nothing, go to next day
191+
#! Catch any error if response return nothing, go to next day
195192
try:
196-
# ! Don't skip any page, to make sure gather all video in one day
193+
#! Don't skip any page, to make sure gather all video in one day
197194
# if page < start_page:
198195
# utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}")
199196
# page += 1
200197
# continue
201198

202-
utils.logger.info(
203-
f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
199+
utils.logger.info(f"[BilibiliCrawler.search] search bilibili keyword: {keyword}, date: {day.ctime()}, page: {page}")
204200
video_id_list: List[str] = []
205201
videos_res = await self.bili_client.search_video_by_keyword(
206202
keyword=keyword,
@@ -213,9 +209,7 @@ async def search(self):
213209
video_list: List[Dict] = videos_res.get("result")
214210

215211
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
216-
task_list = [
217-
self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for
218-
video_item in video_list]
212+
task_list = [self.get_video_info_task(aid=video_item.get("aid"), bvid="", semaphore=semaphore) for video_item in video_list]
219213
video_items = await asyncio.gather(*task_list)
220214
for video_item in video_items:
221215
if video_item:

store/bilibili/bilibili_store_impl.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,13 @@ def calculate_number_of_files(file_store_path: str) -> int:
3838
if not os.path.exists(file_store_path):
3939
return 1
4040
try:
41-
return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1
41+
return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1
4242
except ValueError:
4343
return 1
4444

45-
4645
class BiliCsvStoreImplement(AbstractStore):
4746
csv_store_path: str = "data/bilibili"
48-
file_count: int = calculate_number_of_files(csv_store_path)
49-
47+
file_count:int=calculate_number_of_files(csv_store_path)
5048
def make_save_file_name(self, store_type: str) -> str:
5149
"""
5250
make save file name by store type
@@ -196,7 +194,7 @@ async def store_creator(self, creator: Dict):
196194
creator["add_ts"] = utils.get_current_timestamp()
197195
await add_new_creator(creator)
198196
else:
199-
await update_creator_by_creator_id(creator_id, creator_item=creator)
197+
await update_creator_by_creator_id(creator_id,creator_item=creator)
200198

201199
async def store_contact(self, contact_item: Dict):
202200
"""
@@ -249,10 +247,11 @@ class BiliJsonStoreImplement(AbstractStore):
249247
json_store_path: str = "data/bilibili/json"
250248
words_store_path: str = "data/bilibili/words"
251249
lock = asyncio.Lock()
252-
file_count: int = calculate_number_of_files(json_store_path)
250+
file_count:int=calculate_number_of_files(json_store_path)
253251
WordCloud = words.AsyncWordCloudGenerator()
254252

255-
def make_save_file_name(self, store_type: str) -> (str, str):
253+
254+
def make_save_file_name(self, store_type: str) -> (str,str):
256255
"""
257256
make save file name by store type
258257
Args:
@@ -279,7 +278,7 @@ async def save_data_to_json(self, save_item: Dict, store_type: str):
279278
"""
280279
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
281280
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
282-
save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
281+
save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
283282
save_data = []
284283

285284
async with self.lock:

store/bilibili/bilibili_store_sql.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ async def update_content_by_content_id(content_id: str, content_item: Dict) -> i
6666
return effect_row
6767

6868

69+
6970
async def query_comment_by_comment_id(comment_id: str) -> Dict:
7071
"""
7172
查询一条评论内容

store/douyin/douyin_store_impl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,7 @@ async def store_content(self, content_item: Dict):
238238

239239
async def store_comment(self, comment_item: Dict):
240240
"""
241-
comment JSON storage implementatio
241+
comment JSON storage implementation
242242
Args:
243243
comment_item:
244244

store/kuaishou/kuaishou_store_impl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ async def store_content(self, content_item: Dict):
215215

216216
async def store_comment(self, comment_item: Dict):
217217
"""
218-
comment JSON storage implementatio
218+
comment JSON storage implementation
219219
Args:
220220
comment_item:
221221

store/tieba/tieba_store_impl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,7 @@ async def store_content(self, content_item: Dict):
235235

236236
async def store_comment(self, comment_item: Dict):
237237
"""
238-
comment JSON storage implementatio
238+
comment JSON storage implementation
239239
Args:
240240
comment_item:
241241

store/weibo/weibo_store_impl.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ async def store_content(self, content_item: Dict):
241241

242242
async def store_comment(self, comment_item: Dict):
243243
"""
244-
comment JSON storage implementatio
244+
comment JSON storage implementation
245245
Args:
246246
comment_item:
247247

0 commit comments

Comments
 (0)