Skip to content

Commit ff41fae

Browse files
authored
Merge pull request #608 from Bowenwin/bili_expand
Bili_function_expand
2 parents 7ed6621 + 66843f2 commit ff41fae

File tree

7 files changed

+627
-18
lines changed

7 files changed

+627
-18
lines changed

config/base_config.py

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,4 +193,17 @@
193193
# 是否开启按每一天进行爬取的选项,仅支持 bilibili 关键字搜索
194194
# 若为 False,则忽略 START_DAY 与 END_DAY 设置的值
195195
# 若为 True,则按照 START_DAY 至 END_DAY 按照每一天进行筛选,这样能够突破 1000 条视频的限制,最大程度爬取该关键词下的所有视频
196-
ALL_DAY = False
196+
ALL_DAY = False
197+
198+
#!!! 下面仅支持 bilibili creator搜索
199+
# 爬取creator主页的视频及评论,还是爬取creator的动态和关系列表(粉丝/关注)(True 为前者,False 为后者)
200+
CREATOR_MODE = True
201+
202+
# 爬取creator粉丝列表和关注列表时的起始页数
203+
START_CONTACTS_PAGE = 1
204+
205+
# 爬取作者粉丝和关注列表数量控制(单作者)
206+
CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES = 100
207+
208+
# 爬取作者动态数量控制(单作者)
209+
CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES = 50

media_platform/bilibili/client.py

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
import httpx
2222
from playwright.async_api import BrowserContext, Page
2323

24+
import config
2425
from base.base_crawler import AbstractApiClient
2526
from tools import utils
2627

@@ -337,3 +338,162 @@ async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order
337338
"order": order_mode,
338339
}
339340
return await self.get(uri, post_data)
341+
342+
async def get_creator_info(self, creator_id: int) -> Dict:
    """
    Request the profile of a single creator.

    :param creator_id: creator ID, sent to the API as the ``mid`` query parameter
    :return: whatever ``self.get`` returns for the ``/x/space/wbi/acc/info`` endpoint
    """
    query = {"mid": creator_id}
    return await self.get("/x/space/wbi/acc/info", query)
352+
353+
async def get_creator_fans(self, creator_id: int, pn: int, ps: int = 24) -> Dict:
    """
    Request one page of a creator's fans list.

    :param creator_id: creator ID, sent as ``vmid``
    :param pn: page number to request
    :param ps: number of entries per page
    :return: whatever ``self.get`` returns for the ``/x/relation/fans`` endpoint
    """
    endpoint = "/x/relation/fans"
    # Key order is kept as-is in case the request layer is order-sensitive.
    query = dict(vmid=creator_id, pn=pn, ps=ps, gaia_source="main_web")
    return await self.get(endpoint, query)
374+
375+
async def get_creator_followings(self, creator_id: int, pn: int, ps: int = 24) -> Dict:
    """
    Request one page of the accounts a creator follows.

    :param creator_id: creator ID, sent as ``vmid``
    :param pn: page number to request
    :param ps: number of entries per page
    :return: whatever ``self.get`` returns for the ``/x/relation/followings`` endpoint
    """
    endpoint = "/x/relation/followings"
    # Key order is kept as-is in case the request layer is order-sensitive.
    query = dict(vmid=creator_id, pn=pn, ps=ps, gaia_source="main_web")
    return await self.get(endpoint, query)
395+
396+
async def get_creator_dynamics(self, creator_id: int, offset: str = ""):
    """
    Request one page of a creator's dynamics feed.

    :param creator_id: creator ID, sent as ``host_mid``
    :param offset: paging cursor forwarded to the API; empty string by default
    :return: whatever ``self.get`` returns for the
        ``/x/polymer/web-dynamic/v1/feed/space`` endpoint
    """
    endpoint = "/x/polymer/web-dynamic/v1/feed/space"
    # Key order is kept as-is in case the request layer is order-sensitive.
    query = dict(offset=offset, host_mid=creator_id, platform="web")
    return await self.get(endpoint, query)
411+
412+
async def get_creator_all_fans(self, creator_info: Dict, crawl_interval: float = 1.0,
                               callback: Optional[Callable] = None,
                               max_count: int = 100) -> List:
    """
    Page through a creator's fans list until ``max_count`` entries are collected
    or the API returns an empty page.

    Paging starts at ``config.START_CONTACTS_PAGE``.

    :param creator_info: creator record; its ``"id"`` entry is used for the requests
        and the whole dict is handed to ``callback``
    :param crawl_interval: seconds to sleep after each processed page
    :param callback: optional awaitable ``callback(creator_info, fans_page)`` invoked
        once per non-empty page
    :param max_count: maximum number of fans to collect for this creator
    :return: list of collected fan entries (at most ``max_count``)
    """
    creator_id = creator_info["id"]
    result: List[Dict] = []
    pn = config.START_CONTACTS_PAGE
    while len(result) < max_count:
        fans_res: Dict = await self.get_creator_fans(creator_id, pn=pn)
        # "list" may be missing or explicitly null; treat both as "no more data".
        fans_list: List[Dict] = (fans_res or {}).get("list") or []
        if not fans_list:
            # Stop before the callback/sleep so consumers never see an empty batch.
            break

        pn += 1
        # Trim the final page so the total never exceeds max_count.
        fans_list = fans_list[:max_count - len(result)]
        if callback:
            await callback(creator_info, fans_list)
        result.extend(fans_list)
        await asyncio.sleep(crawl_interval)
    return result
441+
442+
async def get_creator_all_followings(self, creator_info: Dict, crawl_interval: float = 1.0,
                                     callback: Optional[Callable] = None,
                                     max_count: int = 100) -> List:
    """
    Page through the list of accounts a creator follows until ``max_count`` entries
    are collected or the API returns an empty page.

    Paging starts at ``config.START_CONTACTS_PAGE``.

    :param creator_info: creator record; its ``"id"`` entry is used for the requests
        and the whole dict is handed to ``callback``
    :param crawl_interval: seconds to sleep after each processed page
    :param callback: optional awaitable ``callback(creator_info, followings_page)``
        invoked once per non-empty page
    :param max_count: maximum number of followings to collect for this creator
    :return: list of collected following entries (at most ``max_count``)
    """
    creator_id = creator_info["id"]
    result: List[Dict] = []
    pn = config.START_CONTACTS_PAGE
    while len(result) < max_count:
        followings_res: Dict = await self.get_creator_followings(creator_id, pn=pn)
        # "list" may be missing or explicitly null; treat both as "no more data".
        followings_list: List[Dict] = (followings_res or {}).get("list") or []
        if not followings_list:
            # Stop before the callback/sleep so consumers never see an empty batch.
            break

        pn += 1
        # Trim the final page so the total never exceeds max_count.
        followings_list = followings_list[:max_count - len(result)]
        if callback:
            await callback(creator_info, followings_list)
        result.extend(followings_list)
        await asyncio.sleep(crawl_interval)
    return result
471+
472+
async def get_creator_all_dynamics(self, creator_info: Dict, crawl_interval: float = 1.0,
                                   callback: Optional[Callable] = None,
                                   max_count: int = 20) -> List:
    """
    Page through a creator's dynamics feed until ``max_count`` items are collected,
    the API reports no more data, or paging stops making progress.

    :param creator_info: creator record; its ``"id"`` entry is used for the requests
        and the whole dict is handed to ``callback``
    :param crawl_interval: seconds to sleep after each fetched page
    :param callback: optional awaitable ``callback(creator_info, dynamics_page)``
        invoked once per non-empty page
    :param max_count: maximum number of dynamics to collect for this creator
    :return: list of collected dynamics (at most ``max_count``)
    """
    creator_id = creator_info["id"]
    result: List[Dict] = []
    offset = ""
    has_more = True
    while has_more and len(result) < max_count:
        dynamics_res: Dict = (await self.get_creator_dynamics(creator_id, offset)) or {}
        # Tolerate payloads with missing/null keys instead of raising KeyError.
        dynamics_list: List[Dict] = dynamics_res.get("items") or []
        has_more = bool(dynamics_res.get("has_more"))
        next_offset = dynamics_res.get("offset") or ""

        # An empty page whose cursor did not move would be requested again forever.
        if not dynamics_list and next_offset == offset:
            break
        offset = next_offset

        if dynamics_list:
            # Trim the final page so the total never exceeds max_count.
            dynamics_list = dynamics_list[:max_count - len(result)]
            if callback:
                await callback(creator_info, dynamics_list)
            result.extend(dynamics_list)
        await asyncio.sleep(crawl_interval)
    return result

media_platform/bilibili/core.py

Lines changed: 124 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,8 +89,11 @@ async def start(self):
8989
# Get the information and comments of the specified post
9090
await self.get_specified_videos(config.BILI_SPECIFIED_ID_LIST)
9191
elif config.CRAWLER_TYPE == "creator":
92-
for creator_id in config.BILI_CREATOR_ID_LIST:
93-
await self.get_creator_videos(int(creator_id))
92+
if config.CREATOR_MODE:
93+
for creator_id in config.BILI_CREATOR_ID_LIST:
94+
await self.get_creator_videos(int(creator_id))
95+
else:
96+
await self.get_all_creator_details(config.BILI_CREATOR_ID_LIST)
9497
else:
9598
pass
9699
utils.logger.info(
@@ -125,7 +128,7 @@ async def get_pubtime_datetime(start: str = config.START_DAY, end: str = config.
125128
end_day = end_day + timedelta(days=1) - timedelta(seconds=1) # 则将 end_day 设置为 end_day + 1 day - 1 second
126129
# 将其重新转换为时间戳
127130
return str(int(start_day.timestamp())), str(int(end_day.timestamp()))
128-
131+
129132
async def search(self):
130133
"""
131134
search bilibili video with keywords
@@ -466,3 +469,121 @@ async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphor
466469
extension_file_name = f"video.mp4"
467470
await bilibili_store.store_video(aid, content, extension_file_name)
468471

472+
async def get_all_creator_details(self, creator_id_list: List[int]):
    """
    Crawl details for every creator in ``creator_id_list`` concurrently.

    One task per creator runs ``self.get_creator_details``; all tasks share a
    semaphore sized by ``config.MAX_CONCURRENCY_NUM``. A creator whose task fails
    is logged and skipped instead of aborting the whole crawl.

    :param creator_id_list: IDs of the creators to crawl
    :return: None
    """
    creator_ids = list(creator_id_list)
    utils.logger.info(
        f"[BilibiliCrawler.get_all_creator_details] Crawling the details of creator")
    utils.logger.info(
        f"[BilibiliCrawler.get_all_creator_details] creator ids:{creator_ids}")

    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    task_list = [
        asyncio.create_task(
            self.get_creator_details(creator_id, semaphore), name=str(creator_id))
        for creator_id in creator_ids
    ]

    # Failures only surface when the tasks are awaited, so collect them here
    # rather than around create_task(), which merely schedules the coroutine.
    outcomes = await asyncio.gather(*task_list, return_exceptions=True)
    for creator_id, outcome in zip(creator_ids, outcomes):
        if isinstance(outcome, BaseException):
            utils.logger.warning(
                f"[BilibiliCrawler.get_all_creator_details] error in the task list. "
                f"The creator {creator_id} will not be included. {outcome!r}")
493+
494+
async def get_creator_details(self, creator_id: int, semaphore: asyncio.Semaphore):
    """
    Fetch one creator's profile, then crawl that creator's fans, followings and
    dynamics in that order.

    :param creator_id: ID of the creator to crawl
    :param semaphore: shared concurrency limiter; also passed on to
        ``get_fans`` / ``get_followings`` / ``get_dynamics``
    :return: None
    """
    # Hold the semaphore only for the profile request. get_fans, get_followings
    # and get_dynamics acquire this same semaphore themselves; calling them while
    # still holding a permit deadlocks as soon as every permit is owned by an
    # outer caller (always the case when the limit is 1).
    async with semaphore:
        creator_unhandled_info: Dict = await self.bili_client.get_creator_info(creator_id)

    creator_info: Dict = {
        "id": creator_id,
        "name": creator_unhandled_info.get("name"),
        "sign": creator_unhandled_info.get("sign"),
        "avatar": creator_unhandled_info.get("face"),
    }
    await self.get_fans(creator_info, semaphore)
    await self.get_followings(creator_info, semaphore)
    await self.get_dynamics(creator_info, semaphore)
512+
513+
async def get_fans(self, creator_info: Dict, semaphore: asyncio.Semaphore):
    """
    Crawl the fans of one creator while holding ``semaphore``.

    Each fetched page is handed to
    ``bilibili_store.batch_update_bilibili_creator_fans``. Errors are logged
    and swallowed, so this coroutine does not raise for a failed crawl.

    :param creator_info: creator record; must contain ``"id"``
    :param semaphore: concurrency limiter held for the whole crawl
    :return: None
    """
    creator_id = creator_info["id"]
    async with semaphore:
        try:
            utils.logger.info(f"[BilibiliCrawler.get_fans] begin get creator_id: {creator_id} fans ...")
            fetch_all = self.bili_client.get_creator_all_fans
            await fetch_all(
                creator_info=creator_info,
                crawl_interval=random.random(),  # random pause in [0, 1) between pages
                callback=bilibili_store.batch_update_bilibili_creator_fans,
                max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
            )
        except DataFetchError as fetch_err:
            utils.logger.error(f"[BilibiliCrawler.get_fans] get creator_id: {creator_id} fans error: {fetch_err}")
        except Exception as err:
            utils.logger.error(f"[BilibiliCrawler.get_fans] may be been blocked, err:{err}")
538+
539+
async def get_followings(self, creator_info: Dict, semaphore: asyncio.Semaphore):
    """
    Crawl the accounts one creator follows while holding ``semaphore``.

    Each fetched page is handed to
    ``bilibili_store.batch_update_bilibili_creator_followings``. Errors are
    logged and swallowed, so this coroutine does not raise for a failed crawl.

    :param creator_info: creator record; must contain ``"id"``
    :param semaphore: concurrency limiter held for the whole crawl
    :return: None
    """
    creator_id = creator_info["id"]
    async with semaphore:
        try:
            utils.logger.info(f"[BilibiliCrawler.get_followings] begin get creator_id: {creator_id} followings ...")
            fetch_all = self.bili_client.get_creator_all_followings
            await fetch_all(
                creator_info=creator_info,
                crawl_interval=random.random(),  # random pause in [0, 1) between pages
                callback=bilibili_store.batch_update_bilibili_creator_followings,
                max_count=config.CRAWLER_MAX_CONTACTS_COUNT_SINGLENOTES,
            )
        except DataFetchError as fetch_err:
            utils.logger.error(f"[BilibiliCrawler.get_followings] get creator_id: {creator_id} followings error: {fetch_err}")
        except Exception as err:
            utils.logger.error(f"[BilibiliCrawler.get_followings] may be been blocked, err:{err}")
564+
565+
async def get_dynamics(self, creator_info: Dict, semaphore: asyncio.Semaphore):
    """
    Crawl the dynamics of one creator while holding ``semaphore``.

    Each fetched page is handed to
    ``bilibili_store.batch_update_bilibili_creator_dynamics``. Errors are
    logged and swallowed, so this coroutine does not raise for a failed crawl.

    :param creator_info: creator record; must contain ``"id"``
    :param semaphore: concurrency limiter held for the whole crawl
    :return: None
    """
    creator_id = creator_info["id"]
    async with semaphore:
        try:
            utils.logger.info(f"[BilibiliCrawler.get_dynamics] begin get creator_id: {creator_id} dynamics ...")
            fetch_all = self.bili_client.get_creator_all_dynamics
            await fetch_all(
                creator_info=creator_info,
                crawl_interval=random.random(),  # random pause in [0, 1) between pages
                callback=bilibili_store.batch_update_bilibili_creator_dynamics,
                max_count=config.CRAWLER_MAX_DYNAMICS_COUNT_SINGLENOTES,
            )
        except DataFetchError as fetch_err:
            utils.logger.error(f"[BilibiliCrawler.get_dynamics] get creator_id: {creator_id} dynamics error: {fetch_err}")
        except Exception as err:
            utils.logger.error(f"[BilibiliCrawler.get_dynamics] may be been blocked, err:{err}")

0 commit comments

Comments
 (0)