Skip to content

Commit e4e0f65

Browse files
authored
Merge pull request #699 from 2513502304/main
添加了对媒体资源服务器的异常处理,参见 issue #691
2 parents b9d30bb + 81f2dbe commit e4e0f65

File tree

8 files changed

+50
-24
lines changed

8 files changed

+50
-24
lines changed

media_platform/bilibili/client.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -202,12 +202,17 @@ async def get_video_play_url(self, aid: int, cid: int) -> Dict:
202202

203203
async def get_video_media(self, url: str) -> Union[bytes, None]:
204204
async with httpx.AsyncClient(proxy=self.proxy) as client:
205-
response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
206-
if not response.reason_phrase == "OK":
207-
utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}")
205+
try:
206+
response = await client.request("GET", url, timeout=self.timeout, headers=self.headers)
207+
response.raise_for_status()
208+
if not response.reason_phrase == "OK":
209+
utils.logger.error(f"[BilibiliClient.get_video_media] request {url} err, res:{response.text}")
210+
return None
211+
else:
212+
return response.content
213+
except httpx.HTTPStatusError as exc: # raised by raise_for_status() on a 4xx/5xx response; connection errors and timeouts (httpx.RequestError) are not caught here
214+
utils.logger.error(f"[BilibiliClient.get_video_media] {exc}")
208215
return None
209-
else:
210-
return response.content
211216

212217
async def get_video_comments(
213218
self,

media_platform/bilibili/core.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,7 @@ async def get_bilibili_video(self, video_item: Dict, semaphore: asyncio.Semaphor
544544
return
545545

546546
content = await self.bili_client.get_video_media(video_url)
547+
await asyncio.sleep(random.random())
547548
if content is None:
548549
return
549550
extension_file_name = f"video.mp4"

media_platform/douyin/client.py

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ class DouYinClient(AbstractApiClient):
3030

3131
def __init__(
3232
self,
33-
timeout=30, # 若开启爬取媒体选项,抖音的短视频需要更久的超时时间
33+
timeout=60, # 若开启爬取媒体选项,抖音的短视频需要更久的超时时间
3434
proxy=None,
3535
*,
3636
headers: Dict,
@@ -305,17 +305,22 @@ async def get_all_user_aweme_posts(self, sec_user_id: str, callback: Optional[Ca
305305
posts_has_more = aweme_post_res.get("has_more", 0)
306306
max_cursor = aweme_post_res.get("max_cursor")
307307
aweme_list = aweme_post_res.get("aweme_list") if aweme_post_res.get("aweme_list") else []
308-
utils.logger.info(f"[DouYinCrawler.get_all_user_aweme_posts] get sec_user_id:{sec_user_id} video len : {len(aweme_list)}")
308+
utils.logger.info(f"[DouYinClient.get_all_user_aweme_posts] get sec_user_id:{sec_user_id} video len : {len(aweme_list)}")
309309
if callback:
310310
await callback(aweme_list)
311311
result.extend(aweme_list)
312312
return result
313313

314314
async def get_aweme_media(self, url: str) -> Union[bytes, None]:
315315
async with httpx.AsyncClient(proxy=self.proxy) as client:
316-
response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True)
317-
if not response.reason_phrase == "OK":
318-
utils.logger.error(f"[DouYinCrawler.get_aweme_media] request {url} err, res:{response.text}")
316+
try:
317+
response = await client.request("GET", url, timeout=self.timeout, follow_redirects=True)
318+
response.raise_for_status()
319+
if not response.reason_phrase == "OK":
320+
utils.logger.error(f"[DouYinClient.get_aweme_media] request {url} err, res:{response.text}")
321+
return None
322+
else:
323+
return response.content
324+
except httpx.HTTPStatusError as exc: # raised by raise_for_status() on a 4xx/5xx response; connection errors and timeouts (httpx.RequestError) are not caught here
325+
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc}")
319326
return None
320-
else:
321-
return response.content

media_platform/douyin/core.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -362,6 +362,7 @@ async def get_aweme_images(self, aweme_item: Dict):
362362
if not url:
363363
continue
364364
content = await self.dy_client.get_aweme_media(url)
365+
await asyncio.sleep(random.random())
365366
if content is None:
366367
continue
367368
extension_file_name = f"{picNum:>03d}.jpeg"
@@ -385,6 +386,7 @@ async def get_aweme_video(self, aweme_item: Dict):
385386
if not video_download_url:
386387
return
387388
content = await self.dy_client.get_aweme_media(video_download_url)
389+
await asyncio.sleep(random.random())
388390
if content is None:
389391
return
390392
extension_file_name = f"video.mp4"

media_platform/weibo/client.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ class WeiboClient:
3535

3636
def __init__(
3737
self,
38-
timeout=30, # 若开启爬取媒体选项,weibo 的图片需要更久的超时时间
38+
timeout=60, # 若开启爬取媒体选项,weibo 的图片需要更久的超时时间
3939
proxy=None,
4040
*,
4141
headers: Dict[str, str],
@@ -248,12 +248,17 @@ async def get_note_image(self, image_url: str) -> bytes:
248248
final_uri = (f"{self._image_agent_host}"
249249
f"{image_url}")
250250
async with httpx.AsyncClient(proxy=self.proxy) as client:
251-
response = await client.request("GET", final_uri, timeout=self.timeout)
252-
if not response.reason_phrase == "OK":
253-
utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
251+
try:
252+
response = await client.request("GET", final_uri, timeout=self.timeout)
253+
response.raise_for_status()
254+
if not response.reason_phrase == "OK":
255+
utils.logger.error(f"[WeiboClient.get_note_image] request {final_uri} err, res:{response.text}")
256+
return None
257+
else:
258+
return response.content
259+
except httpx.HTTPStatusError as exc: # raised by raise_for_status() on a 4xx/5xx response; connection errors and timeouts (httpx.RequestError) are not caught here
260+
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc}")
254261
return None
255-
else:
256-
return response.content
257262

258263
async def get_creator_container_info(self, creator_id: str) -> Dict:
259264
"""

media_platform/weibo/core.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,7 @@ async def get_note_images(self, mblog: Dict):
250250
if not url:
251251
continue
252252
content = await self.wb_client.get_note_image(url)
253+
await asyncio.sleep(random.random())
253254
if content != None:
254255
extension_file_name = url.split(".")[-1]
255256
await weibo_store.update_weibo_note_image(pic["pid"], content, extension_file_name)

media_platform/xhs/client.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class XiaoHongShuClient(AbstractApiClient):
3232

3333
def __init__(
3434
self,
35-
timeout=30, # 若开启爬取媒体选项,xhs 的长视频需要更久的超时时间
35+
timeout=60, # 若开启爬取媒体选项,xhs 的长视频需要更久的超时时间
3636
proxy=None,
3737
*,
3838
headers: Dict[str, str],
@@ -152,12 +152,17 @@ async def post(self, uri: str, data: dict, **kwargs) -> Dict:
152152

153153
async def get_note_media(self, url: str) -> Union[bytes, None]:
154154
async with httpx.AsyncClient(proxy=self.proxy) as client:
155-
response = await client.request("GET", url, timeout=self.timeout)
156-
if not response.reason_phrase == "OK":
157-
utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
155+
try:
156+
response = await client.request("GET", url, timeout=self.timeout)
157+
response.raise_for_status()
158+
if not response.reason_phrase == "OK":
159+
utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
160+
return None
161+
else:
162+
return response.content
163+
except httpx.HTTPStatusError as exc: # raised by raise_for_status() on a 4xx/5xx response; connection errors and timeouts (httpx.RequestError) are not caught here
164+
utils.logger.error(f"[DouYinClient.get_aweme_media] {exc}")
158165
return None
159-
else:
160-
return response.content
161166

162167
async def pong(self) -> bool:
163168
"""

media_platform/xhs/core.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -453,6 +453,7 @@ async def get_note_images(self, note_item: Dict):
453453
if not url:
454454
continue
455455
content = await self.xhs_client.get_note_media(url)
456+
await asyncio.sleep(random.random())
456457
if content is None:
457458
continue
458459
extension_file_name = f"{picNum}.jpg"
@@ -476,6 +477,7 @@ async def get_notice_video(self, note_item: Dict):
476477
videoNum = 0
477478
for url in videos:
478479
content = await self.xhs_client.get_note_media(url)
480+
await asyncio.sleep(random.random())
479481
if content is None:
480482
continue
481483
extension_file_name = f"{videoNum}.mp4"

0 commit comments

Comments
 (0)