Skip to content

Commit 15b98fa

Browse files
authored
ip proxy expired logic switch
Fix/proxy 20251125
2 parents 1da347c + f1e7124 commit 15b98fa

File tree

20 files changed

+509
-41
lines changed

20 files changed

+509
-41
lines changed

media_platform/bilibili/client.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,22 +24,26 @@
2424
import asyncio
2525
import json
2626
import random
27-
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
27+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union
2828
from urllib.parse import urlencode
2929

3030
import httpx
3131
from playwright.async_api import BrowserContext, Page
3232

3333
import config
3434
from base.base_crawler import AbstractApiClient
35+
from proxy.proxy_mixin import ProxyRefreshMixin
3536
from tools import utils
3637

38+
if TYPE_CHECKING:
39+
from proxy.proxy_ip_pool import ProxyIpPool
40+
3741
from .exception import DataFetchError
3842
from .field import CommentOrderType, SearchOrderType
3943
from .help import BilibiliSign
4044

4145

42-
class BilibiliClient(AbstractApiClient):
46+
class BilibiliClient(AbstractApiClient, ProxyRefreshMixin):
4347

4448
def __init__(
4549
self,
@@ -49,15 +53,21 @@ def __init__(
4953
headers: Dict[str, str],
5054
playwright_page: Page,
5155
cookie_dict: Dict[str, str],
56+
proxy_ip_pool: Optional["ProxyIpPool"] = None,
5257
):
5358
self.proxy = proxy
5459
self.timeout = timeout
5560
self.headers = headers
5661
self._host = "https://api.bilibili.com"
5762
self.playwright_page = playwright_page
5863
self.cookie_dict = cookie_dict
64+
# 初始化代理池(来自 ProxyRefreshMixin)
65+
self.init_proxy_pool(proxy_ip_pool)
5966

6067
async def request(self, method, url, **kwargs) -> Any:
68+
# 每次请求前检测代理是否过期
69+
await self._refresh_proxy_if_expired()
70+
6171
async with httpx.AsyncClient(proxy=self.proxy) as client:
6272
response = await client.request(method, url, timeout=self.timeout, **kwargs)
6373
try:

media_platform/bilibili/core.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,12 +64,13 @@ def __init__(self):
6464
self.index_url = "https://www.bilibili.com"
6565
self.user_agent = utils.get_user_agent()
6666
self.cdp_manager = None
67+
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
6768

6869
async def start(self):
6970
playwright_proxy_format, httpx_proxy_format = None, None
7071
if config.ENABLE_IP_PROXY:
71-
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
72-
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
72+
self.ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
73+
ip_proxy_info: IpInfoModel = await self.ip_proxy_pool.get_proxy()
7374
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
7475

7576
async with async_playwright() as playwright:
@@ -473,6 +474,7 @@ async def create_bilibili_client(self, httpx_proxy: Optional[str]) -> BilibiliCl
473474
},
474475
playwright_page=self.context_page,
475476
cookie_dict=cookie_dict,
477+
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
476478
)
477479
return bilibili_client_obj
478480

media_platform/douyin/client.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,21 +21,25 @@
2121
import copy
2222
import json
2323
import urllib.parse
24-
from typing import Any, Callable, Dict, Union, Optional
24+
from typing import TYPE_CHECKING, Any, Callable, Dict, Union, Optional
2525

2626
import httpx
2727
from playwright.async_api import BrowserContext
2828

2929
from base.base_crawler import AbstractApiClient
30+
from proxy.proxy_mixin import ProxyRefreshMixin
3031
from tools import utils
3132
from var import request_keyword_var
3233

34+
if TYPE_CHECKING:
35+
from proxy.proxy_ip_pool import ProxyIpPool
36+
3337
from .exception import *
3438
from .field import *
3539
from .help import *
3640

3741

38-
class DouYinClient(AbstractApiClient):
42+
class DouYinClient(AbstractApiClient, ProxyRefreshMixin):
3943

4044
def __init__(
4145
self,
@@ -45,13 +49,16 @@ def __init__(
4549
headers: Dict,
4650
playwright_page: Optional[Page],
4751
cookie_dict: Dict,
52+
proxy_ip_pool: Optional["ProxyIpPool"] = None,
4853
):
4954
self.proxy = proxy
5055
self.timeout = timeout
5156
self.headers = headers
5257
self._host = "https://www.douyin.com"
5358
self.playwright_page = playwright_page
5459
self.cookie_dict = cookie_dict
60+
# 初始化代理池(来自 ProxyRefreshMixin)
61+
self.init_proxy_pool(proxy_ip_pool)
5562

5663
async def __process_req_params(
5764
self,
@@ -106,6 +113,9 @@ async def __process_req_params(
106113
params["a_bogus"] = a_bogus
107114

108115
async def request(self, method, url, **kwargs):
116+
# 每次请求前检测代理是否过期
117+
await self._refresh_proxy_if_expired()
118+
109119
async with httpx.AsyncClient(proxy=self.proxy) as client:
110120
response = await client.request(method, url, timeout=self.timeout, **kwargs)
111121
try:

media_platform/douyin/core.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -55,12 +55,13 @@ class DouYinCrawler(AbstractCrawler):
5555
def __init__(self) -> None:
5656
self.index_url = "https://www.douyin.com"
5757
self.cdp_manager = None
58+
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
5859

5960
async def start(self) -> None:
6061
playwright_proxy_format, httpx_proxy_format = None, None
6162
if config.ENABLE_IP_PROXY:
62-
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
63-
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
63+
self.ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
64+
ip_proxy_info: IpInfoModel = await self.ip_proxy_pool.get_proxy()
6465
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
6566

6667
async with async_playwright() as playwright:
@@ -305,6 +306,7 @@ async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DouYinClient
305306
},
306307
playwright_page=self.context_page,
307308
cookie_dict=cookie_dict,
309+
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
308310
)
309311
return douyin_client
310312

media_platform/kuaishou/client.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,21 +21,25 @@
2121
# -*- coding: utf-8 -*-
2222
import asyncio
2323
import json
24-
from typing import Any, Callable, Dict, List, Optional
24+
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
2525
from urllib.parse import urlencode
2626

2727
import httpx
2828
from playwright.async_api import BrowserContext, Page
2929

3030
import config
3131
from base.base_crawler import AbstractApiClient
32+
from proxy.proxy_mixin import ProxyRefreshMixin
3233
from tools import utils
3334

35+
if TYPE_CHECKING:
36+
from proxy.proxy_ip_pool import ProxyIpPool
37+
3438
from .exception import DataFetchError
3539
from .graphql import KuaiShouGraphQL
3640

3741

38-
class KuaiShouClient(AbstractApiClient):
42+
class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin):
3943
def __init__(
4044
self,
4145
timeout=10,
@@ -44,6 +48,7 @@ def __init__(
4448
headers: Dict[str, str],
4549
playwright_page: Page,
4650
cookie_dict: Dict[str, str],
51+
proxy_ip_pool: Optional["ProxyIpPool"] = None,
4752
):
4853
self.proxy = proxy
4954
self.timeout = timeout
@@ -52,8 +57,13 @@ def __init__(
5257
self.playwright_page = playwright_page
5358
self.cookie_dict = cookie_dict
5459
self.graphql = KuaiShouGraphQL()
60+
# 初始化代理池(来自 ProxyRefreshMixin)
61+
self.init_proxy_pool(proxy_ip_pool)
5562

5663
async def request(self, method, url, **kwargs) -> Any:
64+
# 每次请求前检测代理是否过期
65+
await self._refresh_proxy_if_expired()
66+
5767
async with httpx.AsyncClient(proxy=self.proxy) as client:
5868
response = await client.request(method, url, timeout=self.timeout, **kwargs)
5969
data: Dict = response.json()

media_platform/kuaishou/core.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,14 +58,15 @@ def __init__(self):
5858
self.index_url = "https://www.kuaishou.com"
5959
self.user_agent = utils.get_user_agent()
6060
self.cdp_manager = None
61+
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
6162

6263
async def start(self):
6364
playwright_proxy_format, httpx_proxy_format = None, None
6465
if config.ENABLE_IP_PROXY:
65-
ip_proxy_pool = await create_ip_pool(
66+
self.ip_proxy_pool = await create_ip_pool(
6667
config.IP_PROXY_POOL_COUNT, enable_validate_ip=True
6768
)
68-
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
69+
ip_proxy_info: IpInfoModel = await self.ip_proxy_pool.get_proxy()
6970
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(
7071
ip_proxy_info
7172
)
@@ -317,6 +318,7 @@ async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
317318
},
318319
playwright_page=self.context_page,
319320
cookie_dict=cookie_dict,
321+
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
320322
)
321323
return ks_client_obj
322324

media_platform/tieba/client.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,24 @@ def _sync_request(self, method, url, proxy=None, **kwargs):
8989
)
9090
return response
9191

92+
async def _refresh_proxy_if_expired(self) -> None:
93+
"""
94+
检测代理是否过期,如果过期则自动刷新
95+
"""
96+
if self.ip_pool is None:
97+
return
98+
99+
if self.ip_pool.is_current_proxy_expired():
100+
utils.logger.info(
101+
"[BaiduTieBaClient._refresh_proxy_if_expired] Proxy expired, refreshing..."
102+
)
103+
new_proxy = await self.ip_pool.get_or_refresh_proxy()
104+
# 更新代理URL
105+
_, self.default_ip_proxy = utils.format_proxy_info(new_proxy)
106+
utils.logger.info(
107+
f"[BaiduTieBaClient._refresh_proxy_if_expired] New proxy: {new_proxy.ip}:{new_proxy.port}"
108+
)
109+
92110
@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
93111
async def request(self, method, url, return_ori_content=False, proxy=None, **kwargs) -> Union[str, Any]:
94112
"""
@@ -103,6 +121,9 @@ async def request(self, method, url, return_ori_content=False, proxy=None, **kwa
103121
Returns:
104122
105123
"""
124+
# 每次请求前检测代理是否过期
125+
await self._refresh_proxy_if_expired()
126+
106127
actual_proxy = proxy if proxy else self.default_ip_proxy
107128

108129
# 在线程池中执行同步的requests请求

media_platform/weibo/client.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
import copy
2727
import json
2828
import re
29-
from typing import Callable, Dict, List, Optional, Union
29+
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
3030
from urllib.parse import parse_qs, unquote, urlencode
3131

3232
import httpx
@@ -35,13 +35,17 @@
3535
from tenacity import retry, stop_after_attempt, wait_fixed
3636

3737
import config
38+
from proxy.proxy_mixin import ProxyRefreshMixin
3839
from tools import utils
3940

41+
if TYPE_CHECKING:
42+
from proxy.proxy_ip_pool import ProxyIpPool
43+
4044
from .exception import DataFetchError
4145
from .field import SearchType
4246

4347

44-
class WeiboClient:
48+
class WeiboClient(ProxyRefreshMixin):
4549

4650
def __init__(
4751
self,
@@ -51,6 +55,7 @@ def __init__(
5155
headers: Dict[str, str],
5256
playwright_page: Page,
5357
cookie_dict: Dict[str, str],
58+
proxy_ip_pool: Optional["ProxyIpPool"] = None,
5459
):
5560
self.proxy = proxy
5661
self.timeout = timeout
@@ -59,9 +64,14 @@ def __init__(
5964
self.playwright_page = playwright_page
6065
self.cookie_dict = cookie_dict
6166
self._image_agent_host = "https://i1.wp.com/"
67+
# 初始化代理池(来自 ProxyRefreshMixin)
68+
self.init_proxy_pool(proxy_ip_pool)
6269

6370
@retry(stop=stop_after_attempt(5), wait=wait_fixed(3))
6471
async def request(self, method, url, **kwargs) -> Union[Response, Dict]:
72+
# 每次请求前检测代理是否过期
73+
await self._refresh_proxy_if_expired()
74+
6575
enable_return_response = kwargs.pop("return_response", False)
6676
async with httpx.AsyncClient(proxy=self.proxy) as client:
6777
response = await client.request(method, url, timeout=self.timeout, **kwargs)

media_platform/weibo/core.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,13 @@ def __init__(self):
6363
self.user_agent = utils.get_user_agent()
6464
self.mobile_user_agent = utils.get_mobile_user_agent()
6565
self.cdp_manager = None
66+
self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新
6667

6768
async def start(self):
6869
playwright_proxy_format, httpx_proxy_format = None, None
6970
if config.ENABLE_IP_PROXY:
70-
ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
71-
ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
71+
self.ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
72+
ip_proxy_info: IpInfoModel = await self.ip_proxy_pool.get_proxy()
7273
playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info)
7374

7475
async with async_playwright() as playwright:
@@ -334,6 +335,7 @@ async def create_weibo_client(self, httpx_proxy: Optional[str]) -> WeiboClient:
334335
},
335336
playwright_page=self.context_page,
336337
cookie_dict=cookie_dict,
338+
proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新
337339
)
338340
return weibo_client_obj
339341

0 commit comments

Comments
 (0)