Commit a4d9aaa

refactor: xhs update

1 parent 26a4335

5 files changed: 44 additions & 78 deletions

config/base_config.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@
 # Whether to enable CDP mode - crawl with the user's existing Chrome/Edge browser for better anti-detection
 # When enabled, the user's Chrome/Edge browser is detected and launched automatically and controlled via the CDP protocol
 # This uses a real browser environment, including the user's extensions, cookies, and settings, greatly reducing the risk of detection
-ENABLE_CDP_MODE = False
+ENABLE_CDP_MODE = True

 # CDP debug port, used to communicate with the browser
 # If the port is in use, the system automatically tries the next available port
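Flipping ENABLE_CDP_MODE to True makes CDP the default: instead of launching a fresh Playwright-bundled browser, the crawler attaches to the user's own Chrome/Edge over the DevTools protocol. A minimal sketch of that idea, assuming Chrome was started with --remote-debugging-port=9222 (the port, URL, and function name are illustrative; the project's CDP manager handles detection and port selection itself):

import asyncio
from playwright.async_api import async_playwright

async def attach_over_cdp():
    async with async_playwright() as p:
        # Attach to an already-running browser instead of launching one.
        browser = await p.chromium.connect_over_cdp("http://localhost:9222")
        # Reusing the default context keeps the real profile: extensions,
        # cookies, and settings, which is what lowers detection risk.
        context = browser.contexts[0]
        page = await context.new_page()
        await page.goto("https://www.xiaohongshu.com")

asyncio.run(attach_over_cdp())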

config/xhs_config.py

Lines changed: 0 additions & 3 deletions
@@ -15,9 +15,6 @@
 # Sort order; the concrete enum values live in media_platform/xhs/field.py
 SORT_TYPE = "popularity_descending"

-# Custom User-Agent for xhs
-UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0"
-
 # List of specified note URLs; each must carry the xsec_token parameter
 XHS_SPECIFIED_NOTE_URL_LIST = [
     "https://www.xiaohongshu.com/explore/66fad51c000000001b0224b8?xsec_token=AB3rO-QopW5sgrJ41GwN01WCXh6yWPxjSoFI9D5JIMgKw=&xsec_source=pc_search"

main.py

Lines changed: 2 additions & 1 deletion
@@ -67,7 +67,8 @@ async def main():

 def cleanup():
     if crawler:
-        asyncio.run(crawler.close())
+        # asyncio.run(crawler.close())
+        pass
     if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
         asyncio.run(db.close())
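The close call is commented out rather than fixed; a likely motivation (my assumption, not stated in the commit) is that cleanup() runs at interpreter shutdown, where asyncio.run() raises RuntimeError if an event loop is still running or already closed. A guarded sketch that keeps the close:

def cleanup():
    if crawler:
        try:
            asyncio.run(crawler.close())
        except RuntimeError:
            # Loop already running or closed at shutdown; skip the
            # async close instead of crashing on exit.
            pass
    if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
        asyncio.run(db.close())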

media_platform/xhs/client.py

Lines changed: 3 additions & 4 deletions
@@ -95,17 +95,16 @@ async def request(self, method, url, **kwargs) -> Union[str, Any]:
         """
         # return response.text
         return_response = kwargs.pop("return_response", False)
-
         async with httpx.AsyncClient(proxies=self.proxies) as client:
             response = await client.request(method, url, timeout=self.timeout, **kwargs)

         if response.status_code == 471 or response.status_code == 461:
             # someday someone maybe will bypass captcha
             verify_type = response.headers["Verifytype"]
             verify_uuid = response.headers["Verifyuuid"]
-            raise Exception(
-                f"Captcha appeared, request failed, Verifytype: {verify_type}, Verifyuuid: {verify_uuid}, Response: {response}"
-            )
+            msg = f"Captcha appeared, request failed, Verifytype: {verify_type}, Verifyuuid: {verify_uuid}, Response: {response}"
+            utils.logger.error(msg)
+            raise Exception(msg)

         if return_response:
             return response.text
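Logging before raising preserves the Verifytype/Verifyuuid details in the crawler log even if a retry wrapper later swallows the exception. The core.py hunk below catches tenacity's RetryError, which implies client calls like get_note_by_id sit behind a tenacity @retry decorator; a runnable sketch of that pattern (fetch_note and the retry parameters are assumptions, not the project's actual settings):

import asyncio
from tenacity import RetryError, retry, stop_after_attempt, wait_fixed

@retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
async def fetch_note(note_id: str) -> dict:
    # Stand-in for xhs_client.get_note_by_id: any exception triggers a
    # retry; after the last attempt tenacity raises RetryError instead.
    raise ConnectionError("simulated API failure")

async def main():
    try:
        note = await fetch_note("66fad51c000000001b0224b8")
    except RetryError:
        note = None  # fall through to the HTML fallback, as core.py now does

asyncio.run(main())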

media_platform/xhs/core.py

Lines changed: 38 additions & 69 deletions
@@ -51,11 +51,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
     def __init__(self) -> None:
         self.index_url = "https://www.xiaohongshu.com"
         # self.user_agent = utils.get_user_agent()
-        self.user_agent = (
-            config.UA
-            if config.UA
-            else "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
-        )
+        self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
         self.cdp_manager = None

     async def start(self) -> None:

@@ -91,17 +87,6 @@ async def start(self) -> None:
             )
         # stealth.min.js is a js script to prevent the website from detecting the crawler.
         await self.browser_context.add_init_script(path="libs/stealth.min.js")
-        # add a cookie attribute webId to avoid the appearance of a sliding captcha on the webpage
-        await self.browser_context.add_cookies(
-            [
-                {
-                    "name": "webId",
-                    "value": "xxx123",  # any value
-                    "domain": ".xiaohongshu.com",
-                    "path": "/",
-                }
-            ]
-        )
         self.context_page = await self.browser_context.new_page()
         await self.context_page.goto(self.index_url)

@@ -152,7 +137,7 @@ async def search(self) -> None:
             page = 1
             search_id = get_search_id()
             while (
-                page - start_page + 1
+                    page - start_page + 1
             ) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
                 if page < start_page:
                     utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}")
@@ -294,11 +279,11 @@ async def get_specified_notes(self):
         await self.batch_get_note_comments(need_get_comment_note_ids, xsec_tokens)

     async def get_note_detail_async_task(
-        self,
-        note_id: str,
-        xsec_source: str,
-        xsec_token: str,
-        semaphore: asyncio.Semaphore,
+            self,
+            note_id: str,
+            xsec_source: str,
+            xsec_token: str,
+            semaphore: asyncio.Semaphore,
     ) -> Optional[Dict]:
         """Get note detail
@@ -311,47 +296,31 @@ async def get_note_detail_async_task(
         Returns:
             Dict: note detail
         """
-        note_detail_from_html, note_detail_from_api = None, None
+        note_detail = None
         async with semaphore:
-            # When proxy is not enabled, increase the crawling interval
-            if config.ENABLE_IP_PROXY:
-                crawl_interval = random.random()
-            else:
-                crawl_interval = random.uniform(1, config.CRAWLER_MAX_SLEEP_SEC)
             try:
                 utils.logger.info(
                     f"[get_note_detail_async_task] Begin get note detail, note_id: {note_id}"
                 )
-                # Try the web-page version of the note detail first, carrying the cookie
-                note_detail_from_html: Optional[Dict] = (
-                    await self.xhs_client.get_note_by_id_from_html(
-                        note_id, xsec_source, xsec_token, enable_cookie=True
+
+                try:
+                    note_detail = await self.xhs_client.get_note_by_id(
+                        note_id, xsec_source, xsec_token
                     )
+                except RetryError as e:
+                    pass
+
+                if not note_detail:
+                    note_detail = await self.xhs_client.get_note_by_id_from_html(note_id, xsec_source, xsec_token,
+                                                                                 enable_cookie=False)
+                if not note_detail:
+                    raise Exception(f"[get_note_detail_async_task] Failed to get note detail, Id: {note_id}")
+
+                note_detail.update(
+                    {"xsec_token": xsec_token, "xsec_source": xsec_source}
                 )
-                time.sleep(crawl_interval)
-                if not note_detail_from_html:
-                    # If the web-page version failed, retry without the cookie
-                    note_detail_from_html = (
-                        await self.xhs_client.get_note_by_id_from_html(
-                            note_id, xsec_source, xsec_token, enable_cookie=False
-                        )
-                    )
-                    utils.logger.error(
-                        f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error, note_id: {note_id}"
-                    )
-                if not note_detail_from_html:
-                    # If the web-page version failed as well, fall back to the API
-                    note_detail_from_api: Optional[Dict] = (
-                        await self.xhs_client.get_note_by_id(
-                            note_id, xsec_source, xsec_token
-                        )
-                    )
-                note_detail = note_detail_from_html or note_detail_from_api
-                if note_detail:
-                    note_detail.update(
-                        {"xsec_token": xsec_token, "xsec_source": xsec_source}
-                    )
-                return note_detail
+                return note_detail
+
             except DataFetchError as ex:
                 utils.logger.error(
                     f"[XiaoHongShuCrawler.get_note_detail_async_task] Get note detail error: {ex}"
@@ -364,7 +333,7 @@ async def get_note_detail_async_task(
             return None

     async def batch_get_note_comments(
-        self, note_list: List[str], xsec_tokens: List[str]
+            self, note_list: List[str], xsec_tokens: List[str]
     ):
         """Batch get note comments"""
         if not config.ENABLE_GET_COMMENTS:
@@ -389,7 +358,7 @@ async def batch_get_note_comments(
         await asyncio.gather(*task_list)

     async def get_comments(
-        self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
+            self, note_id: str, xsec_token: str, semaphore: asyncio.Semaphore
     ):
         """Get note comments with keyword filtering and quantity limitation"""
         async with semaphore:
@@ -411,7 +380,7 @@ async def get_comments(

     @staticmethod
     def format_proxy_info(
-        ip_proxy_info: IpInfoModel,
+            ip_proxy_info: IpInfoModel,
     ) -> Tuple[Optional[Dict], Optional[Dict]]:
         """format proxy info for playwright and httpx"""
         playwright_proxy = {
@@ -447,11 +416,11 @@ async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
         return xhs_client_obj

     async def launch_browser(
-        self,
-        chromium: BrowserType,
-        playwright_proxy: Optional[Dict],
-        user_agent: Optional[str],
-        headless: bool = True,
+            self,
+            chromium: BrowserType,
+            playwright_proxy: Optional[Dict],
+            user_agent: Optional[str],
+            headless: bool = True,
     ) -> BrowserContext:
         """Launch browser and create browser context"""
         utils.logger.info(
@@ -480,11 +449,11 @@ async def launch_browser(
         return browser_context

     async def launch_browser_with_cdp(
-        self,
-        playwright: Playwright,
-        playwright_proxy: Optional[Dict],
-        user_agent: Optional[str],
-        headless: bool = True,
+            self,
+            playwright: Playwright,
+            playwright_proxy: Optional[Dict],
+            user_agent: Optional[str],
+            headless: bool = True,
     ) -> BrowserContext:
         """
         Launch the browser in CDP mode
