Skip to content

Commit 31a092c

Browse files
authored
Merge pull request #782 from NanmiCoder/fix/xhs-sign-20251127
feat: xhs sign playwright version
2 parents 15b98fa + f989ce0 commit 31a092c

File tree

3 files changed

+373
-40
lines changed

3 files changed

+373
-40
lines changed

media_platform/xhs/client.py

Lines changed: 18 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,12 @@
1919

2020
import asyncio
2121
import json
22-
import time
2322
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
24-
from urllib.parse import urlencode, urlparse, parse_qs
25-
23+
from urllib.parse import urlencode
2624

2725
import httpx
2826
from playwright.async_api import BrowserContext, Page
2927
from tenacity import retry, stop_after_attempt, wait_fixed
30-
from xhshow import Xhshow
3128

3229
import config
3330
from base.base_crawler import AbstractApiClient
@@ -39,8 +36,9 @@
3936

4037
from .exception import DataFetchError, IPBlockError
4138
from .field import SearchNoteType, SearchSortType
42-
from .help import get_search_id, sign
39+
from .help import get_search_id
4340
from .extractor import XiaoHongShuExtractor
41+
from .playwright_sign import sign_with_playwright
4442

4543

4644
class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
@@ -67,54 +65,36 @@ def __init__(
6765
self.playwright_page = playwright_page
6866
self.cookie_dict = cookie_dict
6967
self._extractor = XiaoHongShuExtractor()
70-
# 初始化 xhshow 客户端用于签名生成
71-
self._xhshow_client = Xhshow()
7268
# 初始化代理池(来自 ProxyRefreshMixin)
7369
self.init_proxy_pool(proxy_ip_pool)
7470

7571
async def _pre_headers(self, url: str, params: Optional[Dict] = None, payload: Optional[Dict] = None) -> Dict:
76-
"""请求头参数签名
72+
"""请求头参数签名(使用 playwright 注入方式)
7773
7874
Args:
79-
url: 请求的URL(GET请求是包含请求的参数)
75+
url: 请求的URL
8076
params: GET请求的参数
8177
payload: POST请求的参数
8278
8379
Returns:
8480
Dict: 请求头参数签名
8581
"""
8682
a1_value = self.cookie_dict.get("a1", "")
87-
parsed = urlparse(url)
88-
uri = parsed.path
83+
84+
# 确定请求数据和 URI
8985
if params is not None:
90-
x_s = self._xhshow_client.sign_xs_get(
91-
uri=uri, a1_value=a1_value, params=params
92-
)
86+
data = params
9387
elif payload is not None:
94-
x_s = self._xhshow_client.sign_xs_post(
95-
uri=uri, a1_value=a1_value, payload=payload
96-
)
88+
data = payload
9789
else:
9890
raise ValueError("params or payload is required")
9991

100-
# 获取 b1 值
101-
b1_value = ""
102-
try:
103-
if self.playwright_page:
104-
local_storage = await self.playwright_page.evaluate(
105-
"() => window.localStorage"
106-
)
107-
b1_value = local_storage.get("b1", "")
108-
except Exception as e:
109-
utils.logger.warning(
110-
f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}"
111-
)
112-
113-
signs = sign(
92+
# 使用 playwright 注入方式生成签名
93+
signs = await sign_with_playwright(
94+
page=self.playwright_page,
95+
uri=url,
96+
data=data,
11497
a1=a1_value,
115-
b1=b1_value,
116-
x_s=x_s,
117-
x_t=str(int(time.time() * 1000)),
11898
)
11999

120100
headers = {
@@ -177,11 +157,9 @@ async def get(self, uri: str, params: Optional[Dict] = None) -> Dict:
177157
"""
178158
headers = await self._pre_headers(uri, params)
179159
if isinstance(params, dict):
180-
# 使用 xhsshow build_url 构建完整的 URL
181-
full_url = self._xhshow_client.build_url(
182-
base_url=f"{self._host}{uri}",
183-
params=params
184-
)
160+
# 构建带参数的完整 URL
161+
query_string = urlencode(params)
162+
full_url = f"{self._host}{uri}?{query_string}"
185163
else:
186164
full_url = f"{self._host}{uri}"
187165

@@ -200,7 +178,7 @@ async def post(self, uri: str, data: dict, **kwargs) -> Dict:
200178
201179
"""
202180
headers = await self._pre_headers(uri, payload=data)
203-
json_str = self._xhshow_client.build_json_body(payload=data)
181+
json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
204182
return await self.request(
205183
method="POST",
206184
url=f"{self._host}{uri}",
Lines changed: 203 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,203 @@
1+
# -*- coding: utf-8 -*-
2+
# Copyright (c) 2025 [email protected]
3+
#
4+
# This file is part of MediaCrawler project.
5+
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/media_platform/xhs/playwright_sign.py
6+
# GitHub: https://github.com/NanmiCoder
7+
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
8+
#
9+
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
10+
# 1. 不得用于任何商业用途。
11+
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
12+
# 3. 不得进行大规模爬取或对平台造成运营干扰。
13+
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
14+
# 5. 不得用于任何非法或不当的用途。
15+
#
16+
# 详细许可条款请参阅项目根目录下的LICENSE文件。
17+
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
18+
19+
# 通过 Playwright 注入调用 window.mnsv2 生成小红书签名
20+
21+
import hashlib
22+
import json
23+
import time
24+
from typing import Any, Dict, Optional, Union
25+
from urllib.parse import urlparse
26+
27+
from playwright.async_api import Page
28+
29+
from .xhs_sign import b64_encode, encode_utf8, get_trace_id, mrc
30+
31+
32+
def _build_sign_string(uri: str, data: Optional[Union[Dict, str]] = None) -> str:
    """Build the plaintext string that is handed to the signer.

    Args:
        uri: API path the request targets.
        data: Request data. A dict is appended as compact JSON (no spaces
            after the separators, non-ASCII characters kept unescaped); a
            str is appended verbatim; anything else, including None, adds
            nothing.

    Returns:
        ``uri`` followed by the serialized ``data``, if any.
    """
    if isinstance(data, dict):
        suffix = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
    elif isinstance(data, str):
        suffix = data
    else:
        # None and unsupported types contribute nothing to the signed string
        suffix = ""
    return uri + suffix
41+
42+
43+
def _md5_hex(s: str) -> str:
    """Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8."""
    hasher = hashlib.md5()
    hasher.update(s.encode("utf-8"))
    return hasher.hexdigest()
46+
47+
48+
def _build_xs_payload(x3_value: str, data_type: str = "object") -> str:
    """Wrap a raw signature into the final ``x-s`` header value.

    Args:
        x3_value: Value stored under the ``x3`` key of the envelope.
        data_type: Value stored under the ``x4`` key of the envelope.

    Returns:
        ``"XYS_"`` followed by the encoded compact-JSON envelope.
    """
    envelope = {
        "x0": "4.2.1",
        "x1": "xhs-pc-web",
        "x2": "Mac OS",
        "x3": x3_value,
        "x4": data_type,
    }
    compact_json = json.dumps(envelope, separators=(",", ":"))
    encoded = b64_encode(encode_utf8(compact_json))
    return "XYS_" + encoded
58+
59+
60+
def _build_xs_common(a1: str, b1: str, x_s: str, x_t: str) -> str:
    """Assemble the value of the ``x-s-common`` request header.

    Args:
        a1: Stored under ``x5``.
        b1: Stored under ``x8``.
        x_s: The ``x-s`` signature, stored under ``x7``.
        x_t: The timestamp string, stored under ``x6``.

    Returns:
        The encoded compact-JSON form of the field set.
    """
    # x9 is derived from the timestamp, the signature and b1, in that order
    checksum = mrc(x_t + x_s + b1)
    fields = {
        "s0": 3,
        "s1": "",
        "x0": "1",
        "x1": "4.2.2",
        "x2": "Mac OS",
        "x3": "xhs-pc-web",
        "x4": "4.74.0",
        "x5": a1,
        "x6": x_t,
        "x7": x_s,
        "x8": b1,
        "x9": checksum,
        "x10": 154,
        "x11": "normal",
    }
    compact_json = json.dumps(fields, separators=(",", ":"))
    return b64_encode(encode_utf8(compact_json))
79+
80+
81+
async def get_b1_from_localstorage(page: Page) -> str:
    """Read the ``b1`` entry from the page's localStorage.

    This is best-effort: any failure is logged as a warning and an empty
    string is returned, so signing can still proceed without ``b1``.

    Args:
        page: Playwright page whose localStorage is read.

    Returns:
        The stored ``b1`` value, or ``""`` when it is missing or cannot be
        read.
    """
    import logging  # local import: this module defines no module-level logger

    try:
        # Fetch only the key we need instead of serializing all of localStorage
        b1_value = await page.evaluate("() => window.localStorage.getItem('b1')")
    except Exception as exc:
        # Keep the fallback, but leave a trace so a missing b1 is diagnosable
        logging.getLogger(__name__).warning(
            "[get_b1_from_localstorage] Failed to get b1 from localStorage: %s", exc
        )
        return ""
    # getItem yields null (None) for an absent key
    return b1_value or ""
88+
89+
90+
async def call_mnsv2(page: Page, sign_str: str, md5_str: str) -> str:
    """Call the page's ``window.mnsv2`` function through Playwright.

    Args:
        page: Playwright Page object.
        sign_str: String to sign (uri + JSON.stringify(data)).
        md5_str: MD5 hash of ``sign_str``.

    Returns:
        The signature string returned by ``mnsv2``, or ``""`` when the call
        fails or returns a falsy value.
    """
    import logging  # local import: this module defines no module-level logger

    try:
        # Hand the values over as an evaluate() argument instead of splicing
        # them into JavaScript source. Hand-rolled escaping missed characters
        # such as a carriage return, which broke the generated string literal.
        result = await page.evaluate(
            "([signStr, md5Str]) => window.mnsv2(signStr, md5Str)",
            [sign_str, md5_str],
        )
    except Exception as exc:
        # Best-effort by design: callers receive "" and decide what to do
        logging.getLogger(__name__).warning(
            "[call_mnsv2] window.mnsv2 call failed: %s", exc
        )
        return ""
    return result if result else ""
110+
111+
112+
async def sign_xs_with_playwright(
    page: Page,
    uri: str,
    data: Optional[Union[Dict, str]] = None,
) -> str:
    """Generate the ``x-s`` signature by calling into the page via Playwright.

    Args:
        page: Playwright Page object (a Xiaohongshu page must already be open).
        uri: API path, e.g. "/api/sns/web/v1/search/notes".
        data: Request data (params for GET or payload for POST).

    Returns:
        The ``x-s`` signature string.
    """
    plaintext = _build_sign_string(uri, data)
    digest = _md5_hex(plaintext)
    raw_signature = await call_mnsv2(page, plaintext, digest)

    # The envelope records whether the signed data was structured or plain
    if isinstance(data, (dict, list)):
        payload_kind = "object"
    else:
        payload_kind = "string"
    return _build_xs_payload(raw_signature, payload_kind)
133+
134+
135+
async def sign_with_playwright(
    page: Page,
    uri: str,
    data: Optional[Union[Dict, str]] = None,
    a1: str = "",
) -> Dict[str, Any]:
    """Generate the full set of signature values through Playwright.

    Args:
        page: Playwright Page object (a Xiaohongshu page must already be open).
        uri: API path.
        data: Request data.
        a1: The ``a1`` value from the cookies.

    Returns:
        A dict with the keys ``x-s``, ``x-t``, ``x-s-common`` and
        ``x-b3-traceid``.
    """
    b1_value = await get_b1_from_localstorage(page)
    signature = await sign_xs_with_playwright(page, uri, data)
    # Millisecond timestamp, taken after the signature has been produced
    timestamp_ms = str(int(time.time() * 1000))

    common_value = _build_xs_common(a1, b1_value, signature, timestamp_ms)
    trace_id = get_trace_id()
    return {
        "x-s": signature,
        "x-t": timestamp_ms,
        "x-s-common": common_value,
        "x-b3-traceid": trace_id,
    }
163+
164+
165+
async def pre_headers_with_playwright(
    page: Page,
    url: str,
    cookie_dict: Dict[str, str],
    params: Optional[Dict] = None,
    payload: Optional[Dict] = None,
) -> Dict[str, str]:
    """Generate signed request headers through Playwright.

    Intended as a drop-in replacement for the ``_pre_headers`` method in
    client.py.

    Args:
        page: Playwright Page object.
        url: Request URL; only its path component is signed.
        cookie_dict: Cookie dictionary; the ``a1`` entry is used if present.
        params: GET request parameters. Takes precedence over ``payload``.
        payload: POST request payload.

    Returns:
        The signed request header dictionary.

    Raises:
        ValueError: If both ``params`` and ``payload`` are None.
    """
    a1_value = cookie_dict.get("a1", "")
    path = urlparse(url).path

    if params is None and payload is None:
        raise ValueError("params or payload is required")
    request_data = params if params is not None else payload

    signs = await sign_with_playwright(page, path, request_data, a1_value)

    # Map each outgoing header name to the key it is read from in ``signs``
    header_sources = (
        ("X-S", "x-s"),
        ("X-T", "x-t"),
        ("x-S-Common", "x-s-common"),
        ("X-B3-Traceid", "x-b3-traceid"),
    )
    return {header: signs[source] for header, source in header_sources}

0 commit comments

Comments
 (0)