Skip to content

Commit a1c5e07

Browse files
committed
fix: xhs sub comment bugfix #769
1 parent b6caa7a commit a1c5e07

File tree

2 files changed

+55
-39
lines changed

2 files changed

+55
-39
lines changed

main.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,7 @@ async def main():
8888

8989

9090
def cleanup():
91-
if crawler:
92-
# asyncio.run(crawler.close())
91+
if crawler:
9392
pass
9493
if config.SAVE_DATA_OPTION in ["db", "sqlite"]:
9594
asyncio.run(db.close())

media_platform/xhs/client.py

Lines changed: 54 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
import json
1313
import time
1414
from typing import Any, Callable, Dict, List, Optional, Union
15-
from urllib.parse import urlencode
15+
from urllib.parse import urlencode, urlparse, parse_qs
16+
1617

1718
import httpx
1819
from playwright.async_api import BrowserContext, Page
@@ -56,48 +57,49 @@ def __init__(
5657
# 初始化 xhshow 客户端用于签名生成
5758
self._xhshow_client = Xhshow()
5859

59-
async def _pre_headers(self, url: str, data=None) -> Dict:
60-
"""
61-
请求头参数签名,使用 xhshow 库生成签名
60+
async def _pre_headers(self, url: str, params: Optional[Dict] = None, payload: Optional[Dict] = None) -> Dict:
61+
"""请求头参数签名
62+
6263
Args:
63-
url: 完整的 URI(GET 请求包含查询参数)
64-
data: POST 请求的请求体数据
64+
url: 请求的URL(GET请求是包含请求的参数)
65+
params: GET请求的参数
66+
payload: POST请求的参数
6567
6668
Returns:
67-
68-
"""
69-
# 获取 a1 cookie 值
69+
Dict: 请求头参数签名
70+
"""
7071
a1_value = self.cookie_dict.get("a1", "")
71-
72-
# 根据请求类型使用不同的签名方法
73-
if data is None:
74-
# GET 请求:从 url 中提取参数
75-
from urllib.parse import urlparse, parse_qs
76-
parsed = urlparse(url)
77-
params = {k: v[0] if len(v) == 1 else v for k, v in parse_qs(parsed.query).items()}
78-
# 使用完整的 URL(包含 host)
79-
full_url = f"{self._host}{url}"
80-
x_s = self._xhshow_client.sign_xs_get(uri=full_url, a1_value=a1_value, params=params)
72+
parsed = urlparse(url)
73+
uri = parsed.path
74+
if params is not None:
75+
x_s = self._xhshow_client.sign_xs_get(
76+
uri=uri, a1_value=a1_value, params=params
77+
)
78+
elif payload is not None:
79+
x_s = self._xhshow_client.sign_xs_post(
80+
uri=uri, a1_value=a1_value, payload=payload
81+
)
8182
else:
82-
# POST 请求:使用 data 作为 payload
83-
full_url = f"{self._host}{url}"
84-
x_s = self._xhshow_client.sign_xs_post(uri=full_url, a1_value=a1_value, payload=data)
83+
raise ValueError("params or payload is required")
8584

86-
# 尝试获取 b1 值(从 localStorage),如果获取失败则使用空字符串
85+
# 获取 b1 值
8786
b1_value = ""
8887
try:
8988
if self.playwright_page:
90-
local_storage = await self.playwright_page.evaluate("() => window.localStorage")
89+
local_storage = await self.playwright_page.evaluate(
90+
"() => window.localStorage"
91+
)
9192
b1_value = local_storage.get("b1", "")
9293
except Exception as e:
93-
utils.logger.warning(f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}, using empty string")
94+
utils.logger.warning(
95+
f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}"
96+
)
9497

95-
# 使用 sign 函数生成其他签名头
9698
signs = sign(
9799
a1=a1_value,
98100
b1=b1_value,
99101
x_s=x_s,
100-
x_t=str(int(time.time() * 1000)), # x-t 使用毫秒时间戳
102+
x_t=str(int(time.time() * 1000)),
101103
)
102104

103105
headers = {
@@ -145,7 +147,7 @@ async def request(self, method, url, **kwargs) -> Union[str, Any]:
145147
err_msg = data.get("msg", None) or f"{response.text}"
146148
raise DataFetchError(err_msg)
147149

148-
async def get(self, uri: str, params=None) -> Dict:
150+
async def get(self, uri: str, params: Optional[Dict] = None) -> Dict:
149151
"""
150152
GET请求,对请求头签名
151153
Args:
@@ -155,12 +157,18 @@ async def get(self, uri: str, params=None) -> Dict:
155157
Returns:
156158
157159
"""
158-
final_uri = uri
160+
headers = await self._pre_headers(uri, params)
159161
if isinstance(params, dict):
160-
final_uri = f"{uri}?" f"{urlencode(params)}"
161-
headers = await self._pre_headers(final_uri)
162+
# 使用 xhshow build_url 构建完整的 URL
163+
full_url = self._xhshow_client.build_url(
164+
base_url=f"{self._host}{uri}",
165+
params=params
166+
)
167+
else:
168+
full_url = f"{self._host}{uri}"
169+
162170
return await self.request(
163-
method="GET", url=f"{self._host}{final_uri}", headers=headers
171+
method="GET", url=full_url, headers=headers
164172
)
165173

166174
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
@@ -173,8 +181,8 @@ async def post(self, uri: str, data: dict, **kwargs) -> Dict:
173181
Returns:
174182
175183
"""
176-
headers = await self._pre_headers(uri, data)
177-
json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False)
184+
headers = await self._pre_headers(uri, payload=data)
185+
json_str = self._xhshow_client.build_json_body(payload=data)
178186
return await self.request(
179187
method="POST",
180188
url=f"{self._host}{uri}",
@@ -523,8 +531,15 @@ async def get_notes_by_creator(
523531
Returns:
524532
525533
"""
526-
uri = f"/api/sns/web/v1/user_posted?num={page_size}&cursor={cursor}&user_id={creator}&xsec_token={xsec_token}&xsec_source={xsec_source}"
527-
return await self.get(uri)
534+
uri = f"/api/sns/web/v1/user_posted"
535+
params = {
536+
"num": page_size,
537+
"cursor": cursor,
538+
"user_id": creator,
539+
"xsec_token": xsec_token,
540+
"xsec_source": xsec_source,
541+
}
542+
return await self.get(uri, params)
528543

529544
async def get_all_notes_by_creator(
530545
self,
@@ -550,7 +565,9 @@ async def get_all_notes_by_creator(
550565
notes_has_more = True
551566
notes_cursor = ""
552567
while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT:
553-
notes_res = await self.get_notes_by_creator(user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source)
568+
notes_res = await self.get_notes_by_creator(
569+
user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source
570+
)
554571
if not notes_res:
555572
utils.logger.error(
556573
f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data."

0 commit comments

Comments
 (0)