1212import json
1313import time
1414from typing import Any , Callable , Dict , List , Optional , Union
15- from urllib .parse import urlencode
15+ from urllib .parse import urlencode , urlparse , parse_qs
16+
1617
1718import httpx
1819from playwright .async_api import BrowserContext , Page
@@ -56,48 +57,49 @@ def __init__(
5657 # 初始化 xhshow 客户端用于签名生成
5758 self ._xhshow_client = Xhshow ()
5859
59- async def _pre_headers (self , url : str , data = None ) -> Dict :
60- """
61- 请求头参数签名,使用 xhshow 库生成签名
60+ async def _pre_headers (self , url : str , params : Optional [ Dict ] = None , payload : Optional [ Dict ] = None ) -> Dict :
61+ """请求头参数签名
62+
6263 Args:
63- url: 完整的 URI(GET 请求包含查询参数)
64- data: POST 请求的请求体数据
64+ url: 请求的URL(GET请求是包含请求的参数)
65+ params: GET请求的参数
66+ payload: POST请求的参数
6567
6668 Returns:
67-
68- """
69- # 获取 a1 cookie 值
69+ Dict: 请求头参数签名
70+ """
7071 a1_value = self .cookie_dict .get ("a1" , "" )
71-
72- # 根据请求类型使用不同的签名方法
73- if data is None :
74- # GET 请求:从 url 中提取参数
75- from urllib . parse import urlparse , parse_qs
76- parsed = urlparse ( url )
77- params = { k : v [ 0 ] if len ( v ) == 1 else v for k , v in parse_qs ( parsed . query ). items ()}
78- # 使用完整的 URL(包含 host)
79- full_url = f" { self . _host } { url } "
80- x_s = self . _xhshow_client . sign_xs_get ( uri = full_url , a1_value = a1_value , params = params )
72+ parsed = urlparse ( url )
73+ uri = parsed . path
74+ if params is not None :
75+ x_s = self . _xhshow_client . sign_xs_get (
76+ uri = uri , a1_value = a1_value , params = params
77+ )
78+ elif payload is not None :
79+ x_s = self . _xhshow_client . sign_xs_post (
80+ uri = uri , a1_value = a1_value , payload = payload
81+ )
8182 else :
82- # POST 请求:使用 data 作为 payload
83- full_url = f"{ self ._host } { url } "
84- x_s = self ._xhshow_client .sign_xs_post (uri = full_url , a1_value = a1_value , payload = data )
83+ raise ValueError ("params or payload is required" )
8584
86- # 尝试获取 b1 值(从 localStorage),如果获取失败则使用空字符串
85+ # 获取 b1 值
8786 b1_value = ""
8887 try :
8988 if self .playwright_page :
90- local_storage = await self .playwright_page .evaluate ("() => window.localStorage" )
89+ local_storage = await self .playwright_page .evaluate (
90+ "() => window.localStorage"
91+ )
9192 b1_value = local_storage .get ("b1" , "" )
9293 except Exception as e :
93- utils .logger .warning (f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: { e } , using empty string" )
94+ utils .logger .warning (
95+ f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: { e } "
96+ )
9497
95- # 使用 sign 函数生成其他签名头
9698 signs = sign (
9799 a1 = a1_value ,
98100 b1 = b1_value ,
99101 x_s = x_s ,
100- x_t = str (int (time .time () * 1000 )), # x-t 使用毫秒时间戳
102+ x_t = str (int (time .time () * 1000 )),
101103 )
102104
103105 headers = {
@@ -145,7 +147,7 @@ async def request(self, method, url, **kwargs) -> Union[str, Any]:
145147 err_msg = data .get ("msg" , None ) or f"{ response .text } "
146148 raise DataFetchError (err_msg )
147149
async def get(self, uri: str, params: Optional[Dict] = None) -> Dict:
    """Send a signed GET request to ``uri``.

    Args:
        uri: Request path; it is appended to ``self._host``.
        params: Query parameters. When omitted, the request is signed with
            an empty parameter mapping and sent without a query string.

    Returns:
        Whatever ``self.request`` returns for the GET call.
    """
    # ``_pre_headers`` raises ValueError when both params and payload are
    # None, so a parameterless GET must still hand it a (possibly empty)
    # mapping instead of forwarding None.
    # NOTE(review): assumes xhshow signs an empty mapping the same way as
    # "no query parameters" - confirm against the xhshow implementation.
    sign_params = params if params is not None else {}
    headers = await self._pre_headers(uri, sign_params)

    full_url = f"{self._host}{uri}"
    if isinstance(params, dict):
        # Let xhshow build the final URL so the query string is produced by
        # the same library that generated the signature.
        full_url = self._xhshow_client.build_url(
            base_url=full_url,
            params=params,
        )

    return await self.request(method="GET", url=full_url, headers=headers)
165173
166174 async def post (self , uri : str , data : dict , ** kwargs ) -> Dict :
@@ -173,8 +181,8 @@ async def post(self, uri: str, data: dict, **kwargs) -> Dict:
173181 Returns:
174182
175183 """
176- headers = await self ._pre_headers (uri , data )
177- json_str = json . dumps ( data , separators = ( "," , ":" ), ensure_ascii = False )
184+ headers = await self ._pre_headers (uri , payload = data )
185+ json_str = self . _xhshow_client . build_json_body ( payload = data )
178186 return await self .request (
179187 method = "POST" ,
180188 url = f"{ self ._host } { uri } " ,
@@ -523,8 +531,15 @@ async def get_notes_by_creator(
523531 Returns:
524532
525533 """
526- uri = f"/api/sns/web/v1/user_posted?num={ page_size } &cursor={ cursor } &user_id={ creator } &xsec_token={ xsec_token } &xsec_source={ xsec_source } "
527- return await self .get (uri )
534+ uri = f"/api/sns/web/v1/user_posted"
535+ params = {
536+ "num" : page_size ,
537+ "cursor" : cursor ,
538+ "user_id" : creator ,
539+ "xsec_token" : xsec_token ,
540+ "xsec_source" : xsec_source ,
541+ }
542+ return await self .get (uri , params )
528543
529544 async def get_all_notes_by_creator (
530545 self ,
@@ -550,7 +565,9 @@ async def get_all_notes_by_creator(
550565 notes_has_more = True
551566 notes_cursor = ""
552567 while notes_has_more and len (result ) < config .CRAWLER_MAX_NOTES_COUNT :
553- notes_res = await self .get_notes_by_creator (user_id , notes_cursor , xsec_token = xsec_token , xsec_source = xsec_source )
568+ notes_res = await self .get_notes_by_creator (
569+ user_id , notes_cursor , xsec_token = xsec_token , xsec_source = xsec_source
570+ )
554571 if not notes_res :
555572 utils .logger .error (
556573 f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data."
0 commit comments