Skip to content

Commit 637e7a9

Browse files
Merge pull request #79 from Dimlitter/rootphantomer/issue78
JD 的 UI 改变,同时也改变了评论的 API 接口。
2 parents 1301b74 + 89ce55b commit 637e7a9

File tree

2 files changed

+85
-54
lines changed

2 files changed

+85
-54
lines changed

auto_comment_plus.py

Lines changed: 28 additions & 24 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,8 @@
2020

2121
import jdspider
2222

23+
# from http2_adapter import Http2Adapter
24+
2325
# constants
2426
CONFIG_PATH = "./config.yml"
2527
USER_CONFIG_PATH = "./config.user.yml"
@@ -121,6 +123,9 @@ def download_image(img_url, file_name):
121123

122124
# 上传图片到JD接口
123125
def upload_image(filename, file_path, session, headers):
126+
# session.mount(
127+
# "https://club.jd.com/myJdcomments/ajaxUploadImage.action", Http2Adapter()
128+
# )
124129

125130
files = {
126131
"name": (None, filename),
@@ -171,7 +176,7 @@ def generation(pname, _class: int = 0, _type: int = 1, opts: object = None):
171176
"赠品还行。",
172177
]
173178
else:
174-
result = spider.getData(4, 3) # 这里可以自己改
179+
result = spider.getData(2, 3) # 这里可以自己改
175180
opts["logger"].debug("Result: %s", result)
176181

177182
# class 0是评价 1是提取id
@@ -250,7 +255,7 @@ def delete_jpg():
250255

251256
# 普通评价
252257
def ordinary(N, opts=None):
253-
time.sleep(10)
258+
time.sleep(3)
254259
opts = opts or {}
255260
Order_data = []
256261
req_et = []
@@ -359,7 +364,7 @@ def ordinary(N, opts=None):
359364
imgurl2 = imgdata["imgComments"]["imgList"][1]["imageUrl"]
360365
opts["logger"].info("imgurl2 url: %s", imgurl2)
361366
session = requests.Session()
362-
imgBasic = "//img14.360buyimg.com/shaidan/"
367+
imgBasic = "//img20.360buyimg.com/shaidan/s645x515_"
363368
imgName1 = generate_unique_filename()
364369
opts["logger"].debug(f"Image :{imgName1}")
365370
downloaded_file1 = download_image(imgurl1, imgName1)
@@ -369,7 +374,7 @@ def ordinary(N, opts=None):
369374
imgName1, downloaded_file1, session, headers
370375
)
371376
# print(imgPart1) # 和上传图片操作
372-
if imgPart1.status_code == 200:
377+
if imgPart1.status_code == 200 and ".jpg" in imgPart1.text:
373378
imgurl1 = f"{imgBasic}{imgPart1.text}"
374379
else:
375380
imgurl1 = ""
@@ -384,7 +389,7 @@ def ordinary(N, opts=None):
384389
imgName2, downloaded_file2, session, headers
385390
)
386391
# print(imgPart2) # 和上传图片操作
387-
if imgPart2.status_code == 200:
392+
if imgPart2.status_code == 200 and ".jpg" in imgPart2.text:
388393
imgurl2 = f"{imgBasic}{imgPart2.text}"
389394
else:
390395
imgurl2 = ""
@@ -870,7 +875,7 @@ def main(opts=None):
870875
jdspider.cookie = ck.encode("utf-8")
871876

872877
headers2 = {
873-
"cookie": ck.encode("utf-8"),
878+
"Cookie": ck.encode("utf-8"),
874879
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
875880
"Chrome/114.0.5735.110 Safari/537.36",
876881
"Connection": "keep-alive",
@@ -892,25 +897,24 @@ def main(opts=None):
892897
# 'Content-Type':'application/x-www-form-urlencoded'
893898
}
894899
headers = {
895-
"cookie": ck.encode("utf-8"),
896-
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
897-
"Chrome/98.0.4758.82 Safari/537.36",
898-
"Connection": "keep-alive",
899-
"Cache-Control": "max-age=0",
900-
"sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
901-
"sec-ch-ua-mobile": "?0",
902-
"sec-ch-ua-platform": '"Windows"',
900+
"Cookie": ck.encode("utf-8"),
901+
"User-Agent": '''Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0 Sec-Ch-Ua: "Chromium";v="136", "Microsoft Edge";v="136", "Not.A/Brand";v="99"''',
903902
"DNT": "1",
904-
"Upgrade-Insecure-Requests": "1",
905-
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,"
906-
"application/signed-exchange;v=b3;q=0.9",
907-
"Sec-Fetch-Site": "same-site",
908-
"Sec-Fetch-Mode": "navigate",
909-
"Sec-Fetch-User": "?1",
910-
"Sec-Fetch-Dest": "document",
911-
"Referer": "https://order.jd.com/",
912-
"Accept-Encoding": "gzip, deflate, br",
913-
"Accept-Language": "zh-CN,zh;q=0.9",
903+
# "Connection": "keep-alive",
904+
# "Cache-Control": "max-age=0",
905+
# "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
906+
# "sec-ch-ua-mobile": "?0",
907+
# "sec-ch-ua-platform": '"Windows"',
908+
# "Upgrade-Insecure-Requests": "1",
909+
# "Accept": "*/*",
910+
# "Sec-Fetch-Site": "same-site",
911+
# "Sec-Fetch-Mode": "navigate",
912+
# "origin": "https://club.jd.com",
913+
# "Sec-Fetch-User": "?1",
914+
# "Sec-Fetch-Dest": "document",
915+
# "Referer": "https://order.jd.com/",
916+
# "Accept-Encoding": "gzip, deflate, br, zstd",
917+
# "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
914918
}
915919
logger.debug("Builtin HTTP request header: %s", headers)
916920

jdspider.py

Lines changed: 57 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@
3131
# 定义基础请求头,避免重复代码
3232
BASE_HEADERS = {
3333
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
34-
"*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
34+
"*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
3535
"accept-encoding": "gzip, deflate, br",
3636
"accept-language": "zh-CN,zh;q=0.9",
3737
"cache-control": "max-age=0",
@@ -44,21 +44,23 @@
4444
"sec-fetch-user": "?1",
4545
"upgrade-insecure-requests": "1",
4646
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
47-
"Chrome/98.0.4758.82 Safari/537.36",
47+
"Chrome/98.0.4758.82 Safari/537.36",
4848
}
4949

50+
5051
class JDSpider:
5152
"""
5253
京东爬虫类,用于爬取指定商品类别的评论信息。
5354
传入商品类别(如手机、电脑)构造实例,然后调用 getData 方法爬取数据。
5455
"""
56+
5557
def __init__(self, categlory):
5658
# 京东搜索商品的起始页面 URL
5759
self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % (
5860
quote(categlory)
5961
)
6062
# 评论接口的基础 URL
61-
self.commentBaseUrl = "https://api.m.jd.com/?"
63+
self.commentBaseUrl = "https://club.jd.com"
6264
# 基础请求头
6365
self.headers = BASE_HEADERS.copy()
6466
# 带 cookie 的请求头
@@ -91,23 +93,27 @@ def getParamUrl(self, productid: str, page: str, score: str):
9193
:param score: 评论类型(1 差评,2 中评,3 好评)
9294
:return: 请求参数和完整 URL
9395
"""
94-
params = {
95-
"appid": "item-v3",
96-
"functionId": "pc_club_productPageComments",
97-
"client": "pc",
98-
"body": {
99-
"productId": productid,
100-
"score": score,
101-
"sortType": "5",
102-
"page": page,
103-
"pageSize": "10",
104-
"isShadowSku": "0",
105-
"rid": "0",
106-
"fold": "1",
107-
},
108-
}
109-
default_logger.info("请求参数: " + str(params))
110-
url = self.commentBaseUrl + urlencode(params)
96+
path = (
97+
"/discussion/getProductPageImageCommentList.action?productId=" + productid
98+
)
99+
params = {}
100+
# params = {
101+
# "appid": "item-v3",
102+
# "functionId": "pc_club_productPageComments",
103+
# "client": "pc",
104+
# "body": {
105+
# "productId": productid,
106+
# "score": score,
107+
# "sortType": "5",
108+
# "page": page,
109+
# "pageSize": "10",
110+
# "isShadowSku": "0",
111+
# "rid": "0",
112+
# "fold": "1",
113+
# },
114+
# }
115+
# default_logger.info("请求参数: " + str(params))
116+
url = self.commentBaseUrl + path
111117
default_logger.info("请求 URL: " + str(url))
112118
return params, url
113119

@@ -120,7 +126,7 @@ def getHeaders(self, productid: str) -> dict:
120126
return {
121127
"Referer": f"https://item.jd.com/{productid}.html",
122128
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
123-
"Chrome/75.0.3770.142 Safari/537.36",
129+
"Chrome/75.0.3770.142 Safari/537.36",
124130
# "cookie": cookie,
125131
}
126132

@@ -149,7 +155,9 @@ def getData(self, maxPage: int, score: int):
149155
"""
150156
comments = []
151157
scores = []
152-
default_logger.info("爬取商品数量最多为 8 个,请耐心等待,也可以自行修改 jdspider 文件")
158+
default_logger.info(
159+
"爬取商品数量最多为 8 个,请耐心等待,也可以自行修改 jdspider 文件"
160+
)
153161

154162
# 确定要爬取的商品数量
155163
product_count = min(len(self.productsId), 8) if self.productsId else 0
@@ -164,7 +172,9 @@ def getData(self, maxPage: int, score: int):
164172
default_logger.info(f"正在爬取第 {j + 1} 个商品的第 {i} 页评论信息")
165173

166174
try:
167-
default_logger.info(f"爬取商品评价的 URL 链接是 {url},商品的 ID 是:{product_id}")
175+
default_logger.info(
176+
f"爬取商品评价的 URL 链接是 {url},商品的 ID 是:{product_id}"
177+
)
168178
response = requests.get(url, headers=self.getHeaders(product_id))
169179
response.raise_for_status() # 检查响应状态码
170180
except requests.RequestException as e:
@@ -183,23 +193,39 @@ def getData(self, maxPage: int, score: int):
183193
default_logger.warning(f"JSON 解析异常: {e}")
184194
continue
185195

186-
if not res_json.get("comments"):
187-
default_logger.warning(f"页面次数已到:{i},超出范围(或未爬取到评论)")
196+
if res_json["imgComments"]["imgCommentCount"] == 0:
197+
default_logger.warning(
198+
f"爬取到的商品评价数量为 0,可能是最后一页或请求失败"
199+
)
188200
break
189201

190-
for comment_data in res_json["comments"]:
191-
comment = comment_data["content"].replace("\n", " ").replace("\r", " ")
202+
for comment_data in res_json["imgComments"]["imgList"]:
203+
comment = (
204+
comment_data["commentVo"]["content"]
205+
.replace("\n", " ")
206+
.replace("\r", " ")
207+
)
192208
comments.append(comment)
193-
scores.append(comment_data["score"])
209+
scores.append(comment_data["commentVo"]["score"])
194210

195211
default_logger.info(f"已爬取 {len(comments)}{self.comtype[score]} 评价信息")
196212

197213
# 处理评论,拆分成句子
198214
remarks = []
199215
for comment in comments:
200216
sentences = re.findall(zhon.hanzi.sentence, comment)
201-
if not sentences or sentences in [["。"], ["?"], ["!"], ["."], [","], ["?"], ["!"]]:
202-
default_logger.warning(f"拆分失败或结果不符(去除空格和标点符号):{sentences}")
217+
if not sentences or sentences in [
218+
["。"],
219+
["?"],
220+
["!"],
221+
["."],
222+
[","],
223+
["?"],
224+
["!"],
225+
]:
226+
default_logger.warning(
227+
f"拆分失败或结果不符(去除空格和标点符号):{sentences}"
228+
)
203229
else:
204230
remarks.append(sentences)
205231

@@ -243,6 +269,7 @@ def solvedata(self, remarks) -> list:
243269
default_logger.info("爬取的评价结果:" + str(sentences))
244270
return sentences
245271

272+
246273
# 测试用例
247274
if __name__ == "__main__":
248275
jdlist = ["商品名"]

0 commit comments

Comments
 (0)