Merge pull request #79 from Dimlitter/rootphantomer/issue78

rootphantomer · web-flow · commit 637e7a989c4f · 2025-05-14T10:23:52.000+08:00
京东（JD）的 UI 发生了变化，评论的 API 接口也随之改变。
diff --git a/auto_comment_plus.py b/auto_comment_plus.py
@@ -20,6 +20,8 @@
 
 import jdspider
 
+# from http2_adapter import Http2Adapter
+
 # constants
 CONFIG_PATH = "./config.yml"
 USER_CONFIG_PATH = "./config.user.yml"
@@ -121,6 +123,9 @@ def download_image(img_url, file_name):
 
 # 上传图片到JD接口
 def upload_image(filename, file_path, session, headers):
+    # session.mount(
+    #     "https://club.jd.com/myJdcomments/ajaxUploadImage.action", Http2Adapter()
+    # )
 
     files = {
         "name": (None, filename),
@@ -171,7 +176,7 @@ def generation(pname, _class: int = 0, _type: int = 1, opts: object = None):
                 "赠品还行。",
             ]
         else:
-            result = spider.getData(4, 3)  # 这里可以自己改
+            result = spider.getData(2, 3)  # 这里可以自己改
         opts["logger"].debug("Result: %s", result)
 
     # class 0是评价 1是提取id
@@ -250,7 +255,7 @@ def delete_jpg():
 
 # 普通评价
 def ordinary(N, opts=None):
-    time.sleep(10)
+    time.sleep(3)
     opts = opts or {}
     Order_data = []
     req_et = []
@@ -359,7 +364,7 @@ def ordinary(N, opts=None):
                 imgurl2 = imgdata["imgComments"]["imgList"][1]["imageUrl"]
                 opts["logger"].info("imgurl2 url: %s", imgurl2)
                 session = requests.Session()
-                imgBasic = "//img14.360buyimg.com/shaidan/"
+                imgBasic = "//img20.360buyimg.com/shaidan/s645x515_"
                 imgName1 = generate_unique_filename()
                 opts["logger"].debug(f"Image :{imgName1}")
                 downloaded_file1 = download_image(imgurl1, imgName1)
@@ -369,7 +374,7 @@ def ordinary(N, opts=None):
                         imgName1, downloaded_file1, session, headers
                     )
                     # print(imgPart1)  # 和上传图片操作
-                    if imgPart1.status_code == 200:
+                    if imgPart1.status_code == 200 and ".jpg" in imgPart1.text:
                         imgurl1 = f"{imgBasic}{imgPart1.text}"
                     else:
                         imgurl1 = ""
@@ -384,7 +389,7 @@ def ordinary(N, opts=None):
                         imgName2, downloaded_file2, session, headers
                     )
                     # print(imgPart2)  # 和上传图片操作
-                    if imgPart2.status_code == 200:
+                    if imgPart2.status_code == 200 and ".jpg" in imgPart2.text:
                         imgurl2 = f"{imgBasic}{imgPart2.text}"
                     else:
                         imgurl2 = ""
@@ -870,7 +875,7 @@ def main(opts=None):
     jdspider.cookie = ck.encode("utf-8")
 
     headers2 = {
-        "cookie": ck.encode("utf-8"),
+        "Cookie": ck.encode("utf-8"),
         "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
         "Chrome/114.0.5735.110 Safari/537.36",
         "Connection": "keep-alive",
@@ -892,25 +897,24 @@ def main(opts=None):
         # 'Content-Type':'application/x-www-form-urlencoded'
     }
     headers = {
-        "cookie": ck.encode("utf-8"),
-        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
-        "Chrome/98.0.4758.82 Safari/537.36",
-        "Connection": "keep-alive",
-        "Cache-Control": "max-age=0",
-        "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
-        "sec-ch-ua-mobile": "?0",
-        "sec-ch-ua-platform": '"Windows"',
+        "Cookie": ck.encode("utf-8"),
+        "User-Agent": '''Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36 Edg/136.0.0.0 Sec-Ch-Ua: "Chromium";v="136", "Microsoft Edge";v="136", "Not.A/Brand";v="99"''',
         "DNT": "1",
-        "Upgrade-Insecure-Requests": "1",
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,"
-        "application/signed-exchange;v=b3;q=0.9",
-        "Sec-Fetch-Site": "same-site",
-        "Sec-Fetch-Mode": "navigate",
-        "Sec-Fetch-User": "?1",
-        "Sec-Fetch-Dest": "document",
-        "Referer": "https://order.jd.com/",
-        "Accept-Encoding": "gzip, deflate, br",
-        "Accept-Language": "zh-CN,zh;q=0.9",
+        # "Connection": "keep-alive",
+        # "Cache-Control": "max-age=0",
+        # "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
+        # "sec-ch-ua-mobile": "?0",
+        # "sec-ch-ua-platform": '"Windows"',
+        # "Upgrade-Insecure-Requests": "1",
+        # "Accept": "*/*",
+        # "Sec-Fetch-Site": "same-site",
+        # "Sec-Fetch-Mode": "navigate",
+        # "origin": "https://club.jd.com",
+        # "Sec-Fetch-User": "?1",
+        # "Sec-Fetch-Dest": "document",
+        # "Referer": "https://order.jd.com/",
+        # "Accept-Encoding": "gzip, deflate, br, zstd",
+        # "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
     }
     logger.debug("Builtin HTTP request header: %s", headers)
 
diff --git a/jdspider.py b/jdspider.py
@@ -31,7 +31,7 @@
 # 定义基础请求头，避免重复代码
 BASE_HEADERS = {
     "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
-              "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
     "accept-encoding": "gzip, deflate, br",
     "accept-language": "zh-CN,zh;q=0.9",
     "cache-control": "max-age=0",
@@ -44,21 +44,23 @@
     "sec-fetch-user": "?1",
     "upgrade-insecure-requests": "1",
     "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
-                  "Chrome/98.0.4758.82 Safari/537.36",
+    "Chrome/98.0.4758.82 Safari/537.36",
 }
 
+
 class JDSpider:
     """
     京东爬虫类，用于爬取指定商品类别的评论信息。
     传入商品类别（如手机、电脑）构造实例，然后调用 getData 方法爬取数据。
     """
+
     def __init__(self, categlory):
         # 京东搜索商品的起始页面 URL
         self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % (
             quote(categlory)
         )
         # 评论接口的基础 URL
-        self.commentBaseUrl = "https://api.m.jd.com/?"
+        self.commentBaseUrl = "https://club.jd.com"
         # 基础请求头
         self.headers = BASE_HEADERS.copy()
         # 带 cookie 的请求头
@@ -91,23 +93,27 @@ def getParamUrl(self, productid: str, page: str, score: str):
         :param score: 评论类型（1 差评，2 中评，3 好评）
         :return: 请求参数和完整 URL
         """
-        params = {
-            "appid": "item-v3",
-            "functionId": "pc_club_productPageComments",
-            "client": "pc",
-            "body": {
-                "productId": productid,
-                "score": score,
-                "sortType": "5",
-                "page": page,
-                "pageSize": "10",
-                "isShadowSku": "0",
-                "rid": "0",
-                "fold": "1",
-            },
-        }
-        default_logger.info("请求参数: " + str(params))
-        url = self.commentBaseUrl + urlencode(params)
+        path = (
+            "/discussion/getProductPageImageCommentList.action?productId=" + productid
+        )
+        params = {}
+        # params = {
+        #     "appid": "item-v3",
+        #     "functionId": "pc_club_productPageComments",
+        #     "client": "pc",
+        #     "body": {
+        #         "productId": productid,
+        #         "score": score,
+        #         "sortType": "5",
+        #         "page": page,
+        #         "pageSize": "10",
+        #         "isShadowSku": "0",
+        #         "rid": "0",
+        #         "fold": "1",
+        #     },
+        # }
+        # default_logger.info("请求参数: " + str(params))
+        url = self.commentBaseUrl + path
         default_logger.info("请求 URL: " + str(url))
         return params, url
 
@@ -120,7 +126,7 @@ def getHeaders(self, productid: str) -> dict:
         return {
             "Referer": f"https://item.jd.com/{productid}.html",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
-                          "Chrome/75.0.3770.142 Safari/537.36",
+            "Chrome/75.0.3770.142 Safari/537.36",
             # "cookie": cookie,
         }
 
@@ -149,7 +155,9 @@ def getData(self, maxPage: int, score: int):
         """
         comments = []
         scores = []
-        default_logger.info("爬取商品数量最多为 8 个，请耐心等待，也可以自行修改 jdspider 文件")
+        default_logger.info(
+            "爬取商品数量最多为 8 个，请耐心等待，也可以自行修改 jdspider 文件"
+        )
 
         # 确定要爬取的商品数量
         product_count = min(len(self.productsId), 8) if self.productsId else 0
@@ -164,7 +172,9 @@ def getData(self, maxPage: int, score: int):
                 default_logger.info(f"正在爬取第 {j + 1} 个商品的第 {i} 页评论信息")
 
                 try:
-                    default_logger.info(f"爬取商品评价的 URL 链接是 {url}，商品的 ID 是：{product_id}")
+                    default_logger.info(
+                        f"爬取商品评价的 URL 链接是 {url}，商品的 ID 是：{product_id}"
+                    )
                     response = requests.get(url, headers=self.getHeaders(product_id))
                     response.raise_for_status()  # 检查响应状态码
                 except requests.RequestException as e:
@@ -183,23 +193,39 @@ def getData(self, maxPage: int, score: int):
                     default_logger.warning(f"JSON 解析异常: {e}")
                     continue
 
-                if not res_json.get("comments"):
-                    default_logger.warning(f"页面次数已到：{i}，超出范围(或未爬取到评论)")
+                if res_json["imgComments"]["imgCommentCount"] == 0:
+                    default_logger.warning(
+                        f"爬取到的商品评价数量为 0，可能是最后一页或请求失败"
+                    )
                     break
 
-                for comment_data in res_json["comments"]:
-                    comment = comment_data["content"].replace("\n", " ").replace("\r", " ")
+                for comment_data in res_json["imgComments"]["imgList"]:
+                    comment = (
+                        comment_data["commentVo"]["content"]
+                        .replace("\n", " ")
+                        .replace("\r", " ")
+                    )
                     comments.append(comment)
-                    scores.append(comment_data["score"])
+                    scores.append(comment_data["commentVo"]["score"])
 
         default_logger.info(f"已爬取 {len(comments)} 条 {self.comtype[score]} 评价信息")
 
         # 处理评论，拆分成句子
         remarks = []
         for comment in comments:
             sentences = re.findall(zhon.hanzi.sentence, comment)
-            if not sentences or sentences in [["。"], ["？"], ["！"], ["."], [","], ["?"], ["!"]]:
-                default_logger.warning(f"拆分失败或结果不符(去除空格和标点符号)：{sentences}")
+            if not sentences or sentences in [
+                ["。"],
+                ["？"],
+                ["！"],
+                ["."],
+                [","],
+                ["?"],
+                ["!"],
+            ]:
+                default_logger.warning(
+                    f"拆分失败或结果不符(去除空格和标点符号)：{sentences}"
+                )
             else:
                 remarks.append(sentences)
 
@@ -243,6 +269,7 @@ def solvedata(self, remarks) -> list:
         default_logger.info("爬取的评价结果：" + str(sentences))
         return sentences
 
+
 # 测试用例
 if __name__ == "__main__":
     jdlist = ["商品名"]