 import zhon.hanzi
 from lxml import etree

-# Reference: https://github.com/fxsjy/jieba/blob/1e20c89b66f56c9301b0feed211733ffaa1bd72a/jieba/__init__.py#L27
+# Load the configuration file
 with open("./config.yml", "r", encoding="utf-8") as f:
     cfg = yaml.safe_load(f)

+# Read the user cookie from the config
 cookie = cfg["user"]["cookie"]
+
+# Route log output to stderr
 log_console = logging.StreamHandler(sys.stderr)
 default_logger = logging.getLogger("jdspider")
 default_logger.setLevel(logging.DEBUG)
 default_logger.addHandler(log_console)

+# Shared base request headers, to avoid duplicated code
+BASE_HEADERS = {
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
+    "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    "accept-encoding": "gzip, deflate, br",
+    "accept-language": "zh-CN,zh;q=0.9",
+    "cache-control": "max-age=0",
+    "dnt": "1",
+    "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": '"Windows"',
+    "sec-fetch-dest": "document",
+    "sec-fetch-site": "none",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/98.0.4758.82 Safari/537.36",
+}
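+# Request-specific header dicts are derived from these defaults via .copy()
+# or dict unpacking (see JDSpider.__init__ below).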

 class JDSpider:
-    # Spider implementation: construct an instance with a product category (e.g. phones, laptops), then call getData to crawl the data.
+    """
+    JD.com spider for crawling review data for a given product category.
+    Construct with a category (e.g. phones, laptops), then call getData.
+    """
     def __init__(self, categlory):
-        # JD search start page
+        # Start URL for a JD product search
        self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % (
             quote(categlory)
         )
+        # Base URL of the review API
         self.commentBaseUrl = "https://api.m.jd.com/?"
-        self.headers = {
-            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
-            "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-            "accept-encoding": "gzip, deflate, br",
-            "accept-language": "zh-CN,zh;q=0.9",
-            "cache-control": "max-age=0",
-            "dnt": "1",
-            "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
-            "sec-ch-ua-mobile": "?0",
-            "sec-ch-ua-platform": '"Windows"',
-            "sec-fetch-dest": "document",
-            "sec-fetch-site": "none",
-            "sec-fetch-user": "?1",
-            "upgrade-insecure-requests": "1",
-            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/98.0.4758.82 Safari/537.36",
-        }
+        # Default request headers
+        self.headers = BASE_HEADERS.copy()
+        # Request headers that carry the cookie
         self.headers2 = {
+            **BASE_HEADERS,
             "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
             "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
-            "cache-control": "max-age=0",
             "cookie": cookie,
-            "dnt": "1",
             "priority": "u=0, i",
             "sec-ch-ua": '"Microsoft Edge";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
             "sec-ch-ua-mobile": "?0",
             "sec-ch-ua-platform": '"macOS"',
-            "sec-fetch-dest": "document",
             "sec-fetch-mode": "navigate",
-            "sec-fetch-site": "none",
-            "sec-fetch-user": "?1",
-            "upgrade-insecure-requests": "1",
             "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0",
         }
+        # Fetch the list of product IDs
         self.productsId = self.getId()
-        self.comtype = {1: "nagetive", 2: "medium", 3: "positive"}
+        # Review type mapping: 1 negative, 2 medium, 3 positive
+        self.comtype = {1: "negative", 2: "medium", 3: "positive"}  # spelling fixed
+        # Product category
         self.categlory = categlory
+        # Proxy IP lists (currently empty)
         self.iplist = {"http": [], "https": []}
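+        # (Assumption: a populated pool would be passed to requests via its
+        # proxies= argument; nothing in this commit fills or uses it.)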

     def getParamUrl(self, productid: str, page: str, score: str):
+        """
+        Build the request parameters and the full URL for the review API.
+        :param productid: product ID
+        :param page: review page number
+        :param score: review type (1 negative, 2 medium, 3 positive)
+        :return: request params and full URL
+        """
         params = {
             "appid": "item-v3",
             "functionId": "pc_club_productPageComments",
             "client": "pc",
-            "body": {  # Controls the page number and page size; essential, or JD detects the crawler and returns no data.
-                "productId": "%s" % productid,
-                "score": "%s" % score,  # 1 negative, 2 medium, 3 positive
+            "body": {
+                "productId": productid,
+                "score": score,
                 "sortType": "5",
-                "page": "%s" % page,
+                "page": page,
                 "pageSize": "10",
                 "isShadowSku": "0",
                 "rid": "0",
                 "fold": "1",
             },
         }
-        default_logger.info("params: " + str(params))
+        default_logger.info("请求参数: " + str(params))
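+        # Note: urlencode() serializes the nested "body" dict via str(), i.e.
+        # Python-repr quoting rather than JSON; the endpoint appears to accept
+        # this form. json.dumps(params["body"]) would be the stricter choice
+        # (an untested assumption, not part of this commit).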
         url = self.commentBaseUrl + urlencode(params)
-        default_logger.info("url: " + str(url))
+        default_logger.info("请求 URL: " + str(url))
         return params, url

-    def getHeaders(
-        self, productid: str
-    ) -> (
-        dict
-    ):  # Unlike the initial self.headers, this is the header for crawling one product, with its ID added; unsure what removing it would change.
-        header = {
-            "Referer": "https://item.jd.com/%s.html" % productid,
+    def getHeaders(self, productid: str) -> dict:
+        """
+        Build the request headers needed to crawl a given product's reviews.
+        :param productid: product ID
+        :return: header dict
+        """
+        return {
+            "Referer": f"https://item.jd.com/{productid}.html",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/75.0.3770.142 Safari/537.36",
+                "Chrome/75.0.3770.142 Safari/537.36",
             # "cookie": cookie,
         }
-        return header

-    def getId(
-        self,
-    ) -> (
-        list
-    ):  # Get the product IDs used to build each product page URL; results are stored in the self.productId list.
-        response = requests.get(self.startUrl, headers=self.headers2)
-        default_logger.info("获取同类产品的搜索 url 结果:" + self.startUrl)
-        if response.status_code != 200:
-            default_logger.warning("状态码错误,爬虫连接异常!")
+    def getId(self) -> list:
+        """
+        Fetch the list of product IDs from the JD search page.
+        :return: list of product IDs
+        """
+        try:
+            response = requests.get(self.startUrl, headers=self.headers2)
+            response.raise_for_status()  # check the HTTP status code
+            default_logger.info("获取同类产品的搜索 URL 结果:" + self.startUrl)
+        except requests.RequestException as e:
+            default_logger.warning(f"请求异常,状态码错误,爬虫连接异常!错误信息: {e}")
+            return []
+
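+        # Each result card on the search page is an <li class="gl-item"> whose
+        # data-sku attribute carries the product ID (per JD's markup at the
+        # time of this commit; the XPath below breaks if that layout changes).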
         html = etree.HTML(response.text)
         return html.xpath('//li[@class="gl-item"]/@data-sku')

-    def getData(
-        self,
-        maxPage: int,
-        score: int,
-    ):  # maxPage is the maximum number of review pages to crawl, 10 reviews per page. The usual page limit differs by review type; generally positive >> negative > medium.
-        # Out-of-range pages break out of the loop automatically, so a large maxPage is harmless.
-        # score selects the review type: 3 positive, 2 medium, 1 negative.
-
+    def getData(self, maxPage: int, score: int):
+        """
+        Crawl review data for the given product category.
+        :param maxPage: maximum number of pages to crawl, 10 reviews per page
+        :param score: review type (1 negative, 2 medium, 3 positive)
+        :return: processed list of reviews
+        """
         comments = []
         scores = []
-        default_logger.info(
-            "爬取商品数量最多为8个,请耐心等待,也可以自行修改jdspider文件"
-        )
-        if len(self.productsId) == 0:
-            default_logger.warning(f"完了,self.productsId是空的,后面会导致默认评价了")
-            sum_ = 0
-        elif 0 < len(self.productsId) < 8:  # limit the number of products
-            sum_ = len(self.productsId)
-        else:
-            sum_: int = 3
-        default_logger.info("sum_:" + str(sum_))
-        for j in range(sum_):
-            id_: str = self.productsId[j]
-            # header = self.getHeaders(id)
+        default_logger.info("爬取商品数量最多为 8 个,请耐心等待,也可以自行修改 jdspider 文件")
+
+        # Decide how many products to crawl
+        product_count = min(len(self.productsId), 8) if self.productsId else 0
+        if product_count == 0:
+            default_logger.warning("self.productsId 为空,将使用默认评价")
+        default_logger.info("要爬取的商品数量: " + str(product_count))
+
+        for j in range(product_count):
+            product_id = self.productsId[j]
             for i in range(1, maxPage):
-                param, url = self.getParamUrl(id_, str(i), str(score))
-                default_logger.info(
-                    f"正在爬取当前商品的评论信息>>>>>>>>>第:%d 个,第 %d 页"
-                    % (j + 1, i)
-                )
+                params, url = self.getParamUrl(product_id, str(i), str(score))
+                default_logger.info(f"正在爬取第 {j + 1} 个商品的第 {i} 页评论信息")
+
                 try:
-                    default_logger.info(
-                        "爬取商品评价的 url 链接是" + url + ",商品的 id 是:" + id_
-                    )
-                    response = requests.get(url)
-                except Exception as e:
-                    default_logger.warning(e)
-                    break
-                if response.status_code != 200:
-                    default_logger.warning("状态码错误,爬虫连接异常")
+                    default_logger.info(f"爬取商品评价的 URL 链接是 {url},商品的 ID 是:{product_id}")
+                    response = requests.get(url, headers=self.getHeaders(product_id))
+                    response.raise_for_status()  # check the HTTP status code
+                except requests.RequestException as e:
+                    default_logger.warning(f"请求异常: {e}")
                     continue
-                time.sleep(random.randint(5, 10))  # add a delay to avoid an IP ban
-                if response.text == "":
+
+                time.sleep(random.randint(5, 10))  # add a delay to avoid an IP ban
+
+                if not response.text:
                     default_logger.warning("未爬取到信息")
                     continue
+
                 try:
                     res_json = json.loads(response.text)
-                except Exception as e:
-                    default_logger.warning(e)
+                except json.JSONDecodeError as e:
+                    default_logger.warning(f"JSON 解析异常: {e}")
                     continue
-                if len((res_json["comments"])) == 0:
-                    default_logger.warning(
-                        "页面次数已到:%d,超出范围(或未爬取到评论)" % i
-                    )
+
+                if not res_json.get("comments"):
+                    default_logger.warning(f"页面次数已到:{i},超出范围(或未爬取到评论)")
                     break
-                for cdit in res_json["comments"]:
-                    comment = cdit["content"].replace("\n", " ").replace("\r", " ")
+
+                for comment_data in res_json["comments"]:
+                    comment = comment_data["content"].replace("\n", " ").replace("\r", " ")
                     comments.append(comment)
-                    scores.append(cdit["score"])
-        # savepath = './'+self.categlory+'_'+self.comtype[score]+'.csv'
-        default_logger.info(
-            "已爬取%d 条 %s 评价信息" % (len(comments), self.comtype[score])
-        )
-        # Collect the results and lightly process the reviews
+                    scores.append(comment_data["score"])
+
+        default_logger.info(f"已爬取 {len(comments)} 条 {self.comtype[score]} 评价信息")
+
+        # Split the reviews into sentences
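+        # (zhon.hanzi.sentence is a regex that matches complete Chinese
+        # sentences ending in a terminal mark such as 。!?; fragments without
+        # one are discarded by the check below.)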
         remarks = []
-        for i in range(len(comments)):
-            rst = re.findall(zhon.hanzi.sentence, comments[i])
-            if (
-                len(rst) == 0
-                or rst == ["。"]
-                or rst == ["?"]
-                or rst == ["!"]
-                or rst == ["."]
-                or rst == [","]
-                or rst == ["?"]
-                or rst == ["!"]
-            ):
-                default_logger.warning(
-                    "拆分失败或结果不符(去除空格和标点符号):%s" % (rst)
-                )
+        for comment in comments:
+            sentences = re.findall(zhon.hanzi.sentence, comment)
+            if not sentences or sentences in [["。"], ["?"], ["!"], ["."], [","], ["?"], ["!"]]:
+                default_logger.warning(f"拆分失败或结果不符(去除空格和标点符号):{sentences}")
             else:
-                remarks.append(rst)
+                remarks.append(sentences)
+
         result = self.solvedata(remarks=remarks)
-        if len(result) == 0:
-            default_logger.warning("当前商品没有评价,使用默认评价")
+
+        if not result:
+            default_logger.warning("当前商品没有评价,使用默认评价")
             result = [
                 "考虑买这个$之前我是有担心过的,因为我不知道$的质量和品质怎么样,但是看了评论后我就放心了。",
                 "买这个$之前我是有看过好几家店,最后看到这家店的评价不错就决定在这家店买 ",
@@ -224,45 +227,22 @@ def getData(
                 "大大的好评!以后买$再来你们店!( ̄▽ ̄)",
                 "真是一次愉快的购物!",
             ]
+
         return result

     def solvedata(self, remarks) -> list:
-        # Split the data into sentences
+        """
+        Flatten the nested review lists into a single list of sentences.
+        :param remarks: a list of per-review sentence lists
+        :return: a flat list of all review sentences
+        """
         sentences = []
-        for i in range(len(remarks)):
-            for j in range(len(remarks[i])):
-                sentences.append(remarks[i][j])
+        for item in remarks:
+            for sentence in item:
+                sentences.append(sentence)
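+        # (Equivalent flattening one-liner, for reference rather than as part
+        # of this commit: sentences = list(itertools.chain.from_iterable(remarks)).)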
         default_logger.info("爬取的评价结果:" + str(sentences))
         return sentences

-    # Save to a MySQL database
-    """
-    db = pymysql.connect(host='主机名',user='用户名',password='密码',db='数据库名',charset='utf8mb4')
-    mycursor = db.cursor()
-    mycursor.execute("use jd")  # change to your own database name
-    mycursor.execute("TRUNCATE table jd")
-    for i in range(len(comments)):
-        sql = "insert into jd(i,scores,comments) values('%s','%s','%s')" % (id, scores[i], comments[i])  # change to match your table schema
-        try:
-            mycursor.execute(sql)
-            db.commit()
-        except Exception as e:
-            logging.warning(e)
-            db.rollback()
-    mycursor.close()
-    db.close()
-    logging.warning("已存入数据库")
-    """
-
-    # Save to a CSV file
-    """
-    with open(savepath,'a+',encoding='utf8') as f:
-        for i in range(len(comments)):
-            f.write("%d\t%s\t%s\n" % (i, scores[i], comments[i]))
-    logging.warning("数据已保存在 %s" % (savepath))
-    """
-
-
 # Test case
 if __name__ == "__main__":
     jdlist = ["商品名"]