 import zhon.hanzi
 from lxml import etree

-# Reference: https://github.com/fxsjy/jieba/blob/1e20c89b66f56c9301b0feed211733ffaa1bd72a/jieba/__init__.py#L27
+# Load the configuration file
 with open("./config.yml", "r", encoding="utf-8") as f:
     cfg = yaml.safe_load(f)

+# Read the user cookie from the config
 cookie = cfg["user"]["cookie"]
+
+# Route log output to stderr
 log_console = logging.StreamHandler(sys.stderr)
 default_logger = logging.getLogger("jdspider")
 default_logger.setLevel(logging.DEBUG)
 default_logger.addHandler(log_console)

+# Shared base request headers, to avoid duplicated code
+BASE_HEADERS = {
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
+    "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    "accept-encoding": "gzip, deflate, br",
+    "accept-language": "zh-CN,zh;q=0.9",
+    "cache-control": "max-age=0",
+    "dnt": "1",
+    "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": '"Windows"',
+    "sec-fetch-dest": "document",
+    "sec-fetch-site": "none",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/98.0.4758.82 Safari/537.36",
+}
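+# Request-specific header dicts are derived from these defaults via .copy()
+# or dict unpacking (see JDSpider.__init__ below).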

 class JDSpider:
-    # Spider implementation: construct an instance with a product category (e.g. phones, laptops), then call getData to crawl the data.
+    """
+    JD.com spider for crawling review data for a given product category.
+    Construct with a category (e.g. phones, laptops), then call getData.
+    """
     def __init__(self, categlory):
-        # JD search start page
+        # Start URL for a JD product search
        self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % (
             quote(categlory)
         )
+        # Base URL of the review API
         self.commentBaseUrl = "https://api.m.jd.com/?"
-        self.headers = {
-            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
-            "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-            "accept-encoding": "gzip, deflate, br",
-            "accept-language": "zh-CN,zh;q=0.9",
-            "cache-control": "max-age=0",
-            "dnt": "1",
-            "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
-            "sec-ch-ua-mobile": "?0",
-            "sec-ch-ua-platform": '"Windows"',
-            "sec-fetch-dest": "document",
-            "sec-fetch-site": "none",
-            "sec-fetch-user": "?1",
-            "upgrade-insecure-requests": "1",
-            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/98.0.4758.82 Safari/537.36",
-        }
+        # Default request headers
+        self.headers = BASE_HEADERS.copy()
+        # Request headers that carry the cookie
         self.headers2 = {
+            **BASE_HEADERS,
             "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
             "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
-            "cache-control": "max-age=0",
             "cookie": cookie,
-            "dnt": "1",
             "priority": "u=0, i",
             "sec-ch-ua": '"Microsoft Edge";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
             "sec-ch-ua-mobile": "?0",
             "sec-ch-ua-platform": '"macOS"',
-            "sec-fetch-dest": "document",
             "sec-fetch-mode": "navigate",
-            "sec-fetch-site": "none",
-            "sec-fetch-user": "?1",
-            "upgrade-insecure-requests": "1",
             "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0",
         }
+        # Fetch the list of product IDs
         self.productsId = self.getId()
-        self.comtype = {1: "nagetive", 2: "medium", 3: "positive"}
+        # Review type mapping: 1 negative, 2 medium, 3 positive
+        self.comtype = {1: "negative", 2: "medium", 3: "positive"}  # spelling fixed
+        # Product category
         self.categlory = categlory
+        # Proxy IP lists (currently empty)
         self.iplist = {"http": [], "https": []}
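+        # (Assumption: a populated pool would be passed to requests via its
+        # proxies= argument; nothing in this commit fills or uses it.)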

     def getParamUrl(self, productid: str, page: str, score: str):
+        """
+        Build the request parameters and the full URL for the review API.
+        :param productid: product ID
+        :param page: review page number
+        :param score: review type (1 negative, 2 medium, 3 positive)
+        :return: request params and full URL
+        """
         params = {
             "appid": "item-v3",
             "functionId": "pc_club_productPageComments",
             "client": "pc",
-            "body": {  # Controls the page number and page size; essential, or JD detects the crawler and returns no data.
-                "productId": "%s" % productid,
-                "score": "%s" % score,  # 1 negative, 2 medium, 3 positive
+            "body": {
+                "productId": productid,
+                "score": score,
                 "sortType": "5",
-                "page": "%s" % page,
+                "page": page,
                 "pageSize": "10",
                 "isShadowSku": "0",
                 "rid": "0",
                 "fold": "1",
             },
         }
-        default_logger.info("params: " + str(params))
+        default_logger.info("请求参数: " + str(params))
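+        # Note: urlencode() serializes the nested "body" dict via str(), i.e.
+        # Python-repr quoting rather than JSON; the endpoint appears to accept
+        # this form. json.dumps(params["body"]) would be the stricter choice
+        # (an untested assumption, not part of this commit).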
         url = self.commentBaseUrl + urlencode(params)
-        default_logger.info("url: " + str(url))
+        default_logger.info("请求 URL: " + str(url))
         return params, url

-    def getHeaders(
-        self, productid: str
-    ) -> (
-        dict
-    ):  # Unlike the initial self.headers, this is the header for crawling one product, with its ID added; unsure what removing it would change.
-        header = {
-            "Referer": "https://item.jd.com/%s.html" % productid,
+    def getHeaders(self, productid: str) -> dict:
+        """
+        Build the request headers needed to crawl a given product's reviews.
+        :param productid: product ID
+        :return: header dict
+        """
+        return {
+            "Referer": f"https://item.jd.com/{productid}.html",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/75.0.3770.142 Safari/537.36",
+                "Chrome/75.0.3770.142 Safari/537.36",
             # "cookie": cookie,
         }
-        return header

-    def getId(
-        self,
-    ) -> (
-        list
-    ):  # Get the product IDs used to build each product page URL; results are stored in the self.productId list.
-        response = requests.get(self.startUrl, headers=self.headers2)
-        default_logger.info("获取同类产品的搜索 url 结果:" + self.startUrl)
-        if response.status_code != 200:
-            default_logger.warning("状态码错误,爬虫连接异常!")
+    def getId(self) -> list:
+        """
+        Fetch the list of product IDs from the JD search page.
+        :return: list of product IDs
+        """
+        try:
+            response = requests.get(self.startUrl, headers=self.headers2)
+            response.raise_for_status()  # check the HTTP status code
+            default_logger.info("获取同类产品的搜索 URL 结果:" + self.startUrl)
+        except requests.RequestException as e:
+            default_logger.warning(f"请求异常,状态码错误,爬虫连接异常!错误信息: {e}")
+            return []
+
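+        # Each result card on the search page is an <li class="gl-item"> whose
+        # data-sku attribute carries the product ID (per JD's markup at the
+        # time of this commit; the XPath below breaks if that layout changes).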
         html = etree.HTML(response.text)
         return html.xpath('//li[@class="gl-item"]/@data-sku')

-    def getData(
-        self,
-        maxPage: int,
-        score: int,
-    ):  # maxPage is the maximum number of review pages to crawl, 10 reviews per page. The usual page limit differs by review type; generally positive >> negative > medium.
-        # Out-of-range pages break out of the loop automatically, so a large maxPage is harmless.
-        # score selects the review type: 3 positive, 2 medium, 1 negative.
-
+    def getData(self, maxPage: int, score: int):
+        """
+        Crawl review data for the given product category.
+        :param maxPage: maximum number of pages to crawl, 10 reviews per page
+        :param score: review type (1 negative, 2 medium, 3 positive)
+        :return: processed list of reviews
+        """
         comments = []
         scores = []
-        default_logger.info(
-            "爬取商品数量最多为8个,请耐心等待,也可以自行修改jdspider文件"
-        )
-        if len(self.productsId) == 0:
-            default_logger.warning(f"完了,self.productsId是空的,后面会导致默认评价了")
-            sum_ = 0
-        elif 0 < len(self.productsId) < 8:  # limit the number of products
-            sum_ = len(self.productsId)
-        else:
-            sum_: int = 3
-        default_logger.info("sum_:" + str(sum_))
-        for j in range(sum_):
-            id_: str = self.productsId[j]
-            # header = self.getHeaders(id)
+        default_logger.info("爬取商品数量最多为 8 个,请耐心等待,也可以自行修改 jdspider 文件")
+
+        # Decide how many products to crawl
+        product_count = min(len(self.productsId), 8) if self.productsId else 0
+        if product_count == 0:
+            default_logger.warning("self.productsId 为空,将使用默认评价")
+        default_logger.info("要爬取的商品数量: " + str(product_count))
+
+        for j in range(product_count):
+            product_id = self.productsId[j]
             for i in range(1, maxPage):
-                param, url = self.getParamUrl(id_, str(i), str(score))
-                default_logger.info(
-                    f"正在爬取当前商品的评论信息>>>>>>>>>第:%d 个,第 %d 页"
-                    % (j + 1, i)
-                )
+                params, url = self.getParamUrl(product_id, str(i), str(score))
+                default_logger.info(f"正在爬取第 {j + 1} 个商品的第 {i} 页评论信息")
+
                 try:
-                    default_logger.info(
-                        "爬取商品评价的 url 链接是" + url + ",商品的 id 是:" + id_
-                    )
-                    response = requests.get(url)
-                except Exception as e:
-                    default_logger.warning(e)
-                    break
-                if response.status_code != 200:
-                    default_logger.warning("状态码错误,爬虫连接异常")
+                    default_logger.info(f"爬取商品评价的 URL 链接是 {url},商品的 ID 是:{product_id}")
+                    response = requests.get(url, headers=self.getHeaders(product_id))
+                    response.raise_for_status()  # check the HTTP status code
+                except requests.RequestException as e:
+                    default_logger.warning(f"请求异常: {e}")
                     continue
-                time.sleep(random.randint(5, 10))  # add a delay to avoid an IP ban
-                if response.text == "":
+
+                time.sleep(random.randint(5, 10))  # add a delay to avoid an IP ban
+
+                if not response.text:
                     default_logger.warning("未爬取到信息")
                     continue
+
                 try:
                     res_json = json.loads(response.text)
-                except Exception as e:
-                    default_logger.warning(e)
+                except json.JSONDecodeError as e:
+                    default_logger.warning(f"JSON 解析异常: {e}")
                     continue
-                if len((res_json["comments"])) == 0:
-                    default_logger.warning(
-                        "页面次数已到:%d,超出范围(或未爬取到评论)" % i
-                    )
+
+                if not res_json.get("comments"):
+                    default_logger.warning(f"页面次数已到:{i},超出范围(或未爬取到评论)")
                     break
-                for cdit in res_json["comments"]:
-                    comment = cdit["content"].replace("\n", " ").replace("\r", " ")
+
+                for comment_data in res_json["comments"]:
+                    comment = comment_data["content"].replace("\n", " ").replace("\r", " ")
                     comments.append(comment)
-                    scores.append(cdit["score"])
-        # savepath = './'+self.categlory+'_'+self.comtype[score]+'.csv'
-        default_logger.info(
-            "已爬取%d 条 %s 评价信息" % (len(comments), self.comtype[score])
-        )
-        # Collect the results and lightly process the reviews
+                    scores.append(comment_data["score"])
+
+        default_logger.info(f"已爬取 {len(comments)} 条 {self.comtype[score]} 评价信息")
+
+        # Split the reviews into sentences
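+        # (zhon.hanzi.sentence is a regex that matches complete Chinese
+        # sentences ending in a terminal mark such as 。!?; fragments without
+        # one are discarded by the check below.)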
         remarks = []
-        for i in range(len(comments)):
-            rst = re.findall(zhon.hanzi.sentence, comments[i])
-            if (
-                len(rst) == 0
-                or rst == ["。"]
-                or rst == ["?"]
-                or rst == ["!"]
-                or rst == ["."]
-                or rst == [","]
-                or rst == ["?"]
-                or rst == ["!"]
-            ):
-                default_logger.warning(
-                    "拆分失败或结果不符(去除空格和标点符号):%s" % (rst)
-                )
+        for comment in comments:
+            sentences = re.findall(zhon.hanzi.sentence, comment)
+            if not sentences or sentences in [["。"], ["?"], ["!"], ["."], [","], ["?"], ["!"]]:
+                default_logger.warning(f"拆分失败或结果不符(去除空格和标点符号):{sentences}")
             else:
-                remarks.append(rst)
+                remarks.append(sentences)
+
         result = self.solvedata(remarks=remarks)
-        if len(result) == 0:
-            default_logger.warning("当前商品没有评价,使用默认评价")
+
+        if not result:
+            default_logger.warning("当前商品没有评价,使用默认评价")
             result = [
                 "考虑买这个$之前我是有担心过的,因为我不知道$的质量和品质怎么样,但是看了评论后我就放心了。",
                 "买这个$之前我是有看过好几家店,最后看到这家店的评价不错就决定在这家店买 ",
@@ -224,45 +227,22 @@ def getData(
                 "大大的好评!以后买$再来你们店!( ̄▽ ̄)",
                 "真是一次愉快的购物!",
             ]
+
         return result

     def solvedata(self, remarks) -> list:
-        # Split the data into sentences
+        """
+        Flatten the nested review lists into a single list of sentences.
+        :param remarks: a list of per-review sentence lists
+        :return: a flat list of all review sentences
+        """
         sentences = []
-        for i in range(len(remarks)):
-            for j in range(len(remarks[i])):
-                sentences.append(remarks[i][j])
+        for item in remarks:
+            for sentence in item:
+                sentences.append(sentence)
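+        # (Equivalent flattening one-liner, for reference rather than as part
+        # of this commit: sentences = list(itertools.chain.from_iterable(remarks)).)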
         default_logger.info("爬取的评价结果:" + str(sentences))
         return sentences

-    # Save to a MySQL database
-    """
-    db = pymysql.connect(host='主机名',user='用户名',password='密码',db='数据库名',charset='utf8mb4')
-    mycursor = db.cursor()
-    mycursor.execute("use jd")  # change to your own database name
-    mycursor.execute("TRUNCATE table jd")
-    for i in range(len(comments)):
-        sql = "insert into jd(i,scores,comments) values('%s','%s','%s')" % (id, scores[i], comments[i])  # change to match your table schema
-        try:
-            mycursor.execute(sql)
-            db.commit()
-        except Exception as e:
-            logging.warning(e)
-            db.rollback()
-    mycursor.close()
-    db.close()
-    logging.warning("已存入数据库")
-    """
-
-    # Save to a CSV file
-    """
-    with open(savepath,'a+',encoding='utf8') as f:
-        for i in range(len(comments)):
-            f.write("%d\t%s\t%s\n" % (i, scores[i], comments[i]))
-    logging.warning("数据已保存在 %s" % (savepath))
-    """
-
-
 # Test case
 if __name__ == "__main__":
     jdlist = ["商品名"]