
Commit 7689e5c

refactor(jdspider): improve code structure and readability

- Extract the duplicated request headers into a BASE_HEADERS constant to reduce redundancy
- Fix the spelling error "nagetive" → "negative"
- Add detailed function docstrings to improve maintainability
- Polish the log messages so they are clearer and easier to understand
- Improve exception handling by using `response.raise_for_status()` to check response status codes
1 parent 3d9399c commit 7689e5c
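For context, the two patterns named in the commit message can be sketched in isolation as below. This is a minimal illustrative sketch, not code from the diff: fetch_html is a hypothetical helper, the header values are abbreviated, and only the name BASE_HEADERS mirrors what actually appears in jdspider.py.

    import requests

    # Shared headers defined once; per-call extras are merged over them.
    BASE_HEADERS = {
        "accept-language": "zh-CN,zh;q=0.9",
        "user-agent": "Mozilla/5.0 (example)",  # abbreviated for the sketch
    }

    def fetch_html(url: str, extra_headers: dict = None) -> str:
        # Dict unpacking merges the two header sets; on key collisions
        # the right-hand (per-request) values win.
        headers = {**BASE_HEADERS, **(extra_headers or {})}
        try:
            response = requests.get(url, headers=headers, timeout=10)
            # raise_for_status() raises requests.HTTPError on 4xx/5xx
            # instead of letting a bad response pass silently.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"request failed: {e}")
            return ""
        return response.text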

File tree

1 file changed: +133 −153 lines changed


jdspider.py

Lines changed: 133 additions & 153 deletions
@@ -15,195 +15,198 @@
 import zhon.hanzi
 from lxml import etree
 
-# Reference: https://github.com/fxsjy/jieba/blob/1e20c89b66f56c9301b0feed211733ffaa1bd72a/jieba/__init__.py#L27
+# Load the configuration file
 with open("./config.yml", "r", encoding="utf-8") as f:
     cfg = yaml.safe_load(f)
 
+# Get the user's cookie
 cookie = cfg["user"]["cookie"]
+
+# Configure logging to the standard error stream
 log_console = logging.StreamHandler(sys.stderr)
 default_logger = logging.getLogger("jdspider")
 default_logger.setLevel(logging.DEBUG)
 default_logger.addHandler(log_console)
 
+# Define the base request headers once to avoid duplicated code
+BASE_HEADERS = {
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
+    "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
+    "accept-encoding": "gzip, deflate, br",
+    "accept-language": "zh-CN,zh;q=0.9",
+    "cache-control": "max-age=0",
+    "dnt": "1",
+    "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": '"Windows"',
+    "sec-fetch-dest": "document",
+    "sec-fetch-site": "none",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/98.0.4758.82 Safari/537.36",
+}
 
 class JDSpider:
-    # Spider implementation class: construct an instance with a product category (e.g. phone, laptop), then call getData to scrape the data.
+    """
+    JD spider class for scraping the review data of a given product category.
+    Construct an instance with a product category (e.g. phone, laptop), then call the getData method to scrape the data.
+    """
     def __init__(self, categlory):
-        # JD start search page
+        # Start page URL of the JD product search
         self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % (
             quote(categlory)
         )
+        # Base URL of the comment API
         self.commentBaseUrl = "https://api.m.jd.com/?"
-        self.headers = {
-            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,"
-            "*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
-            "accept-encoding": "gzip, deflate, br",
-            "accept-language": "zh-CN,zh;q=0.9",
-            "cache-control": "max-age=0",
-            "dnt": "1",
-            "sec-ch-ua": '" Not A;Brand";v="99", "Chromium";v="98", "Google Chrome";v="98"',
-            "sec-ch-ua-mobile": "?0",
-            "sec-ch-ua-platform": '"Windows"',
-            "sec-fetch-dest": "document",
-            "sec-fetch-site": "none",
-            "sec-fetch-user": "?1",
-            "upgrade-insecure-requests": "1",
-            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/98.0.4758.82 Safari/537.36",
-        }
+        # Base request headers
+        self.headers = BASE_HEADERS.copy()
+        # Request headers that carry the cookie
         self.headers2 = {
+            **BASE_HEADERS,
             "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
             "accept-language": "en,zh-CN;q=0.9,zh;q=0.8",
-            "cache-control": "max-age=0",
             "cookie": cookie,
-            "dnt": "1",
             "priority": "u=0, i",
             "sec-ch-ua": '"Microsoft Edge";v="129", "Not=A?Brand";v="8", "Chromium";v="129"',
             "sec-ch-ua-mobile": "?0",
             "sec-ch-ua-platform": '"macOS"',
-            "sec-fetch-dest": "document",
             "sec-fetch-mode": "navigate",
-            "sec-fetch-site": "none",
-            "sec-fetch-user": "?1",
-            "upgrade-insecure-requests": "1",
             "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/129.0.0.0 Safari/537.36 Edg/129.0.0.0",
         }
+        # Fetch the list of product IDs
         self.productsId = self.getId()
-        self.comtype = {1: "nagetive", 2: "medium", 3: "positive"}
+        # Review type mapping: 1 negative, 2 medium, 3 positive
+        self.comtype = {1: "negative", 2: "medium", 3: "positive"}  # spelling fixed
+        # Product category
        self.categlory = categlory
+        # IP lists for proxies (currently empty)
        self.iplist = {"http": [], "https": []}
 
     def getParamUrl(self, productid: str, page: str, score: str):
+        """
+        Build the request parameters and the full URL for the comment API.
+        :param productid: product ID
+        :param page: review page number
+        :param score: review type (1 negative, 2 medium, 3 positive)
+        :return: the request parameters and the full URL
+        """
         params = {
             "appid": "item-v3",
             "functionId": "pc_club_productPageComments",
             "client": "pc",
-            "body": {  # Controls the paging data; essential, otherwise JD detects the spider and the data cannot be scraped.
-                "productId": "%s" % productid,
-                "score": "%s" % score,  # 1 negative, 2 medium, 3 positive
+            "body": {
+                "productId": productid,
+                "score": score,
                 "sortType": "5",
-                "page": "%s" % page,
+                "page": page,
                 "pageSize": "10",
                 "isShadowSku": "0",
                 "rid": "0",
                 "fold": "1",
             },
         }
-        default_logger.info("params:" + str(params))
+        default_logger.info("请求参数: " + str(params))
         url = self.commentBaseUrl + urlencode(params)
-        default_logger.info("url:" + str(url))
+        default_logger.info("请求 URL: " + str(url))
         return params, url
 
-    def getHeaders(
-        self, productid: str
-    ) -> (
-        dict
-    ):  # Unlike the initial self.header, this is the header for scraping a specific product; it adds the product id. I'm not sure what would happen if it were removed.
-        header = {
-            "Referer": "https://item.jd.com/%s.html" % productid,
+    def getHeaders(self, productid: str) -> dict:
+        """
+        Build the request headers needed to scrape the reviews of a given product.
+        :param productid: product ID
+        :return: header dict
+        """
+        return {
+            "Referer": f"https://item.jd.com/{productid}.html",
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
-            "Chrome/75.0.3770.142 Safari/537.36",
+            "Chrome/75.0.3770.142 Safari/537.36",
             # "cookie": cookie,
         }
-        return header
 
-    def getId(
-        self,
-    ) -> (
-        list
-    ):  # Get the product ids used to build the concrete product page URLs. The results are stored in the self.productId list.
-        response = requests.get(self.startUrl, headers=self.headers2)
-        default_logger.info("获取同类产品的搜索 url 结果:" + self.startUrl)
-        if response.status_code != 200:
-            default_logger.warning("状态码错误,爬虫连接异常!")
+    def getId(self) -> list:
+        """
+        Fetch the list of product IDs from the JD search page.
+        :return: list of product IDs
+        """
+        try:
+            response = requests.get(self.startUrl, headers=self.headers2)
+            response.raise_for_status()  # check the response status code
+            default_logger.info("获取同类产品的搜索 URL 结果:" + self.startUrl)
+        except requests.RequestException as e:
+            default_logger.warning(f"请求异常,状态码错误,爬虫连接异常!错误信息: {e}")
+            return []
+
         html = etree.HTML(response.text)
         return html.xpath('//li[@class="gl-item"]/@data-sku')
 
-    def getData(
-        self,
-        maxPage: int,
-        score: int,
-    ):  # maxPage is the maximum number of review pages to scrape, 10 reviews per page. The maximum page count usually differs per review type; in general: positive >> negative > medium.
-        # Pages beyond the range break out of the loop automatically, so a large maxPage is fine.
-        # score selects the review type: positive 3, medium 2, negative 1.
-
+    def getData(self, maxPage: int, score: int):
+        """
+        Scrape the review data for the given product category.
+        :param maxPage: maximum number of pages to scrape, 10 reviews per page
+        :param score: review type (1 negative, 2 medium, 3 positive)
+        :return: the processed list of reviews
+        """
         comments = []
         scores = []
-        default_logger.info(
-            "爬取商品数量最多为8个,请耐心等待,也可以自行修改jdspider文件"
-        )
-        if len(self.productsId) == 0:
-            default_logger.warning(f"完了,self.productsId是空的,后面会导致默认评价了")
-            sum_ = 0
-        elif 0 < len(self.productsId) < 8:  # limit the sum of products
-            sum_ = len(self.productsId)
-        else:
-            sum_: int = 3
-        default_logger.info("sum_:" + str(sum_))
-        for j in range(sum_):
-            id_: str = self.productsId[j]
-            # header = self.getHeaders(id)
+        default_logger.info("爬取商品数量最多为 8 个,请耐心等待,也可以自行修改 jdspider 文件")
+
+        # Determine how many products to scrape
+        product_count = min(len(self.productsId), 8) if self.productsId else 0
+        if product_count == 0:
+            default_logger.warning("self.productsId 为空,将使用默认评价")
+        default_logger.info("要爬取的商品数量: " + str(product_count))
+
+        for j in range(product_count):
+            product_id = self.productsId[j]
             for i in range(1, maxPage):
-                param, url = self.getParamUrl(id_, str(i), str(score))
-                default_logger.info(
-                    f"正在爬取当前商品的评论信息>>>>>>>>>第:%d 个,第 %d 页"
-                    % (j + 1, i)
-                )
+                params, url = self.getParamUrl(product_id, str(i), str(score))
+                default_logger.info(f"正在爬取第 {j + 1} 个商品的第 {i} 页评论信息")
+
                 try:
-                    default_logger.info(
-                        "爬取商品评价的 url 链接是" + url + ",商品的 id 是:" + id_
-                    )
-                    response = requests.get(url)
-                except Exception as e:
-                    default_logger.warning(e)
-                    break
-                if response.status_code != 200:
-                    default_logger.warning("状态码错误,爬虫连接异常")
+                    default_logger.info(f"爬取商品评价的 URL 链接是 {url},商品的 ID 是:{product_id}")
+                    response = requests.get(url, headers=self.getHeaders(product_id))
+                    response.raise_for_status()  # check the response status code
+                except requests.RequestException as e:
+                    default_logger.warning(f"请求异常: {e}")
                     continue
-                time.sleep(random.randint(5, 10))  # delay so the IP does not get banned
-                if response.text == "":
+
+                time.sleep(random.randint(5, 10))  # delay so the IP does not get banned
+
+                if not response.text:
                     default_logger.warning("未爬取到信息")
                     continue
+
                 try:
                     res_json = json.loads(response.text)
-                except Exception as e:
-                    default_logger.warning(e)
+                except json.JSONDecodeError as e:
+                    default_logger.warning(f"JSON 解析异常: {e}")
                     continue
-                if len((res_json["comments"])) == 0:
-                    default_logger.warning(
-                        "页面次数已到:%d,超出范围(或未爬取到评论)" % i
-                    )
+
+                if not res_json.get("comments"):
+                    default_logger.warning(f"页面次数已到:{i},超出范围(或未爬取到评论)")
                     break
comment = cdit["content"].replace("\n", " ").replace("\r", " ")
189+
190+
for comment_data in res_json["comments"]:
191+
comment = comment_data["content"].replace("\n", " ").replace("\r", " ")
179192
comments.append(comment)
180-
scores.append(cdit["score"])
181-
# savepath = './'+self.categlory+'_'+self.comtype[score]+'.csv'
182-
default_logger.info(
183-
"已爬取%d 条 %s 评价信息" % (len(comments), self.comtype[score])
184-
)
185-
# 存入列表,简单处理评价
193+
scores.append(comment_data["score"])
194+
195+
default_logger.info(f"已爬取 {len(comments)}{self.comtype[score]} 评价信息")
196+
197+
# 处理评论,拆分成句子
186198
remarks = []
187-
for i in range(len(comments)):
188-
rst = re.findall(zhon.hanzi.sentence, comments[i])
189-
if (
190-
len(rst) == 0
191-
or rst == ["。"]
192-
or rst == ["?"]
193-
or rst == ["!"]
194-
or rst == ["."]
195-
or rst == [","]
196-
or rst == ["?"]
197-
or rst == ["!"]
198-
):
199-
default_logger.warning(
200-
"拆分失败或结果不符(去除空格和标点符号):%s" % (rst)
201-
)
199+
for comment in comments:
200+
sentences = re.findall(zhon.hanzi.sentence, comment)
201+
if not sentences or sentences in [["。"], ["?"], ["!"], ["."], [","], ["?"], ["!"]]:
202+
default_logger.warning(f"拆分失败或结果不符(去除空格和标点符号):{sentences}")
202203
else:
203-
remarks.append(rst)
204+
remarks.append(sentences)
205+
204206
result = self.solvedata(remarks=remarks)
205-
if len(result) == 0:
206-
default_logger.warning("当前商品没有评价,使用默认评价")
207+
208+
if not result:
209+
default_logger.warning("当前商品没有评价,使用默认评价")
207210
result = [
208211
"考虑买这个$之前我是有担心过的,因为我不知道$的质量和品质怎么样,但是看了评论后我就放心了。",
209212
"买这个$之前我是有看过好几家店,最后看到这家店的评价不错就决定在这家店买 ",
@@ -224,45 +227,22 @@ def getData(
                 "大大的好评!以后买$再来你们店!( ̄▽ ̄)",
                 "真是一次愉快的购物!",
             ]
+
         return result
 
     def solvedata(self, remarks) -> list:
-        # Split the data into sentences
+        """
+        Split the reviews into a flat list of sentences.
+        :param remarks: a list containing lists of review sentences
+        :return: a list of all review sentences
+        """
         sentences = []
-        for i in range(len(remarks)):
-            for j in range(len(remarks[i])):
-                sentences.append(remarks[i][j])
+        for item in remarks:
+            for sentence in item:
+                sentences.append(sentence)
         default_logger.info("爬取的评价结果:" + str(sentences))
         return sentences
 
-    # Save to a MySQL database
-    """
-    db = pymysql.connect(host='主机名',user='用户名',password='密码',db='数据库名',charset='utf8mb4')
-    mycursor = db.cursor()
-    mycursor.execute("use jd")  # change to your own database name
-    mycursor.execute("TRUNCATE table jd")
-    for i in range(len(comments)):
-        sql = "insert into jd(i,scores,comments) values('%s','%s','%s')"%(id,scores[i],comments[i])  # change to your own table schema
-        try:
-            mycursor.execute(sql)
-            db.commit()
-        except Exception as e:
-            logging.warning(e)
-            db.rollback()
-    mycursor.close()
-    db.close()
-    logging.warning("已存入数据库")
-    """
-
-    # Save to a CSV file
-    """
-    with open(savepath,'a+',encoding ='utf8') as f:
-        for i in range(len(comments)):
-            f.write("%d\t%s\t%s\n"%(i,scores[i],comments[i]))
-    logging.warning("数据已保存在 %s"%(savepath))
-    """
-
-
 # Test case
 if __name__ == "__main__":
     jdlist = ["商品名"]
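For reference, a usage sketch of the refactored class. It assumes config.yml supplies a valid user cookie; the category string "手机" and the page count below are illustrative, not part of this commit:

    from jdspider import JDSpider

    if __name__ == "__main__":
        spider = JDSpider("手机")  # construct with a product category
        # score=1 selects negative reviews; since the loop uses
        # range(1, maxPage), maxPage=3 scrapes at most 2 pages of
        # 10 reviews per product
        sentences = spider.getData(maxPage=3, score=1)
        for sentence in sentences:
            print(sentence)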
