
Commit 43a9aca

Merge branch 'main' into Dimlitter-patch-1

2 parents 2698cc6 + 68b8cf0

2 files changed: 118 additions & 83 deletions

auto_comment_plus.py

Lines changed: 57 additions & 28 deletions
@@ -5,21 +5,30 @@
 
 import random
 import time
+
 import jieba.analyse
 import requests
+import yaml
 from lxml import etree
+
 import jdspider
 
+
+CONFIG_PATH = './config.yml'
+
+
 jieba.setLogLevel(jieba.logging.INFO)
-"""
-Just paste the ck below; only the web-version Ck is supported.
-The shortest format is as follows.
-"""
-ck = ''
+
+
+with open(CONFIG_PATH, 'r', encoding='utf-8') as f:
+    cfg = yaml.safe_load(f)
+ck = cfg['user']['cookie']
 
 headers = {
     'cookie': ck,
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',
+    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/'
+                  '537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/'
+                  '537.36'
 }
 
 
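With this hunk the cookie now comes from config.yml instead of a hard-coded ck constant. A minimal sketch of a more defensive loader, assuming the user:/cookie: layout implied by cfg['user']['cookie']; the load_cookie helper and its error messages are illustrative, not part of the commit:

import yaml

CONFIG_PATH = './config.yml'

# Expected file shape (an assumption inferred from cfg['user']['cookie'];
# the commit itself does not ship a sample config):
#   user:
#     cookie: "pt_key=...; pt_pin=...;"

def load_cookie(path=CONFIG_PATH):
    try:
        with open(path, 'r', encoding='utf-8') as f:
            cfg = yaml.safe_load(f) or {}
    except FileNotFoundError:
        raise SystemExit(f'{path} not found; create it with a user.cookie entry')
    try:
        return cfg['user']['cookie']
    except (KeyError, TypeError):
        raise SystemExit(f'{path} has no user.cookie entry')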
@@ -35,7 +44,7 @@ def generation(pname, _class=0):
     # class 0: write a review; 1: extract the product name
     try:
         name = jieba.analyse.textrank(pname, topK=5, allowPOS='n')[0]
-    except:
+    except Exception as _:
         name = "宝贝"
     if _class == 1:
         return name
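The bare except: becomes except Exception as _, which still covers the IndexError raised when textrank finds no noun keywords, but no longer traps SystemExit or KeyboardInterrupt. A quick sketch of the failure mode, assuming jieba is installed:

import jieba.analyse

# textrank() returns an empty list when the input has no extractable noun
# keywords, so indexing [0] raises IndexError; except Exception catches it.
try:
    name = jieba.analyse.textrank('!!!', topK=5, allowPOS='n')[0]
except Exception:
    name = '宝贝'  # same fallback as generation()
print(name)      # -> 宝贝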
@@ -50,6 +59,7 @@ def generation(pname, _class=0):
     return 5, (
         comments.replace("$", name))
 
+
 # Query all pending evaluations
 def all_evaluate():
     N = {}
@@ -73,7 +83,8 @@ def ordinary(N):
     Order_data = []
     req_et = []
     for i in range((N['待评价订单'] // 20) + 1):
-        url = f'https://club.jd.com/myJdcomments/myJdcomment.action?sort=0&page={i + 1}'
+        url = (f'https://club.jd.com/myJdcomments/myJdcomment.action?sort=0&'
+               f'page={i + 1}')
         req = requests.get(url, headers=headers)
         req_et.append(etree.HTML(req.text))
     for i in req_et:
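Most of the remaining hunks only re-wrap long URLs into adjacent string literals inside parentheses, which Python concatenates into the identical string, so request behavior is unchanged. A minimal check of the pattern:

i = 0
url_a = f'https://club.jd.com/myJdcomments/myJdcomment.action?sort=0&page={i + 1}'
url_b = (f'https://club.jd.com/myJdcomments/myJdcomment.action?sort=0&'
         f'page={i + 1}')
assert url_a == url_b  # adjacent literals are joined into one string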
@@ -86,14 +97,16 @@ def ordinary(N):
     print(f"当前共有{N['待评价订单']}个评价。")
     for i, Order in enumerate(Order_data):
         oid = Order.xpath('tr[@class="tr-th"]/td/span[3]/a/text()')[0]
-        oname_data = Order.xpath('tr[@class="tr-bd"]/td[1]/div[1]/div[2]/div/a/text()')
-        pid_data = Order.xpath('tr[@class="tr-bd"]/td[1]/div[1]/div[2]/div/a/@href')
+        oname_data = Order.xpath(
+            'tr[@class="tr-bd"]/td[1]/div[1]/div[2]/div/a/text()')
+        pid_data = Order.xpath(
+            'tr[@class="tr-bd"]/td[1]/div[1]/div[2]/div/a/@href')
         for oname, pid in zip(oname_data, pid_data):
             pid = pid.replace('//item.jd.com/', '').replace('.html', '')
 
             print(f"\t{i}.开始评价订单\t{oname}[{oid}]")
-            url2 = f"https://club.jd.com/myJdcomments/saveProductComment.action"
-            xing,Str = generation(oname)
+            url2 = "https://club.jd.com/myJdcomments/saveProductComment.action"
+            xing, Str = generation(oname)
             print(f'\t\t评价内容,星级{xing}:', Str)
             data2 = {
                 'orderId': oid,
@@ -113,10 +126,12 @@ def ordinary(N):
 def sunbw(N):
     Order_data = []
     for i in range((N['待晒单'] // 20) + 1):
-        url = f"https://club.jd.com/myJdcomments/myJdcomment.action?sort=1&page={i + 1}"
+        url = (f'https://club.jd.com/myJdcomments/myJdcomment.action?sort=1'
+               f'&page={i + 1}')
         req = requests.get(url, headers=headers)
         req_et = etree.HTML(req.text)
-        Order_data.extend(req_et.xpath('//*[@id="evalu01"]/div[2]/div[1]/div[@class="comt-plist"]/div[1]'))
+        Order_data.extend(req_et.xpath(
+            '//*[@id="evalu01"]/div[2]/div[1]/div[@class="comt-plist"]/div[1]'))
     print(f"当前共有{N['待晒单']}个需要晒单。")
     for i, Order in enumerate(Order_data):
         oname = Order.xpath('ul/li[1]/div/div[2]/div[1]/a/text()')[0]
@@ -126,17 +141,21 @@ def sunbw(N):
         print(f'\t开始晒单{i},{oname}')
         # Fetch an image
         pname = generation(pname=oname, _class=1)
-        url1 = f"https://club.jd.com/discussion/getProductPageImageCommentList.action?productId={pid}"
+        url1 = (f'https://club.jd.com/discussion/getProductPageImageCommentList'
+                f'.action?productId={pid}')
         imgdata = requests.get(url1, headers=headers).json()
         if imgdata["imgComments"]["imgCommentCount"] == 0:
-            url1 = "https://club.jd.com/discussion/getProductPageImageCommentList.action?productId=1190881"
+            url1 = ('https://club.jd.com/discussion/getProductPageImage'
+                    'CommentList.action?productId=1190881')
             imgdata = requests.get(url1, headers=headers).json()
         imgurl = imgdata["imgComments"]["imgList"][0]["imageUrl"]
 
         #
         print(f'\t\t图片url={imgurl}')
-        url2 = "https://club.jd.com/myJdcomments/saveShowOrder.action"  # Submit the show order
-        headers['Referer'] = 'https://club.jd.com/myJdcomments/myJdcomment.action?sort=1'
+        # Submit the show order
+        url2 = "https://club.jd.com/myJdcomments/saveShowOrder.action"
+        headers['Referer'] = ('https://club.jd.com/myJdcomments/myJdcomment.'
+                              'action?sort=1')
         headers['Origin'] = 'https://club.jd.com'
         headers['Content-Type'] = 'application/x-www-form-urlencoded'
         data = {
@@ -162,21 +181,27 @@ def review(N):
     req_et = []
     Order_data = []
     for i in range((N['待追评'] // 20) + 1):
-        url = f"https://club.jd.com/myJdcomments/myJdcomment.action?sort=3&page={i + 1}"
+        url = (f"https://club.jd.com/myJdcomments/myJdcomment.action?sort=3"
+               f"&page={i + 1}")
         req = requests.get(url, headers=headers)
         req_et.append(etree.HTML(req.text))
     for i in req_et:
-        Order_data.extend(i.xpath('//*[@id="main"]/div[2]/div[2]/table/tr[@class="tr-bd"]'))
+        Order_data.extend(
+            i.xpath('//*[@id="main"]/div[2]/div[2]/table/tr[@class="tr-bd"]'))
     if len(Order_data) != N['待追评']:
         for i in req_et:
-            Order_data.extend(i.xpath('//*[@id="main"]/div[2]/div[2]/table/tbody/tr[@class="tr-bd"]'))
+            Order_data.extend(i.xpath(
+                '//*[@id="main"]/div[2]/div[2]/table/tbody/tr[@class="tr-bd"]'))
     print(f"当前共有{N['待追评']}个需要追评。")
     for i, Order in enumerate(Order_data):
         oname = Order.xpath('td[1]/div/div[2]/div/a/text()')[0]
         _id = Order.xpath('td[3]/div/a/@href')[0]
         print(f'\t开始第{i}{oname}')
-        url1 = "https://club.jd.com/afterComments/saveAfterCommentAndShowOrder.action"
-        pid, oid = _id.replace('http://club.jd.com/afterComments/productPublish.action?sku=', "").split('&orderId=')
+        url1 = ("https://club.jd.com/afterComments/"
+                "saveAfterCommentAndShowOrder.action")
+        pid, oid = _id.replace(
+            'http://club.jd.com/afterComments/productPublish.action?sku=',
+            "").split('&orderId=')
         context = generation(oname)
         print(f'\t\t追评内容:{context}')
         req_url1 = requests.post(url1, headers=headers, data={
@@ -197,21 +222,25 @@ def Service_rating(N):
     Order_data = []
     req_et = []
     for i in range((N['服务评价'] // 20) + 1):
-        url = f"https://club.jd.com/myJdcomments/myJdcomment.action?sort=4&page={i + 1}"
+        url = (f"https://club.jd.com/myJdcomments/myJdcomment.action?sort=4"
+               f"&page={i + 1}")
         req = requests.get(url, headers=headers)
         req_et.append(etree.HTML(req.text))
     for i in req_et:
-        Order_data.extend(i.xpath('//*[@id="main"]/div[2]/div[2]/table/tbody/tr[@class="tr-bd"]'))
+        Order_data.extend(i.xpath(
+            '//*[@id="main"]/div[2]/div[2]/table/tbody/tr[@class="tr-bd"]'))
     if len(Order_data) != N['服务评价']:
         Order_data = []
         for i in req_et:
-            Order_data.extend(i.xpath('//*[@id="main"]/div[2]/div[2]/table/tr[@class="tr-bd"]'))
+            Order_data.extend(i.xpath(
+                '//*[@id="main"]/div[2]/div[2]/table/tr[@class="tr-bd"]'))
     print(f"当前共有{N['服务评价']}个需要服务评价。")
     for i, Order in enumerate(Order_data):
         oname = Order.xpath('td[1]/div[1]/div[2]/div/a/text()')[0]
         oid = Order.xpath('td[4]/div/a[1]/@oid')[0]
         print(f'\t开始第{i}{oname}')
-        url1 = f'https://club.jd.com/myJdcomments/insertRestSurvey.action?voteid=145&ruleid={oid}'
+        url1 = (f'https://club.jd.com/myJdcomments/insertRestSurvey.action'
+                f'?voteid=145&ruleid={oid}')
         data1 = {
             'oid': oid,
             'gid': '32',
@@ -273,4 +302,4 @@ def main():
     try:
         main()
     except RecursionError:
-        print("多次出现未完成情况,程序自动退出")
+        print("多次出现未完成情况,程序自动退出")

jdspider.py

Lines changed: 61 additions & 55 deletions
@@ -2,68 +2,72 @@
 # @Author :@Zhang Jiale and @Dimlitter
 # @File : jdspider.py
 
-import sys
-import requests
-from lxml import etree
-import logging
-from urllib.parse import quote
 import json
-from urllib.parse import urlencode
-import time
+import logging
 import random
-import zhon.hanzi
 import re
-#import pymysql
+import sys
+import time
+from urllib.parse import quote, urlencode
+
+import requests
+import zhon.hanzi
+from lxml import etree
+
+# import pymysql
+
 
 class JDSpider:
-    # Spider implementation: construct an instance with a product category (e.g. phone, laptop), then call getData to scrape.
-    def __init__(self,categlory):
-        self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8"%(quote(categlory))  # JD start search page
+    # Spider implementation: construct an instance with a product category (e.g. phone, laptop), then call getData to scrape.
+    def __init__(self, categlory):
+        # JD start search page
+        self.startUrl = "https://search.jd.com/Search?keyword=%s&enc=utf-8" % (
+            quote(categlory))
         self.commentBaseUrl = "https://sclub.jd.com/comment/productPageComments.action?"
-        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36',}
+        self.headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36', }
         self.productsId = self.getId()
-        self.comtype = {1:"nagetive",2:"medium",3:"positive"}
+        self.comtype = {1: "nagetive", 2: "medium", 3: "positive"}
         self.categlory = categlory
         self.iplist = {
-            'http':[],
-            'https':[]
+            'http': [],
+            'https': []
         }
-    def getParamUrl(self,productid,page,score):
-        params = { # Paging and page-size parameters; essential, otherwise JD detects the spider and returns no data.
-            "productId": "%s"%(productid),
-            "score": "%s"%(score), # 1 = negative, 2 = medium, 3 = positive
+
+    def getParamUrl(self, productid, page, score):
+        params = {  # Paging and page-size parameters; essential, otherwise JD detects the spider and returns no data.
+            "productId": "%s" % (productid),
+            "score": "%s" % (score),  # 1 = negative, 2 = medium, 3 = positive
             "sortType": "5",
-            "page": "%s"%(page),
+            "page": "%s" % (page),
             "pageSize": "10",
             "isShadowSku": "0",
             "rid": "0",
             "fold": "1"
         }
-        url = self.commentBaseUrl+urlencode(params)
-        return params,url
-
+        url = self.commentBaseUrl + urlencode(params)
+        return params, url
 
-    def getHeaders(self,productid): # Unlike the initial self.headers, this is the per-product header carrying the product id; unclear what happens without it.
-        header = {"Referer": "https://item.jd.com/%s.html"%(productid),
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
+    def getHeaders(self, productid):  # Unlike the initial self.headers, this is the per-product header carrying the product id; unclear what happens without it.
+        header = {"Referer": "https://item.jd.com/%s.html" % (productid),
+                  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
                   }
         return header
 
-    def getId(self): # Get product ids to build the product page URLs; results are stored in self.productsId
-        response = requests.get(self.startUrl, headers = self.headers)
+    def getId(self):  # Get product ids to build the product page URLs; results are stored in self.productsId
+        response = requests.get(self.startUrl, headers=self.headers)
         if response.status_code != 200:
             logging.warning("状态码错误,爬虫连接异常!")
         html = etree.HTML(response.text)
         return html.xpath('//li[@class="gl-item"]/@data-sku')
 
+    def getData(self, maxPage, score,):  # maxPage is the maximum number of comment pages (10 per page); maxima differ by type, usually positive >> negative > medium
+        # Out-of-range pages break out automatically, so a large maxPage is fine.
+        # score selects the review type: 3 positive, 2 medium, 1 negative.
 
-    def getData(self,maxPage,score,): # maxPage is the maximum number of comment pages (10 per page); maxima differ by type, usually positive >> negative > medium
-        # Out-of-range pages break out automatically, so a large maxPage is fine.
-        # score selects the review type: 3 positive, 2 medium, 1 negative.
-
         comments = []
         scores = []
-        if len(self.productsId) < 10: # limit the sum of products
+        if len(self.productsId) < 10:  # limit the sum of products
             sum = len(self.productsId)
         else:
             sum = 10
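For reference, getParamUrl simply urlencodes this fixed dict onto commentBaseUrl. A standalone sketch of the URL it builds (the product id here is illustrative):

from urllib.parse import urlencode

base = 'https://sclub.jd.com/comment/productPageComments.action?'
params = {'productId': '1190881', 'score': '3', 'sortType': '5',
          'page': '1', 'pageSize': '10', 'isShadowSku': '0',
          'rid': '0', 'fold': '1'}
print(base + urlencode(params))
# ...?productId=1190881&score=3&sortType=5&page=1&pageSize=10&isShadowSku=0&rid=0&fold=1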
@@ -74,33 +78,36 @@ def getData(self,maxPage,score,):
             param,url = self.getParamUrl(id,i,score)
             print("正在爬取评论信息>>>>>>>>>第:%d 个,第 %d 页"%(j,i))
             try:
-                response = requests.get(url,headers = header,params=param)
+                response = requests.get(url, headers=header, params=param)
             except Exception as e:
                 logging.warning(e)
                 break
-            if response.status_code !=200:
+            if response.status_code != 200:
                 logging.warning("状态码错误,爬虫连接异常")
                 continue
-            time.sleep(random.randint(5,10)) # Random delay to avoid an IP ban
-            if response.text=='':
+            time.sleep(random.randint(5, 10))  # Random delay to avoid an IP ban
+            if response.text == '':
                 logging.warning("未爬取到信息")
                 continue
             try:
                 res_json = json.loads(response.text)
             except Exception as e:
                 logging.warning(e)
                 continue
-            if len((res_json['comments']))==0:
-                logging.warning("页面次数已到:%d,超出范围"%(i))
+            if len((res_json['comments'])) == 0:
+                logging.warning("页面次数已到:%d,超出范围" % (i))
                 break
-            logging.info("正在爬取%s %s 第 %d"%(self.categlory,self.comtype[score],i))
+            logging.info("正在爬取%s %s 第 %d" %
+                         (self.categlory, self.comtype[score], i))
             for cdit in res_json['comments']:
-                comment = cdit['content'].replace("\n",' ').replace('\r',' ')
+                comment = cdit['content'].replace(
+                    "\n", ' ').replace('\r', ' ')
                 comments.append(comment)
                 scores.append(cdit['score'])
-        #savepath = './'+self.categlory+'_'+self.comtype[score]+'.csv'
-        logging.warning("已爬取%d 条 %s 评价信息"%(len(comments),self.comtype[score]))
-        # Store in a list and lightly process the comments
+        # savepath = './'+self.categlory+'_'+self.comtype[score]+'.csv'
+        logging.warning("已爬取%d 条 %s 评价信息" %
+                        (len(comments), self.comtype[score]))
+        # Store in a list and lightly process the comments
         remarks = []
         for i in range(len(comments)):
             rst = re.findall(zhon.hanzi.sentence,comments[i])
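Context for the clean-up step at the end of this hunk: zhon.hanzi.sentence is a regex that matches runs of Chinese text ending in sentence-final punctuation, so re.findall splits each raw comment into sentences. A small demonstration, assuming zhon is installed:

import re
import zhon.hanzi

comment = '物流很快!包装也好。下次还会再买'
print(re.findall(zhon.hanzi.sentence, comment))
# ['物流很快!', '包装也好。'] (the unterminated tail is not matched)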
@@ -132,17 +139,16 @@ def getData(self,maxPage,score,):
         ]
         return result
 
-
-    def solvedata(self,remarks):
-        # Split the data into sentences
+    def solvedata(self, remarks):
+        # Split the data into sentences
         sentences = []
         for i in range(len(remarks)):
             for j in range(len(remarks[i])):
                 sentences.append(remarks[i][j])
-        print("爬取的评价结果:"+ str(sentences))
+        print("爬取的评价结果:" + str(sentences))
         return sentences
 
-    # Save to a MySQL database
+    # Save to a MySQL database
     '''
     db = pymysql.connect(host='主机名',user='用户名',password='密码',db='数据库名',charset='utf8mb4')
     mycursor = db.cursor()
@@ -161,18 +167,18 @@ def solvedata(self,remarks):
     logging.warning("已存入数据库")
     '''
 
-    # Save to a CSV file
+    # Save to a CSV file
     '''
     with open(savepath,'a+',encoding ='utf8') as f:
         for i in range(len(comments)):
             f.write("%d\t%s\t%s\n"%(i,scores[i],comments[i]))
     logging.warning("数据已保存在 %s"%(savepath))
     '''
-
-# Test case
-if __name__ =="__main__":
+
+
+# Test case
+if __name__ == "__main__":
     list = ['商品名']
     for item in list:
         spider = JDSpider(item)
         spider.getData(2, 3)
-
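For reference, a sketch of how the test block is meant to be used once real search keywords replace the '商品名' placeholder (network access to JD is assumed; categories is an illustrative rename that also avoids shadowing the built-in list):

if __name__ == "__main__":
    categories = ['手机', '耳机']           # real JD search keywords
    for item in categories:
        spider = JDSpider(item)             # __init__ already fetches product ids
        spider.getData(maxPage=2, score=3)  # up to 2 pages of positive reviews each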