import logging
import random
import re
+import sys
import time
from urllib.parse import quote, urlencode

from lxml import etree


+# Reference: https://github.com/fxsjy/jieba/blob/1e20c89b66f56c9301b0feed211733ffaa1bd72a/jieba/__init__.py#L27
+log_console = logging.StreamHandler(sys.stderr)
+default_logger = logging.getLogger('jdspider')
+default_logger.setLevel(logging.DEBUG)
+default_logger.addHandler(log_console)
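+# Note: DEBUG mirrors the jieba pattern referenced above; a consumer of this
+# module can quiet the spider with logging.getLogger('jdspider').setLevel(logging.WARNING).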
+
+
class JDSpider:
    # Spider implementation class: construct it with a product category (e.g.
    # phones or computers), then call getData to crawl the comment data.
    def __init__(self, categlory):
@@ -67,7 +75,7 @@ def getHeaders(self, productid):  # Unlike the initial self.header, this one is used when crawling
    def getId(self):  # Fetch product ids, used to build each product page's URL; results are stored in the self.productId list
        response = requests.get(self.startUrl, headers=self.headers)
        if response.status_code != 200:
-            logging.warning("状态码错误,爬虫连接异常!")
+            default_logger.warning("状态码错误,爬虫连接异常!")
        html = etree.HTML(response.text)
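        # Each listing item <li class="gl-item"> exposes its product id in the data-sku attribute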
        return html.xpath('//li[@class="gl-item"]/@data-sku')

@@ -86,48 +94,48 @@ def getData(self, maxPage, score,):  # maxPage is the maximum number of comment pages to crawl,
            header = self.getHeaders(id)
            for i in range(1, maxPage):
                param, url = self.getParamUrl(id, i, score)
-                print("正在爬取评论信息>>>>>>>>>第:%d 个,第 %d 页" % (j, i))
+                default_logger.info("正在爬取评论信息>>>>>>>>>第:%d 个,第 %d 页" % (j, i))
                try:
                    response = requests.get(url, headers=header, params=param)
                except Exception as e:
-                    logging.warning(e)
+                    default_logger.warning(e)
                    break
                if response.status_code != 200:
-                    logging.warning("状态码错误,爬虫连接异常")
+                    default_logger.warning("状态码错误,爬虫连接异常")
                    continue
                time.sleep(random.randint(5, 10))  # Throttle requests so the crawler's IP does not get banned
                if response.text == '':
-                    logging.warning("未爬取到信息")
+                    default_logger.warning("未爬取到信息")
                    continue
                try:
                    res_json = json.loads(response.text)
                except Exception as e:
-                    logging.warning(e)
+                    default_logger.warning(e)
                    continue
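                # An empty comments array means we have paged past the last
                # available page, so stop paging for this product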
                if len(res_json['comments']) == 0:
-                    logging.warning("页面次数已到:%d,超出范围" % (i))
+                    default_logger.warning("页面次数已到:%d,超出范围" % (i))
                    break
-                logging.info("正在爬取%s %s 第 %d" %
-                             (self.categlory, self.comtype[score], i))
+                default_logger.info("正在爬取%s %s 第 %d" %
+                                    (self.categlory, self.comtype[score], i))
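                # Collect text and star rating in parallel: scores[k] rates comments[k]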
                for cdit in res_json['comments']:
                    comment = cdit['content'].replace(
                        "\n", ' ').replace('\r', ' ')
                    comments.append(comment)
                    scores.append(cdit['score'])
        # savepath = './'+self.categlory+'_'+self.comtype[score]+'.csv'
-        logging.warning("已爬取%d 条 %s 评价信息" %
-                        (len(comments), self.comtype[score]))
+        default_logger.warning("已爬取%d 条 %s 评价信息" %
+                               (len(comments), self.comtype[score]))
        # Store the reviews in a list and lightly clean them up
        remarks = []
        for i in range(len(comments)):
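            # zhon.hanzi.sentence is zhon's regex for a complete Chinese sentence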
            rst = re.findall(zhon.hanzi.sentence, comments[i])
            if len(rst) == 0 or rst in (['。'], ['?'], ['!'], ['.'], [','], ['?'], ['!']):
-                logging.warning("拆分失败或结果不符(去除空格和标点符号):%s" % (rst))
+                default_logger.warning("拆分失败或结果不符(去除空格和标点符号):%s" % (rst))
            else:
                remarks.append(rst)
129137 if len (result ) == 0 :
130- logging .warning ("当前商品没有评价,使用默认评价" )
138+ default_logger .warning ("当前商品没有评价,使用默认评价" )
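            # Canned fallback reviews; the '$' placeholder presumably gets
            # replaced with the product name elsewhere in the pipeline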
            result = ["考虑买这个$之前我是有担心过的,因为我不知道$的质量和品质怎么样,但是看了评论后我就放心了。",
                      "买这个$之前我是有看过好几家店,最后看到这家店的评价不错就决定在这家店买 ",
                      "看了好几家店,也对比了好几家店,最后发现还是这一家的$评价最好。",
@@ -155,7 +163,7 @@ def solvedata(self, remarks):
        for i in range(len(remarks)):
            for j in range(len(remarks[i])):
                sentences.append(remarks[i][j])
-        print("爬取的评价结果:" + str(sentences))
+        default_logger.info("爬取的评价结果:" + str(sentences))
        return sentences
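
    # Usage sketch (illustrative values, not from this commit):
    #     spider = JDSpider('手机')              # one spider per product category
    #     spider.getData(maxPage=10, score=0)    # score selects a self.comtype bucket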

    # Save the results into a MySQL database