
Commit 77a3243
Set JDSpider default logging to the customized one
Parent: 5a7bf36

2 files changed: +27 −14 lines

auto_comment_plus.py

Lines changed: 5 additions & 0 deletions
@@ -629,6 +629,10 @@ def main(opts=None):
     jieba.default_logger = logging.getLogger('jieba')
     jieba.default_logger.setLevel(level=_logging_level)
     jieba.default_logger.addHandler(console)
+    # It's another hack!!!
+    jdspider.default_logger = logging.getLogger('spider')
+    jdspider.default_logger.setLevel(level=_logging_level)
+    jdspider.default_logger.addHandler(console)
 
     logger.debug('Successfully set up console logger')
     logger.debug('CLI arguments: %s', args)
@@ -644,6 +648,7 @@ def main(opts=None):
     handler.setFormatter(rawformatter)
     logger.addHandler(handler)
     jieba.default_logger.addHandler(handler)
+    jdspider.default_logger.addHandler(handler)
     logger.debug('Successfully set up file logger')
     logger.debug('Options passed to functions: %s', opts)
     logger.debug('Builtin constants:')
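Both hunks mirror the existing jieba workaround: main() re-points a library's module-level logger at the application's console handler (and, when logging to a file, the file handler too) so that every component logs through one configuration. Below is a minimal, self-contained sketch of that pattern; the 'lib' logger name, the level, and the format string are illustrative stand-ins, not taken from this repository.

import logging
import sys

# Stand-in for a library that, like jdspider below, wires up its own
# module-level logger with a bare stderr handler at import time.
lib_logger = logging.getLogger('lib')
lib_logger.setLevel(logging.DEBUG)
lib_logger.addHandler(logging.StreamHandler(sys.stderr))

# Application side: give the library the app's configured handler and level,
# the way main() does for jieba.default_logger and jdspider.default_logger.
console = logging.StreamHandler(sys.stderr)
console.setFormatter(logging.Formatter('%(asctime)s %(name)s %(levelname)s: %(message)s'))

lib_logger.handlers.clear()        # drop the library's bare handler
lib_logger.setLevel(logging.INFO)  # app-chosen level (_logging_level in main())
lib_logger.addHandler(console)

lib_logger.info('now formatted by the application handler')

Note that the sketch removes the library's bare handler explicitly; the commit instead rebinds jdspider.default_logger to a fresh logger named 'spider', which abandons the old handler without touching it (see the note after the jdspider.py diff).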

jdspider.py

Lines changed: 22 additions & 14 deletions
@@ -6,6 +6,7 @@
 import logging
 import random
 import re
+import sys
 import time
 from urllib.parse import quote, urlencode
 
@@ -14,6 +15,13 @@
 from lxml import etree
 
 
+# Reference: https://github.com/fxsjy/jieba/blob/1e20c89b66f56c9301b0feed211733ffaa1bd72a/jieba/__init__.py#L27
+log_console = logging.StreamHandler(sys.stderr)
+default_logger = logging.getLogger('jdspider')
+default_logger.setLevel(logging.DEBUG)
+default_logger.addHandler(log_console)
+
+
 class JDSpider:
     # Spider class: construct it with a product category (e.g. phone, computer), then call getData to crawl.
     def __init__(self, categlory):
@@ -67,7 +75,7 @@ def getHeaders(self, productid):  # unlike the initial self.header, this is used when crawling
     def getId(self):  # fetch product ids to build product page URLs; results are stored in the self.productId list
         response = requests.get(self.startUrl, headers=self.headers)
         if response.status_code != 200:
-            logging.warning("状态码错误,爬虫连接异常!")
+            default_logger.warning("状态码错误,爬虫连接异常!")
         html = etree.HTML(response.text)
         return html.xpath('//li[@class="gl-item"]/@data-sku')
 
@@ -86,48 +94,48 @@ def getData(self, maxPage, score,):  # maxPage is the maximum number of comment pages to crawl
             header = self.getHeaders(id)
             for i in range(1, maxPage):
                 param, url = self.getParamUrl(id, i, score)
-                print("正在爬取评论信息>>>>>>>>>第:%d 个,第 %d 页" % (j, i))
+                default_logger.info("正在爬取评论信息>>>>>>>>>第:%d 个,第 %d 页" % (j, i))
                 try:
                     response = requests.get(url, headers=header, params=param)
                 except Exception as e:
-                    logging.warning(e)
+                    default_logger.warning(e)
                     break
                 if response.status_code != 200:
-                    logging.warning("状态码错误,爬虫连接异常")
+                    default_logger.warning("状态码错误,爬虫连接异常")
                     continue
                 time.sleep(random.randint(5, 10))  # random delay to avoid an IP ban
                 if response.text == '':
-                    logging.warning("未爬取到信息")
+                    default_logger.warning("未爬取到信息")
                     continue
                 try:
                     res_json = json.loads(response.text)
                 except Exception as e:
-                    logging.warning(e)
+                    default_logger.warning(e)
                     continue
                 if len((res_json['comments'])) == 0:
-                    logging.warning("页面次数已到:%d,超出范围" % (i))
+                    default_logger.warning("页面次数已到:%d,超出范围" % (i))
                     break
-                logging.info("正在爬取%s %s 第 %d" %
-                             (self.categlory, self.comtype[score], i))
+                default_logger.info("正在爬取%s %s 第 %d" %
+                                    (self.categlory, self.comtype[score], i))
                 for cdit in res_json['comments']:
                     comment = cdit['content'].replace(
                         "\n", ' ').replace('\r', ' ')
                     comments.append(comment)
                     scores.append(cdit['score'])
         # savepath = './'+self.categlory+'_'+self.comtype[score]+'.csv'
-        logging.warning("已爬取%d 条 %s 评价信息" %
-                        (len(comments), self.comtype[score]))
+        default_logger.warning("已爬取%d 条 %s 评价信息" %
+                               (len(comments), self.comtype[score]))
         # store into a list and lightly clean the comments
         remarks = []
         for i in range(len(comments)):
             rst = re.findall(zhon.hanzi.sentence, comments[i])
             if len(rst) == 0 or rst == ['。'] or rst == ['?'] or rst == ['!'] or rst == ['.'] or rst == [','] or rst == ['?'] or rst == ['!']:
-                logging.warning("拆分失败或结果不符(去除空格和标点符号):%s" % (rst))
+                default_logger.warning("拆分失败或结果不符(去除空格和标点符号):%s" % (rst))
             else:
                 remarks.append(rst)
         result = self.solvedata(remarks=remarks)
         if len(result) == 0:
-            logging.warning("当前商品没有评价,使用默认评价")
+            default_logger.warning("当前商品没有评价,使用默认评价")
             result = ["考虑买这个$之前我是有担心过的,因为我不知道$的质量和品质怎么样,但是看了评论后我就放心了。",
                       "买这个$之前我是有看过好几家店,最后看到这家店的评价不错就决定在这家店买 ",
                       "看了好几家店,也对比了好几家店,最后发现还是这一家的$评价最好。",
@@ -155,7 +163,7 @@ def solvedata(self, remarks):
         for i in range(len(remarks)):
             for j in range(len(remarks[i])):
                 sentences.append(remarks[i][j])
-        print("爬取的评价结果:" + str(sentences))
+        default_logger.info("爬取的评价结果:" + str(sentences))
         return sentences
 
     # save to the MySQL database
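Why does assigning jdspider.default_logger from auto_comment_plus.py change where the spider's messages go? Python resolves module-level names at call time, so every default_logger.warning(...) inside jdspider looks up the current value of the global; rebinding the attribute therefore redirects all later calls. A minimal single-file sketch of the mechanism (logger names here are illustrative, not the project's):

import logging
import sys

# Library-style setup: a module-level logger with a bare handler,
# as at the top of jdspider.py above.
default_logger = logging.getLogger('libdemo')
default_logger.addHandler(logging.StreamHandler(sys.stderr))


def crawl_step():
    # The global name is looked up on every call, not captured at def time.
    default_logger.warning('inside the library')


crawl_step()  # emitted through the bare handler

# "Application" side: rebind the module-level name to a configured logger,
# analogous to jdspider.default_logger = logging.getLogger('spider').
app_handler = logging.StreamHandler(sys.stderr)
app_handler.setFormatter(logging.Formatter('%(name)s %(levelname)s: %(message)s'))
app_logger = logging.getLogger('spider-demo')
app_logger.addHandler(app_handler)

default_logger = app_logger
crawl_step()  # now formatted by the application's handler

The old 'libdemo' logger keeps its bare handler but receives no further records, which is why the commit's rebinding works without removing anything from the library.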
