-
Notifications
You must be signed in to change notification settings - Fork 38
Expand file tree
/
Copy pathsimplecrawlerWAP.py
More file actions
1956 lines (1765 loc) · 84.4 KB
/
simplecrawlerWAP.py
File metadata and controls
1956 lines (1765 loc) · 84.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
# # -*- coding: utf-8 -*-
import sys
"""
这是一个新浪微群爬虫。目前只有命令行界面。
使用方法:下回分解。
"""
__author__ = 'David Lau'
__version__= '0.1'
__nonsense__ = 'weiqun crawler'
#导入sina_reptile
from sina_reptile import *
#调试HTTP Web服务
import httplib
httplib.HTTPConnection.debuglevel = 0
#数据库 Beautiful Soup库
import sqlite3 as sqlite
from bs4 import BeautifulSoup
#url处理
import urlparse
#处理gzip
import StringIO
import StringIO
import gzip
import urllib2
import re
import os
import sys
import datetime
import time
import itertools
import termios
import fcntl
"""
Login to Sina Weibo with cookie
"""
PAGE_REQUEST_ERROR = '请求页不存在'
PAGE_REDIRECT = '如果没有自动跳转,请'
NOBODY_POST_THIS_PAGE = '你加入的群还没有人说话!'
NO_REPLY = '还没有人针对这条微博发表评论!'
WEIBO_SQUARE ='微博广场'
USERPROFILE_PREFIX = 'http://weibo.cn/%s/profile'#uid个人主页
#一般fans/follows第i页的url: http://weibo.cn/uid/fans?page=[i]
FANPAGE_PREFIX = 'http://weibo.cn/%s/fans'#uid的粉丝用户页面
FOPAGE_PREFIX = 'http://weibo.cn/%s/follow'#uid的关注用户页面
FANPAGE_SURFFIX = '?page=%s&st=c23d'
FOPAGE_SURFFIX = '?page=%s&st=c23d'
WEIBO_PER_PAGE = 10
MEET_TRAP = 99
#trap times 记录次数 到一定值则换cookie
TRAP_TIMES = 0
WEIQUN_BASE='http://q.weibo.cn/group/'
def fromUTF8toSysCoding(html):
    """Transcode a UTF-8 byte string into the filesystem encoding.

    Pages fetched from the network arrive as UTF-8; the local console /
    filesystem may expect a different codec, so re-encode accordingly.
    """
    return html.decode('utf-8').encode(sys.getfilesystemencoding())
def degzip(compresseddata):
    """Decompress *compresseddata* when it starts with the gzip magic bytes.

    Returns the decompressed HTML for gzip payloads; anything else is
    handed back untouched.
    """
    if compresseddata[0:3] != '\x1f\x8b\x08':
        return compresseddata
    stream = StringIO.StringIO(compresseddata)
    return gzip.GzipFile(fileobj=stream).read()
def storehtml(html,path,url=None,showdetail=True):
    '''
    Save *html* to *path* on disk (the page came from *url*).

    Creates the parent directory when it does not exist yet; mode 'w'
    truncates any previous copy of the file before writing.
    '''
    (basename,filename) = os.path.split(path)
    # Guard against basename=='' for bare file names (makedirs('') raises).
    if basename and not os.path.exists(basename):
        os.makedirs(basename)
    try:
        # 'with' guarantees the handle is closed; the original left fd
        # unbound in its finally-clause when open() itself failed.
        with open(path,'w') as fd:
            fd.write(html)
        if showdetail:
            # str(url) guards against the default url=None (the original
            # raised TypeError on '+url' in that case).
            print("网页保存到本地"+path+',\turl='+str(url))
    except Exception as e:
        # Bug fix: the original printed the SUCCESS message here too;
        # report the failure instead of pretending the save worked.
        if showdetail:
            print("网页保存失败"+path+',\turl='+str(url)+',错误:'+str(e))
def urlprocessor(url,header,absUrl=True):
    '''
    Download the page at *url* sending the cookie-carrying *header*; if
    the response is gzip-compressed, decompress it; when absUrl is True,
    rewrite relative links into absolute ones.

    The GET headers (including the cookie) must mimic what the browser
    sends (inspect with Firebug), otherwise the server answers with a
    "page not found" stub.

    Returns: None on download error,
             html (str) -- the downloaded page content otherwise.
    '''
    try:
        req = urllib2.Request(url, headers=header)
        f = urllib2.urlopen(req)
        html = f.read()
        # NOTE(review): the triple-quoted block below is dead code kept
        # from an earlier revision; the live copy follows after except.
        '''
        #解压缩get得到的网页gzip包
        if(f.headers.get('Content-Encoding') == 'gzip'):
            html = degzip(html)
        #把相对地址替换成绝对地址
        if absUrl:
            html = getAbsUrl(url,html)
        return html
        '''
    except Exception,E:
        print "urlprocessor():下载页面错误:"+url+',错误代码:'+str(E)
        return None
    if html:
        #decompress the body when the server flagged gzip encoding
        if(f.headers.get('Content-Encoding') == 'gzip'):
            html = degzip(html)
        #rewrite relative addresses into absolute ones
        if absUrl:
            html = getAbsUrl(url,html)
        return html
    else:
        return None
def getAbsUrl(base,html):
    """Rewrite every relative <a href="..."> target in *html* as an
    absolute URL joined against *base*.

    Anchors whose target already contains ':' (i.e. carry a scheme) are
    left untouched; the substitution is done with plain string replace.
    """
    anchor_pat = re.compile('<a href="(.+?)"')
    for target in anchor_pat.findall(html):
        if ':' in target:
            continue  # already absolute (has a scheme) -- skip
        absolute = urlparse.urljoin(base, target)
        html = html.replace('<a href="' + target, '<a href="' + absolute)
    return html
def testgetAbsUrl():
    '''
    Ad-hoc smoke test: run getAbsUrl over a captured weiqun snippet and
    print the rewritten markup for eyeballing.
    '''
    snippet = '''<div><a href="/profile/1808877652">Nulooper</a><span class="ctt">:NLP初学者报道,希望多多指教</span> <a href="/group/viewRt/225241/103r08nr6th">转发</a> <a href="/group/review/225241/103r08nr6th?&#cmtfrm" class="cc">评论[0]</a> <span class="ct">01月26日 19:30</span></div>'''
    base = 'http://q.weibo.cn/group/225241/'
    print(getAbsUrl(base, snippet))
class Weiqun_crawler:
    '''
    Crawler for a Sina Weibo "weiqun" (micro-group): downloads group
    pages plus their replies/retweets, and records pages, users and
    download progress in two sqlite databases.
    '''
def __init__(self,weibodbname,userdbname,usersdir,savedir,weiqunID,startpage,endpage,headers_weiqun,headers_login,cookies_list,loginurl,begin_with_cookie_n=None):
    '''
    Initialise the crawler: remember the save paths and the weiqun
    (group) id, set up the rotating cookie pool, and open/initialise
    both sqlite databases.
    '''
    print '----------- 初始化微群爬虫 id:%s -----------'%weiqunID
    #header / cookie bookkeeping
    self.begin_with_cookie_n = begin_with_cookie_n  # index of the cookie to start with (may be None)
    self.cookies = []
    self.cookies.extend(cookies_list)
    self.cookie_iter = itertools.cycle(self.cookies)  # endless rotation over the cookie pool
    self.headers_weiqun = headers_weiqun
    self.headers_login = headers_login
    self.cur_cookie = ''
    self.change_cookie_times = 0 #how many times the cookie has been switched
    self.loginurl= loginurl
    self.init_cookie_headers()
    #directory the downloaded pages are saved to
    self.savedir = savedir
    self.weiqunID = str(weiqunID)
    self.usersdir = usersdir
    #range of weiqun pages to scan
    self.startpage = startpage
    self.endpage = endpage
    #database file names
    self.weibodbname = weibodbname
    self.userdbname = userdbname
    #weibo (post) database connection
    print self.weibodbname
    self.con_weibo = sqlite.connect(self.weibodbname)
    self.cur_weibo = self.con_weibo.cursor()
    #user database connection
    self.con_user = sqlite.connect(self.userdbname)
    self.cur_user = self.con_user.cursor()
    #create the db tables when missing
    self.createweibostable()
    self.createuserstable()
    #extracted weibo information
    self.allweiboinfos = [] #weiboinfo={raw,id,content...} -- see db table weibos
    self.replyinfos=[]#replyinfo{weiboid,...} items (same shape as db table weibos)
    self.rtinfos=[]#same shape, for retweets
    self.profiles=[]#profile={userid,username,weibos count,followers,followings}
    self.failed_reply_paths=[]#reply.html paths whose analysis failed; re-crawl them
    self.failed_rt_paths=[]#same, for rt.html
    #status codes returned by crawl functions / stored in the download table
    self.STATUS_FAILED = 2
    self.STATUS_DOWNLOADED = 1
    self.STATUS_UNDOWNLOAD = 0
    #content of the previous crawl, used by self.is_trap to spot repeats
    self.last_crawl_html = ''
    #weiqun pages (ints) still to download
    self.weiqun_pages2download=[]
def __del__(self):
    """Destructor: release both sqlite connections when the crawler dies."""
    for connection in (self.con_weibo, self.con_user):
        connection.close()
def init_cookie_headers(self):
    '''
    Pick the cookie the crawl starts with and stamp it into both header
    dicts (weiqun headers and login headers).

    Uses self.cookies[self.begin_with_cookie_n] when that index is a
    valid position in the pool; otherwise falls back to cookie #0.
    '''
    n = self.begin_with_cookie_n
    # Bug fix: the original tested `len(self.cookies) >= n`, which let
    # n == len(self.cookies) through and raised IndexError on lookup.
    if n is not None and 0 <= n < len(self.cookies):
        self.cur_cookie = self.cookies[n]
    else:
        #no (valid) starting index configured -> use the first cookie
        self.cur_cookie = self.cookies[0]
    self.headers_weiqun.update({'Cookie':self.cur_cookie})
    self.headers_login.update({'Cookie':self.cur_cookie})
    #print 'INIT COOKIE!!!!!!!!!!!!!!!!!!!!!'
    #self.test_login('http://weibo.cn')
def change_cookie_headers(self):
    """Rotate to the next cookie in the pool, refresh both header dicts,
    verify the login still works, and bump the switch counter."""
    next_cookie = self.cookie_iter.next()
    self.cur_cookie = next_cookie
    for hdr in (self.headers_weiqun, self.headers_login):
        hdr.update({'Cookie': next_cookie})
    print('CHANGE COOKIE!!!!!!!!!!!!!!!!!!!!!')
    self.test_login('http://weibo.cn')
    self.change_cookie_times += 1
def get_user_relation_txt(self):
    '''
    Read the users of weiqun self.weiqunID from its db, then fetch those
    users' relations (follower_uid, target_uid) from users.db.
    Output: user_relation_weiqunID.txt (Win CRLF Ansi?)
                one "follower_uid<TAB>target_uid" pair per line -- the
                first uid follows the second
            user_list_all_weiqunID.txt (CRLF)
                one line per unique uid that appears in any relation
    Returns: True  -- success
             False -- failure while reading a database
             None  -- the weiqun db holds no users
    '''
    print 'get_user_relation_txt: 正在生成用户关系对txt weiqun=%s'% str(self.weiqunID)
    count = 0
    #all distinct users of this weiqun
    sql_weiqundb = ''' SELECT DISTINCT userid FROM weibos ;'''
    #all relations a given user takes part in (either side)
    sql_usersdb = ''' SELECT followerid,userid FROM relation WHERE userid=='%s' or followerid=='%s' ;'''
    #-----------fetch the user list from the weiqun db---------------------------------------------------------
    try:
        self.cur_weibo.execute(sql_weiqundb)
        self.dbcommit()
    except Exception,E:
        print 'get_user_relation_txt 数据库操作错误sql:%s'%sql_weiqundb
        print E
        return False
    userids = []
    res = self.cur_weibo.fetchall()
    if len(res)<1:#nothing found -> return None
        print 'get_user_relation_txt:微群%s的数据库%s中没有用户'%(self.weiqunID,self.weiqunID+'.db')
        return None
    else:#got users -> collect them into userids
        for row in res:
            userid, = row
            #print userid
            #print type(userid)
            if userid not in userids:
                userids.append(str(userid))
    #-----------fetch the user relations from users.db---------------------------------------------------------
    print 'get_user_relation_txt:正在从users.db读取%d个用户的关系'%len(userids)
    relations = []
    for userid in userids:
        #print '\t获取用户关系uid:%s'%str(userid)
        try:
            self.cur_user.execute(sql_usersdb % (userid,userid))
            self.dbcommit()
        except Exception,E:
            print 'get_user_relation_txt 数据库操作错误sql:%s'%(sql_usersdb % (userid,userid))
            print E
            return False
        res2 = self.cur_user.fetchall()
        if len(res2)<1:#no relations for this user -> skip it
            print 'get_user_relation_txt:users.db中没有微群:%s,uid=%s的用户关系'%(self.weiqunID+'.db',userid)
            #return None
            continue
        else:#got relations -> collect the (follower, target) pairs
            for row in res2:
                followerid,userid = row
                relation = (followerid,userid)
                #!!!!!!!!! duplicates are possible here !!!!!!!!!!!
                relations.append(relation)
                count+=1
    #write to file: user_relation_weiqunID.txt (CRLF)
    path = '../weiqun/user_relation_%s.txt' % str(self.weiqunID)
    txtlines = []
    relations.sort()
    for rela in relations:
        followerid,userid = rela
        txtline = str(followerid) + '\t' + str(userid) +'\r\n'
        txtlines.append(txtline)
    with open(path,'w') as f:
        f.writelines(txtlines)
        f.close()
    #write to file: user_list_all_weiqunID.txt (CRLF)
    usernum=0
    path = '../weiqun/user_list_all_%s.txt' % str(self.weiqunID)
    txtlines = []
    all_uid = set([])#use set as dinstinct list,fast!!
    for rela in relations:
        followerid,userid = rela
        if followerid not in all_uid:
            all_uid.add(followerid)
        if userid not in all_uid:
            all_uid.add(userid)
    all_uid = [i for i in all_uid]
    all_uid.sort()
    for userid in all_uid:
        txtline = str(userid) +'\r\n'
        usernum+=1
        txtlines.append(txtline)
    with open(path,'w') as f:
        f.writelines(txtlines)
        f.close()
    print 'got_user_relation_txt: weiqun=%s'% (str(self.weiqunID))
    print '\t有关注关系',count
    print '\t所有用户(出现在关注关系中的)',usernum
def load_weiqun_pages2download(self):
    '''
    Task: collect the weiqun pages (weiqun id = self.weiqunID) that are
    still undownloaded or were trap pages.
    Mutates: self.weiqun_pages2download[] -- list of int page numbers
    Returns: self.weiqun_pages2download[]
             False -- db failure
    '''
    print 'load_weiqun_pages2download:读取微群%s的下载列表'%self.weiqunID
    #columns: path, url, status, type='weiqunpage', userid=weiqun id
    sql = '''SELECT path,url,status,type,userid,page FROM download WHERE userid == '%s' and type=='%s' '''%(str(self.weiqunID),'weiqunpage')
    try:
        self.cur_user.execute(sql)
        self.dbcommit()
    except Exception,E:
        print 'load_download_db_state数据库操作错误sql:%s'%sql
        print E
        return False
    res = self.cur_user.fetchall()
    if len(res)<1:#nothing recorded yet -> every page in [startpage, endpage) is pending
        allpages = [i for i in range(self.startpage, self.endpage)]
        self.weiqun_pages2download = allpages
        return self.weiqun_pages2download
    else:#some pages were recorded before
        self.weiqun_pages2download = [i for i in range(self.startpage, self.endpage)]
        #strike the already-downloaded pages from the pending list
        for row in res:
            path,url,status,type,weiqunid,page = row
            #drop a page when it is marked downloaded and still listed
            if status==self.STATUS_DOWNLOADED and \
            page in self.weiqun_pages2download:
                if page < self.endpage:#db-recorded pages must stay below the endpage argument
                    self.weiqun_pages2download.remove(page)
        print 'load_weiqun_pages2download:共%d个未下载'%len(self.weiqun_pages2download)
        return self.weiqun_pages2download
def update_download_db_state(self,url,path,type,status,page=None,userid=None,now=None):
    '''
    Upsert one row of the download-progress table (PK: url, path).

    Returns True on success, False when the REPLACE failed (the error is
    printed; e.g. page=None makes int(page) raise inside the try).
    '''
    if now is None:
        # NOTE(review): computed but never stored -- the datetime column
        # stays NULL; preserved for interface compatibility.
        now = datetime.datetime.now()
    try:
        # Bug fix: the original interpolated url/path with %-formatting
        # into the SQL text, which broke (and was injectable) as soon as
        # a value contained a quote; use bound parameters instead.
        self.cur_user.execute(
            'REPLACE INTO download(userid,type,page,status,url,path) VALUES (?,?,?,?,?,?);',
            (str(userid), str(type), int(page), int(status), str(url), str(path)))
        self.dbcommit()
    except Exception as E:
        print('update_download_db_state:无法replace项')
        print(E)
        return False
    return True
def load_download_db_state(self,url,path):
    '''
    Look up the download table of self.userdbname by its primary key
    (url, path).
    Returns: (userid,type,page,status,url,path,randurl,datetime) for the
             last matching row (the datetime slot is always None here),
             None when there is no match or the query failed.
    '''
    # Bound parameters instead of %-interpolation: quote-safe.
    sql = '''SELECT userid,type,page,status,url,path,randurl FROM download WHERE url == ? and path == ?'''
    try:
        self.cur_user.execute(sql, (str(url), str(path)))
        self.dbcommit()
    except Exception as E:
        print('load_download_db_state数据库操作错误sql:%s'%sql)
        print(E)
        return None
    res = self.cur_user.fetchall()
    if len(res)<1:#no match
        return None
    if len(res)>1:
        print('load_download_db_state得到多个sql查询结果,返回最后一个结果')
    # Bug fix: the original looped and returned on the FIRST row even
    # though the message above promises the last one.
    userid,type,page,status,url,path,randurl = res[-1]
    return userid,type,page,status,url,path,randurl,None
def update_download_list(self,endpage=None,weiqunid='',showdetail = False):
    '''
    Record in the download table of self.userdbname which weiqun pages
    of the given weiqun are downloaded, failed (trap) or still pending.

    endpage  -- last page to scan (defaults to self.endpage)
    weiqunid -- weiqun to scan (defaults to self.weiqunID)
    Returns: pages_undownload[] -- page numbers still to download
    '''
    pages_undownload = []
    #fall back to the crawler's own weiqun id / endpage when not given
    if weiqunid == '':
        weiqunid = str(self.weiqunID)
    if endpage == None:
        endpage = self.endpage
    #scan pages 1..endpage
    for i in range(1,endpage+1):
        weiqunUrl = WEIQUN_BASE + str(weiqunid)
        pageurl = weiqunUrl + '?page=' + str(i)
        path = self.savedir + '/' + str(weiqunid) + '?page=' + str(i) + '.html'
        #was this page downloaded before?
        try:
            # Bug fix: the original opened the file without ever closing
            # it; the with-block releases the handle in every case.
            with open(path,'r') as f:
                localhtml = f.read()
            if not self.is_weiqun_page_trap(localhtml,showdetail=False):
                #downloaded and not a trap -> status: downloaded
                if showdetail:
                    print('\t下载过且非陷阱,更新download table状态:已下载:%s,长度%d'%(path,len(localhtml)))
                # row format: path, url, status=1, type='weiqunpage', userid=weiqun id, page=i
                succ = self.update_download_db_state(pageurl,path,type='weiqunpage',status=self.STATUS_DOWNLOADED,page=i,userid=weiqunid)
                if not succ:
                    print('update_download_list:更新数据库失败')
            else:
                #trap page -> status: failed, needs re-download
                print('\t下了陷阱,更新download table状态:下载失败:%s,长度%d'%(path,len(localhtml)))
                pages_undownload.append(i)
                succ = self.update_download_db_state(pageurl,path,type='weiqunpage',status=self.STATUS_FAILED,page=i,userid=weiqunid)
                if not succ:
                    print('update_download_list:更新数据库失败')
        except Exception as e:
            #file missing -> never downloaded, status: pending
            print('\t没下载过,更新download table状态:待下载:%s'%(path))
            pages_undownload.append(i)
            succ = self.update_download_db_state(pageurl,path,type='weiqunpage',status=self.STATUS_UNDOWNLOAD,page=i,userid=weiqunid)
            if not succ:
                print('update_download_list:更新数据库失败')
    return pages_undownload
def weiqun_crawl_page(self,i):
    '''
    Download page *i* of weiqun self.weiqunID into self.savedir.
    Saved as: self.savedir/weiqunID?page=i (i is the page number).
    Side effect: download-table status set to "downloaded" on success.
    Returns: True  -- success
             False -- trap page or download error
    '''
    weiqunUrl = WEIQUN_BASE + str(self.weiqunID)
    pageurl = weiqunUrl + '?page=' + str(i)
    #path = self.savedir/weiqunID?page=2.html,same as http://q.weiqun.cn/group/weiqunID?page=2
    path = self.savedir + '/' + str(self.weiqunID) + '?page=' + str(i) + '.html'
    '''#用update_download_list 代替判断
    #判断是否下载过
    try:
        f = open(path,'r')
        localhtml=''
        lines = f.readlines()
        for line in lines:
            localhtml+=line
        #如果下载过(且不是陷阱页)就不下载了 返回True
        #return True
        if not self.is_weiqun_page_trap(localhtml):
            print '\t下载过且非陷阱,跳过:%s,长度%d'%(path,len(localhtml))
            return True
    except Exception as e:
        pass
    '''
    #download the page html (relative addresses rewritten as absolute)
    pagehtml = urlprocessor(pageurl,self.headers_weiqun,absUrl=True)
    #trap check; a trap sample is still stored for inspection
    if pagehtml:
        if self.is_weiqun_page_trap(pagehtml) :
            storehtml(pagehtml,path,pageurl)
            print "下载微群页面可能出错,错误样本:%s"%path
            return False
    else:
        #download failed
        return False
    #convert to the system encoding (disabled)
    #pagehtml = fromUTF8toSysCoding(pagehtml)
    storehtml(pagehtml,path,pageurl)
    #mark the page as successfully downloaded in the download table
    succ = self.update_download_db_state(pageurl,path,type='weiqunpage',status=self.STATUS_DOWNLOADED,page=i,userid=self.weiqunID)
    if not succ:
        print 'update_download_list:更新数据库失败'
    return True
def is_weiqun_page_trap(self,html,showdetail=True):
    """Heuristically decide whether *html* is an anti-crawler trap page.

    Trap signals: known failure phrases, a None/short body, or a body
    byte-identical to the previous crawl (server repeating itself).
    Side effect: caches the page in self.last_crawl_html so the next
    call can run the duplicate check.
    """
    if html:
        #known failure phrases the wap site serves/redirects to
        for marker in (PAGE_REQUEST_ERROR, NOBODY_POST_THIS_PAGE, PAGE_REDIRECT):
            if marker in html:
                if showdetail:
                    print('is_weiqun_page_trap陷阱:%s'%marker)
                return True
    if html is None:
        if showdetail:
            print('is_weiqun_page_trap陷阱: html is None')
        return True
    if len(html) < 4000:
        #genuine group pages are longer than this
        if showdetail:
            print('is_weiqun_page_trap陷阱:网页长度过小')
        return True
    #identical to the previous download -> anti-crawler repeat page
    if self.last_crawl_html == str(html):
        if showdetail:
            print("is_spider_trap遇到重复网页:%s"%str(html))
        self.last_crawl_html = ''
        return True
    #remember this page so the next call can spot a repeat
    self.last_crawl_html = str(html)
    return False
def rtreply_crawl(self,startpage,endpage,showdetail=False):
    '''
    Prerequisite: weiqun_crawl_page(), start_analyze_weibos(),
    end_analyze_weibos() ran, i.e. db table weibos holds original posts
    (isoriginal=1).
    Task: read each original weibo's reply/rt urls from the db and
    download them next to the weibo page, as
    ./weibo[i]/reply.html and ./weibo[i]/rt.html
    (Known to be slow; downloads could be parallelised.)
    '''
    print("开始从数据库中读取每条微博的评论`转发url信息,下载到本地磁盘")
    header = self.headers_weiqun
    self.cur_weibo.execute("SELECT weiboid,path,replyurl,rturl,reply,rt FROM weibos WHERE isoriginal = 1")
    count = 0
    rtcount=0
    replycount=0
    for row in self.cur_weibo.fetchall():
        count+=1
        weiboid,path,replyurl,rturl,reply,rt = row
        # Bug fix: path.rstrip('.html') strips any trailing run of the
        # characters . h t m l (e.g. 'weiboth.html' -> 'weibo'), not the
        # suffix; cut the extension off explicitly instead.
        if path.endswith('.html'):
            basedir = path[:-len('.html')]
        else:
            basedir = path
        #------------handle reply-------------------------------------
        #original weibos only (isoriginal=1): local path, reply url
        if reply!=0:
            replyhtml = urlprocessor(replyurl,header)
            # guard against urlprocessor() returning None (download error);
            # the original crashed on the 'in' test in that case
            if replyhtml is not None and ("请求页不存在 出错了" not in replyhtml):
                replycount+=1
                if showdetail: print("处理微博reply,第"+str(replycount)+'条reply:'+path+"的replyurl:"+replyurl)
                replypath = basedir+'/'+'reply.html'
                if showdetail: print("\t保存微博reply,第"+str(replycount)+'条reply到路径:'+replypath)
                storehtml(replyhtml,replypath,replyurl,showdetail)
            else:
                print("下载reply页出错:请求页不存在 出错了,路径:"+path)
        #------------handle rt (same pattern)-------------------------
        if rt!=0:
            rthtml = urlprocessor(rturl,header)
            if rthtml is not None and ( PAGE_REQUEST_ERROR not in rthtml):
                rtcount+=1
                if showdetail: print("处理微博rt,第"+str(rtcount)+'条rt:'+path+"的rturl:"+rturl)
                rtpath = basedir+'/'+'rt.html'
                if showdetail: print("\t保存微博rt,第"+str(rtcount)+'条rt到路径:'+rtpath)
                storehtml(rthtml,rtpath,rturl,showdetail)
            else:
                print("下载rt页出错:请求页不存在 出错了,路径:"+path)
    print("完成"+str(count)+("条微博的reply(%d) rt(%d)的下载,从:"%(replycount,rtcount))+ str(self.weibodbname))
def start_rtreply_analyze(self):
    '''
    Prerequisite: rtreply_crawl() saved each reply.html / rt.html below
    self.savedir.
    Task: walk savedir for weiqunid?page=[i]/weibo[j]/reply(rt).html and
    feed each file to self.reply_analyze / self.rt_analyze.
    Returns: (failed_reply_paths[], failed_rt_paths[]) -- files whose
    analysis failed and that should be re-crawled.
    '''
    print("开始分析微博rt,reply:读取本地目录%s下的所有rt,reply"%(self.savedir))
    countrt=0        #rt entries extracted
    countrtfail=0    #rt pages whose analysis failed
    countreplyfail=0 #reply pages whose analysis failed
    countreply=0     #reply entries extracted
    #walk: savedir / page dir / weibo dir / {rt.html, reply.html}
    for pagedir in os.listdir(self.savedir):
        pagepath = self.savedir +'/' + pagedir
        if not os.path.isdir(pagepath):
            continue
        for weibodir in os.listdir(pagepath):
            #pagepath e.g. ./NLP/225241?page=9 ; weibodir e.g. weibo1
            weibopath = pagepath + '/' + weibodir
            if not os.path.isdir(weibopath):
                continue
            rtpath = weibopath +'/rt.html'
            replypath = weibopath+'/reply.html'
            #---- analyse the rt page ----
            if os.path.isfile(rtpath):
                print('分析rt:'+ rtpath)  # bug fix: the original printed replypath here
                try:
                    # with-block replaces the open/finally/close dance and
                    # avoids closing an unbound handle when open() fails
                    with open(rtpath,'r') as f:
                        html=f.read()
                except Exception as E:
                    print('Weiqun_crawler.start_rtreply_analyze()打开本地缓存页面失败:'+str(rtpath))
                    continue#skip this page
                # Bug fix: rt_analyze(self,html) takes one argument; the
                # original also passed rtpath and raised TypeError.
                rtinfos = self.rt_analyze(html)
                if rtinfos is None:
                    #analysis failed -> remember the rt path for re-crawl
                    #(bug fix: the original recorded replypath here)
                    countrtfail+=1
                    self.failed_rt_paths.append(rtpath)
                else:
                    for rtinfo in rtinfos:
                        countrt+=1
                        self.rtinfos.append(rtinfo)
            #---- analyse the reply page ----
            if os.path.isfile(replypath):
                #print '分析reply:'+ replypath
                try:
                    with open(replypath,'r') as f:
                        html=f.read()
                except Exception as E:
                    print('Weiqun_crawler.start_rtreply_analyze()打开本地缓存页面失败:'+str(replypath))
                    continue#skip this page
                replyinfos = self.reply_analyze(html,replypath)
                if replyinfos is None:
                    #analysis failed -> remember for re-crawl
                    countreplyfail+=1
                    self.failed_reply_paths.append(replypath)
                else:
                    for replyinfo in replyinfos:
                        countreply+=1
                        self.replyinfos.append(replyinfo)
    print("完成%d条reply,%d条rt的分析,失败reply:%d,rt:%d条,从本地目录:%s"%(countreply,countrt,countreplyfail,countrtfail,self.savedir))
    #return the failed rt/reply paths; callers re-crawl them
    return self.failed_reply_paths,self.failed_rt_paths
def end_rtreply_analyze_to_db(self):
    '''
    Prerequisite: start_rtreply_analyze()/rtreply_analyze() filled
    replyinfos[] and rtinfos[] with parsed replies/retweets.
    Task: insert those items into database self.weibodbname.
    (rtinfos are only counted so far, not stored.)
    '''
    countreply = 0
    countrt =0
    for rtinfo in self.rtinfos:
        countrt+=1
        pass
    for replyinfo in self.replyinfos:
        countreply+=1
        #':' and quotes are stripped so the %-formatted INSERT stays valid
        value=(replyinfo['weiboid'].replace(":",""),\
        #replyinfo['raw'].replace(":","").replace('"','').replace("'",""),\
        '',
        replyinfo['location'].replace(":",""),\
        replyinfo['content'].replace(":","").replace('"','').replace("'",""),\
        #replyinfo['contentraw'].replace(":","").replace('"','').replace("'",""),\
        '',
        replyinfo['username'].replace(":",""),\
        replyinfo['datetime'].replace(":",""),\
        replyinfo['isreplyto'].replace(":",""),\
        replyinfo['replyurl'].replace(":",""),\
        replyinfo['isrtto'].replace(":",""),\
        replyinfo['rturl'].replace(":",""),\
        replyinfo['atwho'].replace(":",""),\
        replyinfo['reply'],\
        replyinfo['rt'],\
        replyinfo['isoriginal'],\
        )
        try:
            #every original/reply/retweet is one row of weibos, keyed by weiboid
            #print value
            self.cur_weibo.execute("""INSERT INTO weibos(weiboid,raw,pagelocation,content,contentraw,username,datetime,isreplyto,replyurl,isrtto,rturl,atwho,reply,rt,isoriginal) VALUES('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s',%d,%d,%d);""" %value)
        except sqlite.Error,E:
            print 'DB:weibos表中插入weiboinfo项(reply)出现异常:INSERT VALUES=' + str(value)
            print E
        finally:
            self.dbcommit()
    print "把%d/%d条reply/rt存入数据库:%s"%(countreply,countrt,self.weibodbname)
def reply_analyze(self,html,filename=None):
    '''
    Parse a reply page's html; wrap every reply into a replyinfo{} dict.

    filename -- local path of the html file (stored as 'location')
    Returns: replyinfos[] -- list of replyinfo{} dicts,
             None when the page is not a valid reply page (re-crawl it).
    '''
    replyinfos=[]
    soup = BeautifulSoup(html)
    title = soup.title.string
    if NO_REPLY in html or '评论列表' not in title:
        print '分析失败(无效reply页,可能需要重新爬取):reply_analyze(html,%s)'%filename
        return None
    else:#the page carries replies
        allc = soup.find_all("div", { "class" : "c" })
        #wap page layout
        subject = allc[1]#the weibo being commented on
        replies = allc[2:]#the individual reply entries
        isreplyto = subject['id']#id of the replied-to weibo, format M_vr02n0ha52
        #print isreplyto
        if isreplyto:
            for reply in replies:
                weiboid = None
                userid = ''
                content =''
                contentraw=''
                username = ''
                userpage = ''
                datetime = ''
                replyurl =''
                isrtto = ''
                rturl = ''
                rt=0
                replynum=0
                isoriginal = 3 #3 marks a reply item
                atwho = ''
                try:
                    weiboid = reply['id']
                    #print reply['id']#the reply id serves as weiboid, format C_1120421220327297
                except KeyError,E:
                    weiboid = None
                    pass
                if weiboid:
                    contentraw = str(reply)
                    content = str(reply.find('span',{'class':'ctt'}).get_text())
                    datetime = str(reply.find('span',{'class':'ct'}).get_text())
                    replyurl = str(reply.find('span',{'class':'cc'}).get('href'))
                    username = str(reply.a.get_text())
                    #extract WHO from the "回复@who:" prefix
                    r = re.compile(r'回复@(.*?):').search(content)
                    if r:
                        atwho = r.group(1)
                        #print atwho
                    replyinfo={}
                    if filename:
                        replyinfo.update({'location':filename})
                    else:
                        replyinfo.update({'location':''})
                    replyinfo.update({'weiboid':weiboid})
                    replyinfo.update({'raw':html})
                    replyinfo.update({'content':content})
                    replyinfo.update({'contentraw':contentraw})
                    replyinfo.update({'username':username})
                    replyinfo.update({'datetime':datetime})
                    replyinfo.update({'isreplyto':isreplyto})
                    replyinfo.update({'isrtto':isrtto})
                    replyinfo.update({'rt':rt})
                    replyinfo.update({'reply':replynum})
                    replyinfo.update({'replyurl':replyurl})
                    replyinfo.update({'rturl':rturl})
                    replyinfo.update({'isoriginal':isoriginal})
                    replyinfo.update({'atwho':atwho})
                    replyinfos.append(replyinfo)
            return replyinfos
    pass
def rt_analyze(self,html,filename=None):
    '''
    Stub for retweet-page analysis (counterpart of reply_analyze).

    html     -- page source of a rt.html file
    filename -- optional path of that file; accepted with a default so
                old one-argument calls keep working. The call site in
                start_rtreply_analyze() passes it, which previously
                raised TypeError because the parameter was missing.
    Returns an empty dict: no retweet fields are extracted yet.
    '''
    rtinfo={}
    soup = BeautifulSoup(html)  # parsed but unused until the stub is implemented
    return rtinfo
def test_login(self,url):
    '''
    Fetch *url* (typically weibo.cn) with the login headers (cookie) to
    check whether the session is still valid; prints the page's user bar
    so the operator can verify which account is logged in.
    '''
    header = self.headers_login
    #download and post-process the page
    html = urlprocessor(url,header,absUrl=True)
    #convert to the system encoding (disabled)
    #html = fromUTF8toSysCoding(html)
    #print the user-info block for manual verification
    print '测试登陆,请检查用户是否正确:'
    soup = BeautifulSoup(html)
    if soup.find("div", { "class" : "ut" }):
        print soup.find("div", { "class" : "ut" }).get_text()
    #regex-based alternative for extracting the user info (disabled)
    #pat_title = re.compile('<div class="ut">(.+?)</div>')
    #r = pat_title.search(html)
    #if r:
    #    print r.group(1)
    #store html to disk (disabled)
    #path = self.savedir + '/waplogin.html'
    #storehtml(html,path,url)
    #then open ./waplogin.html and check the user name is present
def dbcommit(self):
    """Commit any pending transactions on both database connections."""
    for connection in (self.con_weibo, self.con_user):
        connection.commit()
def dbtest(self):
    """Leftover manual db experiment: the sample INSERT is disabled, so
    this only issues a commit."""
    self.dbcommit()
def createuserstable(self):
    '''
    Create the user-side tables (relation, profile, download) when they
    do not exist yet.  sqlite raises OperationalError for an existing
    table; that is silenced on purpose.
    '''
    #relation: who follows whom
    try:
        self.cur_user.execute('CREATE TABLE relation(userid TEXT,followerid TEXT,PRIMARY KEY(userid,followerid))')
        self.dbcommit()
    except sqlite.OperationalError as E:
        pass  #table already exists
    #profile: per-user statistics
    try:
        # Typo fix: 'INTERGER' -> 'INTEGER'.  (sqlite mapped the typo to
        # INTEGER affinity anyway via the 'INT' substring rule, so the
        # change only corrects the declared type text.)
        self.cur_user.execute('CREATE TABLE profile(userid TEXT PRIMARY KEY,username TEXT,followers INTEGER,followings INTEGER,weibos INTEGER)')
        self.dbcommit()
    except sqlite.OperationalError as E:
        pass  #table already exists
    #download: page-download progress bookkeeping
    try:
        self.cur_user.execute('CREATE TABLE download(url TEXT,path TEXT,type TEXT ,status INTEGER,page INTEGER ,userid TEXT ,randurl TEXT ,datetime TIMESTAMP,PRIMARY KEY(url,path))')
        self.dbcommit()
    except sqlite.OperationalError as E:
        pass  #table already exists
def createweibostable(self):
    '''
    Create the weibos table when it does not exist yet.  Every original
    post, reply and retweet becomes one row, keyed by weiboid; the
    commented block below documents the meaning of each column.
    '''
    try:
        '''
        self.cur_weibo.execute('CREATE TABLE weibos\
        (weiboid TEXT PRIMARY KEY,#主键,每个微博分配一个独立的id=genweiboid() 暂用生成先后顺序分配\
        raw TEXT, #此条微博的全部源代码\
        path TEXT, #此条微博储存到本地磁盘的位置
        pagelocation TEXT, #此条微博所在的网页地址(是爬下来存放在本地磁盘的路径)\
        contentraw TEXT, #此条微博内容的源代码\
        content, #正文\
        userid TEXT, #发微博用户id\
        userpage TEXT, #发微博用户主页\
        username TEXT, #发微博用户名\
        datetime TEXT, #发微博日期时间\
        isreplyto TEXT, #是否是某条微博的评论\
        reply INTEGER, #微博的评论数\
        replyurl TEXT, #微博的评论超链接url\
        isrtto TEXT, #是否是某条微博的转发\
        rt INTEGER, #微博的转发数\
        rturl TEXT, #微博的转发超链接url\
        isoriginal INTEGER, #是原创微博吗 1:是 0:否\
        atwho TEXT, #这条微博at了谁,如:id1,id2,id3\
        )')
        '''
        self.cur_weibo.execute('CREATE TABLE weibos(weiboid TEXT PRIMARY KEY,path TEXT,pagelocation TEXT,content TEXT,userid TEXT,userpage TEXT,username TEXT,datetime TEXT,isreplyto TEXT,reply INTEGER,replyurl TEXT,isrtto TEXT,rt INTEGER,rturl TEXT,isoriginal INTEGER,atwho TEXT,raw TEXT,contentraw TEXT)')
        self.dbcommit()
    except sqlite.OperationalError,E:
        print 'DB建表weibos出现异常:'
        #print E
        pass #table already exists
def start_crawl_profiles_from_uid_in_weibodb(self,showdetail=False):
'''
前提:db:self.weibodbname有uid
任务:读取db:self.weibodbname的uid,下载uid的type:profile,fans,follow页到本地磁盘/uid/[type]/[i].html
db:写self.userdbname的download表
返回:True 完成退出(可能有几个没爬)
'''
header = self.headers_login
print "start_crawl_profiles_from_uid_in_weibodb:开始从数据库读取uid,从网络下载uid主页到本地磁盘self.usersdir/uid/profile.html"
try:
self.cur_weibo.execute("SELECT DISTINCT userid FROM weibos")
self.dbcommit()
except Exception,E:
print 'start_crawl_profiles_from_uid_in_weibodb:从db读取uid错误'
print E
return None
count_done = 0
count_invalid_uid = 0
count_trap = 0
count_skip = 0
list = self.cur_weibo.fetchall()
print '\t共有用户:%d个'%len(list)
for row in list:
#print type(row)#tuple
uid, = row
#print type(uid)#unicode
if uid==None or uid==0 or uid=='':
count_invalid_uid+=1
if showdetail:
print "start_crawl_profiles_from_uid_in_weibodb:无效uid:%s"%uid
continue
else:
type = 'profile'
page = 0
#下载uid的profile页,存到磁盘
html = self.crawl_user_page(uid, page, type, header,showdetail)
#处理下载好的html
if html is None:
#下载profile失败,则加入失败列表
count_trap+=1
if showdetail:
print 'start_crawl_profiles_from_uid_in_weibodb:下载用户profile失败,记录在download表上uid:%s' % uid
elif html == '':
count_skip+=1
else: