@@ -1,5 +1,6 @@
 import scrapy
 import scrapy.http.response.html
+from scrapy.spidermiddlewares.httperror import HttpError
 
 from v2ex_scrapy import v2ex_parser
 from v2ex_scrapy.DB import DB
@@ -9,11 +10,11 @@
 class V2exTopicSpider(scrapy.Spider):
     name = "v2ex-member"
 
-    def __init__(self, name=None, **kwargs):
-        super().__init__(name, **kwargs)
+    def __init__(self, start_id=1, end_id=635000, *args, **kwargs):
+        super().__init__(*args, **kwargs)
         self.db = DB()
-        self.start_id = 1
-        self.end_id = 635000
+        self.start_id = start_id
+        self.end_id = end_id
         self.logger.info(f"start from topic id {self.start_id}, end at {self.end_id}")
 
     def start_requests(self):
@@ -25,17 +26,20 @@ def start_requests(self):
                     errback=self.member_err,
                     cb_kwargs={"uid": i},
                 )
+            else:
+                self.logger.info(f"skip member id:{i}, because it exists")
 
     def parse(self, response: scrapy.http.response.html.HtmlResponse, uid: int):
         for i in v2ex_parser.parse_member(response):
             i.uid = uid
             yield i
 
     def member_err(self, failure):
-        yield MemberItem(
-            username="",
-            avatar_url="",
-            create_at=0,
-            social_link=[],
-            uid=failure.request.cb_kwargs["uid"],
-        )
+        if failure.check(HttpError):
+            yield MemberItem(
+                username="",
+                avatar_url="",
+                create_at=0,
+                social_link=[],
+                uid=failure.request.cb_kwargs["uid"],
+            )
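A minimal usage sketch, not part of this commit: Scrapy forwards spider arguments given on the command line with -a (for example, scrapy crawl v2ex-member -a start_id=1 -a end_id=1000) to __init__ as strings, so the new start_id/end_id parameters may need an explicit cast before they are used as numeric bounds:

    # Sketch only, assuming start_requests iterates over a numeric id range.
    def __init__(self, start_id=1, end_id=635000, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.db = DB()
        # Values passed via -a arrive as str; cast them so arithmetic and range() work.
        self.start_id = int(start_id)
        self.end_id = int(end_id)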