
 class V2exTopicSpider(scrapy.Spider):
     name = "v2ex"
-    UPDATE_TOPIC = False
-    # only work when UPDATE_TOPIC = True
+    FORCE_UPDATE_TOPIC = False
     UPDATE_COMMENT = True

     def __init__(self, name=None, **kwargs):
@@ -23,18 +22,20 @@ def __init__(self, name=None, **kwargs):
         self.logger.info(f"start from topic id {self.start_id}, end at {self.end_id}")

     def start_requests(self):
-        # comments and user info from the last run may not be fully crawled, so re-crawl the topic we stopped at
-        yield scrapy.Request(
-            url=f"https://www.v2ex.com/t/{self.start_id}",
-            callback=self.common_spider.parse_topic,
-            errback=self.common_spider.parse_topic_err,
-            cb_kwargs={"topic_id": self.start_id},
-        )
         for i in range(self.start_id + 1, self.end_id + 1):
-            if self.UPDATE_TOPIC or not self.db.exist(TopicItem, i):
+            if (
+                self.FORCE_UPDATE_TOPIC
+                or (not self.db.exist(TopicItem, i))
+                or (
+                    self.db.get_topic_comment_count(i)
+                    > self.db.get_comment_count_by_topic(i)
+                )
+            ):
                 yield scrapy.Request(
                     url=f"https://www.v2ex.com/t/{i}",
                     callback=self.common_spider.parse_topic,
                     errback=self.common_spider.parse_topic_err,
                     cb_kwargs={"topic_id": i},
                 )
+            else:
+                self.logger.info(f"skip topic {i}")
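
The new condition re-crawls a topic when the reply count recorded on the topic row exceeds the number of comment rows saved locally, which catches topics whose comments were only partially crawled before the previous run stopped. The diff does not show how the two db helpers work; a minimal sketch, assuming a SQLAlchemy session and hypothetical TopicItem.replies / CommentItem.topic_id columns (neither is confirmed by this diff), could look like:

from sqlalchemy import func

class Database:
    # hypothetical wrapper; only the pieces start_requests relies on
    def __init__(self, session):
        self.session = session

    def exist(self, model, item_id):
        # True if a row with this primary key is already stored
        return self.session.get(model, item_id) is not None

    def get_topic_comment_count(self, topic_id):
        # reply count as reported on the topic page when it was last saved
        topic = self.session.get(TopicItem, topic_id)
        return topic.replies if topic else 0  # .replies is an assumed column

    def get_comment_count_by_topic(self, topic_id):
        # comments actually persisted for this topic
        return (
            self.session.query(func.count(CommentItem.id))  # CommentItem is assumed
            .filter(CommentItem.topic_id == topic_id)
            .scalar()
        )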