
 class V2exTopicSpider(scrapy.Spider):
     name = "v2ex"
-    UPDATE_TOPIC = False
-    # only work when UPDATE_TOPIC = True
+    FORCE_UPDATE_TOPIC = False
     UPDATE_COMMENT = True

     def __init__(self, name=None, **kwargs):
@@ -23,18 +22,20 @@ def __init__(self, name=None, **kwargs):
         self.logger.info(f"start from topic id {self.start_id}, end at {self.end_id}")

     def start_requests(self):
-        # comments and user info from the last run may not be fully crawled, so re-crawl the topic we stopped at
-        yield scrapy.Request(
-            url=f"https://www.v2ex.com/t/{self.start_id}",
-            callback=self.common_spider.parse_topic,
-            errback=self.common_spider.parse_topic_err,
-            cb_kwargs={"topic_id": self.start_id},
-        )
         for i in range(self.start_id + 1, self.end_id + 1):
-            if self.UPDATE_TOPIC or not self.db.exist(TopicItem, i):
+            if (
+                self.FORCE_UPDATE_TOPIC
+                or (not self.db.exist(TopicItem, i))
+                or (
+                    self.db.get_topic_comment_count(i)
+                    > self.db.get_comment_count_by_topic(i)
+                )
+            ):
                 yield scrapy.Request(
                     url=f"https://www.v2ex.com/t/{i}",
                     callback=self.common_spider.parse_topic,
                     errback=self.common_spider.parse_topic_err,
                     cb_kwargs={"topic_id": i},
                 )
+            else:
+                self.logger.info(f"skip topic {i}")
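
The new condition re-crawls a topic when the reply count recorded on the topic row exceeds the number of comment rows saved locally, which catches topics whose comments were only partially crawled before the previous run stopped. The diff does not show how the two db helpers work; a minimal sketch, assuming a SQLAlchemy session and hypothetical TopicItem.replies / CommentItem.topic_id columns (neither is confirmed by this diff), could look like:

from sqlalchemy import func

class Database:
    # hypothetical wrapper; only the pieces start_requests relies on
    def __init__(self, session):
        self.session = session

    def exist(self, model, item_id):
        # True if a row with this primary key is already stored
        return self.session.get(model, item_id) is not None

    def get_topic_comment_count(self, topic_id):
        # reply count as reported on the topic page when it was last saved
        topic = self.session.get(TopicItem, topic_id)
        return topic.replies if topic else 0  # .replies is an assumed column

    def get_comment_count_by_topic(self, topic_id):
        # comments actually persisted for this topic
        return (
            self.session.query(func.count(CommentItem.id))  # CommentItem is assumed
            .filter(CommentItem.topic_id == topic_id)
            .scalar()
        )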