Skip to content

Commit 332be35

Browse files
committed
update the update-topic logic
1 parent 75494d5 commit 332be35

File tree

2 files changed

+19
-10
lines changed

2 files changed

+19
-10
lines changed

v2ex_scrapy/DB.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,3 +66,11 @@ def get_topic_comment_count(self, topic_id) -> int:
6666
if result is None or result[0] is None:
6767
return 0
6868
return int(result[0])
69+
70+
def get_comment_count_by_topic(self, topic_id) -> int:
71+
result = self.session.execute(
72+
text("select count(*) from comment where topic_id = :q"), {"q": topic_id}
73+
).fetchone()
74+
if result is None or result[0] is None:
75+
return 0
76+
return int(result[0])

v2ex_scrapy/spiders/V2exSpider.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,7 @@
88

99
class V2exTopicSpider(scrapy.Spider):
1010
name = "v2ex"
11-
UPDATE_TOPIC = False
12-
# only work when UPDATE_TOPIC = True
11+
FORCE_UPDATE_TOPIC = False
1312
UPDATE_COMMENT = True
1413

1514
def __init__(self, name=None, **kwargs):
@@ -23,18 +22,20 @@ def __init__(self, name=None, **kwargs):
2322
self.logger.info(f"start from topic id {self.start_id}, end at {self.end_id}")
2423

2524
def start_requests(self):
26-
# 之前的评论和用户信息可能没爬完,所以继续爬停止时的topic
27-
yield scrapy.Request(
28-
url=f"https://www.v2ex.com/t/{self.start_id}",
29-
callback=self.common_spider.parse_topic,
30-
errback=self.common_spider.parse_topic_err,
31-
cb_kwargs={"topic_id": self.start_id},
32-
)
3325
for i in range(self.start_id + 1, self.end_id + 1):
34-
if self.UPDATE_TOPIC or not self.db.exist(TopicItem, i):
26+
if (
27+
self.FORCE_UPDATE_TOPIC
28+
or (not self.db.exist(TopicItem, i))
29+
or (
30+
self.db.get_topic_comment_count(i)
31+
> self.db.get_comment_count_by_topic(i)
32+
)
33+
):
3534
yield scrapy.Request(
3635
url=f"https://www.v2ex.com/t/{i}",
3736
callback=self.common_spider.parse_topic,
3837
errback=self.common_spider.parse_topic_err,
3938
cb_kwargs={"topic_id": i},
4039
)
40+
else:
41+
self.logger.info(f"skip topic {i}")

0 commit comments

Comments
 (0)