
Commit e3619dd: add V2ex node spider
Parent: 65e455d

4 files changed: +166 −88 lines

v2ex_scrapy/items.py (2 additions, 2 deletions)

@@ -35,8 +35,8 @@ class TopicItem(Base):
     thank_count: Mapped[int] = mapped_column(nullable=False)
     favorite_count: Mapped[int] = mapped_column(nullable=False)

-    @classmethod
-    def err_topic(cls, topic_id: int):
+    @staticmethod
+    def err_topic(topic_id: int):
         return TopicItem(
             id_=topic_id,
             author="",
v2ex_scrapy/spiders/CommonSpider.py (new file: 104 additions, 0 deletions)

import math

import scrapy
import scrapy.http.response.html

from v2ex_scrapy import v2ex_parser
from v2ex_scrapy.DB import DB
from v2ex_scrapy.items import MemberItem, TopicItem


class CommonSpider:
    def __init__(
        self, logger, update_topic=False, update_member=False, update_comment=False
    ):
        self.db = DB()
        self.logger = logger
        self.UPDATE_MEMBER = update_member
        # only takes effect when UPDATE_TOPIC is enabled
        self.UPDATE_COMMENT = update_comment

    def parse_topic_err(self, failure):
        # errbacks may yield items just like callbacks; record a placeholder topic
        topic_id = failure.request.cb_kwargs["topic_id"]
        self.logger.warn(f"Crawl Topic Err {topic_id}")
        yield TopicItem.err_topic(topic_id=topic_id)

    def parse_topic(
        self, response: scrapy.http.response.html.HtmlResponse, topic_id: int
    ):
        self.logger.info(f"Crawl Topic {topic_id}")

        if response.status == 302:
            # requires login, or the account is too new to view the topic
            yield TopicItem.err_topic(topic_id=topic_id)
        else:
            for i in v2ex_parser.parse_topic_supplement(response, topic_id):
                yield i
            for topic in v2ex_parser.parse_topic(response, topic_id):
                yield topic
                for i in self.crawl_member(topic.author, response):
                    yield i
            for i in self.parse_comment(response, topic_id):
                yield i
            # crawl comments on the remaining pages
            topic_reply_count = int(
                response.css(
                    "#Main > div:nth-child(4) > div:nth-child(1) > span::text"
                ).re_first(r"\d+", "-1")
            )
            c = self.db.get_topic_comment_count(topic_id)
            if (
                # partially crawled, and comment updates are enabled
                (0 < c < topic_reply_count)
                and self.UPDATE_COMMENT
            ) or (
                # not crawled yet, and the topic has comments
                topic_reply_count > 0
                and c == 0
            ):
                total_page = math.ceil(topic_reply_count / 100)
                for i in range(max(2, math.ceil(c / 100)), total_page + 1):
                    for j in self.crawl_comment(topic_id, i, response):
                        yield j

    def crawl_comment(self, topic_id, page, response):
        yield response.follow(
            f"/t/{topic_id}?p={page}",
            callback=self.parse_comment,
            cb_kwargs={"topic_id": topic_id},
        )

    def parse_comment(self, response: scrapy.http.response.html.HtmlResponse, topic_id):
        for comment_item in v2ex_parser.parse_comment(response, topic_id):
            yield comment_item
            for i in self.crawl_member(comment_item.commenter, response):
                yield i

    def crawl_member(self, username, response: scrapy.http.response.html.HtmlResponse):
        if username != "" and (
            self.UPDATE_MEMBER or not self.db.exist(MemberItem, username)
        ):
            yield response.follow(
                f"/member/{username}",
                callback=self.parse_member,
                errback=self.member_err,
                cb_kwargs={"username": username},
            )

    def member_err(self, failure):
        username = failure.request.cb_kwargs["username"]
        self.logger.warn(f"Crawl Member Err {username}")
        yield MemberItem(
            username=username,
            avatar_url="",
            create_at=0,
            social_link=[],
            uid=-1,
        )

    def parse_member(
        self, response: scrapy.http.response.html.HtmlResponse, username: str
    ):
        self.logger.info(f"Crawl Member {username}")
        for i in v2ex_parser.parse_member(response=response):
            yield i
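
The sub-page condition in parse_topic encodes two resume cases: comments partially crawled with updating enabled, or never crawled at all. Page 1 is already parsed from the topic response itself, so fetching starts no earlier than page 2, and a partial crawl skips pages it has already covered. A minimal sketch of that arithmetic, assuming V2EX's 100 comments per page as the code does:

import math

COMMENTS_PER_PAGE = 100

def pages_to_fetch(reply_count: int, already_crawled: int) -> range:
    # page 1 comes with the topic page; resume from the page holding
    # the first comment not yet stored in the DB
    total_pages = math.ceil(reply_count / COMMENTS_PER_PAGE)
    first = max(2, math.ceil(already_crawled / COMMENTS_PER_PAGE))
    return range(first, total_pages + 1)

print(list(pages_to_fetch(250, 0)))    # [2, 3]: fresh topic, 3 pages total
print(list(pages_to_fetch(250, 120)))  # [2, 3]: comment 121 sits on page 2
print(list(pages_to_fetch(250, 210)))  # [3]: pages 1-2 already covered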
New file (the V2ex node spider): 49 additions, 0 deletions

import scrapy
import scrapy.http.response.html

from v2ex_scrapy.DB import DB
from v2ex_scrapy.items import TopicItem
from v2ex_scrapy.spiders.CommonSpider import CommonSpider


class V2exTopicSpider(scrapy.Spider):
    name = "v2ex-node"

    UPDATE_TOPIC_WHEN_REPLY_CHANGE = True
    UPDATE_COMMENT = True  # only takes effect when UPDATE_TOPIC_WHEN_REPLY_CHANGE = True

    def __init__(self, node="flamewar", *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.db = DB()
        self.node = node
        self.common_spider = CommonSpider(
            self.logger, update_comment=self.UPDATE_COMMENT
        )

    def start_requests(self):
        # walk the node's listing from page 552 down to page 1 (page count hard-coded)
        for i in range(552, 0, -1):
            yield scrapy.Request(
                url=f"https://www.v2ex.com/go/{self.node}?p={i}",
                callback=self.parse,
                cb_kwargs={"page": i},
            )

    def parse(self, response: scrapy.http.response.html.HtmlResponse, page: int):
        topics = [
            (int(x), int(y))
            for x, y in zip(
                response.xpath('//span[@class="item_title"]/a/@id').re(r"\d+"),
                response.xpath('//span[@class="item_title"]/a/@href').re(r"reply(\d+)"),
            )
        ]
        for i, reply_count in topics:
            if not self.db.exist(TopicItem, i) or (
                self.UPDATE_TOPIC_WHEN_REPLY_CHANGE
                and self.db.get_topic_comment_count(i) < reply_count
            ):
                yield scrapy.Request(
                    url=f"https://www.v2ex.com/t/{i}",
                    callback=self.common_spider.parse_topic,
                    errback=self.common_spider.parse_topic_err,
                    cb_kwargs={"topic_id": i},
                )
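
The comprehension in parse() pairs each topic's numeric id (from the anchor's id attribute) with its reply count (from the reply fragment in the href). A standalone sketch using parsel, the selector library underlying Scrapy, against hypothetical markup shaped to match those selectors:

from parsel import Selector

html = """
<span class="item_title">
  <a id="t_987654" href="/t/987654#reply42">Example topic</a>
</span>
"""
sel = Selector(text=html)
ids = sel.xpath('//span[@class="item_title"]/a/@id').re(r"\d+")
replies = sel.xpath('//span[@class="item_title"]/a/@href').re(r"reply(\d+)")
print([(int(x), int(y)) for x, y in zip(ids, replies)])  # [(987654, 42)]

One caveat worth noting: .re() drops non-matching entries, so if any topic link on the page lacks a reply fragment, the two lists fall out of alignment and zip pairs ids with the wrong counts.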

v2ex_scrapy/spiders/V2exSpider.py (11 additions, 86 deletions)

@@ -1,116 +1,41 @@
-import math
-
 import scrapy
 import scrapy.http.response.html
 
-from v2ex_scrapy import v2ex_parser
 from v2ex_scrapy.DB import DB
-from v2ex_scrapy.items import MemberItem, TopicItem
+from v2ex_scrapy.items import TopicItem
+from v2ex_scrapy.spiders.CommonSpider import CommonSpider
 
 
 class V2exTopicSpider(scrapy.Spider):
     name = "v2ex"
     start_id = 1
     end_id = 1000000
     UPDATE_TOPIC = False
-    UPDATE_COMMENT = False
-    UPDATE_MEMBER = False
+    # only takes effect when UPDATE_TOPIC = True
+    UPDATE_COMMENT = True
 
     def __init__(self, name=None, **kwargs):
         super().__init__(name, **kwargs)
         self.db = DB()
         self.start_id = self.db.get_max_topic_id()
+        self.common_spider = CommonSpider(
+            self.logger, update_comment=self.UPDATE_COMMENT
+        )
         self.logger.info(f"start from topic id {self.start_id}, end at {self.end_id}")
 
     def start_requests(self):
         # comments and member info from the previous run may be incomplete, so re-crawl the topic where it stopped
         yield scrapy.Request(
             url=f"https://www.v2ex.com/t/{self.start_id}",
-            callback=self.parse,
+            callback=self.common_spider.parse_topic,
+            errback=self.common_spider.parse_topic_err,
             cb_kwargs={"topic_id": self.start_id},
         )
         for i in range(self.start_id + 1, self.end_id + 1):
             if self.UPDATE_TOPIC or not self.db.exist(TopicItem, i):
                 yield scrapy.Request(
                     url=f"https://www.v2ex.com/t/{i}",
-                    callback=self.parse,
-                    errback=self.parse_topic_err,
+                    callback=self.common_spider.parse_topic,
+                    errback=self.common_spider.parse_topic_err,
                     cb_kwargs={"topic_id": i},
                 )
-
-    def parse_topic_err(self, failure):
-        topic_id = failure.request.cb_kwargs["topic_id"]
-        self.logger.warn(f"Crawl Topic Err {topic_id}")
-        yield TopicItem.err_topic(topic_id)
-
-    def parse(self, response: scrapy.http.response.html.HtmlResponse, topic_id: int):
-        self.logger.info(f"Crawl Topic {topic_id}")
-
-        if response.status == 302:
-            # requires login, or the account is too new to view the topic
-            yield TopicItem.err_topic(topic_id=topic_id)
-        else:
-            for i in v2ex_parser.parse_topic_supplement(response, topic_id):
-                yield i
-            for topic in v2ex_parser.parse_topic(response, topic_id):
-                yield topic
-                for i in self.crawl_member(topic.author, response):
-                    yield i
-            for i in self.parse_comment(response, topic_id):
-                yield i
-            # crawl comments on the remaining pages
-            topic_reply_count = int(
-                response.css(
-                    "#Main > div:nth-child(4) > div:nth-child(1) > span::text"
-                ).re_first(r"\d+", "-1")
-            )
-            if (
-                self.UPDATE_COMMENT
-                or self.db.get_topic_comment_count(topic_id) < topic_reply_count
-            ):
-                total_page = math.ceil(topic_reply_count / 100)
-                for i in range(2, total_page + 1):
-                    for j in self.crawl_comment(topic_id, i, response):
-                        yield j
-
-    def crawl_comment(self, topic_id, page, response):
-        yield response.follow(
-            f"/t/{topic_id}?p={page}",
-            callback=self.parse_comment,
-            cb_kwargs={"topic_id": topic_id},
-        )
-
-    def parse_comment(self, response: scrapy.http.response.html.HtmlResponse, topic_id):
-        for comment_item in v2ex_parser.parse_comment(response, topic_id):
-            yield comment_item
-            for i in self.crawl_member(comment_item.commenter, response):
-                yield i
-
-    def crawl_member(self, username, response: scrapy.http.response.html.HtmlResponse):
-        if username != "" and (
-            self.UPDATE_MEMBER or not self.db.exist(MemberItem, username)
-        ):
-            yield response.follow(
-                f"/member/{username}",
-                callback=self.parse_member,
-                errback=self.member_err,
-                cb_kwargs={"username": username},
-            )
-
-    def member_err(self, failure):
-        username = failure.request.cb_kwargs["username"]
-        self.logger.warn(f"Crawl Member Err {username}")
-        yield MemberItem(
-            username=username,
-            avatar_url="",
-            create_at=0,
-            social_link=[],
-            uid=-1,
-        )
-
-    def parse_member(
-        self, response: scrapy.http.response.html.HtmlResponse, username: str
-    ):
-        self.logger.info(f"Crawl Member {username}")
-        for i in v2ex_parser.parse_member(response=response):
-            yield i
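
After this refactor, both spiders delegate topic, comment, and member parsing to CommonSpider and differ only in how they discover topic ids: a sequential id sweep versus a node's listing pages. Assuming a standard Scrapy project layout, each is run by its name attribute, with the node passed through Scrapy's -a spider-argument mechanism:

scrapy crawl v2ex                      # id sweep, resuming from the DB's max topic id
scrapy crawl v2ex-node -a node=python  # crawl one node's listing (default node is "flamewar")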
