@@ -1,116 +1,41 @@
-import math
-
 import scrapy
 import scrapy.http.response.html
 
-from v2ex_scrapy import v2ex_parser
 from v2ex_scrapy.DB import DB
-from v2ex_scrapy.items import MemberItem, TopicItem
+from v2ex_scrapy.items import TopicItem
+from v2ex_scrapy.spiders.CommonSpider import CommonSpider
 
 
 class V2exTopicSpider(scrapy.Spider):
     name = "v2ex"
     start_id = 1
     end_id = 1000000
     UPDATE_TOPIC = False
-    UPDATE_COMMENT = False
-    UPDATE_MEMBER = False
+    # only works when UPDATE_TOPIC = True
+    UPDATE_COMMENT = True
 
     def __init__(self, name=None, **kwargs):
         super().__init__(name, **kwargs)
         self.db = DB()
         self.start_id = self.db.get_max_topic_id()
+        self.common_spider = CommonSpider(
+            self.logger, update_comment=self.UPDATE_COMMENT
+        )
         self.logger.info(f"start from topic id {self.start_id}, end at {self.end_id}")
 
     def start_requests(self):
         # Comments and member info from the previous run may be incomplete, so re-crawl the topic where the last run stopped
         yield scrapy.Request(
             url=f"https://www.v2ex.com/t/{self.start_id}",
-            callback=self.parse,
+            callback=self.common_spider.parse_topic,
+            errback=self.common_spider.parse_topic_err,
             cb_kwargs={"topic_id": self.start_id},
         )
         for i in range(self.start_id + 1, self.end_id + 1):
             if self.UPDATE_TOPIC or not self.db.exist(TopicItem, i):
                 yield scrapy.Request(
                     url=f"https://www.v2ex.com/t/{i}",
-                    callback=self.parse,
-                    errback=self.parse_topic_err,
+                    callback=self.common_spider.parse_topic,
+                    errback=self.common_spider.parse_topic_err,
                     cb_kwargs={"topic_id": i},
                 )
-
-    def parse_topic_err(self, failure):
-        topic_id = failure.request.cb_kwargs["topic_id"]
-        self.logger.warn(f"Crawl Topic Err {topic_id}")
-        yield TopicItem.err_topic(topic_id)
-
-    def parse(self, response: scrapy.http.response.html.HtmlResponse, topic_id: int):
-        self.logger.info(f"Crawl Topic {topic_id}")
-
-        if response.status == 302:
-            # need login or account too young
-            yield TopicItem.err_topic(topic_id=topic_id)
-        else:
-            for i in v2ex_parser.parse_topic_supplement(response, topic_id):
-                yield i
-            for topic in v2ex_parser.parse_topic(response, topic_id):
-                yield topic
-                for i in self.crawl_member(topic.author, response):
-                    yield i
-            for i in self.parse_comment(response, topic_id):
-                yield i
-            # crawl sub page comment
-            topic_reply_count = int(
-                response.css(
-                    "#Main > div:nth-child(4) > div:nth-child(1) > span::text"
-                ).re_first(r"\d+", "-1")
-            )
-            if (
-                self.UPDATE_COMMENT
-                or self.db.get_topic_comment_count(topic_id) < topic_reply_count
-            ):
-                total_page = math.ceil(topic_reply_count / 100)
-                for i in range(2, total_page + 1):
-                    for j in self.crawl_comment(topic_id, i, response):
-                        yield j
-
-    def crawl_comment(self, topic_id, page, response):
-        yield response.follow(
-            f"/t/{topic_id}?p={page}",
-            callback=self.parse_comment,
-            cb_kwargs={"topic_id": topic_id},
-        )
-
-    def parse_comment(self, response: scrapy.http.response.html.HtmlResponse, topic_id):
-        for comment_item in v2ex_parser.parse_comment(response, topic_id):
-            yield comment_item
-            for i in self.crawl_member(comment_item.commenter, response):
-                yield i
-
-    def crawl_member(self, username, response: scrapy.http.response.html.HtmlResponse):
-        if username != "" and (
-            self.UPDATE_MEMBER or not self.db.exist(MemberItem, username)
-        ):
-            yield response.follow(
-                f"/member/{username}",
-                callback=self.parse_member,
-                errback=self.member_err,
-                cb_kwargs={"username": username},
-            )
-
-    def member_err(self, failure):
-        username = failure.request.cb_kwargs["username"]
-        self.logger.warn(f"Crawl Member Err {username}")
-        yield MemberItem(
-            username=username,
-            avatar_url="",
-            create_at=0,
-            social_link=[],
-            uid=-1,
-        )
-
-    def parse_member(
-        self, response: scrapy.http.response.html.HtmlResponse, username: str
-    ):
-        self.logger.info(f"Crawl Member {username}")
-        for i in v2ex_parser.parse_member(response=response):
-            yield i
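
The parsing logic removed above now lives in `v2ex_scrapy.spiders.CommonSpider`, whose source is not part of this diff. The sketch below is a reconstruction of the interface this spider depends on (a constructor taking the spider's logger and an `update_comment` flag, plus the `parse_topic` and `parse_topic_err` callbacks), assuming the deleted methods moved there largely intact. The bodies restate the removed code rather than the real CommonSpider source; since the `UPDATE_MEMBER` flag was dropped, `crawl_member` here only fetches members that are not already stored.

```python
# Hypothetical reconstruction of CommonSpider from the methods deleted above;
# the real v2ex_scrapy/spiders/CommonSpider.py may differ in detail.
import math

from v2ex_scrapy import v2ex_parser
from v2ex_scrapy.DB import DB
from v2ex_scrapy.items import MemberItem, TopicItem


class CommonSpider:
    def __init__(self, logger, update_comment=False):
        self.logger = logger
        self.update_comment = update_comment
        self.db = DB()

    def parse_topic_err(self, failure):
        # Record the failed topic so its id is not requested again next run.
        topic_id = failure.request.cb_kwargs["topic_id"]
        self.logger.warning(f"Crawl Topic Err {topic_id}")
        yield TopicItem.err_topic(topic_id)

    def parse_topic(self, response, topic_id: int):
        self.logger.info(f"Crawl Topic {topic_id}")
        if response.status == 302:
            # Redirected: the topic needs login or the account is too young.
            yield TopicItem.err_topic(topic_id=topic_id)
            return
        yield from v2ex_parser.parse_topic_supplement(response, topic_id)
        for topic in v2ex_parser.parse_topic(response, topic_id):
            yield topic
            yield from self.crawl_member(topic.author, response)
        yield from self.parse_comment(response, topic_id)
        # Replies are paginated 100 per page; follow pages 2..N when the
        # stored comment count lags behind or an update is forced.
        reply_count = int(
            response.css(
                "#Main > div:nth-child(4) > div:nth-child(1) > span::text"
            ).re_first(r"\d+", "-1")
        )
        if (
            self.update_comment
            or self.db.get_topic_comment_count(topic_id) < reply_count
        ):
            for page in range(2, math.ceil(reply_count / 100) + 1):
                yield response.follow(
                    f"/t/{topic_id}?p={page}",
                    callback=self.parse_comment,
                    cb_kwargs={"topic_id": topic_id},
                )

    def parse_comment(self, response, topic_id):
        for comment in v2ex_parser.parse_comment(response, topic_id):
            yield comment
            yield from self.crawl_member(comment.commenter, response)

    def crawl_member(self, username, response):
        # UPDATE_MEMBER was removed, so only fetch members not yet stored.
        if username != "" and not self.db.exist(MemberItem, username):
            yield response.follow(
                f"/member/{username}",
                callback=self.parse_member,
                errback=self.member_err,
                cb_kwargs={"username": username},
            )

    def member_err(self, failure):
        # Store a stub so an unreachable member is not requested again.
        username = failure.request.cb_kwargs["username"]
        self.logger.warning(f"Crawl Member Err {username}")
        yield MemberItem(
            username=username, avatar_url="", create_at=0, social_link=[], uid=-1
        )

    def parse_member(self, response, username: str):
        self.logger.info(f"Crawl Member {username}")
        yield from v2ex_parser.parse_member(response=response)
```

Scrapy only requires `callback` and `errback` to be callables, so keeping them on a plain helper object lets this spider and any future spider (for example one that re-crawls existing topics) share a single parse and error path instead of duplicating it per spider.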