1+ import httpx
12import scrapy
23import scrapy .http .response .html
4+ from parsel import Selector
5+ from scrapy .utils .project import get_project_settings
36
47from v2ex_scrapy .DB import DB
58from v2ex_scrapy .items import TopicItem
69from v2ex_scrapy .spiders .CommonSpider import CommonSpider
10+ from v2ex_scrapy import utils
711
812
913class V2exTopicSpider (scrapy .Spider ):
1014 name = "v2ex-node"
1115
1216 UPDATE_TOPIC_WHEN_REPLY_CHANGE = True
1317 UPDATE_COMMENT = True # only work when UPDATE_TOPIC_WHEN_REPLY_CHANGE = True
18+ URL = "https://www.v2ex.com/go/"
19+
    """
    Known issue: the ordering of a node's topic listing is dynamic, so if an
    uncrawled topic moves onto an already-crawled page during the crawl, it
    will be missed.
    Possible fix: collect all topic IDs up front before crawling; fetching
    the IDs is fast, so the ordering will not shift much in the meantime.
    """
1424
1525     def __init__ (self , node = "flamewar" , * args , ** kwargs ):
        # Bind the spider to a single node; `node` is the node slug appended
        # to URL ("https://www.v2ex.com/go/") when building listing pages.
1626         super ().__init__ (* args , ** kwargs )
        # NOTE(review): the diff hunk boundary below hides two lines (new
        # lines 27-28 — presumably `self.node = node` and one more setup
        # statement); they are not visible in this view. Confirm against the
        # full file before relying on this constructor's documented behavior.
@@ -19,11 +29,25 @@ def __init__(self, node="flamewar", *args, **kwargs):
1929         self .common_spider = CommonSpider (
2030             self .logger , update_comment = self .UPDATE_COMMENT
2131         )
        # Fetch the node's first listing page synchronously, before the crawl
        # starts, to discover how many pages exist. Uses the project's COOKIES
        # and USER_AGENT settings so the request looks like the crawl itself.
        # NOTE(review): no error handling — a network failure or non-numeric
        # pager text would raise here and abort spider construction.
32+         settings = get_project_settings ()
33+         resp = httpx .get (
34+             f"{ self .URL } { self .node } " ,
35+             timeout = 10 ,
36+             follow_redirects = True ,
37+             cookies = utils .cookie_str2cookie_dict (settings .get ("COOKIES" , "" )), # type: ignore
38+             headers = {"User-Agent" : settings .get ("USER_AGENT" , "" )}, # type: ignore
39+         ).text
        # The last pagination link's text is the page count; default to "1"
        # when the xpath matches nothing (single-page node, no pager).
40+         max_page = (
41+             Selector (text = resp )
42+             .xpath ('//tr/td[@align="left" and @width="92%"]/a[last()]/text()' )
43+             .get ("1" )
44+         )
45+         self .max_page = int (max_page )
2246
2347 def start_requests (self ):
24- for i in range (552 , 0 , - 1 ):
48+ for i in range (self . max_page , 0 , - 1 ):
2549 yield scrapy .Request (
26- url = f"https://www.v2ex.com/go/ { self .node } ?p={ i } " ,
50+ url = f"{ self . URL } { self .node } ?p={ i } " ,
2751 callback = self .parse ,
2852 cb_kwargs = {"page" : i },
2953 )
@@ -33,6 +57,7 @@ def parse(self, response: scrapy.http.response.html.HtmlResponse, page: int):
3357 (int (x ), int (y ))
3458 for x , y in zip (
3559 response .xpath ('//span[@class="item_title"]/a/@id' ).re (r"\d+" ),
                # NOTE: the reply count parsed from the href is inaccurate
                # when some comments on the topic have been deleted.
                response .xpath ('//span[@class="item_title"]/a/@href' ).re (r"reply(\d+)" ),
            )
        ]
0 commit comments