Commit bed01ae

change database schema, auto detect max page of node

1 parent e3619dd commit bed01ae

File tree: 10 files changed, +67 -28 lines

requirements-analysis.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
pandas==2.0.1
2+
plotly==5.14.1

requirements.txt

Lines changed: 1 addition & 2 deletions

@@ -1,5 +1,4 @@
 arrow==1.2.3
-pandas==2.0.1
-plotly==5.14.1
+httpx==0.24.1
 Scrapy==2.9.0
 SQLAlchemy==2.0.17
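The analysis-only dependencies (pandas, plotly) move out of the scraper's runtime requirements into requirements-analysis.txt above, while httpx is added for the new max-page probe in V2exNodeTopicSpider below.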

v2ex_scrapy/DB.py

Lines changed: 3 additions & 3 deletions

@@ -26,9 +26,9 @@ def __new__(cls):
             cls._instance = super().__new__(cls)
         return cls._instance
 
-    def __init__(self):
+    def __init__(self, database_name="v2ex.sqlite"):
         self.engine = create_engine(
-            "sqlite:///v2ex.sqlite",
+            f"sqlite:///{database_name}",
             echo=False,
             json_serializer=lambda x: json.dumps(x, ensure_ascii=False),
         )

@@ -61,7 +61,7 @@ def get_max_topic_id(self) -> int:
 
     def get_topic_comment_count(self, topic_id) -> int:
         result = self.session.execute(
-            text("select count(*) from comment where topic_id = :q"), {"q": topic_id}
+            text("select reply_count from topic where id = :q"), {"q": topic_id}
         ).fetchone()
         if result is None or result[0] is None:
             return 0
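get_topic_comment_count now reads the stored topic.reply_count instead of counting comment rows, and the database file name is configurable. A usage sketch (the file name here is hypothetical; note that with this __new__-based singleton, __init__ still runs on every DB(...) call, so passing a different name re-creates the engine on the one shared instance):

from v2ex_scrapy.DB import DB

# Hypothetical file name; defaults to "v2ex.sqlite".
db = DB(database_name="v2ex-analysis.sqlite")
# Answered from topic.reply_count rather than count(*) over comment.
print(db.get_topic_comment_count(1))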

v2ex_scrapy/items.py

Lines changed: 4 additions & 1 deletion

@@ -26,14 +26,15 @@ class TopicItem(Base):
     id_: Mapped[int] = mapped_column(name="id", primary_key=True)
     author: Mapped[str] = mapped_column(nullable=False)
     title: Mapped[str] = mapped_column(nullable=False)
-    content: Mapped[str]
+    content: Mapped[str] = mapped_column()
     node: Mapped[str] = mapped_column(nullable=False)
     tag: Mapped[list[str]] = mapped_column(nullable=False)
     clicks: Mapped[int] = mapped_column(nullable=False)
     votes: Mapped[int] = mapped_column(nullable=False)
     create_at: Mapped[int] = mapped_column(nullable=False)
     thank_count: Mapped[int] = mapped_column(nullable=False)
     favorite_count: Mapped[int] = mapped_column(nullable=False)
+    reply_count: Mapped[int] = mapped_column(nullable=False)
 
     @staticmethod
     def err_topic(topic_id: int):

@@ -49,6 +50,7 @@ def err_topic(topic_id: int):
             votes=-1,
             thank_count=-1,
             favorite_count=-1,
+            reply_count=-1,
         )
 
 
@@ -72,6 +74,7 @@ class CommentItem(Base):
     content: Mapped[str] = mapped_column(nullable=False)
     thank_count: Mapped[int] = mapped_column(nullable=False)
     create_at: Mapped[int] = mapped_column(nullable=False)
+    no: Mapped[int] = mapped_column(nullable=False)
 
 
 @dataclass(kw_only=True)
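TopicItem gains reply_count and CommentItem gains no (evidently the comment's floor number, scraped from the page's .no element), so the SQLite schema changes. The commit ships no migration; for an existing v2ex.sqlite, one plausible manual upgrade (an assumption, not part of this commit) is:

import sqlite3

conn = sqlite3.connect("v2ex.sqlite")
# -1 matches the sentinel that err_topic() uses for unknown values.
conn.execute("ALTER TABLE topic ADD COLUMN reply_count INTEGER NOT NULL DEFAULT -1")
conn.execute("ALTER TABLE comment ADD COLUMN no INTEGER NOT NULL DEFAULT -1")
conn.commit()
conn.close()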

v2ex_scrapy/middlewares.py

Lines changed: 10 additions & 8 deletions

@@ -7,13 +7,13 @@
 
 import random
 import time
-from http.cookies import SimpleCookie
 
 import scrapy
 import scrapy.http.response.html
 from scrapy import signals
-
+from scrapy.exceptions import IgnoreRequest
 from v2ex_scrapy.DB import DB, LogItem
+from v2ex_scrapy import utils
 
 
 class TutorialScrapySpiderMiddleware:

@@ -95,10 +95,14 @@ def process_request(self, request: scrapy.Request, spider):
         return None
 
     def process_response(
-        self, request, response: scrapy.http.response.html.HtmlResponse, spider
+        self,
+        request: scrapy.Request,
+        response: scrapy.http.response.html.HtmlResponse,
+        spider: scrapy.Spider,
     ):
         # Called with the response returned from the downloader.
-
+        if response.status == 403:
+            raise IgnoreRequest(f"403 url {response.url}")
         # Must either;
         # - return a Response object
         # - return a Request object

@@ -118,10 +122,8 @@ def process_exception(self, request, exception, spider):
     def spider_opened(self, spider: scrapy.Spider):
         self.proxies = spider.settings.get("PROXIES", [])  # type: ignore
 
-        if type(cookie_str := spider.settings.get("COOKIES", "")) == str:
-            simple_cookie = SimpleCookie()
-            simple_cookie.load(cookie_str)  # type: ignore
-            self.cookies = {k: v.value for k, v in simple_cookie.items()}
+        cookie_str = spider.settings.get("COOKIES", "")
+        self.cookies = utils.cookie_str2cookie_dict(cookie_str)  # type: ignore
 
         spider.logger.info("Spider opened: %s" % spider.name)
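With the new 403 check, a forbidden response is dropped rather than retried: raising IgnoreRequest from a downloader middleware's process_response sends the request to its errback instead of the callback. A minimal sketch of observing such drops from a spider (the errback name is hypothetical, in the style of CommonSpider.parse_topic_err):

from scrapy.exceptions import IgnoreRequest

def parse_err(self, failure):
    # failure wraps the IgnoreRequest raised by the middleware above
    if failure.check(IgnoreRequest):
        self.logger.warning("request ignored (403): %s", failure.request.url)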

v2ex_scrapy/pipelines.py

Lines changed: 6 additions & 7 deletions

@@ -11,12 +11,7 @@
 # don't remove
 import v2ex_scrapy.insert_ignore
 from v2ex_scrapy.DB import DB
-from v2ex_scrapy.items import (
-    CommentItem,
-    MemberItem,
-    TopicItem,
-    TopicSupplementItem,
-)
+from v2ex_scrapy.items import CommentItem, MemberItem, TopicItem, TopicSupplementItem
 
 ItemsType = Union[TopicItem, CommentItem, MemberItem, TopicSupplementItem]
 
@@ -48,7 +43,11 @@ def process_item(
         self.db.session.commit()
         return item
 
-    def close_spider(self, spider):
+    def save_all(self):
         for _, v in self.data.items():
             self.db.session.add_all(v)
+        self.db.session.commit()
+
+    def close_spider(self, spider):
+        self.save_all()
         self.db.close()
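Note on the refactor: the old close_spider() add_all'ed the buffered items but never committed them before db.close(), so the final buffer could be discarded along with the session; save_all() now commits explicitly, and as a separate method it can also be reused for mid-run flushes.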

v2ex_scrapy/spiders/CommonSpider.py

Lines changed: 1 addition & 4 deletions

@@ -9,13 +9,10 @@
 
 
 class CommonSpider:
-    def __init__(
-        self, logger, update_topic=False, update_member=False, update_comment=False
-    ):
+    def __init__(self, logger, update_member=False, update_comment=False):
         self.db = DB()
         self.logger = logger
         self.UPDATE_MEMBER = update_member
-        # only work when UPDATE_TOPIC
         self.UPDATE_COMMENT = update_comment
 
     def parse_topic_err(self, failure):

v2ex_scrapy/spiders/V2exNodeTopicSpider.py

Lines changed: 27 additions & 2 deletions

@@ -1,16 +1,26 @@
+import httpx
 import scrapy
 import scrapy.http.response.html
+from parsel import Selector
+from scrapy.utils.project import get_project_settings
 
 from v2ex_scrapy.DB import DB
 from v2ex_scrapy.items import TopicItem
 from v2ex_scrapy.spiders.CommonSpider import CommonSpider
+from v2ex_scrapy import utils
 
 
 class V2exTopicSpider(scrapy.Spider):
     name = "v2ex-node"
 
     UPDATE_TOPIC_WHEN_REPLY_CHANGE = True
     UPDATE_COMMENT = True  # only works when UPDATE_TOPIC_WHEN_REPLY_CHANGE = True
+    URL = "https://www.v2ex.com/go/"
+
+    """
+    Known issue: topic ordering within a node is dynamic, so a not-yet-crawled topic that moves onto an already-crawled page is missed.
+    Possible fix 1: collect all topic IDs up front, then crawl; fetching IDs is fast, so the ordering should barely shift in the meantime.
+    """
 
     def __init__(self, node="flamewar", *args, **kwargs):
         super().__init__(*args, **kwargs)

@@ -19,11 +29,25 @@ def __init__(self, node="flamewar", *args, **kwargs):
         self.common_spider = CommonSpider(
             self.logger, update_comment=self.UPDATE_COMMENT
         )
+        settings = get_project_settings()
+        resp = httpx.get(
+            f"{self.URL}{self.node}",
+            timeout=10,
+            follow_redirects=True,
+            cookies=utils.cookie_str2cookie_dict(settings.get("COOKIES", "")),  # type: ignore
+            headers={"User-Agent": settings.get("USER_AGENT", "")},  # type: ignore
+        ).text
+        max_page = (
+            Selector(text=resp)
+            .xpath('//tr/td[@align="left" and @width="92%"]/a[last()]/text()')
+            .get("1")
+        )
+        self.max_page = int(max_page)
 
     def start_requests(self):
-        for i in range(552, 0, -1):
+        for i in range(self.max_page, 0, -1):
             yield scrapy.Request(
-                url=f"https://www.v2ex.com/go/{self.node}?p={i}",
+                url=f"{self.URL}{self.node}?p={i}",
                 callback=self.parse,
                 cb_kwargs={"page": i},
             )

@@ -33,6 +57,7 @@ def parse(self, response: scrapy.http.response.html.HtmlResponse, page: int):
             (int(x), int(y))
             for x, y in zip(
                 response.xpath('//span[@class="item_title"]/a/@id').re(r"\d+"),
+                # not accurate when some comments have been deleted
                 response.xpath('//span[@class="item_title"]/a/@href').re(r"reply(\d+)"),
             )
         ]
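This is the commit's headline change: instead of a hardcoded 552 pages, the spider probes the node's index page once at startup and reads the last pagination link. A standalone sketch of the same probe (assumes the node page keeps its current pagination markup; run unauthenticated here, so nodes that need a login cookie may report fewer pages):

import httpx
from parsel import Selector

def detect_max_page(node: str) -> int:
    html = httpx.get(
        f"https://www.v2ex.com/go/{node}", timeout=10, follow_redirects=True
    ).text
    # Last <a> in the pagination row; falls back to "1" for single-page nodes.
    last = (
        Selector(text=html)
        .xpath('//tr/td[@align="left" and @width="92%"]/a[last()]/text()')
        .get("1")
    )
    return int(last)

print(detect_max_page("flamewar"))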

v2ex_scrapy/utils.py

Lines changed: 7 additions & 0 deletions

@@ -1,3 +1,4 @@
+from http.cookies import SimpleCookie
 import json
 from typing import Union
 
@@ -27,6 +28,12 @@ def json_to_str(j):
     return json.dumps(j, ensure_ascii=False)
 
 
+def cookie_str2cookie_dict(cookie_str: str):
+    simple_cookie = SimpleCookie()
+    simple_cookie.load(cookie_str)
+    return {k: v.value for k, v in simple_cookie.items()}
+
+
 if __name__ == "__main__":
     a = ["2022-04-28 13:24:38 +08:00", "287 天前", "1 小时前"]

v2ex_scrapy/v2ex_parser.py

Lines changed: 6 additions & 1 deletion

@@ -42,8 +42,10 @@ def parse_comment(response: scrapy.http.response.html.HtmlResponse, topic_id):
         reply_content = cbox.xpath('.//div[@class="reply_content"]').get("")
         reply_time = cbox.css(".ago::attr(title)").get("")
         thank_count = cbox.css(".fade::text").get("0").strip()
+        no = cbox.css(".no::text").get("-1").strip()
         yield CommentItem(
             id_=int(comment_id),
+            no=int(no),
             commenter=author_name,
             topic_id=topic_id,
             content=reply_content,

@@ -74,7 +76,9 @@ def parse_topic(response: scrapy.http.response.html.HtmlResponse, topic_id):
     )
 
     topic_content = response.css(".cell .topic_content").get("")
-
+    topic_reply_count = response.css(".box > .cell > .gray::text").re_first(
+        r"(\d+) 条回复", "0"
+    )
     yield TopicItem(
         id_=topic_id,
         author=topic_author,

@@ -87,6 +91,7 @@ def parse_topic(response: scrapy.http.response.html.HtmlResponse, topic_id):
         votes=int(topic_vote),
         thank_count=int(topic_thank_count),
         favorite_count=int(topic_favorite_count),
+        reply_count=int(topic_reply_count),
     )
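The reply count is scraped from the gray header line on the topic page ("N 条回复", i.e. "N replies"). A quick check of the pattern against sample markup (the HTML snippet is hypothetical but mirrors the selectors used above):

from parsel import Selector

html = '<div class="box"><div class="cell"><span class="gray">42 条回复</span></div></div>'
count = Selector(text=html).css(".box > .cell > .gray::text").re_first(r"(\d+) 条回复", "0")
print(count)  # "42"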
