
Commit 3b13aae

add member spider
1 parent 332be35 commit 3b13aae

File tree

3 files changed: +66 −9 lines changed


requirements-analysis.txt

Lines changed: 0 additions & 1 deletion
@@ -1,2 +1 @@
 pandas==2.0.1
-plotly==5.14.1

v2ex_scrapy/pipelines.py

Lines changed: 25 additions & 8 deletions
@@ -6,14 +6,13 @@
 
 # useful for handling different item types with a single interface
 
-from typing import Any, Union
+from typing import Any
 
 # don't remove
-import v2ex_scrapy.insert_ignore
 from v2ex_scrapy.DB import DB
 from v2ex_scrapy.items import CommentItem, MemberItem, TopicItem, TopicSupplementItem
 
-ItemsType = Union[TopicItem, CommentItem, MemberItem, TopicSupplementItem]
+ItemsType = TopicItem | CommentItem | MemberItem | TopicSupplementItem
 
 
 class TutorialScrapyPipeline:
@@ -31,22 +30,40 @@ def __init__(self):
 
     def process_item(
         self,
-        item: Union[ItemsType, Any],
+        item: ItemsType | Any,
         spider,
     ):
         if isinstance(item, (TopicItem, CommentItem, MemberItem, TopicSupplementItem)):
             item_type = type(item)
             self.data[item_type].append(item)
             if len(self.data[item_type]) >= self.BATCH:
-                self.db.session.add_all(self.data[item_type])
+                self.process_it(self.data[item_type])
                 self.data[item_type] = []
-                self.db.session.commit()
         return item
 
+    def process_it(self, items: list[ItemsType]):
+        if len(items) > 0 and isinstance(items[0], MemberItem):
+            self.process_members(items)
+        else:
+            self.db.session.add_all(items)
+            self.db.session.commit()
+
+    def process_members(self, items: list[MemberItem]):
+        for item in items:
+            e = (
+                self.db.session.query(MemberItem)
+                .where(MemberItem.username == item.username)
+                .first()
+            )
+            if e is None:
+                self.db.session.add_all([item])
+            elif e.uid is None:
+                e.uid = item.uid
+        self.db.session.commit()
+
     def save_all(self):
         for _, v in self.data.items():
-            self.db.session.add_all(v)
-            self.db.session.commit()
+            self.process_it(v)
 
     def close_spider(self, spider):
         self.save_all()
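
The interesting part of the pipeline change is process_members: member items are no longer bulk-inserted like the other item types, because a member row may already exist from an earlier crawl with its uid still unknown (presumably stored by username alone while crawling topics and comments). The method is an insert-or-update keyed on username. Below is a minimal standalone sketch of the same pattern, assuming an SQLAlchemy ORM setup as the pipeline's session.query/.add_all/.commit calls suggest; the Member model, engine, and upsert_members helper are illustrative, not part of the repo:

import sqlalchemy as sa
from sqlalchemy.orm import DeclarativeBase, Mapped, Session, mapped_column


class Base(DeclarativeBase):
    pass


class Member(Base):
    # Hypothetical stand-in for the repo's MemberItem mapping.
    __tablename__ = "member"
    username: Mapped[str] = mapped_column(sa.String, primary_key=True)
    uid: Mapped[int | None] = mapped_column(sa.Integer, nullable=True)


def upsert_members(session: Session, items: list[Member]) -> None:
    # Same shape as process_members: insert unseen usernames, and
    # backfill uid on rows stored before their uid was known.
    for item in items:
        e = session.query(Member).where(Member.username == item.username).first()
        if e is None:
            session.add(item)
        elif e.uid is None:
            e.uid = item.uid
    session.commit()


engine = sa.create_engine("sqlite://")  # throwaway in-memory DB for the demo
Base.metadata.create_all(engine)
with Session(engine) as s:
    upsert_members(s, [Member(username="alice", uid=None)])  # first sighting
    upsert_members(s, [Member(username="alice", uid=42)])    # backfills uid only
    assert s.query(Member).one().uid == 42

One query per item keeps the logic simple, and at pipeline batch sizes the extra round-trips are negligible next to the crawl's network time.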
Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+import scrapy
+import scrapy.http.response.html
+
+from v2ex_scrapy import v2ex_parser
+from v2ex_scrapy.DB import DB
+from v2ex_scrapy.items import MemberItem
+
+
+class V2exMemberSpider(scrapy.Spider):
+    name = "v2ex-member"
+
+    def __init__(self, name=None, **kwargs):
+        super().__init__(name, **kwargs)
+        self.db = DB()
+        self.start_id = 1
+        self.end_id = 635000
+        self.logger.info(f"start from uid {self.start_id}, end at {self.end_id}")
+
+    def start_requests(self):
+        for i in range(self.start_id, self.end_id + 1):
+            if not self.db.exist(MemberItem, i):
+                yield scrapy.Request(
+                    url=f"https://www.v2ex.com/uid/{i}",
+                    callback=self.parse,
+                    errback=self.member_err,
+                    cb_kwargs={"uid": i},
+                )
+
+    def parse(self, response: scrapy.http.response.html.HtmlResponse, uid: int):
+        for i in v2ex_parser.parse_member(response):
+            i.uid = uid
+            yield i
+
+    def member_err(self, failure):
+        yield MemberItem(
+            username="",
+            avatar_url="",
+            create_at=0,
+            social_link=[],
+            uid=failure.request.cb_kwargs["uid"],
+        )
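
start_requests enumerates uids 1 through 635000 and skips any uid already stored, so an interrupted crawl resumes where it left off; member_err yields an empty placeholder MemberItem for unreachable profiles, so (via the same db.exist check) those uids are not re-requested on later runs either. Assuming the repo is laid out as a standard Scrapy project, the new spider would be run by the name it registers:

scrapy crawl v2ex-member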
