
Commit 18de14b

feat: 08_爬虫入门实战1_静态网页提取 (crawler intro practice 1: static page extraction), code complete
1 parent a67838d commit 18de14b

File tree

4 files changed: +357 -0 lines changed


static/images/1000000010.png (201 KB, binary image)
Lines changed: 165 additions & 0 deletions
@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
# @Author : [email protected]
# @Time : 2024/3/27 23:50
# @Desc : https://www.ptt.cc/bbs/Stock/index.html first N pages of posts + push comments - synchronous version

from typing import List

import requests
from bs4 import BeautifulSoup

from common import NoteContent, NoteContentDetail, NotePushComment

FIRST_N_PAGE = 10  # how many forum pages to crawl
BASE_HOST = "https://www.ptt.cc"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}


def parse_note_use_bs(html_content: str) -> NoteContent:
    """
    Extract a post's title, author, publish date, and link with BeautifulSoup CSS selectors.
    Note that some posts are in an abnormal state (e.g. deleted) and carry no link,
    so always check that a selector actually matched before indexing into its result.
    :param html_content: HTML source of a single post entry
    :return: populated NoteContent
    """
    # Container for a single post
    note_content = NoteContent()
    soup = BeautifulSoup(html_content, "lxml")

    title_elements = soup.select("div.r-ent div.title a")
    author_elements = soup.select("div.r-ent div.meta div.author")
    date_elements = soup.select("div.r-ent div.meta div.date")

    # Extract the title, stripping surrounding whitespace and newlines
    note_content.title = title_elements[0].text.strip() if title_elements else ""

    # Extract the author
    note_content.author = author_elements[0].text.strip() if author_elements else ""

    # Extract the publish date
    note_content.publish_date = date_elements[0].text.strip() if date_elements else ""

    # Extract the detail-page link
    note_content.detail_link = title_elements[0]["href"] if title_elements else ""
    return note_content


def get_previous_page_number() -> int:
    """
    Open the board's index page and extract the page number behind the "previous page" button.
    :return: the previous page's number
    """
    uri = "/bbs/Stock/index.html"
    response = requests.get(url=BASE_HOST + uri, headers=HEADERS)
    if response.status_code != 200:
        raise Exception("send request got error status code, reason:", response.text)
    soup = BeautifulSoup(response.text, "lxml")

    # The easiest way to obtain this CSS selector: open Chrome DevTools (F12), pick the
    # "previous page" button, right-click it, then Copy -> Copy selector.
    css_selector = "#action-bar-container > div > div.btn-group.btn-group-paging > a:nth-child(2)"
    pagination_link = soup.select(css_selector)[0]["href"].strip()

    # pagination_link looks like /bbs/Stock/index7084.html. A regex could pull out the
    # numeric part; plain string replacement is the blunt-but-simple route taken here.
    previous_page_number = int(pagination_link.replace("/bbs/Stock/index", "").replace(".html", ""))

    return previous_page_number


def fetch_bbs_note_list(previous_number: int) -> List[NoteContent]:
    """
    Fetch the post lists of the first N pages.
    :param previous_number: page number taken from the "previous page" button
    :return: list of NoteContent
    """
    notes_list: List[NoteContent] = []

    # Work out the start and end of the pagination range. The index page itself is
    # crawled too, so the start position is the previous-page number plus one.
    start_page_number = previous_number + 1
    end_page_number = start_page_number - FIRST_N_PAGE
    for page_number in range(start_page_number, end_page_number, -1):
        print(f"Fetching the post list of page {page_number} ...")

        # Build the list URL from the page number
        uri = f"/bbs/Stock/index{page_number}.html"
        response = requests.get(url=BASE_HOST + uri, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to fetch page {page_number}, reason: {response.text}")
            continue

        # Parse with BeautifulSoup CSS selectors; div.r-ent is the css class that every
        # post entry on the list page shares.
        soup = BeautifulSoup(response.text, "lxml")
        all_note_elements = soup.select("div.r-ent")
        for note_element in all_note_elements:
            # prettify() returns the element's full HTML
            note_content: NoteContent = parse_note_use_bs(note_element.prettify())
            notes_list.append(note_content)
        print(f"Finished page {page_number}, got {len(all_note_elements)} posts ...")
    return notes_list


def fetch_bbs_note_detail(note_content: NoteContent) -> NoteContentDetail:
    """
    Fetch a post's detail page.
    :param note_content: post summary from the list page
    :return: NoteContentDetail including push comments
    """
    print(f"Fetching detail page {note_content.detail_link} ....")
    note_content_detail = NoteContentDetail()

    # These fields already exist on note_content, so copy them over instead of
    # re-extracting them from the page; laziness pays here. (Beginners should still
    # practice extracting every field themselves.)
    note_content_detail.title = note_content.title
    note_content_detail.author = note_content.author
    note_content_detail.detail_link = BASE_HOST + note_content.detail_link

    response = requests.get(url=BASE_HOST + note_content.detail_link, headers=HEADERS)
    if response.status_code != 200:
        print(f"Failed to fetch post {note_content.title}, reason: {response.text}")
        return note_content_detail

    soup = BeautifulSoup(response.text, "lxml")
    note_content_detail.publish_datetime = soup.select(
        "#main-content > div:nth-child(4) > span.article-meta-value")[0].text

    # Handle the push comments
    note_content_detail.push_comment = []
    all_push_elements = soup.select("#main-content > div.push")
    for push_element in all_push_elements:
        note_push_comment = NotePushComment()
        spans = push_element.select("span")
        # A regular push entry has four spans: tag, user, content, time
        if len(spans) < 4:
            continue

        note_push_comment.push_user_name = spans[1].text.strip()
        note_push_comment.push_content = spans[2].text.strip().replace(": ", "")
        note_push_comment.push_time = spans[3].text.strip()
        note_content_detail.push_comment.append(note_push_comment)

    print(note_content_detail)
    return note_content_detail


def run_crawler(save_notes: List[NoteContentDetail]):
    """
    Crawler entry point.
    :param save_notes: container the results are appended to
    :return:
    """
    # step 1: get the pagination number
    previous_number: int = get_previous_page_number()

    # step 2: get the post lists of the first N pages
    note_list: List[NoteContent] = fetch_bbs_note_list(previous_number)

    # step 3: get each post's detail page and push comments
    for note_content in note_list:
        if not note_content.detail_link:
            continue
        note_content_detail = fetch_bbs_note_detail(note_content)
        save_notes.append(note_content_detail)

    print("Crawl finished .......")


if __name__ == '__main__':
    all_note_content_detail: List[NoteContentDetail] = []
    run_crawler(all_note_content_detail)
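
A possible refinement, not part of this commit: the synchronous version fires requests back-to-back, which is unfriendly to ptt.cc and brittle against transient failures. Below is a minimal sketch of a paced, retrying GET, assuming the HEADERS constant above; polite_get and its parameters are hypothetical names.

# Hypothetical helper, not in this commit: paced GET with simple linear backoff.
import time
from typing import Optional

import requests


def polite_get(url: str, headers: dict, retries: int = 3, delay: float = 1.0) -> requests.Response:
    """GET with a delay between attempts; raises after the last failure."""
    last_error: Optional[Exception] = None
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            last_error = e
        time.sleep(delay * attempt)  # back off a little more after each failed attempt
    raise Exception(f"failed to fetch {url} after {retries} attempts") from last_error

Each requests.get(...) call in the crawler could then be swapped for polite_get(...) without touching any of the parsing logic.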
Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
# -*- coding: utf-8 -*-
# @Author : [email protected]
# @Time : 2024/3/27 23:50
# @Desc : https://www.ptt.cc/bbs/Stock/index.html first N pages of posts - asynchronous version

from typing import List

import httpx
from parsel import Selector

from common import NoteContent, NoteContentDetail, NotePushComment

FIRST_N_PAGE = 10  # how many forum pages to crawl
BASE_HOST = "https://www.ptt.cc"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}


async def parse_note_use_parsel(html_content: str) -> NoteContent:
    """
    Extract a post's title, author, and publish date with parsel CSS selectors.
    Note that some posts are in an abnormal state (e.g. deleted) and carry no link,
    so always check that a selector actually matched before indexing into its result.
    :param html_content: HTML source of a single post entry
    :return: populated NoteContent
    """
    note_content = NoteContent()
    selector = Selector(text=html_content)
    title_elements = selector.css("div.r-ent div.title a")
    author_elements = selector.css("div.r-ent div.meta div.author")
    date_elements = selector.css("div.r-ent div.meta div.date")

    note_content.title = title_elements[0].root.text.strip() if title_elements else ""
    note_content.author = author_elements[0].root.text.strip() if author_elements else ""
    note_content.publish_date = date_elements[0].root.text.strip() if date_elements else ""
    note_content.detail_link = title_elements[0].attrib['href'] if title_elements else ""
    return note_content


async def get_previous_page_number() -> int:
    """
    Open the board's index page and extract the page number behind the "previous page" button.
    :return: the previous page's number
    """
    uri = "/bbs/Stock/index.html"
    async with httpx.AsyncClient() as client:
        response = await client.get(BASE_HOST + uri, headers=HEADERS)
        if response.status_code != 200:
            raise Exception("send request got error status code, reason:", response.text)
        selector = Selector(text=response.text)
        css_selector = "#action-bar-container > div > div.btn-group.btn-group-paging > a:nth-child(2)"
        pagination_link = selector.css(css_selector)[0].attrib['href'].strip()
        previous_page_number = int(pagination_link.replace("/bbs/Stock/index", "").replace(".html", ""))
        return previous_page_number


async def fetch_bbs_note_list(previous_number: int) -> List[NoteContent]:
    """
    Fetch the post lists of the first N pages.
    :param previous_number: page number taken from the "previous page" button
    :return: list of NoteContent
    """
    notes_list: List[NoteContent] = []
    start_page_number = previous_number + 1
    end_page_number = start_page_number - FIRST_N_PAGE
    async with httpx.AsyncClient() as client:
        for page_number in range(start_page_number, end_page_number, -1):
            print(f"Fetching the post list of page {page_number} ...")
            uri = f"/bbs/Stock/index{page_number}.html"
            response = await client.get(BASE_HOST + uri, headers=HEADERS)
            if response.status_code != 200:
                print(f"Failed to fetch page {page_number}, reason: {response.text}")
                continue
            selector = Selector(text=response.text)
            all_note_elements = selector.css("div.r-ent")
            for note_element_html in all_note_elements:
                note_content: NoteContent = await parse_note_use_parsel(note_element_html.get())
                notes_list.append(note_content)
            print(f"Finished page {page_number}, got {len(all_note_elements)} posts ...")
    return notes_list


async def fetch_bbs_note_detail(note_content: NoteContent) -> NoteContentDetail:
    """
    Fetch a post's detail page.
    :param note_content: post summary from the list page
    :return: NoteContentDetail including push comments
    """
    print(f"Fetching detail page {note_content.detail_link} ....")
    note_content_detail = NoteContentDetail()
    note_content_detail.title = note_content.title
    note_content_detail.author = note_content.author
    note_content_detail.detail_link = BASE_HOST + note_content.detail_link

    async with httpx.AsyncClient() as client:
        response = await client.get(note_content_detail.detail_link, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to fetch post {note_content.title}, reason: {response.text}")
            return note_content_detail
        selector = Selector(text=response.text)
        note_content_detail.publish_datetime = \
            selector.css("#main-content > div:nth-child(4) > span.article-meta-value")[0].root.text

        # Parse the push comments
        note_content_detail.push_comment = []
        all_push_elements = selector.css("#main-content > div.push")
        for push_element in all_push_elements:
            note_push_comment = NotePushComment()
            spans = push_element.css("span")
            # A regular push entry has four spans: tag, user, content, time
            if len(spans) < 4:
                continue
            note_push_comment.push_user_name = spans[1].root.text.strip()
            note_push_comment.push_content = spans[2].root.text.strip().replace(": ", "")
            note_push_comment.push_time = spans[3].root.text.strip()
            note_content_detail.push_comment.append(note_push_comment)
        print(note_content_detail)
        return note_content_detail


async def run_crawler(save_notes: List[NoteContentDetail]):
    previous_number = await get_previous_page_number()
    note_list = await fetch_bbs_note_list(previous_number)
    for note_content in note_list:
        if not note_content.detail_link:
            continue
        note_content_detail = await fetch_bbs_note_detail(note_content)
        save_notes.append(note_content_detail)
    print("Crawl finished .......")


if __name__ == '__main__':
    import asyncio

    all_note_content_detail: List[NoteContentDetail] = []
    asyncio.run(run_crawler(all_note_content_detail))
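
A possible follow-up, not part of this commit: run_crawler above still awaits each detail page one at a time, so the async version gains little over the synchronous one. Below is a sketch of bounded concurrency with asyncio.gather plus a semaphore, assuming fetch_bbs_note_detail and the models from this file; fetch_details_bounded and limit are hypothetical names.

# Hypothetical sketch, not in this commit: fetch detail pages concurrently,
# capping the number of in-flight requests so ptt.cc is not hammered.
import asyncio
from typing import List


async def fetch_details_bounded(note_list: List[NoteContent], limit: int = 5) -> List[NoteContentDetail]:
    semaphore = asyncio.Semaphore(limit)  # at most `limit` requests in flight

    async def fetch_one(note_content: NoteContent) -> NoteContentDetail:
        async with semaphore:
            return await fetch_bbs_note_detail(note_content)

    tasks = [fetch_one(n) for n in note_list if n.detail_link]
    return await asyncio.gather(*tasks)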
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
# @Author : [email protected]
# @Time : 2024/3/28 01:09
# @Desc : shared model code

from typing import List


class NoteContent:
    """
    Container for a post summary
    """
    title: str = ""  # post title
    author: str = ""  # post author
    publish_date: str = ""  # publish date
    detail_link: str = ""  # link to the post's detail page

    def __str__(self):
        return f"""
Title: {self.title}
User: {self.author}
Publish Date: {self.publish_date}
Detail Link: {self.detail_link}
"""


class NotePushComment:
    """
    Container for a single push comment
    """
    push_user_name: str = ""  # user who pushed
    push_content: str = ""  # push comment text
    push_time: str = ""  # push time

    def __repr__(self):
        # __repr__ rather than __str__ so the push_comment list inside
        # NoteContentDetail prints readably
        return f"NotePushComment(push_user_name='{self.push_user_name}', push_content='{self.push_content}', push_time='{self.push_time}')"


class NoteContentDetail:
    """
    Container for a full post
    """
    title: str = ""  # post title
    author: str = ""  # post author
    publish_datetime: str = ""  # publish datetime
    detail_link: str = ""  # link to the post's detail page
    push_comment: List[NotePushComment] = []  # push comments, PTT's rough equivalent of a comment section

    def __str__(self):
        return f"""
Title: {self.title}
User: {self.author}
Publish Datetime: {self.publish_datetime}
Detail Link: {self.detail_link}
Push Comments: {self.push_comment}
"""
