Skip to content

Commit c54e7d9

Browse files
committed
feat: 08_爬虫入门实战1_静态网页数据提取.md 增加提取代码示例
1 parent c91de71 commit c54e7d9

File tree

2 files changed

+104
-13
lines changed

2 files changed

+104
-13
lines changed

源代码/爬虫入门/08_爬虫入门实战1_静态网页数据提取/001_网页数据提取代码.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# @Time : 2024/3/27 22:47
44
# @Desc : 分别使用两个库演示如何提取html文档结构数据
55
from bs4 import BeautifulSoup
6-
6+
from parsel import Selector
77

88
class NoteContent:
99
title: str = ""
@@ -12,7 +12,13 @@ class NoteContent:
1212
detail_link: str = ""
1313

1414
def __str__(self):
15-
return f"{self.title}_{self.detail_link}"
15+
return f"""
16+
Title: {self.title}
17+
User: {self.author}
18+
Publish Date: {self.publish_date}
19+
Detail Link: {self.detail_link}
20+
"""
21+
1622

1723

1824
def parse_html_use_bs(html_content: str):
@@ -24,21 +30,42 @@ def parse_html_use_bs(html_content: str):
2430
# 初始化一个帖子保存容器
2531
note_content = NoteContent()
2632
# 初始化bs查询对象
27-
soup = BeautifulSoup(html_content)
33+
soup = BeautifulSoup(html_content, "lxml")
2834
# 提取标题并去左右除换行空格字符
2935
note_content.title = soup.select("div.r-ent div.title a")[0].text.strip()
30-
# 提取作则
31-
print(soup)
36+
# 提取作者
37+
note_content.author = soup.select("div.r-ent div.meta div.author")[0].text.strip()
38+
# 提取发布日期
39+
note_content.publish_date = soup.select("div.r-ent div.meta div.date")[0].text.strip()
40+
# 提取帖子链接
41+
note_content.detail_link = soup.select("div.r-ent div.title a")[0]["href"]
42+
print("BeautifulSoup" + "*" * 30)
43+
print(note_content)
44+
print("BeautifulSoup" + "*" * 30)
3245

3346

3447
def parse_html_use_parse(html_content: str):
3548
"""
36-
使用parse提取帖子标题、作者、发布日期,基于xpath选择器提取
49+
使用parsel提取帖子标题、作者、发布日期,基于xpath选择器提取
3750
:param html_content: html源代码内容
3851
:return:
3952
"""
40-
pass
53+
# 初始化一个帖子保存容器
54+
note_content = NoteContent()
55+
# 使用parsel创建选择器对象
56+
selector = Selector(text=html_content)
57+
# 使用XPath提取标题并去除左右空格
58+
note_content.title = selector.xpath("//div[@class='r-ent']/div[@class='title']/a/text()").extract_first().strip()
59+
# 使用XPath提取作者
60+
note_content.author = selector.xpath("//div[@class='r-ent']/div[@class='meta']/div[@class='author']/text()").extract_first().strip()
61+
# 使用XPath提取发布日期
62+
note_content.publish_date = selector.xpath("//div[@class='r-ent']/div[@class='meta']/div[@class='date']/text()").extract_first().strip()
63+
# 使用XPath提取帖子链接
64+
note_content.detail_link = selector.xpath("//div[@class='r-ent']/div[@class='title']/a/@href").extract_first()
4165

66+
print("parsel" + "*" * 30)
67+
print(note_content)
68+
print("parsel" + "*" * 30)
4269

4370
if __name__ == '__main__':
4471
ori_html = """
@@ -68,3 +95,5 @@ def parse_html_use_parse(html_content: str):
6895
</div>
6996
"""
7097
parse_html_use_bs(ori_html)
98+
print("")
99+
parse_html_use_parse(ori_html)

爬虫入门/08_爬虫入门实战1_静态网页数据提取.md

Lines changed: 68 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,12 +85,74 @@
8585
```
8686

8787
```python
88-
# BeautifulSoup
89-
from bs4 import BeautifulSoup as bs
90-
soup = bs.select("div.r-ent")
91-
print(soup)
92-
93-
# parsel
88+
# -*- coding: utf-8 -*-
89+
# @Author : [email protected]
90+
# @Time : 2024/3/27 22:47
91+
# @Desc : 分别使用两个库演示如何提取html文档结构数据
92+
from bs4 import BeautifulSoup
93+
from parsel import Selector
94+
95+
class NoteContent:
96+
title: str = ""
97+
author: str = ""
98+
publish_date: str = ""
99+
detail_link: str = ""
100+
101+
def __str__(self):
102+
return f"""
103+
Title: {self.title}
104+
User: {self.author}
105+
Publish Date: {self.publish_date}
106+
Detail Link: {self.detail_link}
107+
"""
108+
109+
110+
111+
def parse_html_use_bs(html_content: str):
112+
"""
113+
使用BeautifulSoup提取帖子标题、作者、发布日期,基于css选择器提取
114+
:param html_content: html源代码内容
115+
:return:
116+
"""
117+
# 初始化一个帖子保存容器
118+
note_content = NoteContent()
119+
# 初始化bs查询对象
120+
soup = BeautifulSoup(html_content, "lxml")
121+
# 提取标题并去左右除换行空格字符
122+
note_content.title = soup.select("div.r-ent div.title a")[0].text.strip()
123+
# 提取作者
124+
note_content.author = soup.select("div.r-ent div.meta div.author")[0].text.strip()
125+
# 提取发布日期
126+
note_content.publish_date = soup.select("div.r-ent div.meta div.date")[0].text.strip()
127+
# 提取帖子链接
128+
note_content.detail_link = soup.select("div.r-ent div.title a")[0]["href"]
129+
print("BeautifulSoup" + "*" * 30)
130+
print(note_content)
131+
print("BeautifulSoup" + "*" * 30)
132+
133+
134+
def parse_html_use_parse(html_content: str):
135+
"""
136+
使用parsel提取帖子标题、作者、发布日期,基于xpath选择器提取
137+
:param html_content: html源代码内容
138+
:return:
139+
"""
140+
# 初始化一个帖子保存容器
141+
note_content = NoteContent()
142+
# 使用parsel创建选择器对象
143+
selector = Selector(text=html_content)
144+
# 使用XPath提取标题并去除左右空格
145+
note_content.title = selector.xpath("//div[@class='r-ent']/div[@class='title']/a/text()").extract_first().strip()
146+
# 使用XPath提取作者
147+
note_content.author = selector.xpath("//div[@class='r-ent']/div[@class='meta']/div[@class='author']/text()").extract_first().strip()
148+
# 使用XPath提取发布日期
149+
note_content.publish_date = selector.xpath("//div[@class='r-ent']/div[@class='meta']/div[@class='date']/text()").extract_first().strip()
150+
# 使用XPath提取帖子链接
151+
note_content.detail_link = selector.xpath("//div[@class='r-ent']/div[@class='title']/a/@href").extract_first()
152+
153+
print("parsel" + "*" * 30)
154+
print(note_content)
155+
print("parsel" + "*" * 30)
94156
```
95157

96158

0 commit comments

Comments (0)