# @Time : 2024/3/27 22:47
# @Desc : Demonstrates extracting structured data from an HTML document with two libraries (BeautifulSoup and parsel)
55from bs4 import BeautifulSoup
6-
6+ from parsel import Selector
77
class NoteContent:
    """Container for the fields extracted from one post entry."""

    # Class-level defaults: every field is an empty string until a parser
    # assigns a value on the instance.
    title: str = ""
    author: str = ""
    publish_date: str = ""
    detail_link: str = ""

    def __str__(self) -> str:
        """Return a multi-line, human-readable summary of the post.

        The text is assembled line by line so that the indentation of this
        source file does not leak into the printed output. The leading and
        trailing newlines keep the summary visually separated from whatever
        is printed around it.
        """
        lines = [
            "",
            f"Title: {self.title}",
            f"User: {self.author}",
            f"Publish Date: {self.publish_date}",
            f"Detail Link: {self.detail_link}",
            "",
        ]
        return "\n".join(lines)
21+
1622
1723
def parse_html_use_bs(html_content: str):
    """
    Extract a post's title, author, publish date and detail link with
    BeautifulSoup (CSS selectors) and print the result.

    Only the first matching entry in the document is used. A field whose
    node is not found keeps its empty-string default instead of raising.

    :param html_content: HTML source text
    :return: None; the parsed NoteContent is printed to stdout
    """
    # Container for the extracted fields
    note_content = NoteContent()
    # Name the parser explicitly so the result does not depend on which
    # parsers happen to be installed
    soup = BeautifulSoup(html_content, "lxml")

    # The title text and the detail link both live on the same anchor,
    # so query it once
    title_link = soup.select_one("div.r-ent div.title a")
    if title_link is not None:
        note_content.title = title_link.text.strip()
        note_content.detail_link = title_link.get("href", "")

    # Author
    author_node = soup.select_one("div.r-ent div.meta div.author")
    if author_node is not None:
        note_content.author = author_node.text.strip()

    # Publish date
    date_node = soup.select_one("div.r-ent div.meta div.date")
    if date_node is not None:
        note_content.publish_date = date_node.text.strip()

    banner = "BeautifulSoup" + "*" * 30
    print(banner)
    print(note_content)
    print(banner)
3245
3346
def parse_html_use_parse(html_content: str):
    """
    Extract a post's title, author, publish date and detail link with
    parsel (XPath selectors) and print the result.

    Only the first match of each XPath is used. A field whose node is not
    found is set to an empty string instead of raising.

    :param html_content: HTML source text
    :return: None; the parsed NoteContent is printed to stdout
    """
    # Container for the extracted fields
    note_content = NoteContent()
    # Build the parsel selector from the raw HTML text
    selector = Selector(text=html_content)

    title_link_xpath = "//div[@class='r-ent']/div[@class='title']/a"
    meta_xpath = "//div[@class='r-ent']/div[@class='meta']"

    # default="" makes extract_first return an empty string (not None) when
    # nothing matches, so the following .strip() cannot fail
    note_content.title = (
        selector.xpath(f"{title_link_xpath}/text()").extract_first(default="").strip()
    )
    note_content.author = (
        selector.xpath(f"{meta_xpath}/div[@class='author']/text()")
        .extract_first(default="")
        .strip()
    )
    note_content.publish_date = (
        selector.xpath(f"{meta_xpath}/div[@class='date']/text()")
        .extract_first(default="")
        .strip()
    )
    note_content.detail_link = selector.xpath(
        f"{title_link_xpath}/@href"
    ).extract_first(default="")

    banner = "parsel" + "*" * 30
    print(banner)
    print(note_content)
    print(banner)
4269
4370if __name__ == '__main__' :
4471 ori_html = """
@@ -68,3 +95,5 @@ def parse_html_use_parse(html_content: str):
6895 </div>
6996 """
7097 parse_html_use_bs (ori_html )
98+ print ("" )
99+ parse_html_use_parse (ori_html )
0 commit comments