Skip to content

Commit c54e7d9

Browse files
committed
feat: 08_爬虫入门实战1_静态网页数据提取.md 增加提取代码示例
1 parent c91de71 commit c54e7d9

File tree

2 files changed

+104
-13
lines changed

2 files changed

+104
-13
lines changed

源代码/爬虫入门/08_爬虫入门实战1_静态网页数据提取/001_网页数据提取代码.py

Lines changed: 36 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# @Time : 2024/3/27 22:47
44
# @Desc : 分别使用两个库演示如何提取html文档结构数据
55
from bs4 import BeautifulSoup
6-
6+
from parsel import Selector
77

88
class NoteContent:
99
title: str = ""
@@ -12,7 +12,13 @@ class NoteContent:
1212
detail_link: str = ""
1313

1414
def __str__(self):
15-
return f"{self.title}_{self.detail_link}"
15+
return f"""
16+
Title: {self.title}
17+
User: {self.author}
18+
Publish Date: {self.publish_date}
19+
Detail Link: {self.detail_link}
20+
"""
21+
1622

1723

1824
def parse_html_use_bs(html_content: str):
@@ -24,21 +30,42 @@ def parse_html_use_bs(html_content: str):
2430
# 初始化一个帖子保存容器
2531
note_content = NoteContent()
2632
# 初始化bs查询对象
27-
soup = BeautifulSoup(html_content)
33+
soup = BeautifulSoup(html_content, "lxml")
2834
# 提取标题并去左右除换行空格字符
2935
note_content.title = soup.select("div.r-ent div.title a")[0].text.strip()
30-
# 提取作则
31-
print(soup)
36+
# 提取作者
37+
note_content.author = soup.select("div.r-ent div.meta div.author")[0].text.strip()
38+
# 提取发布日期
39+
note_content.publish_date = soup.select("div.r-ent div.meta div.date")[0].text.strip()
40+
# 提取帖子链接
41+
note_content.detail_link = soup.select("div.r-ent div.title a")[0]["href"]
42+
print("BeautifulSoup" + "*" * 30)
43+
print(note_content)
44+
print("BeautifulSoup" + "*" * 30)
3245

3346

3447
def parse_html_use_parse(html_content: str):
3548
"""
36-
使用parse提取帖子标题、作者、发布日期,基于xpath选择器提取
49+
使用parsel提取帖子标题、作者、发布日期,基于xpath选择器提取
3750
:param html_content: html源代码内容
3851
:return:
3952
"""
40-
pass
53+
# 初始化一个帖子保存容器
54+
note_content = NoteContent()
55+
# 使用parsel创建选择器对象
56+
selector = Selector(text=html_content)
57+
# 使用XPath提取标题并去除左右空格
58+
note_content.title = selector.xpath("//div[@class='r-ent']/div[@class='title']/a/text()").extract_first().strip()
59+
# 使用XPath提取作者
60+
note_content.author = selector.xpath("//div[@class='r-ent']/div[@class='meta']/div[@class='author']/text()").extract_first().strip()
61+
# 使用XPath提取发布日期
62+
note_content.publish_date = selector.xpath("//div[@class='r-ent']/div[@class='meta']/div[@class='date']/text()").extract_first().strip()
63+
# 使用XPath提取帖子链接
64+
note_content.detail_link = selector.xpath("//div[@class='r-ent']/div[@class='title']/a/@href").extract_first()
4165

66+
print("parsel" + "*" * 30)
67+
print(note_content)
68+
print("parsel" + "*" * 30)
4269

4370
if __name__ == '__main__':
4471
ori_html = """
@@ -68,3 +95,5 @@ def parse_html_use_parse(html_content: str):
6895
</div>
6996
"""
7097
parse_html_use_bs(ori_html)
98+
print("")
99+
parse_html_use_parse(ori_html)

爬虫入门/08_爬虫入门实战1_静态网页数据提取.md

Lines changed: 68 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -85,12 +85,74 @@
8585
```
8686

8787
```python
88-
# BeautifulSoup
89-
from bs4 import BeautifulSoup as bs
90-
soup = bs.select("div.r-ent")
91-
print(soup)
92-
93-
# parsel
88+
# -*- coding: utf-8 -*-
89+
# @Author : [email protected]
90+
# @Time : 2024/3/27 22:47
91+
# @Desc : 分别使用两个库演示如何提取html文档结构数据
92+
from bs4 import BeautifulSoup
93+
from parsel import Selector
94+
95+
class NoteContent:
96+
title: str = ""
97+
author: str = ""
98+
publish_date: str = ""
99+
detail_link: str = ""
100+
101+
def __str__(self):
102+
return f"""
103+
Title: {self.title}
104+
User: {self.author}
105+
Publish Date: {self.publish_date}
106+
Detail Link: {self.detail_link}
107+
"""
108+
109+
110+
111+
def parse_html_use_bs(html_content: str):
112+
"""
113+
使用BeautifulSoup提取帖子标题、作者、发布日期,基于css选择器提取
114+
:param html_content: html源代码内容
115+
:return:
116+
"""
117+
# 初始化一个帖子保存容器
118+
note_content = NoteContent()
119+
# 初始化bs查询对象
120+
soup = BeautifulSoup(html_content, "lxml")
121+
# 提取标题并去左右除换行空格字符
122+
note_content.title = soup.select("div.r-ent div.title a")[0].text.strip()
123+
# 提取作者
124+
note_content.author = soup.select("div.r-ent div.meta div.author")[0].text.strip()
125+
# 提取发布日期
126+
note_content.publish_date = soup.select("div.r-ent div.meta div.date")[0].text.strip()
127+
# 提取帖子链接
128+
note_content.detail_link = soup.select("div.r-ent div.title a")[0]["href"]
129+
print("BeautifulSoup" + "*" * 30)
130+
print(note_content)
131+
print("BeautifulSoup" + "*" * 30)
132+
133+
134+
def parse_html_use_parse(html_content: str):
135+
"""
136+
使用parsel提取帖子标题、作者、发布日期,基于xpath选择器提取
137+
:param html_content: html源代码内容
138+
:return:
139+
"""
140+
# 初始化一个帖子保存容器
141+
note_content = NoteContent()
142+
# 使用parsel创建选择器对象
143+
selector = Selector(text=html_content)
144+
# 使用XPath提取标题并去除左右空格
145+
note_content.title = selector.xpath("//div[@class='r-ent']/div[@class='title']/a/text()").extract_first().strip()
146+
# 使用XPath提取作者
147+
note_content.author = selector.xpath("//div[@class='r-ent']/div[@class='meta']/div[@class='author']/text()").extract_first().strip()
148+
# 使用XPath提取发布日期
149+
note_content.publish_date = selector.xpath("//div[@class='r-ent']/div[@class='meta']/div[@class='date']/text()").extract_first().strip()
150+
# 使用XPath提取帖子链接
151+
note_content.detail_link = selector.xpath("//div[@class='r-ent']/div[@class='title']/a/@href").extract_first()
152+
153+
print("parsel" + "*" * 30)
154+
print(note_content)
155+
print("parsel" + "*" * 30)
94156
```
95157

96158

0 commit comments

Comments (0)