|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | + |
| 3 | +# @Time : 2024/3/27 22:47 |
| 4 | +# @Desc : 分别使用两个库演示如何提取html文档结构数据 |
| 5 | +from bs4 import BeautifulSoup |
| 6 | + |
| 7 | + |
class NoteContent:
    """Plain container for the fields extracted from one post entry.

    Every field defaults to an empty string at class level; callers fill
    them in by assigning on an instance.
    """

    # Post title text.
    title: str = ""
    # Name of the post's author.
    author: str = ""
    # Publish date, kept as the raw string found in the page.
    publish_date: str = ""
    # Link to the post (presumably its detail page, going by the name).
    detail_link: str = ""

    def __str__(self):
        # Rendered as "<title>_<detail_link>".
        return "{}_{}".format(self.title, self.detail_link)
| 16 | + |
| 17 | + |
def _first_text(soup, css_selector: str) -> str:
    """Return the stripped text of the first node matching *css_selector*, or "" if none."""
    node = soup.select_one(css_selector)
    return node.text.strip() if node is not None else ""


def parse_html_use_bs(html_content: str):
    """
    Extract a post's title, link, author and publish date with BeautifulSoup,
    locating each field through CSS selectors.

    :param html_content: HTML source of a post entry (a ``div.r-ent`` block)
    :return: a ``NoteContent`` holding the extracted fields; a field whose
        element is missing from the markup keeps its default empty string
    """
    # Container for the extracted fields.
    note_content = NoteContent()
    # Name the parser explicitly: without it bs4 emits GuessedAtParserWarning
    # and picks whichever parser is installed, so results can differ per machine.
    soup = BeautifulSoup(html_content, "html.parser")

    # Title and link live on the anchor inside the title cell.
    title_anchor = soup.select_one("div.r-ent div.title a")
    if title_anchor is not None:
        note_content.title = title_anchor.text.strip()
        note_content.detail_link = title_anchor.get("href", "")
    else:
        # No anchor in the title cell: fall back to the cell's own text
        # instead of raising IndexError as the old ``select(...)[0]`` did.
        note_content.title = _first_text(soup, "div.r-ent div.title")

    # Author and publish date sit in the meta cell.
    note_content.author = _first_text(soup, "div.r-ent div.meta div.author")
    note_content.publish_date = _first_text(soup, "div.r-ent div.meta div.date")

    # Show the parsed result (the demo previously dumped the whole soup).
    print(note_content)
    return note_content
| 32 | + |
| 33 | + |
def parse_html_use_parse(html_content: str):
    """
    Extract a post's title, author and publish date with "parse", locating
    the fields through XPath selectors.

    Not implemented yet: the function currently does nothing and returns None.
    NOTE(review): "parse" presumably refers to the parsel library -- confirm.

    :param html_content: HTML source content
    :return: None (placeholder)
    """
    return None
| 41 | + |
| 42 | + |
if __name__ == '__main__':
    # Sample markup for a single post entry (a ``div.r-ent`` block holding the
    # title link, author and date), used as input for the demo call below.
    ori_html = """
    <div class="r-ent">
        <div class="nrec"><span class="hl f3">11</span></div>
        <div class="title">

            <a href="/bbs/Stock/M.1711544298.A.9F8.html">[新聞] 童子賢:用稅收補貼電費非長久之計 應共</a>

        </div>
        <div class="meta">
            <div class="author">addy7533967</div>
            <div class="article-menu">

                <div class="trigger">⋯</div>
                <div class="dropdown">
                    <div class="item"><a href="/bbs/Stock/search?q=thread%3A%5B%E6%96%B0%E8%81%9E%5D+%E7%AB%A5%E5%AD%90%E8%B3%A2%EF%BC%9A%E7%94%A8%E7%A8%85%E6%94%B6%E8%A3%9C%E8%B2%BC%E9%9B%BB%E8%B2%BB%E9%9D%9E%E9%95%B7%E4%B9%85%E4%B9%8B%E8%A8%88+%E6%87%89%E5%85%B1">搜尋同標題文章</a></div>

                    <div class="item"><a href="/bbs/Stock/search?q=author%3Aaddy7533967">搜尋看板內 addy7533967 的文章</a></div>

                </div>

            </div>
            <div class="date"> 3/27</div>
            <div class="mark"></div>
        </div>
    </div>
    """
    # Run the BeautifulSoup-based extractor on the sample markup.
    parse_html_use_bs(ori_html)
0 commit comments