Skip to content

Commit c91de71

Browse files
committed
feat: 08_爬虫入门实战1_静态网页数据提取.md 70%
1 parent 47b6370 commit c91de71

File tree

13 files changed

+172
-2
lines changed

13 files changed

+172
-2
lines changed

static/images/100000003.png

175 KB
Loading

static/images/100000004.png

208 KB
Loading

static/images/100000005.png

237 KB
Loading

static/images/100000006.png

319 KB
Loading

static/images/100000007.png

191 KB
Loading

static/images/100000008.png

574 KB
Loading

static/images/100000009.png

401 KB
Loading
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
# -*- coding: utf-8 -*-
2+
# @Author : [email protected]
3+
# @Time : 2024/3/27 22:47
4+
# @Desc : 分别使用两个库演示如何提取html文档结构数据
5+
from bs4 import BeautifulSoup
6+
7+
8+
class NoteContent:
    """Plain container for the fields extracted from one post entry.

    All fields are declared at class level with an empty-string default,
    so a freshly created instance reads as empty until values are assigned.
    """

    # Title text of the post.
    title: str = ""
    # Author of the post.
    author: str = ""
    # Publish date of the post, kept as a string.
    publish_date: str = ""
    # Link to the post's detail page.
    detail_link: str = ""

    def __str__(self):
        """Return the title and the detail link joined by an underscore."""
        return "{}_{}".format(self.title, self.detail_link)
16+
17+
18+
def parse_html_use_bs(html_content: str) -> "NoteContent":
    """
    Extract a post's title, detail link, author and publish date with
    BeautifulSoup, using CSS selectors.

    Fields whose element is missing from the markup keep their default
    empty string instead of raising.

    :param html_content: HTML source text of a post entry
    :return: a NoteContent populated with whatever fields were found
    """
    # Container that collects the extracted fields.
    note_content = NoteContent()

    # Name the parser explicitly: without it bs4 emits a
    # GuessedAtParserWarning and picks whichever parser happens to be
    # installed, so results can differ between environments.
    soup = BeautifulSoup(html_content, "html.parser")

    # The title anchor carries both the title text and the detail link.
    # select_one returns None when nothing matches, which avoids the
    # IndexError that indexing an empty select() result would raise.
    title_link = soup.select_one("div.r-ent div.title a")
    if title_link is not None:
        note_content.title = title_link.text.strip()
        note_content.detail_link = title_link.get("href", "")

    # Author name.
    author_tag = soup.select_one("div.r-ent div.meta div.author")
    if author_tag is not None:
        note_content.author = author_tag.text.strip()

    # Publish date; the markup pads it with spaces, so strip it.
    date_tag = soup.select_one("div.r-ent div.meta div.date")
    if date_tag is not None:
        note_content.publish_date = date_tag.text.strip()

    # Show the extracted record rather than dumping the whole document.
    print(note_content)
    return note_content
32+
33+
34+
def parse_html_use_parse(html_content: str):
    """
    Extract a post's title, author and publish date with parse, using
    XPath selectors.

    Not implemented yet: the input is ignored and nothing is extracted.

    :param html_content: HTML source text
    :return: None
    """
    return None
41+
42+
43+
# Script entry point: defines one sample post-entry HTML fragment and
# passes it to parse_html_use_bs.
if __name__ == '__main__':
44+
ori_html = """
45+
<div class="r-ent">
46+
<div class="nrec"><span class="hl f3">11</span></div>
47+
<div class="title">
48+
49+
<a href="/bbs/Stock/M.1711544298.A.9F8.html">[新聞] 童子賢:用稅收補貼電費非長久之計 應共</a>
50+
51+
</div>
52+
<div class="meta">
53+
<div class="author">addy7533967</div>
54+
<div class="article-menu">
55+
56+
<div class="trigger">⋯</div>
57+
<div class="dropdown">
58+
<div class="item"><a href="/bbs/Stock/search?q=thread%3A%5B%E6%96%B0%E8%81%9E%5D+%E7%AB%A5%E5%AD%90%E8%B3%A2%EF%BC%9A%E7%94%A8%E7%A8%85%E6%94%B6%E8%A3%9C%E8%B2%BC%E9%9B%BB%E8%B2%BB%E9%9D%9E%E9%95%B7%E4%B9%85%E4%B9%8B%E8%A8%88+%E6%87%89%E5%85%B1">搜尋同標題文章</a></div>
59+
60+
<div class="item"><a href="/bbs/Stock/search?q=author%3Aaddy7533967">搜尋看板內 addy7533967 的文章</a></div>
61+
62+
</div>
63+
64+
</div>
65+
<div class="date"> 3/27</div>
66+
<div class="mark"></div>
67+
</div>
68+
</div>
69+
"""
70+
# Run the BeautifulSoup-based extractor on the sample fragment.
parse_html_use_bs(ori_html)

0 commit comments

Comments
 (0)