
Commit a67838d

feat: 08_爬虫入门实战1_静态网页提取, code completed ...
1 parent c54e7d9 commit a67838d

File tree

3 files changed: +333 -17 lines changed


README.md

Lines changed: 1 addition & 1 deletion
@@ -27,7 +27,7 @@
 - [x] [05_常用的抓包工具有那些](爬虫入门/05_常用的抓包工具有那些.md)
 - [x] [06_为什么说用Python写爬虫有天生优势](爬虫入门/06_为什么说用Python写爬虫有天生优势.md)
 - [x] [07_Python常见的网络请求库](爬虫入门/07_Python常见的网络请求库.md)
-- [ ] [08_爬虫入门实战1_静态网页数据提取](爬虫入门/08_爬虫入门实战1_静态网页数据提取.md)
+- [x] [08_爬虫入门实战1_静态网页数据提取](爬虫入门/08_爬虫入门实战1_静态网页数据提取.md)
 - [ ] [09_爬虫入门实战2_动态数据提取](爬虫入门/09_爬虫入门实战2_动态数据提取.md)
 - [ ] [10_爬虫入门实战3_数据存储实现](爬虫入门/10_爬虫入门实战3_数据存储实现.md)
 - [ ] [11_爬虫入门实战4_高效率的爬虫实现](爬虫入门/11_爬虫入门实战4_高效率的爬虫实现.md)

源代码/爬虫入门/08_爬虫入门实战1_静态网页数据提取/001_网页数据提取代码.py

Lines changed: 1 addition & 14 deletions
@@ -5,20 +5,7 @@
 from bs4 import BeautifulSoup
 from parsel import Selector
 
-class NoteContent:
-    title: str = ""
-    author: str = ""
-    publish_date: str = ""
-    detail_link: str = ""
-
-    def __str__(self):
-        return f"""
-        Title: {self.title}
-        User: {self.author}
-        Publish Date: {self.publish_date}
-        Detail Link: {self.detail_link}
-        """
-
+from common import NoteContent
 
 
 def parse_html_use_bs(html_content: str):
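
The `NoteContent` class removed here moves into a shared `common` module, and the tutorial code added below also imports `NoteContentDetail` and `NotePushComment` from it, neither of which appears in this commit. A minimal sketch of what such a `common.py` could look like, with field names inferred from how the tutorial code uses them (an assumption, not the repository's actual file):

```python
# common.py - hypothetical sketch, not the file shipped in the repository
from dataclasses import dataclass, field
from typing import List


@dataclass
class NoteContent:
    title: str = ""
    author: str = ""
    publish_date: str = ""
    detail_link: str = ""


@dataclass
class NotePushComment:
    push_user_name: str = ""
    push_cotent: str = ""  # attribute name spelled exactly as the tutorial code assigns it
    push_time: str = ""


@dataclass
class NoteContentDetail:
    title: str = ""
    author: str = ""
    publish_datetime: str = ""
    detail_link: str = ""
    push_comment: List[NotePushComment] = field(default_factory=list)
```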

爬虫入门/08_爬虫入门实战1_静态网页数据提取.md

Lines changed: 331 additions & 2 deletions
@@ -157,6 +157,335 @@ def parse_html_use_parse(html_content: str):

### Simple flowchart
> For a crawler requirement as simple as this one I rarely draw a flowchart, but to make things clearer I sketch one out before writing any code. Getting into the habit of drawing a diagram before coding helps both your own growth and your code's robustness, because while drawing you are already thinking through the program flow.

![](../static/images/1000000010.png)
### Code implementation
#### Installing dependencies
```shell
pip3 install requests
pip3 install beautifulsoup4
pip3 install lxml
pip3 install httpx
pip3 install parsel
```
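To confirm the installs succeeded before moving on, a throwaway snippet like the following (not part of the tutorial code) imports each library and prints its version:

```python
import requests, bs4, lxml, httpx, parsel

for module in (requests, bs4, lxml, httpx, parsel):
    # fall back to "unknown" because not every package exposes __version__ at the top level
    print(module.__name__, getattr(module, "__version__", "unknown"))
```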
#### requests + BeautifulSoup (synchronous version)
> Code path: 源代码/爬虫入门/08_爬虫入门实战1_静态网页数据提取/002_源码实现_同步版本.py
```python
# -*- coding: utf-8 -*-
# @Author  : [email protected]
# @Time    : 2024/3/27 23:50
# @Desc    : Fetch the first N pages of posts plus push comments from https://www.ptt.cc/bbs/Stock/index.html - synchronous version

from typing import List

import requests
from bs4 import BeautifulSoup

from common import NoteContent, NoteContentDetail, NotePushComment

FIRST_N_PAGE = 10  # number of forum list pages to crawl
BASE_HOST = "https://www.ptt.cc"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}


def parse_note_use_bs(html_content: str) -> NoteContent:
    """
    Extract the post title, author and publish date with BeautifulSoup, using CSS selectors.
    Note that some posts are in an abnormal state (e.g. deleted) and may have no link or other
    fields, so always check the element list length before indexing into it.
    :param html_content: HTML source of a single post entry
    :return:
    """
    # Container object for one post
    note_content = NoteContent()

    soup = BeautifulSoup(html_content, "lxml")
    title_elements = soup.select("div.r-ent div.title a")
    author_elements = soup.select("div.r-ent div.meta div.author")
    date_elements = soup.select("div.r-ent div.meta div.date")

    # Title, stripped of surrounding whitespace and line breaks
    note_content.title = title_elements[0].text.strip() if title_elements else ""
    # Author
    note_content.author = author_elements[0].text.strip() if author_elements else ""
    # Publish date
    note_content.publish_date = date_elements[0].text.strip() if date_elements else ""
    # Link to the post detail page
    note_content.detail_link = title_elements[0]["href"] if title_elements else ""
    return note_content


def get_previous_page_number() -> int:
    """
    Open the board index page and extract the page number behind the "上页" (previous page) link.
    :return:
    """
    uri = "/bbs/Stock/index.html"
    response = requests.get(url=BASE_HOST + uri, headers=HEADERS)
    if response.status_code != 200:
        raise Exception("send request got error status code, reason:", response.text)
    soup = BeautifulSoup(response.text, "lxml")

    # The easiest way to obtain this CSS selector is Chrome DevTools: press F12, select the
    # "上页" button, right-click and choose Copy -> Copy selector, and it is generated for you.
    css_selector = "#action-bar-container > div > div.btn-group.btn-group-paging > a:nth-child(2)"
    pagination_link = soup.select(css_selector)[0]["href"].strip()

    # pagination_link looks like /bbs/Stock/index7084.html; the number could be extracted with a
    # regular expression, but plain string replacement is good enough here.
    previous_page_number = int(pagination_link.replace("/bbs/Stock/index", "").replace(".html", ""))

    return previous_page_number


def fetch_bbs_note_list(previous_number: int) -> List[NoteContent]:
    """
    Fetch the post list of the first N pages.
    :return:
    """
    notes_list: List[NoteContent] = []

    # Compute the start and end page numbers. Since the index page itself is crawled as well,
    # the start position is the previous-page number plus one.
    start_page_number = previous_number + 1
    end_page_number = start_page_number - FIRST_N_PAGE
    for page_number in range(start_page_number, end_page_number, -1):
        print(f"Fetching the post list of page {page_number} ...")

        # Build the post list URL from the page number
        uri = f"/bbs/Stock/index{page_number}.html"
        response = requests.get(url=BASE_HOST + uri, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to fetch page {page_number}, reason: {response.text}")
            continue

        # Parse with BeautifulSoup CSS selectors; div.r-ent is the CSS class every post entry has
        soup = BeautifulSoup(response.text, "lxml")
        all_note_elements = soup.select("div.r-ent")
        for note_element in all_note_elements:
            # prettify() returns the HTML content of the whole div element
            note_content: NoteContent = parse_note_use_bs(note_element.prettify())
            notes_list.append(note_content)
        print(f"Finished page {page_number}, got {len(all_note_elements)} posts ...")
    return notes_list


def fetch_bbs_note_detail(note_content: NoteContent) -> NoteContentDetail:
    """
    Fetch the detail page of one post.
    :param note_content:
    :return:
    """
    print(f"Fetching the detail page of post {note_content.detail_link} ....")
    note_content_detail = NoteContentDetail()

    # Fields already present on note_content are copied directly instead of being re-extracted
    # from the page; be lazy where you can (beginners should still extract everything once).
    note_content_detail.title = note_content.title
    note_content_detail.author = note_content.author
    note_content_detail.detail_link = BASE_HOST + note_content.detail_link

    response = requests.get(url=BASE_HOST + note_content.detail_link, headers=HEADERS)
    if response.status_code != 200:
        print(f"Failed to fetch post {note_content.title}, reason: {response.text}")
        return note_content_detail

    soup = BeautifulSoup(response.text, "lxml")
    note_content_detail.publish_datetime = soup.select("#main-content > div:nth-child(4) > span.article-meta-value")[
        0].text

    # Parse the push comments
    note_content_detail.push_comment = []
    all_push_elements = soup.select("#main-content > div.push")
    for push_element in all_push_elements:
        note_push_comment = NotePushComment()
        push_spans = push_element.select("span")
        if len(push_spans) < 4:
            continue

        note_push_comment.push_user_name = push_spans[1].text.strip()
        note_push_comment.push_cotent = push_spans[2].text.strip().replace(": ", "")
        note_push_comment.push_time = push_spans[3].text.strip()
        note_content_detail.push_comment.append(note_push_comment)

    print(note_content_detail)
    return note_content_detail


def run_crawler(save_notes: List[NoteContentDetail]):
    """
    Crawler entry point.
    :param save_notes: container the results are appended to
    :return:
    """
    # step 1: get the previous-page number
    previous_number: int = get_previous_page_number()

    # step 2: get the post list of the first N pages
    note_list: List[NoteContent] = fetch_bbs_note_list(previous_number)

    # step 3: get the post details plus push comments
    for note_content in note_list:
        if not note_content.detail_link:
            continue
        note_content_detail = fetch_bbs_note_detail(note_content)
        save_notes.append(note_content_detail)

    print("Crawling task finished .......")


if __name__ == '__main__':
    all_note_content_detail: List[NoteContentDetail] = []
    run_crawler(all_note_content_detail)

```
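The comment inside `get_previous_page_number()` notes that the page number could also be pulled out of the pagination link with a regular expression instead of string replacement. A minimal sketch of that alternative (the helper name `extract_page_number` is made up for illustration):

```python
import re


def extract_page_number(pagination_link: str) -> int:
    # pagination_link looks like "/bbs/Stock/index7084.html"; capture the digits before ".html"
    match = re.search(r"index(\d+)\.html", pagination_link)
    if not match:
        raise ValueError(f"unexpected pagination link: {pagination_link}")
    return int(match.group(1))


print(extract_page_number("/bbs/Stock/index7084.html"))  # 7084
```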
#### httpx + parsel (asynchronous version)
> Code path: 源代码/爬虫入门/08_爬虫入门实战1xxx
```python
# -*- coding: utf-8 -*-
# @Author  : [email protected]
# @Time    : 2024/3/27 23:50
# @Desc    : Fetch the first N pages of posts from https://www.ptt.cc/bbs/Stock/index.html - asynchronous version

import httpx
from parsel import Selector
from typing import List

from common import NoteContent, NoteContentDetail, NotePushComment

FIRST_N_PAGE = 10  # number of forum list pages to crawl
BASE_HOST = "https://www.ptt.cc"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}


async def parse_note_use_parsel(html_content: str) -> NoteContent:
    """
    Extract the post title, author and publish date with parsel, using CSS selectors.
    Note that some posts are in an abnormal state (e.g. deleted) and may have no link or other
    fields, so always check the element list length before indexing into it.
    :param html_content: HTML source of a single post entry
    :return:
    """
    note_content = NoteContent()
    selector = Selector(text=html_content)
    title_elements = selector.css("div.r-ent div.title a")
    author_elements = selector.css("div.r-ent div.meta div.author")
    date_elements = selector.css("div.r-ent div.meta div.date")

    note_content.title = title_elements[0].root.text.strip() if title_elements else ""
    note_content.author = author_elements[0].root.text.strip() if author_elements else ""
    note_content.publish_date = date_elements[0].root.text.strip() if date_elements else ""
    note_content.detail_link = title_elements[0].attrib['href'] if title_elements else ""
    return note_content


async def get_previous_page_number() -> int:
    """
    Open the board index page and extract the page number behind the "上页" (previous page) link.
    :return:
    """
    uri = "/bbs/Stock/index.html"
    async with httpx.AsyncClient() as client:
        response = await client.get(BASE_HOST + uri, headers=HEADERS)
        if response.status_code != 200:
            raise Exception("send request got error status code, reason:", response.text)
        selector = Selector(text=response.text)
        css_selector = "#action-bar-container > div > div.btn-group.btn-group-paging > a:nth-child(2)"
        pagination_link = selector.css(css_selector)[0].attrib['href'].strip()
        previous_page_number = int(pagination_link.replace("/bbs/Stock/index", "").replace(".html", ""))
        return previous_page_number


async def fetch_bbs_note_list(previous_number: int) -> List[NoteContent]:
    """
    Fetch the post list of the first N pages.
    :param previous_number:
    :return:
    """
    notes_list: List[NoteContent] = []
    start_page_number = previous_number + 1
    end_page_number = start_page_number - FIRST_N_PAGE
    async with httpx.AsyncClient() as client:
        for page_number in range(start_page_number, end_page_number, -1):
            print(f"Fetching the post list of page {page_number} ...")
            uri = f"/bbs/Stock/index{page_number}.html"
            response = await client.get(BASE_HOST + uri, headers=HEADERS)
            if response.status_code != 200:
                print(f"Failed to fetch page {page_number}, reason: {response.text}")
                continue
            selector = Selector(text=response.text)
            all_note_elements = selector.css("div.r-ent")
            for note_element_html in all_note_elements:
                note_content: NoteContent = await parse_note_use_parsel(note_element_html.get())
                notes_list.append(note_content)
            print(f"Finished page {page_number}, got {len(all_note_elements)} posts ...")
    return notes_list


async def fetch_bbs_note_detail(note_content: NoteContent) -> NoteContentDetail:
    """
    Fetch the detail page of one post.
    :param note_content:
    :return:
    """
    print(f"Fetching the detail page of post {note_content.detail_link} ....")
    note_content_detail = NoteContentDetail()
    note_content_detail.title = note_content.title
    note_content_detail.author = note_content.author
    note_content_detail.detail_link = BASE_HOST + note_content.detail_link

    async with httpx.AsyncClient() as client:
        response = await client.get(note_content_detail.detail_link, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to fetch post {note_content.title}, reason: {response.text}")
            return note_content_detail
        selector = Selector(text=response.text)
        note_content_detail.publish_datetime = \
            selector.css("#main-content > div:nth-child(4) > span.article-meta-value")[0].root.text

        # Parse the push comments
        note_content_detail.push_comment = []
        all_push_elements = selector.css("#main-content > div.push")
        for push_element in all_push_elements:
            note_push_comment = NotePushComment()
            spans = push_element.css("span")
            if len(spans) < 4:
                continue
            note_push_comment.push_user_name = spans[1].root.text.strip()
            note_push_comment.push_cotent = spans[2].root.text.strip().replace(": ", "")
            note_push_comment.push_time = spans[3].root.text.strip()
            note_content_detail.push_comment.append(note_push_comment)
        print(note_content_detail)
        return note_content_detail


async def run_crawler(save_notes: List[NoteContentDetail]):
    previous_number = await get_previous_page_number()
    note_list = await fetch_bbs_note_list(previous_number)
    for note_content in note_list:
        if not note_content.detail_link:
            continue
        note_content_detail = await fetch_bbs_note_detail(note_content)
        save_notes.append(note_content_detail)
    print("Crawling task finished .......")


if __name__ == '__main__':
    import asyncio

    all_note_content_detail: List[NoteContentDetail] = []
    asyncio.run(run_crawler(all_note_content_detail))

```
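Note that the asynchronous version above still awaits each request one after another, so it is not yet faster than the synchronous one; efficient scheduling is the subject of the later 11_爬虫入门实战4_高效率的爬虫实现 lecture. As a small, hedged preview, detail pages could be fetched concurrently with `asyncio.gather` plus a semaphore that caps the number of in-flight requests (this sketch simply reuses the `fetch_bbs_note_detail` coroutine defined above):

```python
import asyncio

MAX_CONCURRENCY = 5  # stay polite to the target site


async def fetch_all_details(note_list: List[NoteContent]) -> List[NoteContentDetail]:
    semaphore = asyncio.Semaphore(MAX_CONCURRENCY)

    async def fetch_one(note_content: NoteContent) -> NoteContentDetail:
        # the semaphore limits how many detail requests run at the same time
        async with semaphore:
            return await fetch_bbs_note_detail(note_content)

    tasks = [fetch_one(note) for note in note_list if note.detail_link]
    return await asyncio.gather(*tasks)
```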

### Storage implementation
> We will leave the storage implementation for Lecture 10.


### Other notes

> Without noticing it, this tutorial, started at 9 p.m., ran on until 2:27 a.m., and the storage part is still not finished. The earlier posts never felt like much work, but once I reached the hands-on posts the time spent grew enormously.<br>
> And this is still a very, very simple crawler requirement; laying out the complete idea and reasoning in words and pictures really is not that easy. Thinking ahead to the intermediate and advanced crawler posts, I am a little daunted, and time may be in seriously short supply.
