### Simple Flowchart
> For a simple crawling task like this one I almost never draw a flowchart, but to make things clearer for everyone I will sketch one before writing the code. Getting into the habit of drawing a diagram before coding helps both your own growth and the robustness of your code: while drawing it you are already thinking through the program flow.
![](../static/images/1000000010.png)
### Code Implementation
#### Installing the Dependencies
```shell
pip3 install requests
pip3 install beautifulsoup4
pip3 install lxml
pip3 install httpx
pip3 install parsel
```
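
Both versions below import their data containers from a local `common` module that is not shown in this chapter. As a rough idea only, here is a minimal sketch of what those classes might look like, with field names inferred from how the crawler code uses them (this is an assumption, not the actual `common.py`):

```python
# Hypothetical sketch of common.py -- field names inferred from the crawler code below.
from dataclasses import dataclass, field
from typing import List


@dataclass
class NoteContent:
    """One row of the post-list page."""
    title: str = ""
    author: str = ""
    publish_date: str = ""
    detail_link: str = ""


@dataclass
class NotePushComment:
    """One push (comment) under a post."""
    push_user_name: str = ""
    push_cotent: str = ""  # spelling kept consistent with the crawler code
    push_time: str = ""


@dataclass
class NoteContentDetail:
    """Detail page of a post, including its push comments."""
    title: str = ""
    author: str = ""
    detail_link: str = ""
    publish_datetime: str = ""
    push_comment: List[NotePushComment] = field(default_factory=list)
```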
#### requests + BeautifulSoup (synchronous version)
> Code path: 源代码/爬虫入门/08_爬虫入门实战1_静态网页数据提取/002_源码实现_同步版本.py
```python
# -*- coding: utf-8 -*-
# @Time : 2024/3/27 23:50
# @Desc : https://www.ptt.cc/bbs/Stock/index.html  fetch the first N pages of posts + push comments - synchronous version

from typing import List

import requests
from bs4 import BeautifulSoup

from common import NoteContent, NoteContentDetail, NotePushComment

FIRST_N_PAGE = 10  # number of forum post-list pages to crawl
BASE_HOST = "https://www.ptt.cc"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}


def parse_note_use_bs(html_content: str) -> NoteContent:
    """
    Extract the post title, author and publish date with BeautifulSoup CSS selectors.
    Note that some posts are in an abnormal state (e.g. deleted) and may have no link,
    so it is safest to check the number of matched elements before indexing into them.
    :param html_content: HTML source of one post element
    :return:
    """
    # Container that holds one post
    note_content = NoteContent()

    soup = BeautifulSoup(html_content, "lxml")
    # Extract the title and strip surrounding whitespace and newlines
    note_content.title = soup.select("div.r-ent div.title a")[0].text.strip() if len(
        soup.select("div.r-ent div.title a")) > 0 else ""

    # Extract the author
    note_content.author = soup.select("div.r-ent div.meta div.author")[0].text.strip() if len(
        soup.select("div.r-ent div.meta div.author")) > 0 else ""

    # Extract the publish date
    note_content.publish_date = soup.select("div.r-ent div.meta div.date")[0].text.strip() if len(
        soup.select("div.r-ent div.meta div.date")) > 0 else ""

    # Extract the detail-page link
    note_content.detail_link = soup.select("div.r-ent div.title a")[0]["href"] if len(
        soup.select("div.r-ent div.title a")) > 0 else ""
    return note_content


def get_previos_page_number() -> int:
    """
    Open the index page and extract the page number from the "previous page" link
    :return:
    """
    uri = "/bbs/Stock/index.html"
    response = requests.get(url=BASE_HOST + uri, headers=HEADERS)
    if response.status_code != 200:
        raise Exception("send request got error status code, reason:", response.text)
    soup = BeautifulSoup(response.text, "lxml")

    # The easiest way to get this CSS selector: open Chrome DevTools (F12), select the
    # "previous page" button, right-click it and choose Copy -> Copy selector.
    css_selector = "#action-bar-container > div > div.btn-group.btn-group-paging > a:nth-child(2)"
    pagination_link = soup.select(css_selector)[0]["href"].strip()

    # pagination_link looks like /bbs/Stock/index7084.html; the number could be pulled out with a
    # regular expression, but plain string replacement is enough here.
    previos_page_number = int(pagination_link.replace("/bbs/Stock/index", "").replace(".html", ""))

    return previos_page_number


def fetch_bbs_note_list(previos_number: int) -> List[NoteContent]:
    """
    Fetch the post lists of the first N pages
    :return:
    """
    notes_list: List[NoteContent] = []

    # Compute the start and end positions of the pagination. The index page itself must be
    # crawled too, so the start position is the "previous page" number plus 1.
    start_page_number = previos_number + 1
    end_page_number = start_page_number - FIRST_N_PAGE
    for page_number in range(start_page_number, end_page_number, -1):
        print(f"Start fetching the post list of page {page_number} ...")

        # Build the post-list URL from the page number
        uri = f"/bbs/Stock/index{page_number}.html"
        response = requests.get(url=BASE_HOST + uri, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to fetch page {page_number}, reason: {response.text}")
            continue

        # Parse with BeautifulSoup CSS selectors; div.r-ent is the CSS class shared by every
        # post entry on the list page
        soup = BeautifulSoup(response.text, "lxml")
        all_note_elements = soup.select("div.r-ent")
        for note_element in all_note_elements:
            # prettify() returns the HTML of the whole div element
            note_content: NoteContent = parse_note_use_bs(note_element.prettify())
            notes_list.append(note_content)
        print(f"Finished page {page_number}, got {len(all_note_elements)} posts ...")
    return notes_list


def fetch_bbs_note_detail(note_content: NoteContent) -> NoteContentDetail:
    """
    Fetch the detail page of a post
    :param note_content:
    :return:
    """
    print(f"Start fetching the detail page {note_content.detail_link} ....")
    note_content_detail = NoteContentDetail()

    # Fields that note_content already carries are copied directly instead of being re-extracted
    # from the page (beginners should still practice extracting everything).
    note_content_detail.title = note_content.title
    note_content_detail.author = note_content.author
    note_content_detail.detail_link = BASE_HOST + note_content.detail_link

    response = requests.get(url=BASE_HOST + note_content.detail_link, headers=HEADERS)
    if response.status_code != 200:
        print(f"Failed to fetch post {note_content.title}, reason: {response.text}")
        return note_content_detail

    soup = BeautifulSoup(response.text, "lxml")
    note_content_detail.publish_datetime = soup.select("#main-content > div:nth-child(4) > span.article-meta-value")[
        0].text

    # Parse the push comments
    note_content_detail.push_comment = []
    all_push_elements = soup.select("#main-content > div.push")
    for push_element in all_push_elements:
        note_push_comment = NotePushComment()
        # A normal push element has 4 spans: push tag, user name, content, time
        if len(push_element.select("span")) < 4:
            continue

        note_push_comment.push_user_name = push_element.select("span")[1].text.strip()
        note_push_comment.push_cotent = push_element.select("span")[2].text.strip().replace(": ", "")
        note_push_comment.push_time = push_element.select("span")[3].text.strip()
        note_content_detail.push_comment.append(note_push_comment)

    print(note_content_detail)
    return note_content_detail


def run_crawler(save_notes: List[NoteContentDetail]):
    """
    Crawler entry point
    :param save_notes: container in which the results are collected
    :return:
    """
    # step1: get the pagination number
    previos_number: int = get_previos_page_number()

    # step2: fetch the post lists of the first N pages
    note_list: List[NoteContent] = fetch_bbs_note_list(previos_number)

    # step3: fetch the detail page + push comments of each post
    for note_content in note_list:
        if not note_content.detail_link:
            continue
        note_content_detail = fetch_bbs_note_detail(note_content)
        save_notes.append(note_content_detail)

    print("Crawling finished .......")


if __name__ == '__main__':
    all_note_content_detail: List[NoteContentDetail] = []
    run_crawler(all_note_content_detail)
```
#### httpx + parsel (asynchronous version)
> Code path: 源代码/爬虫入门/08_爬虫入门实战1xxx
```python
# -*- coding: utf-8 -*-
# @Time : 2024/3/27 23:50
# @Desc : https://www.ptt.cc/bbs/Stock/index.html  fetch the first N pages of posts - asynchronous version

import httpx
from parsel import Selector
from typing import List

from common import NoteContent, NoteContentDetail, NotePushComment

FIRST_N_PAGE = 10  # number of forum post-list pages to crawl
BASE_HOST = "https://www.ptt.cc"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}


async def parse_note_use_parsel(html_content: str) -> NoteContent:
    """
    Extract the post title, author and publish date with parsel CSS selectors.
    Note that some posts are in an abnormal state and may have no link, so check the number
    of matched elements before indexing into them.
    :param html_content: HTML source of one post element
    :return:
    """
    note_content = NoteContent()
    selector = Selector(text=html_content)
    title_elements = selector.css("div.r-ent div.title a")
    author_elements = selector.css("div.r-ent div.meta div.author")
    date_elements = selector.css("div.r-ent div.meta div.date")

    note_content.title = title_elements[0].root.text.strip() if title_elements else ""
    note_content.author = author_elements[0].root.text.strip() if author_elements else ""
    note_content.publish_date = date_elements[0].root.text.strip() if date_elements else ""
    note_content.detail_link = title_elements[0].attrib['href'] if title_elements else ""
    return note_content


async def get_previous_page_number() -> int:
    """
    Open the index page and extract the page number from the "previous page" link
    :return:
    """
    uri = "/bbs/Stock/index.html"
    async with httpx.AsyncClient() as client:
        response = await client.get(BASE_HOST + uri, headers=HEADERS)
    if response.status_code != 200:
        raise Exception("send request got error status code, reason:", response.text)
    selector = Selector(text=response.text)
    css_selector = "#action-bar-container > div > div.btn-group.btn-group-paging > a:nth-child(2)"
    pagination_link = selector.css(css_selector)[0].attrib['href'].strip()
    previous_page_number = int(pagination_link.replace("/bbs/Stock/index", "").replace(".html", ""))
    return previous_page_number


async def fetch_bbs_note_list(previous_number: int) -> List[NoteContent]:
    """
    Fetch the post lists of the first N pages
    :param previous_number:
    :return:
    """
    notes_list: List[NoteContent] = []
    start_page_number = previous_number + 1
    end_page_number = start_page_number - FIRST_N_PAGE
    async with httpx.AsyncClient() as client:
        for page_number in range(start_page_number, end_page_number, -1):
            print(f"Start fetching the post list of page {page_number} ...")
            uri = f"/bbs/Stock/index{page_number}.html"
            response = await client.get(BASE_HOST + uri, headers=HEADERS)
            if response.status_code != 200:
                print(f"Failed to fetch page {page_number}, reason: {response.text}")
                continue
            selector = Selector(text=response.text)
            all_note_elements = selector.css("div.r-ent")
            for note_element_html in all_note_elements:
                note_content: NoteContent = await parse_note_use_parsel(note_element_html.get())
                notes_list.append(note_content)
            print(f"Finished page {page_number}, got {len(all_note_elements)} posts ...")
    return notes_list


async def fetch_bbs_note_detail(note_content: NoteContent) -> NoteContentDetail:
    """
    Fetch the detail page of a post
    :param note_content:
    :return:
    """
    print(f"Start fetching the detail page {note_content.detail_link} ....")
    note_content_detail = NoteContentDetail()
    note_content_detail.title = note_content.title
    note_content_detail.author = note_content.author
    note_content_detail.detail_link = BASE_HOST + note_content.detail_link

    async with httpx.AsyncClient() as client:
        response = await client.get(note_content_detail.detail_link, headers=HEADERS)
    if response.status_code != 200:
        print(f"Failed to fetch post {note_content.title}, reason: {response.text}")
        return note_content_detail
    selector = Selector(text=response.text)
    note_content_detail.publish_datetime = \
        selector.css("#main-content > div:nth-child(4) > span.article-meta-value")[0].root.text

    # Parse the push comments
    note_content_detail.push_comment = []
    all_push_elements = selector.css("#main-content > div.push")
    for push_element in all_push_elements:
        note_push_comment = NotePushComment()
        spans = push_element.css("span")
        # A normal push element has 4 spans: push tag, user name, content, time
        if len(spans) < 4:
            continue
        note_push_comment.push_user_name = spans[1].root.text.strip()
        note_push_comment.push_cotent = spans[2].root.text.strip().replace(": ", "")
        note_push_comment.push_time = spans[3].root.text.strip()
        note_content_detail.push_comment.append(note_push_comment)
    print(note_content_detail)
    return note_content_detail


async def run_crawler(save_notes: List[NoteContentDetail]):
    # step1: get the pagination number
    previous_number = await get_previous_page_number()
    # step2: fetch the post lists of the first N pages
    note_list = await fetch_bbs_note_list(previous_number)
    # step3: fetch the detail page + push comments of each post
    for note_content in note_list:
        if not note_content.detail_link:
            continue
        note_content_detail = await fetch_bbs_note_detail(note_content)
        save_notes.append(note_content_detail)
    print("Crawling finished .......")


if __name__ == '__main__':
    import asyncio

    all_note_content_detail: List[NoteContentDetail] = []
    asyncio.run(run_crawler(all_note_content_detail))
```
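
As written, the async version still awaits each request one after another, so it behaves much like the synchronous one. If you want the detail requests to actually overlap, one possible variant (not part of the original code, and assumed to live in the same file as the async version) is to schedule them with `asyncio.gather`; in practice you would also cap the concurrency, for example with a semaphore, to stay polite to the site:

```python
import asyncio


async def run_crawler_concurrently(save_notes: List[NoteContentDetail]):
    # Hypothetical variant: fetch all detail pages concurrently instead of one by one.
    previous_number = await get_previous_page_number()
    note_list = await fetch_bbs_note_list(previous_number)
    tasks = [fetch_bbs_note_detail(note) for note in note_list if note.detail_link]
    # gather preserves the order of the input tasks
    save_notes.extend(await asyncio.gather(*tasks))
```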

### Storage Implementation
> We will leave the storage implementation for Lecture 10.


### Miscellaneous

> Before I knew it, this tutorial, which I started at 9 p.m., has carried on until 2:27 a.m., and the storage part is still unfinished. Writing the earlier posts did not feel like much work, but once I reached the hands-on ones the time spent grew a great deal. <br>
> And this is still a very, very simple crawling task; laying out the complete train of thought in words and diagrams really is not that easy... Thinking about the intermediate and advanced crawler posts still to come, I feel a bit overwhelmed; the time may be seriously short.