
Commit 18de14b

feat: 08_爬虫入门实战1_静态网页提取 (crawler intro practice 1: static page extraction), code complete
1 parent a67838d commit 18de14b

File tree

4 files changed: +357 -0 lines changed


static/images/1000000010.png (201 KB, binary image)
Lines changed: 165 additions & 0 deletions
@@ -0,0 +1,165 @@
# -*- coding: utf-8 -*-
# @Author : [email protected]
# @Time : 2024/3/27 23:50
# @Desc : https://www.ptt.cc/bbs/Stock/index.html first N pages of posts + push comments - synchronous version

from typing import List

import requests
from bs4 import BeautifulSoup

from common import NoteContent, NoteContentDetail, NotePushComment

FIRST_N_PAGE = 10  # how many forum pages to crawl
BASE_HOST = "https://www.ptt.cc"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}


def parse_note_use_bs(html_content: str) -> NoteContent:
    """
    Extract a post's title, author, publish date, and link with BeautifulSoup CSS selectors.
    Note that some posts are in an abnormal state (e.g. deleted) and carry no link,
    so always check that a selector actually matched before indexing into its result.
    :param html_content: HTML source of a single post entry
    :return: populated NoteContent
    """
    # Container for a single post
    note_content = NoteContent()
    soup = BeautifulSoup(html_content, "lxml")

    title_elements = soup.select("div.r-ent div.title a")
    author_elements = soup.select("div.r-ent div.meta div.author")
    date_elements = soup.select("div.r-ent div.meta div.date")

    # Extract the title, stripping surrounding whitespace and newlines
    note_content.title = title_elements[0].text.strip() if title_elements else ""

    # Extract the author
    note_content.author = author_elements[0].text.strip() if author_elements else ""

    # Extract the publish date
    note_content.publish_date = date_elements[0].text.strip() if date_elements else ""

    # Extract the detail-page link
    note_content.detail_link = title_elements[0]["href"] if title_elements else ""
    return note_content


def get_previous_page_number() -> int:
    """
    Open the board's index page and extract the page number behind the "previous page" button.
    :return: the previous page's number
    """
    uri = "/bbs/Stock/index.html"
    response = requests.get(url=BASE_HOST + uri, headers=HEADERS)
    if response.status_code != 200:
        raise Exception("send request got error status code, reason:", response.text)
    soup = BeautifulSoup(response.text, "lxml")

    # The easiest way to obtain this CSS selector: open Chrome DevTools (F12), pick the
    # "previous page" button, right-click it, then Copy -> Copy selector.
    css_selector = "#action-bar-container > div > div.btn-group.btn-group-paging > a:nth-child(2)"
    pagination_link = soup.select(css_selector)[0]["href"].strip()

    # pagination_link looks like /bbs/Stock/index7084.html. A regex could pull out the
    # numeric part; plain string replacement is the blunt-but-simple route taken here.
    previous_page_number = int(pagination_link.replace("/bbs/Stock/index", "").replace(".html", ""))

    return previous_page_number


def fetch_bbs_note_list(previous_number: int) -> List[NoteContent]:
    """
    Fetch the post lists of the first N pages.
    :param previous_number: page number taken from the "previous page" button
    :return: list of NoteContent
    """
    notes_list: List[NoteContent] = []

    # Work out the start and end of the pagination range. The index page itself is
    # crawled too, so the start position is the previous-page number plus one.
    start_page_number = previous_number + 1
    end_page_number = start_page_number - FIRST_N_PAGE
    for page_number in range(start_page_number, end_page_number, -1):
        print(f"Fetching the post list of page {page_number} ...")

        # Build the list URL from the page number
        uri = f"/bbs/Stock/index{page_number}.html"
        response = requests.get(url=BASE_HOST + uri, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to fetch page {page_number}, reason: {response.text}")
            continue

        # Parse with BeautifulSoup CSS selectors; div.r-ent is the css class that every
        # post entry on the list page shares.
        soup = BeautifulSoup(response.text, "lxml")
        all_note_elements = soup.select("div.r-ent")
        for note_element in all_note_elements:
            # prettify() returns the element's full HTML
            note_content: NoteContent = parse_note_use_bs(note_element.prettify())
            notes_list.append(note_content)
        print(f"Finished page {page_number}, got {len(all_note_elements)} posts ...")
    return notes_list


def fetch_bbs_note_detail(note_content: NoteContent) -> NoteContentDetail:
    """
    Fetch a post's detail page.
    :param note_content: post summary from the list page
    :return: NoteContentDetail including push comments
    """
    print(f"Fetching detail page {note_content.detail_link} ....")
    note_content_detail = NoteContentDetail()

    # These fields already exist on note_content, so copy them over instead of
    # re-extracting them from the page; laziness pays here. (Beginners should still
    # practice extracting every field themselves.)
    note_content_detail.title = note_content.title
    note_content_detail.author = note_content.author
    note_content_detail.detail_link = BASE_HOST + note_content.detail_link

    response = requests.get(url=BASE_HOST + note_content.detail_link, headers=HEADERS)
    if response.status_code != 200:
        print(f"Failed to fetch post {note_content.title}, reason: {response.text}")
        return note_content_detail

    soup = BeautifulSoup(response.text, "lxml")
    note_content_detail.publish_datetime = soup.select(
        "#main-content > div:nth-child(4) > span.article-meta-value")[0].text

    # Handle the push comments
    note_content_detail.push_comment = []
    all_push_elements = soup.select("#main-content > div.push")
    for push_element in all_push_elements:
        note_push_comment = NotePushComment()
        spans = push_element.select("span")
        # A regular push entry has four spans: tag, user, content, time
        if len(spans) < 4:
            continue

        note_push_comment.push_user_name = spans[1].text.strip()
        note_push_comment.push_content = spans[2].text.strip().replace(": ", "")
        note_push_comment.push_time = spans[3].text.strip()
        note_content_detail.push_comment.append(note_push_comment)

    print(note_content_detail)
    return note_content_detail


def run_crawler(save_notes: List[NoteContentDetail]):
    """
    Crawler entry point.
    :param save_notes: container the results are appended to
    :return:
    """
    # step 1: get the pagination number
    previous_number: int = get_previous_page_number()

    # step 2: get the post lists of the first N pages
    note_list: List[NoteContent] = fetch_bbs_note_list(previous_number)

    # step 3: get each post's detail page and push comments
    for note_content in note_list:
        if not note_content.detail_link:
            continue
        note_content_detail = fetch_bbs_note_detail(note_content)
        save_notes.append(note_content_detail)

    print("Crawl finished .......")


if __name__ == '__main__':
    all_note_content_detail: List[NoteContentDetail] = []
    run_crawler(all_note_content_detail)
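
A possible refinement, not part of this commit: the synchronous version fires requests back-to-back, which is unfriendly to ptt.cc and brittle against transient failures. Below is a minimal sketch of a paced, retrying GET, assuming the HEADERS constant above; polite_get and its parameters are hypothetical names.

# Hypothetical helper, not in this commit: paced GET with simple linear backoff.
import time
from typing import Optional

import requests


def polite_get(url: str, headers: dict, retries: int = 3, delay: float = 1.0) -> requests.Response:
    """GET with a delay between attempts; raises after the last failure."""
    last_error: Optional[Exception] = None
    for attempt in range(1, retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=10)
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            last_error = e
        time.sleep(delay * attempt)  # back off a little more after each failed attempt
    raise Exception(f"failed to fetch {url} after {retries} attempts") from last_error

Each requests.get(...) call in the crawler could then be swapped for polite_get(...) without touching any of the parsing logic.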
Lines changed: 135 additions & 0 deletions
@@ -0,0 +1,135 @@
# -*- coding: utf-8 -*-
# @Author : [email protected]
# @Time : 2024/3/27 23:50
# @Desc : https://www.ptt.cc/bbs/Stock/index.html first N pages of posts - asynchronous version

from typing import List

import httpx
from parsel import Selector

from common import NoteContent, NoteContentDetail, NotePushComment

FIRST_N_PAGE = 10  # how many forum pages to crawl
BASE_HOST = "https://www.ptt.cc"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}


async def parse_note_use_parsel(html_content: str) -> NoteContent:
    """
    Extract a post's title, author, and publish date with parsel CSS selectors.
    Note that some posts are in an abnormal state (e.g. deleted) and carry no link,
    so always check that a selector actually matched before indexing into its result.
    :param html_content: HTML source of a single post entry
    :return: populated NoteContent
    """
    note_content = NoteContent()
    selector = Selector(text=html_content)
    title_elements = selector.css("div.r-ent div.title a")
    author_elements = selector.css("div.r-ent div.meta div.author")
    date_elements = selector.css("div.r-ent div.meta div.date")

    note_content.title = title_elements[0].root.text.strip() if title_elements else ""
    note_content.author = author_elements[0].root.text.strip() if author_elements else ""
    note_content.publish_date = date_elements[0].root.text.strip() if date_elements else ""
    note_content.detail_link = title_elements[0].attrib['href'] if title_elements else ""
    return note_content


async def get_previous_page_number() -> int:
    """
    Open the board's index page and extract the page number behind the "previous page" button.
    :return: the previous page's number
    """
    uri = "/bbs/Stock/index.html"
    async with httpx.AsyncClient() as client:
        response = await client.get(BASE_HOST + uri, headers=HEADERS)
        if response.status_code != 200:
            raise Exception("send request got error status code, reason:", response.text)
        selector = Selector(text=response.text)
        css_selector = "#action-bar-container > div > div.btn-group.btn-group-paging > a:nth-child(2)"
        pagination_link = selector.css(css_selector)[0].attrib['href'].strip()
        previous_page_number = int(pagination_link.replace("/bbs/Stock/index", "").replace(".html", ""))
        return previous_page_number


async def fetch_bbs_note_list(previous_number: int) -> List[NoteContent]:
    """
    Fetch the post lists of the first N pages.
    :param previous_number: page number taken from the "previous page" button
    :return: list of NoteContent
    """
    notes_list: List[NoteContent] = []
    start_page_number = previous_number + 1
    end_page_number = start_page_number - FIRST_N_PAGE
    async with httpx.AsyncClient() as client:
        for page_number in range(start_page_number, end_page_number, -1):
            print(f"Fetching the post list of page {page_number} ...")
            uri = f"/bbs/Stock/index{page_number}.html"
            response = await client.get(BASE_HOST + uri, headers=HEADERS)
            if response.status_code != 200:
                print(f"Failed to fetch page {page_number}, reason: {response.text}")
                continue
            selector = Selector(text=response.text)
            all_note_elements = selector.css("div.r-ent")
            for note_element_html in all_note_elements:
                note_content: NoteContent = await parse_note_use_parsel(note_element_html.get())
                notes_list.append(note_content)
            print(f"Finished page {page_number}, got {len(all_note_elements)} posts ...")
    return notes_list


async def fetch_bbs_note_detail(note_content: NoteContent) -> NoteContentDetail:
    """
    Fetch a post's detail page.
    :param note_content: post summary from the list page
    :return: NoteContentDetail including push comments
    """
    print(f"Fetching detail page {note_content.detail_link} ....")
    note_content_detail = NoteContentDetail()
    note_content_detail.title = note_content.title
    note_content_detail.author = note_content.author
    note_content_detail.detail_link = BASE_HOST + note_content.detail_link

    async with httpx.AsyncClient() as client:
        response = await client.get(note_content_detail.detail_link, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to fetch post {note_content.title}, reason: {response.text}")
            return note_content_detail
        selector = Selector(text=response.text)
        note_content_detail.publish_datetime = \
            selector.css("#main-content > div:nth-child(4) > span.article-meta-value")[0].root.text

        # Parse the push comments
        note_content_detail.push_comment = []
        all_push_elements = selector.css("#main-content > div.push")
        for push_element in all_push_elements:
            note_push_comment = NotePushComment()
            spans = push_element.css("span")
            # A regular push entry has four spans: tag, user, content, time
            if len(spans) < 4:
                continue
            note_push_comment.push_user_name = spans[1].root.text.strip()
            note_push_comment.push_content = spans[2].root.text.strip().replace(": ", "")
            note_push_comment.push_time = spans[3].root.text.strip()
            note_content_detail.push_comment.append(note_push_comment)
        print(note_content_detail)
        return note_content_detail


async def run_crawler(save_notes: List[NoteContentDetail]):
    previous_number = await get_previous_page_number()
    note_list = await fetch_bbs_note_list(previous_number)
    for note_content in note_list:
        if not note_content.detail_link:
            continue
        note_content_detail = await fetch_bbs_note_detail(note_content)
        save_notes.append(note_content_detail)
    print("Crawl finished .......")


if __name__ == '__main__':
    import asyncio

    all_note_content_detail: List[NoteContentDetail] = []
    asyncio.run(run_crawler(all_note_content_detail))
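
A possible follow-up, not part of this commit: run_crawler above still awaits each detail page one at a time, so the async version gains little over the synchronous one. Below is a sketch of bounded concurrency with asyncio.gather plus a semaphore, assuming fetch_bbs_note_detail and the models from this file; fetch_details_bounded and limit are hypothetical names.

# Hypothetical sketch, not in this commit: fetch detail pages concurrently,
# capping the number of in-flight requests so ptt.cc is not hammered.
import asyncio
from typing import List


async def fetch_details_bounded(note_list: List[NoteContent], limit: int = 5) -> List[NoteContentDetail]:
    semaphore = asyncio.Semaphore(limit)  # at most `limit` requests in flight

    async def fetch_one(note_content: NoteContent) -> NoteContentDetail:
        async with semaphore:
            return await fetch_bbs_note_detail(note_content)

    tasks = [fetch_one(n) for n in note_list if n.detail_link]
    return await asyncio.gather(*tasks)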
Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-
# @Author : [email protected]
# @Time : 2024/3/28 01:09
# @Desc : shared model code

from typing import List


class NoteContent:
    """
    Container for a post summary
    """
    title: str = ""  # post title
    author: str = ""  # post author
    publish_date: str = ""  # publish date
    detail_link: str = ""  # link to the post's detail page

    def __str__(self):
        return f"""
Title: {self.title}
User: {self.author}
Publish Date: {self.publish_date}
Detail Link: {self.detail_link}
"""


class NotePushComment:
    """
    Container for a single push comment
    """
    push_user_name: str = ""  # user who pushed
    push_content: str = ""  # push comment text
    push_time: str = ""  # push time

    def __repr__(self):
        # __repr__ rather than __str__ so the push_comment list inside
        # NoteContentDetail prints readably
        return f"NotePushComment(push_user_name='{self.push_user_name}', push_content='{self.push_content}', push_time='{self.push_time}')"


class NoteContentDetail:
    """
    Container for a full post
    """
    title: str = ""  # post title
    author: str = ""  # post author
    publish_datetime: str = ""  # publish datetime
    detail_link: str = ""  # link to the post's detail page
    push_comment: List[NotePushComment] = []  # push comments, PTT's rough equivalent of a comment section

    def __str__(self):
        return f"""
Title: {self.title}
User: {self.author}
Publish Datetime: {self.publish_datetime}
Detail Link: {self.detail_link}
Push Comments: {self.push_comment}
"""
