# -*- coding: utf-8 -*-

# @Time : 2024/3/27 23:50
# @Desc : Fetch post data + push comments for the first N pages of https://www.ptt.cc/bbs/Stock/index.html - synchronous version

from typing import List

import requests
from bs4 import BeautifulSoup

from common import NoteContent, NoteContentDetail, NotePushComment

FIRST_N_PAGE = 10  # number of board pages to fetch, starting from the newest page
BASE_HOST = "https://www.ptt.cc"
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}


def parse_note_use_bs(html_content: str) -> NoteContent:
    """
    Extract the post title, author, publish date and detail link using BeautifulSoup CSS selectors.
    Note that some posts are in an abnormal state (e.g. deleted) and may have no title link,
    so always check how many elements a selector returned before indexing into the result.
    :param html_content: HTML source of a single post entry
    :return: a populated NoteContent
    """
    # Container object for one post
    note_content = NoteContent()

    soup = BeautifulSoup(html_content, "lxml")

    # Title and detail link (strip surrounding whitespace and newlines)
    title_elements = soup.select("div.r-ent div.title a")
    note_content.title = title_elements[0].text.strip() if title_elements else ""
    note_content.detail_link = title_elements[0]["href"] if title_elements else ""

    # Author
    author_elements = soup.select("div.r-ent div.meta div.author")
    note_content.author = author_elements[0].text.strip() if author_elements else ""

    # Publish date
    date_elements = soup.select("div.r-ent div.meta div.date")
    note_content.publish_date = date_elements[0].text.strip() if date_elements else ""

    return note_content


def get_previous_page_number() -> int:
    """
    Open the board index page and extract the page number of the "previous page" link.
    :return: the page number found in the pagination link
    """
    uri = "/bbs/Stock/index.html"
    response = requests.get(url=BASE_HOST + uri, headers=HEADERS)
    if response.status_code != 200:
        raise Exception("send request got error status code, reason:", response.text)
    soup = BeautifulSoup(response.text, "lxml")

    # The easiest way to get this CSS selector is Chrome DevTools: open the F12 console,
    # select the "上頁" (previous page) button, then right-click it and choose Copy -> Copy selector.
    css_selector = "#action-bar-container > div > div.btn-group.btn-group-paging > a:nth-child(2)"
    pagination_link = soup.select(css_selector)[0]["href"].strip()

    # pagination_link looks like /bbs/Stock/index7084.html; extract the numeric part.
    # A regular expression would also work, but a blunt string replacement is enough here.
    previous_page_number = int(pagination_link.replace("/bbs/Stock/index", "").replace(".html", ""))
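    # For reference, the regex variant mentioned above would look roughly like this
    # (a sketch, assuming the link always ends in "index<number>.html"):
    #   import re
    #   previous_page_number = int(re.search(r"index(\d+)\.html", pagination_link).group(1))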

    return previous_page_number


def fetch_bbs_note_list(previous_number: int) -> List[NoteContent]:
    """
    Fetch the post lists of the first N pages of the board.
    :param previous_number: page number extracted from the "previous page" link on the index page
    :return: list of NoteContent parsed from the list pages
    """
    notes_list: List[NoteContent] = []

    # Work out the start and end page numbers. The index page itself must be crawled too,
    # so the start position is the "previous page" number plus one.
    start_page_number = previous_number + 1
    end_page_number = start_page_number - FIRST_N_PAGE
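    # For example, if the "previous page" number is 7084 and FIRST_N_PAGE is 10, the loop
    # below walks pages 7085, 7084, ..., 7076 (newest first).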
    for page_number in range(start_page_number, end_page_number, -1):
        print(f"Start fetching the post list of page {page_number} ...")

        # Build the list-page URL from the page number
        uri = f"/bbs/Stock/index{page_number}.html"
        response = requests.get(url=BASE_HOST + uri, headers=HEADERS)
        if response.status_code != 200:
            print(f"Failed to fetch page {page_number}, reason: {response.text}")
            continue

        # Parse the page with BeautifulSoup CSS selectors; div.r-ent is the CSS class
        # shared by every post entry on the list page.
        soup = BeautifulSoup(response.text, "lxml")
        all_note_elements = soup.select("div.r-ent")
        for note_element in all_note_elements:
            # prettify() returns the full HTML of the div element
            note_content: NoteContent = parse_note_use_bs(note_element.prettify())
            notes_list.append(note_content)
        print(f"Finished fetching page {page_number}, got {len(all_note_elements)} posts ...")
    return notes_list


def fetch_bbs_note_detail(note_content: NoteContent) -> NoteContentDetail:
    """
    Fetch the detail page of a post.
    :param note_content: the post parsed from the list page
    :return: a populated NoteContentDetail
    """
    print(f"Start fetching the detail page of post {note_content.detail_link} ...")
    note_content_detail = NoteContentDetail()

    # Values already present on note_content can be copied over directly instead of being
    # re-extracted from the detail page (beginners may still want to extract everything again).
    note_content_detail.title = note_content.title
    note_content_detail.author = note_content.author
    note_content_detail.detail_link = BASE_HOST + note_content.detail_link

    response = requests.get(url=BASE_HOST + note_content.detail_link, headers=HEADERS)
    if response.status_code != 200:
        print(f"Failed to fetch post {note_content.title}, reason: {response.text}")
        return note_content_detail

    soup = BeautifulSoup(response.text, "lxml")
    # The fourth metadata line of #main-content holds the publish datetime; guard against
    # posts whose metadata block is missing.
    datetime_elements = soup.select("#main-content > div:nth-child(4) > span.article-meta-value")
    note_content_detail.publish_datetime = datetime_elements[0].text if datetime_elements else ""

    # Parse the push comments
    note_content_detail.push_comment = []
    all_push_elements = soup.select("#main-content > div.push")
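    # Each div.push normally contains four spans in order: push tag (推/噓/→), user id,
    # comment content, and ip/datetime, which is why indexes 1-3 are used below.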
    for push_element in all_push_elements:
        note_push_comment = NotePushComment()
        # Skip malformed push rows that do not have all four spans
        if len(push_element.select("span")) < 4:
            continue

        note_push_comment.push_user_name = push_element.select("span")[1].text.strip()
        note_push_comment.push_cotent = push_element.select("span")[2].text.strip().replace(": ", "")
        note_push_comment.push_time = push_element.select("span")[3].text.strip()
        note_content_detail.push_comment.append(note_push_comment)

    print(note_content_detail)
    return note_content_detail


def run_crawler(save_notes: List[NoteContentDetail]):
    """
    Crawler entry point.
    :param save_notes: container that collects the crawled post details
    :return:
    """
    # Step 1: get the page number of the "previous page" link
    previous_number: int = get_previous_page_number()

    # Step 2: fetch the post lists of the first N pages
    note_list: List[NoteContent] = fetch_bbs_note_list(previous_number)

    # Step 3: fetch the detail page + push comments of every post
    for note_content in note_list:
        if not note_content.detail_link:
            continue
        note_content_detail = fetch_bbs_note_detail(note_content)
        save_notes.append(note_content_detail)

    print("Crawling finished .......")


if __name__ == '__main__':
    all_note_content_detail: List[NoteContentDetail] = []
    run_crawler(all_note_content_detail)
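    # One possible way to use the collected results afterwards (not part of the original
    # script; assumes NoteContentDetail and NotePushComment are plain classes whose
    # attributes live in __dict__, so vars() can serialize them):
    #   import json
    #   with open("ptt_stock_notes.json", "w", encoding="utf-8") as f:
    #       json.dump([vars(n) for n in all_note_content_detail], f,
    #                 default=vars, ensure_ascii=False, indent=2)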