14 | 14 | import os |
15 | 15 | import random |
16 | 16 | from asyncio import Task |
17 | | -from typing import Dict, List, Optional, Tuple |
| 17 | +from typing import Dict, List, Optional, Tuple, cast |
18 | 18 |
19 | 19 | from playwright.async_api import (BrowserContext, BrowserType, Page, |
20 | 20 | async_playwright) |
21 | 21 |
22 | 22 | import config |
| 23 | +from constant import zhihu as constant |
23 | 24 | from base.base_crawler import AbstractCrawler |
24 | 25 | from model.m_zhihu import ZhihuContent, ZhihuCreator |
25 | 26 | from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool |
29 | 30 |
30 | 31 | from .client import ZhiHuClient |
31 | 32 | from .exception import DataFetchError |
32 | | -from .help import ZhihuExtractor |
| 33 | +from .help import ZhihuExtractor, judge_zhihu_url |
33 | 34 | from .login import ZhiHuLogin |
34 | 35 |
35 | 36 |
@@ -96,7 +97,7 @@ async def start(self) -> None: |
96 | 97 | await self.search() |
97 | 98 | elif config.CRAWLER_TYPE == "detail": |
98 | 99 | # Get the information and comments of the specified post |
99 | | - raise NotImplementedError |
| 100 | + await self.get_specified_notes() |
100 | 101 | elif config.CRAWLER_TYPE == "creator": |
101 | 102 | # Get creator's information and their notes and comments |
102 | 103 | await self.get_creators_and_notes() |
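With this change, running the crawler with CRAWLER_TYPE = "detail" no longer raises NotImplementedError; start() now calls get_specified_notes(), which walks config.ZHIHU_SPECIFIED_ID_LIST. A minimal config sketch for the new mode is below; the option names are the ones referenced in this diff, but the values and example URLs are hypothetical placeholders, not values from the repository:

# config.py -- illustrative settings for the new "detail" mode (example values only)
CRAWLER_TYPE = "detail"          # "search" | "detail" | "creator", as dispatched in start()
MAX_CONCURRENCY_NUM = 4          # intended cap on concurrent detail requests
ZHIHU_SPECIFIED_ID_LIST = [
    "https://www.zhihu.com/question/123456789/answer/987654321",  # answer url
    "https://zhuanlan.zhihu.com/p/123456789",                     # column article url
    "https://www.zhihu.com/zvideo/1234567890123456789",           # zvideo url
]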
@@ -226,6 +227,76 @@ async def get_creators_and_notes(self) -> None: |
226 | 227 | # Get all comments of the creator's contents |
227 | 228 | await self.batch_get_content_comments(all_content_list) |
228 | 229 |
| 230 | + async def get_note_detail( |
| 231 | + self, full_note_url: str, semaphore: asyncio.Semaphore |
| 232 | + ) -> Optional[ZhihuContent]: |
| 233 | + """ |
| 234 | + Get note detail |
| 235 | + Args: |
| 236 | +            full_note_url: full url of the note (answer / article / zvideo) |
| 237 | +            semaphore: asyncio.Semaphore used to limit concurrent requests |
| 238 | +
| 239 | +        Returns: |
| 240 | +            Optional[ZhihuContent]: the parsed note detail, or None if the url type is unsupported |
| 241 | + """ |
| 242 | + async with semaphore: |
| 243 | + utils.logger.info( |
| 244 | + f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}" |
| 245 | + ) |
| 246 | + # judge note type |
| 247 | + note_type: str = judge_zhihu_url(full_note_url) |
| 248 | + if note_type == constant.ANSWER_NAME: |
| 249 | + question_id = full_note_url.split("/")[-3] |
| 250 | + answer_id = full_note_url.split("/")[-1] |
| 251 | + utils.logger.info( |
| 252 | + f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}" |
| 253 | + ) |
| 254 | + return await self.zhihu_client.get_answer_info(question_id, answer_id) |
| 255 | + |
| 256 | + elif note_type == constant.ARTICLE_NAME: |
| 257 | + article_id = full_note_url.split("/")[-1] |
| 258 | + utils.logger.info( |
| 259 | + f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}" |
| 260 | + ) |
| 261 | + return await self.zhihu_client.get_article_info(article_id) |
| 262 | + |
| 263 | + elif note_type == constant.VIDEO_NAME: |
| 264 | + video_id = full_note_url.split("/")[-1] |
| 265 | + utils.logger.info( |
| 266 | + f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}" |
| 267 | + ) |
| 268 | + return await self.zhihu_client.get_video_info(video_id) |
| 269 | + |
| 270 | + async def get_specified_notes(self): |
| 271 | + """ |
| 272 | +        Get the information and comments of the specified posts in config.ZHIHU_SPECIFIED_ID_LIST |
| 273 | +        Returns: |
| 274 | +            None |
| 275 | + """ |
| 276 | + get_note_detail_task_list = [] |
| 277 | + for full_note_url in config.ZHIHU_SPECIFIED_ID_LIST: |
| 278 | + # remove query params |
| 279 | + full_note_url = full_note_url.split("?")[0] |
| 280 | + crawler_task = self.get_note_detail( |
| 281 | + full_note_url=full_note_url, |
| 282 | + semaphore=asyncio.Semaphore(config.MAX_CONCURRENCY_NUM), |
| 283 | + ) |
| 284 | + get_note_detail_task_list.append(crawler_task) |
| 285 | + |
| 286 | + need_get_comment_notes: List[ZhihuContent] = [] |
| 287 | + note_details = await asyncio.gather(*get_note_detail_task_list) |
| 288 | + for index, note_detail in enumerate(note_details): |
| 289 | + if not note_detail: |
| 290 | + utils.logger.info( |
| 291 | + f"[ZhihuCrawler.get_specified_notes] Note {config.ZHIHU_SPECIFIED_ID_LIST[index]} not found" |
| 292 | + ) |
| 293 | + continue |
| 294 | + |
| 295 | + note_detail = cast(ZhihuContent, note_detail) # only for type check |
| 296 | + need_get_comment_notes.append(note_detail) |
| 297 | + await zhihu_store.update_zhihu_content(note_detail) |
| 298 | + |
| 299 | + await self.batch_get_content_comments(need_get_comment_notes) |
229 | 300 |
230 | 301 | @staticmethod |
231 | 302 | def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]: |
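For context, get_note_detail dispatches on judge_zhihu_url imported from .help, whose implementation is not part of this diff. A plausible sketch, inferred only from how its return value is compared against the constant.zhihu names above; the URL patterns here are assumptions, not the repository's actual code:

# Hypothetical sketch of judge_zhihu_url; the real helper behind
# "from .help import judge_zhihu_url" may use different patterns.
from constant import zhihu as constant

def judge_zhihu_url(note_url: str) -> str:
    # /question/<question_id>/answer/<answer_id>  -> answer
    if "/answer/" in note_url:
        return constant.ANSWER_NAME
    # zhuanlan.zhihu.com/p/<article_id>  -> column article
    if "zhuanlan.zhihu.com" in note_url:
        return constant.ARTICLE_NAME
    # /zvideo/<video_id>  -> video
    if "/zvideo/" in note_url:
        return constant.VIDEO_NAME
    # unknown url type: no branch in get_note_detail matches and it returns None
    return ""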
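One design note on get_specified_notes: each task receives a freshly constructed asyncio.Semaphore(config.MAX_CONCURRENCY_NUM), and a semaphore only throttles coroutines that share the same instance, so as written the limit is never actually enforced. A minimal sketch of the shared-semaphore variant, using only names that already appear in the diff (a suggestion, not what this commit does):

# Sketch: share one semaphore across all detail tasks so that at most
# MAX_CONCURRENCY_NUM requests run at the same time.
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
get_note_detail_task_list = [
    self.get_note_detail(full_note_url=url.split("?")[0], semaphore=semaphore)
    for url in config.ZHIHU_SPECIFIED_ID_LIST
]
note_details = await asyncio.gather(*get_note_detail_task_list)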