Skip to content

Commit ea5223c

Browse files
committed
feat: 知乎支持详情模式
1 parent dc9116e commit ea5223c

File tree

6 files changed

+239
-17
lines changed

6 files changed

+239
-17
lines changed

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
| B 站 ||||||||
3333
| 微博 ||||||||
3434
| 贴吧 ||||||||
35-
| 知乎 || ||||||
35+
| 知乎 || ||||||
3636

3737
### MediaCrawlerPro重磅发布啦!!!
3838
> 主打学习成熟项目的架构设计,不仅仅是爬虫,Pro中的其他代码设计思路也是值得学习,欢迎大家关注!!!
@@ -111,7 +111,9 @@
111111
> [MediaCrawler在线文档](https://nanmicoder.github.io/MediaCrawler/)
112112
>
113113
114-
# 知识付费服务
114+
# 作者提供的知识服务
115+
> 如果想快速入门并学习该项目的使用方法、源码架构设计,学习编程技术,或者想了解 MediaCrawlerPro 的源代码设计,可以看下我的知识付费栏目。
116+
115117
[作者的知识付费栏目介绍](https://nanmicoder.github.io/MediaCrawler/%E7%9F%A5%E8%AF%86%E4%BB%98%E8%B4%B9%E4%BB%8B%E7%BB%8D.html)
116118

117119
# 项目微信交流群

config/base_config.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,13 @@
162162
# ........................
163163
]
164164

165+
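# Zhihu posts to crawl in "detail" mode. Despite the *_ID_LIST name, each entry is a full URL (answer / article / video), not a bare ID.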
166+
ZHIHU_SPECIFIED_ID_LIST = [
167+
"https://www.zhihu.com/question/826896610/answer/4885821440", # 回答
168+
"https://zhuanlan.zhihu.com/p/673461588", # 文章
169+
"https://www.zhihu.com/zvideo/1539542068422144000" # 视频
170+
]
171+
165172
# 词云相关
166173
# 是否开启生成评论词云图
167174
ENABLE_GET_WORDCLOUD = False

constant/zhihu.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,9 @@
1111

1212
# -*- coding: utf-8 -*-
1313
ZHIHU_URL = "https://www.zhihu.com"
14+
ZHIHU_ZHUANLAN_URL = "https://zhuanlan.zhihu.com"
1415

1516
ANSWER_NAME = "answer"
1617
ARTICLE_NAME = "article"
17-
VIDEO_NAME = "zvideo"
18+
VIDEO_NAME = "zvideo"
19+

media_platform/zhihu/client.py

Lines changed: 59 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,12 @@ async def get(self, uri: str, params=None, **kwargs) -> Union[Response, Dict, st
121121
if isinstance(params, dict):
122122
final_uri += '?' + urlencode(params)
123123
headers = await self._pre_headers(final_uri)
124-
return await self.request(method="GET", url=zhihu_constant.ZHIHU_URL + final_uri, headers=headers, **kwargs)
124+
base_url = (
125+
zhihu_constant.ZHIHU_URL
126+
if "/p/" not in uri
127+
else zhihu_constant.ZHIHU_ZHUANLAN_URL
128+
)
129+
return await self.request(method="GET", url=base_url + final_uri, headers=headers, **kwargs)
125130

126131
async def pong(self) -> bool:
127132
"""
@@ -209,7 +214,7 @@ async def get_note_by_keyword(
209214
return self._extractor.extract_contents_from_search(search_res)
210215

211216
async def get_root_comments(self, content_id: str, content_type: str, offset: str = "", limit: int = 10,
212-
order_by: str = "sort") -> Dict:
217+
order_by: str = "score") -> Dict:
213218
"""
214219
获取内容的一级评论
215220
Args:
@@ -222,13 +227,16 @@ async def get_root_comments(self, content_id: str, content_type: str, offset: st
222227
Returns:
223228
224229
"""
225-
uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
226-
params = {
227-
"order": order_by,
228-
"offset": offset,
229-
"limit": limit
230-
}
230+
uri = f"/api/v4/comment_v5/{content_type}s/{content_id}/root_comment"
231+
params = {"order": order_by, "offset": offset, "limit": limit}
231232
return await self.get(uri, params)
233+
# uri = f"/api/v4/{content_type}s/{content_id}/root_comments"
234+
# params = {
235+
# "order": order_by,
236+
# "offset": offset,
237+
# "limit": limit
238+
# }
239+
# return await self.get(uri, params)
232240

233241
async def get_child_comments(self, root_comment_id: str, offset: str = "", limit: int = 10,
234242
order_by: str = "sort") -> Dict:
@@ -496,3 +504,46 @@ async def get_all_videos_by_creator(self, creator: ZhihuCreator, crawl_interval:
496504
offset += limit
497505
await asyncio.sleep(crawl_interval)
498506
return all_contents
507+
508+
509+
async def get_answer_info(
    self, question_id: str, answer_id: str
) -> Optional[ZhihuContent]:
    """
    Fetch one answer page and hand it to the extractor.

    Args:
        question_id: id of the question the answer belongs to
        answer_id: id of the answer itself

    Returns:
        The result of ``extract_answer_content_from_html`` for the fetched
        page (annotated as an optional ZhihuContent).
    """
    answer_page = await self.get(
        f"/question/{question_id}/answer/{answer_id}",
        return_response=True,
    )
    return self._extractor.extract_answer_content_from_html(answer_page)
524+
525+
async def get_article_info(self, article_id: str) -> Optional[ZhihuContent]:
    """
    Fetch one article page and hand it to the extractor.

    Args:
        article_id: id of the article (the last segment of a ``/p/<id>`` URL)

    Returns:
        The result of ``extract_article_content_from_html`` for the fetched
        page (annotated as an optional ZhihuContent).
    """
    article_page = await self.get(f"/p/{article_id}", return_response=True)
    return self._extractor.extract_article_content_from_html(article_page)
537+
538+
async def get_video_info(self, video_id: str) -> Optional[ZhihuContent]:
    """
    Fetch one zvideo page and hand it to the extractor.

    Args:
        video_id: id of the video (the last segment of a ``/zvideo/<id>`` URL)

    Returns:
        The result of ``extract_zvideo_content_from_html`` for the fetched
        page (annotated as an optional ZhihuContent).
    """
    video_page = await self.get(f"/zvideo/{video_id}", return_response=True)
    return self._extractor.extract_zvideo_content_from_html(video_page)

media_platform/zhihu/core.py

Lines changed: 74 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,12 +14,13 @@
1414
import os
1515
import random
1616
from asyncio import Task
17-
from typing import Dict, List, Optional, Tuple
17+
from typing import Dict, List, Optional, Tuple, cast
1818

1919
from playwright.async_api import (BrowserContext, BrowserType, Page,
2020
async_playwright)
2121

2222
import config
23+
from constant import zhihu as constant
2324
from base.base_crawler import AbstractCrawler
2425
from model.m_zhihu import ZhihuContent, ZhihuCreator
2526
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
@@ -29,7 +30,7 @@
2930

3031
from .client import ZhiHuClient
3132
from .exception import DataFetchError
32-
from .help import ZhihuExtractor
33+
from .help import ZhihuExtractor, judge_zhihu_url
3334
from .login import ZhiHuLogin
3435

3536

@@ -96,7 +97,7 @@ async def start(self) -> None:
9697
await self.search()
9798
elif config.CRAWLER_TYPE == "detail":
9899
# Get the information and comments of the specified post
99-
raise NotImplementedError
100+
await self.get_specified_notes()
100101
elif config.CRAWLER_TYPE == "creator":
101102
# Get creator's information and their notes and comments
102103
await self.get_creators_and_notes()
@@ -226,6 +227,76 @@ async def get_creators_and_notes(self) -> None:
226227
# Get all comments of the creator's contents
227228
await self.batch_get_content_comments(all_content_list)
228229

230+
async def get_note_detail(
    self, full_note_url: str, semaphore: asyncio.Semaphore
) -> Optional[ZhihuContent]:
    """
    Fetch the detail of a single Zhihu post identified by its full URL.

    The URL type (answer / article / video) is decided by ``judge_zhihu_url``
    and the ids are taken from the URL's path segments.

    Args:
        full_note_url: full URL of the post
        semaphore: semaphore held for the duration of the fetch

    Returns:
        The content returned by the matching client call, or None when the
        URL is not one of the three supported types.
    """
    async with semaphore:
        utils.logger.info(
            f"[ZhihuCrawler.get_specified_notes] Begin get specified note {full_note_url}"
        )
        # judge note type
        note_type: str = judge_zhihu_url(full_note_url)

        # Strip a trailing "/" first: otherwise the last segment is "" and
        # every negative index below is shifted by one.
        url_parts = full_note_url.rstrip("/").split("/")

        if note_type == constant.ANSWER_NAME:
            # .../question/<question_id>/answer/<answer_id>
            question_id = url_parts[-3]
            answer_id = url_parts[-1]
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Get answer info, question_id: {question_id}, answer_id: {answer_id}"
            )
            return await self.zhihu_client.get_answer_info(question_id, answer_id)

        if note_type == constant.ARTICLE_NAME:
            article_id = url_parts[-1]
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Get article info, article_id: {article_id}"
            )
            return await self.zhihu_client.get_article_info(article_id)

        if note_type == constant.VIDEO_NAME:
            video_id = url_parts[-1]
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Get video info, video_id: {video_id}"
            )
            return await self.zhihu_client.get_video_info(video_id)

        # Unknown URL shape: say so explicitly instead of silently falling
        # off the end of the function.
        utils.logger.warning(
            f"[ZhihuCrawler.get_note_detail] Unsupported zhihu url: {full_note_url}"
        )
        return None
269+
270+
async def get_specified_notes(self):
    """
    Crawl every post listed in ``config.ZHIHU_SPECIFIED_ID_LIST`` together
    with its comments.

    Each configured URL is fetched through ``get_note_detail``; every post
    that comes back non-empty is stored via
    ``zhihu_store.update_zhihu_content`` and then passed to
    ``batch_get_content_comments``.

    Returns:
        None
    """
    # One semaphore shared by all tasks. Creating a fresh semaphore per task
    # means no task ever waits, so MAX_CONCURRENCY_NUM would not cap anything.
    semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
    get_note_detail_task_list = [
        self.get_note_detail(
            full_note_url=full_note_url.split("?")[0],  # remove query params
            semaphore=semaphore,
        )
        for full_note_url in config.ZHIHU_SPECIFIED_ID_LIST
    ]
    # gather() returns results in task order, so they line up with the config list.
    note_details = await asyncio.gather(*get_note_detail_task_list)

    need_get_comment_notes: List[ZhihuContent] = []
    for configured_url, note_detail in zip(config.ZHIHU_SPECIFIED_ID_LIST, note_details):
        if not note_detail:
            utils.logger.info(
                f"[ZhihuCrawler.get_specified_notes] Note {configured_url} not found"
            )
            continue

        note_detail = cast(ZhihuContent, note_detail)  # only for type check
        need_get_comment_notes.append(note_detail)
        await zhihu_store.update_zhihu_content(note_detail)

    await self.batch_get_content_comments(need_get_comment_notes)
229300

230301
@staticmethod
231302
def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:

media_platform/zhihu/help.py

Lines changed: 92 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -159,15 +159,13 @@ def _extract_zvideo_content(self, zvideo: Dict) -> ZhihuContent:
159159
res = ZhihuContent()
160160

161161
if "video" in zvideo and isinstance(zvideo.get("video"), dict): # 说明是从创作者主页的视频列表接口来的
162-
res.content_id = zvideo.get("video").get("video_id")
163162
res.content_url = f"{zhihu_constant.ZHIHU_URL}/zvideo/{res.content_id}"
164163
res.created_time = zvideo.get("published_at")
165164
res.updated_time = zvideo.get("updated_at")
166165
else:
167-
res.content_id = zvideo.get("zvideo_id")
168166
res.content_url = zvideo.get("video_url")
169167
res.created_time = zvideo.get("created_at")
170-
168+
res.content_id = zvideo.get("id")
171169
res.content_type = zvideo.get("type")
172170
res.title = extract_text_from_html(zvideo.get("title"))
173171
res.desc = extract_text_from_html(zvideo.get("description"))
@@ -369,3 +367,94 @@ def extract_content_list_from_creator(self, anwser_list: List[Dict]) -> List[Zhi
369367
return []
370368

371369
return self._extract_content_list(anwser_list)
370+
371+
372+
373+
374+
def extract_answer_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Build a ZhihuContent from the JSON embedded in an answer page.

    Args:
        html_content: HTML of the answer page

    Returns:
        The parsed answer, or None when the page has no ``js-initialData``
        script or that data holds no answers.
    """
    init_script: str = (
        Selector(text=html_content)
        .xpath("//script[@id='js-initialData']/text()")
        .get(default="")
    )
    if not init_script:
        return None

    page_state: Dict = json.loads(init_script)
    answers: Dict = page_state.get("initialState", {}).get("entities", {}).get("answers", {})
    if not answers:
        return None

    # Only the first entry of the "answers" mapping is used.
    first_answer = next(iter(answers.values()))
    return self._extract_answer_content(first_answer)
392+
393+
def extract_article_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Build a ZhihuContent from the JSON embedded in an article page.

    Args:
        html_content: HTML of the article page

    Returns:
        The parsed article, or None when the page has no ``js-initialData``
        script or that data holds no articles.
    """
    init_script: str = (
        Selector(text=html_content)
        .xpath("//script[@id='js-initialData']/text()")
        .get(default="")
    )
    if not init_script:
        return None

    page_state: Dict = json.loads(init_script)
    articles: Dict = page_state.get("initialState", {}).get("entities", {}).get("articles", {})
    if not articles:
        return None

    # Only the first entry of the "articles" mapping is used.
    first_article = next(iter(articles.values()))
    return self._extract_article_content(first_article)
411+
412+
def extract_zvideo_content_from_html(self, html_content: str) -> Optional[ZhihuContent]:
    """
    Build a ZhihuContent from the JSON embedded in a zvideo page.

    Args:
        html_content: HTML of the zvideo page

    Returns:
        The parsed video, or None when the page has no ``js-initialData``
        script, no zvideos, or an empty first zvideo entry.
    """
    init_script: str = (
        Selector(text=html_content)
        .xpath("//script[@id='js-initialData']/text()")
        .get(default="")
    )
    if not init_script:
        return None

    page_state: Dict = json.loads(init_script)
    zvideos: Dict = page_state.get("initialState", {}).get("entities", {}).get("zvideos", {})
    users: Dict = page_state.get("initialState", {}).get("entities", {}).get("users", {})
    if not zvideos:
        return None

    # Only the first entry of the "zvideos" mapping is used.
    video_detail: Dict = next(iter(zvideos.values()))
    if not video_detail:
        return None

    # When "author" is a plain string, use it as a key into the "users"
    # mapping and substitute whatever is stored there.
    author_ref = video_detail.get("author")
    if isinstance(author_ref, str):
        video_detail["author"] = users.get(author_ref)

    return self._extract_zvideo_content(video_detail)
439+
440+
441+
def judge_zhihu_url(note_detail_url: str) -> str:
    """
    Classify a Zhihu detail URL as answer, article or video.

    Only the part of the URL before any query string or fragment is
    inspected, so a marker such as ``/p/`` appearing inside ``?...`` or
    ``#...`` does not cause a misclassification.

    Args:
        note_detail_url: URL of the post, e.g.
            eg1: https://www.zhihu.com/question/123456789/answer/123456789  # answer
            eg2: https://zhuanlan.zhihu.com/p/123456789                     # article
            eg3: https://www.zhihu.com/zvideo/123456789                     # zvideo

    Returns:
        ``zhihu_constant.ANSWER_NAME``, ``zhihu_constant.ARTICLE_NAME`` or
        ``zhihu_constant.VIDEO_NAME``; an empty string when none matches.
    """
    # Cut off "?query" and "#fragment" before looking for the path markers.
    url_without_query = note_detail_url.split("?", 1)[0].split("#", 1)[0]

    if "/answer/" in url_without_query:
        return zhihu_constant.ANSWER_NAME
    if "/p/" in url_without_query:
        return zhihu_constant.ARTICLE_NAME
    if "/zvideo/" in url_without_query:
        return zhihu_constant.VIDEO_NAME
    return ""

0 commit comments

Comments
 (0)