Skip to content

Commit a966121

Browse files
feat(build-release): 对下载的 PDF 注入目录 (#54)
1 parent 4755e2b commit a966121

File tree

2 files changed

+156
-13
lines changed

2 files changed

+156
-13
lines changed

requirements.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@ psutil
22
pywin32; sys_platform == "win32"
33
requests
44
pyperclip
5+
pypdf

src/tchMaterial-parser.pyw

Lines changed: 155 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,13 +11,14 @@ import os, sys, platform
1111
from functools import partial
1212
import threading, psutil, tempfile, pyperclip
1313
import base64, json, re, requests
14+
from pypdf import PdfReader, PdfWriter
1415

1516
os_name = platform.system() # 获取操作系统类型
1617

1718
if os_name == "Windows": # 在 Windows 操作系统下,导入 Windows 相关库
1819
import win32print, win32gui, win32con, win32api, ctypes, winreg
1920

20-
def parse(url: str) -> tuple[str, str, str] | tuple[None, None, None]: # 解析 URL
21+
def parse(url: str) -> tuple[str, str, str, list] | tuple[None, None, None, None]:
2122
try:
2223
content_id, content_type, resource_url = None, None, None
2324

@@ -27,7 +28,7 @@ def parse(url: str) -> tuple[str, str, str] | tuple[None, None, None]: # 解析
2728
content_id = q.split("=")[1]
2829
break
2930
if not content_id:
30-
return None, None, None
31+
return None, None, None, None
3132

3233
for q in url[url.find("?") + 1:].split("&"):
3334
if q.split("=")[0] == "contentType":
@@ -66,8 +67,96 @@ def parse(url: str) -> tuple[str, str, str] | tuple[None, None, None]: # 解析
6667
response = session.get(f"https://s-file-1.ykt.cbern.com.cn/zxx/ndrs/special_edu/resources/details/{content_id}.json")
6768
else: # 对普通电子课本的解析
6869
response = session.get(f"https://s-file-1.ykt.cbern.com.cn/zxx/ndrv2/resources/tch_material/details/{content_id}.json")
69-
70+
7071
data = response.json()
72+
title = data.get("title", "未知教材")
73+
74+
# 3. 获取章节目录 (核心修改部分)
75+
chapters = data.get("chapters", [])
76+
77+
# 如果主接口没目录,尝试通过 ebook_mapping + tree 接口组合获取
78+
if not chapters:
79+
mapping_url = None
80+
for item in data.get("ti_items", []):
81+
if item.get("ti_file_flag") == "ebook_mapping":
82+
mapping_url = item["ti_storages"][0]
83+
break
84+
85+
if mapping_url:
86+
try:
87+
if not access_token:
88+
mapping_url = re.sub(
89+
r"^https?://(?:.+).ykt.cbern.com.cn/(.+)/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}).pkg/(.+)$",
90+
r"https://c1.ykt.cbern.com.cn/\1/\2.pkg/\3",
91+
mapping_url
92+
)
93+
94+
# A. 下载 mapping 文件获取页码和 ebook_id
95+
map_resp = session.get(mapping_url)
96+
map_resp.encoding = 'utf-8'
97+
map_data = map_resp.json()
98+
99+
ebook_id = map_data.get("ebook_id")
100+
101+
# 构建 nodeId 到 pageNumber 的映射字典
102+
# 格式: { "node_id_1": 5, "node_id_2": 10 }
103+
page_map = {}
104+
if "mappings" in map_data:
105+
for m in map_data["mappings"]:
106+
page_map[m["node_id"]] = m.get("page_number", 1)
107+
108+
# B. 如果有 ebook_id,去下载完整的目录树 (Tree API)
109+
if ebook_id:
110+
tree_url = f"https://s-file-1.ykt.cbern.com.cn/zxx/ndrv2/national_lesson/trees/{ebook_id}.json"
111+
tree_resp = session.get(tree_url)
112+
113+
if tree_resp.status_code == 200:
114+
tree_data = tree_resp.json()
115+
116+
# 递归函数:合并 Tree的标题 和 Mapping的页码
117+
def process_tree_nodes(nodes):
118+
result = []
119+
for node in nodes:
120+
# 从 page_map 中找页码,找不到为none
121+
page_num = page_map.get(node["id"], None)
122+
123+
chapter_item = {
124+
"title": node["title"],
125+
"page_index": page_num
126+
}
127+
128+
# 如果有子节点,递归处理
129+
if node.get("child_nodes"):
130+
chapter_item["children"] = process_tree_nodes(node["child_nodes"])
131+
132+
result.append(chapter_item)
133+
return result
134+
135+
# 开始解析
136+
if isinstance(tree_data, list):
137+
chapters = process_tree_nodes(tree_data)
138+
elif isinstance(tree_data, dict) and "child_nodes" in tree_data:
139+
chapters = process_tree_nodes(tree_data["child_nodes"])
140+
141+
# print(f"成功获取完整目录: {len(chapters)} 个顶级章节")
142+
143+
# C. 兜底方案:如果获取 Tree 失败,仅使用 mapping 生成纯页码索引
144+
if not chapters and "mappings" in map_data:
145+
temp_chapters = []
146+
mappings = map_data["mappings"]
147+
mappings.sort(key=lambda x: x["page_number"])
148+
for i, m in enumerate(mappings):
149+
temp_chapters.append({
150+
"title": f"第 {i+1} 节 (P{m['page_number']})",
151+
"page_index": m['page_number']
152+
})
153+
chapters = temp_chapters
154+
155+
except Exception as e:
156+
print(f"目录解析异常: {e}")
157+
158+
# 4. 获取 PDF 下载链接 (保持不变)
159+
71160
for item in list(data["ti_items"]):
72161
if item["lc_ti_format"] == "pdf": # 寻找存有 PDF 链接列表的项
73162
resource_url: str = item["ti_storages"][0] # 获取并构造 PDF 的 URL
@@ -88,15 +177,64 @@ def parse(url: str) -> tuple[str, str, str] | tuple[None, None, None]: # 解析
88177
resource_url = re.sub(r"^https?://(?:.+).ykt.cbern.com.cn/(.+)/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}).pkg/(.+)\.pdf$", r"https://c1.ykt.cbern.com.cn/\1/\2.pkg/\3.pdf", resource_url)
89178
break
90179
if not resource_url:
91-
return None, None, None
180+
return None, None, None, None
92181
else:
93-
return None, None, None
94-
95-
return resource_url, content_id, data["title"]
96-
except Exception: # 解析失败时返回 None
97-
return None, None, None
182+
return None, None, None, None
183+
184+
return resource_url, content_id, title, chapters
185+
except Exception:
186+
return None, None, None, None
187+
188+
def add_bookmarks(pdf_path: str, chapters: list) -> None:
    """Add a (possibly nested) outline to the PDF at ``pdf_path``, in place.

    Args:
        pdf_path: Path of an existing PDF file. On success it is replaced by
            a copy of the same pages that also carries the outline.
        chapters: List of chapter dicts. Each dict may contain ``title``
            (bookmark text), ``page_index`` (1-based page number, or ``None``
            when no page is known) and ``children`` (a list of dicts of the
            same shape). An empty or ``None`` value makes this a no-op.

    Returns:
        None. Bookmarks are best-effort: any failure is reported on stderr
        and never raised, and the original file is left untouched.
    """
    try:
        if not chapters:
            return
        reader = PdfReader(pdf_path)
        writer = PdfWriter()
        writer.append_pages_from_reader(reader)

        page_count = len(writer.pages)
        if page_count == 0:
            # Nothing a bookmark could point at; clamping would yield index -1.
            return

        def _add_chapter(chapter_list, parent=None):
            """Recursively add ``chapter_list`` as outline items under ``parent``."""
            for chapter in chapter_list:
                title = chapter.get("title", "未知章节")
                p_index = chapter.get("page_index")
                children = chapter.get("children") or []

                if p_index is None:
                    sys.stderr.write(f"[!!]跳过章节“{title}”的书签,原因:未指定页码\n")
                    # The node itself has no page, but its descendants may:
                    # keep them by attaching them to the enclosing level
                    # instead of dropping the whole subtree.
                    _add_chapter(children, parent=parent)
                    continue

                # Incoming page numbers are 1-based; pypdf page indices are 0-based.
                try:
                    page_num = int(p_index) - 1
                except (ValueError, TypeError):
                    page_num = 0  # unparsable value: fall back to the first page
                # Clamp into the valid page range of the document.
                page_num = min(max(page_num, 0), page_count - 1)

                # ``parent`` is the enclosing outline item (None at top level).
                bookmark = writer.add_outline_item(title, page_num, parent=parent)
                _add_chapter(children, parent=bookmark)

        _add_chapter(chapters)

        # Write to a sibling temp file first and swap it in afterwards, so a
        # failure while serialising cannot corrupt the downloaded PDF.
        tmp_path = f"{pdf_path}.bookmarks.tmp"
        try:
            with open(tmp_path, "wb") as f:
                writer.write(f)
            os.replace(tmp_path, pdf_path)
        except Exception:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

    except Exception as e:
        sys.stderr.write(f"添加书签失败: {e}\n")
98236

99-
def download_file(url: str, save_path: str) -> None: # 下载文件
237+
def download_file(url: str, save_path: str, chapters: list = None) -> None: # 下载文件
100238
global download_states
101239
current_state = { "download_url": url, "save_path": save_path, "downloaded_size": 0, "total_size": 0, "finished": False, "failed_reason": None }
102240
download_states.append(current_state)
@@ -123,7 +261,9 @@ def download_file(url: str, save_path: str) -> None: # 下载文件
123261
download_progress = (all_downloaded_size / all_total_size) * 100
124262
download_progress_bar["value"] = download_progress # 更新进度条
125263
progress_label.config(text=f"{format_bytes(all_downloaded_size)}/{format_bytes(all_total_size)} ({download_progress:.2f}%) 已下载 {downloaded_number}/{total_number}") # 更新标签以显示当前下载进度
126-
264+
if chapters:
265+
progress_label.config(text=f"添加书签")
266+
add_bookmarks(save_path, chapters)
127267
current_state["downloaded_size"] = current_state["total_size"]
128268
current_state["finished"] = True
129269

@@ -188,7 +328,8 @@ def download() -> None: # 下载资源文件
188328
dir_path = None
189329

190330
for url in urls:
191-
resource_url, content_id, title = parse(url)
331+
# resource_url, content_id, title = parse(url)
332+
resource_url, content_id, title , chapters = parse(url)
192333
if not resource_url:
193334
failed_links.append(url) # 添加到失败链接
194335
continue
@@ -205,7 +346,8 @@ def download() -> None: # 下载资源文件
205346
if os_name == "Windows":
206347
save_path = save_path.replace("/", "\\")
207348

208-
thread_it(download_file, (resource_url, save_path)) # 开始下载(多线程,防止窗口卡死)
349+
# thread_it(download_file, (resource_url, save_path)) # 开始下载(多线程,防止窗口卡死)
350+
thread_it(download_file, (resource_url, save_path, chapters)) # 开始下载(多线程,防止窗口卡死)
209351

210352
if failed_links:
211353
messagebox.showwarning("警告", "以下 “行” 无法解析:\n" + "\n".join(failed_links)) # 显示警告对话框

0 commit comments

Comments
 (0)