From 59e95fc2beb7d0572907b80e6d4871050f5fe470 Mon Sep 17 00:00:00 2001 From: CaptainB Date: Thu, 11 Dec 2025 15:07:57 +0800 Subject: [PATCH] chore: enhance zip content extraction with posixpath for path normalization --- .../handle/impl/text/zip_split_handle.py | 60 ++++++++++++------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/apps/common/handle/impl/text/zip_split_handle.py b/apps/common/handle/impl/text/zip_split_handle.py index b557ad7b73d..f41d4c30268 100644 --- a/apps/common/handle/impl/text/zip_split_handle.py +++ b/apps/common/handle/impl/text/zip_split_handle.py @@ -167,14 +167,17 @@ def support(self, file, get_buffer): def get_content(self, file, save_image): """ 从 zip 中提取并返回拼接的 md 文本,同时收集并保存内嵌图片(通过 save_image 回调)。 + 使用 posixpath 来正确处理 zip 内部的路径拼接与规范化。 """ buffer = file.read() if hasattr(file, 'read') else None bytes_io = io.BytesIO(buffer) if buffer is not None else io.BytesIO(file) - md_parts = [] + md_items = [] # 存储 (md_text, source_file_path) image_mode_list = [] + import posixpath + def is_image_name(name: str): - ext = os.path.splitext(name.lower())[1] + ext = posixpath.splitext(name.lower())[1] return ext in ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.svg') with zipfile.ZipFile(bytes_io, 'r') as zip_ref: @@ -197,7 +200,6 @@ def is_image_name(name: str): meta={'debug': False, 'content': raw} ) image_mode_list.append(fmodel) - # 在 md 中不直接插入二进制,保存后上层可替换引用 continue # 为 split_handle 提供可重复读取的 file-like 对象 @@ -210,22 +212,8 @@ def is_image_name(name: str): # 准备一个简单的 get_buffer 回调,返回当前 raw get_buffer = lambda f, _raw=raw: _raw if split_handle.support(inner_file, get_buffer): - # 回到文件头 inner_file.seek(0) md_text = split_handle.get_content(inner_file, save_image) - image_list = parse_md_image(md_text) - for image in image_list: - search = re.search("\(.*\)", image) - if search: - source_image_path = search.group().replace('(', '').replace(')', '') - source_image_path = source_image_path.strip().split(" ")[0] - image_path = urljoin( - real_name, '.' + source_image_path if source_image_path.startswith( - '/') else source_image_path - ) - for img_model in image_mode_list: - if img_model.file_name == os.path.basename(image_path): - md_text = md_text.replace(source_image_path, f'./oss/file/{img_model.id}') break # 如果没有任何 split_handle 处理,按文本解码作为后备 @@ -237,10 +225,42 @@ def is_image_name(name: str): md_text = raw.decode('utf-8', errors='ignore') if isinstance(md_text, str) and md_text.strip(): - md_parts.append(md_text) + # 保存 md 文本与其所在的文件路径,后面统一做图片路径替换 + md_items.append((md_text, real_name)) - # 将收集到的图片通过回调保存 + # 将收集到的图片通过回调保存(一次性) if image_mode_list: save_image(image_mode_list) - return '\n\n'.join(md_parts) + # 后处理:在每个 md 片段中将相对/绝对引用替换为已保存图片的 oss 路径 + content_parts = [] + for md_text, base_name in md_items: + image_refs = parse_md_image(md_text) + for image in image_refs: + search = re.search(r"\(.*\)", image) + if not search: + continue + source_image_path = search.group().strip("()").split(" ")[0] + + # 规范化 zip 内部路径:若以 '/' 开头,视为相对于 zip 根,否则相对于 base_name 的目录 + if source_image_path.startswith('/'): + joined = posixpath.normpath(source_image_path.lstrip('/')) + else: + base_dir = posixpath.dirname(base_name) + joined = posixpath.normpath(posixpath.join(base_dir, source_image_path)) + + # 匹配已收集图片:以文件名做匹配(zip 中的文件名通常是不含反斜杠的 POSIX 风格) + matched = None + for img_model in image_mode_list: + if img_model.file_name == posixpath.basename(joined): + matched = img_model + break + + if matched: + md_text = md_text.replace(source_image_path, f'./oss/file/{matched.id}') + + content_parts.append(md_text) + + return '\n\n'.join(content_parts) + +