Skip to content

Commit ecf07ce

Browse files
committed
chore: enhance zip content extraction with posixpath for path normalization
1 parent f5cef9b commit ecf07ce

File tree

1 file changed

+40
-20
lines changed

1 file changed

+40
-20
lines changed

apps/common/handle/impl/text/zip_split_handle.py

Lines changed: 40 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -167,14 +167,17 @@ def support(self, file, get_buffer):
167167
def get_content(self, file, save_image):
168168
"""
169169
从 zip 中提取并返回拼接的 md 文本,同时收集并保存内嵌图片(通过 save_image 回调)。
170+
使用 posixpath 来正确处理 zip 内部的路径拼接与规范化。
170171
"""
171172
buffer = file.read() if hasattr(file, 'read') else None
172173
bytes_io = io.BytesIO(buffer) if buffer is not None else io.BytesIO(file)
173-
md_parts = []
174+
md_items = [] # 存储 (md_text, source_file_path)
174175
image_mode_list = []
175176

177+
import posixpath
178+
176179
def is_image_name(name: str):
177-
ext = os.path.splitext(name.lower())[1]
180+
ext = posixpath.splitext(name.lower())[1]
178181
return ext in ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.svg')
179182

180183
with zipfile.ZipFile(bytes_io, 'r') as zip_ref:
@@ -197,7 +200,6 @@ def is_image_name(name: str):
197200
meta={'debug': False, 'content': raw}
198201
)
199202
image_mode_list.append(fmodel)
200-
# 在 md 中不直接插入二进制,保存后上层可替换引用
201203
continue
202204

203205
# 为 split_handle 提供可重复读取的 file-like 对象
@@ -210,22 +212,8 @@ def is_image_name(name: str):
210212
# 准备一个简单的 get_buffer 回调,返回当前 raw
211213
get_buffer = lambda f, _raw=raw: _raw
212214
if split_handle.support(inner_file, get_buffer):
213-
# 回到文件头
214215
inner_file.seek(0)
215216
md_text = split_handle.get_content(inner_file, save_image)
216-
image_list = parse_md_image(md_text)
217-
for image in image_list:
218-
search = re.search("\(.*\)", image)
219-
if search:
220-
source_image_path = search.group().replace('(', '').replace(')', '')
221-
source_image_path = source_image_path.strip().split(" ")[0]
222-
image_path = urljoin(
223-
real_name, '.' + source_image_path if source_image_path.startswith(
224-
'/') else source_image_path
225-
)
226-
for img_model in image_mode_list:
227-
if img_model.file_name == os.path.basename(image_path):
228-
md_text = md_text.replace(source_image_path, f'./oss/file/{img_model.id}')
229217
break
230218

231219
# 如果没有任何 split_handle 处理,按文本解码作为后备
@@ -237,10 +225,42 @@ def is_image_name(name: str):
237225
md_text = raw.decode('utf-8', errors='ignore')
238226

239227
if isinstance(md_text, str) and md_text.strip():
240-
md_parts.append(md_text)
228+
# 保存 md 文本与其所在的文件路径,后面统一做图片路径替换
229+
md_items.append((md_text, real_name))
241230

242-
# 将收集到的图片通过回调保存
231+
# 将收集到的图片通过回调保存(一次性)
243232
if image_mode_list:
244233
save_image(image_mode_list)
245234

246-
return '\n\n'.join(md_parts)
235+
# 后处理:在每个 md 片段中将相对/绝对引用替换为已保存图片的 oss 路径
236+
content_parts = []
237+
for md_text, base_name in md_items:
238+
image_refs = parse_md_image(md_text)
239+
for image in image_refs:
240+
search = re.search(r"\(.*\)", image)
241+
if not search:
242+
continue
243+
source_image_path = search.group().strip("()").split(" ")[0]
244+
245+
# 规范化 zip 内部路径:若以 '/' 开头,视为相对于 zip 根,否则相对于 base_name 的目录
246+
if source_image_path.startswith('/'):
247+
joined = posixpath.normpath(source_image_path.lstrip('/'))
248+
else:
249+
base_dir = posixpath.dirname(base_name)
250+
joined = posixpath.normpath(posixpath.join(base_dir, source_image_path))
251+
252+
# 匹配已收集图片:以文件名做匹配(zip 中的文件名通常是不含反斜杠的 POSIX 风格)
253+
matched = None
254+
for img_model in image_mode_list:
255+
if img_model.file_name == posixpath.basename(joined):
256+
matched = img_model
257+
break
258+
259+
if matched:
260+
md_text = md_text.replace(source_image_path, f'./oss/file/{matched.id}')
261+
262+
content_parts.append(md_text)
263+
264+
return '\n\n'.join(content_parts)
265+
266+

0 commit comments

Comments
 (0)