Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 40 additions & 20 deletions apps/common/handle/impl/text/zip_split_handle.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,14 +167,17 @@ def support(self, file, get_buffer):
def get_content(self, file, save_image):
"""
从 zip 中提取并返回拼接的 md 文本,同时收集并保存内嵌图片(通过 save_image 回调)。
使用 posixpath 来正确处理 zip 内部的路径拼接与规范化。
"""
buffer = file.read() if hasattr(file, 'read') else None
bytes_io = io.BytesIO(buffer) if buffer is not None else io.BytesIO(file)
md_parts = []
md_items = [] # 存储 (md_text, source_file_path)
image_mode_list = []

import posixpath

def is_image_name(name: str):
ext = os.path.splitext(name.lower())[1]
ext = posixpath.splitext(name.lower())[1]
return ext in ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.svg')

with zipfile.ZipFile(bytes_io, 'r') as zip_ref:
Expand All @@ -197,7 +200,6 @@ def is_image_name(name: str):
meta={'debug': False, 'content': raw}
)
image_mode_list.append(fmodel)
# 在 md 中不直接插入二进制,保存后上层可替换引用
continue

# 为 split_handle 提供可重复读取的 file-like 对象
Expand All @@ -210,22 +212,8 @@ def is_image_name(name: str):
# 准备一个简单的 get_buffer 回调,返回当前 raw
get_buffer = lambda f, _raw=raw: _raw
if split_handle.support(inner_file, get_buffer):
# 回到文件头
inner_file.seek(0)
md_text = split_handle.get_content(inner_file, save_image)
image_list = parse_md_image(md_text)
for image in image_list:
search = re.search("\(.*\)", image)
if search:
source_image_path = search.group().replace('(', '').replace(')', '')
source_image_path = source_image_path.strip().split(" ")[0]
image_path = urljoin(
real_name, '.' + source_image_path if source_image_path.startswith(
'/') else source_image_path
)
for img_model in image_mode_list:
if img_model.file_name == os.path.basename(image_path):
md_text = md_text.replace(source_image_path, f'./oss/file/{img_model.id}')
break

# 如果没有任何 split_handle 处理,按文本解码作为后备
Expand All @@ -237,10 +225,42 @@ def is_image_name(name: str):
md_text = raw.decode('utf-8', errors='ignore')

if isinstance(md_text, str) and md_text.strip():
md_parts.append(md_text)
# 保存 md 文本与其所在的文件路径,后面统一做图片路径替换
md_items.append((md_text, real_name))

# 将收集到的图片通过回调保存
# 将收集到的图片通过回调保存(一次性)
if image_mode_list:
save_image(image_mode_list)

return '\n\n'.join(md_parts)
# 后处理:在每个 md 片段中将相对/绝对引用替换为已保存图片的 oss 路径
content_parts = []
for md_text, base_name in md_items:
image_refs = parse_md_image(md_text)
for image in image_refs:
search = re.search(r"\(.*\)", image)
if not search:
continue
source_image_path = search.group().strip("()").split(" ")[0]

# 规范化 zip 内部路径:若以 '/' 开头,视为相对于 zip 根,否则相对于 base_name 的目录
if source_image_path.startswith('/'):
joined = posixpath.normpath(source_image_path.lstrip('/'))
else:
base_dir = posixpath.dirname(base_name)
joined = posixpath.normpath(posixpath.join(base_dir, source_image_path))

# 匹配已收集图片:以文件名做匹配(zip 中的文件名通常是不含反斜杠的 POSIX 风格)
matched = None
for img_model in image_mode_list:
if img_model.file_name == posixpath.basename(joined):
matched = img_model
break

if matched:
md_text = md_text.replace(source_image_path, f'./oss/file/{matched.id}')

content_parts.append(md_text)

return '\n\n'.join(content_parts)


Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code Review

Irregularities and Potential Issues:

  1. String Concatenation: The code uses string concatenation (`+ '\n'` or `+`) across several statements, which can be inefficient because every concatenation allocates a new string.

  2. Variable Names: Some variable names could be more descriptive or unique to avoid confusion between similarly named variables (e.g., md_parts vs image_mode_list).

  3. Regular Expressions: While using regular expressions is necessary for parsing Markdown images, they can slow down execution slightly compared to manual splitting with split().

  4. Image Reference Handling: There's some repetition in how image references are handled within the loop. Consider creating a function to simplify this logic.

  5. Resource Management: Ensure that resources like ZIP files and bytes buffers are properly closed after use. This is implicit here but worth mentioning.

  6. Logging: Adding logging might help in debugging and understanding the flow of execution.

  7. PosixPath Usage: Using posixpath correctly throughout helps ensure platform-independence when dealing with file paths.

Optimization Suggestions:

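  1. StringBuilder equivalent: Python has no stringbuilder module; collect the pieces in a list and call str.join once (or write to an io.StringIO buffer) instead of manually concatenating strings inside loops for better performance.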

from collections import deque

def join_string_parts(parts):
    """Concatenate an iterable of strings into a single string.

    ``str.join`` already consumes any iterable in one pass, so the parts
    are joined directly instead of being copied into an intermediate
    container first.

    Args:
        parts: Iterable of ``str`` fragments, in output order.

    Returns:
        The fragments concatenated with no separator; ``''`` when
        ``parts`` is empty.

    Raises:
        TypeError: If any element of ``parts`` is not a ``str``.
    """
    return ''.join(parts)

Replace existing string concatenations with join_string_parts


2. **Dictionary Comprehension**: If you want to replace references once per document rather than each time an MD block is processed, consider storing matches as dictionary keys.

3. **Avoid Repeated Function Calls**: Store the result of operations that are performed more than once (such as decoding UTF-8) in a variable and reuse it, even where the saving looks minor in context.

### Conclusion:
The current implementation is robust but can benefit from improvements such as using `join_string_parts`, optimizing reference replacement, and managing resource cleanup more explicitly.

Loading