Skip to content

Commit a197cfc

Browse files
committed
feat: implement CSV, XLS, and XLSX content extraction with Markdown formatting
1 parent c4dd09c commit a197cfc

File tree

4 files changed

+198
-4
lines changed

4 files changed

+198
-4
lines changed

apps/common/handle/impl/text/csv_split_handle.py

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,38 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
6868
return result
6969

7070
def get_content(self, file, save_image):
    """Render a CSV file as a single Markdown table.

    The first CSV row becomes the table header. Every following non-empty
    row becomes a table row that is padded with empty cells, or truncated,
    to the header width.

    Args:
        file: Binary file-like object; ``file.read()`` must return the raw
            bytes of the CSV and ``file.name`` is used in the error log.
        save_image: Not used by this handler; accepted so that it can be
            called the same way as the other handlers' ``get_content``.

    Returns:
        The Markdown table as a string, or ``""`` when the file contains no
        rows or any error occurs while decoding/parsing it.
    """

    def to_md_cell(cell):
        # A raw newline would terminate the table row and an unescaped pipe
        # would open a new column, so both are neutralised.
        return cell.replace('\n', '<br>').replace('\r', '').replace('|', '\\|')

    buffer = file.read()
    try:
        # Detection may report no encoding at all (e.g. for empty input);
        # fall back to UTF-8 rather than the platform default codec.
        encoding = detect(buffer)['encoding'] or 'utf-8'
        reader = csv.reader(io.TextIOWrapper(io.BytesIO(buffer), encoding=encoding))
        rows = list(reader)

        if not rows:
            return ""

        # Header row followed by the Markdown separator row.
        header = [to_md_cell(cell) for cell in rows[0]]
        width = len(header)
        md_lines = [
            '| ' + ' | '.join(header) + ' |',
            '| ' + ' | '.join(['---'] * width) + ' |',
        ]

        # Data rows: blank CSV lines are skipped, short rows are padded and
        # long rows are cut so every row has exactly `width` cells.
        for row in rows[1:]:
            if not row:
                continue
            cells = [to_md_cell(cell) for cell in row[:width]]
            cells += [''] * (width - len(cells))
            md_lines.append('| ' + ' | '.join(cells) + ' |')

        return '\n'.join(md_lines)

    except Exception as e:
        maxkb_logger.error(f"Error processing CSV file {file.name}: {e}, {traceback.format_exc()}")
        return ""
72103

73104
def support(self, file, get_buffer):
74105
file_name: str = file.name.lower()

apps/common/handle/impl/text/xls_split_handle.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,36 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
7575
return [{'name': file.name, 'content': []}]
7676

7777
def get_content(self, file, save_image):
    """Render every non-empty sheet of an ``.xls`` workbook as Markdown tables.

    For each sheet, row 0 is used as the table header and the remaining
    rows as table rows. Tables are concatenated, each followed by blank
    lines.

    Args:
        file: Binary file-like object; ``file.read()`` must return the raw
            workbook bytes.
        save_image: Not used by this handler; accepted so that it can be
            called the same way as the other handlers' ``get_content``.

    Returns:
        The concatenated Markdown tables (``''`` when every sheet is empty),
        or the string ``'error: <message>'`` when the workbook cannot be
        processed.
    """

    def to_md_cell(cell):
        # Only truly blank cells become empty text. Numeric zero and False
        # are real values and must not be dropped by a truthiness test.
        if cell is None or cell == '':
            return ''
        # str() also covers numeric cells, which str.join would reject.
        # Newlines and pipes are neutralised so they cannot break the table.
        return (str(cell)
                .replace('\r\n', '<br>')
                .replace('\n', '<br>')
                .replace('|', '\\|'))

    def to_md_row(cells):
        return '| ' + ' | '.join(cells) + ' |\n'

    try:
        workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True)
        md_tables = []
        for sheet in workbook.sheets():
            # Skip sheets without any rows or columns.
            if sheet.nrows == 0 or sheet.ncols == 0:
                continue

            headers = [to_md_cell(cell) for cell in sheet.row_values(0)]
            parts = [to_md_row(headers), to_md_row(['---'] * len(headers))]
            for row_idx in range(1, sheet.nrows):
                parts.append(to_md_row([to_md_cell(cell) for cell in sheet.row_values(row_idx)]))

            md_tables.append(''.join(parts) + '\n\n')

        return ''.join(md_tables)
    except Exception as e:
        maxkb_logger.error(f'excel split handle error: {e}')
        return f'error: {e}'
79108

80109
def support(self, file, get_buffer):
81110
file_name: str = file.name.lower()

apps/common/handle/impl/text/xlsx_split_handle.py

Lines changed: 70 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
from typing import List
1212

1313
import openpyxl
14+
from openpyxl import load_workbook
1415

1516
from common.handle.base_split_handle import BaseSplitHandle
1617
from common.handle.impl.common_handle import xlsx_embed_cells_images
@@ -63,6 +64,40 @@ def handle_sheet(file_name, sheet, image_dict, limit: int):
6364

6465

6566
class XlsxSplitHandle(BaseSplitHandle):
67+
def fill_merged_cells(self, sheet, image_dict):
    """Convert a worksheet into a list of ``{header: value}`` row dicts.

    Row 1 supplies the column keys; rows 2 and onward supply the data.
    An empty data cell that lies inside a merged range takes the value of
    that range's top-left cell. A cell whose value is a key of
    ``image_dict`` is replaced by a Markdown image reference.

    Args:
        sheet: Worksheet-like object supporting ``sheet[row_number]``,
            ``iter_rows(min_row=..., values_only=...)`` and
            ``merged_cells.ranges``.
        image_dict: Mapping from a cell value to an object with an ``id``
            attribute.

    Returns:
        One dict per data row, keyed by the (unique) column keys in column
        order.
    """
    # Build one unique key per column. Blank headers get a run of spaces
    # whose length depends on the column index, and a repeated header gets
    # trailing spaces appended. Both still render the same in Markdown but
    # no longer overwrite another column's entry in the row dict.
    headers = []
    used_keys = set()
    for idx, cell in enumerate(sheet[1]):
        key = ' ' * (idx + 1) if cell.value is None else cell.value
        while key in used_keys:
            key = f'{key} '
        used_keys.add(key)
        headers.append(key)

    # Fetch the merged ranges once instead of once per empty cell.
    merged_ranges = list(sheet.merged_cells.ranges)

    data = []
    for row in sheet.iter_rows(min_row=2, values_only=False):
        row_data = {}
        for col_idx, cell in enumerate(row):
            cell_value = cell.value

            # An empty cell inside a merged range inherits the value stored
            # in the range's top-left cell.
            if cell_value is None:
                for merged_range in merged_ranges:
                    if cell.coordinate in merged_range:
                        cell_value = sheet[merged_range.min_row][merged_range.min_col - 1].value
                        break

            image = image_dict.get(cell_value, None)
            if image is not None:
                cell_value = f'![](./oss/file/{image.id})'

            row_data[headers[col_idx]] = cell_value
        data.append(row_data)

    return data
100+
66101
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
67102
buffer = get_buffer(file)
68103
try:
@@ -88,7 +123,41 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
88123
return [{'name': file.name, 'content': []}]
89124

90125
def get_content(self, file, save_image):
    """Render every sheet of an ``.xlsx`` workbook as Markdown tables.

    Embedded cell images are extracted first and passed to ``save_image``;
    a failure there is logged and the workbook is still rendered without
    images. Each sheet is converted to row dicts by
    ``fill_merged_cells`` and sheets that yield no data rows are skipped.

    Args:
        file: The uploaded workbook, passed to ``load_workbook`` and to
            ``xlsx_embed_cells_images``.
        save_image: Callback invoked with the extracted image objects when
            at least one image was found.

    Returns:
        The concatenated Markdown tables (``''`` when no sheet has data
        rows), or the string ``'error: <message>'`` when the workbook
        cannot be processed.
    """

    def to_md_cell(value):
        # Test for None BEFORE converting to text: formatting first turns
        # None into the literal string 'None', which then shows up in
        # every empty cell of the table.
        if value is None:
            return ''
        # Newlines and pipes are neutralised so they cannot break the table.
        return (str(value)
                .replace('\r\n', '<br>')
                .replace('\n', '<br>')
                .replace('|', '\\|'))

    def to_md_row(cells):
        return '| ' + ' | '.join(cells) + ' |\n'

    try:
        # Load the Excel workbook.
        workbook = load_workbook(file)
        try:
            image_dict: dict = xlsx_embed_cells_images(file)
            if len(image_dict) > 0:
                save_image(image_dict.values())
        except Exception as e:
            # Image extraction is best-effort; continue without images.
            maxkb_logger.error(f'Exception: {e}')
            image_dict = {}

        md_tables = []
        for sheetname in workbook.sheetnames:
            sheet = workbook[sheetname] if sheetname else workbook.active
            rows = self.fill_merged_cells(sheet, image_dict)
            if len(rows) == 0:
                continue

            # Every row dict shares the same keys, so the first row's keys
            # give the header texts.
            headers = [to_md_cell(key) for key in rows[0]]

            parts = [to_md_row(headers), to_md_row(['---'] * len(headers))]
            for row in rows:
                parts.append(to_md_row([to_md_cell(value) for value in row.values()]))

            md_tables.append(''.join(parts) + '\n\n')

        return ''.join(md_tables)
    except Exception as e:
        maxkb_logger.error(f'excel split handle error: {e}')
        return f'error: {e}'
92161

93162
def support(self, file, get_buffer):
94163
file_name: str = file.name.lower()

apps/common/handle/impl/text/zip_split_handle.py

Lines changed: 66 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,4 +165,69 @@ def support(self, file, get_buffer):
165165
return False
166166

167167
def get_content(self, file, save_image):
    """
    Extract the members of a zip archive and return their Markdown text,
    joined by blank lines. Image members are collected and handed to the
    ``save_image`` callback instead of being added to the text.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.webp', '.svg')

    # Accept either a file-like object or raw bytes.
    payload = None
    if hasattr(file, 'read'):
        payload = file.read()
    archive_stream = io.BytesIO(file if payload is None else payload)

    md_parts = []
    collected_images = []

    with zipfile.ZipFile(archive_stream, 'r') as archive:
        for member in archive.namelist():
            # Directory entries and macOS resource forks carry no content.
            if member.endswith('/') or member.startswith('__MACOSX'):
                continue

            with archive.open(member) as entry:
                try:
                    real_name = get_file_name(entry.name)
                except Exception:
                    real_name = entry.name
                raw = entry.read()

            # Images are only collected; their bytes are not put into the
            # Markdown text.
            suffix = os.path.splitext(real_name.lower())[1]
            if suffix in image_suffixes:
                collected_images.append(
                    File(
                        id=str(uuid.uuid7()),
                        file_name=os.path.basename(real_name),
                        meta={'debug': False, 'content': raw}
                    )
                )
                continue

            # Give the split handles a re-readable, named file-like object.
            member_file = io.BytesIO(raw)
            member_file.name = real_name

            # Buffer callback that hands back this member's bytes; `raw` is
            # bound as a default so the value is fixed at definition time.
            get_buffer = lambda f, _raw=raw: _raw

            # Let the first registered split handle that supports the
            # member produce its content.
            md_text = None
            for split_handle in split_handles:
                if not split_handle.support(member_file, get_buffer):
                    continue
                member_file.seek(0)
                md_text = split_handle.get_content(member_file, save_image)
                break

            # No handle took the member: decode it as plain text.
            if md_text is None:
                enc = detect(raw).get('encoding') or 'utf-8'
                try:
                    md_text = raw.decode(enc, errors='ignore')
                except Exception:
                    md_text = raw.decode('utf-8', errors='ignore')

            if isinstance(md_text, str) and md_text.strip():
                md_parts.append(md_text)

    # Persist the collected images through the callback.
    if collected_images:
        save_image(collected_images)

    return '\n\n'.join(md_parts)

0 commit comments

Comments
 (0)