Skip to content

Commit b57a619

Browse files
committed
feat: 高级编排支持文件上传(WIP)
1 parent a0cfcb7 commit b57a619

File tree

11 files changed

+149
-6
lines changed

11 files changed

+149
-6
lines changed

apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,36 @@
11
# coding=utf-8
2+
import io
3+
24
from django.db.models import QuerySet
35

46
from application.flow.i_step_node import NodeResult
57
from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode
68
from dataset.models import File
9+
from dataset.serializers.document_serializers import split_handles, parse_table_handle_list, FileBufferHandle
710

811

912
class BaseDocumentExtractNode(IDocumentExtractNode):
    """Workflow node that turns uploaded documents into one text blob."""

    def execute(self, document, **kwargs):
        """Extract the content of every document and concatenate the results.

        Args:
            document: Iterable of dicts that each provide ``file_id`` and
                ``name``, or ``None`` when there is nothing to extract.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            NodeResult whose ``content`` value is the concatenation of one
            section per supported document. Each section is the separator
            line, a ``## <name>`` heading and the text produced by the first
            handler that supports the file.
        """
        get_buffer = FileBufferHandle().get_buffer

        self.context['document_list'] = document
        separator = '\n-----------------------------------\n'
        if document is None:
            return NodeResult({'content': ''}, {})

        handlers = parse_table_handle_list + split_handles
        sections = []
        for doc in document:
            file = QuerySet(File).filter(id=doc['file_id']).first()
            if file is None:
                # No stored file for this id: skip the entry instead of
                # failing the whole node with an AttributeError on None.
                continue

            buffer = io.BytesIO(file.get_byte().tobytes())
            # The buffer needs a name before it is handed to support();
            # the visible implementations match on the file name.
            buffer.name = doc['name']

            for split_handle in handlers:
                if split_handle.support(buffer, get_buffer):
                    # Rewind to the start of the file before extracting.
                    buffer.seek(0)
                    file_content = split_handle.get_content(buffer)
                    sections.append(separator + '## ' + doc['name'] + '\n' + file_content)
                    # Only the first matching handler is used. Continue with
                    # the next document rather than returning, so that every
                    # uploaded document ends up in the result.
                    break

        return NodeResult({'content': ''.join(sections)}, {})
2336

apps/common/handle/base_parse_table_handle.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,7 @@ def support(self, file, get_buffer):
1717
@abstractmethod
def handle(self, file, get_buffer, save_image):
    """Parse a table-like file; must be implemented by subclasses.

    The csv/xls/xlsx implementations return a list of
    ``{'name': file.name, 'paragraphs': [...]}`` dicts.

    NOTE(review): ``get_buffer`` and ``save_image`` are callables supplied by
    the caller; their exact contracts are not visible here -- confirm
    against the concrete handlers.
    """
20+
21+
@abstractmethod
def get_content(self, file):
    """Return the whole content of ``file`` as one string.

    Must be implemented by subclasses. The document-extract node
    concatenates the result with other strings, so implementations are
    expected to return ``str``.
    """

apps/common/handle/base_split_handle.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,7 @@ def support(self, file, get_buffer):
1818
@abstractmethod
def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_buffer, save_image):
    """Split a file into paragraphs; must be implemented by subclasses.

    The html and text implementations return
    ``{'name': file.name, 'content': <parsed paragraphs>}``.

    NOTE(review): the meaning of ``pattern_list``, ``with_filter``, ``limit``,
    ``get_buffer`` and ``save_image`` is not visible here -- confirm against
    the concrete handlers before relying on it.
    """
21+
22+
@abstractmethod
def get_content(self, file):
    """Return the whole content of ``file`` as one string.

    Must be implemented by subclasses. The doc/html/pdf/text implementations
    return the extracted text, or ``''`` when extraction fails.
    """

apps/common/handle/impl/doc_split_handle.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,3 +189,13 @@ def support(self, file, get_buffer):
189189
".DOC") or file_name.endswith(".DOCX"):
190190
return True
191191
return False
192+
193+
def get_content(self, file):
    """Convert a Word document to Markdown text.

    Args:
        file: Binary file-like object positioned at the start of the document.

    Returns:
        The Markdown produced by ``self.to_md``, or ``''`` if anything goes
        wrong (the traceback is printed).
    """
    try:
        raw = file.read()
        word_document = Document(io.BytesIO(raw))
        # to_md receives an image list; nothing reads it back in this method.
        collected_images = []
        markdown = self.to_md(word_document, collected_images, get_image_id_func())
    except BaseException as err:
        traceback.print_exception(err)
        markdown = ''
    return markdown

apps/common/handle/impl/html_split_handle.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
@desc:
88
"""
99
import re
10+
import traceback
1011
from typing import List
1112

1213
from bs4 import BeautifulSoup
@@ -59,3 +60,14 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
5960
return {'name': file.name,
6061
'content': split_model.parse(content)
6162
}
63+
64+
def get_content(self, file):
    """Convert an HTML file to plain text via ``html2text``.

    Args:
        file: Binary file-like object containing the HTML document.

    Returns:
        The converted text, or ``''`` if decoding or conversion fails (the
        traceback is printed).
    """
    raw = file.read()

    try:
        # Decode with the encoding reported by get_encoding, then convert.
        text = html2text(raw.decode(get_encoding(raw)))
    except BaseException as err:
        traceback.print_exception(err)
        text = ''
    return text

apps/common/handle/impl/pdf_split_handle.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import re
1212
import tempfile
1313
import time
14+
import traceback
1415
from typing import List
1516

1617
import fitz
@@ -297,3 +298,17 @@ def support(self, file, get_buffer):
297298
if file_name.endswith(".pdf") or file_name.endswith(".PDF"):
298299
return True
299300
return False
301+
302+
def get_content(self, file):
    """Extract the text content of a PDF file.

    The upload is spooled to a temporary file so it can be opened by path
    with PyMuPDF. The temporary file and the opened document are always
    released, whether or not extraction succeeds.

    Args:
        file: Binary file-like object containing the PDF.

    Returns:
        The text produced by ``self.handle_pdf_content``, or ``''`` if the
        PDF cannot be opened or parsed (the traceback is printed).
    """
    import os  # local import: the file's import block is only partly visible here

    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        # Save the uploaded file to a temporary file and remember its path.
        temp_file.write(file.read())
        temp_file_path = temp_file.name

    pdf_document = None
    try:
        # Opened inside the try so a corrupt PDF yields '' like every other
        # extraction failure instead of propagating to the caller.
        pdf_document = fitz.open(temp_file_path)
        return self.handle_pdf_content(file, pdf_document)
    except Exception as e:
        traceback.print_exception(e)
        return ''
    finally:
        if pdf_document is not None:
            pdf_document.close()
        # delete=False means nobody else removes the file for us.
        os.remove(temp_file_path)

apps/common/handle/impl/table/csv_parse_table_handle.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,3 +34,11 @@ def handle(self, file, get_buffer,save_image):
3434
paragraphs.append({'title': '', 'content': line})
3535

3636
return [{'name': file.name, 'paragraphs': paragraphs}]
37+
38+
def get_content(self, file):
    """Return the CSV file decoded to text.

    Args:
        file: Binary file-like object containing the CSV data.

    Returns:
        The decoded text, or ``''`` if decoding fails. The result is always
        a ``str`` because the caller concatenates it with other strings.
    """
    buffer = file.read()
    try:
        # detect() may report no encoding (e.g. for empty input); fall back
        # to UTF-8 rather than passing None to decode().
        encoding = detect(buffer)['encoding'] or 'utf-8'
        return buffer.decode(encoding)
    except Exception as e:
        max_kb.error(f'csv split handle error: {e}')
        return ''

apps/common/handle/impl/table/xls_parse_table_handle.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,24 @@ def handle(self, file, get_buffer, save_image):
6060
max_kb.error(f'excel split handle error: {e}')
6161
return [{'name': file.name, 'paragraphs': []}]
6262
return result
63+
64+
def get_content(self, file):
    """Render every non-empty sheet of an .xls workbook as a Markdown table.

    The first row of each sheet is used as the table header. Line breaks
    inside a cell are replaced with ``<br>`` so one cell stays on one row.

    Args:
        file: Binary file-like object containing the .xls workbook.

    Returns:
        The Markdown tables, each followed by a blank line, or ``''`` if the
        workbook cannot be read.
    """

    def to_cell(value):
        # Only genuinely empty cells render blank; 0, 0.0 and False are
        # real values and must stay visible.
        if value is None or value == '':
            return ''
        return str(value).replace('\n', '<br>')

    try:
        # Open the .xls file from its in-memory bytes.
        workbook = xlrd.open_workbook(file_contents=file.read(), formatting_info=True)
        md_tables = ''
        for sheet in workbook.sheets():
            if sheet.nrows == 0:
                # An empty sheet has no header row to read.
                continue

            # Header cells may be non-string values, so stringify before joining.
            headers = [to_cell(cell) for cell in sheet.row_values(0)]

            # Build the Markdown table.
            md_table = '| ' + ' | '.join(headers) + ' |\n'
            md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
            for row_idx in range(1, sheet.nrows):
                cells = [to_cell(cell) for cell in sheet.row_values(row_idx)]
                md_table += '| ' + ' | '.join(cells) + ' |\n'
            md_tables += md_table + '\n\n'

        return md_tables
    except Exception as e:
        max_kb.error(f'excel split handle error: {e}')
        return ''

apps/common/handle/impl/table/xlsx_parse_table_handle.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,31 @@ def handle(self, file, get_buffer, save_image):
7272
max_kb.error(f'excel split handle error: {e}')
7373
return [{'name': file.name, 'paragraphs': []}]
7474
return result
75+
76+
77+
def get_content(self, file):
    """Render every non-empty sheet of an .xlsx workbook as a Markdown table.

    The first row of each sheet is used as the table header. Line breaks
    inside a cell are replaced with ``<br>`` so one cell stays on one row.

    Args:
        file: Binary file-like object containing the .xlsx workbook.

    Returns:
        The Markdown tables, each followed by a blank line, or ``''`` if the
        workbook cannot be read.
    """

    def to_cell(value):
        # Empty cells come back as None when iterating with values_only=True.
        if value is None:
            return ''
        return str(value).replace('\n', '<br>')

    try:
        # Load the Excel file.
        workbook = load_workbook(file)
        md_tables = ''
        for sheetname in workbook.sheetnames:
            sheet = workbook[sheetname]

            # Fetch all rows of the worksheet as plain values.
            rows = list(sheet.iter_rows(values_only=True))
            if not rows:
                continue

            # Header cells can be None or non-string values, so stringify
            # them before joining.
            headers = [to_cell(cell) for cell in rows[0]]

            # Build the Markdown table.
            md_table = '| ' + ' | '.join(headers) + ' |\n'
            md_table += '| ' + ' | '.join(['---'] * len(headers)) + ' |\n'
            for row in rows[1:]:
                md_table += '| ' + ' | '.join(to_cell(cell) for cell in row) + ' |\n'

            md_tables += md_table + '\n\n'
        return md_tables
    except Exception as e:
        max_kb.error(f'excel split handle error: {e}')
        return ''

apps/common/handle/impl/text_split_handle.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
@desc:
88
"""
99
import re
10+
import traceback
1011
from typing import List
1112

1213
from charset_normalizer import detect
@@ -49,3 +50,11 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
4950
return {'name': file.name,
5051
'content': split_model.parse(content)
5152
}
53+
54+
def get_content(self, file):
    """Return the text file decoded with its detected encoding.

    Args:
        file: Binary file-like object containing the text.

    Returns:
        The decoded text, or ``''`` if detection or decoding fails (the
        traceback is printed).
    """
    raw = file.read()
    try:
        detected = detect(raw)
        charset = detected['encoding']
        text = raw.decode(charset)
    except BaseException as err:
        traceback.print_exception(err)
        text = ''
    return text

0 commit comments

Comments
 (0)