Skip to content

Commit 8813ccc

Browse files
committed
fix: 修复文档提取doc图片没有保存和展示的问题
1 parent c3d92e8 commit 8813ccc

File tree

8 files changed

+57
-11
lines changed

8 files changed

+57
-11
lines changed

apps/application/flow/step_node/document_extract_node/i_document_extract_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,5 @@ def _run(self):
2323
self.node_params_serializer.data.get('document_list')[1:])
2424
return self.execute(document=res, **self.flow_params_serializer.data)
2525

26-
def execute(self, document, **kwargs) -> NodeResult:
26+
def execute(self, document, chat_id, **kwargs) -> NodeResult:
    """Interface hook for the document-extract step; this stub does nothing.

    ``_run`` calls it as ``self.execute(document=res, **flow_params)``, so
    ``chat_id`` presumably arrives through the flow parameters -- confirm
    against the flow-params serializer.

    :param document: the document entries selected by ``_run``
    :param chat_id: identifier of the chat the step runs in
    :param kwargs: remaining flow parameters
    """
    pass

apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py

Lines changed: 43 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,43 @@
11
# coding=utf-8
22
import io
3+
import mimetypes
34

5+
from django.core.files.uploadedfile import InMemoryUploadedFile
46
from django.db.models import QuerySet
57

68
from application.flow.i_step_node import NodeResult
79
from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode
10+
from application.models import Chat
811
from dataset.models import File
912
from dataset.serializers.document_serializers import split_handles, parse_table_handle_list, FileBufferHandle
13+
from dataset.serializers.file_serializers import FileSerializer
14+
15+
16+
def bytes_to_uploaded_file(file_bytes, file_name="file.txt"):
    """Wrap raw bytes in a Django ``InMemoryUploadedFile``.

    The content type is guessed from ``file_name``; when no type can be
    guessed, the generic ``application/octet-stream`` type is used.

    :param file_bytes: raw content of the file
    :param file_name: name given to the upload, also used to guess the type
    :return: an ``InMemoryUploadedFile`` backed by an in-memory byte stream
    """
    guessed_type = mimetypes.guess_type(file_name)[0]
    return InMemoryUploadedFile(
        # In-memory byte stream holding the content.
        file=io.BytesIO(file_bytes),
        field_name=None,
        name=file_name,
        # Unknown extensions fall back to the default binary type.
        content_type="application/octet-stream" if guessed_type is None else guessed_type,
        # Size of the file in bytes.
        size=len(file_bytes),
        charset=None,
    )
1037

1138

1239
class BaseDocumentExtractNode(IDocumentExtractNode):
13-
def execute(self, document, **kwargs):
40+
def execute(self, document, chat_id, **kwargs):
1441
get_buffer = FileBufferHandle().get_buffer
1542

1643
self.context['document_list'] = document
@@ -19,6 +46,20 @@ def execute(self, document, **kwargs):
1946
if document is None or not isinstance(document, list):
2047
return NodeResult({'content': content}, {})
2148

49+
application = self.workflow_manage.work_flow_post_handler.chat_info.application
50+
51+
# doc文件中的图片保存
52+
def save_image(image_list):
53+
for image in image_list:
54+
meta = {
55+
'debug': False if application.id else True,
56+
'chat_id': chat_id,
57+
'application_id': str(application.id) if application.id else None,
58+
'file_id': str(image.id)
59+
}
60+
file = bytes_to_uploaded_file(image.image, image.image_name)
61+
FileSerializer(data={'file': file, 'meta': meta}).upload()
62+
2263
for doc in document:
2364
file = QuerySet(File).filter(id=doc['file_id']).first()
2465
buffer = io.BytesIO(file.get_byte().tobytes())
@@ -28,7 +69,7 @@ def execute(self, document, **kwargs):
2869
if split_handle.support(buffer, get_buffer):
2970
# 回到文件头
3071
buffer.seek(0)
31-
file_content = split_handle.get_content(buffer)
72+
file_content = split_handle.get_content(buffer, save_image)
3273
content.append('## ' + doc['name'] + '\n' + file_content)
3374
break
3475

apps/common/handle/base_split_handle.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,5 +20,5 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
2020
pass
2121

2222
@abstractmethod
23-
def get_content(self, file):
23+
def get_content(self, file, save_image=None):
    """Return the text content of ``file``; concrete split handles implement this.

    :param file: readable binary file object positioned at the data to parse
    :param save_image: optional callback invoked with the list of images a
        handle extracted; defaults to ``None`` so callers written against
        the older ``get_content(file)`` signature keep working
    """
    pass

apps/common/handle/impl/doc_split_handle.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -190,12 +190,16 @@ def support(self, file, get_buffer):
190190
return True
191191
return False
192192

193-
def get_content(self, file):
193+
def get_content(self, file, save_image=None):
    """Convert a Word document to markdown, optionally persisting its images.

    :param file: readable binary file object containing the document
    :param save_image: optional callback that receives the list of images
        collected during conversion; when it is not callable the images are
        left untouched and the markdown links are not rewritten
    :return: the markdown text, or the error message if conversion fails
    """
    try:
        image_list = []
        buffer = file.read()
        doc = Document(io.BytesIO(buffer))
        content = self.to_md(doc, image_list, get_image_id_func())
        if image_list and callable(save_image):
            # The images are stored as files by the callback, so links must
            # use the /api/file/ prefix instead of /api/image/.
            content = content.replace('/api/image/', '/api/file/')
            save_image(image_list)
        return content
    except Exception as e:
        # Best effort: report the failure as the content. Exception (not
        # BaseException) lets KeyboardInterrupt / SystemExit propagate.
        traceback.print_exception(e)
        return f'{e}'

apps/common/handle/impl/html_split_handle.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
6161
'content': split_model.parse(content)
6262
}
6363

64-
def get_content(self, file):
64+
def get_content(self, file, save_image):
6565
buffer = file.read()
6666

6767
try:

apps/common/handle/impl/pdf_split_handle.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,7 @@ def support(self, file, get_buffer):
309309
return True
310310
return False
311311

312-
def get_content(self, file):
312+
def get_content(self, file, save_image):
313313
with tempfile.NamedTemporaryFile(delete=False) as temp_file:
314314
# 将上传的文件保存到临时文件中
315315
temp_file.write(file.read())

apps/common/handle/impl/text_split_handle.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
5151
'content': split_model.parse(content)
5252
}
5353

54-
def get_content(self, file):
54+
def get_content(self, file, save_image):
5555
buffer = file.read()
5656
try:
5757
return buffer.decode(detect(buffer)['encoding'])

apps/dataset/serializers/file_serializers.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,8 +61,9 @@ class FileSerializer(serializers.Serializer):
6161
def upload(self, with_valid=True):
    """Store the submitted file and return the URL it is served from.

    :param with_valid: validate the serializer input first, raising on
        invalid data
    :return: ``/api/file/<file_id>``, where ``file_id`` is ``meta['file_id']``
        when the caller supplied one and a fresh UUID otherwise
    """
    if with_valid:
        self.is_valid(raise_exception=True)
    meta = self.data.get('meta')
    # meta is optional: guard against None before looking up a preset id,
    # and only generate a UUID when no usable id was provided.
    preset_id = meta.get('file_id') if meta else None
    file_id = preset_id if preset_id else uuid.uuid1()
    uploaded = self.data.get('file')
    file = File(id=file_id, file_name=uploaded.name, meta=meta)
    file.save(uploaded.read())
    return f'/api/file/{file_id}'
6869

0 commit comments

Comments
 (0)