fix: 修复文档提取doc图片没有保存和展示的问题

liuruibin · liuruibin · commit 8813ccc24dfe · 2024-11-28T15:01:47.000+08:00
diff --git a/apps/application/flow/step_node/document_extract_node/i_document_extract_node.py b/apps/application/flow/step_node/document_extract_node/i_document_extract_node.py
@@ -23,5 +23,5 @@ def _run(self):
                                                        self.node_params_serializer.data.get('document_list')[1:])
         return self.execute(document=res, **self.flow_params_serializer.data)
 
-    def execute(self, document, **kwargs) -> NodeResult:
+    def execute(self, document, chat_id, **kwargs) -> NodeResult:  # chat_id is not passed explicitly by _run; it has to arrive through **flow_params_serializer.data
         pass
diff --git a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py
@@ -1,16 +1,43 @@
 # coding=utf-8
 import io
+import mimetypes
 
+from django.core.files.uploadedfile import InMemoryUploadedFile
 from django.db.models import QuerySet
 
 from application.flow.i_step_node import NodeResult
 from application.flow.step_node.document_extract_node.i_document_extract_node import IDocumentExtractNode
+from application.models import Chat
 from dataset.models import File
 from dataset.serializers.document_serializers import split_handles, parse_table_handle_list, FileBufferHandle
+from dataset.serializers.file_serializers import FileSerializer
+
+
+def bytes_to_uploaded_file(file_bytes, file_name="file.txt"):  # wraps raw bytes in a Django InMemoryUploadedFile; the MIME type is guessed from file_name
+    content_type, _ = mimetypes.guess_type(file_name)
+    if content_type is None:
+        # Fall back to the generic binary type when the name gives no MIME hint
+        content_type = "application/octet-stream"
+    # Create an in-memory byte stream over the data
+    file_stream = io.BytesIO(file_bytes)
+
+    # File size in bytes
+    file_size = len(file_bytes)
+
+    # Build the InMemoryUploadedFile object
+    uploaded_file = InMemoryUploadedFile(
+        file=file_stream,
+        field_name=None,
+        name=file_name,
+        content_type=content_type,
+        size=file_size,
+        charset=None,
+    )
+    return uploaded_file
 
 
 class BaseDocumentExtractNode(IDocumentExtractNode):
-    def execute(self, document, **kwargs):
+    def execute(self, document, chat_id, **kwargs):
         get_buffer = FileBufferHandle().get_buffer
 
         self.context['document_list'] = document
@@ -19,6 +46,20 @@ def execute(self, document, **kwargs):
         if document is None or not isinstance(document, list):
             return NodeResult({'content': content}, {})
 
+        application = self.workflow_manage.work_flow_post_handler.chat_info.application
+
+        # Save the images embedded in doc files
+        def save_image(image_list):
+            for image in image_list:
+                meta = {
+                    'debug': False if application.id else True,  # debug is True only when the application has no id
+                    'chat_id': chat_id,
+                    'application_id': str(application.id) if application.id else None,
+                    'file_id': str(image.id)  # FileSerializer.upload uses meta['file_id'] as the stored File id
+                }
+                file = bytes_to_uploaded_file(image.image, image.image_name)
+                FileSerializer(data={'file': file, 'meta': meta}).upload()
+
         for doc in document:
             file = QuerySet(File).filter(id=doc['file_id']).first()
             buffer = io.BytesIO(file.get_byte().tobytes())
@@ -28,7 +69,7 @@ def execute(self, document, **kwargs):
                 if split_handle.support(buffer, get_buffer):
                     # 回到文件头
                     buffer.seek(0)
-                    file_content = split_handle.get_content(buffer)
+                    file_content = split_handle.get_content(buffer, save_image)
                     content.append('## ' + doc['name'] + '\n' + file_content)
                     break
 
diff --git a/apps/common/handle/base_split_handle.py b/apps/common/handle/base_split_handle.py
@@ -20,5 +20,5 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
         pass
 
     @abstractmethod
-    def get_content(self, file):
+    def get_content(self, file, save_image):  # save_image: callable that handlers may invoke with a list of extracted images
         pass
diff --git a/apps/common/handle/impl/doc_split_handle.py b/apps/common/handle/impl/doc_split_handle.py
@@ -190,12 +190,16 @@ def support(self, file, get_buffer):
             return True
         return False
 
-    def get_content(self, file):
+    def get_content(self, file, save_image):  # save_image: callback called with image_list when the document contains images
         try:
             image_list = []
             buffer = file.read()
             doc = Document(io.BytesIO(buffer))
-            return self.to_md(doc, image_list, get_image_id_func())
+            content = self.to_md(doc, image_list, get_image_id_func())
+            if len(image_list) > 0:  # presumably to_md fills image_list in place; verify against to_md
+                content = content.replace('/api/image/', '/api/file/')  # NOTE(review): replaces every occurrence, including any literal '/api/image/' in the document text
+                save_image(image_list)  # NOTE(review): an exception raised here is caught below and its message becomes the returned content
+            return content
         except BaseException as e:
             traceback.print_exception(e)
             return f'{e}'
diff --git a/apps/common/handle/impl/html_split_handle.py b/apps/common/handle/impl/html_split_handle.py
@@ -61,7 +61,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
                 'content': split_model.parse(content)
                 }
 
-    def get_content(self, file):
+    def get_content(self, file, save_image):
         buffer = file.read()
 
         try:
diff --git a/apps/common/handle/impl/pdf_split_handle.py b/apps/common/handle/impl/pdf_split_handle.py
@@ -309,7 +309,7 @@ def support(self, file, get_buffer):
             return True
         return False
 
-    def get_content(self, file):
+    def get_content(self, file, save_image):
         with tempfile.NamedTemporaryFile(delete=False) as temp_file:
             # 将上传的文件保存到临时文件中
             temp_file.write(file.read())
diff --git a/apps/common/handle/impl/text_split_handle.py b/apps/common/handle/impl/text_split_handle.py
@@ -51,7 +51,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu
                 'content': split_model.parse(content)
                 }
 
-    def get_content(self, file):
+    def get_content(self, file, save_image):
         buffer = file.read()
         try:
            return buffer.decode(detect(buffer)['encoding'])
diff --git a/apps/dataset/serializers/file_serializers.py b/apps/dataset/serializers/file_serializers.py
@@ -61,8 +61,9 @@ class FileSerializer(serializers.Serializer):
     def upload(self, with_valid=True):
         if with_valid:
             self.is_valid(raise_exception=True)
-        file_id = uuid.uuid1()
-        file = File(id=file_id, file_name=self.data.get('file').name, meta=self.data.get('meta'))
+        meta = self.data.get('meta')  # None when the caller supplied no meta; the pre-change code tolerated that, so it must not be dereferenced blindly
+        file_id = (meta or {}).get('file_id') or uuid.uuid1()  # use the caller-chosen id when present and non-empty, otherwise generate a fresh uuid1
+        file = File(id=file_id, file_name=self.data.get('file').name, meta=meta)
         file.save(self.data.get('file').read())
         return f'/api/file/{file_id}'
 

Original file line number	Diff line number	Diff line change
`@@ -61,7 +61,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu`
`61`	`61`	`'content': split_model.parse(content)`
`62`	`62`	`}`
`63`	`63`
`64`		`- def get_content(self, file):`
	`64`	`+ def get_content(self, file, save_image):`
`65`	`65`	`buffer = file.read()`
`66`	`66`
`67`	`67`	`try:`
Original file line number	Diff line number	Diff line change
`@@ -51,7 +51,7 @@ def handle(self, file, pattern_list: List, with_filter: bool, limit: int, get_bu`
`51`	`51`	`'content': split_model.parse(content)`
`52`	`52`	`}`
`53`	`53`
`54`		`- def get_content(self, file):`
	`54`	`+ def get_content(self, file, save_image):`
`55`	`55`	`buffer = file.read()`
`56`	`56`	`try:`
`57`	`57`	`return buffer.decode(detect(buffer)['encoding'])`