feat: enhance Document Split Node with result processing and problem list generation

liuruibin · liuruibin · commit 592259777525 · 2025-11-21T15:44:56.000+08:00
diff --git a/apps/application/flow/step_node/document_extract_node/i_document_extract_node.py b/apps/application/flow/step_node/document_extract_node/i_document_extract_node.py
@@ -25,5 +25,5 @@ def _run(self):
                                                        self.node_params_serializer.data.get('document_list')[1:])
         return self.execute(document=res, **self.flow_params_serializer.data)
 
-    def execute(self, document, chat_id, **kwargs) -> NodeResult:
+    def execute(self, document, chat_id=None, **kwargs) -> NodeResult:
         pass
diff --git a/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py b/apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py
@@ -42,23 +42,28 @@ class BaseDocumentExtractNode(IDocumentExtractNode):
     def save_context(self, details, workflow_manage):
         self.context['content'] = details.get('content')
 
-    def execute(self, document, chat_id, **kwargs):
+    def execute(self, document, chat_id=None, **kwargs):
         get_buffer = FileBufferHandle().get_buffer
 
         self.context['document_list'] = document
         content = []
         if document is None or not isinstance(document, list):
-            return NodeResult({'content': ''}, {})
+            return NodeResult({'content': '', 'document_list': []}, {})
 
-        application = self.workflow_manage.work_flow_post_handler.chat_info.application
+        # Safely obtain application; it stays None if any link in the chain is missing
+        application = None
+        if (self.workflow_manage and
+                self.workflow_manage.work_flow_post_handler and
+                self.workflow_manage.work_flow_post_handler.chat_info):
+            application = self.workflow_manage.work_flow_post_handler.chat_info.application
 
         # doc文件中的图片保存
         def save_image(image_list):
             for image in image_list:
                 meta = {
-                    'debug': False if application.id else True,
+                    'debug': False if (application and application.id) else True,
                     'chat_id': chat_id,
-                    'application_id': str(application.id) if application.id else None,
+                    'application_id': str(application.id) if (application and application.id) else None,
                     'file_id': str(image.id)
                 }
                 file_bytes = image.meta.pop('content')
@@ -70,6 +75,7 @@ def save_image(image_list):
                     'source_type': FileSourceType.APPLICATION.value
                 }).upload()
 
+        document_list = []
         for doc in document:
             file = QuerySet(File).filter(id=doc['file_id']).first()
             buffer = io.BytesIO(file.get_bytes())
@@ -81,9 +87,10 @@ def save_image(image_list):
                     buffer.seek(0)
                     file_content = split_handle.get_content(buffer, save_image)
                     content.append('### ' + doc['name'] + '\n' + file_content)
+                    document_list.append({'id': file.id, 'name': doc['name'], 'content': file_content})
                     break
 
-        return NodeResult({'content': splitter.join(content)}, {})
+        return NodeResult({'content': splitter.join(content), 'document_list': document_list}, {})
 
     def get_details(self, index: int, **kwargs):
         content = self.context.get('content', '').split(splitter)
diff --git a/apps/application/flow/step_node/document_split_node/i_document_split_node.py b/apps/application/flow/step_node/document_split_node/i_document_split_node.py
@@ -10,7 +10,7 @@
 
 
 class DocumentSplitNodeSerializer(serializers.Serializer):
-    file_list = serializers.ListField(required=False, label=_("file list"))
+    document_list = serializers.ListField(required=False, label=_("document list"))
     split_strategy = serializers.ChoiceField(
         choices=['auto', 'custom', 'qa'], required=False, label=_("split strategy"), default='auto'
     )
@@ -53,11 +53,11 @@ def get_node_params_serializer_class(self) -> Type[serializers.Serializer]:
         return DocumentSplitNodeSerializer
 
     def _run(self):
-        res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('file_list')[0],
-                                                       self.node_params_serializer.data.get('file_list')[1:])
-        return self.execute(files=res, **self.node_params_serializer.data, **self.flow_params_serializer.data)
+        # res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('file_list')[0],
+        #                                                self.node_params_serializer.data.get('file_list')[1:])
+        return self.execute(**self.node_params_serializer.data, **self.flow_params_serializer.data)
 
-    def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
+    def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
                 paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
                 document_name_relate_problem_type, document_name_relate_problem,
                 document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
diff --git a/apps/application/flow/step_node/document_split_node/impl/base_document_split_node.py b/apps/application/flow/step_node/document_split_node/impl/base_document_split_node.py
@@ -9,7 +9,7 @@
 from application.flow.i_step_node import NodeResult
 from application.flow.step_node.document_split_node.i_document_split_node import IDocumentSplitNode
 from knowledge.models import File, FileSourceType
-from knowledge.serializers.document import split_handles, FileBufferHandle
+from knowledge.serializers.document import default_split_handle, FileBufferHandle
 
 
 def bytes_to_uploaded_file(file_bytes, file_name="file.txt"):
@@ -42,36 +42,31 @@ def save_context(self, details, workflow_manage):
     def get_reference_content(self, fields: List[str]):
         return self.workflow_manage.get_reference_field(fields[0], fields[1:])
 
-    def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
+    def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
                 paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
                 document_name_relate_problem_type, document_name_relate_problem,
                 document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
-        get_buffer = FileBufferHandle().get_buffer
-        self.context['file_list'] = files
         self.context['knowledge_id'] = knowledge_id
-
+        file_list = self.workflow_manage.get_reference_field(document_list[0], document_list[1:])
         paragraph_list = []
-        for doc in files:
-            file = QuerySet(File).filter(id=doc['file_id']).first()
-            file_mem = bytes_to_uploaded_file(file.get_bytes(), file_name=file.file_name)
-
-            for split_handle in split_handles:
-                if split_handle.support(file_mem, get_buffer):
-                    result = split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
-                    # 统一处理结果为列表
-                    results = result if isinstance(result, list) else [result]
-
-                    for item in results:
-                        self._process_split_result(
-                            item, knowledge_id, file.id, file.file_name,
-                            split_strategy, paragraph_title_relate_problem_type,
-                            paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
-                            document_name_relate_problem_type, document_name_relate_problem,
-                            document_name_relate_problem_reference
-                        )
-
-                    paragraph_list = results
-                    break
+        get_buffer = FileBufferHandle().get_buffer
+
+        for doc in file_list:
+            file_mem = bytes_to_uploaded_file(doc['content'].encode('utf-8'), doc['name'])
+            result = default_split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
+            # Normalize the result into a list
+            results = result if isinstance(result, list) else [result]
+
+            for item in results:
+                self._process_split_result(
+                    item, knowledge_id, doc['id'], doc['name'],
+                    split_strategy, paragraph_title_relate_problem_type,
+                    paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
+                    document_name_relate_problem_type, document_name_relate_problem,
+                    document_name_relate_problem_reference
+                )
+
+            paragraph_list = results
 
         self.context['paragraph_list'] = paragraph_list
 
diff --git a/ui/src/workflow/common/data.ts b/ui/src/workflow/common/data.ts
@@ -383,6 +383,10 @@ export const documentExtractNode = {
           label: t('views.applicationWorkflow.nodes.documentExtractNode.content'),
           value: 'content',
         },
+        {
+          label: t('views.applicationWorkflow.nodes.dataSourceWebNode.field_label'),
+          value: 'document_list',
+        },
       ],
     },
   },
diff --git a/ui/src/workflow/nodes/document-split-node/index.vue b/ui/src/workflow/nodes/document-split-node/index.vue
@@ -22,7 +22,7 @@
             :nodeModel="nodeModel"
             class="w-full"
             :placeholder="$t('views.chatLog.documentPlaceholder')"
-            v-model="form_data.file_list"
+            v-model="form_data.document_list"
           />
         </el-form-item>
         <el-form-item
@@ -207,7 +207,7 @@ const props = defineProps<{ nodeModel: any }>()
 const splitPatternList = ref<Array<KeyValue<string, string>>>([])
 
 const form = {
-  file_list: [],
+  document_list: [],
   split_strategy: 'auto',
   paragraph_title_relate_problem_type: 'custom',
   paragraph_title_relate_problem: false,