Skip to content

Commit 5922597

Browse files
committed
feat: enhance Document Split Node with result processing and problem list generation
1 parent 820b680 commit 5922597

File tree

6 files changed, with 46 additions and 40 deletions.

apps/application/flow/step_node/document_extract_node/i_document_extract_node.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,5 +25,5 @@ def _run(self):
2525
self.node_params_serializer.data.get('document_list')[1:])
2626
return self.execute(document=res, **self.flow_params_serializer.data)
2727

28-
def execute(self, document, chat_id, **kwargs) -> NodeResult:
28+
def execute(self, document, chat_id=None, **kwargs) -> NodeResult:
2929
pass

apps/application/flow/step_node/document_extract_node/impl/base_document_extract_node.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,23 +42,28 @@ class BaseDocumentExtractNode(IDocumentExtractNode):
4242
def save_context(self, details, workflow_manage):
4343
self.context['content'] = details.get('content')
4444

45-
def execute(self, document, chat_id, **kwargs):
45+
def execute(self, document, chat_id=None, **kwargs):
4646
get_buffer = FileBufferHandle().get_buffer
4747

4848
self.context['document_list'] = document
4949
content = []
5050
if document is None or not isinstance(document, list):
51-
return NodeResult({'content': ''}, {})
51+
return NodeResult({'content': '', 'document_list': []}, {})
5252

53-
application = self.workflow_manage.work_flow_post_handler.chat_info.application
53+
# 安全获取 application
54+
application = None
55+
if (self.workflow_manage and
56+
self.workflow_manage.work_flow_post_handler and
57+
self.workflow_manage.work_flow_post_handler.chat_info):
58+
application = self.workflow_manage.work_flow_post_handler.chat_info.application
5459

5560
# doc文件中的图片保存
5661
def save_image(image_list):
5762
for image in image_list:
5863
meta = {
59-
'debug': False if application.id else True,
64+
'debug': False if (application and application.id) else True,
6065
'chat_id': chat_id,
61-
'application_id': str(application.id) if application.id else None,
66+
'application_id': str(application.id) if (application and application.id) else None,
6267
'file_id': str(image.id)
6368
}
6469
file_bytes = image.meta.pop('content')
@@ -70,6 +75,7 @@ def save_image(image_list):
7075
'source_type': FileSourceType.APPLICATION.value
7176
}).upload()
7277

78+
document_list = []
7379
for doc in document:
7480
file = QuerySet(File).filter(id=doc['file_id']).first()
7581
buffer = io.BytesIO(file.get_bytes())
@@ -81,9 +87,10 @@ def save_image(image_list):
8187
buffer.seek(0)
8288
file_content = split_handle.get_content(buffer, save_image)
8389
content.append('### ' + doc['name'] + '\n' + file_content)
90+
document_list.append({'id': file.id, 'name': doc['name'], 'content': file_content})
8491
break
8592

86-
return NodeResult({'content': splitter.join(content)}, {})
93+
return NodeResult({'content': splitter.join(content), 'document_list': document_list}, {})
8794

8895
def get_details(self, index: int, **kwargs):
8996
content = self.context.get('content', '').split(splitter)

apps/application/flow/step_node/document_split_node/i_document_split_node.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111

1212
class DocumentSplitNodeSerializer(serializers.Serializer):
13-
file_list = serializers.ListField(required=False, label=_("file list"))
13+
document_list = serializers.ListField(required=False, label=_("document list"))
1414
split_strategy = serializers.ChoiceField(
1515
choices=['auto', 'custom', 'qa'], required=False, label=_("split strategy"), default='auto'
1616
)
@@ -53,11 +53,11 @@ def get_node_params_serializer_class(self) -> Type[serializers.Serializer]:
5353
return DocumentSplitNodeSerializer
5454

5555
def _run(self):
56-
res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('file_list')[0],
57-
self.node_params_serializer.data.get('file_list')[1:])
58-
return self.execute(files=res, **self.node_params_serializer.data, **self.flow_params_serializer.data)
56+
# res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('file_list')[0],
57+
# self.node_params_serializer.data.get('file_list')[1:])
58+
return self.execute(**self.node_params_serializer.data, **self.flow_params_serializer.data)
5959

60-
def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
60+
def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
6161
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
6262
document_name_relate_problem_type, document_name_relate_problem,
6363
document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:

apps/application/flow/step_node/document_split_node/impl/base_document_split_node.py

Lines changed: 21 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from application.flow.i_step_node import NodeResult
1010
from application.flow.step_node.document_split_node.i_document_split_node import IDocumentSplitNode
1111
from knowledge.models import File, FileSourceType
12-
from knowledge.serializers.document import split_handles, FileBufferHandle
12+
from knowledge.serializers.document import default_split_handle, FileBufferHandle
1313

1414

1515
def bytes_to_uploaded_file(file_bytes, file_name="file.txt"):
@@ -42,36 +42,31 @@ def save_context(self, details, workflow_manage):
4242
def get_reference_content(self, fields: List[str]):
4343
return self.workflow_manage.get_reference_field(fields[0], fields[1:])
4444

45-
def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
45+
def execute(self, document_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
4646
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
4747
document_name_relate_problem_type, document_name_relate_problem,
4848
document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
49-
get_buffer = FileBufferHandle().get_buffer
50-
self.context['file_list'] = files
5149
self.context['knowledge_id'] = knowledge_id
52-
50+
file_list = self.workflow_manage.get_reference_field(document_list[0], document_list[1:])
5351
paragraph_list = []
54-
for doc in files:
55-
file = QuerySet(File).filter(id=doc['file_id']).first()
56-
file_mem = bytes_to_uploaded_file(file.get_bytes(), file_name=file.file_name)
57-
58-
for split_handle in split_handles:
59-
if split_handle.support(file_mem, get_buffer):
60-
result = split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
61-
# 统一处理结果为列表
62-
results = result if isinstance(result, list) else [result]
63-
64-
for item in results:
65-
self._process_split_result(
66-
item, knowledge_id, file.id, file.file_name,
67-
split_strategy, paragraph_title_relate_problem_type,
68-
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
69-
document_name_relate_problem_type, document_name_relate_problem,
70-
document_name_relate_problem_reference
71-
)
72-
73-
paragraph_list = results
74-
break
52+
get_buffer = FileBufferHandle().get_buffer
53+
54+
for doc in file_list:
55+
file_mem = bytes_to_uploaded_file(doc['content'].encode('utf-8'), doc['name'])
56+
result = default_split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
57+
# 统一处理结果为列表
58+
results = result if isinstance(result, list) else [result]
59+
60+
for item in results:
61+
self._process_split_result(
62+
item, knowledge_id, doc['id'], doc['name'],
63+
split_strategy, paragraph_title_relate_problem_type,
64+
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
65+
document_name_relate_problem_type, document_name_relate_problem,
66+
document_name_relate_problem_reference
67+
)
68+
69+
paragraph_list = results
7570

7671
self.context['paragraph_list'] = paragraph_list
7772

ui/src/workflow/common/data.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,10 @@ export const documentExtractNode = {
383383
label: t('views.applicationWorkflow.nodes.documentExtractNode.content'),
384384
value: 'content',
385385
},
386+
{
387+
label: t('views.applicationWorkflow.nodes.dataSourceWebNode.field_label'),
388+
value: 'document_list',
389+
},
386390
],
387391
},
388392
},

ui/src/workflow/nodes/document-split-node/index.vue

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
:nodeModel="nodeModel"
2323
class="w-full"
2424
:placeholder="$t('views.chatLog.documentPlaceholder')"
25-
v-model="form_data.file_list"
25+
v-model="form_data.document_list"
2626
/>
2727
</el-form-item>
2828
<el-form-item
@@ -207,7 +207,7 @@ const props = defineProps<{ nodeModel: any }>()
207207
const splitPatternList = ref<Array<KeyValue<string, string>>>([])
208208
209209
const form = {
210-
file_list: [],
210+
document_list: [],
211211
split_strategy: 'auto',
212212
paragraph_title_relate_problem_type: 'custom',
213213
paragraph_title_relate_problem: false,

0 commit comments

Comments
 (0)