Skip to content

Commit 80f14f1

Browse files
committed
feat: enhance Document Split Node with result processing and problem list generation
1 parent 1da372e commit 80f14f1

File tree

1 file changed

+72
-10
lines changed

1 file changed

+72
-10
lines changed

apps/application/flow/step_node/document_split_node/impl/base_document_split_node.py

Lines changed: 72 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# coding=utf-8
22
import io
33
import mimetypes
4+
from typing import List
45

56
from django.core.files.uploadedfile import InMemoryUploadedFile
67
from django.db.models import QuerySet
@@ -38,6 +39,9 @@ class BaseDocumentSplitNode(IDocumentSplitNode):
3839
def save_context(self, details, workflow_manage):
    """Copy the saved ``content`` entry from ``details`` into the node context.

    ``details.get('content')`` is stored under ``self.context['content']``;
    when the key is absent the stored value is ``None``.
    ``workflow_manage`` is accepted but not used by this method.
    """
    restored_content = details.get('content')
    self.context['content'] = restored_content
4041

42+
def get_reference_content(self, fields: List[str]):
    """Look up a referenced value through the workflow manager.

    The first element of ``fields`` is passed to
    ``self.workflow_manage.get_reference_field`` as its first argument and
    the remaining elements (``fields[1:]``) as its second. Whatever that
    call returns is returned unchanged.
    """
    head = fields[0]
    remainder = fields[1:]
    return self.workflow_manage.get_reference_field(head, remainder)
44+
4145
def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
4246
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
4347
document_name_relate_problem_type, document_name_relate_problem,
@@ -53,21 +57,27 @@ def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_pr
5357

5458
for split_handle in split_handles:
5559
if split_handle.support(file_mem, get_buffer):
56-
result = split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self.save_image)
57-
if isinstance(result, list):
58-
for item in result:
59-
item['source_file_id'] = file.id
60-
paragraph_list = result
61-
else:
62-
result['source_file_id'] = file.id
63-
paragraph_list = [result]
60+
result = split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self._save_image)
61+
# 统一处理结果为列表
62+
results = result if isinstance(result, list) else [result]
63+
64+
for item in results:
65+
self._process_split_result(
66+
item, knowledge_id, file.id, file.file_name,
67+
split_strategy, paragraph_title_relate_problem_type,
68+
paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
69+
document_name_relate_problem_type, document_name_relate_problem,
70+
document_name_relate_problem_reference
71+
)
72+
73+
paragraph_list = results
74+
break
6475

6576
self.context['paragraph_list'] = paragraph_list
6677

67-
6878
return NodeResult({'paragraph_list': paragraph_list}, {})
6979

70-
def save_image(self, image_list):
80+
def _save_image(self, image_list):
7181
if image_list is not None and len(image_list) > 0:
7282
exist_image_list = [str(i.get('id')) for i in
7383
QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')]
@@ -81,6 +91,58 @@ def save_image(self, image_list):
8191
file.source_id = self.context.get('knowledge_id')
8292
file.save(file_bytes)
8393

94+
def _process_split_result(
        self, item, knowledge_id, source_file_id, file_name,
        split_strategy, paragraph_title_relate_problem_type,
        paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
        document_name_relate_problem_type, document_name_relate_problem,
        document_name_relate_problem_reference
):
    """Process one document split result in place.

    * ``item['meta']`` is set to the knowledge id and source file id.
    * The ``content`` entry is removed from ``item`` and stored again under
      ``paragraphs`` (an empty list when ``content`` is absent).
    * Every paragraph receives a ``problem_list`` built by
      ``self._generate_problem_list`` and is flagged ``is_active = True``.

    Nothing is returned; ``item`` and its paragraphs are mutated.
    """
    item['meta'] = dict(knowledge_id=knowledge_id, source_file_id=source_file_id)
    paragraphs = item.pop('content', [])
    item['paragraphs'] = paragraphs

    # These settings are identical for every paragraph of the item, so they
    # are bundled once and forwarded positionally.
    problem_options = (
        split_strategy, paragraph_title_relate_problem_type,
        paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
        document_name_relate_problem_type, document_name_relate_problem,
        document_name_relate_problem_reference,
    )
    for paragraph in paragraphs:
        paragraph['problem_list'] = self._generate_problem_list(
            paragraph, file_name, *problem_options
        )
        paragraph['is_active'] = True
117+
118+
def _generate_problem_list(
        self, paragraph, document_name, split_strategy, paragraph_title_relate_problem_type,
        paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
        document_name_relate_problem_type, document_name_relate_problem,
        document_name_relate_problem_reference
):
    """Build the list of related problems for a single paragraph.

    When a ``*_relate_problem_type`` argument equals ``'referencing'``, the
    matching ``*_relate_problem`` value is replaced by the result of
    ``self.get_reference_content`` on the corresponding reference path.

    The result then depends on ``split_strategy``:

    * ``'auto'``: the settings act as flags. The paragraph title and/or the
      document name are appended when the flag and the value are truthy.
    * ``'custom'``: the settings supply the problems themselves. A string is
      added as one problem; any other truthy value is passed to
      ``list.extend``.
    * ``'qa'``: only the document name is appended, when its setting and the
      name are truthy.
    * any other strategy: an empty list.

    :return: a new list of problems, possibly empty.
    """
    if paragraph_title_relate_problem_type == 'referencing':
        paragraph_title_relate_problem = self.get_reference_content(paragraph_title_relate_problem_reference)
    if document_name_relate_problem_type == 'referencing':
        document_name_relate_problem = self.get_reference_content(document_name_relate_problem_reference)

    problem_list = []
    if split_strategy == 'auto':
        if paragraph_title_relate_problem and paragraph.get('title'):
            problem_list.append(paragraph.get('title'))
        if document_name_relate_problem and document_name:
            problem_list.append(document_name)
    elif split_strategy == 'custom':
        for configured in (paragraph_title_relate_problem, document_name_relate_problem):
            if not configured:
                continue
            if isinstance(configured, str):
                # list.extend() on a str would add one problem per character,
                # so a single string is kept as one problem.
                problem_list.append(configured)
            else:
                problem_list.extend(configured)
    elif split_strategy == 'qa':
        if document_name_relate_problem and document_name:
            problem_list.append(document_name)

    return problem_list
145+
84146
def get_details(self, index: int, **kwargs):
85147
return {
86148
'name': self.node.properties.get('stepName'),

0 commit comments

Comments
 (0)