Skip to content

Commit b6c6e1b

Browse files
committed
feat: add Document Split Node functionality and localization
1 parent 9dc3f21 commit b6c6e1b

File tree

14 files changed

+487
-8
lines changed

14 files changed

+487
-8
lines changed

apps/application/flow/step_node/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
from .variable_assign_node import BaseVariableAssignNode
4040
from .variable_splitting_node import BaseVariableSplittingNode
4141
from .video_understand_step_node import BaseVideoUnderstandNode
42+
from .document_split_node import BaseDocumentSplitNode
4243

4344
node_list = [BaseStartStepNode, BaseChatNode, BaseSearchKnowledgeNode, BaseSearchDocumentNode, BaseQuestionNode,
4445
BaseConditionNode, BaseReplyNode,
@@ -50,7 +51,7 @@
5051
BaseIntentNode, BaseLoopNode, BaseLoopStartStepNode,
5152
BaseLoopContinueNode,
5253
BaseLoopBreakNode, BaseVariableSplittingNode, BaseParameterExtractionNode, BaseVariableAggregationNode,
53-
BaseDataSourceLocalNode, BaseDataSourceWebNode, BaseKnowledgeWriteNode]
54+
BaseDataSourceLocalNode, BaseDataSourceWebNode, BaseKnowledgeWriteNode, BaseDocumentSplitNode]
5455

5556
node_map = {n.type: {w: n for w in n.support} for n in node_list}
5657

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .impl import *
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# coding=utf-8
2+
3+
from typing import Type
4+
5+
from django.utils.translation import gettext_lazy as _
6+
from rest_framework import serializers
7+
8+
from application.flow.common import WorkflowMode
9+
from application.flow.i_step_node import INode, NodeResult
10+
11+
12+
class DocumentSplitNodeSerializer(serializers.Serializer):
    """Validate the node-level parameters of a document split node.

    Every field is optional (``required=False``). Fields that declare a
    ``default`` fall back to it when omitted; ``file_list`` and the two
    ``*_reference`` fields declare no default.
    """
    # Reference to the upstream field holding the files. The node resolves it
    # with workflow_manage.get_reference_field(file_list[0], file_list[1:]).
    file_list = serializers.ListField(required=False, label=_("file list"))
    split_strategy = serializers.ChoiceField(
        choices=['auto', 'custom', 'qa'], required=False, label=_("split strategy"), default='auto'
    )
    paragraph_title_relate_problem_type = serializers.ChoiceField(
        choices=['custom', 'referencing'], required=False, label=_("paragraph title relate problem type"),
        default='custom'
    )
    paragraph_title_relate_problem = serializers.BooleanField(
        required=False, label=_("paragraph title relate problem"), default=False
    )
    paragraph_title_relate_problem_reference = serializers.ListField(
        required=False, label=_("paragraph title relate problem reference"), child=serializers.CharField()
    )
    document_name_relate_problem_type = serializers.ChoiceField(
        choices=['custom', 'referencing'], required=False, label=_("document name relate problem type"),
        default='custom'
    )
    document_name_relate_problem = serializers.BooleanField(
        required=False, label=_("document name relate problem"), default=False
    )
    document_name_relate_problem_reference = serializers.ListField(
        required=False, label=_("document name relate problem reference"), child=serializers.CharField()
    )
    limit = serializers.IntegerField(required=False, label=_("limit"), default=4096)
    # A callable default yields a fresh list per use; a literal ``[]`` would be
    # one shared object that any consumer could mutate for every later run.
    patterns = serializers.ListField(
        required=False, label=_("patterns"), child=serializers.CharField(), default=list
    )
    with_filter = serializers.BooleanField(
        required=False, label=_("with filter"), default=False
    )
44+
45+
46+
class IDocumentSplitNode(INode):
    """Interface of the workflow node that splits referenced files into paragraphs.

    Concrete nodes implement :meth:`execute`; :meth:`_run` resolves the file
    reference and gathers the keyword arguments ``execute`` is called with.
    """
    type = 'document-split-node'
    support = [
        WorkflowMode.APPLICATION, WorkflowMode.APPLICATION_LOOP, WorkflowMode.KNOWLEDGE_LOOP, WorkflowMode.KNOWLEDGE
    ]

    def get_node_params_serializer_class(self) -> Type[serializers.Serializer]:
        """Return the serializer class used to validate this node's parameters."""
        return DocumentSplitNodeSerializer

    def _run(self):
        """Resolve the referenced file list and delegate to :meth:`execute`.

        Raises:
            ValueError: if the node has no ``file_list`` reference configured.
        """
        node_params = self.node_params_serializer.data
        file_reference = node_params.get('file_list')
        if not file_reference:
            # ``file_list`` is optional in the serializer; fail with a clear
            # message instead of an opaque subscript error on None / [].
            raise ValueError("document-split-node: 'file_list' reference is not configured")
        res = self.workflow_manage.get_reference_field(file_reference[0], file_reference[1:])
        execute_params = {
            # NOTE(review): fallbacks for arguments that have no serializer
            # default; assumes flow params may not carry ``knowledge_id`` in
            # every supported workflow mode - confirm against the flow serializers.
            'knowledge_id': None,
            'paragraph_title_relate_problem_reference': [],
            'document_name_relate_problem_reference': [],
            # The validated node configuration (split_strategy, limit, patterns,
            # ...) supplies execute()'s required arguments.
            **node_params,
            # Flow params keep precedence so anything that was passed before
            # is still passed with the same value.
            **self.flow_params_serializer.data,
            # The resolved files replace the raw reference path.
            'file_list': res,
        }
        return self.execute(**execute_params)

    def execute(self, file_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
                paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
                document_name_relate_problem_type, document_name_relate_problem,
                document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
        """Split the given files; implemented by subclasses.

        ``file_list`` is the value resolved from the node's file reference.
        The remaining named parameters mirror the fields of
        ``DocumentSplitNodeSerializer`` plus ``knowledge_id``; any other
        parameters arrive in ``kwargs``.
        """
        pass
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
from .base_document_split_node import BaseDocumentSplitNode
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# coding=utf-8
2+
from django.db.models import QuerySet
3+
4+
from application.flow.i_step_node import NodeResult
5+
from application.flow.step_node.document_split_node.i_document_split_node import IDocumentSplitNode
6+
from knowledge.models import File
7+
from knowledge.serializers.document import split_handles, FileBufferHandle
8+
9+
10+
class BaseDocumentSplitNode(IDocumentSplitNode):
    """Document split node that turns stored files into paragraph data.

    Each file is loaded by id and passed to the first handler in
    ``split_handles`` that supports it. The paragraphs of all files are
    collected and exposed as the ``paragraph_list`` output.
    """

    def save_context(self, details, workflow_manage):
        """Restore this node's context from previously saved details."""
        self.context['content'] = details.get('content')
        # Mirror the keys written by get_details() so the node output can be
        # restored from saved details as well.
        self.context['file_list'] = details.get('file_list')
        self.context['paragraph_list'] = details.get('paragraph_list', [])

    def execute(self, file_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
                paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
                document_name_relate_problem_type, document_name_relate_problem,
                document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
        """Split every file in ``file_list`` and return the combined result.

        Only ``file_list``, ``patterns``, ``with_filter`` and ``limit`` are
        used here; the other parameters are accepted to satisfy the interface.

        Returns:
            NodeResult whose ``paragraph_list`` holds the split result of all
            files, each entry tagged with ``source_file_id``.

        Raises:
            ValueError: if a referenced file id does not exist.
        """
        paragraph_list = []
        # A reference that resolves to nothing is treated as "no files".
        for doc in file_list or []:
            # Accumulate per-file results; assigning here would keep only the
            # last file's paragraphs.
            paragraph_list.extend(self._split_file(doc, patterns, with_filter, limit))

        self.context['file_list'] = file_list
        self.context['paragraph_list'] = paragraph_list

        return NodeResult({'paragraph_list': paragraph_list}, {})

    def _split_file(self, doc, patterns, with_filter, limit):
        """Split one file entry and return its results as a list."""
        file = QuerySet(File).filter(id=doc['file_id']).first()
        if file is None:
            raise ValueError(f"document-split-node: file {doc['file_id']} does not exist")
        # NOTE(review): a fresh handle per file, presumably FileBufferHandle
        # caches the bytes it reads - confirm; sharing one would then hand the
        # first file's content to every later file.
        get_buffer = FileBufferHandle().get_buffer
        for split_handle in split_handles:
            if not split_handle.support(file, get_buffer):
                continue
            result = split_handle.handle(file, patterns, with_filter, limit, get_buffer, self.save_image)
            results = result if isinstance(result, list) else [result]
            for item in results:
                item['source_file_id'] = file.id
            # NOTE(review): stop at the first supporting handler; running the
            # remaining ones let a later handler replace this result. Assumes
            # split_handles is ordered by priority - confirm.
            return results
        # No handler supports this file: it contributes nothing.
        return []

    def save_image(self, image_list):
        """Image callback handed to the split handlers; intentionally a no-op."""
        # TODO: persist extracted images; the earlier draft relied on names
        # that are not available in this module.
        pass

    def get_details(self, index: int, **kwargs):
        """Return the execution details recorded for this node."""
        return {
            'name': self.node.properties.get('stepName'),
            "index": index,
            'run_time': self.context.get('run_time'),
            'type': self.node.type,
            'status': self.status,
            'err_message': self.err_message,
            'file_list': self.context.get('file_list'),
            'paragraph_list': self.context.get('paragraph_list', []),
        }

ui/src/enums/application.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ export enum WorkflowType {
1919
RerankerNode = 'reranker-node',
2020
Application = 'application-node',
2121
DocumentExtractNode = 'document-extract-node',
22+
DocumentSplitNode = 'document-split-node',
2223
ImageUnderstandNode = 'image-understand-node',
2324
VariableAssignNode = 'variable-assign-node',
2425
FormNode = 'form-node',

ui/src/locales/lang/en-US/views/application-workflow.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ export default {
8383
chunk_length: 'Chunk length',
8484
text: 'Knowledge write',
8585
label: 'Knowledge write',
86-
},
86+
},
8787
dataSourceWebNode: {
8888
label: 'Web Site',
8989
text: 'Web Site',
@@ -250,6 +250,16 @@ You are a master of problem optimization, adept at accurately inferring user int
250250
text: 'Extract content from documents',
251251
content: 'Document Content',
252252
},
253+
documentSplitNode: {
254+
label: 'Document Splitting',
255+
text: 'Split document content into smaller segments',
256+
paragraph_list: 'List of split segments',
257+
splitStrategy: {
258+
label: 'Splitting Strategy',
259+
placeholder: 'Please select a splitting strategy',
260+
requiredMessage: 'Please select a splitting strategy',
261+
},
262+
},
253263
imageUnderstandNode: {
254264
label: 'Image Understanding',
255265
text: 'Analyze images to identify objects, scenes, and provide answers',

ui/src/locales/lang/zh-CN/views/application-workflow.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ export default {
8585
chunk_length: '子分块长度',
8686
text: '知识库写入',
8787
label: '知识库写入',
88-
},
88+
},
8989
dataSourceWebNode: {
9090
label: 'Web站点',
9191
text: 'Web站点',
@@ -256,6 +256,16 @@ export default {
256256
text: '提取文档中的内容',
257257
content: '文档内容',
258258
},
259+
documentSplitNode: {
260+
label: '文档分段',
261+
text: '将文档内容拆分为多个分段',
262+
paragraph_list: '分段列表',
263+
splitStrategy: {
264+
label: '分段策略',
265+
placeholder: '请选择分段策略',
266+
requiredMessage: '请选择分段策略',
267+
},
268+
},
259269
imageUnderstandNode: {
260270
label: '图片理解',
261271
text: '识别出图片中的对象、场景等信息回答用户问题',

ui/src/locales/lang/zh-Hant/views/application-workflow.ts

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ export default {
8484
chunk_length: '子分塊長度',
8585
text: '知識庫寫入',
8686
label: '知識庫寫入',
87-
},
87+
},
8888
dataSourceWebNode: {
8989
label: 'Web網站',
9090
text: 'Web網站',
@@ -250,6 +250,16 @@ export default {
250250
text: '提取文檔中的內容',
251251
content: '文檔內容',
252252
},
253+
documentSplitNode: {
254+
label: '文檔拆分',
255+
text: '將文檔內容拆分為多個分段',
256+
paragraph_list: '分段列表',
257+
splitStrategy: {
258+
label: '分段策略',
259+
placeholder: '請選擇分段策略',
260+
requiredMessage: '請選擇分段策略',
261+
},
262+
},
253263
imageUnderstandNode: {
254264
label: '圖片理解',
255265
text: '識別出圖片中的物件、場景等信息回答用戶問題',

ui/src/workflow/common/data.ts

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,24 @@ export const documentExtractNode = {
387387
},
388388
},
389389
}
390+
// Default definition of the document split workflow node: its type, the
// translated palette text/label, and the initial node properties.
export const documentSplitNode = {
391+
type: WorkflowType.DocumentSplitNode,
392+
text: t('views.applicationWorkflow.nodes.documentSplitNode.text'),
393+
label: t('views.applicationWorkflow.nodes.documentSplitNode.label'),
394+
height: 252,
395+
properties: {
396+
width: 500,
397+
// The initial step name reuses the node label translation.
stepName: t('views.applicationWorkflow.nodes.documentSplitNode.label'),
398+
config: {
399+
// Output fields this node exposes.
fields: [
400+
{
401+
label: t('views.applicationWorkflow.nodes.documentSplitNode.paragraph_list'),
402+
// Same key the backend node puts in its NodeResult ('paragraph_list').
value: 'paragraph_list',
403+
},
404+
],
405+
},
406+
},
407+
}
390408
export const imageUnderstandNode = {
391409
type: WorkflowType.ImageUnderstandNode,
392410
text: t('views.applicationWorkflow.nodes.imageUnderstandNode.text'),
@@ -724,7 +742,7 @@ export const knowledgeMenuNodes = [
724742
},
725743
{
726744
label: t('views.knowledge.title'),
727-
list: [documentExtractNode, knowledgeWriteNode],
745+
list: [documentExtractNode, documentSplitNode, knowledgeWriteNode],
728746
},
729747
{
730748
label: t('views.applicationWorkflow.nodes.classify.businessLogic'),
@@ -763,7 +781,7 @@ export const menuNodes = [
763781
},
764782
{
765783
label: t('views.knowledge.title'),
766-
list: [searchKnowledgeNode, searchDocumentNode, rerankerNode, documentExtractNode],
784+
list: [searchKnowledgeNode, searchDocumentNode, rerankerNode, documentExtractNode, documentSplitNode, knowledgeWriteNode],
767785
},
768786
{
769787
label: t('views.applicationWorkflow.nodes.classify.businessLogic'),
@@ -949,6 +967,7 @@ export const nodeDict: any = {
949967
[WorkflowType.FormNode]: formNode,
950968
[WorkflowType.Application]: applicationNode,
951969
[WorkflowType.DocumentExtractNode]: documentExtractNode,
970+
[WorkflowType.DocumentSplitNode]: documentSplitNode,
952971
[WorkflowType.ImageUnderstandNode]: imageUnderstandNode,
953972
[WorkflowType.TextToSpeechNode]: textToSpeechNode,
954973
[WorkflowType.SpeechToTextNode]: speechToTextNode,

0 commit comments

Comments
 (0)