Commit 27e5c65

feat: add Document Split Node functionality and localization
Parent: b6c6e1b

2 files changed: +54 -28 lines

apps/application/flow/step_node/document_split_node/i_document_split_node.py

Lines changed: 4 additions & 4 deletions

@@ -22,7 +22,7 @@ class DocumentSplitNodeSerializer(serializers.Serializer):
         required=False, label=_("paragraph title relate problem"), default=False
     )
     paragraph_title_relate_problem_reference = serializers.ListField(
-        required=False, label=_("paragraph title relate problem reference"), child=serializers.CharField()
+        required=False, label=_("paragraph title relate problem reference"), child=serializers.CharField(), default=[]
     )
     document_name_relate_problem_type = serializers.ChoiceField(
         choices=['custom', 'referencing'], required=False, label=_("document name relate problem type"),
@@ -32,7 +32,7 @@ class DocumentSplitNodeSerializer(serializers.Serializer):
         required=False, label=_("document name relate problem"), default=False
     )
     document_name_relate_problem_reference = serializers.ListField(
-        required=False, label=_("document name relate problem reference"), child=serializers.CharField()
+        required=False, label=_("document name relate problem reference"), child=serializers.CharField(), default=[]
     )
     limit = serializers.IntegerField(required=False, label=_("limit"), default=4096)
     patterns = serializers.ListField(
@@ -55,9 +55,9 @@ def get_node_params_serializer_class(self) -> Type[serializers.Serializer]:
     def _run(self):
         res = self.workflow_manage.get_reference_field(self.node_params_serializer.data.get('file_list')[0],
                                                        self.node_params_serializer.data.get('file_list')[1:])
-        return self.execute(file_list=res, **self.flow_params_serializer.data)
+        return self.execute(files=res, **self.node_params_serializer.data, **self.flow_params_serializer.data)
 
-    def execute(self, file_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
+    def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
                 paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
                 document_name_relate_problem_type, document_name_relate_problem,
                 document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
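
Note on the new default=[]: _run now expands **self.node_params_serializer.data into execute(), so a ListField omitted from the input must still yield a key, otherwise execute() would be called without that keyword argument. A minimal sketch of the effect, assuming standard Django REST Framework behaviour in a configured DRF environment (ExampleSerializer is a hypothetical stand-in, not the project's class):

from rest_framework import serializers


class ExampleSerializer(serializers.Serializer):
    # Hypothetical fields mirroring the two reference ListFields above
    reference = serializers.ListField(required=False, child=serializers.CharField())
    reference_with_default = serializers.ListField(
        required=False, child=serializers.CharField(), default=[]
    )


s = ExampleSerializer(data={})
s.is_valid(raise_exception=True)
print(dict(s.validated_data))
# {'reference_with_default': []}
# 'reference' is absent because it has no default, so **-expanding this data
# would drop that keyword argument entirely.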

apps/application/flow/step_node/document_split_node/impl/base_document_split_node.py

Lines changed: 50 additions & 24 deletions

@@ -1,59 +1,85 @@
 # coding=utf-8
+import io
+import mimetypes
+
+from django.core.files.uploadedfile import InMemoryUploadedFile
 from django.db.models import QuerySet
 
 from application.flow.i_step_node import NodeResult
 from application.flow.step_node.document_split_node.i_document_split_node import IDocumentSplitNode
-from knowledge.models import File
+from knowledge.models import File, FileSourceType
 from knowledge.serializers.document import split_handles, FileBufferHandle
 
 
+def bytes_to_uploaded_file(file_bytes, file_name="file.txt"):
+    content_type, _ = mimetypes.guess_type(file_name)
+    if content_type is None:
+        # If the type cannot be recognised, fall back to the default binary content type
+        content_type = "application/octet-stream"
+    # Create an in-memory byte-stream object
+    file_stream = io.BytesIO(file_bytes)
+
+    # Get the file size
+    file_size = len(file_bytes)
+
+    # Create the InMemoryUploadedFile object
+    uploaded_file = InMemoryUploadedFile(
+        file=file_stream,
+        field_name=None,
+        name=file_name,
+        content_type=content_type,
+        size=file_size,
+        charset=None,
+    )
+    return uploaded_file
+
+
 class BaseDocumentSplitNode(IDocumentSplitNode):
     def save_context(self, details, workflow_manage):
         self.context['content'] = details.get('content')
-        print(details)
 
-    def execute(self, file_list, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
+    def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_problem_type,
                 paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
                 document_name_relate_problem_type, document_name_relate_problem,
                 document_name_relate_problem_reference, limit, patterns, with_filter, **kwargs) -> NodeResult:
         get_buffer = FileBufferHandle().get_buffer
+        self.context['file_list'] = files
+        self.context['knowledge_id'] = knowledge_id
 
         paragraph_list = []
-        for doc in file_list:
+        for doc in files:
             file = QuerySet(File).filter(id=doc['file_id']).first()
-            file_id = file.id
+            file_mem = bytes_to_uploaded_file(file.get_bytes(), file_name=file.file_name)
+
             for split_handle in split_handles:
-                if split_handle.support(file, get_buffer):
-                    result = split_handle.handle(file, patterns, with_filter, limit, get_buffer, self.save_image)
+                if split_handle.support(file_mem, get_buffer):
+                    result = split_handle.handle(file_mem, patterns, with_filter, limit, get_buffer, self.save_image)
                     if isinstance(result, list):
                         for item in result:
-                            item['source_file_id'] = file_id
+                            item['source_file_id'] = file.id
                         paragraph_list = result
                     else:
-                        result['source_file_id'] = file_id
+                        result['source_file_id'] = file.id
                        paragraph_list = [result]
 
-        self.context['file_list'] = file_list
         self.context['paragraph_list'] = paragraph_list
 
-        print(paragraph_list)
 
         return NodeResult({'paragraph_list': paragraph_list}, {})
 
     def save_image(self, image_list):
-        # if image_list is not None and len(image_list) > 0:
-        #     exist_image_list = [str(i.get('id')) for i in
-        #                         QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')]
-        #     save_image_list = [image for image in image_list if not exist_image_list.__contains__(str(image.id))]
-        #     save_image_list = list({img.id: img for img in save_image_list}.values())
-        #     # save image
-        #     for file in save_image_list:
-        #         file_bytes = file.meta.pop('content')
-        #         file.meta['knowledge_id'] = self.data.get('knowledge_id')
-        #         file.source_type = FileSourceType.KNOWLEDGE
-        #         file.source_id = self.data.get('knowledge_id')
-        #         file.save(file_bytes)
-        pass
+        if image_list is not None and len(image_list) > 0:
+            exist_image_list = [str(i.get('id')) for i in
+                                QuerySet(File).filter(id__in=[i.id for i in image_list]).values('id')]
+            save_image_list = [image for image in image_list if not exist_image_list.__contains__(str(image.id))]
+            save_image_list = list({img.id: img for img in save_image_list}.values())
+            # save image
+            for file in save_image_list:
+                file_bytes = file.meta.pop('content')
+                file.meta['knowledge_id'] = self.context.get('knowledge_id')
+                file.source_type = FileSourceType.KNOWLEDGE
+                file.source_id = self.context.get('knowledge_id')
+                file.save(file_bytes)
 
     def get_details(self, index: int, **kwargs):
         return {
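
For reference, a self-contained sketch of what the new bytes_to_uploaded_file helper does: it wraps raw stored bytes in a Django InMemoryUploadedFile so the existing split_handles can treat a workflow file like a normal upload instead of the knowledge.models.File row. The helper body restates the diff above; the sample file name and bytes are invented for illustration.

import io
import mimetypes

from django.core.files.uploadedfile import InMemoryUploadedFile


def bytes_to_uploaded_file(file_bytes, file_name="file.txt"):
    # Guess the content type from the name, falling back to a generic binary type
    content_type, _ = mimetypes.guess_type(file_name)
    if content_type is None:
        content_type = "application/octet-stream"
    return InMemoryUploadedFile(
        file=io.BytesIO(file_bytes),
        field_name=None,
        name=file_name,
        content_type=content_type,
        size=len(file_bytes),
        charset=None,
    )


# Invented sample data, purely for illustration
raw = "hello world".encode("utf-8")
upload = bytes_to_uploaded_file(raw, file_name="notes.txt")
print(upload.name, upload.content_type, upload.size)  # notes.txt text/plain 11
print(upload.read() == raw)                           # True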
