11# coding=utf-8
22import io
33import mimetypes
4+ from typing import List
45
56from django .core .files .uploadedfile import InMemoryUploadedFile
67from django .db .models import QuerySet
@@ -38,6 +39,9 @@ class BaseDocumentSplitNode(IDocumentSplitNode):
def save_context(self, details, workflow_manage):
    """Copy the ``content`` entry of ``details`` into this node's context.

    :param details: mapping of previously recorded node details; only its
        ``'content'`` key is read (``None`` is stored when the key is absent).
    :param workflow_manage: accepted for interface compatibility; not used here.
    """
    restored_content = details.get('content')
    self.context['content'] = restored_content
4041
def get_reference_content(self, fields: List[str]):
    """Resolve a reference path through the workflow manager.

    The first element of ``fields`` and the remaining elements are handed to
    ``self.workflow_manage.get_reference_field`` as its two arguments.

    :param fields: reference path; may be empty or ``None`` when no reference
        has been configured.
    :return: whatever ``get_reference_field`` returns, or ``None`` when
        ``fields`` is empty/``None``.
    """
    # An unset reference would otherwise raise IndexError/TypeError on
    # fields[0]; treat it as "nothing referenced" instead.
    if not fields:
        return None
    return self.workflow_manage.get_reference_field(fields[0], fields[1:])
44+
4145 def execute (self , files , knowledge_id , split_strategy , paragraph_title_relate_problem_type ,
4246 paragraph_title_relate_problem , paragraph_title_relate_problem_reference ,
4347 document_name_relate_problem_type , document_name_relate_problem ,
@@ -53,21 +57,27 @@ def execute(self, files, knowledge_id, split_strategy, paragraph_title_relate_pr
5357
5458 for split_handle in split_handles :
5559 if split_handle .support (file_mem , get_buffer ):
56- result = split_handle .handle (file_mem , patterns , with_filter , limit , get_buffer , self .save_image )
57- if isinstance (result , list ):
58- for item in result :
59- item ['source_file_id' ] = file .id
60- paragraph_list = result
61- else :
62- result ['source_file_id' ] = file .id
63- paragraph_list = [result ]
60+ result = split_handle .handle (file_mem , patterns , with_filter , limit , get_buffer , self ._save_image )
61+ # 统一处理结果为列表
62+ results = result if isinstance (result , list ) else [result ]
63+
64+ for item in results :
65+ self ._process_split_result (
66+ item , knowledge_id , file .id , file .file_name ,
67+ split_strategy , paragraph_title_relate_problem_type ,
68+ paragraph_title_relate_problem , paragraph_title_relate_problem_reference ,
69+ document_name_relate_problem_type , document_name_relate_problem ,
70+ document_name_relate_problem_reference
71+ )
72+
73+ paragraph_list = results
74+ break
6475
6576 self .context ['paragraph_list' ] = paragraph_list
6677
67-
6878 return NodeResult ({'paragraph_list' : paragraph_list }, {})
6979
70- def save_image (self , image_list ):
80+ def _save_image (self , image_list ):
7181 if image_list is not None and len (image_list ) > 0 :
7282 exist_image_list = [str (i .get ('id' )) for i in
7383 QuerySet (File ).filter (id__in = [i .id for i in image_list ]).values ('id' )]
@@ -81,6 +91,58 @@ def save_image(self, image_list):
8191 file .source_id = self .context .get ('knowledge_id' )
8292 file .save (file_bytes )
8393
def _process_split_result(
        self, item, knowledge_id, source_file_id, file_name,
        split_strategy, paragraph_title_relate_problem_type,
        paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
        document_name_relate_problem_type, document_name_relate_problem,
        document_name_relate_problem_reference
):
    """Post-process one document split result in place.

    The ``item`` dict is mutated:

    * ``item['meta']`` is set to the knowledge id and source file id;
    * the ``'content'`` entry is moved to ``item['paragraphs']``;
    * every paragraph gets a ``'problem_list'`` (built by
      ``self._generate_problem_list``) and ``'is_active'`` set to ``True``.

    :param item: split result dict; its ``'content'`` entry is expected to be
        an iterable of paragraph dicts (missing or ``None`` is treated as empty).
    :param knowledge_id: stored under ``item['meta']['knowledge_id']``.
    :param source_file_id: stored under ``item['meta']['source_file_id']``.
    :param file_name: forwarded to ``_generate_problem_list`` as the document name.
    :return: ``None``; the result is the mutation of ``item``.

    The remaining parameters are forwarded unchanged to
    ``_generate_problem_list``.
    """
    item['meta'] = {
        'knowledge_id': knowledge_id,
        'source_file_id': source_file_id
    }
    # A 'content' key that is present but None would make the loop below
    # raise TypeError; normalise it to an empty paragraph list.
    paragraphs = item.pop('content', None)
    if paragraphs is None:
        paragraphs = []
    item['paragraphs'] = paragraphs

    for paragraph in paragraphs:
        paragraph['problem_list'] = self._generate_problem_list(
            paragraph, file_name,
            split_strategy, paragraph_title_relate_problem_type,
            paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
            document_name_relate_problem_type, document_name_relate_problem,
            document_name_relate_problem_reference
        )
        paragraph['is_active'] = True
117+
def _generate_problem_list(
        self, paragraph, document_name, split_strategy, paragraph_title_relate_problem_type,
        paragraph_title_relate_problem, paragraph_title_relate_problem_reference,
        document_name_relate_problem_type, document_name_relate_problem,
        document_name_relate_problem_reference
):
    """Build the list of problems to associate with one paragraph.

    When a ``*_type`` argument equals ``'referencing'``, the corresponding
    ``*_relate_problem`` value is replaced by the result of
    ``self.get_reference_content`` on the matching ``*_reference`` argument.

    Behaviour per ``split_strategy``:

    * ``'auto'``   - the two ``*_relate_problem`` values act as flags: a truthy
      paragraph flag adds the paragraph title (if any), a truthy document flag
      adds ``document_name`` (if non-empty).
    * ``'custom'`` - the two values are the problems themselves and are added
      in order (paragraph value first). A plain string counts as one problem.
    * ``'qa'``     - only ``document_name`` is added, when the document flag
      is truthy and the name is non-empty.
    * anything else - an empty list.

    :param paragraph: paragraph dict; only its ``'title'`` key is read.
    :param document_name: name added as a problem in ``'auto'``/``'qa'`` mode.
    :return: a new list of problems (may be empty).
    """
    if paragraph_title_relate_problem_type == 'referencing':
        paragraph_title_relate_problem = self.get_reference_content(paragraph_title_relate_problem_reference)
    if document_name_relate_problem_type == 'referencing':
        document_name_relate_problem = self.get_reference_content(document_name_relate_problem_reference)

    problem_list = []
    if split_strategy == 'auto':
        if paragraph_title_relate_problem and paragraph.get('title'):
            problem_list.append(paragraph.get('title'))
        if document_name_relate_problem and document_name:
            problem_list.append(document_name)
    elif split_strategy == 'custom':
        for configured in (paragraph_title_relate_problem, document_name_relate_problem):
            if not configured:
                continue
            if isinstance(configured, str):
                # list.extend() on a str would add one entry per character;
                # a single string is one problem.
                problem_list.append(configured)
            else:
                problem_list.extend(configured)
    elif split_strategy == 'qa':
        if document_name_relate_problem and document_name:
            problem_list.append(document_name)

    return problem_list
145+
84146 def get_details (self , index : int , ** kwargs ):
85147 return {
86148 'name' : self .node .properties .get ('stepName' ),
0 commit comments