1313from typing import List
1414
1515import django .db .models
16+ from django .db import models
1617from django .db .models import QuerySet
18+ from django .db .models .functions import Substr , Reverse
1719from langchain_core .embeddings import Embeddings
1820
1921from common .config .embedding_config import VectorStore
20- from common .db .search import native_search , get_dynamics_model
21- from common .event . common import embedding_poxy
22+ from common .db .search import native_search , get_dynamics_model , native_update
23+ from common .db . sql_execute import sql_execute , update_execute
2224from common .util .file_util import get_file_content
2325from common .util .lock import try_lock , un_lock
24- from dataset .models import Paragraph , Status , Document , ProblemParagraphMapping
26+ from common .util .page_utils import page
27+ from dataset .models import Paragraph , Status , Document , ProblemParagraphMapping , TaskType , State
2528from embedding .models import SourceType , SearchMode
2629from smartdoc .conf import PROJECT_DIR
2730
@@ -114,7 +117,8 @@ def embedding_by_paragraph(paragraph_id, embedding_model: Embeddings):
114117 @param embedding_model: 向量模型
115118 """
116119 max_kb .info (f"开始--->向量化段落:{ paragraph_id } " )
117- status = Status .success
120+ # 更新到开始状态
121+ ListenerManagement .update_status (QuerySet (Paragraph ).filter (id = paragraph_id ), TaskType .EMBEDDING , State .STARTED )
118122 try :
119123 data_list = native_search (
120124 {'problem' : QuerySet (get_dynamics_model ({'paragraph.id' : django .db .models .CharField ()})).filter (
@@ -125,23 +129,89 @@ def embedding_by_paragraph(paragraph_id, embedding_model: Embeddings):
125129 # 删除段落
126130 VectorStore .get_embedding_vector ().delete_by_paragraph_id (paragraph_id )
127131
128- def is_save_function ():
129- return QuerySet (Paragraph ).filter (id = paragraph_id ).exists ()
132+ def is_the_task_interrupted ():
133+ _paragraph = QuerySet (Paragraph ).filter (id = paragraph_id ).first ()
134+ if _paragraph is None or Status (_paragraph .status )[TaskType .EMBEDDING ] == State .REVOKE :
135+ return True
136+ return False
130137
131138 # 批量向量化
132- VectorStore .get_embedding_vector ().batch_save (data_list , embedding_model , is_save_function )
139+ VectorStore .get_embedding_vector ().batch_save (data_list , embedding_model , is_the_task_interrupted )
140+ # Update the paragraph to the success state
141+ ListenerManagement .update_status (QuerySet (Paragraph ).filter (id = paragraph_id ), TaskType .EMBEDDING ,
142+ State .SUCCESS )
133143 except Exception as e :
134144 max_kb_error .error (f'向量化段落:{ paragraph_id } 出现错误{ str (e )} { traceback .format_exc ()} ' )
135- status = Status .error
145+ ListenerManagement .update_status (QuerySet (Paragraph ).filter (id = paragraph_id ), TaskType .EMBEDDING ,
146+ State .FAILURE )
136147 finally :
137- QuerySet (Paragraph ).filter (id = paragraph_id ).update (** {'status' : status })
138148 max_kb .info (f'结束--->向量化段落:{ paragraph_id } ' )
139149
@staticmethod
def embedding_by_data_list(data_list: List, embedding_model: Embeddings):
    """
    Embed and store every entry of ``data_list`` in one batch.

    @param data_list: entries to embed
    @param embedding_model: embedding model used to vectorise the entries
    """
    # The third argument of batch_save is an "is the task interrupted?" probe
    # (embedding_by_paragraph passes its is_the_task_interrupted callback in
    # the same position, where True means "stop"). This entry point has no
    # revoke mechanism, so it must always answer False; the previous
    # ``lambda: True`` dates from when the callback meant "keep saving".
    # NOTE(review): batch_save itself is not visible here - confirm its contract.
    VectorStore.get_embedding_vector().batch_save(data_list, embedding_model, lambda: False)
144154
@staticmethod
def get_embedding_paragraph_apply(embedding_model, is_the_task_interrupted, post_apply=lambda: None):
    """
    Build a callback that embeds one page of paragraphs.

    @param embedding_model: embedding model handed to embedding_by_paragraph
    @param is_the_task_interrupted: zero-argument callable; a truthy result stops the page early
    @param post_apply: zero-argument callable run once after each page, interrupted or not
    @return: function taking a list of paragraph rows (mappings exposing 'id')
    """

    def embedding_paragraph_apply(paragraph_list):
        for row in paragraph_list:
            # Re-check before every paragraph so a revoke takes effect promptly.
            interrupted = is_the_task_interrupted()
            if interrupted:
                break
            paragraph_id = str(row.get('id'))
            ListenerManagement.embedding_by_paragraph(paragraph_id, embedding_model)
        # Runs even when the loop was cut short.
        post_apply()

    return embedding_paragraph_apply
165+
@staticmethod
def get_aggregation_document_status(document_id):
    """
    Build a zero-argument callback that runs the document status aggregation SQL.

    @param document_id: id of the document; passed twice as the SQL parameters
    @return: callable that loads 'update_document_status_meta.sql' and executes it
    """

    def aggregation_document_status():
        # The SQL file is read on every invocation, not when the callback is created.
        sql_path = os.path.join(PROJECT_DIR, "apps", "dataset", 'sql', 'update_document_status_meta.sql')
        statement = get_file_content(sql_path)
        update_execute(statement, [document_id, document_id])

    return aggregation_document_status
174+
@staticmethod
def post_update_document_status(document_id, task_type: TaskType):
    """
    Finalise the status of a document for ``task_type`` after the task has ended.

    The document's slot for ``task_type`` becomes REVOKED when it was REVOKE,
    otherwise SUCCESS; it is then overridden with FAILURE when any entry of
    ``status_meta['aggs']`` reports FAILURE for this task with a positive count.
    Afterwards every paragraph of the document still in REVOKE for this task
    is moved to REVOKED.

    @param document_id: id of the document to finalise
    @param task_type: task whose status slot is finalised
    """
    _document = QuerySet(Document).filter(id=document_id).first()
    if _document is None:
        # The document can disappear while the task is running (the callers'
        # interruption checks handle the same case). This method is invoked
        # from a ``finally`` block, so raising here would skip the cleanup
        # that follows it, e.g. releasing the lock.
        return

    status = Status(_document.status)
    if status[task_type] == State.REVOKE:
        status[task_type] = State.REVOKED
    else:
        status[task_type] = State.SUCCESS
    for item in _document.status_meta.get('aggs', []):
        agg_status = item.get('status')
        agg_count = item.get('count')
        if Status(agg_status)[task_type] == State.FAILURE and agg_count > 0:
            status[task_type] = State.FAILURE
    _document.status = str(status)
    _document.save()

    # The slot of a task type is read from the reversed status string at
    # position ``task_type.value`` (Substr is 1-based). The length must be 1:
    # using ``task_type.value`` as the length only works when the value is 1
    # and would grab several characters for any other task type.
    # NOTE(review): assumes each state occupies exactly one character - confirm.
    revoking_paragraphs = QuerySet(Paragraph).annotate(
        reversed_status=Reverse('status'),
        task_type_status=Substr('reversed_status', task_type.value, 1),
    ).filter(task_type_status=State.REVOKE.value).filter(document_id=document_id).values('id')
    ListenerManagement.update_status(revoking_paragraphs, task_type, State.REVOKED)
198+
@staticmethod
def update_status(query_set: QuerySet, taskType: TaskType, state: State):
    """
    Set the status slot of ``taskType`` to ``state`` for the rows of ``query_set``.

    Loads 'update_paragraph_status.sql', substitutes its ``${...}`` placeholders
    and runs the result through native_update.

    @param query_set: rows to update; its model supplies the table name
    @param taskType: task type whose slot is updated
    @param state: state to store in that slot
    """
    template_path = os.path.join(PROJECT_DIR, "apps", "dataset", 'sql', 'update_paragraph_status.sql')
    statement = get_file_content(template_path)
    task_value = taskType.value
    placeholders = {
        '${bit_number}': len(TaskType),
        '${up_index}': task_value - 1,
        '${status_number}': state.value,
        '${next_index}': task_value + 1,
        '${table_name}': query_set.model._meta.db_table,
    }
    # Plain textual substitution, applied in the dictionary's insertion order.
    for placeholder, replacement in placeholders.items():
        statement = statement.replace(placeholder, str(replacement))
    native_update(query_set, statement)
214+
145215 @staticmethod
146216 def embedding_by_document (document_id , embedding_model : Embeddings ):
147217 """
@@ -153,33 +223,28 @@ def embedding_by_document(document_id, embedding_model: Embeddings):
153223 if not try_lock ('embedding' + str (document_id )):
154224 return
155225 max_kb .info (f"开始--->向量化文档:{ document_id } " )
156- QuerySet (Document ).filter (id = document_id ).update (** {'status' : Status .embedding })
157- QuerySet (Paragraph ).filter (document_id = document_id ).update (** {'status' : Status .embedding })
158- status = Status .success
226+ # Mark the document's embedding task as STARTED
227+ ListenerManagement .update_status (QuerySet (Document ).filter (id = document_id ), TaskType .EMBEDDING , State .STARTED )
159228 try :
160- data_list = native_search (
161- {'problem' : QuerySet (
162- get_dynamics_model ({'paragraph.document_id' : django .db .models .CharField ()})).filter (
163- ** {'paragraph.document_id' : document_id }),
164- 'paragraph' : QuerySet (Paragraph ).filter (document_id = document_id )},
165- select_string = get_file_content (
166- os .path .join (PROJECT_DIR , "apps" , "common" , 'sql' , 'list_embedding_text.sql' )))
167229 # 删除文档向量数据
168230 VectorStore .get_embedding_vector ().delete_by_document_id (document_id )
169231
170- def is_save_function ():
171- return QuerySet (Document ).filter (id = document_id ).exists ()
172-
173- # 批量向量化
174- VectorStore .get_embedding_vector ().batch_save (data_list , embedding_model , is_save_function )
232+ def is_the_task_interrupted ():
233+ document = QuerySet (Document ).filter (id = document_id ).first ()
234+ if document is None or Status (document .status )[TaskType .EMBEDDING ] == State .REVOKE :
235+ return True
236+ return False
237+
238+ # 根据段落进行向量化处理
239+ page (QuerySet (Paragraph ).filter (document_id = document_id ).values ('id' ), 10 ,
240+ ListenerManagement .get_embedding_paragraph_apply (embedding_model , is_the_task_interrupted ,
241+ ListenerManagement .get_aggregation_document_status (
242+ document_id )),
243+ is_the_task_interrupted )
175244 except Exception as e :
176245 max_kb_error .error (f'向量化文档:{ document_id } 出现错误{ str (e )} { traceback .format_exc ()} ' )
177- status = Status .error
178246 finally :
179- # 修改状态
180- QuerySet (Document ).filter (id = document_id ).update (
181- ** {'status' : status , 'update_time' : datetime .datetime .now ()})
182- QuerySet (Paragraph ).filter (document_id = document_id ).update (** {'status' : status })
247+ ListenerManagement .post_update_document_status (document_id , TaskType .EMBEDDING )
183248 max_kb .info (f"结束--->向量化文档:{ document_id } " )
184249 un_lock ('embedding' + str (document_id ))
185250
0 commit comments