 from celery_once import AlreadyQueued
 from django.core import validators
 from django.db import transaction, models
-from django.db.models import QuerySet, Model
+from django.db.models import QuerySet
 from django.db.models.functions import Substr, Reverse
 from django.http import HttpResponse
 from django.utils.translation import gettext_lazy as _, gettext, get_language, to_locale
 from common.utils.fork import Fork
 from common.utils.split_model import get_split_model, flat_map
 from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \
-    TaskType, File
+    TaskType, File, FileSourceType
 from knowledge.serializers.common import ProblemParagraphManage, BatchSerializer, \
     get_embedding_model_id_by_knowledge_id, MetaSerializer, write_image, zip_dir
 from knowledge.serializers.paragraph import ParagraphSerializers, ParagraphInstanceSerializer, \
 from knowledge.task.generate import generate_related_by_document_id
 from knowledge.task.sync import sync_web_document
 from maxkb.const import PROJECT_DIR
+from models_provider.models import Model

 default_split_handle = TextSplitHandle()
 split_handles = [
@@ -87,6 +88,7 @@ def is_valid(self, *, raise_exception=False):
 class DocumentInstanceSerializer(serializers.Serializer):
     name = serializers.CharField(required=True, label=_('document name'), max_length=128, min_length=1)
     paragraphs = ParagraphInstanceSerializer(required=False, many=True, allow_null=True)
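+    # New: each uploaded document now carries a reference to the stored source file it was parsed from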
+    source_file_id = serializers.UUIDField(required=True, label=_('source file id'))


 class CancelInstanceSerializer(serializers.Serializer):
@@ -545,6 +547,9 @@ def export_zip(self, with_valid=True):
             response.write(zip_buffer.getvalue())
             return response

+        def download_source_file(self):
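+            # Stub only; presumably a follow-up change will stream the stored source file back to the client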
+            pass
+
         def one(self, with_valid=False):
             if with_valid:
                 self.is_valid(raise_exception=True)
@@ -626,8 +631,6 @@ def refresh(self, state_list=None, with_valid=True):
             embedding_model = QuerySet(Model).filter(id=embedding_model_id).first()
             if embedding_model is None:
                 raise AppApiException(500, _('Model does not exist'))
-            if embedding_model.permission_type == 'PRIVATE' and knowledge_user_id != embedding_model.user_id:
-                raise AppApiException(500, _('No permission to use this model') + f"{embedding_model.name}")
             document_id = self.data.get("document_id")
             ListenerManagement.update_status(
                 QuerySet(Document).filter(id=document_id), TaskType.EMBEDDING, State.PENDING
@@ -859,6 +862,8 @@ def save_image(self, image_list):
             for file in save_image_list:
                 file_bytes = file.meta.pop('content')
                 file.meta['knowledge_id'] = self.data.get('knowledge_id')
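+                # Tag extracted images as knowledge-scoped files so they can be traced back to this knowledge base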
+                file.source_type = FileSourceType.KNOWLEDGE
+                file.source_id = self.data.get('knowledge_id')
                 file.save(file_bytes)

     class Split(serializers.Serializer):
@@ -901,19 +906,39 @@ def save_image(self, image_list):
             for file in save_image_list:
                 file_bytes = file.meta.pop('content')
                 file.meta['knowledge_id'] = self.data.get('knowledge_id')
+                file.source_type = FileSourceType.KNOWLEDGE
+                file.source_id = self.data.get('knowledge_id')
                 file.save(file_bytes)

         def file_to_paragraph(self, file, pattern_list: List, with_filter: bool, limit: int):
+            # Save the original source file before splitting
+            file_id = uuid.uuid7()
+            raw_file = File(
+                id=file_id,
+                file_name=file.name,
+                file_size=file.size,
+                source_type=FileSourceType.KNOWLEDGE,
+                source_id=self.data.get('knowledge_id'),
+            )
+            raw_file.save(file.read())
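+            # Rewind the upload so the split handlers below can re-read it from the start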
+            file.seek(0)
+
             get_buffer = FileBufferHandle().get_buffer
             for split_handle in split_handles:
                 if split_handle.support(file, get_buffer):
                     result = split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image)
                     if isinstance(result, list):
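+                        # Attach the saved source file id to every parsed document dict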
+                        for item in result:
+                            item['source_file_id'] = file_id
                         return result
+                    result['source_file_id'] = file_id
                     return [result]
             result = default_split_handle.handle(file, pattern_list, with_filter, limit, get_buffer, self.save_image)
             if isinstance(result, list):
+                for item in result:
+                    item['source_file_id'] = file_id
                 return result
+            result['source_file_id'] = file_id
             return [result]

     class SplitPattern(serializers.Serializer):
@@ -937,14 +962,37 @@ def list():
             ]

     class Batch(serializers.Serializer):
-        workspace_id = serializers.UUIDField(required=True, label=_('workspace id'))
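+        # Relaxed to CharField so non-UUID workspace identifiers are accepted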
+        workspace_id = serializers.CharField(required=True, label=_('workspace id'))
         knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))

         @staticmethod
-        def post_embedding(document_list, knowledge_id):
+        def link_file(source_file_id, document_id):
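+            # Copy the knowledge-scoped source file into a document-scoped File record for the new document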
+            source_file = QuerySet(File).filter(id=source_file_id).first()
+            if source_file:
+                # Fetch the original file content
+                file_content = source_file.get_bytes()
+
+                # Create a new file object, copying the key attributes of the original file
+                new_file = File(
+                    id=uuid.uuid7(),
+                    file_name=source_file.file_name,
+                    file_size=source_file.file_size,
+                    source_type=FileSourceType.DOCUMENT,
+                    source_id=document_id,  # bind to the newly created document
+                    meta=source_file.meta.copy() if source_file.meta else {}
+                )
+
+                # Save the file content and metadata
+                new_file.save(file_content)
+
+        @staticmethod
+        def post_embedding(document_list, knowledge_id, workspace_id):
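+            # workspace_id is now threaded through to Operate(...).refresh() for each document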
             for document_dict in document_list:
-                DocumentSerializers.Operate(
-                    data={'knowledge_id': knowledge_id, 'document_id': document_dict.get('id')}).refresh()
+                DocumentSerializers.Operate(data={
+                    'knowledge_id': knowledge_id,
+                    'document_id': document_dict.get('id'),
+                    'workspace_id': workspace_id
+                }).refresh()
             return document_list

         @post(post_function=post_embedding)
@@ -953,15 +1001,21 @@ def batch_save(self, instance_list: List[Dict], with_valid=True):
             if with_valid:
                 self.is_valid(raise_exception=True)
                 DocumentInstanceSerializer(many=True, data=instance_list).is_valid(raise_exception=True)
+            workspace_id = self.data.get("workspace_id")
             knowledge_id = self.data.get("knowledge_id")
             document_model_list = []
             paragraph_model_list = []
             problem_paragraph_object_list = []
             # Insert the documents
             for document in instance_list:
-                document_paragraph_dict_model = DocumentSerializers.Create.get_document_paragraph_model(knowledge_id,
-                                                                                                        document)
-                document_model_list.append(document_paragraph_dict_model.get('document'))
+                document_paragraph_dict_model = DocumentSerializers.Create.get_document_paragraph_model(
+                    knowledge_id,
+                    document
+                )
+                # Save the document-to-source-file relationship
+                document_instance = document_paragraph_dict_model.get('document')
+                self.link_file(document['source_file_id'], document_instance.id)
+                document_model_list.append(document_instance)
                 for paragraph in document_paragraph_dict_model.get('paragraph_model_list'):
                     paragraph_model_list.append(paragraph)
                 for problem_paragraph_object in document_paragraph_dict_model.get('problem_paragraph_object_list'):
@@ -992,7 +1046,7 @@ def batch_save(self, instance_list: List[Dict], with_valid=True):
                     os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_document.sql')
                 ),
                 with_search_one=False
-            ), knowledge_id
+            ), knowledge_id, workspace_id

         @staticmethod
         def _batch_sync(document_id_list: List[str]):