Skip to content

Commit 56a9e69

Browse files
committed
feat: add endpoint to download source files with updated parameter handling
1 parent 642920e commit 56a9e69

File tree

6 files changed

+125
-13
lines changed

6 files changed

+125
-13
lines changed

apps/common/constants/permission_constants.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -393,6 +393,11 @@ class PermissionConstants(Enum):
393393
role_list=[RoleConstants.ADMIN, RoleConstants.USER],
394394
parent_group=[WorkspaceGroup.KNOWLEDGE, UserGroup.KNOWLEDGE]
395395
)
396+
KNOWLEDGE_DOCUMENT_DOWNLOAD_RAW = Permission(
397+
group=Group.KNOWLEDGE_DOCUMENT, operate=Operate.EXPORT,
398+
role_list=[RoleConstants.ADMIN, RoleConstants.USER],
399+
parent_group=[WorkspaceGroup.KNOWLEDGE, UserGroup.KNOWLEDGE]
400+
)
396401
KNOWLEDGE_DOCUMENT_GENERATE = Permission(
397402
group=Group.KNOWLEDGE_DOCUMENT, operate=Operate.GENERATE,
398403
role_list=[RoleConstants.ADMIN, RoleConstants.USER],

apps/knowledge/api/document.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,3 +503,35 @@ def get_parameters():
503503
@staticmethod
504504
def get_request():
505505
return DocumentMigrateSerializer
506+
507+
508+
class DocumentDownloadSourceAPI(APIMixin):
    """OpenAPI description of the document source-file download endpoint."""

    @staticmethod
    def get_parameters():
        """Return the endpoint's three required string path parameters."""
        # (name, description) pairs; type, location and required flag are
        # identical for every parameter, so only these two values vary.
        path_fields = (
            ("workspace_id", "工作空间id"),
            ("knowledge_id", "知识库id"),
            ("document_id", "文档id"),
        )
        return [
            OpenApiParameter(
                name=field_name,
                description=field_description,
                type=OpenApiTypes.STR,
                location='path',
                required=True,
            )
            for field_name, field_description in path_fields
        ]

    @staticmethod
    def get_response():
        """Return the serializer class used to document the response."""
        return DefaultResultSerializer

apps/knowledge/models/knowledge.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,10 @@ class FileSourceType(models.TextChoices):
220220
KNOWLEDGE = "KNOWLEDGE"
221221
# 应用 跟随应用被删除而被删除 source_id 为应用id
222222
APPLICATION = "APPLICATION"
223+
# 工具 跟随工具被删除而被删除 source_id 为工具id
224+
TOOL = "TOOL"
225+
# 文档 source_id 为文档id
226+
DOCUMENT = "DOCUMENT"
223227
# 临时30分钟 数据30分钟后被清理 source_id 为TEMPORARY_30_MINUTE
224228
TEMPORARY_30_MINUTE = "TEMPORARY_30_MINUTE"
225229
# 临时120分钟 数据120分钟后被清理 source_id为TEMPORARY_120_MINUTE

apps/knowledge/serializers/document.py

Lines changed: 66 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from celery_once import AlreadyQueued
1313
from django.core import validators
1414
from django.db import transaction, models
15-
from django.db.models import QuerySet, Model
15+
from django.db.models import QuerySet
1616
from django.db.models.functions import Substr, Reverse
1717
from django.http import HttpResponse
1818
from django.utils.translation import gettext_lazy as _, gettext, get_language, to_locale
@@ -43,7 +43,7 @@
4343
from common.utils.fork import Fork
4444
from common.utils.split_model import get_split_model, flat_map
4545
from knowledge.models import Knowledge, Paragraph, Problem, Document, KnowledgeType, ProblemParagraphMapping, State, \
46-
TaskType, File
46+
TaskType, File, FileSourceType
4747
from knowledge.serializers.common import ProblemParagraphManage, BatchSerializer, \
4848
get_embedding_model_id_by_knowledge_id, MetaSerializer, write_image, zip_dir
4949
from knowledge.serializers.paragraph import ParagraphSerializers, ParagraphInstanceSerializer, \
@@ -54,6 +54,7 @@
5454
from knowledge.task.generate import generate_related_by_document_id
5555
from knowledge.task.sync import sync_web_document
5656
from maxkb.const import PROJECT_DIR
57+
from models_provider.models import Model
5758

5859
default_split_handle = TextSplitHandle()
5960
split_handles = [
@@ -87,6 +88,7 @@ def is_valid(self, *, raise_exception=False):
8788
class DocumentInstanceSerializer(serializers.Serializer):
8889
name = serializers.CharField(required=True, label=_('document name'), max_length=128, min_length=1)
8990
paragraphs = ParagraphInstanceSerializer(required=False, many=True, allow_null=True)
91+
source_file_id = serializers.UUIDField(required=True, label=_('source file id'))
9092

9193

9294
class CancelInstanceSerializer(serializers.Serializer):
@@ -545,6 +547,9 @@ def export_zip(self, with_valid=True):
545547
response.write(zip_buffer.getvalue())
546548
return response
547549

550+
def download_source_file(self, with_valid=True):
    """
    Return the stored source file of this document as a download.

    The file is looked up as the ``File`` row whose ``source_type`` is
    ``FileSourceType.DOCUMENT`` and whose ``source_id`` is the document id,
    which is how ``Batch.link_file`` stores the per-document copy.

    :param with_valid: validate the serializer input before reading it
    :return: ``HttpResponse`` whose body is the raw bytes of the source file
    :raises AppApiException: when no source file is stored for the document
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    if with_valid:
        self.is_valid(raise_exception=True)
    source_file = QuerySet(File).filter(
        source_type=FileSourceType.DOCUMENT,
        source_id=self.data.get('document_id'),
    ).first()
    if source_file is None:
        raise AppApiException(500, _('Source file does not exist'))
    response = HttpResponse(source_file.get_bytes(), content_type='application/octet-stream')
    # RFC 5987 form so that non-ASCII file names survive the header encoding.
    response['Content-Disposition'] = f"attachment; filename*=UTF-8''{quote(source_file.file_name)}"
    return response
552+
548553
def one(self, with_valid=False):
549554
if with_valid:
550555
self.is_valid(raise_exception=True)
@@ -626,8 +631,6 @@ def refresh(self, state_list=None, with_valid=True):
626631
embedding_model = QuerySet(Model).filter(id=embedding_model_id).first()
627632
if embedding_model is None:
628633
raise AppApiException(500, _('Model does not exist'))
629-
if embedding_model.permission_type == 'PRIVATE' and knowledge_user_id != embedding_model.user_id:
630-
raise AppApiException(500, _('No permission to use this model') + f"{embedding_model.name}")
631634
document_id = self.data.get("document_id")
632635
ListenerManagement.update_status(
633636
QuerySet(Document).filter(id=document_id), TaskType.EMBEDDING, State.PENDING
@@ -859,6 +862,8 @@ def save_image(self, image_list):
859862
for file in save_image_list:
860863
file_bytes = file.meta.pop('content')
861864
file.meta['knowledge_id'] = self.data.get('knowledge_id')
865+
file.source_type = FileSourceType.KNOWLEDGE
866+
file.source_id = self.data.get('knowledge_id')
862867
file.save(file_bytes)
863868

864869
class Split(serializers.Serializer):
@@ -901,19 +906,39 @@ def save_image(self, image_list):
901906
for file in save_image_list:
902907
file_bytes = file.meta.pop('content')
903908
file.meta['knowledge_id'] = self.data.get('knowledge_id')
909+
file.source_type = FileSourceType.KNOWLEDGE
910+
file.source_id = self.data.get('knowledge_id')
904911
file.save(file_bytes)
905912

906913
def file_to_paragraph(self, file, pattern_list: List, with_filter: bool, limit: int):
    """
    Store the uploaded file, split it, and tag each split result with the stored file id.

    :param file: uploaded file object; its ``name``, ``size``, ``read`` and ``seek`` are used
    :param pattern_list: split patterns forwarded to the split handler
    :param with_filter: forwarded to the split handler
    :param limit: forwarded to the split handler
    :return: list of split results, each carrying a ``source_file_id`` entry
    """
    # Keep a copy of the raw upload, owned by the knowledge base.
    source_file_id = uuid.uuid7()
    File(
        id=source_file_id,
        file_name=file.name,
        file_size=file.size,
        source_type=FileSourceType.KNOWLEDGE,
        source_id=self.data.get('knowledge_id'),
    ).save(file.read())
    # Rewind so the split handler reads the upload from the beginning.
    file.seek(0)

    read_buffer = FileBufferHandle().get_buffer
    # First registered handler that supports the file, else the default one.
    handler = next(
        (candidate for candidate in split_handles if candidate.support(file, read_buffer)),
        default_split_handle,
    )
    outcome = handler.handle(file, pattern_list, with_filter, limit, read_buffer, self.save_image)
    if not isinstance(outcome, list):
        outcome['source_file_id'] = source_file_id
        return [outcome]
    for entry in outcome:
        entry['source_file_id'] = source_file_id
    return outcome
918943

919944
class SplitPattern(serializers.Serializer):
@@ -937,14 +962,37 @@ def list():
937962
]
938963

939964
class Batch(serializers.Serializer):
940-
workspace_id = serializers.UUIDField(required=True, label=_('workspace id'))
965+
workspace_id = serializers.CharField(required=True, label=_('workspace id'))
941966
knowledge_id = serializers.UUIDField(required=True, label=_('knowledge id'))
942967

943968
@staticmethod
944-
def post_embedding(document_list, knowledge_id):
969+
def link_file(source_file_id, document_id):
    """
    Copy a stored source file into a new ``File`` row owned by a document.

    Does nothing when no ``File`` with ``source_file_id`` exists.

    :param source_file_id: id of the already stored source file
    :param document_id: id of the document that will own the copy
    """
    original = QuerySet(File).filter(id=source_file_id).first()
    if not original:
        return
    # Read the stored bytes before building the copy.
    content = original.get_bytes()
    duplicate = File(
        id=uuid.uuid7(),
        file_name=original.file_name,
        file_size=original.file_size,
        source_type=FileSourceType.DOCUMENT,
        # The copy belongs to the document, not to the knowledge base.
        source_id=document_id,
        meta=original.meta.copy() if original.meta else {}
    )
    duplicate.save(content)
987+
988+
@staticmethod
989+
def post_embedding(document_list, knowledge_id, workspace_id):
945990
for document_dict in document_list:
946-
DocumentSerializers.Operate(
947-
data={'knowledge_id': knowledge_id, 'document_id': document_dict.get('id')}).refresh()
991+
DocumentSerializers.Operate(data={
992+
'knowledge_id': knowledge_id,
993+
'document_id': document_dict.get('id'),
994+
'workspace_id': workspace_id
995+
}).refresh()
948996
return document_list
949997

950998
@post(post_function=post_embedding)
@@ -953,15 +1001,21 @@ def batch_save(self, instance_list: List[Dict], with_valid=True):
9531001
if with_valid:
9541002
self.is_valid(raise_exception=True)
9551003
DocumentInstanceSerializer(many=True, data=instance_list).is_valid(raise_exception=True)
1004+
workspace_id = self.data.get("workspace_id")
9561005
knowledge_id = self.data.get("knowledge_id")
9571006
document_model_list = []
9581007
paragraph_model_list = []
9591008
problem_paragraph_object_list = []
9601009
# 插入文档
9611010
for document in instance_list:
962-
document_paragraph_dict_model = DocumentSerializers.Create.get_document_paragraph_model(knowledge_id,
963-
document)
964-
document_model_list.append(document_paragraph_dict_model.get('document'))
1011+
document_paragraph_dict_model = DocumentSerializers.Create.get_document_paragraph_model(
1012+
knowledge_id,
1013+
document
1014+
)
1015+
# 保存文档和文件的关系
1016+
document_instance = document_paragraph_dict_model.get('document')
1017+
self.link_file(document['source_file_id'], document_instance.id)
1018+
document_model_list.append(document_instance)
9651019
for paragraph in document_paragraph_dict_model.get('paragraph_model_list'):
9661020
paragraph_model_list.append(paragraph)
9671021
for problem_paragraph_object in document_paragraph_dict_model.get('problem_paragraph_object_list'):
@@ -992,7 +1046,7 @@ def batch_save(self, instance_list: List[Dict], with_valid=True):
9921046
os.path.join(PROJECT_DIR, "apps", "knowledge", 'sql', 'list_document.sql')
9931047
),
9941048
with_search_one=False
995-
), knowledge_id
1049+
), knowledge_id, workspace_id
9961050

9971051
@staticmethod
9981052
def _batch_sync(document_id_list: List[str]):

apps/knowledge/urls.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/<str:document_id>/batch_cancel_task', views.DocumentView.BatchCancelTask.as_view()),
3939
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/<str:document_id>/export', views.DocumentView.Export.as_view()),
4040
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/<str:document_id>/export_zip', views.DocumentView.ExportZip.as_view()),
41+
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/<str:document_id>/download_source_file', views.DocumentView.DownloadSourceFile.as_view()),
4142
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/<str:document_id>/paragraph', views.ParagraphView.as_view()),
4243
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/<str:document_id>/paragraph/batch_delete', views.ParagraphView.BatchDelete.as_view()),
4344
path('workspace/<str:workspace_id>/knowledge/<str:knowledge_id>/document/<str:document_id>/paragraph/batch_generate_related', views.ParagraphView.BatchGenerateRelated.as_view()),

apps/knowledge/views/document.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
DocumentReadAPI, DocumentEditAPI, DocumentDeleteAPI, TableDocumentCreateAPI, QaDocumentCreateAPI, \
1313
WebDocumentCreateAPI, CancelTaskAPI, BatchCancelTaskAPI, SyncWebAPI, RefreshAPI, BatchEditHitHandlingAPI, \
1414
DocumentTreeReadAPI, DocumentSplitPatternAPI, BatchRefreshAPI, BatchGenerateRelatedAPI, TemplateExportAPI, \
15-
DocumentExportAPI, DocumentMigrateAPI
15+
DocumentExportAPI, DocumentMigrateAPI, DocumentDownloadSourceAPI
1616
from knowledge.serializers.document import DocumentSerializers
1717

1818

@@ -417,6 +417,22 @@ def get(self, request: Request, workspace_id: str, knowledge_id: str, document_i
417417
'workspace_id': workspace_id, 'document_id': document_id, 'knowledge_id': knowledge_id
418418
}).export_zip()
419419

420+
# GET view that returns the source file of a document.
# Documented with comments rather than docstrings so that the generated
# API schema description stays exactly as before.
class DownloadSourceFile(APIView):
    authentication_classes = [TokenAuth]

    @extend_schema(
        summary=_('Download source file'),
        operation_id=_('Download source file'),  # type: ignore
        parameters=DocumentDownloadSourceAPI.get_parameters(),
        responses=DocumentDownloadSourceAPI.get_response(),
        tags=[_('Knowledge Base/Documentation')]  # type: ignore
    )
    @has_permissions(PermissionConstants.KNOWLEDGE_DOCUMENT_DOWNLOAD_RAW.get_workspace_permission())
    def get(self, request: Request, workspace_id: str, knowledge_id: str, document_id: str):
        # Pass the three path parameters to the serializer, which produces the response.
        operate_data = {
            'workspace_id': workspace_id, 'document_id': document_id, 'knowledge_id': knowledge_id
        }
        operate = DocumentSerializers.Operate(data=operate_data)
        return operate.download_source_file()
435+
420436
class Migrate(APIView):
421437
authentication_classes = [TokenAuth]
422438

0 commit comments

Comments
 (0)