Skip to content

Commit 31a4e54

Browse files
committed
refactor: 优化文件下载,移除原有逻辑
清理minio客户端导入顺序 移除已处理的TODO注释和废弃代码
1 parent 2094fdf commit 31a4e54

File tree

4 files changed: +4 / -30 lines changed

4 files changed: +4 / -30 lines changed

server/routers/knowledge_router.py

Lines changed: 1 addition & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -516,29 +516,10 @@ async def run_rechunks(context: TaskContext):
516516

517517
@knowledge.get("/databases/{db_id}/documents/{doc_id}/download")
518518
async def download_document(db_id: str, doc_id: str, request: Request, current_user: User = Depends(get_admin_user)):
519-
# TODO: 可以考虑修改为minio下载,将文件相关逻辑完全迁移到minio
520519
"""下载原始文件"""
521520
logger.debug(f"Download document {doc_id} from {db_id}")
522521
try:
523522
file_info = await knowledge_base.get_file_basic_info(db_id, doc_id)
524-
if not file_info:
525-
raise HTTPException(status_code=404, detail="File not found")
526-
527-
file_path = file_info.get("meta", {}).get("path")
528-
if not file_path:
529-
raise HTTPException(status_code=404, detail="File path not found in metadata")
530-
531-
# 安全检查:验证文件路径
532-
from src.knowledge.utils.kb_utils import validate_file_path
533-
534-
try:
535-
normalized_path = validate_file_path(file_path, db_id)
536-
except ValueError as e:
537-
raise HTTPException(status_code=403, detail=str(e))
538-
539-
if not os.path.exists(normalized_path):
540-
raise HTTPException(status_code=404, detail=f"File not found on disk: {file_info=}")
541-
542523
# 获取文件扩展名和MIME类型,解码URL编码的文件名
543524
filename = file_info.get("meta", {}).get("filename", "file")
544525
logger.debug(f"Original filename from database: {filename}")
@@ -1075,9 +1056,7 @@ async def upload_file(
10751056

10761057
basename, ext = os.path.splitext(file.filename)
10771058
# TODO:
1078-
# 如果知识库中的文件多了,上传了内容修改过的同名文件应当把旧的文件删除掉
1079-
# 否则会保存两份相同的文档,建议固定salt,上传逻辑是:
1080-
# 若上传了同名文件时且hash相同则报错,不同则直接替换同名文件
1059+
# 后续修改为遇到同名文件则在上传区域提示,是否删除旧文件,同时 filename name 也就不用添加 hash 了
10811060
filename = f"{basename}_{hashstr(basename, 4, with_salt=True, salt='fixed_salt')}{ext}".lower()
10821061

10831062
file_path = os.path.join(upload_dir, filename)

src/knowledge/indexing.py

Lines changed: 0 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -371,11 +371,6 @@ async def process_file_to_markdown(file_path: str, params: dict | None = None) -
371371

372372
return markdown_content.strip()
373373

374-
# TODO:
375-
# 此处修改了excel的处理逻辑,原本的excel是转为markdown后切分
376-
# 但是实际使用时发现,对于excel这种结构化数据,保留表头非常有必要
377-
# 因此改为了每10行重复保存一次表头
378-
# 目前前端显示有点问题,不知道为啥不能换行
379374
elif file_ext in [".xls", ".xlsx"]:
380375
# 处理 Excel 文件
381376
import pandas as pd

src/knowledge/utils/kb_utils.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -90,7 +90,7 @@ def split_text_into_chunks(text: str, file_id: str, filename: str, params: dict
9090
chunks.append(
9191
{
9292
"id": f"{file_id}_chunk_{chunk_index}",
93-
"content": chunk_content, # .strip(),
93+
"content": chunk_content, # .strip(),
9494
"file_id": file_id,
9595
"filename": filename,
9696
"chunk_index": chunk_index,

src/storage/minio/client.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -10,10 +10,10 @@
1010
from datetime import timedelta
1111
from io import BytesIO
1212

13-
from minio import Minio
14-
from minio.error import S3Error
1513
from urllib3 import BaseHTTPResponse
1614

15+
from minio import Minio
16+
from minio.error import S3Error
1717
from src.utils import logger
1818

1919

0 commit comments

Comments
 (0)