
Commit 246e133

feat: add content-hash calculation for file uploads, reject duplicate files, and update the related logic
1 parent ff955bd commit 246e133

4 files changed (+64, -2 lines)

server/routers/knowledge_router.py

Lines changed: 9 additions & 2 deletions

@@ -12,6 +12,7 @@
 from server.services.tasker import TaskContext, tasker
 from src import config, knowledge_base
 from src.knowledge.indexing import SUPPORTED_FILE_EXTENSIONS, is_supported_file_extension, process_file_to_markdown
+from src.knowledge.utils import calculate_content_hash
 from src.models.embed import test_embedding_model_status, test_all_embedding_models_status
 from src.utils import hashstr, logger

@@ -587,10 +588,16 @@ async def upload_file(
     file_path = os.path.join(upload_dir, filename)
     os.makedirs(upload_dir, exist_ok=True)
 
+    file_bytes = await file.read()
+
+    content_hash = calculate_content_hash(file_bytes)
+    if knowledge_base.file_existed_in_db(db_id, content_hash):
+        raise HTTPException(status_code=409, detail="File with the same content already exists in this database")
+
     with open(file_path, "wb") as buffer:
-        buffer.write(await file.read())
+        buffer.write(file_bytes)
 
-    return {"message": "File successfully uploaded", "file_path": file_path, "db_id": db_id}
+    return {"message": "File successfully uploaded", "file_path": file_path, "db_id": db_id, "content_hash": content_hash}
 
 
 @knowledge.get("/files/supported-types")
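
With the duplicate check in place, a 409 response becomes part of the upload contract. A minimal client sketch, assuming the requests library and a locally running server; the route and form field names below are guesses, not taken from the commit:

# Hypothetical client sketch -- the endpoint URL and the form field
# names are assumptions, not confirmed by this diff.
import requests

def upload(path: str, db_id: str) -> dict:
    with open(path, "rb") as fh:
        resp = requests.post(
            "http://localhost:8000/knowledge/files/upload",  # assumed route
            params={"db_id": db_id},
            files={"file": fh},
        )
    if resp.status_code == 409:
        # The server already holds a file with this content hash.
        raise RuntimeError("Duplicate content; upload rejected")
    resp.raise_for_status()
    return resp.json()  # now includes "content_hash" alongside "file_path"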

src/knowledge/manager.py

Lines changed: 18 additions & 0 deletions

@@ -268,6 +268,24 @@ def get_db_upload_path(self, db_id: str | None = None) -> str:
         os.makedirs(general_uploads, exist_ok=True)
         return general_uploads
 
+    def file_existed_in_db(self, db_id: str | None, content_hash: str | None) -> bool:
+        """Check whether a file with the same content hash already exists in the given database."""
+        if not db_id or not content_hash:
+            return False
+
+        try:
+            kb_instance = self._get_kb_for_database(db_id)
+        except KBNotFoundError:
+            return False
+
+        for file_info in kb_instance.files_meta.values():
+            if file_info.get("database_id") != db_id:
+                continue
+            if file_info.get("content_hash") == content_hash:
+                return True
+
+        return False
+
     async def update_database(self, db_id: str, name: str, description: str) -> dict:
         """Update the database."""
         kb_instance = self._get_kb_for_database(db_id)
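
The check itself is a plain linear scan over the knowledge base's file metadata. A self-contained sketch of the same logic; the entry shape mirrors files_meta as used in the diff, while the sample data is invented:

# Standalone version of the duplicate scan; the sample entries are made up.
def file_existed_in_db(files_meta: dict, db_id: str, content_hash: str) -> bool:
    for file_info in files_meta.values():
        if file_info.get("database_id") != db_id:
            continue  # entry belongs to a different database
        if file_info.get("content_hash") == content_hash:
            return True
    return False

files_meta = {
    "file_ab12cd": {"database_id": "kb_1", "content_hash": "9f86d081..."},
}
assert file_existed_in_db(files_meta, "kb_1", "9f86d081...")
assert not file_existed_in_db(files_meta, "kb_2", "9f86d081...")

The scan costs O(n) per upload; for a very large knowledge base, an index mapping content_hash to file_id would make the check constant-time.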

src/knowledge/utils/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -6,6 +6,7 @@
 """
 
 from .kb_utils import (
+    calculate_content_hash,
     get_embedding_config,
     prepare_item_metadata,
     split_text_into_chunks,
@@ -14,6 +15,7 @@
 )
 
 __all__ = [
+    "calculate_content_hash",
     "get_embedding_config",
     "prepare_item_metadata",
     "split_text_into_chunks",
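
With the re-export in place, the helper is importable from the package root. A one-line sanity check; the expected value is the well-known SHA-256 digest of empty input:

from src.knowledge.utils import calculate_content_hash

# SHA-256 of zero bytes is a fixed, well-known constant.
assert calculate_content_hash(b"") == (
    "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
)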

src/knowledge/utils/kb_utils.py

Lines changed: 35 additions & 0 deletions

@@ -1,3 +1,4 @@
+import hashlib
 import os
 import time
 from pathlib import Path

@@ -100,6 +101,32 @@ def split_text_into_chunks(text: str, file_id: str, filename: str, params: dict
     return chunks
 
 
+def calculate_content_hash(data: bytes | bytearray | str | os.PathLike[str] | Path) -> str:
+    """
+    Compute the SHA-256 hash of file content.
+
+    Args:
+        data: binary content of the file, or a file path
+
+    Returns:
+        str: hexadecimal hash digest
+    """
+    sha256 = hashlib.sha256()
+
+    if isinstance(data, (bytes, bytearray)):
+        sha256.update(data)
+        return sha256.hexdigest()
+
+    if isinstance(data, (str, os.PathLike, Path)):
+        path = Path(data)
+        with path.open("rb") as file_handle:
+            for chunk in iter(lambda: file_handle.read(8192), b""):
+                sha256.update(chunk)
+        return sha256.hexdigest()
+
+    raise TypeError(f"Unsupported data type for hashing: {type(data)!r}")
+
+
 def prepare_item_metadata(item: str, content_type: str, db_id: str) -> dict:
     """
     Prepare metadata for a file or URL

@@ -110,11 +137,18 @@ def prepare_item_metadata(item: str, content_type: str, db_id: str) -> dict:
         file_type = file_path.suffix.lower().replace(".", "")
         filename = file_path.name
         item_path = os.path.relpath(file_path, Path.cwd())
+        content_hash = None
+        try:
+            if file_path.exists():
+                content_hash = calculate_content_hash(file_path)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning(f"Failed to calculate content hash for {file_path}: {exc}")
     else:  # URL
         file_id = f"url_{hashstr(item + str(time.time()), 6)}"
         file_type = "url"
         filename = f"webpage_{hashstr(item, 6)}.md"
         item_path = item
+        content_hash = None
 
     return {
         "database_id": db_id,

@@ -124,6 +158,7 @@
         "status": "processing",
         "created_at": time.time(),
         "file_id": file_id,
+        "content_hash": content_hash,
     }
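
Since calculate_content_hash accepts both raw bytes and a file path, the two input modes should agree on the digest for identical content. A small self-check using only the standard library plus the new helper:

import tempfile
from pathlib import Path

from src.knowledge.utils import calculate_content_hash

payload = b"hello knowledge base"
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(payload)
    path = Path(tmp.name)

# Bytes input and file-path input must produce the same digest.
assert calculate_content_hash(payload) == calculate_content_hash(path)
path.unlink()  # remove the temporary file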
