
Commit 246e133

feat: add content-hash calculation for file uploads, reject duplicate files, and update the related logic
1 parent ff955bd commit 246e133

4 files changed (+64, -2 lines)

server/routers/knowledge_router.py

Lines changed: 9 additions & 2 deletions

@@ -12,6 +12,7 @@
 from server.services.tasker import TaskContext, tasker
 from src import config, knowledge_base
 from src.knowledge.indexing import SUPPORTED_FILE_EXTENSIONS, is_supported_file_extension, process_file_to_markdown
+from src.knowledge.utils import calculate_content_hash
 from src.models.embed import test_embedding_model_status, test_all_embedding_models_status
 from src.utils import hashstr, logger

@@ -587,10 +588,16 @@ async def upload_file(
     file_path = os.path.join(upload_dir, filename)
     os.makedirs(upload_dir, exist_ok=True)
 
+    file_bytes = await file.read()
+
+    content_hash = calculate_content_hash(file_bytes)
+    if knowledge_base.file_existed_in_db(db_id, content_hash):
+        raise HTTPException(status_code=409, detail="File with the same content already exists in this database")
+
     with open(file_path, "wb") as buffer:
-        buffer.write(await file.read())
+        buffer.write(file_bytes)
 
-    return {"message": "File successfully uploaded", "file_path": file_path, "db_id": db_id}
+    return {"message": "File successfully uploaded", "file_path": file_path, "db_id": db_id, "content_hash": content_hash}
 
 
 @knowledge.get("/files/supported-types")
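
With the duplicate check in place, a 409 response becomes part of the upload contract. A minimal client sketch, assuming the requests library and a locally running server; the route and form field names below are guesses, not taken from the commit:

# Hypothetical client sketch -- the endpoint URL and the form field
# names are assumptions, not confirmed by this diff.
import requests

def upload(path: str, db_id: str) -> dict:
    with open(path, "rb") as fh:
        resp = requests.post(
            "http://localhost:8000/knowledge/files/upload",  # assumed route
            params={"db_id": db_id},
            files={"file": fh},
        )
    if resp.status_code == 409:
        # The server already holds a file with this content hash.
        raise RuntimeError("Duplicate content; upload rejected")
    resp.raise_for_status()
    return resp.json()  # now includes "content_hash" alongside "file_path"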

src/knowledge/manager.py

Lines changed: 18 additions & 0 deletions

@@ -268,6 +268,24 @@ def get_db_upload_path(self, db_id: str | None = None) -> str:
         os.makedirs(general_uploads, exist_ok=True)
         return general_uploads
 
+    def file_existed_in_db(self, db_id: str | None, content_hash: str | None) -> bool:
+        """Check whether a file with the same content hash already exists in the given database."""
+        if not db_id or not content_hash:
+            return False
+
+        try:
+            kb_instance = self._get_kb_for_database(db_id)
+        except KBNotFoundError:
+            return False
+
+        for file_info in kb_instance.files_meta.values():
+            if file_info.get("database_id") != db_id:
+                continue
+            if file_info.get("content_hash") == content_hash:
+                return True
+
+        return False
+
     async def update_database(self, db_id: str, name: str, description: str) -> dict:
         """Update the database."""
         kb_instance = self._get_kb_for_database(db_id)
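
The check itself is a plain linear scan over the knowledge base's file metadata. A self-contained sketch of the same logic; the entry shape mirrors files_meta as used in the diff, while the sample data is invented:

# Standalone version of the duplicate scan; the sample entries are made up.
def file_existed_in_db(files_meta: dict, db_id: str, content_hash: str) -> bool:
    for file_info in files_meta.values():
        if file_info.get("database_id") != db_id:
            continue  # entry belongs to a different database
        if file_info.get("content_hash") == content_hash:
            return True
    return False

files_meta = {
    "file_ab12cd": {"database_id": "kb_1", "content_hash": "9f86d081..."},
}
assert file_existed_in_db(files_meta, "kb_1", "9f86d081...")
assert not file_existed_in_db(files_meta, "kb_2", "9f86d081...")

The scan costs O(n) per upload; for a very large knowledge base, an index mapping content_hash to file_id would make the check constant-time.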

src/knowledge/utils/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -6,6 +6,7 @@
 """
 
 from .kb_utils import (
+    calculate_content_hash,
     get_embedding_config,
     prepare_item_metadata,
     split_text_into_chunks,
@@ -14,6 +15,7 @@
 )
 
 __all__ = [
+    "calculate_content_hash",
     "get_embedding_config",
     "prepare_item_metadata",
     "split_text_into_chunks",
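
With the re-export in place, the helper is importable from the package root. A one-line sanity check; the expected value is the well-known SHA-256 digest of empty input:

from src.knowledge.utils import calculate_content_hash

# SHA-256 of zero bytes is a fixed, well-known constant.
assert calculate_content_hash(b"") == (
    "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
)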

src/knowledge/utils/kb_utils.py

Lines changed: 35 additions & 0 deletions

@@ -1,3 +1,4 @@
+import hashlib
 import os
 import time
 from pathlib import Path

@@ -100,6 +101,32 @@ def split_text_into_chunks(text: str, file_id: str, filename: str, params: dict
     return chunks
 
 
+def calculate_content_hash(data: bytes | bytearray | str | os.PathLike[str] | Path) -> str:
+    """
+    Compute the SHA-256 hash of file content.
+
+    Args:
+        data: binary content of the file, or a file path
+
+    Returns:
+        str: hexadecimal hash digest
+    """
+    sha256 = hashlib.sha256()
+
+    if isinstance(data, (bytes, bytearray)):
+        sha256.update(data)
+        return sha256.hexdigest()
+
+    if isinstance(data, (str, os.PathLike, Path)):
+        path = Path(data)
+        with path.open("rb") as file_handle:
+            for chunk in iter(lambda: file_handle.read(8192), b""):
+                sha256.update(chunk)
+        return sha256.hexdigest()
+
+    raise TypeError(f"Unsupported data type for hashing: {type(data)!r}")
+
+
 def prepare_item_metadata(item: str, content_type: str, db_id: str) -> dict:
     """
     Prepare metadata for a file or URL

@@ -110,11 +137,18 @@ def prepare_item_metadata(item: str, content_type: str, db_id: str) -> dict:
         file_type = file_path.suffix.lower().replace(".", "")
         filename = file_path.name
         item_path = os.path.relpath(file_path, Path.cwd())
+        content_hash = None
+        try:
+            if file_path.exists():
+                content_hash = calculate_content_hash(file_path)
+        except Exception as exc:  # noqa: BLE001
+            logger.warning(f"Failed to calculate content hash for {file_path}: {exc}")
     else:  # URL
         file_id = f"url_{hashstr(item + str(time.time()), 6)}"
         file_type = "url"
         filename = f"webpage_{hashstr(item, 6)}.md"
         item_path = item
+        content_hash = None
 
     return {
         "database_id": db_id,

@@ -124,6 +158,7 @@
         "status": "processing",
         "created_at": time.time(),
         "file_id": file_id,
+        "content_hash": content_hash,
     }
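
Since calculate_content_hash accepts both raw bytes and a file path, the two input modes should agree on the digest for identical content. A small self-check using only the standard library plus the new helper:

import tempfile
from pathlib import Path

from src.knowledge.utils import calculate_content_hash

payload = b"hello knowledge base"
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(payload)
    path = Path(tmp.name)

# Bytes input and file-path input must produce the same digest.
assert calculate_content_hash(payload) == calculate_content_hash(path)
path.unlink()  # remove the temporary file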
