Skip to content

Commit 1700628

Browse files
supreme0597xerrors
authored andcommitted
feat: 完善重新分块功能,优化代码逻辑,保存请求参数
1 parent b7ba646 commit 1700628

File tree

6 files changed

+101
-79
lines changed

6 files changed

+101
-79
lines changed

src/knowledge/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,7 @@ def get_database_info(self, db_id: str) -> dict | None:
293293
"type": file_info.get("file_type", ""),
294294
"status": file_info.get("status", "done"),
295295
"created_at": created_at,
296+
"processing_params": file_info.get("processing_params", None),
296297
}
297298

298299
# 按创建时间倒序排序文件列表

src/knowledge/implementations/chroma.py

Lines changed: 22 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,6 @@ def __init__(self, work_dir: str, **kwargs):
4747

4848
# 存储集合映射 {db_id: collection}
4949
self.collections: dict[str, Any] = {}
50-
51-
# 元数据锁
52-
self._metadata_lock = asyncio.Lock()
53-
5450
logger.info("ChromaKB initialized")
5551

5652
@property
@@ -290,6 +286,7 @@ async def update_content(self, db_id: str, file_ids: list[str], params: dict | N
290286

291287
try:
292288
# 更新状态为处理中
289+
self.files_meta[file_id]["processing_params"] = params.copy()
293290
self.files_meta[file_id]["status"] = "processing"
294291
self._save_metadata()
295292

@@ -361,24 +358,6 @@ async def update_content(self, db_id: str, file_ids: list[str], params: dict | N
361358

362359
return processed_items_info
363360

364-
async def delete_file_chunks_only(self, db_id: str, file_id: str) -> None:
365-
"""仅删除文件的chunks数据,保留元数据(用于更新操作)"""
366-
collection = await self._get_chroma_collection(db_id)
367-
368-
if collection:
369-
try:
370-
# 查找所有相关的chunks
371-
results = collection.get(where={"full_doc_id": file_id}, include=["metadatas"])
372-
373-
# 删除所有相关chunks
374-
if results and results.get("ids"):
375-
collection.delete(ids=results["ids"])
376-
logger.info(f"Deleted {len(results['ids'])} chunks for file {file_id}")
377-
378-
except Exception as e:
379-
logger.error(f"Error deleting file {file_id} from ChromaDB: {e}")
380-
# 注意:这里不删除 files_meta[file_id],保留元数据用于后续操作
381-
382361
async def aquery(self, query_text: str, db_id: str, **kwargs) -> list[dict]:
383362
"""异步查询知识库"""
384363
collection = await self._get_chroma_collection(db_id)
@@ -473,16 +452,32 @@ async def aquery(self, query_text: str, db_id: str, **kwargs) -> list[dict]:
473452
logger.error(f"ChromaDB query error: {e}, {traceback.format_exc()}")
474453
return []
475454

455+
async def delete_file_chunks_only(self, db_id: str, file_id: str) -> None:
456+
"""仅删除文件的chunks数据,保留元数据(用于更新操作)"""
457+
collection = await self._get_chroma_collection(db_id)
458+
if collection:
459+
try:
460+
# 查找所有相关的chunks
461+
results = collection.get(where={"full_doc_id": file_id}, include=["metadatas"])
462+
463+
# 删除所有相关chunks
464+
if results and results.get("ids"):
465+
collection.delete(ids=results["ids"])
466+
logger.info(f"Deleted {len(results['ids'])} chunks for file {file_id}")
467+
468+
except Exception as e:
469+
logger.error(f"Error deleting file {file_id} from ChromaDB: {e}")
470+
# 注意:这里不删除 files_meta[file_id],保留元数据用于后续操作
471+
476472
async def delete_file(self, db_id: str, file_id: str) -> None:
477473
"""删除文件(包括元数据)"""
478474
# 先删除 ChromaDB 中的 chunks 数据
479475
await self.delete_file_chunks_only(db_id, file_id)
480476

481-
# 使用锁确保元数据操作的原子性
482-
async with self._metadata_lock:
483-
if file_id in self.files_meta:
484-
del self.files_meta[file_id]
485-
self._save_metadata()
477+
# 删除文件记录
478+
if file_id in self.files_meta:
479+
del self.files_meta[file_id]
480+
self._save_metadata()
486481

487482
async def get_file_basic_info(self, db_id: str, file_id: str) -> dict:
488483
"""获取文件基本信息(仅元数据)"""

src/knowledge/implementations/lightrag.py

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -32,9 +32,6 @@ def __init__(self, work_dir: str, **kwargs):
3232
# 存储 LightRAG 实例映射 {db_id: LightRAG}
3333
self.instances: dict[str, LightRAG] = {}
3434

35-
# 元数据锁
36-
self._metadata_lock = asyncio.Lock()
37-
3835
# 设置 LightRAG 日志
3936
log_dir = os.path.join(work_dir, "logs", "lightrag")
4037
os.makedirs(log_dir, exist_ok=True)
@@ -306,6 +303,7 @@ async def update_content(self, db_id: str, file_ids: list[str], params: dict | N
306303

307304
try:
308305
# 更新状态为处理中
306+
self.files_meta[file_id]["processing_params"] = params.copy()
309307
self.files_meta[file_id]["status"] = "processing"
310308
self._save_metadata()
311309

@@ -357,19 +355,6 @@ async def update_content(self, db_id: str, file_ids: list[str], params: dict | N
357355

358356
return processed_items_info
359357

360-
async def delete_file_chunks_only(self, db_id: str, file_id: str) -> None:
361-
"""仅删除文件的chunks数据,保留元数据(用于更新操作)"""
362-
rag = await self._get_lightrag_instance(db_id)
363-
364-
if rag:
365-
try:
366-
# 使用 LightRAG 删除文档
367-
await rag.adelete_by_doc_id(file_id)
368-
logger.info(f"Deleted chunks for file {file_id} from LightRAG")
369-
except Exception as e:
370-
logger.error(f"Error deleting file {file_id} from LightRAG: {e}")
371-
# 注意:这里不删除 files_meta[file_id],保留元数据用于后续操作
372-
373358
async def aquery(self, query_text: str, db_id: str, **kwargs) -> str:
374359
"""异步查询知识库"""
375360
rag = await self._get_lightrag_instance(db_id)
@@ -395,16 +380,27 @@ async def aquery(self, query_text: str, db_id: str, **kwargs) -> str:
395380
logger.error(f"Query error: {e}, {traceback.format_exc()}")
396381
return ""
397382

383+
async def delete_file_chunks_only(self, db_id: str, file_id: str) -> None:
384+
"""仅删除文件的chunks数据,保留元数据(用于更新操作)"""
385+
rag = await self._get_lightrag_instance(db_id)
386+
if rag:
387+
try:
388+
# 使用 LightRAG 删除文档
389+
await rag.adelete_by_doc_id(file_id)
390+
logger.info(f"Deleted chunks for file {file_id} from LightRAG")
391+
except Exception as e:
392+
logger.error(f"Error deleting file {file_id} from LightRAG: {e}")
393+
# 注意:这里不删除 files_meta[file_id],保留元数据用于后续操作
394+
398395
async def delete_file(self, db_id: str, file_id: str) -> None:
399396
"""删除文件(包括元数据)"""
400397
# 先删除 LightRAG 中的 chunks 数据
401398
await self.delete_file_chunks_only(db_id, file_id)
402399

403-
# 使用锁确保元数据操作的原子性
404-
async with self._metadata_lock:
405-
if file_id in self.files_meta:
406-
del self.files_meta[file_id]
407-
self._save_metadata()
400+
# 删除文件记录
401+
if file_id in self.files_meta:
402+
del self.files_meta[file_id]
403+
self._save_metadata()
408404

409405
async def get_file_basic_info(self, db_id: str, file_id: str) -> dict:
410406
"""获取文件基本信息(仅元数据)"""

src/knowledge/implementations/milvus.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,7 @@ async def update_content(self, db_id: str, file_ids: list[str], params: dict | N
332332
try:
333333
# 更新状态为处理中
334334
async with self._metadata_lock:
335+
self.files_meta[file_id]["processing_params"] = params.copy()
335336
self.files_meta[file_id]["status"] = "processing"
336337
self._save_metadata()
337338

web/src/components/FileTable.vue

Lines changed: 20 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,8 @@ const selectedRowKeys = computed({
203203
204204
// 重新分块参数配置相关
205205
const rechunkModalVisible = ref(false);
206-
const rechunkModalLoading = ref(false);
206+
const rechunkModalLoading = computed(() => store.state.chunkLoading);
207+
207208
const rechunkParams = ref({
208209
chunk_size: 1000,
209210
chunk_overlap: 200,
@@ -448,18 +449,17 @@ const handleDownloadFile = async (record) => {
448449
};
449450
450451
const handleRechunkFile = async (record) => {
451-
const dbId = store.databaseId;
452-
if (!dbId) {
453-
console.error('无法获取数据库ID,数据库ID:', store.databaseId, '记录:', record);
454-
message.error('无法获取数据库ID,请刷新页面后重试');
455-
return;
456-
}
457-
458452
try {
459453
// 设置当前重新分块的文件ID
460454
currentRechunkFileIds.value = [record.file_id];
461455
isBatchRechunk.value = false;
462456
457+
if (record?.processing_params) {
458+
rechunkParams.value = {
459+
...record?.processing_params
460+
};
461+
}
462+
463463
// 显示参数配置模态框
464464
rechunkModalVisible.value = true;
465465
} catch (error) {
@@ -470,50 +470,39 @@ const handleRechunkFile = async (record) => {
470470
471471
// 重新分块确认
472472
const handleRechunkConfirm = async () => {
473-
const dbId = store.databaseId;
474-
if (!dbId) {
475-
console.error('无法获取数据库ID,数据库ID:', store.databaseId);
476-
message.error('无法获取数据库ID,请刷新页面后重试');
477-
return;
478-
}
479-
480-
if (currentRechunkFileIds.value.length === 0) {
481-
message.warning('请选择要重新分块的文件');
482-
return;
483-
}
484-
485-
rechunkModalLoading.value = true;
486-
487473
try {
488474
// 调用 rechunks 接口
489-
const result = await documentApi.rechunksDocuments(dbId, currentRechunkFileIds.value, rechunkParams.value);
490-
491-
if (result.status === 'queued') {
492-
message.success('重新分块任务已提交,请在任务中心查看进度');
493-
// 刷新文件列表
494-
store.getDatabaseInfo(undefined, true);
475+
const result = await store.rechunksFiles({fileIds: currentRechunkFileIds.value, params: rechunkParams.value});
476+
if (result) {
477+
currentRechunkFileIds.value = [];
495478
// 清空选择
496479
if (isBatchRechunk.value) {
497480
selectedRowKeys.value = [];
498481
}
499482
// 关闭模态框
500483
rechunkModalVisible.value = false;
484+
485+
// 重置参数为默认值
486+
rechunkParams.value = {
487+
chunk_size: 1000,
488+
chunk_overlap: 200,
489+
use_qa_split: false,
490+
qa_separator: '\n\n\n'
491+
};
501492
} else {
502493
message.error(`重新分块失败: ${result.message}`);
503494
}
504495
} catch (error) {
505496
console.error('重新分块失败:', error);
506497
const errorMessage = error.message || '重新分块失败,请稍后重试';
507498
message.error(errorMessage);
508-
} finally {
509-
rechunkModalLoading.value = false;
510499
}
511500
};
512501
513502
// 重新分块取消
514503
const handleRechunkCancel = () => {
515504
rechunkModalVisible.value = false;
516-
rechunkModalLoading.value = false;
505+
// rechunkModalLoading.value = false;
517506
currentRechunkFileIds.value = [];
518507
isBatchRechunk.value = false;
519508
// 重置参数为默认值

web/src/stores/database.js

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,45 @@ export const useDatabaseStore = defineStore('database', () => {
256256
}
257257
}
258258

259+
async function rechunksFiles({ fileIds, params }) {
260+
if (fileIds.length === 0) {
261+
message.error('请选择要重新分块的文件!');
262+
return;
263+
}
264+
265+
state.chunkLoading = true;
266+
try {
267+
const data = await documentApi.rechunksDocuments(databaseId.value, fileIds, { ...params });
268+
if (data.status === 'success' || data.status === 'queued') {
269+
enableAutoRefresh('auto');
270+
message.success(data.message || `文档已提交处理,请在任务中心查看进度`);
271+
if (data.task_id) {
272+
taskerStore.registerQueuedTask({
273+
task_id: data.task_id,
274+
name: `文档重新分块 (${databaseId.value || ''})`,
275+
task_type: 'knowledge_rechunks',
276+
message: data.message,
277+
payload: {
278+
db_id: databaseId.value,
279+
count: fileIds.length,
280+
}
281+
});
282+
}
283+
await getDatabaseInfo(undefined, true); // Skip query params when adding files
284+
return true; // Indicate success
285+
} else {
286+
message.error(data.message || '处理失败');
287+
return false;
288+
}
289+
} catch (error) {
290+
console.error(error);
291+
message.error(error.message || '处理请求失败');
292+
return false;
293+
} finally {
294+
state.chunkLoading = false;
295+
}
296+
}
297+
259298
async function openFileDetail(record) {
260299
if (record.status !== 'done') {
261300
message.error('文件未处理完成,请稍后再试');
@@ -379,6 +418,7 @@ export const useDatabaseStore = defineStore('database', () => {
379418
handleDeleteFile,
380419
handleBatchDelete,
381420
addFiles,
421+
rechunksFiles,
382422
openFileDetail,
383423
loadQueryParams,
384424

0 commit comments

Comments
 (0)