
Commit 17d41f1

Merge pull request #1736 from ModelEngine-Group/xyc/kb_chunk_preview
2 parents: 7ce348f + b2b16a4

File tree: 15 files changed, +1098 −27 lines

backend/apps/vectordatabase_app.py

Lines changed: 31 additions & 0 deletions
@@ -3,6 +3,7 @@
 from typing import Any, Dict, List, Optional
 
 from fastapi import APIRouter, Body, Depends, Header, HTTPException, Path, Query
+from fastapi.responses import JSONResponse
 
 from consts.model import IndexingResponse
 from nexent.vector_database.base import VectorDatabaseCore
@@ -195,3 +196,33 @@ def health_check(vdb_core: VectorDatabaseCore = Depends(get_vector_db_core)):
         return ElasticSearchService.health_check(vdb_core)
     except Exception as e:
         raise HTTPException(status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail=f"{str(e)}")
+
+
+@router.post("/{index_name}/chunks")
+def get_index_chunks(
+    index_name: str = Path(...,
+                           description="Name of the index to get chunks from"),
+    page: int = Query(
+        None, description="Page number (1-based) for pagination"),
+    page_size: int = Query(
+        None, description="Number of records per page for pagination"),
+    path_or_url: Optional[str] = Query(
+        None, description="Filter chunks by document path_or_url"),
+    vdb_core: VectorDatabaseCore = Depends(get_vector_db_core)
+):
+    """Get chunks from the specified index, with optional pagination support"""
+    try:
+        result = ElasticSearchService.get_index_chunks(
+            index_name=index_name,
+            page=page,
+            page_size=page_size,
+            path_or_url=path_or_url,
+            vdb_core=vdb_core,
+        )
+        return JSONResponse(status_code=HTTPStatus.OK, content=result)
+    except Exception as e:
+        error_msg = str(e)
+        logger.error(
+            f"Error getting chunks for index '{index_name}': {error_msg}")
+        raise HTTPException(
+            status_code=HTTPStatus.INTERNAL_SERVER_ERROR, detail=f"Error getting chunks: {error_msg}")
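For reference, a minimal client-side sketch of calling the new chunk-preview endpoint. The base URL, router prefix, and the absence of auth headers are assumptions for illustration only; adjust them to the actual deployment.

import requests

# Hypothetical base URL; the router's mount point is not shown in this diff.
BASE_URL = "http://localhost:8000"
index_name = "my_knowledge_base"

resp = requests.post(
    f"{BASE_URL}/{index_name}/chunks",
    params={
        "page": 1,        # 1-based page number (optional)
        "page_size": 20,  # records per page (optional)
        # "path_or_url": "docs/guide.pdf",  # optional per-document filter
    },
)
resp.raise_for_status()
payload = resp.json()
print(payload["total"], len(payload["chunks"]))

Note that page, page_size, and path_or_url are query parameters even though the route is a POST, so they go in params rather than the request body.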

backend/services/vectordatabase_service.py

Lines changed: 66 additions & 3 deletions
@@ -35,6 +35,9 @@
 from utils.config_utils import tenant_config_manager, get_model_name_from_config
 from utils.file_management_utils import get_all_files_status, get_file_size
 
+ALLOWED_CHUNK_FIELDS = {"filename",
+                        "path_or_url", "content", "create_time", "id"}
+
 # Configure logging
 logger = logging.getLogger("vectordatabase_service")
 
@@ -572,7 +575,8 @@ async def list_files(
                     'file_size': file_info.get('file_size', 0),
                     'create_time': int(utc_create_timestamp * 1000),
                     'status': "COMPLETED",
-                    'latest_task_id': ''
+                    'latest_task_id': '',
+                    'chunk_count': file_info.get('chunk_count', 0)
                 }
                 files.append(file_data)
 
@@ -630,7 +634,7 @@ async def list_files(
         # Initialize chunks for all files
         for file_data in files:
             file_data['chunks'] = []
-            file_data['chunk_count'] = 0
+            file_data['chunk_count'] = file_data.get('chunk_count', 0)
 
         if msearch_body:
             try:
@@ -667,7 +671,7 @@ async def list_files(
         else:
             for file_data in files:
                 file_data['chunks'] = []
-                file_data['chunk_count'] = 0
+                file_data['chunk_count'] = file_data.get('chunk_count', 0)
 
         return {"files": files}
 
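The two chunk_count changes above keep the value seeded from file_info instead of resetting it to zero before the chunk search runs. A tiny illustrative snippet (sample data made up) of the before/after behavior:

# Before: file_data['chunk_count'] = 0 wiped any pre-populated count.
# After: the seeded value survives, and 0 is only the fallback.
file_data = {"path_or_url": "docs/guide.pdf", "chunk_count": 12}
file_data["chunks"] = []
file_data["chunk_count"] = file_data.get("chunk_count", 0)
assert file_data["chunk_count"] == 12

missing = {"path_or_url": "docs/other.pdf"}
missing["chunks"] = []
missing["chunk_count"] = missing.get("chunk_count", 0)
assert missing["chunk_count"] == 0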

@@ -919,3 +923,62 @@ def get_summary(index_name: str = Path(..., description="Name of the index to ge
         except Exception as e:
             error_msg = f"Failed to get summary: {str(e)}"
             raise Exception(error_msg)
+
+    @staticmethod
+    def get_index_chunks(
+        index_name: str,
+        page: Optional[int] = None,
+        page_size: Optional[int] = None,
+        path_or_url: Optional[str] = None,
+        vdb_core: VectorDatabaseCore = Depends(get_vector_db_core),
+    ):
+        """
+        Retrieve chunk records for the specified index with optional pagination.
+
+        Args:
+            index_name: Name of the index to query
+            page: Page number (1-based) when paginating
+            page_size: Page size when paginating
+            path_or_url: Optional document filter
+            vdb_core: VectorDatabaseCore instance
+
+        Returns:
+            Dictionary containing status, chunk list, total, and pagination metadata
+        """
+        try:
+            result = vdb_core.get_index_chunks(
+                index_name,
+                page=page,
+                page_size=page_size,
+                path_or_url=path_or_url,
+            )
+            raw_chunks = result.get("chunks", [])
+            total = result.get("total", len(raw_chunks))
+            result_page = result.get("page", page)
+            result_page_size = result.get("page_size", page_size)
+
+            filtered_chunks: List[Any] = []
+            for chunk in raw_chunks:
+                if isinstance(chunk, dict):
+                    filtered_chunks.append(
+                        {
+                            field: chunk.get(field)
+                            for field in ALLOWED_CHUNK_FIELDS
+                            if field in chunk
+                        }
+                    )
+                else:
+                    filtered_chunks.append(chunk)
+
+            return {
+                "status": "success",
+                "message": f"Successfully retrieved {len(filtered_chunks)} chunks from index {index_name}",
+                "chunks": filtered_chunks,
+                "total": total,
+                "page": result_page,
+                "page_size": result_page_size
+            }
+        except Exception as e:
+            error_msg = f"Error retrieving chunks from index {index_name}: {str(e)}"
+            logger.error(error_msg)
+            raise Exception(error_msg)
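The service layer whitelists chunk fields through ALLOWED_CHUNK_FIELDS before returning them, so callers only ever see filename, path_or_url, content, create_time, and id. A standalone restatement of that filtering step, with made-up chunk data, shows what ends up in the response:

from typing import Any, Dict, List

ALLOWED_CHUNK_FIELDS = {"filename", "path_or_url", "content", "create_time", "id"}

def filter_chunk(chunk: Dict[str, Any]) -> Dict[str, Any]:
    # Keep only the whitelisted keys; anything else (e.g. embeddings) is dropped.
    return {field: chunk[field] for field in ALLOWED_CHUNK_FIELDS if field in chunk}

raw_chunks: List[Dict[str, Any]] = [
    {
        "id": "doc1_0",
        "filename": "guide.pdf",
        "path_or_url": "docs/guide.pdf",
        "content": "First chunk of text...",
        "create_time": 1700000000000,
        "embedding": [0.1, 0.2, 0.3],  # stripped by the whitelist
    }
]

chunks = [filter_chunk(c) for c in raw_chunks]
assert "embedding" not in chunks[0]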
