
Commit bc8b377

fix(vector-io): handle missing document_id in insert_chunks (llamastack#3521)
Fixed a KeyError raised when chunks don't have `document_id` in `metadata` or `chunk_metadata`. Updated the router logging to extract `document_id` safely, and updated RAG memory to handle the different `document_id` locations. Added a unit test for the missing-`document_id` scenarios. Fixes issue llamastack#3494, where `/v1/vector-io/insert` would crash with a KeyError.

# What does this PR do?

Fixes a KeyError crash in `/v1/vector-io/insert` when chunks are missing `document_id` fields. The API was failing even though `document_id` is optional according to the schema.

Closes llamastack#3494

## Test Plan

**Before fix:**
- POST to `/v1/vector-io/insert` with chunks → 500 KeyError
- Happened regardless of where `document_id` was placed

**After fix:**
- Same request works fine → 200 OK
- Tested with Postman using FAISS backend
- Added unit test covering missing `document_id` scenarios
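For reference, a minimal reproduction sketch of the failing request (not part of this commit). It assumes a Llama Stack server at `http://localhost:8321` and an already-registered vector DB id `my-db` (both placeholders), and it infers the request body shape from the `insert_chunks` router signature shown in the diff below.

```python
import requests

# Placeholder server URL and vector DB id; adjust for your deployment.
BASE_URL = "http://localhost:8321"

payload = {
    "vector_db_id": "my-db",
    "chunks": [
        # document_id present in metadata -- always worked
        {"content": "has doc_id", "metadata": {"document_id": "doc-1"}},
        # no document_id anywhere -- previously crashed with a KeyError (500)
        {"content": "no doc_id", "metadata": {"source": "test"}},
    ],
}

resp = requests.post(f"{BASE_URL}/v1/vector-io/insert", json=payload)
print(resp.status_code)  # 500 before this fix, 200 OK after
```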
1 parent e9b4278 commit bc8b377

File tree: 4 files changed, +51 −2 lines


llama_stack/apis/vector_io/vector_io.py

Lines changed: 16 additions & 0 deletions
@@ -93,6 +93,22 @@ def chunk_id(self) -> str:
 
         return generate_chunk_id(str(uuid.uuid4()), str(self.content))
 
+    @property
+    def document_id(self) -> str | None:
+        """Returns the document_id from either metadata or chunk_metadata, with metadata taking precedence."""
+        # Check metadata first (takes precedence)
+        doc_id = self.metadata.get("document_id")
+        if doc_id is not None:
+            if not isinstance(doc_id, str):
+                raise TypeError(f"metadata['document_id'] must be a string, got {type(doc_id).__name__}: {doc_id!r}")
+            return doc_id
+
+        # Fall back to chunk_metadata if available (Pydantic ensures type safety)
+        if self.chunk_metadata is not None:
+            return self.chunk_metadata.document_id
+
+        return None
 
 @json_schema_type
 class QueryChunksResponse(BaseModel):
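As an illustrative sketch (not part of the diff), here is how the new `document_id` property resolves values; it assumes `Chunk` and `ChunkMetadata` can be constructed directly, as in the unit test further down.

```python
from llama_stack.apis.vector_io import Chunk, ChunkMetadata

# metadata takes precedence over chunk_metadata
c1 = Chunk(
    content="a",
    metadata={"document_id": "doc-meta"},
    chunk_metadata=ChunkMetadata(document_id="doc-chunk"),
)
assert c1.document_id == "doc-meta"

# falls back to chunk_metadata when metadata has no document_id
c2 = Chunk(content="b", chunk_metadata=ChunkMetadata(document_id="doc-3"))
assert c2.document_id == "doc-3"

# returns None when neither location carries a document_id
assert Chunk(content="c").document_id is None
```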

llama_stack/core/routers/vector_io.py

Lines changed: 3 additions & 1 deletion
@@ -93,8 +93,10 @@ async def insert_chunks(
         chunks: list[Chunk],
         ttl_seconds: int | None = None,
     ) -> None:
+        doc_ids = [chunk.document_id for chunk in chunks[:3]]
         logger.debug(
-            f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, ttl_seconds={ttl_seconds}, chunk_ids={[chunk.metadata['document_id'] for chunk in chunks[:3]]}{' and more...' if len(chunks) > 3 else ''}",
+            f"VectorIORouter.insert_chunks: {vector_db_id}, {len(chunks)} chunks, "
+            f"ttl_seconds={ttl_seconds}, chunk_ids={doc_ids}{' and more...' if len(chunks) > 3 else ''}"
         )
         provider = await self.routing_table.get_provider_impl(vector_db_id)
         return await provider.insert_chunks(vector_db_id, chunks, ttl_seconds)

llama_stack/providers/inline/tool_runtime/rag/memory.py

Lines changed: 1 addition & 1 deletion
@@ -272,7 +272,7 @@ async def query(
         return RAGQueryResult(
             content=picked,
             metadata={
-                "document_ids": [c.metadata["document_id"] for c in chunks[: len(picked)]],
+                "document_ids": [c.document_id for c in chunks[: len(picked)]],
                 "chunks": [c.content for c in chunks[: len(picked)]],
                 "scores": scores[: len(picked)],
                 "vector_db_ids": [c.metadata["vector_db_id"] for c in chunks[: len(picked)]],

tests/unit/providers/vector_io/test_vector_io_openai_vector_stores.py

Lines changed: 31 additions & 0 deletions
@@ -128,6 +128,37 @@ async def test_insert_chunks_missing_db_raises(vector_io_adapter):
         await vector_io_adapter.insert_chunks("db_not_exist", [])
 
 
+async def test_insert_chunks_with_missing_document_id(vector_io_adapter):
+    """Ensure no KeyError when document_id is missing or in different places."""
+    from llama_stack.apis.vector_io import Chunk, ChunkMetadata
+
+    fake_index = AsyncMock()
+    vector_io_adapter.cache["db1"] = fake_index
+
+    # Various document_id scenarios that shouldn't crash
+    chunks = [
+        Chunk(content="has doc_id in metadata", metadata={"document_id": "doc-1"}),
+        Chunk(content="no doc_id anywhere", metadata={"source": "test"}),
+        Chunk(content="doc_id in chunk_metadata", chunk_metadata=ChunkMetadata(document_id="doc-3")),
+    ]
+
+    # Should work without KeyError
+    await vector_io_adapter.insert_chunks("db1", chunks)
+    fake_index.insert_chunks.assert_awaited_once()
+
+
+async def test_document_id_with_invalid_type_raises_error():
+    """Ensure TypeError is raised when document_id is not a string."""
+    from llama_stack.apis.vector_io import Chunk
+
+    # Integer document_id should raise TypeError
+    chunk = Chunk(content="test", metadata={"document_id": 12345})
+    with pytest.raises(TypeError) as exc_info:
+        _ = chunk.document_id
+    assert "metadata['document_id'] must be a string" in str(exc_info.value)
+    assert "got int" in str(exc_info.value)
+
+
 async def test_query_chunks_calls_underlying_index_and_returns(vector_io_adapter):
     expected = QueryChunksResponse(chunks=[Chunk(content="c1")], scores=[0.1])
     fake_index = AsyncMock(query_chunks=AsyncMock(return_value=expected))
