Skip to content

Commit 8bc3fec

Browse files
committed
Fix ingestion for non-multimodal case
1 parent 180fb2d commit 8bc3fec

File tree

2 files changed

+58
-59
lines changed

2 files changed

+58
-59
lines changed

app/backend/prepdocslib/searchmanager.py

Lines changed: 29 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -83,6 +83,7 @@ async def create_index(self):
8383
logger.info("Checking whether search index %s exists...", self.search_info.index_name)
8484

8585
async with self.search_info.create_search_index_client() as search_index_client:
86+
8687
embedding_field = None
8788
images_field = None
8889
text_vector_search_profile = None
@@ -230,12 +231,7 @@ async def create_index(self):
230231
type="Edm.String",
231232
analyzer_name=self.search_analyzer_name,
232233
),
233-
SimpleField(
234-
name="category",
235-
type="Edm.String",
236-
filterable=True,
237-
facetable=True,
238-
),
234+
SimpleField(name="category", type="Edm.String", filterable=True, facetable=True),
239235
SimpleField(
240236
name="sourcepage",
241237
type="Edm.String",
@@ -280,10 +276,7 @@ async def create_index(self):
280276
vector_algorithms: list[VectorSearchAlgorithmConfiguration] = []
281277
vector_compressions: list[VectorSearchCompression] = []
282278
if embedding_field:
283-
logger.info(
284-
"Including %s field for text vectors in new index",
285-
embedding_field.name,
286-
)
279+
logger.info("Including %s field for text vectors in new index", embedding_field.name)
287280
fields.append(embedding_field)
288281
if text_vectorizer is not None:
289282
vectorizers.append(text_vectorizer)
@@ -298,10 +291,7 @@ async def create_index(self):
298291
vector_compressions.append(text_vector_compression)
299292

300293
if images_field:
301-
logger.info(
302-
"Including %s field for image descriptions and vectors in new index",
303-
images_field.name,
304-
)
294+
logger.info("Including %s field for image descriptions and vectors in new index", images_field.name)
305295
fields.append(images_field)
306296
if image_vector_search_profile is None or image_vector_algorithm is None:
307297
raise ValueError("Image search profile and algorithm must be set")
@@ -338,10 +328,7 @@ async def create_index(self):
338328
logger.info("Search index %s already exists", self.search_info.index_name)
339329
existing_index = await search_index_client.get_index(self.search_info.index_name)
340330
if not any(field.name == "storageUrl" for field in existing_index.fields):
341-
logger.info(
342-
"Adding storageUrl field to index %s",
343-
self.search_info.index_name,
344-
)
331+
logger.info("Adding storageUrl field to index %s", self.search_info.index_name)
345332
existing_index.fields.append(
346333
SimpleField(
347334
name="storageUrl",
@@ -406,10 +393,7 @@ async def create_index(self):
406393

407394
if existing_index.semantic_search:
408395
if not existing_index.semantic_search.default_configuration_name:
409-
logger.info(
410-
"Adding default semantic configuration to index %s",
411-
self.search_info.index_name,
412-
)
396+
logger.info("Adding default semantic configuration to index %s", self.search_info.index_name)
413397
existing_index.semantic_search.default_configuration_name = "default"
414398

415399
if existing_index.semantic_search.configurations:
@@ -419,10 +403,7 @@ async def create_index(self):
419403
and existing_semantic_config.prioritized_fields.title_field
420404
and not existing_semantic_config.prioritized_fields.title_field.field_name == "sourcepage"
421405
):
422-
logger.info(
423-
"Updating semantic configuration for index %s",
424-
self.search_info.index_name,
425-
)
406+
logger.info("Updating semantic configuration for index %s", self.search_info.index_name)
426407
existing_semantic_config.prioritized_fields.title_field = SemanticField(
427408
field_name="sourcepage"
428409
)
@@ -432,10 +413,7 @@ async def create_index(self):
432413
or len(existing_index.vector_search.vectorizers) == 0
433414
):
434415
if self.embeddings is not None and isinstance(self.embeddings, AzureOpenAIEmbeddingService):
435-
logger.info(
436-
"Adding vectorizer to search index %s",
437-
self.search_info.index_name,
438-
)
416+
logger.info("Adding vectorizer to search index %s", self.search_info.index_name)
439417
existing_index.vector_search.vectorizers = [
440418
AzureOpenAIVectorizer(
441419
vectorizer_name=f"{self.search_info.index_name}-vectorizer",
@@ -467,8 +445,7 @@ async def create_agent(self):
467445
name=self.search_info.agent_name,
468446
target_indexes=[
469447
KnowledgeAgentTargetIndex(
470-
index_name=self.search_info.index_name,
471-
default_include_reference_source_data=True,
448+
index_name=self.search_info.index_name, default_include_reference_source_data=True
472449
)
473450
],
474451
models=[
@@ -494,35 +471,33 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
494471

495472
async with self.search_info.create_search_client() as search_client:
496473
for batch_index, batch in enumerate(section_batches):
497-
image_fields = {}
498-
if self.search_images:
499-
image_fields = {
500-
"images": [
501-
{
502-
"url": image.url,
503-
"description": image.description,
504-
"boundingbox": image.bbox,
505-
"embedding": image.embedding,
506-
}
507-
for section in batch
508-
for image in section.chunk.images
509-
]
510-
}
511-
documents = [
512-
{
474+
documents = []
475+
for section_index, section in enumerate(batch):
476+
image_fields = {}
477+
if self.search_images:
478+
image_fields = {
479+
"images": [
480+
{
481+
"url": image.url,
482+
"description": image.description,
483+
"boundingbox": image.bbox,
484+
"embedding": image.embedding,
485+
}
486+
for image in section.chunk.images
487+
]
488+
}
489+
document = {
513490
"id": f"{section.content.filename_to_id()}-page-{section_index + batch_index * MAX_BATCH_SIZE}",
514491
"content": section.chunk.text,
515492
"category": section.category,
516493
"sourcepage": BlobManager.sourcepage_from_file_page(
517-
filename=section.content.filename(),
518-
page=section.chunk.page_num,
494+
filename=section.content.filename(), page=section.chunk.page_num
519495
),
520496
"sourcefile": section.content.filename(),
521497
**image_fields,
522498
**section.content.acls,
523499
}
524-
for section_index, section in enumerate(batch)
525-
]
500+
documents.append(document)
526501
if url:
527502
for document in documents:
528503
document["storageUrl"] = url
@@ -544,9 +519,7 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
544519

545520
async def remove_content(self, path: Optional[str] = None, only_oid: Optional[str] = None):
546521
logger.info(
547-
"Removing sections from '{%s or '<all>'}' from search index '%s'",
548-
path,
549-
self.search_info.index_name,
522+
"Removing sections from '{%s or '<all>'}' from search index '%s'", path, self.search_info.index_name
550523
)
551524
async with self.search_info.create_search_client() as search_client:
552525
while True:
@@ -558,10 +531,7 @@ async def remove_content(self, path: Optional[str] = None, only_oid: Optional[st
558531
filter = f"sourcefile eq '{path_for_filter}'"
559532
max_results = 1000
560533
result = await search_client.search(
561-
search_text="",
562-
filter=filter,
563-
top=max_results,
564-
include_total_count=True,
534+
search_text="", filter=filter, top=max_results, include_total_count=True
565535
)
566536
result_count = await result.get_count()
567537
if result_count == 0:

tests/test_searchmanager.py

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -316,6 +316,35 @@ async def mock_upload_documents(self, documents):
316316
]
317317

318318

319+
@pytest.mark.asyncio
320+
async def test_update_content_no_images_when_disabled(monkeypatch, search_info):
321+
"""Ensure no 'images' field is added when search_images is False (baseline case without any images)."""
322+
323+
documents_uploaded: list[dict] = []
324+
325+
async def mock_upload_documents(self, documents):
326+
documents_uploaded.extend(documents)
327+
328+
monkeypatch.setattr(SearchClient, "upload_documents", mock_upload_documents)
329+
330+
manager = SearchManager(search_info, search_images=False)
331+
332+
test_io = io.BytesIO(b"test file")
333+
test_io.name = "test/foo.pdf"
334+
file = File(test_io)
335+
336+
section = Section(
337+
chunk=Chunk(page_num=0, text="chunk text"),
338+
content=file,
339+
category="test",
340+
)
341+
342+
await manager.update_content([section])
343+
344+
assert len(documents_uploaded) == 1, "Exactly one document should be uploaded"
345+
assert "images" not in documents_uploaded[0], "'images' field should not be present when search_images is False"
346+
347+
319348
class AsyncSearchResultsIterator:
320349
def __init__(self, results):
321350
self.results = results

0 commit comments

Comments (0)