Fix ingestion for non-multimodal case

pamelafox · pamelafox · commit 8bc3fec0e792 · 2025-09-08T17:52:51.000-07:00
diff --git a/app/backend/prepdocslib/searchmanager.py b/app/backend/prepdocslib/searchmanager.py
@@ -83,6 +83,7 @@ async def create_index(self):
         logger.info("Checking whether search index %s exists...", self.search_info.index_name)
 
         async with self.search_info.create_search_index_client() as search_index_client:
+
             embedding_field = None
             images_field = None
             text_vector_search_profile = None
@@ -230,12 +231,7 @@ async def create_index(self):
                         type="Edm.String",
                         analyzer_name=self.search_analyzer_name,
                     ),
-                    SimpleField(
-                        name="category",
-                        type="Edm.String",
-                        filterable=True,
-                        facetable=True,
-                    ),
+                    SimpleField(name="category", type="Edm.String", filterable=True, facetable=True),
                     SimpleField(
                         name="sourcepage",
                         type="Edm.String",
@@ -280,10 +276,7 @@ async def create_index(self):
                 vector_algorithms: list[VectorSearchAlgorithmConfiguration] = []
                 vector_compressions: list[VectorSearchCompression] = []
                 if embedding_field:
-                    logger.info(
-                        "Including %s field for text vectors in new index",
-                        embedding_field.name,
-                    )
+                    logger.info("Including %s field for text vectors in new index", embedding_field.name)
                     fields.append(embedding_field)
                     if text_vectorizer is not None:
                         vectorizers.append(text_vectorizer)
@@ -298,10 +291,7 @@ async def create_index(self):
                     vector_compressions.append(text_vector_compression)
 
                 if images_field:
-                    logger.info(
-                        "Including %s field for image descriptions and vectors in new index",
-                        images_field.name,
-                    )
+                    logger.info("Including %s field for image descriptions and vectors in new index", images_field.name)
                     fields.append(images_field)
                     if image_vector_search_profile is None or image_vector_algorithm is None:
                         raise ValueError("Image search profile and algorithm must be set")
@@ -338,10 +328,7 @@ async def create_index(self):
                 logger.info("Search index %s already exists", self.search_info.index_name)
                 existing_index = await search_index_client.get_index(self.search_info.index_name)
                 if not any(field.name == "storageUrl" for field in existing_index.fields):
-                    logger.info(
-                        "Adding storageUrl field to index %s",
-                        self.search_info.index_name,
-                    )
+                    logger.info("Adding storageUrl field to index %s", self.search_info.index_name)
                     existing_index.fields.append(
                         SimpleField(
                             name="storageUrl",
@@ -406,10 +393,7 @@ async def create_index(self):
 
                 if existing_index.semantic_search:
                     if not existing_index.semantic_search.default_configuration_name:
-                        logger.info(
-                            "Adding default semantic configuration to index %s",
-                            self.search_info.index_name,
-                        )
+                        logger.info("Adding default semantic configuration to index %s", self.search_info.index_name)
                         existing_index.semantic_search.default_configuration_name = "default"
 
                     if existing_index.semantic_search.configurations:
@@ -419,10 +403,7 @@ async def create_index(self):
                             and existing_semantic_config.prioritized_fields.title_field
                             and not existing_semantic_config.prioritized_fields.title_field.field_name == "sourcepage"
                         ):
-                            logger.info(
-                                "Updating semantic configuration for index %s",
-                                self.search_info.index_name,
-                            )
+                            logger.info("Updating semantic configuration for index %s", self.search_info.index_name)
                             existing_semantic_config.prioritized_fields.title_field = SemanticField(
                                 field_name="sourcepage"
                             )
@@ -432,10 +413,7 @@ async def create_index(self):
                     or len(existing_index.vector_search.vectorizers) == 0
                 ):
                     if self.embeddings is not None and isinstance(self.embeddings, AzureOpenAIEmbeddingService):
-                        logger.info(
-                            "Adding vectorizer to search index %s",
-                            self.search_info.index_name,
-                        )
+                        logger.info("Adding vectorizer to search index %s", self.search_info.index_name)
                         existing_index.vector_search.vectorizers = [
                             AzureOpenAIVectorizer(
                                 vectorizer_name=f"{self.search_info.index_name}-vectorizer",
@@ -467,8 +445,7 @@ async def create_agent(self):
                         name=self.search_info.agent_name,
                         target_indexes=[
                             KnowledgeAgentTargetIndex(
-                                index_name=self.search_info.index_name,
-                                default_include_reference_source_data=True,
+                                index_name=self.search_info.index_name, default_include_reference_source_data=True
                             )
                         ],
                         models=[
@@ -494,35 +471,33 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
 
         async with self.search_info.create_search_client() as search_client:
             for batch_index, batch in enumerate(section_batches):
-                image_fields = {}
-                if self.search_images:
-                    image_fields = {
-                        "images": [
-                            {
-                                "url": image.url,
-                                "description": image.description,
-                                "boundingbox": image.bbox,
-                                "embedding": image.embedding,
-                            }
-                            for section in batch
-                            for image in section.chunk.images
-                        ]
-                    }
-                documents = [
-                    {
+                documents = []
+                for section_index, section in enumerate(batch):
+                    image_fields = {}
+                    if self.search_images:
+                        image_fields = {
+                            "images": [
+                                {
+                                    "url": image.url,
+                                    "description": image.description,
+                                    "boundingbox": image.bbox,
+                                    "embedding": image.embedding,
+                                }
+                                for image in section.chunk.images
+                            ]
+                        }
+                    document = {
                         "id": f"{section.content.filename_to_id()}-page-{section_index + batch_index * MAX_BATCH_SIZE}",
                         "content": section.chunk.text,
                         "category": section.category,
                         "sourcepage": BlobManager.sourcepage_from_file_page(
-                            filename=section.content.filename(),
-                            page=section.chunk.page_num,
+                            filename=section.content.filename(), page=section.chunk.page_num
                         ),
                         "sourcefile": section.content.filename(),
                         **image_fields,
                         **section.content.acls,
                     }
-                    for section_index, section in enumerate(batch)
-                ]
+                    documents.append(document)
                 if url:
                     for document in documents:
                         document["storageUrl"] = url
@@ -544,9 +519,7 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
 
     async def remove_content(self, path: Optional[str] = None, only_oid: Optional[str] = None):
         logger.info(
-            "Removing sections from '{%s or '<all>'}' from search index '%s'",
-            path,
-            self.search_info.index_name,
+            "Removing sections from '{%s or '<all>'}' from search index '%s'", path, self.search_info.index_name
         )
         async with self.search_info.create_search_client() as search_client:
             while True:
@@ -558,10 +531,7 @@ async def remove_content(self, path: Optional[str] = None, only_oid: Optional[st
                     filter = f"sourcefile eq '{path_for_filter}'"
                 max_results = 1000
                 result = await search_client.search(
-                    search_text="",
-                    filter=filter,
-                    top=max_results,
-                    include_total_count=True,
+                    search_text="", filter=filter, top=max_results, include_total_count=True
                 )
                 result_count = await result.get_count()
                 if result_count == 0:
diff --git a/tests/test_searchmanager.py b/tests/test_searchmanager.py
@@ -316,6 +316,35 @@ async def mock_upload_documents(self, documents):
     ]
 
 
+@pytest.mark.asyncio
+async def test_update_content_no_images_when_disabled(monkeypatch, search_info):
+    """Ensure no 'images' field is added when search_images is False (baseline case without any images)."""
+
+    documents_uploaded: list[dict] = []  # collects every document handed to the mocked upload
+
+    async def mock_upload_documents(self, documents):
+        documents_uploaded.extend(documents)  # record the batch instead of uploading it
+
+    monkeypatch.setattr(SearchClient, "upload_documents", mock_upload_documents)
+
+    manager = SearchManager(search_info, search_images=False)  # image indexing disabled: the case under test
+
+    test_io = io.BytesIO(b"test file")
+    test_io.name = "test/foo.pdf"
+    file = File(test_io)
+
+    section = Section(
+        chunk=Chunk(page_num=0, text="chunk text"),  # chunk built without an images argument
+        content=file,
+        category="test",
+    )
+
+    await manager.update_content([section])
+
+    assert len(documents_uploaded) == 1, "Exactly one document should be uploaded"
+    assert "images" not in documents_uploaded[0], "'images' field should not be present when search_images is False"
+
+
 class AsyncSearchResultsIterator:
     def __init__(self, results):
         self.results = results