Skip to content

Commit 165dcac

Browse files
authored
Fix ingestion for case when no images field exists (#2719)
* Only set images field if multimodal is enabled
* Fix ingestion for non-multimodal case
* Update tests and snapshots
1 parent 74b66bd commit 165dcac

File tree

5 files changed

+96
-68
lines changed

5 files changed

+96
-68
lines changed

app/backend/prepdocslib/searchmanager.py

Lines changed: 18 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -471,28 +471,33 @@ async def update_content(self, sections: list[Section], url: Optional[str] = Non
471471

472472
async with self.search_info.create_search_client() as search_client:
473473
for batch_index, batch in enumerate(section_batches):
474-
documents = [
475-
{
474+
documents = []
475+
for section_index, section in enumerate(batch):
476+
image_fields = {}
477+
if self.search_images:
478+
image_fields = {
479+
"images": [
480+
{
481+
"url": image.url,
482+
"description": image.description,
483+
"boundingbox": image.bbox,
484+
"embedding": image.embedding,
485+
}
486+
for image in section.chunk.images
487+
]
488+
}
489+
document = {
476490
"id": f"{section.content.filename_to_id()}-page-{section_index + batch_index * MAX_BATCH_SIZE}",
477491
"content": section.chunk.text,
478492
"category": section.category,
479493
"sourcepage": BlobManager.sourcepage_from_file_page(
480494
filename=section.content.filename(), page=section.chunk.page_num
481495
),
482496
"sourcefile": section.content.filename(),
483-
"images": [
484-
{
485-
"url": image.url,
486-
"description": image.description,
487-
"boundingbox": image.bbox,
488-
"embedding": image.embedding,
489-
}
490-
for image in section.chunk.images
491-
],
497+
**image_fields,
492498
**section.content.acls,
493499
}
494-
for section_index, section in enumerate(batch)
495-
]
500+
documents.append(document)
496501
if url:
497502
for document in documents:
498503
document["storageUrl"] = url
-124 KB
Binary file not shown.

0 commit comments

Comments
 (0)