Skip to content

Commit 5d0a5d6

Browse files
authored
feat: Upload to AI Search in Batches (#1323)
1 parent c1cb24e commit 5d0a5d6

File tree

4 files changed

+77
-17
lines changed

4 files changed

+77
-17
lines changed

.flake8

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,4 @@
22
max-line-length = 88
33
extend-ignore = E501
44
exclude = .venv
5+
ignore = E203, W503

code/backend/batch/utilities/helpers/embedders/push_embedder.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -79,12 +79,18 @@ def __embed(
7979
for document in documents:
8080
documents_to_upload.append(self.__convert_to_search_document(document))
8181

82-
response = self.azure_search_helper.get_search_client().upload_documents(
83-
documents_to_upload
84-
)
85-
if not all([r.succeeded for r in response]):
86-
logger.error("Failed to upload documents to search index")
87-
raise Exception(response)
82+
# Upload documents (which are chunks) to the search index in batches.
if documents_to_upload:
    # The configured value may be a string (e.g. when it comes straight from
    # the environment); range() requires an integer step.
    batch_size = int(self.env_helper.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE)
    if batch_size < 1:
        # A negative step would make range() empty and silently skip every
        # upload, so reject non-positive sizes explicitly.
        raise ValueError(
            f"AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE must be a positive integer, got {batch_size}"
        )
    search_client = self.azure_search_helper.get_search_client()
    for start in range(0, len(documents_to_upload), batch_size):
        batch = documents_to_upload[start : start + batch_size]
        response = search_client.upload_documents(batch)
        # Every per-document result in the batch must report success.
        if not all(r.succeeded for r in response):
            logger.error("Failed to upload documents to search index")
            raise RuntimeError(f"Upload failed for some documents: {response}")
else:
    logger.warning("No documents to upload.")
8894

8995
def __generate_image_caption(self, source_url):
9096
model = self.env_helper.AZURE_OPENAI_VISION_MODEL

code/backend/batch/utilities/helpers/env_helper.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,9 @@ def __load_config(self, **kwargs) -> None:
7878
self.AZURE_SEARCH_CONVERSATIONS_LOG_INDEX = os.getenv(
7979
"AZURE_SEARCH_CONVERSATIONS_LOG_INDEX", "conversations"
8080
)
81+
# os.getenv returns a str whenever the variable is set, so convert explicitly
# to keep this setting an int regardless of where the value comes from.
self.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE = int(
    os.getenv("AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE", "100")
)
8184
# Integrated Vectorization
8285
self.AZURE_SEARCH_DATASOURCE_NAME = os.getenv(
8386
"AZURE_SEARCH_DATASOURCE_NAME", ""

code/tests/utilities/helpers/test_push_embedder.py

Lines changed: 61 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
AZURE_SEARCH_SEMANTIC_SEARCH_CONFIG = "default"
2828
AZURE_SEARCH_CONVERSATIONS_LOG_INDEX = "mock-log-index"
2929
USE_ADVANCED_IMAGE_PROCESSING = False
30+
AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE = 100
3031

3132

3233
@pytest.fixture(autouse=True)
@@ -49,7 +50,9 @@ def llm_helper_mock():
4950

5051
@pytest.fixture(autouse=True)
5152
def env_helper_mock():
52-
with patch("backend.batch.utilities.helpers.embedders.push_embedder.EnvHelper") as mock:
53+
with patch(
54+
"backend.batch.utilities.helpers.embedders.push_embedder.EnvHelper"
55+
) as mock:
5356
env_helper = mock.return_value
5457
env_helper.AZURE_AUTH_TYPE = AZURE_AUTH_TYPE
5558
env_helper.AZURE_SEARCH_KEY = AZURE_SEARCH_KEY
@@ -58,7 +61,9 @@ def env_helper_mock():
5861
env_helper.AZURE_SEARCH_USE_SEMANTIC_SEARCH = AZURE_SEARCH_USE_SEMANTIC_SEARCH
5962
env_helper.AZURE_SEARCH_FIELDS_ID = AZURE_SEARCH_FIELDS_ID
6063
env_helper.AZURE_SEARCH_CONTENT_COLUMN = AZURE_SEARCH_CONTENT_COLUMN
61-
env_helper.AZURE_SEARCH_CONTENT_VECTOR_COLUMN = AZURE_SEARCH_CONTENT_VECTOR_COLUMN
64+
env_helper.AZURE_SEARCH_CONTENT_VECTOR_COLUMN = (
65+
AZURE_SEARCH_CONTENT_VECTOR_COLUMN
66+
)
6267
env_helper.AZURE_SEARCH_TITLE_COLUMN = AZURE_SEARCH_TITLE_COLUMN
6368
env_helper.AZURE_SEARCH_FIELDS_METADATA = AZURE_SEARCH_FIELDS_METADATA
6469
env_helper.AZURE_SEARCH_SOURCE_COLUMN = AZURE_SEARCH_SOURCE_COLUMN
@@ -73,6 +78,9 @@ def env_helper_mock():
7378

7479
env_helper.USE_ADVANCED_IMAGE_PROCESSING = USE_ADVANCED_IMAGE_PROCESSING
7580
env_helper.is_auth_type_keys.return_value = True
81+
env_helper.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE = (
82+
AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE
83+
)
7684
yield env_helper
7785

7886

@@ -291,7 +299,10 @@ def test_embed_file_advanced_image_processing_raises_exception_on_failure(
291299

292300

293301
def test_embed_file_use_advanced_image_processing_does_not_vectorize_image_if_unsupported(
294-
azure_computer_vision_mock, mock_config_helper, azure_search_helper_mock, env_helper_mock
302+
azure_computer_vision_mock,
303+
mock_config_helper,
304+
azure_search_helper_mock,
305+
env_helper_mock,
295306
):
296307
# given
297308
mock_config_helper.document_processors = [
@@ -331,7 +342,9 @@ def test_embed_file_loads_documents(document_loading_mock, env_helper_mock):
331342
)
332343

333344

334-
def test_embed_file_chunks_documents(document_loading_mock, document_chunking_mock, env_helper_mock):
345+
def test_embed_file_chunks_documents(
346+
document_loading_mock, document_chunking_mock, env_helper_mock
347+
):
335348
# given
336349
push_embedder = PushEmbedder(MagicMock(), env_helper_mock)
337350

@@ -347,7 +360,9 @@ def test_embed_file_chunks_documents(document_loading_mock, document_chunking_mo
347360
)
348361

349362

350-
def test_embed_file_chunks_documents_upper_case(document_loading_mock, document_chunking_mock, env_helper_mock):
363+
def test_embed_file_chunks_documents_upper_case(
364+
document_loading_mock, document_chunking_mock, env_helper_mock
365+
):
351366
# given
352367
push_embedder = PushEmbedder(MagicMock(), env_helper_mock)
353368

@@ -363,7 +378,9 @@ def test_embed_file_chunks_documents_upper_case(document_loading_mock, document_
363378
)
364379

365380

366-
def test_embed_file_generates_embeddings_for_documents(llm_helper_mock, env_helper_mock):
381+
def test_embed_file_generates_embeddings_for_documents(
382+
llm_helper_mock, env_helper_mock
383+
):
367384
# given
368385
push_embedder = PushEmbedder(MagicMock(), env_helper_mock)
369386

@@ -382,7 +399,8 @@ def test_embed_file_generates_embeddings_for_documents(llm_helper_mock, env_help
382399
def test_embed_file_stores_documents_in_search_index(
383400
document_chunking_mock,
384401
llm_helper_mock,
385-
azure_search_helper_mock: MagicMock, env_helper_mock
402+
azure_search_helper_mock: MagicMock,
403+
env_helper_mock,
386404
):
387405
# given
388406
push_embedder = PushEmbedder(MagicMock(), env_helper_mock)
@@ -404,10 +422,14 @@ def test_embed_file_stores_documents_in_search_index(
404422
AZURE_SEARCH_FIELDS_METADATA: json.dumps(
405423
{
406424
AZURE_SEARCH_FIELDS_ID: expected_chunked_documents[0].id,
407-
AZURE_SEARCH_SOURCE_COLUMN: expected_chunked_documents[0].source,
425+
AZURE_SEARCH_SOURCE_COLUMN: expected_chunked_documents[
426+
0
427+
].source,
408428
AZURE_SEARCH_TITLE_COLUMN: expected_chunked_documents[0].title,
409429
AZURE_SEARCH_CHUNK_COLUMN: expected_chunked_documents[0].chunk,
410-
AZURE_SEARCH_OFFSET_COLUMN: expected_chunked_documents[0].offset,
430+
AZURE_SEARCH_OFFSET_COLUMN: expected_chunked_documents[
431+
0
432+
].offset,
411433
"page_number": expected_chunked_documents[0].page_number,
412434
"chunk_id": expected_chunked_documents[0].chunk_id,
413435
}
@@ -424,10 +446,14 @@ def test_embed_file_stores_documents_in_search_index(
424446
AZURE_SEARCH_FIELDS_METADATA: json.dumps(
425447
{
426448
AZURE_SEARCH_FIELDS_ID: expected_chunked_documents[1].id,
427-
AZURE_SEARCH_SOURCE_COLUMN: expected_chunked_documents[1].source,
449+
AZURE_SEARCH_SOURCE_COLUMN: expected_chunked_documents[
450+
1
451+
].source,
428452
AZURE_SEARCH_TITLE_COLUMN: expected_chunked_documents[1].title,
429453
AZURE_SEARCH_CHUNK_COLUMN: expected_chunked_documents[1].chunk,
430-
AZURE_SEARCH_OFFSET_COLUMN: expected_chunked_documents[1].offset,
454+
AZURE_SEARCH_OFFSET_COLUMN: expected_chunked_documents[
455+
1
456+
].offset,
431457
"page_number": expected_chunked_documents[1].page_number,
432458
"chunk_id": expected_chunked_documents[1].chunk_id,
433459
}
@@ -441,6 +467,30 @@ def test_embed_file_stores_documents_in_search_index(
441467
)
442468

443469

470+
def test_embed_file_stores_documents_in_search_index_in_batches(
    document_chunking_mock,
    llm_helper_mock,
    azure_search_helper_mock: MagicMock,
    env_helper_mock,
):
    """With a batch size of 1, each chunk is uploaded in its own request."""
    # given
    env_helper_mock.AZURE_SEARCH_DOC_UPLOAD_BATCH_SIZE = 1
    push_embedder = PushEmbedder(MagicMock(), env_helper_mock)

    # when
    push_embedder.embed_file(
        "some-url",
        "some-file-name.pdf",
    )

    # then
    upload_documents = (
        azure_search_helper_mock.return_value.get_search_client.return_value.upload_documents
    )
    # The call count alone already implies the method was called.
    assert upload_documents.call_count == 2
    # Each call must carry exactly one document; the batch is passed as the
    # first positional argument to upload_documents.
    for upload_call in upload_documents.call_args_list:
        assert len(upload_call.args[0]) == 1
492+
493+
444494
def test_embed_file_raises_exception_on_failure(
445495
azure_search_helper_mock,
446496
):

0 commit comments

Comments
 (0)