Commit 5ed3e8d

Revert integrated vectorization changes, using a different strategy
1 parent 11c2d29 commit 5ed3e8d

6 files changed: 12 additions, 214 deletions


.github/copilot-instructions.md

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@
 
 # Adding new data
 
-New files should be added to the `data` folder, and then either run scripts/prepdocs.sh or script/prepdocs.ps1 to ingest the data.
+New files should be added to the `data` folder, and then either run scripts/prepdocs.sh or scripts/prepdocs.ps1 to ingest the data.
 
 # Adding a new azd environment variable
 
app/backend/prepdocs.py

Lines changed: 0 additions & 6 deletions
@@ -559,12 +559,6 @@ async def main(strategy: Strategy, setup_index: bool = True):
             search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
             use_acls=use_acls,
             category=args.category,
-            use_multimodal=use_multimodal,
-            image_embeddings=setup_image_embeddings_service(
-                azure_credential=azd_credential,
-                vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"),
-                use_multimodal=use_multimodal,
-            ),
         )
     else:
         file_processors = setup_file_processors(
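
For orientation, here is a rough sketch (not the repository's actual code) of how the integrated vectorization strategy is wired up once the multimodal arguments are dropped. The helper function, the placeholder values, and the AZURE_SEARCH_SERVICE_ASSIGNED_USERID variable name are illustrative assumptions; only keyword arguments visible in this commit are shown.

```python
import os

from prepdocslib.integratedvectorizerstrategy import IntegratedVectorizerStrategy
from prepdocslib.strategy import DocumentAction


def build_text_only_strategy(search_info, list_file_strategy, blob_manager, embeddings):
    """Hypothetical helper: wires up the text-only strategy that remains after this revert."""
    return IntegratedVectorizerStrategy(
        search_info=search_info,                  # assumption: SearchInfo built earlier in main()
        list_file_strategy=list_file_strategy,    # file listing strategy from prepdocs setup
        blob_manager=blob_manager,                # blob manager from prepdocs setup
        embeddings=embeddings,                    # AzureOpenAIEmbeddingService
        search_field_name_embedding="embedding",  # placeholder field name
        subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
        search_service_user_assigned_id=os.environ["AZURE_SEARCH_SERVICE_ASSIGNED_USERID"],  # assumed variable name
        document_action=DocumentAction.Add,
        search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"),
        use_acls=False,
        category=None,
    )
```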

app/backend/prepdocslib/goals.json

Lines changed: 0 additions & 16 deletions
This file was deleted.

app/backend/prepdocslib/integratedvectorizerstrategy.py

Lines changed: 10 additions & 179 deletions
@@ -5,15 +5,9 @@
     NativeBlobSoftDeleteDeletionDetectionPolicy,
 )
 from azure.search.documents.indexes.models import (
-    AIServicesAccountIdentity,
     AzureOpenAIEmbeddingSkill,
-    DocumentIntelligenceLayoutSkill,
-    DocumentIntelligenceLayoutSkillChunkingProperties,
-    IndexingParameters,
-    IndexingParametersConfiguration,
     IndexProjectionMode,
     InputFieldMappingEntry,
-    MergeSkill,
     OutputFieldMappingEntry,
     SearchIndexer,
     SearchIndexerDataContainer,
@@ -22,17 +16,12 @@
     SearchIndexerIndexProjection,
     SearchIndexerIndexProjectionSelector,
     SearchIndexerIndexProjectionsParameters,
-    SearchIndexerKnowledgeStore,
-    SearchIndexerKnowledgeStoreFileProjectionSelector,
-    SearchIndexerKnowledgeStoreProjection,
     SearchIndexerSkillset,
-    ShaperSkill,
     SplitSkill,
-    VisionVectorizeSkill,
 )
 
 from .blobmanager import BlobManager
-from .embeddings import AzureOpenAIEmbeddingService, ImageEmbeddings
+from .embeddings import AzureOpenAIEmbeddingService
 from .listfilestrategy import ListFileStrategy
 from .searchmanager import SearchManager
 from .strategy import DocumentAction, SearchInfo, Strategy
@@ -53,20 +42,20 @@ def __init__(
         embeddings: AzureOpenAIEmbeddingService,
         search_field_name_embedding: str,
         subscription_id: str,
+        search_service_user_assigned_id: str,
         document_action: DocumentAction = DocumentAction.Add,
         search_analyzer_name: Optional[str] = None,
         use_acls: bool = False,
         category: Optional[str] = None,
-        use_multimodal: bool = False,
-        image_embeddings: Optional[ImageEmbeddings] = None,
     ):
+
         self.list_file_strategy = list_file_strategy
         self.blob_manager = blob_manager
         self.document_action = document_action
         self.embeddings = embeddings
-        self.image_embeddings = image_embeddings
         self.search_field_name_embedding = search_field_name_embedding
         self.subscription_id = subscription_id
+        self.search_user_assigned_identity = search_service_user_assigned_id
         self.search_analyzer_name = search_analyzer_name
         self.use_acls = use_acls
         self.category = category
@@ -75,12 +64,12 @@ def __init__(
         self.skillset_name = f"{prefix}-skillset"
         self.indexer_name = f"{prefix}-indexer"
         self.data_source_name = f"{prefix}-blob"
-        self.use_multimodal = use_multimodal and image_embeddings is not None
 
-    async def create_skillset(self, index_name: str) -> SearchIndexerSkillset:
+    async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset:
         """
         Create a skillset for the indexer to chunk documents and generate embeddings
         """
+
         split_skill = SplitSkill(
             name="split-skill",
             description="Split skill to chunk documents",
@@ -139,152 +128,6 @@ async def create_skillset(self, index_name: str) -> SearchIndexerSkillset:
 
         return skillset
 
-    async def create_multimodal_skillset(self, index_name: str) -> SearchIndexerSkillset:
-        if self.image_embeddings is None:
-            raise ValueError("Image embeddings client must be provided for multimodal skillset creation.")
-        if self.blob_manager.image_container is None:
-            raise ValueError("Blob manager must have an image container set for multimodal skillset creation.")
-
-        document_layout_skill = DocumentIntelligenceLayoutSkill(
-            description="Layout skill to read documents",
-            context="/document",
-            output_mode="oneToMany",
-            output_format="text",
-            markdown_header_depth="",  # Necessary so that SDK doesnt send a header depth
-            extraction_options=["images", "locationMetadata"],
-            chunking_properties=DocumentIntelligenceLayoutSkillChunkingProperties(
-                unit="characters",
-                maximum_length=2000,
-                overlap_length=200,
-            ),
-            inputs=[InputFieldMappingEntry(name="file_data", source="/document/file_data")],
-            outputs=[
-                OutputFieldMappingEntry(name="text_sections", target_name="text_sections"),
-                OutputFieldMappingEntry(name="normalized_images", target_name="normalized_images"),
-            ],
-        )
-
-        split_skill = SplitSkill(
-            description="Split skill to chunk pages of documents",
-            text_split_mode="pages",
-            context="/document/text_sections/*",
-            maximum_page_length=2000,
-            page_overlap_length=500,
-            inputs=[
-                InputFieldMappingEntry(name="text", source="/document/text_sections/*/content"),
-            ],
-            outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
-        )
-
-        embedding_skill = AzureOpenAIEmbeddingSkill(
-            name="embedding-skill",
-            description="Skill to generate embeddings via Azure OpenAI",
-            context="/document/text_sections/*/pages/*",
-            resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
-            deployment_name=self.embeddings.open_ai_deployment,
-            model_name=self.embeddings.open_ai_model_name,
-            dimensions=self.embeddings.open_ai_dimensions,
-            inputs=[
-                InputFieldMappingEntry(name="text", source="/document/text_sections/*/pages/*"),
-            ],
-            outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
-        )
-
-        vision_embedding_skill = VisionVectorizeSkill(
-            name="vision-embedding-skill",
-            description="Skill to generate image embeddings via Azure AI Vision",
-            context="/document/normalized_images/*",
-            model_version="2023-04-15",
-            inputs=[InputFieldMappingEntry(name="image", source="/document/normalized_images/*")],
-            outputs=[OutputFieldMappingEntry(name="vector", target_name="image_vector")],
-        )
-
-        vision_embedding_shaper_skill = ShaperSkill(
-            name="vision-embedding-shaper-skill",
-            description="Shaper skill to ensure image embeddings are in the correct format",
-            context="/document/normalized_images/*",
-            inputs=[
-                InputFieldMappingEntry(name="embedding", source="/document/normalized_images/*/image_vector"),
-                InputFieldMappingEntry(
-                    name="url",
-                    source=f'="{self.blob_manager.endpoint}/images/"+$(/document/normalized_images/*/imagePath)',
-                ),
-            ],
-            outputs=[OutputFieldMappingEntry(name="output", target_name="images")],
-        )
-
-        merge_skill = MergeSkill(
-            name="merge-skill",
-            description="Merge skill to create source page",
-            insert_post_tag="",
-            insert_pre_tag="",
-            context="/document/text_sections/*/locationMetadata",
-            inputs=[
-                InputFieldMappingEntry(
-                    name="itemsToInsert",
-                    source='=[$(/document/metadata_storage_name), "#page=", $(/document/text_sections/*/locationMetadata/pageNumber)]',
-                )
-            ],
-            outputs=[OutputFieldMappingEntry(name="mergedText", target_name="citation")],
-        )
-
-        indexer_skills = [
-            document_layout_skill,
-            split_skill,
-            embedding_skill,
-            vision_embedding_skill,
-            vision_embedding_shaper_skill,
-            merge_skill,
-        ]
-
-        index_projection = SearchIndexerIndexProjection(
-            selectors=[
-                SearchIndexerIndexProjectionSelector(
-                    target_index_name=index_name,
-                    parent_key_field_name="parent_id",
-                    source_context="/document/text_sections/*/pages/*",
-                    mappings=[
-                        InputFieldMappingEntry(name="content", source="/document/text_sections/*/pages/*"),
-                        InputFieldMappingEntry(name="sourcefile", source="/document/metadata_storage_name"),
-                        InputFieldMappingEntry(
-                            name="sourcepage", source="/document/text_sections/*/locationMetadata/citation"
-                        ),
-                        InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"),
-                        InputFieldMappingEntry(
-                            name=self.search_field_name_embedding, source="/document/text_sections/*/pages/*/vector"
-                        ),
-                        InputFieldMappingEntry(name="images", source="/document/normalized_images/*/images"),
-                    ],
-                )
-            ],
-            parameters=SearchIndexerIndexProjectionsParameters(
-                projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
-            ),
-        )
-
-        skillset = SearchIndexerSkillset(
-            name=self.skillset_name,
-            description="Skillset to chunk documents and generate embeddings",
-            skills=indexer_skills,
-            index_projection=index_projection,
-            cognitive_services_account=AIServicesAccountIdentity(subdomain_url=self.image_embeddings.endpoint),
-            knowledge_store=SearchIndexerKnowledgeStore(
-                storage_connection_string=self.blob_manager.get_managedidentity_connectionstring(),
-                projections=[
-                    SearchIndexerKnowledgeStoreProjection(
-                        files=[
-                            SearchIndexerKnowledgeStoreFileProjectionSelector(
-                                storage_container=self.blob_manager.image_container,
-                                source="/document/normalized_images/*",
-                            )
-                        ]
-                    )
-                ],
-            ),
-        )
-
-        return skillset
-
     async def setup(self):
         logger.info("Setting up search index using integrated vectorization...")
         search_manager = SearchManager(
@@ -294,7 +137,7 @@ async def setup(self):
             use_int_vectorization=True,
             embeddings=self.embeddings,
             field_name_embedding=self.search_field_name_embedding,
-            search_images=self.use_multimodal,
+            search_images=False,
         )
 
         await search_manager.create_index()
@@ -311,11 +154,8 @@ async def setup(self):
 
         await ds_client.create_or_update_data_source_connection(data_source_connection)
 
-        if self.use_multimodal:
-            skillset = await self.create_multimodal_skillset(self.search_info.index_name)
-        else:
-            skillset = await self.create_skillset(self.search_info.index_name)
-        await ds_client.create_or_update_skillset(skillset)
+        embedding_skillset = await self.create_embedding_skill(self.search_info.index_name)
+        await ds_client.create_or_update_skillset(embedding_skillset)
         await ds_client.close()
 
     async def run(self):
@@ -334,22 +174,13 @@ async def run(self):
         elif self.document_action == DocumentAction.RemoveAll:
             await self.blob_manager.remove_blob()
 
-        indexing_parameters = None
-        if self.use_multimodal:
-            indexing_parameters = IndexingParameters(
-                configuration=IndexingParametersConfiguration(
-                    query_timeout=None, allow_skillset_to_read_file_data=True  # type: ignore
-                ),
-                max_failed_items=-1,
-            )
-
+        # Create an indexer
         indexer = SearchIndexer(
             name=self.indexer_name,
             description="Indexer to index documents and generate embeddings",
             skillset_name=self.skillset_name,
             target_index_name=self.search_info.index_name,
            data_source_name=self.data_source_name,
-            parameters=indexing_parameters,  # Properly pass the parameters
         )
 
         indexer_client = self.search_info.create_search_indexer_client()
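
For reference, the surviving create_embedding_skill path boils down to a split skill plus an Azure OpenAI embedding skill whose output is projected into the index. The sketch below is an approximation assembled from the context lines above, not the repository's exact code; the index name, skillset name, chunk sizes, deployment name, and field mappings are placeholder assumptions.

```python
from azure.search.documents.indexes.models import (
    AzureOpenAIEmbeddingSkill,
    IndexProjectionMode,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    SearchIndexerIndexProjection,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    SearchIndexerSkillset,
    SplitSkill,
)

# Sketch only: chunk each document, embed each chunk, and project the chunks
# into the target index as child documents keyed by parent_id.
split_skill = SplitSkill(
    name="split-skill",
    description="Split skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    maximum_page_length=2048,  # assumed chunk size
    page_overlap_length=20,    # assumed overlap
    inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)

embedding_skill = AzureOpenAIEmbeddingSkill(
    name="embedding-skill",
    description="Skill to generate embeddings via Azure OpenAI",
    context="/document/pages/*",
    resource_url="https://<openai-service>.openai.azure.com",  # taken from self.embeddings in the repo
    deployment_name="text-embedding-3-large",                  # assumed deployment
    model_name="text-embedding-3-large",
    dimensions=3072,
    inputs=[InputFieldMappingEntry(name="text", source="/document/pages/*")],
    outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
)

index_projection = SearchIndexerIndexProjection(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name="gptkbindex",  # placeholder index name
            parent_key_field_name="parent_id",
            source_context="/document/pages/*",
            mappings=[
                InputFieldMappingEntry(name="content", source="/document/pages/*"),
                InputFieldMappingEntry(name="embedding", source="/document/pages/*/vector"),
                InputFieldMappingEntry(name="sourcefile", source="/document/metadata_storage_name"),
            ],
        )
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

skillset = SearchIndexerSkillset(
    name="gptkbindex-skillset",  # placeholder skillset name
    description="Skillset to chunk documents and generate embeddings",
    skills=[split_skill, embedding_skill],
    index_projection=index_projection,
)
```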

docs/multimodal.md

Lines changed: 1 addition & 1 deletion
@@ -98,8 +98,8 @@ With this feature enabled, the following changes are made:
 
 ## Compatibility
 
+* This feature is **not** compatible with [integrated vectorization](./deploy_features.md#enabling-integrated-vectorization), as the currently configured built-in skills do not process images or store image embeddings. Azure AI Search does now offer built-in skills for multimodal support, as demonstrated in [azure-ai-search-multimodal-sample](https://github.com/Azure-Samples/azure-ai-search-multimodal-sample), but we were not able to customize them enough to meet the requirements of this feature. Instead, we are working on making a custom skill based off the data ingestion code in this repository, and hosting that skill on Azure Functions. Stay tuned to the releases to find out when that's available.
 * This feature is **not** fully compatible with the [agentic retrieval](./agentic_retrieval.md) feature.
   The agent *will* perform the multimodal vector embedding search, but it will not return images in the response,
   so we cannot send the images to the chat completion model.
 * This feature *is* compatible with the [reasoning models](./reasoning.md) feature, as long as you use a model that [supports image inputs](https://learn.microsoft.com/azure/ai-services/openai/how-to/reasoning?tabs=python-secure%2Cpy#api--feature-support).
-* This feature is *mostly* compatible with [integrated vectorization](./deploy_features.md#enabling-integrated-vectorization). The extraction process will not be exactly the same, so the chunks will not be identical, and the extracted images will not contain citations.

todo.txt

Lines changed: 0 additions & 11 deletions
This file was deleted.
