Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 12 additions & 20 deletions app/backend/prepdocslib/integratedvectorizerstrategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,15 @@
)
from azure.search.documents.indexes.models import (
AzureOpenAIEmbeddingSkill,
AzureOpenAIParameters,
AzureOpenAIVectorizer,
FieldMapping,
IndexProjectionMode,
InputFieldMappingEntry,
OutputFieldMappingEntry,
SearchIndexer,
SearchIndexerDataContainer,
SearchIndexerDataSourceConnection,
SearchIndexerIndexProjections,
SearchIndexerDataSourceType,
SearchIndexerIndexProjection,
SearchIndexerIndexProjectionSelector,
SearchIndexerIndexProjectionsParameters,
SearchIndexerSkillset,
Expand Down Expand Up @@ -67,6 +66,7 @@ async def create_embedding_skill(self, index_name: str):
skillset_name = f"{index_name}-skillset"

split_skill = SplitSkill(
name=f"{index_name}-split-skill",
description="Split skill to chunk documents",
text_split_mode="pages",
context="/document",
Expand All @@ -82,17 +82,20 @@ async def create_embedding_skill(self, index_name: str):
raise ValueError("Expecting Azure Open AI instance")

embedding_skill = AzureOpenAIEmbeddingSkill(
name=f"{index_name}-embedding-skill",
description="Skill to generate embeddings via Azure OpenAI",
context="/document/pages/*",
resource_uri=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_id=self.embeddings.open_ai_deployment,
resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_name=self.embeddings.open_ai_deployment,
model_name=self.embeddings.open_ai_model_name,
dimensions=self.embeddings.open_ai_dimensions,
inputs=[
InputFieldMappingEntry(name="text", source="/document/pages/*"),
],
outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
)

index_projections = SearchIndexerIndexProjections(
index_projection = SearchIndexerIndexProjection(
selectors=[
SearchIndexerIndexProjectionSelector(
target_index_name=index_name,
Expand All @@ -114,7 +117,7 @@ async def create_embedding_skill(self, index_name: str):
name=skillset_name,
description="Skillset to chunk documents and generate embeddings",
skills=[split_skill, embedding_skill],
index_projections=index_projections,
index_projection=index_projection,
)

return skillset
Expand All @@ -132,25 +135,14 @@ async def setup(self):
if self.embeddings is None:
raise ValueError("Expecting Azure Open AI instance")

await search_manager.create_index(
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now we always make a vectorizer, even when not using integrated vectorization.

vectorizers=[
AzureOpenAIVectorizer(
name=f"{self.search_info.index_name}-vectorizer",
kind="azureOpenAI",
azure_open_ai_parameters=AzureOpenAIParameters(
resource_uri=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_id=self.embeddings.open_ai_deployment,
),
),
]
)
await search_manager.create_index()

# create indexer client
ds_client = self.search_info.create_search_indexer_client()
ds_container = SearchIndexerDataContainer(name=self.blob_manager.container)
data_source_connection = SearchIndexerDataSourceConnection(
name=f"{self.search_info.index_name}-blob",
type="azureblob",
type=SearchIndexerDataSourceType.AZURE_BLOB,
connection_string=self.blob_manager.get_managedidentity_connectionstring(),
container=ds_container,
data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy(),
Expand Down
15 changes: 13 additions & 2 deletions app/backend/prepdocslib/searchmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from typing import List, Optional

from azure.search.documents.indexes.models import (
AzureOpenAIVectorizer,
AzureOpenAIVectorizerParameters,
HnswAlgorithmConfiguration,
HnswParameters,
SearchableField,
Expand Down Expand Up @@ -174,12 +176,21 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]]
VectorSearchProfile(
name="embedding_config",
algorithm_configuration_name="hnsw_config",
vectorizer=(
vectorizer_name=(
f"{self.search_info.index_name}-vectorizer" if self.use_int_vectorization else None
),
),
],
vectorizers=vectorizers,
vectorizers=[
AzureOpenAIVectorizer(
vectorizer_name=f"{self.search_info.index_name}-vectorizer",
parameters=AzureOpenAIVectorizerParameters(
resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_name=self.embeddings.open_ai_deployment,
model_name=self.embeddings.open_ai_model_name,
),
),
],
),
)
if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]:
Expand Down
2 changes: 1 addition & 1 deletion app/backend/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ tiktoken
tenacity
azure-ai-documentintelligence
azure-cognitiveservices-speech
azure-search-documents==11.6.0b1
azure-search-documents==11.6.0b6
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't get any errors about code changes on the .search() side so I'm assuming that's largely unchanged in the latest versions?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

correct

azure-storage-blob
azure-storage-file-datalake
uvicorn
Expand Down
2 changes: 1 addition & 1 deletion app/backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ azure-monitor-opentelemetry==1.6.1
# via -r requirements.in
azure-monitor-opentelemetry-exporter==1.0.0b28
# via azure-monitor-opentelemetry
azure-search-documents==11.6.0b1
azure-search-documents==11.6.0b6
# via -r requirements.in
azure-storage-blob==12.22.0
# via
Expand Down
4 changes: 3 additions & 1 deletion docs/data_ingestion.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,13 @@ You can also remove individual documents by using the `--remove` flag. Open eith

## Overview of Integrated Vectorization

Azure AI search recently introduced an [integrated vectorization feature in preview mode](https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in/ba-p/3960809#:~:text=Integrated%20vectorization%20is%20a%20new%20feature%20of%20Azure,pull-indexers%2C%20and%20vectorization%20of%20text%20queries%20through%20vectorizers). This feature is a cloud-based approach to data ingestion, which takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies.
Azure AI Search includes an [integrated vectorization feature](https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in/ba-p/3960809#:~:text=Integrated%20vectorization%20is%20a%20new%20feature%20of%20Azure,pull-indexers%2C%20and%20vectorization%20of%20text%20queries%20through%20vectorizers), a cloud-based approach to data ingestion. Integrated vectorization takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies.

See [this notebook](https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/integrated-vectorization/azure-search-integrated-vectorization-sample.ipynb) to understand the process of setting up integrated vectorization.
We have integrated that code into our `prepdocs` script, so you can use it without needing to understand the details.

You must first explicitly [enable integrated vectorization](./deploy_features.md#enabling-integrated-vectorization) in the `azd` environment to use this feature.

This feature cannot be used on an existing index. You need to create a new index, or drop and recreate an existing one.
In the newly created index schema, a new field 'parent_id' is added. This field is used internally by the indexer to manage the life cycle of chunks.

Expand Down
22 changes: 18 additions & 4 deletions docs/deploy_features.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,12 +192,26 @@ Azure AI search recently introduced an [integrated vectorization feature in prev

To enable integrated vectorization with this sample:

1. If you've previously deployed, delete the existing search index.
2. Run `azd env set USE_FEATURE_INT_VECTORIZATION true`
3. Run `azd up` to update system and user roles
1. If you've previously deployed, delete the existing search index. 🗑️
2. To enable the use of integrated vectorization, run:

```shell
azd env set USE_FEATURE_INT_VECTORIZATION true
```

3. If you've already deployed your app, then you can run just the `provision` step:

```shell
azd provision
```

That will set up necessary RBAC roles and configure the integrated vectorization feature on your search service.

If you haven't deployed your app yet, then you should run the full `azd up` after configuring all optional features.

4. You can view resources such as the indexer and skillset in the Azure Portal and monitor the status of the vectorization process.

This feature is not currently compatible with GPT4-vision or the newer text-embedding-3 models.
⚠️ This feature is not currently compatible with the [GPT vision integration](./gpt4v.md).

## Enabling authentication

Expand Down
Loading