diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 420b4af39f..aa864eda00 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -375,6 +375,10 @@ async def main(strategy: Strategy, setup_index: bool = True): ingestion_strategy: Strategy if use_int_vectorization: + + if not openai_embeddings_service or not isinstance(openai_embeddings_service, AzureOpenAIEmbeddingService): + raise Exception("Integrated vectorization strategy requires an Azure OpenAI embeddings service") + ingestion_strategy = IntegratedVectorizerStrategy( search_info=search_info, list_file_strategy=list_file_strategy, diff --git a/app/backend/prepdocslib/integratedvectorizerstrategy.py b/app/backend/prepdocslib/integratedvectorizerstrategy.py index 58b84a1689..66e8e4a346 100644 --- a/app/backend/prepdocslib/integratedvectorizerstrategy.py +++ b/app/backend/prepdocslib/integratedvectorizerstrategy.py @@ -6,8 +6,6 @@ ) from azure.search.documents.indexes.models import ( AzureOpenAIEmbeddingSkill, - AzureOpenAIParameters, - AzureOpenAIVectorizer, FieldMapping, IndexProjectionMode, InputFieldMappingEntry, @@ -15,7 +13,8 @@ SearchIndexer, SearchIndexerDataContainer, SearchIndexerDataSourceConnection, - SearchIndexerIndexProjections, + SearchIndexerDataSourceType, + SearchIndexerIndexProjection, SearchIndexerIndexProjectionSelector, SearchIndexerIndexProjectionsParameters, SearchIndexerSkillset, @@ -41,7 +40,7 @@ def __init__( list_file_strategy: ListFileStrategy, blob_manager: BlobManager, search_info: SearchInfo, - embeddings: Optional[AzureOpenAIEmbeddingService], + embeddings: AzureOpenAIEmbeddingService, subscription_id: str, search_service_user_assigned_id: str, document_action: DocumentAction = DocumentAction.Add, @@ -49,8 +48,6 @@ def __init__( use_acls: bool = False, category: Optional[str] = None, ): - if not embeddings or not isinstance(embeddings, AzureOpenAIEmbeddingService): - raise Exception("Expecting AzureOpenAI embedding service") self.list_file_strategy 
= list_file_strategy self.blob_manager = blob_manager @@ -67,6 +64,7 @@ async def create_embedding_skill(self, index_name: str): skillset_name = f"{index_name}-skillset" split_skill = SplitSkill( + name=f"{index_name}-split-skill", description="Split skill to chunk documents", text_split_mode="pages", context="/document", @@ -78,21 +76,21 @@ async def create_embedding_skill(self, index_name: str): outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")], ) - if self.embeddings is None: - raise ValueError("Expecting Azure Open AI instance") - embedding_skill = AzureOpenAIEmbeddingSkill( + name=f"{index_name}-embedding-skill", description="Skill to generate embeddings via Azure OpenAI", context="/document/pages/*", - resource_uri=f"https://{self.embeddings.open_ai_service}.openai.azure.com", - deployment_id=self.embeddings.open_ai_deployment, + resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com", + deployment_name=self.embeddings.open_ai_deployment, + model_name=self.embeddings.open_ai_model_name, + dimensions=self.embeddings.open_ai_dimensions, inputs=[ InputFieldMappingEntry(name="text", source="/document/pages/*"), ], outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")], ) - index_projections = SearchIndexerIndexProjections( + index_projection = SearchIndexerIndexProjection( selectors=[ SearchIndexerIndexProjectionSelector( target_index_name=index_name, @@ -114,12 +112,13 @@ async def create_embedding_skill(self, index_name: str): name=skillset_name, description="Skillset to chunk documents and generate embeddings", skills=[split_skill, embedding_skill], - index_projections=index_projections, + index_projection=index_projection, ) return skillset async def setup(self): + logger.info("Setting up search index using integrated vectorization...") search_manager = SearchManager( search_info=self.search_info, search_analyzer_name=self.search_analyzer_name, @@ -129,35 +128,19 @@ async def setup(self): 
search_images=False, ) - if self.embeddings is None: - raise ValueError("Expecting Azure Open AI instance") - - await search_manager.create_index( - vectorizers=[ - AzureOpenAIVectorizer( - name=f"{self.search_info.index_name}-vectorizer", - kind="azureOpenAI", - azure_open_ai_parameters=AzureOpenAIParameters( - resource_uri=f"https://{self.embeddings.open_ai_service}.openai.azure.com", - deployment_id=self.embeddings.open_ai_deployment, - ), - ), - ] - ) + await search_manager.create_index() - # create indexer client ds_client = self.search_info.create_search_indexer_client() ds_container = SearchIndexerDataContainer(name=self.blob_manager.container) data_source_connection = SearchIndexerDataSourceConnection( name=f"{self.search_info.index_name}-blob", - type="azureblob", + type=SearchIndexerDataSourceType.AZURE_BLOB, connection_string=self.blob_manager.get_managedidentity_connectionstring(), container=ds_container, data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy(), ) await ds_client.create_or_update_data_source_connection(data_source_connection) - logger.info("Search indexer data source connection updated.") embedding_skillset = await self.create_embedding_skill(self.search_info.index_name) await ds_client.create_or_update_skillset(embedding_skillset) diff --git a/app/backend/prepdocslib/searchmanager.py b/app/backend/prepdocslib/searchmanager.py index 8757926000..f75af03514 100644 --- a/app/backend/prepdocslib/searchmanager.py +++ b/app/backend/prepdocslib/searchmanager.py @@ -4,6 +4,8 @@ from typing import List, Optional from azure.search.documents.indexes.models import ( + AzureOpenAIVectorizer, + AzureOpenAIVectorizerParameters, HnswAlgorithmConfiguration, HnswParameters, SearchableField, @@ -21,7 +23,7 @@ ) from .blobmanager import BlobManager -from .embeddings import OpenAIEmbeddings +from .embeddings import AzureOpenAIEmbeddingService, OpenAIEmbeddings from .listfilestrategy import File from .strategy import SearchInfo from 
.textsplitter import SplitPage @@ -65,132 +67,159 @@ def __init__( self.search_images = search_images async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]] = None): - logger.info("Ensuring search index %s exists", self.search_info.index_name) + logger.info("Checking whether search index %s exists...", self.search_info.index_name) async with self.search_info.create_search_index_client() as search_index_client: - fields = [ - ( - SimpleField(name="id", type="Edm.String", key=True) - if not self.use_int_vectorization - else SearchField( - name="id", + + if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]: + logger.info("Creating new search index %s", self.search_info.index_name) + fields = [ + ( + SimpleField(name="id", type="Edm.String", key=True) + if not self.use_int_vectorization + else SearchField( + name="id", + type="Edm.String", + key=True, + sortable=True, + filterable=True, + facetable=True, + analyzer_name="keyword", + ) + ), + SearchableField( + name="content", type="Edm.String", - key=True, - sortable=True, - filterable=True, - facetable=True, - analyzer_name="keyword", - ) - ), - SearchableField( - name="content", - type="Edm.String", - analyzer_name=self.search_analyzer_name, - ), - SearchField( - name="embedding", - type=SearchFieldDataType.Collection(SearchFieldDataType.Single), - hidden=False, - searchable=True, - filterable=False, - sortable=False, - facetable=False, - vector_search_dimensions=self.embedding_dimensions, - vector_search_profile_name="embedding_config", - ), - SimpleField(name="category", type="Edm.String", filterable=True, facetable=True), - SimpleField( - name="sourcepage", - type="Edm.String", - filterable=True, - facetable=True, - ), - SimpleField( - name="sourcefile", - type="Edm.String", - filterable=True, - facetable=True, - ), - SimpleField( - name="storageUrl", - type="Edm.String", - filterable=True, - facetable=False, - ), - ] - if self.use_acls: - 
fields.append( - SimpleField( - name="oids", - type=SearchFieldDataType.Collection(SearchFieldDataType.String), - filterable=True, - ) - ) - fields.append( - SimpleField( - name="groups", - type=SearchFieldDataType.Collection(SearchFieldDataType.String), - filterable=True, - ) - ) - if self.use_int_vectorization: - fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True)) - if self.search_images: - fields.append( + analyzer_name=self.search_analyzer_name, + ), SearchField( - name="imageEmbedding", + name="embedding", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), hidden=False, searchable=True, filterable=False, sortable=False, facetable=False, - vector_search_dimensions=1024, + vector_search_dimensions=self.embedding_dimensions, vector_search_profile_name="embedding_config", ), - ) + SimpleField(name="category", type="Edm.String", filterable=True, facetable=True), + SimpleField( + name="sourcepage", + type="Edm.String", + filterable=True, + facetable=True, + ), + SimpleField( + name="sourcefile", + type="Edm.String", + filterable=True, + facetable=True, + ), + SimpleField( + name="storageUrl", + type="Edm.String", + filterable=True, + facetable=False, + ), + ] + if self.use_acls: + fields.append( + SimpleField( + name="oids", + type=SearchFieldDataType.Collection(SearchFieldDataType.String), + filterable=True, + ) + ) + fields.append( + SimpleField( + name="groups", + type=SearchFieldDataType.Collection(SearchFieldDataType.String), + filterable=True, + ) + ) + if self.use_int_vectorization: + logger.info("Including parent_id field in new index %s", self.search_info.index_name) + fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True)) + if self.search_images: + logger.info("Including imageEmbedding field in new index %s", self.search_info.index_name) + fields.append( + SearchField( + name="imageEmbedding", + type=SearchFieldDataType.Collection(SearchFieldDataType.Single), + hidden=False, + 
searchable=True, + filterable=False, + sortable=False, + facetable=False, + vector_search_dimensions=1024, + vector_search_profile_name="embedding_config", + ), + ) - index = SearchIndex( - name=self.search_info.index_name, - fields=fields, - semantic_search=SemanticSearch( - configurations=[ - SemanticConfiguration( - name="default", - prioritized_fields=SemanticPrioritizedFields( - title_field=None, content_fields=[SemanticField(field_name="content")] + vectorizers = [] + if self.embeddings and isinstance(self.embeddings, AzureOpenAIEmbeddingService): + logger.info( + "Including vectorizer for search index %s, using Azure OpenAI service %s", + self.search_info.index_name, + self.embeddings.open_ai_service, + ) + vectorizers.append( + AzureOpenAIVectorizer( + vectorizer_name=f"{self.search_info.index_name}-vectorizer", + parameters=AzureOpenAIVectorizerParameters( + resource_url=self.embeddings.open_ai_endpoint, + deployment_name=self.embeddings.open_ai_deployment, + model_name=self.embeddings.open_ai_model_name, ), ) - ] - ), - vector_search=VectorSearch( - algorithms=[ - HnswAlgorithmConfiguration( - name="hnsw_config", - parameters=HnswParameters(metric="cosine"), - ) - ], - profiles=[ - VectorSearchProfile( - name="embedding_config", - algorithm_configuration_name="hnsw_config", - vectorizer=( - f"{self.search_info.index_name}-vectorizer" if self.use_int_vectorization else None + ) + else: + logger.info( + "Not including vectorizer for search index %s, no Azure OpenAI service found", + self.search_info.index_name, + ) + + index = SearchIndex( + name=self.search_info.index_name, + fields=fields, + semantic_search=SemanticSearch( + configurations=[ + SemanticConfiguration( + name="default", + prioritized_fields=SemanticPrioritizedFields( + title_field=None, content_fields=[SemanticField(field_name="content")] + ), + ) + ] + ), + vector_search=VectorSearch( + algorithms=[ + HnswAlgorithmConfiguration( + name="hnsw_config", + 
parameters=HnswParameters(metric="cosine"), + ) + ], + profiles=[ + VectorSearchProfile( + name="embedding_config", + algorithm_configuration_name="hnsw_config", + vectorizer_name=( + f"{self.search_info.index_name}-vectorizer" if self.use_int_vectorization else None + ), ), - ), - ], - vectorizers=vectorizers, - ), - ) - if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]: - logger.info("Creating %s search index", self.search_info.index_name) + ], + vectorizers=vectorizers, + ), + ) + await search_index_client.create_index(index) else: logger.info("Search index %s already exists", self.search_info.index_name) - index_definition = await search_index_client.get_index(self.search_info.index_name) - if not any(field.name == "storageUrl" for field in index_definition.fields): + existing_index = await search_index_client.get_index(self.search_info.index_name) + if not any(field.name == "storageUrl" for field in existing_index.fields): logger.info("Adding storageUrl field to index %s", self.search_info.index_name) - index_definition.fields.append( + existing_index.fields.append( SimpleField( name="storageUrl", type="Edm.String", @@ -198,7 +227,30 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]] facetable=False, ), ) - await search_index_client.create_or_update_index(index_definition) + await search_index_client.create_or_update_index(existing_index) + + if existing_index.vector_search is not None and ( + existing_index.vector_search.vectorizers is None + or len(existing_index.vector_search.vectorizers) == 0 + ): + if self.embeddings is not None and isinstance(self.embeddings, AzureOpenAIEmbeddingService): + logger.info("Adding vectorizer to search index %s", self.search_info.index_name) + existing_index.vector_search.vectorizers = [ + AzureOpenAIVectorizer( + vectorizer_name=f"{self.search_info.index_name}-vectorizer", + parameters=AzureOpenAIVectorizerParameters( + 
resource_url=self.embeddings.open_ai_endpoint, + deployment_name=self.embeddings.open_ai_deployment, + model_name=self.embeddings.open_ai_model_name, + ), + ) + ] + await search_index_client.create_or_update_index(existing_index) + else: + logger.info( + "Can't add vectorizer to search index %s since no Azure OpenAI embeddings service is defined", + self.search_info.index_name, + ) async def update_content( self, sections: List[Section], image_embeddings: Optional[List[List[float]]] = None, url: Optional[str] = None diff --git a/app/backend/requirements.in b/app/backend/requirements.in index be5dd02754..99cb44e678 100644 --- a/app/backend/requirements.in +++ b/app/backend/requirements.in @@ -7,7 +7,7 @@ tiktoken tenacity azure-ai-documentintelligence azure-cognitiveservices-speech -azure-search-documents==11.6.0b1 +azure-search-documents==11.6.0b6 azure-storage-blob azure-storage-file-datalake uvicorn diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt index 0263ea188a..51df00e14b 100644 --- a/app/backend/requirements.txt +++ b/app/backend/requirements.txt @@ -52,7 +52,7 @@ azure-monitor-opentelemetry==1.6.1 # via -r requirements.in azure-monitor-opentelemetry-exporter==1.0.0b28 # via azure-monitor-opentelemetry -azure-search-documents==11.6.0b1 +azure-search-documents==11.6.0b6 # via -r requirements.in azure-storage-blob==12.22.0 # via diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md index b293ab0abb..60852fac55 100644 --- a/docs/data_ingestion.md +++ b/docs/data_ingestion.md @@ -65,11 +65,13 @@ You can also remove individual documents by using the `--remove` flag. 
Open eith ## Overview of Integrated Vectorization -Azure AI search recently introduced an [integrated vectorization feature in preview mode](https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in/ba-p/3960809#:~:text=Integrated%20vectorization%20is%20a%20new%20feature%20of%20Azure,pull-indexers%2C%20and%20vectorization%20of%20text%20queries%20through%20vectorizers). This feature is a cloud-based approach to data ingestion, which takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies. +Azure AI Search includes an [integrated vectorization feature](https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in/ba-p/3960809#:~:text=Integrated%20vectorization%20is%20a%20new%20feature%20of%20Azure,pull-indexers%2C%20and%20vectorization%20of%20text%20queries%20through%20vectorizers), a cloud-based approach to data ingestion. Integrated vectorization takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies. See [this notebook](https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/integrated-vectorization/azure-search-integrated-vectorization-sample.ipynb) to understand the process of setting up integrated vectorization. We have integrated that code into our `prepdocs` script, so you can use it without needing to understand the details. +You must first explicitly [enable integrated vectorization](./deploy_features.md#enabling-integrated-vectorization) in the `azd` environment to use this feature. + This feature cannot be used on existing index. You need to create a new index or drop and recreate an existing index. In the newly created index schema, a new field 'parent_id' is added. This is used internally by the indexer to manage life cycle of chunks. 
diff --git a/docs/deploy_features.md b/docs/deploy_features.md index 3fe3d62605..d05ccda903 100644 --- a/docs/deploy_features.md +++ b/docs/deploy_features.md @@ -192,12 +192,26 @@ Azure AI search recently introduced an [integrated vectorization feature in prev To enable integrated vectorization with this sample: -1. If you've previously deployed, delete the existing search index. -2. Run `azd env set USE_FEATURE_INT_VECTORIZATION true` -3. Run `azd up` to update system and user roles +1. If you've previously deployed, delete the existing search index. 🗑️ +2. To enable the use of integrated vectorization, run: + + ```shell + azd env set USE_FEATURE_INT_VECTORIZATION true + ``` + +3. If you've already deployed your app, then you can run just the `provision` step: + + ```shell + azd provision + ``` + + That will set up necessary RBAC roles and configure the integrated vectorization feature on your search service. + + If you haven't deployed your app yet, then you should run the full `azd up` after configuring all optional features. + 4. You can view the resources such as the indexer and skillset in Azure Portal and monitor the status of the vectorization process. -This feature is not currently compatible with GPT4-vision or the newer text-embedding-3 models. +⚠️ This feature is not currently compatible with the [GPT vision integration](./gpt4v.md). ## Enabling authentication