Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 12 additions & 20 deletions app/backend/prepdocslib/integratedvectorizerstrategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,15 @@
)
from azure.search.documents.indexes.models import (
AzureOpenAIEmbeddingSkill,
AzureOpenAIParameters,
AzureOpenAIVectorizer,
FieldMapping,
IndexProjectionMode,
InputFieldMappingEntry,
OutputFieldMappingEntry,
SearchIndexer,
SearchIndexerDataContainer,
SearchIndexerDataSourceConnection,
SearchIndexerIndexProjections,
SearchIndexerDataSourceType,
SearchIndexerIndexProjection,
SearchIndexerIndexProjectionSelector,
SearchIndexerIndexProjectionsParameters,
SearchIndexerSkillset,
Expand Down Expand Up @@ -67,6 +66,7 @@ async def create_embedding_skill(self, index_name: str):
skillset_name = f"{index_name}-skillset"

split_skill = SplitSkill(
name=f"{index_name}-split-skill",
description="Split skill to chunk documents",
text_split_mode="pages",
context="/document",
Expand All @@ -82,17 +82,20 @@ async def create_embedding_skill(self, index_name: str):
raise ValueError("Expecting Azure Open AI instance")

embedding_skill = AzureOpenAIEmbeddingSkill(
name=f"{index_name}-embedding-skill",
description="Skill to generate embeddings via Azure OpenAI",
context="/document/pages/*",
resource_uri=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_id=self.embeddings.open_ai_deployment,
resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_name=self.embeddings.open_ai_deployment,
model_name=self.embeddings.open_ai_model_name,
dimensions=self.embeddings.open_ai_dimensions,
inputs=[
InputFieldMappingEntry(name="text", source="/document/pages/*"),
],
outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
)

index_projections = SearchIndexerIndexProjections(
index_projection = SearchIndexerIndexProjection(
selectors=[
SearchIndexerIndexProjectionSelector(
target_index_name=index_name,
Expand All @@ -114,7 +117,7 @@ async def create_embedding_skill(self, index_name: str):
name=skillset_name,
description="Skillset to chunk documents and generate embeddings",
skills=[split_skill, embedding_skill],
index_projections=index_projections,
index_projection=index_projection,
)

return skillset
Expand All @@ -132,25 +135,14 @@ async def setup(self):
if self.embeddings is None:
raise ValueError("Expecting Azure Open AI instance")

await search_manager.create_index(
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now we always make a vectorizer, even when not using integrated vectorization.

vectorizers=[
AzureOpenAIVectorizer(
name=f"{self.search_info.index_name}-vectorizer",
kind="azureOpenAI",
azure_open_ai_parameters=AzureOpenAIParameters(
resource_uri=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_id=self.embeddings.open_ai_deployment,
),
),
]
)
await search_manager.create_index()

# create indexer client
ds_client = self.search_info.create_search_indexer_client()
ds_container = SearchIndexerDataContainer(name=self.blob_manager.container)
data_source_connection = SearchIndexerDataSourceConnection(
name=f"{self.search_info.index_name}-blob",
type="azureblob",
type=SearchIndexerDataSourceType.AZURE_BLOB,
connection_string=self.blob_manager.get_managedidentity_connectionstring(),
container=ds_container,
data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy(),
Expand Down
15 changes: 13 additions & 2 deletions app/backend/prepdocslib/searchmanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from typing import List, Optional

from azure.search.documents.indexes.models import (
AzureOpenAIVectorizer,
AzureOpenAIVectorizerParameters,
HnswAlgorithmConfiguration,
HnswParameters,
SearchableField,
Expand Down Expand Up @@ -174,12 +176,21 @@ async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]]
VectorSearchProfile(
name="embedding_config",
algorithm_configuration_name="hnsw_config",
vectorizer=(
vectorizer_name=(
f"{self.search_info.index_name}-vectorizer" if self.use_int_vectorization else None
),
),
],
vectorizers=vectorizers,
vectorizers=[
AzureOpenAIVectorizer(
vectorizer_name=f"{self.search_info.index_name}-vectorizer",
parameters=AzureOpenAIVectorizerParameters(
resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_name=self.embeddings.open_ai_deployment,
model_name=self.embeddings.open_ai_model_name,
),
),
],
),
)
if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]:
Expand Down
2 changes: 1 addition & 1 deletion app/backend/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ tiktoken
tenacity
azure-ai-documentintelligence
azure-cognitiveservices-speech
azure-search-documents==11.6.0b1
azure-search-documents==11.6.0b6
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't get any errors about code changes on the .search() side so I'm assuming that's largely unchanged in the latest versions?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

correct

azure-storage-blob
azure-storage-file-datalake
uvicorn
Expand Down
2 changes: 1 addition & 1 deletion app/backend/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ azure-monitor-opentelemetry==1.6.1
# via -r requirements.in
azure-monitor-opentelemetry-exporter==1.0.0b28
# via azure-monitor-opentelemetry
azure-search-documents==11.6.0b1
azure-search-documents==11.6.0b6
# via -r requirements.in
azure-storage-blob==12.22.0
# via
Expand Down
4 changes: 3 additions & 1 deletion docs/data_ingestion.md
Original file line number Diff line number Diff line change
Expand Up @@ -65,11 +65,13 @@ You can also remove individual documents by using the `--remove` flag. Open eith

## Overview of Integrated Vectorization

Azure AI search recently introduced an [integrated vectorization feature in preview mode](https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in/ba-p/3960809#:~:text=Integrated%20vectorization%20is%20a%20new%20feature%20of%20Azure,pull-indexers%2C%20and%20vectorization%20of%20text%20queries%20through%20vectorizers). This feature is a cloud-based approach to data ingestion, which takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies.
Azure AI Search includes an [integrated vectorization feature](https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in/ba-p/3960809#:~:text=Integrated%20vectorization%20is%20a%20new%20feature%20of%20Azure,pull-indexers%2C%20and%20vectorization%20of%20text%20queries%20through%20vectorizers), a cloud-based approach to data ingestion. Integrated vectorization takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies.

See [this notebook](https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/integrated-vectorization/azure-search-integrated-vectorization-sample.ipynb) to understand the process of setting up integrated vectorization.
We have integrated that code into our `prepdocs` script, so you can use it without needing to understand the details.

You must first explicitly [enable integrated vectorization](./deploy_features.md#enabling-integrated-vectorization) in the `azd` environment to use this feature.

This feature cannot be used on an existing index. You need to create a new index, or drop and recreate an existing one.
In the newly created index schema, a new field 'parent_id' is added. This field is used internally by the indexer to manage the life cycle of chunks.

Expand Down
22 changes: 18 additions & 4 deletions docs/deploy_features.md
Original file line number Diff line number Diff line change
Expand Up @@ -192,12 +192,26 @@ Azure AI search recently introduced an [integrated vectorization feature in prev

To enable integrated vectorization with this sample:

1. If you've previously deployed, delete the existing search index.
2. Run `azd env set USE_FEATURE_INT_VECTORIZATION true`
3. Run `azd up` to update system and user roles
1. If you've previously deployed, delete the existing search index. 🗑️
2. To enable the use of integrated vectorization, run:

```shell
azd env set USE_FEATURE_INT_VECTORIZATION true
```

3. If you've already deployed your app, then you can run just the `provision` step:

```shell
azd provision
```

That will set up necessary RBAC roles and configure the integrated vectorization feature on your search service.

If you haven't deployed your app yet, then you should run the full `azd up` after configuring all optional features.

4. You can view resources such as the indexer and skillset in the Azure Portal and monitor the status of the vectorization process.

This feature is not currently compatible with GPT4-vision or the newer text-embedding-3 models.
⚠️ This feature is not currently compatible with the [GPT vision integration](./gpt4v.md).

## Enabling authentication

Expand Down
Loading