Skip to content

Commit 31f501a

Browse files
authored
Updates to integrated vectorization (#2045)
* Initial integrated vectorization improvements
* Update integrated vectorization
* More updates to integrated vectorization, fixes type checks
* Add type check
1 parent ec07548 commit 31f501a

File tree

7 files changed

+201
-146
lines changed

7 files changed

+201
-146
lines changed

app/backend/prepdocs.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -375,6 +375,10 @@ async def main(strategy: Strategy, setup_index: bool = True):
375375

376376
ingestion_strategy: Strategy
377377
if use_int_vectorization:
378+
379+
if not openai_embeddings_service or not isinstance(openai_embeddings_service, AzureOpenAIEmbeddingService):
380+
raise Exception("Integrated vectorization strategy requires an Azure OpenAI embeddings service")
381+
378382
ingestion_strategy = IntegratedVectorizerStrategy(
379383
search_info=search_info,
380384
list_file_strategy=list_file_strategy,

app/backend/prepdocslib/integratedvectorizerstrategy.py

Lines changed: 14 additions & 31 deletions
Original file line number | Diff line number | Diff line change
@@ -6,16 +6,15 @@
66
)
77
from azure.search.documents.indexes.models import (
88
AzureOpenAIEmbeddingSkill,
9-
AzureOpenAIParameters,
10-
AzureOpenAIVectorizer,
119
FieldMapping,
1210
IndexProjectionMode,
1311
InputFieldMappingEntry,
1412
OutputFieldMappingEntry,
1513
SearchIndexer,
1614
SearchIndexerDataContainer,
1715
SearchIndexerDataSourceConnection,
18-
SearchIndexerIndexProjections,
16+
SearchIndexerDataSourceType,
17+
SearchIndexerIndexProjection,
1918
SearchIndexerIndexProjectionSelector,
2019
SearchIndexerIndexProjectionsParameters,
2120
SearchIndexerSkillset,
@@ -41,16 +40,14 @@ def __init__(
4140
list_file_strategy: ListFileStrategy,
4241
blob_manager: BlobManager,
4342
search_info: SearchInfo,
44-
embeddings: Optional[AzureOpenAIEmbeddingService],
43+
embeddings: AzureOpenAIEmbeddingService,
4544
subscription_id: str,
4645
search_service_user_assigned_id: str,
4746
document_action: DocumentAction = DocumentAction.Add,
4847
search_analyzer_name: Optional[str] = None,
4948
use_acls: bool = False,
5049
category: Optional[str] = None,
5150
):
52-
if not embeddings or not isinstance(embeddings, AzureOpenAIEmbeddingService):
53-
raise Exception("Expecting AzureOpenAI embedding service")
5451

5552
self.list_file_strategy = list_file_strategy
5653
self.blob_manager = blob_manager
@@ -67,6 +64,7 @@ async def create_embedding_skill(self, index_name: str):
6764
skillset_name = f"{index_name}-skillset"
6865

6966
split_skill = SplitSkill(
67+
name=f"{index_name}-split-skill",
7068
description="Split skill to chunk documents",
7169
text_split_mode="pages",
7270
context="/document",
@@ -78,21 +76,21 @@ async def create_embedding_skill(self, index_name: str):
7876
outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
7977
)
8078

81-
if self.embeddings is None:
82-
raise ValueError("Expecting Azure Open AI instance")
83-
8479
embedding_skill = AzureOpenAIEmbeddingSkill(
80+
name=f"{index_name}-embedding-skill",
8581
description="Skill to generate embeddings via Azure OpenAI",
8682
context="/document/pages/*",
87-
resource_uri=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
88-
deployment_id=self.embeddings.open_ai_deployment,
83+
resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
84+
deployment_name=self.embeddings.open_ai_deployment,
85+
model_name=self.embeddings.open_ai_model_name,
86+
dimensions=self.embeddings.open_ai_dimensions,
8987
inputs=[
9088
InputFieldMappingEntry(name="text", source="/document/pages/*"),
9189
],
9290
outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
9391
)
9492

95-
index_projections = SearchIndexerIndexProjections(
93+
index_projection = SearchIndexerIndexProjection(
9694
selectors=[
9795
SearchIndexerIndexProjectionSelector(
9896
target_index_name=index_name,
@@ -114,12 +112,13 @@ async def create_embedding_skill(self, index_name: str):
114112
name=skillset_name,
115113
description="Skillset to chunk documents and generate embeddings",
116114
skills=[split_skill, embedding_skill],
117-
index_projections=index_projections,
115+
index_projection=index_projection,
118116
)
119117

120118
return skillset
121119

122120
async def setup(self):
121+
logger.info("Setting up search index using integrated vectorization...")
123122
search_manager = SearchManager(
124123
search_info=self.search_info,
125124
search_analyzer_name=self.search_analyzer_name,
@@ -129,35 +128,19 @@ async def setup(self):
129128
search_images=False,
130129
)
131130

132-
if self.embeddings is None:
133-
raise ValueError("Expecting Azure Open AI instance")
134-
135-
await search_manager.create_index(
136-
vectorizers=[
137-
AzureOpenAIVectorizer(
138-
name=f"{self.search_info.index_name}-vectorizer",
139-
kind="azureOpenAI",
140-
azure_open_ai_parameters=AzureOpenAIParameters(
141-
resource_uri=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
142-
deployment_id=self.embeddings.open_ai_deployment,
143-
),
144-
),
145-
]
146-
)
131+
await search_manager.create_index()
147132

148-
# create indexer client
149133
ds_client = self.search_info.create_search_indexer_client()
150134
ds_container = SearchIndexerDataContainer(name=self.blob_manager.container)
151135
data_source_connection = SearchIndexerDataSourceConnection(
152136
name=f"{self.search_info.index_name}-blob",
153-
type="azureblob",
137+
type=SearchIndexerDataSourceType.AZURE_BLOB,
154138
connection_string=self.blob_manager.get_managedidentity_connectionstring(),
155139
container=ds_container,
156140
data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy(),
157141
)
158142

159143
await ds_client.create_or_update_data_source_connection(data_source_connection)
160-
logger.info("Search indexer data source connection updated.")
161144

162145
embedding_skillset = await self.create_embedding_skill(self.search_info.index_name)
163146
await ds_client.create_or_update_skillset(embedding_skillset)

0 commit comments

Comments
 (0)