Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions app/backend/prepdocs.py
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,10 @@ async def main(strategy: Strategy, setup_index: bool = True):

ingestion_strategy: Strategy
if use_int_vectorization:

if not openai_embeddings_service or not isinstance(openai_embeddings_service, AzureOpenAIEmbeddingService):
raise Exception("Integrated vectorization strategy requires an Azure OpenAI embeddings service")

ingestion_strategy = IntegratedVectorizerStrategy(
search_info=search_info,
list_file_strategy=list_file_strategy,
Expand Down
45 changes: 14 additions & 31 deletions app/backend/prepdocslib/integratedvectorizerstrategy.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,15 @@
)
from azure.search.documents.indexes.models import (
AzureOpenAIEmbeddingSkill,
AzureOpenAIParameters,
AzureOpenAIVectorizer,
FieldMapping,
IndexProjectionMode,
InputFieldMappingEntry,
OutputFieldMappingEntry,
SearchIndexer,
SearchIndexerDataContainer,
SearchIndexerDataSourceConnection,
SearchIndexerIndexProjections,
SearchIndexerDataSourceType,
SearchIndexerIndexProjection,
SearchIndexerIndexProjectionSelector,
SearchIndexerIndexProjectionsParameters,
SearchIndexerSkillset,
Expand All @@ -41,16 +40,14 @@ def __init__(
list_file_strategy: ListFileStrategy,
blob_manager: BlobManager,
search_info: SearchInfo,
embeddings: Optional[AzureOpenAIEmbeddingService],
embeddings: AzureOpenAIEmbeddingService,
subscription_id: str,
search_service_user_assigned_id: str,
document_action: DocumentAction = DocumentAction.Add,
search_analyzer_name: Optional[str] = None,
use_acls: bool = False,
category: Optional[str] = None,
):
if not embeddings or not isinstance(embeddings, AzureOpenAIEmbeddingService):
raise Exception("Expecting AzureOpenAI embedding service")

self.list_file_strategy = list_file_strategy
self.blob_manager = blob_manager
Expand All @@ -67,6 +64,7 @@ async def create_embedding_skill(self, index_name: str):
skillset_name = f"{index_name}-skillset"

split_skill = SplitSkill(
name=f"{index_name}-split-skill",
description="Split skill to chunk documents",
text_split_mode="pages",
context="/document",
Expand All @@ -78,21 +76,21 @@ async def create_embedding_skill(self, index_name: str):
outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)

if self.embeddings is None:
raise ValueError("Expecting Azure Open AI instance")

embedding_skill = AzureOpenAIEmbeddingSkill(
name=f"{index_name}-embedding-skill",
description="Skill to generate embeddings via Azure OpenAI",
context="/document/pages/*",
resource_uri=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_id=self.embeddings.open_ai_deployment,
resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_name=self.embeddings.open_ai_deployment,
model_name=self.embeddings.open_ai_model_name,
dimensions=self.embeddings.open_ai_dimensions,
inputs=[
InputFieldMappingEntry(name="text", source="/document/pages/*"),
],
outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
)

index_projections = SearchIndexerIndexProjections(
index_projection = SearchIndexerIndexProjection(
selectors=[
SearchIndexerIndexProjectionSelector(
target_index_name=index_name,
Expand All @@ -114,12 +112,13 @@ async def create_embedding_skill(self, index_name: str):
name=skillset_name,
description="Skillset to chunk documents and generate embeddings",
skills=[split_skill, embedding_skill],
index_projections=index_projections,
index_projection=index_projection,
)

return skillset

async def setup(self):
logger.info("Setting up search index using integrated vectorization...")
search_manager = SearchManager(
search_info=self.search_info,
search_analyzer_name=self.search_analyzer_name,
Expand All @@ -129,35 +128,19 @@ async def setup(self):
search_images=False,
)

if self.embeddings is None:
raise ValueError("Expecting Azure Open AI instance")

await search_manager.create_index(
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Now we always make a vectorizer, even when not using integrated vectorization.

vectorizers=[
AzureOpenAIVectorizer(
name=f"{self.search_info.index_name}-vectorizer",
kind="azureOpenAI",
azure_open_ai_parameters=AzureOpenAIParameters(
resource_uri=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
deployment_id=self.embeddings.open_ai_deployment,
),
),
]
)
await search_manager.create_index()

# create indexer client
ds_client = self.search_info.create_search_indexer_client()
ds_container = SearchIndexerDataContainer(name=self.blob_manager.container)
data_source_connection = SearchIndexerDataSourceConnection(
name=f"{self.search_info.index_name}-blob",
type="azureblob",
type=SearchIndexerDataSourceType.AZURE_BLOB,
connection_string=self.blob_manager.get_managedidentity_connectionstring(),
container=ds_container,
data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy(),
)

await ds_client.create_or_update_data_source_connection(data_source_connection)
logger.info("Search indexer data source connection updated.")

embedding_skillset = await self.create_embedding_skill(self.search_info.index_name)
await ds_client.create_or_update_skillset(embedding_skillset)
Expand Down
Loading
Loading