Skip to content

Commit e9b822c

Browse files
committed
Upgrade int vect for new embedding model
1 parent bfb74e6 commit e9b822c

File tree

1 file changed

+15
-15
lines changed

1 file changed

+15
-15
lines changed

app/backend/prepdocslib/integratedvectorizerstrategy.py

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
)
77
from azure.search.documents.indexes.models import (
88
AzureOpenAIEmbeddingSkill,
9-
FieldMapping,
109
IndexProjectionMode,
1110
InputFieldMappingEntry,
1211
OutputFieldMappingEntry,
@@ -63,15 +62,18 @@ def __init__(
6362
self.use_acls = use_acls
6463
self.category = category
6564
self.search_info = search_info
65+
prefix = f"{self.search_info.index_name}-{self.search_field_name_embedding}"
66+
self.skillset_name = f"{prefix}-skillset"
67+
self.indexer_name = f"{prefix}-indexer"
68+
self.data_source_name = f"{prefix}-blob"
6669

6770
async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset:
6871
"""
6972
Create a skillset for the indexer to chunk documents and generate embeddings
7073
"""
71-
skillset_name = f"{index_name}-skillset"
7274

7375
split_skill = SplitSkill(
74-
name=f"{index_name}-split-skill",
76+
name="split-skill",
7577
description="Split skill to chunk documents",
7678
text_split_mode="pages",
7779
context="/document",
@@ -84,7 +86,7 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
8486
)
8587

8688
embedding_skill = AzureOpenAIEmbeddingSkill(
87-
name=f"{index_name}-embedding-skill",
89+
name="embedding-skill",
8890
description="Skill to generate embeddings via Azure OpenAI",
8991
context="/document/pages/*",
9092
resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
@@ -94,7 +96,7 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
9496
inputs=[
9597
InputFieldMappingEntry(name="text", source="/document/pages/*"),
9698
],
97-
outputs=[OutputFieldMappingEntry(name=self.search_field_name_embedding, target_name="vector")],
99+
outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
98100
)
99101

100102
index_projection = SearchIndexerIndexProjection(
@@ -106,6 +108,8 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
106108
mappings=[
107109
InputFieldMappingEntry(name="content", source="/document/pages/*"),
108110
InputFieldMappingEntry(name="sourcepage", source="/document/metadata_storage_name"),
111+
InputFieldMappingEntry(name="sourcefile", source="/document/metadata_storage_name"),
112+
InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"),
109113
InputFieldMappingEntry(
110114
name=self.search_field_name_embedding, source="/document/pages/*/vector"
111115
),
@@ -118,7 +122,7 @@ async def create_embedding_skill(self, index_name: str) -> SearchIndexerSkillset
118122
)
119123

120124
skillset = SearchIndexerSkillset(
121-
name=skillset_name,
125+
name=self.skillset_name,
122126
description="Skillset to chunk documents and generate embeddings",
123127
skills=[split_skill, embedding_skill],
124128
index_projection=index_projection,
@@ -144,7 +148,7 @@ async def setup(self):
144148
ds_client = self.search_info.create_search_indexer_client()
145149
ds_container = SearchIndexerDataContainer(name=self.blob_manager.container)
146150
data_source_connection = SearchIndexerDataSourceConnection(
147-
name=f"{self.search_info.index_name}-blob",
151+
name=self.data_source_name,
148152
type=SearchIndexerDataSourceType.AZURE_BLOB,
149153
connection_string=self.blob_manager.get_managedidentity_connectionstring(),
150154
container=ds_container,
@@ -174,23 +178,19 @@ async def run(self):
174178
await self.blob_manager.remove_blob()
175179

176180
# Create an indexer
177-
indexer_name = f"{self.search_info.index_name}-indexer"
178-
179181
indexer = SearchIndexer(
180-
name=indexer_name,
182+
name=self.indexer_name,
181183
description="Indexer to index documents and generate embeddings",
182-
skillset_name=f"{self.search_info.index_name}-skillset",
184+
skillset_name=self.skillset_name,
183185
target_index_name=self.search_info.index_name,
184-
data_source_name=f"{self.search_info.index_name}-blob",
185-
# Map the metadata_storage_name field to the title field in the index to display the PDF title in the search results
186-
field_mappings=[FieldMapping(source_field_name="metadata_storage_name", target_field_name="title")],
186+
data_source_name=self.data_source_name,
187187
)
188188

189189
indexer_client = self.search_info.create_search_indexer_client()
190190
indexer_result = await indexer_client.create_or_update_indexer(indexer)
191191

192192
# Run the indexer
193-
await indexer_client.run_indexer(indexer_name)
193+
await indexer_client.run_indexer(self.indexer_name)
194194
await indexer_client.close()
195195

196196
logger.info(

0 commit comments

Comments
 (0)