Skip to content

Commit 06c0956

Browse files
committed
More updates to integrated vectorization, fixes type checks
1 parent d4e40b8 commit 06c0956

File tree

3 files changed, with 159 additions and 123 deletions

3 files changed, with 159 additions and 123 deletions

app/backend/prepdocs.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -375,6 +375,10 @@ async def main(strategy: Strategy, setup_index: bool = True):
375375

376376
ingestion_strategy: Strategy
377377
if use_int_vectorization:
378+
379+
if not openai_embeddings_service or not isinstance(openai_embeddings_service, AzureOpenAIEmbeddingService):
380+
raise Exception("Integrated vectorization strategy requires an Azure OpenAI embeddings service")
381+
378382
ingestion_strategy = IntegratedVectorizerStrategy(
379383
search_info=search_info,
380384
list_file_strategy=list_file_strategy,

app/backend/prepdocslib/integratedvectorizerstrategy.py

Lines changed: 2 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -40,16 +40,14 @@ def __init__(
4040
list_file_strategy: ListFileStrategy,
4141
blob_manager: BlobManager,
4242
search_info: SearchInfo,
43-
embeddings: Optional[AzureOpenAIEmbeddingService],
43+
embeddings: AzureOpenAIEmbeddingService,
4444
subscription_id: str,
4545
search_service_user_assigned_id: str,
4646
document_action: DocumentAction = DocumentAction.Add,
4747
search_analyzer_name: Optional[str] = None,
4848
use_acls: bool = False,
4949
category: Optional[str] = None,
5050
):
51-
if not embeddings or not isinstance(embeddings, AzureOpenAIEmbeddingService):
52-
raise Exception("Expecting AzureOpenAI embedding service")
5351

5452
self.list_file_strategy = list_file_strategy
5553
self.blob_manager = blob_manager
@@ -78,9 +76,6 @@ async def create_embedding_skill(self, index_name: str):
7876
outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
7977
)
8078

81-
if self.embeddings is None:
82-
raise ValueError("Expecting Azure Open AI instance")
83-
8479
embedding_skill = AzureOpenAIEmbeddingSkill(
8580
name=f"{index_name}-embedding-skill",
8681
description="Skill to generate embeddings via Azure OpenAI",
@@ -123,6 +118,7 @@ async def create_embedding_skill(self, index_name: str):
123118
return skillset
124119

125120
async def setup(self):
121+
logger.info("Setting up search index using integrated vectorization...")
126122
search_manager = SearchManager(
127123
search_info=self.search_info,
128124
search_analyzer_name=self.search_analyzer_name,
@@ -132,12 +128,8 @@ async def setup(self):
132128
search_images=False,
133129
)
134130

135-
if self.embeddings is None:
136-
raise ValueError("Expecting Azure Open AI instance")
137-
138131
await search_manager.create_index()
139132

140-
# create indexer client
141133
ds_client = self.search_info.create_search_indexer_client()
142134
ds_container = SearchIndexerDataContainer(name=self.blob_manager.container)
143135
data_source_connection = SearchIndexerDataSourceConnection(
@@ -149,7 +141,6 @@ async def setup(self):
149141
)
150142

151143
await ds_client.create_or_update_data_source_connection(data_source_connection)
152-
logger.info("Search indexer data source connection updated.")
153144

154145
embedding_skillset = await self.create_embedding_skill(self.search_info.index_name)
155146
await ds_client.create_or_update_skillset(embedding_skillset)

app/backend/prepdocslib/searchmanager.py

Lines changed: 153 additions & 112 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
)
2424

2525
from .blobmanager import BlobManager
26-
from .embeddings import OpenAIEmbeddings
26+
from .embeddings import AzureOpenAIEmbeddingService, OpenAIEmbeddings
2727
from .listfilestrategy import File
2828
from .strategy import SearchInfo
2929
from .textsplitter import SplitPage
@@ -67,149 +67,190 @@ def __init__(
6767
self.search_images = search_images
6868

6969
async def create_index(self, vectorizers: Optional[List[VectorSearchVectorizer]] = None):
70-
logger.info("Ensuring search index %s exists", self.search_info.index_name)
70+
logger.info("Checking whether search index %s exists...", self.search_info.index_name)
7171

7272
async with self.search_info.create_search_index_client() as search_index_client:
73-
fields = [
74-
(
75-
SimpleField(name="id", type="Edm.String", key=True)
76-
if not self.use_int_vectorization
77-
else SearchField(
78-
name="id",
73+
74+
if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]:
75+
logger.info("Creating new search index %s", self.search_info.index_name)
76+
fields = [
77+
(
78+
SimpleField(name="id", type="Edm.String", key=True)
79+
if not self.use_int_vectorization
80+
else SearchField(
81+
name="id",
82+
type="Edm.String",
83+
key=True,
84+
sortable=True,
85+
filterable=True,
86+
facetable=True,
87+
analyzer_name="keyword",
88+
)
89+
),
90+
SearchableField(
91+
name="content",
7992
type="Edm.String",
80-
key=True,
81-
sortable=True,
82-
filterable=True,
83-
facetable=True,
84-
analyzer_name="keyword",
85-
)
86-
),
87-
SearchableField(
88-
name="content",
89-
type="Edm.String",
90-
analyzer_name=self.search_analyzer_name,
91-
),
92-
SearchField(
93-
name="embedding",
94-
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
95-
hidden=False,
96-
searchable=True,
97-
filterable=False,
98-
sortable=False,
99-
facetable=False,
100-
vector_search_dimensions=self.embedding_dimensions,
101-
vector_search_profile_name="embedding_config",
102-
),
103-
SimpleField(name="category", type="Edm.String", filterable=True, facetable=True),
104-
SimpleField(
105-
name="sourcepage",
106-
type="Edm.String",
107-
filterable=True,
108-
facetable=True,
109-
),
110-
SimpleField(
111-
name="sourcefile",
112-
type="Edm.String",
113-
filterable=True,
114-
facetable=True,
115-
),
116-
SimpleField(
117-
name="storageUrl",
118-
type="Edm.String",
119-
filterable=True,
120-
facetable=False,
121-
),
122-
]
123-
if self.use_acls:
124-
fields.append(
125-
SimpleField(
126-
name="oids",
127-
type=SearchFieldDataType.Collection(SearchFieldDataType.String),
128-
filterable=True,
129-
)
130-
)
131-
fields.append(
132-
SimpleField(
133-
name="groups",
134-
type=SearchFieldDataType.Collection(SearchFieldDataType.String),
135-
filterable=True,
136-
)
137-
)
138-
if self.use_int_vectorization:
139-
fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True))
140-
if self.search_images:
141-
fields.append(
93+
analyzer_name=self.search_analyzer_name,
94+
),
14295
SearchField(
143-
name="imageEmbedding",
96+
name="embedding",
14497
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
14598
hidden=False,
14699
searchable=True,
147100
filterable=False,
148101
sortable=False,
149102
facetable=False,
150-
vector_search_dimensions=1024,
103+
vector_search_dimensions=self.embedding_dimensions,
151104
vector_search_profile_name="embedding_config",
152105
),
153-
)
154-
155-
index = SearchIndex(
156-
name=self.search_info.index_name,
157-
fields=fields,
158-
semantic_search=SemanticSearch(
159-
configurations=[
160-
SemanticConfiguration(
161-
name="default",
162-
prioritized_fields=SemanticPrioritizedFields(
163-
title_field=None, content_fields=[SemanticField(field_name="content")]
164-
),
106+
SimpleField(name="category", type="Edm.String", filterable=True, facetable=True),
107+
SimpleField(
108+
name="sourcepage",
109+
type="Edm.String",
110+
filterable=True,
111+
facetable=True,
112+
),
113+
SimpleField(
114+
name="sourcefile",
115+
type="Edm.String",
116+
filterable=True,
117+
facetable=True,
118+
),
119+
SimpleField(
120+
name="storageUrl",
121+
type="Edm.String",
122+
filterable=True,
123+
facetable=False,
124+
),
125+
]
126+
if self.use_acls:
127+
fields.append(
128+
SimpleField(
129+
name="oids",
130+
type=SearchFieldDataType.Collection(SearchFieldDataType.String),
131+
filterable=True,
165132
)
166-
]
167-
),
168-
vector_search=VectorSearch(
169-
algorithms=[
170-
HnswAlgorithmConfiguration(
171-
name="hnsw_config",
172-
parameters=HnswParameters(metric="cosine"),
133+
)
134+
fields.append(
135+
SimpleField(
136+
name="groups",
137+
type=SearchFieldDataType.Collection(SearchFieldDataType.String),
138+
filterable=True,
173139
)
174-
],
175-
profiles=[
176-
VectorSearchProfile(
177-
name="embedding_config",
178-
algorithm_configuration_name="hnsw_config",
179-
vectorizer_name=(
180-
f"{self.search_info.index_name}-vectorizer" if self.use_int_vectorization else None
181-
),
140+
)
141+
if self.use_int_vectorization:
142+
logger.info("Including parent_id field in new index %s", self.search_info.index_name)
143+
fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True))
144+
if self.search_images:
145+
logger.info("Including imageEmbedding field in new index %s", self.search_info.index_name)
146+
fields.append(
147+
SearchField(
148+
name="imageEmbedding",
149+
type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
150+
hidden=False,
151+
searchable=True,
152+
filterable=False,
153+
sortable=False,
154+
facetable=False,
155+
vector_search_dimensions=1024,
156+
vector_search_profile_name="embedding_config",
182157
),
183-
],
184-
vectorizers=[
158+
)
159+
160+
vectorizers = []
161+
if self.embeddings and isinstance(self.embeddings, AzureOpenAIEmbeddingService):
162+
logger.info(
163+
"Including vectorizer for search index %s, using Azure OpenAI service %s",
164+
self.search_info.index_name,
165+
self.embeddings.open_ai_service,
166+
)
167+
vectorizers.append(
185168
AzureOpenAIVectorizer(
186169
vectorizer_name=f"{self.search_info.index_name}-vectorizer",
187170
parameters=AzureOpenAIVectorizerParameters(
188-
resource_url=f"https://{self.embeddings.open_ai_service}.openai.azure.com",
171+
resource_url=self.embeddings.open_ai_endpoint,
189172
deployment_name=self.embeddings.open_ai_deployment,
190173
model_name=self.embeddings.open_ai_model_name,
191174
),
192-
),
193-
],
194-
),
195-
)
196-
if self.search_info.index_name not in [name async for name in search_index_client.list_index_names()]:
197-
logger.info("Creating %s search index", self.search_info.index_name)
175+
)
176+
)
177+
else:
178+
logger.info(
179+
"Not including vectorizer for search index %s, no Azure OpenAI service found",
180+
self.search_info.index_name,
181+
)
182+
183+
index = SearchIndex(
184+
name=self.search_info.index_name,
185+
fields=fields,
186+
semantic_search=SemanticSearch(
187+
configurations=[
188+
SemanticConfiguration(
189+
name="default",
190+
prioritized_fields=SemanticPrioritizedFields(
191+
title_field=None, content_fields=[SemanticField(field_name="content")]
192+
),
193+
)
194+
]
195+
),
196+
vector_search=VectorSearch(
197+
algorithms=[
198+
HnswAlgorithmConfiguration(
199+
name="hnsw_config",
200+
parameters=HnswParameters(metric="cosine"),
201+
)
202+
],
203+
profiles=[
204+
VectorSearchProfile(
205+
name="embedding_config",
206+
algorithm_configuration_name="hnsw_config",
207+
vectorizer_name=(
208+
f"{self.search_info.index_name}-vectorizer" if self.use_int_vectorization else None
209+
),
210+
),
211+
],
212+
vectorizers=vectorizers,
213+
),
214+
)
215+
198216
await search_index_client.create_index(index)
199217
else:
200218
logger.info("Search index %s already exists", self.search_info.index_name)
201-
index_definition = await search_index_client.get_index(self.search_info.index_name)
202-
if not any(field.name == "storageUrl" for field in index_definition.fields):
219+
existing_index = await search_index_client.get_index(self.search_info.index_name)
220+
if not any(field.name == "storageUrl" for field in existing_index.fields):
203221
logger.info("Adding storageUrl field to index %s", self.search_info.index_name)
204-
index_definition.fields.append(
222+
existing_index.fields.append(
205223
SimpleField(
206224
name="storageUrl",
207225
type="Edm.String",
208226
filterable=True,
209227
facetable=False,
210228
),
211229
)
212-
await search_index_client.create_or_update_index(index_definition)
230+
await search_index_client.create_or_update_index(existing_index)
231+
232+
if existing_index.vector_search is not None and (
233+
existing_index.vector_search.vectorizers is None
234+
or len(existing_index.vector_search.vectorizers) == 0
235+
):
236+
if self.embeddings is not None:
237+
logger.info("Adding vectorizer to search index %s", self.search_info.index_name)
238+
existing_index.vector_search.vectorizers = [
239+
AzureOpenAIVectorizer(
240+
vectorizer_name=f"{self.search_info.index_name}-vectorizer",
241+
parameters=AzureOpenAIVectorizerParameters(
242+
resource_url=self.embeddings.open_ai_endpoint,
243+
deployment_name=self.embeddings.open_ai_deployment,
244+
model_name=self.embeddings.open_ai_model_name,
245+
),
246+
)
247+
]
248+
await search_index_client.create_or_update_index(existing_index)
249+
else:
250+
logger.info(
251+
"Can't add vectorizer to search index %s since embeddings service isn't defined",
252+
self.search_info,
253+
)
213254

214255
async def update_content(
215256
self, sections: List[Section], image_embeddings: Optional[List[List[float]]] = None, url: Optional[str] = None

0 commit comments

Comments
 (0)