Skip to content

Commit 68bc453

Browse files
authored
[ai] Fix build index bug (#34001)
* Revert change to acs_connection_id type
* Revert changes to data_index_* pipeline constructors
* Prevent NoneType errors by checking
* Fix import of EmbeddingsContainer
* Fix import of EmbeddingsContainer in _mlindex
* Revert "Fix import of EmbeddingsContainer in _mlindex" (reverts commit 46adaf8)
* Add type ignore for mypy error and assign bug
1 parent 2c69b77 commit 68bc453

File tree

4 files changed

+60
-62
lines changed

4 files changed

+60
-62
lines changed

sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/data_index_func.py

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -260,10 +260,10 @@ def data_index_acs_pipeline(
260260
input_data: Input,
261261
embeddings_model: str,
262262
acs_config: str,
263-
acs_connection_id: Optional[str],
264-
aoai_connection_id: Optional[str],
263+
acs_connection_id: str,
264+
aoai_connection_id: str,
265265
embeddings_container: Input,
266-
chunk_size: Optional[int] = 768,
266+
chunk_size: int = 768,
267267
chunk_overlap: Optional[int] = 0,
268268
input_glob: Optional[str] = "**/*",
269269
citation_url: Optional[str] = None,
@@ -279,28 +279,26 @@ def data_index_acs_pipeline(
279279
:param acs_config: The configuration for the Azure Cognitive Search index.
280280
:type acs_config: str
281281
:param acs_connection_id: The connection ID for the Azure Cognitive Search index.
282-
:type acs_connection_id: Optional[str]
282+
:type acs_connection_id: str
283283
:param chunk_size: The size of the chunks to break the input data into.
284-
:type chunk_size: Optional[int]
284+
:type chunk_size: int
285285
:param chunk_overlap: The number of tokens to overlap between chunks.
286286
:type chunk_overlap: Optional[int]
287287
:param input_glob: The glob pattern to use when searching for input data.
288-
:type input_glob: Optional[str]s
288+
:type input_glob: Optional[str]
289289
:param citation_url: The URL to use when generating citations for the input data.
290290
:type citation_url: str
291291
:param citation_replacement_regex: The regex to use when generating citations for the input data.
292292
:type citation_replacement_regex: str
293293
:param aoai_connection_id: The connection ID for the Azure Open AI service.
294-
:type aoai_connection_id: Optional[str]
294+
:type aoai_connection_id: str
295295
:param embeddings_container: The container to use when caching embeddings.
296296
:type embeddings_container: Input
297297
:return: The URI of the generated Azure Cognitive Search index.
298298
:rtype: str.
299299
"""
300300
if input_glob is None:
301301
input_glob = "**/*"
302-
if chunk_size is None:
303-
chunk_size = 768
304302
if chunk_overlap is None:
305303
chunk_overlap = 0
306304

@@ -361,7 +359,7 @@ def data_index_acs_pipeline(
361359
component = data_index_acs_pipeline(
362360
input_data=input_data,
363361
input_glob=data_index.source.input_glob,
364-
chunk_size=data_index.source.chunk_size,
362+
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type]
365363
chunk_overlap=data_index.source.chunk_overlap,
366364
citation_url=data_index.source.citation_url,
367365
citation_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
@@ -422,11 +420,11 @@ def data_index_faiss_pipeline(
422420
input_data: Input,
423421
embeddings_model: str,
424422
embeddings_container: Input,
425-
chunk_size: Optional[int] = 1024,
426-
data_source_glob: Optional[str] = None,
427-
data_source_url: Optional[str] = None,
428-
document_path_replacement_regex: Optional[str] = None,
429-
aoai_connection_id: Optional[str] = None,
423+
chunk_size: int = 1024,
424+
data_source_glob: str = None, # type: ignore[assignment]
425+
data_source_url: str = None, # type: ignore[assignment]
426+
document_path_replacement_regex: str = None, # type: ignore[assignment]
427+
aoai_connection_id: str = None, # type: ignore[assignment]
430428
):
431429
"""
432430
Generate embeddings for a `input_data` source and create a Faiss index from them.
@@ -508,10 +506,10 @@ def data_index_faiss_pipeline(
508506
component = data_index_faiss_pipeline(
509507
input_data=input_data,
510508
embeddings_model=build_model_protocol(data_index.embedding.model),
511-
chunk_size=data_index.source.chunk_size,
512-
data_source_glob=data_index.source.input_glob,
513-
data_source_url=data_index.source.citation_url,
514-
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
509+
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type]
510+
data_source_glob=data_index.source.input_glob, # type: ignore[arg-type]
511+
data_source_url=data_index.source.citation_url, # type: ignore[arg-type]
512+
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict()) # type: ignore[arg-type]
515513
if data_index.source.citation_url_replacement_regex
516514
else None,
517515
aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
@@ -563,13 +561,13 @@ def data_index_acs_pipeline(
563561
input_data: Input,
564562
embeddings_model: str,
565563
acs_config: str,
566-
acs_connection_id: Optional[str],
564+
acs_connection_id: str,
567565
embeddings_container: Input,
568-
chunk_size: Optional[int] = 1024,
569-
data_source_glob: Optional[str] = None,
570-
data_source_url: Optional[str] = None,
571-
document_path_replacement_regex: Optional[str] = None,
572-
aoai_connection_id: Optional[str] = None,
566+
chunk_size: int = 1024,
567+
data_source_glob: str = None, # type: ignore[assignment]
568+
data_source_url: str = None, # type: ignore[assignment]
569+
document_path_replacement_regex: str = None, # type: ignore[assignment]
570+
aoai_connection_id: str = None, # type: ignore[assignment]
573571
):
574572
"""
575573
Generate embeddings for a `input_data` source and push them into an Azure Cognitive Search index.
@@ -666,10 +664,10 @@ def data_index_acs_pipeline(
666664
embeddings_model=build_model_protocol(data_index.embedding.model),
667665
acs_config=json.dumps(acs_config),
668666
acs_connection_id=_resolve_connection_id(ml_client, data_index.index.connection),
669-
chunk_size=data_index.source.chunk_size,
670-
data_source_glob=data_index.source.input_glob,
671-
data_source_url=data_index.source.citation_url,
672-
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
667+
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type]
668+
data_source_glob=data_index.source.input_glob, # type: ignore[arg-type]
669+
data_source_url=data_index.source.citation_url, # type: ignore[arg-type]
670+
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict()) # type: ignore[arg-type]
673671
if data_index.source.citation_url_replacement_regex
674672
else None,
675673
aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
@@ -754,9 +752,9 @@ def get_component_obj(ml_client, component_uri):
754752
return component_obj
755753

756754

757-
def _resolve_connection_id(ml_client, connection: Optional[Union[str, WorkspaceConnection]] = None) -> Optional[str]:
755+
def _resolve_connection_id(ml_client, connection: Optional[Union[str, WorkspaceConnection]] = None) -> str:
758756
if connection is None:
759-
return None
757+
return ""
760758

761759
if isinstance(connection, str):
762760
short_form = re.match(r"azureml:(?P<connection_name>[^/]*)", connection)

sdk/ai/azure-ai-generative/azure/ai/generative/index/_models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def split_details(details):
4545
config = {**split_details(details), **config}
4646
config["kind"] = "open_ai"
4747
if "endpoint" in config:
48-
if ".openai." in config["endpoint"] or ".api.cognitive." in config["endpoint"] or ".cognitiveservices." in config["endpoint"]:
48+
if config["endpoint"] and (".openai." in config["endpoint"] or ".api.cognitive." in config["endpoint"] or ".cognitiveservices." in config["endpoint"]):
4949
config["api_base"] = config["endpoint"].rstrip("/")
5050
else:
5151
config["api_base"] = f"https://{config['endpoint']}.openai.azure.com"
@@ -137,7 +137,7 @@ def init_open_ai_from_config(config: dict, credential: Optional[TokenCredential]
137137
else:
138138
raise e
139139

140-
if "azure" in openai.api_type:
140+
if openai.api_type and "azure" in openai.api_type:
141141
config["api_version"] = config.get("api_version", "2023-03-15-preview")
142142

143143
return config

sdk/ai/azure-ai-resources/azure/ai/resources/_index/_dataindex/entities/_builders/data_index_func.py

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -224,14 +224,14 @@ def data_index_acs_pipeline(
224224
input_data: Input,
225225
embeddings_model: str,
226226
acs_config: str,
227-
acs_connection_id: Optional[str],
227+
acs_connection_id: str,
228228
embeddings_container: Input,
229-
chunk_size: Optional[int] = 768,
229+
chunk_size: int = 768,
230230
chunk_overlap: Optional[int] = 0,
231231
input_glob: Optional[str] = "**/*",
232232
citation_url: Optional[str] = None,
233233
citation_replacement_regex: Optional[str] = None,
234-
aoai_connection_id: Optional[str] = None,
234+
aoai_connection_id: str = None, # type: ignore[assignment]
235235
):
236236
"""
237237
Generate embeddings for a `input_data` source and push them into an Azure Cognitive Search index.
@@ -243,7 +243,7 @@ def data_index_acs_pipeline(
243243
:param acs_config: The configuration for the Azure Cognitive Search index.
244244
:type acs_config: str
245245
:param acs_connection_id: The connection ID for the Azure Cognitive Search index.
246-
:type acs_connection_id: Optional[str]
246+
:type acs_connection_id: str
247247
:param chunk_size: The size of the chunks to break the input data into. Defaults to 768.
248248
:type chunk_size: int
249249
:param chunk_overlap: The number of tokens to overlap between chunks. Defaults to 0.
@@ -325,7 +325,7 @@ def data_index_acs_pipeline(
325325
component = data_index_acs_pipeline(
326326
input_data=input_data,
327327
input_glob=data_index.source.input_glob,
328-
chunk_size=data_index.source.chunk_size,
328+
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type]
329329
chunk_overlap=data_index.source.chunk_overlap,
330330
citation_url=data_index.source.citation_url,
331331
citation_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
@@ -386,11 +386,11 @@ def data_index_faiss_pipeline(
386386
input_data: Input,
387387
embeddings_model: str,
388388
embeddings_container: Input,
389-
chunk_size: Optional[int] = 1024,
390-
data_source_glob: Optional[str] = None,
391-
data_source_url: Optional[str] = None,
392-
document_path_replacement_regex: Optional[str] = None,
393-
aoai_connection_id: Optional[str] = None,
389+
chunk_size: int = 1024,
390+
data_source_glob: str = None, # type: ignore[assignment]
391+
data_source_url: str = None, # type: ignore[assignment]
392+
document_path_replacement_regex: str = None, # type: ignore[assignment]
393+
aoai_connection_id: str = None, # type: ignore[assignment]
394394
):
395395
"""
396396
Generate embeddings for a `input_data` source and create a Faiss index from them.
@@ -472,10 +472,10 @@ def data_index_faiss_pipeline(
472472
component = data_index_faiss_pipeline(
473473
input_data=input_data,
474474
embeddings_model=build_model_protocol(data_index.embedding.model),
475-
chunk_size=data_index.source.chunk_size,
476-
data_source_glob=data_index.source.input_glob,
477-
data_source_url=data_index.source.citation_url,
478-
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
475+
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type]
476+
data_source_glob=data_index.source.input_glob, # type: ignore[arg-type]
477+
data_source_url=data_index.source.citation_url, # type: ignore[arg-type]
478+
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict()) # type: ignore[arg-type]
479479
if data_index.source.citation_url_replacement_regex
480480
else None,
481481
aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
@@ -527,13 +527,13 @@ def data_index_acs_pipeline(
527527
input_data: Input,
528528
embeddings_model: str,
529529
acs_config: str,
530-
acs_connection_id: Optional[str],
530+
acs_connection_id: str,
531531
embeddings_container: Input,
532-
chunk_size: Optional[int] = 1024,
533-
data_source_glob: Optional[str] = None,
534-
data_source_url: Optional[str] = None,
535-
document_path_replacement_regex: Optional[str] = None,
536-
aoai_connection_id: Optional[str] = None,
532+
chunk_size: int = 1024,
533+
data_source_glob: str = None, # type: ignore[assignment]
534+
data_source_url: str = None, # type: ignore[assignment]
535+
document_path_replacement_regex: str = None, # type: ignore[assignment]
536+
aoai_connection_id: str = None, # type: ignore[assignment]
537537
) -> Dict[str, Any]:
538538
"""
539539
Generate embeddings for a `input_data` source and push them into an Azure Cognitive Search index.
@@ -630,10 +630,10 @@ def data_index_acs_pipeline(
630630
embeddings_model=build_model_protocol(data_index.embedding.model),
631631
acs_config=json.dumps(acs_config),
632632
acs_connection_id=_resolve_connection_id(ml_client, data_index.index.connection),
633-
chunk_size=data_index.source.chunk_size,
634-
data_source_glob=data_index.source.input_glob,
635-
data_source_url=data_index.source.citation_url,
636-
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
633+
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type]
634+
data_source_glob=data_index.source.input_glob, # type: ignore[arg-type]
635+
data_source_url=data_index.source.citation_url, # type: ignore[arg-type]
636+
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict()) # type: ignore[arg-type]
637637
if data_index.source.citation_url_replacement_regex
638638
else None,
639639
aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
@@ -718,9 +718,9 @@ def get_component_obj(ml_client, component_uri):
718718
return component_obj
719719

720720

721-
def _resolve_connection_id(ml_client, connection: Optional[Union[str, WorkspaceConnection]] = None) -> Optional[str]:
721+
def _resolve_connection_id(ml_client, connection: Optional[Union[str, WorkspaceConnection]] = None) -> str:
722722
if connection is None:
723-
return None
723+
return ""
724724

725725
if isinstance(connection, str):
726726
short_form = re.match(r"azureml:(?P<connection_name>[^/]*)", connection)

sdk/ai/azure-ai-resources/azure/ai/resources/client/_ai_client.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -315,16 +315,16 @@ def build_index_on_cloud(
315315
IndexSource,
316316
IndexStore,
317317
)
318-
from azure.ai.resources._index._embeddings.EmbeddingsContainer import from_uri
318+
from azure.ai.resources._index._embeddings import EmbeddingsContainer
319319
if isinstance(input_source, ACSSource):
320320
from azure.ai.resources._index._utils.connections import get_connection_by_id_v2, get_target_from_connection
321321

322322
# Construct MLIndex object
323323
mlindex_config = {}
324324
connection_args = {"connection_type": "workspace_connection", "connection": {"id": aoai_connection_id}}
325-
mlindex_config["embeddings"] = from_uri(
325+
mlindex_config["embeddings"] = EmbeddingsContainer.from_uri( # type: ignore[attr-defined]
326326
build_open_ai_protocol(embeddings_model), **connection_args
327-
).get_metadata()
327+
).get_metadata() # Bug 2922096
328328
mlindex_config["index"] = {
329329
"kind": "acs",
330330
"connection_type": "workspace_connection",

0 commit comments

Comments (0)