Skip to content

Commit 68bc453

Browse files
authored
[ai] Fix build index bug (#34001)
* Revert change to acs_connection_id type
* Revert changes to data_index_* pipeline constructors
* Prevent NoneType errors by checking
* Fix import of EmbeddingsContainer
* Fix import of EmbeddingsContainer in _mlindex
* Revert "Fix import of EmbeddingsContainer in _mlindex" (reverts commit 46adaf8)
* Add type ignore for mypy error and assign bug
1 parent 2c69b77 commit 68bc453

File tree

4 files changed

+60
-62
lines changed

4 files changed

+60
-62
lines changed

sdk/ai/azure-ai-generative/azure/ai/generative/index/_dataindex/entities/_builders/data_index_func.py

Lines changed: 29 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -260,10 +260,10 @@ def data_index_acs_pipeline(
260260
input_data: Input,
261261
embeddings_model: str,
262262
acs_config: str,
263-
acs_connection_id: Optional[str],
264-
aoai_connection_id: Optional[str],
263+
acs_connection_id: str,
264+
aoai_connection_id: str,
265265
embeddings_container: Input,
266-
chunk_size: Optional[int] = 768,
266+
chunk_size: int = 768,
267267
chunk_overlap: Optional[int] = 0,
268268
input_glob: Optional[str] = "**/*",
269269
citation_url: Optional[str] = None,
@@ -279,28 +279,26 @@ def data_index_acs_pipeline(
279279
:param acs_config: The configuration for the Azure Cognitive Search index.
280280
:type acs_config: str
281281
:param acs_connection_id: The connection ID for the Azure Cognitive Search index.
282-
:type acs_connection_id: Optional[str]
282+
:type acs_connection_id: str
283283
:param chunk_size: The size of the chunks to break the input data into.
284-
:type chunk_size: Optional[int]
284+
:type chunk_size: int
285285
:param chunk_overlap: The number of tokens to overlap between chunks.
286286
:type chunk_overlap: Optional[int]
287287
:param input_glob: The glob pattern to use when searching for input data.
288-
:type input_glob: Optional[str]s
288+
:type input_glob: Optional[str]
289289
:param citation_url: The URL to use when generating citations for the input data.
290290
:type citation_url: str
291291
:param citation_replacement_regex: The regex to use when generating citations for the input data.
292292
:type citation_replacement_regex: str
293293
:param aoai_connection_id: The connection ID for the Azure Open AI service.
294-
:type aoai_connection_id: Optional[str]
294+
:type aoai_connection_id: str
295295
:param embeddings_container: The container to use when caching embeddings.
296296
:type embeddings_container: Input
297297
:return: The URI of the generated Azure Cognitive Search index.
298298
:rtype: str.
299299
"""
300300
if input_glob is None:
301301
input_glob = "**/*"
302-
if chunk_size is None:
303-
chunk_size = 768
304302
if chunk_overlap is None:
305303
chunk_overlap = 0
306304

@@ -361,7 +359,7 @@ def data_index_acs_pipeline(
361359
component = data_index_acs_pipeline(
362360
input_data=input_data,
363361
input_glob=data_index.source.input_glob,
364-
chunk_size=data_index.source.chunk_size,
362+
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type]
365363
chunk_overlap=data_index.source.chunk_overlap,
366364
citation_url=data_index.source.citation_url,
367365
citation_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
@@ -422,11 +420,11 @@ def data_index_faiss_pipeline(
422420
input_data: Input,
423421
embeddings_model: str,
424422
embeddings_container: Input,
425-
chunk_size: Optional[int] = 1024,
426-
data_source_glob: Optional[str] = None,
427-
data_source_url: Optional[str] = None,
428-
document_path_replacement_regex: Optional[str] = None,
429-
aoai_connection_id: Optional[str] = None,
423+
chunk_size: int = 1024,
424+
data_source_glob: str = None, # type: ignore[assignment]
425+
data_source_url: str = None, # type: ignore[assignment]
426+
document_path_replacement_regex: str = None, # type: ignore[assignment]
427+
aoai_connection_id: str = None, # type: ignore[assignment]
430428
):
431429
"""
432430
Generate embeddings for a `input_data` source and create a Faiss index from them.
@@ -508,10 +506,10 @@ def data_index_faiss_pipeline(
508506
component = data_index_faiss_pipeline(
509507
input_data=input_data,
510508
embeddings_model=build_model_protocol(data_index.embedding.model),
511-
chunk_size=data_index.source.chunk_size,
512-
data_source_glob=data_index.source.input_glob,
513-
data_source_url=data_index.source.citation_url,
514-
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
509+
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type]
510+
data_source_glob=data_index.source.input_glob, # type: ignore[arg-type]
511+
data_source_url=data_index.source.citation_url, # type: ignore[arg-type]
512+
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict()) # type: ignore[arg-type]
515513
if data_index.source.citation_url_replacement_regex
516514
else None,
517515
aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
@@ -563,13 +561,13 @@ def data_index_acs_pipeline(
563561
input_data: Input,
564562
embeddings_model: str,
565563
acs_config: str,
566-
acs_connection_id: Optional[str],
564+
acs_connection_id: str,
567565
embeddings_container: Input,
568-
chunk_size: Optional[int] = 1024,
569-
data_source_glob: Optional[str] = None,
570-
data_source_url: Optional[str] = None,
571-
document_path_replacement_regex: Optional[str] = None,
572-
aoai_connection_id: Optional[str] = None,
566+
chunk_size: int = 1024,
567+
data_source_glob: str = None, # type: ignore[assignment]
568+
data_source_url: str = None, # type: ignore[assignment]
569+
document_path_replacement_regex: str = None, # type: ignore[assignment]
570+
aoai_connection_id: str = None, # type: ignore[assignment]
573571
):
574572
"""
575573
Generate embeddings for a `input_data` source and push them into an Azure Cognitive Search index.
@@ -666,10 +664,10 @@ def data_index_acs_pipeline(
666664
embeddings_model=build_model_protocol(data_index.embedding.model),
667665
acs_config=json.dumps(acs_config),
668666
acs_connection_id=_resolve_connection_id(ml_client, data_index.index.connection),
669-
chunk_size=data_index.source.chunk_size,
670-
data_source_glob=data_index.source.input_glob,
671-
data_source_url=data_index.source.citation_url,
672-
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
667+
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type]
668+
data_source_glob=data_index.source.input_glob, # type: ignore[arg-type]
669+
data_source_url=data_index.source.citation_url, # type: ignore[arg-type]
670+
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict()) # type: ignore[arg-type]
673671
if data_index.source.citation_url_replacement_regex
674672
else None,
675673
aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
@@ -754,9 +752,9 @@ def get_component_obj(ml_client, component_uri):
754752
return component_obj
755753

756754

757-
def _resolve_connection_id(ml_client, connection: Optional[Union[str, WorkspaceConnection]] = None) -> Optional[str]:
755+
def _resolve_connection_id(ml_client, connection: Optional[Union[str, WorkspaceConnection]] = None) -> str:
758756
if connection is None:
759-
return None
757+
return ""
760758

761759
if isinstance(connection, str):
762760
short_form = re.match(r"azureml:(?P<connection_name>[^/]*)", connection)

sdk/ai/azure-ai-generative/azure/ai/generative/index/_models.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def split_details(details):
4545
config = {**split_details(details), **config}
4646
config["kind"] = "open_ai"
4747
if "endpoint" in config:
48-
if ".openai." in config["endpoint"] or ".api.cognitive." in config["endpoint"] or ".cognitiveservices." in config["endpoint"]:
48+
if config["endpoint"] and (".openai." in config["endpoint"] or ".api.cognitive." in config["endpoint"] or ".cognitiveservices." in config["endpoint"]):
4949
config["api_base"] = config["endpoint"].rstrip("/")
5050
else:
5151
config["api_base"] = f"https://{config['endpoint']}.openai.azure.com"
@@ -137,7 +137,7 @@ def init_open_ai_from_config(config: dict, credential: Optional[TokenCredential]
137137
else:
138138
raise e
139139

140-
if "azure" in openai.api_type:
140+
if openai.api_type and "azure" in openai.api_type:
141141
config["api_version"] = config.get("api_version", "2023-03-15-preview")
142142

143143
return config

sdk/ai/azure-ai-resources/azure/ai/resources/_index/_dataindex/entities/_builders/data_index_func.py

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -224,14 +224,14 @@ def data_index_acs_pipeline(
224224
input_data: Input,
225225
embeddings_model: str,
226226
acs_config: str,
227-
acs_connection_id: Optional[str],
227+
acs_connection_id: str,
228228
embeddings_container: Input,
229-
chunk_size: Optional[int] = 768,
229+
chunk_size: int = 768,
230230
chunk_overlap: Optional[int] = 0,
231231
input_glob: Optional[str] = "**/*",
232232
citation_url: Optional[str] = None,
233233
citation_replacement_regex: Optional[str] = None,
234-
aoai_connection_id: Optional[str] = None,
234+
aoai_connection_id: str = None, # type: ignore[assignment]
235235
):
236236
"""
237237
Generate embeddings for a `input_data` source and push them into an Azure Cognitive Search index.
@@ -243,7 +243,7 @@ def data_index_acs_pipeline(
243243
:param acs_config: The configuration for the Azure Cognitive Search index.
244244
:type acs_config: str
245245
:param acs_connection_id: The connection ID for the Azure Cognitive Search index.
246-
:type acs_connection_id: Optional[str]
246+
:type acs_connection_id: str
247247
:param chunk_size: The size of the chunks to break the input data into. Defaults to 768.
248248
:type chunk_size: int
249249
:param chunk_overlap: The number of tokens to overlap between chunks. Defaults to 0.
@@ -325,7 +325,7 @@ def data_index_acs_pipeline(
325325
component = data_index_acs_pipeline(
326326
input_data=input_data,
327327
input_glob=data_index.source.input_glob,
328-
chunk_size=data_index.source.chunk_size,
328+
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type]
329329
chunk_overlap=data_index.source.chunk_overlap,
330330
citation_url=data_index.source.citation_url,
331331
citation_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
@@ -386,11 +386,11 @@ def data_index_faiss_pipeline(
386386
input_data: Input,
387387
embeddings_model: str,
388388
embeddings_container: Input,
389-
chunk_size: Optional[int] = 1024,
390-
data_source_glob: Optional[str] = None,
391-
data_source_url: Optional[str] = None,
392-
document_path_replacement_regex: Optional[str] = None,
393-
aoai_connection_id: Optional[str] = None,
389+
chunk_size: int = 1024,
390+
data_source_glob: str = None, # type: ignore[assignment]
391+
data_source_url: str = None, # type: ignore[assignment]
392+
document_path_replacement_regex: str = None, # type: ignore[assignment]
393+
aoai_connection_id: str = None, # type: ignore[assignment]
394394
):
395395
"""
396396
Generate embeddings for a `input_data` source and create a Faiss index from them.
@@ -472,10 +472,10 @@ def data_index_faiss_pipeline(
472472
component = data_index_faiss_pipeline(
473473
input_data=input_data,
474474
embeddings_model=build_model_protocol(data_index.embedding.model),
475-
chunk_size=data_index.source.chunk_size,
476-
data_source_glob=data_index.source.input_glob,
477-
data_source_url=data_index.source.citation_url,
478-
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
475+
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type]
476+
data_source_glob=data_index.source.input_glob, # type: ignore[arg-type]
477+
data_source_url=data_index.source.citation_url, # type: ignore[arg-type]
478+
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict()) # type: ignore[arg-type]
479479
if data_index.source.citation_url_replacement_regex
480480
else None,
481481
aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
@@ -527,13 +527,13 @@ def data_index_acs_pipeline(
527527
input_data: Input,
528528
embeddings_model: str,
529529
acs_config: str,
530-
acs_connection_id: Optional[str],
530+
acs_connection_id: str,
531531
embeddings_container: Input,
532-
chunk_size: Optional[int] = 1024,
533-
data_source_glob: Optional[str] = None,
534-
data_source_url: Optional[str] = None,
535-
document_path_replacement_regex: Optional[str] = None,
536-
aoai_connection_id: Optional[str] = None,
532+
chunk_size: int = 1024,
533+
data_source_glob: str = None, # type: ignore[assignment]
534+
data_source_url: str = None, # type: ignore[assignment]
535+
document_path_replacement_regex: str = None, # type: ignore[assignment]
536+
aoai_connection_id: str = None, # type: ignore[assignment]
537537
) -> Dict[str, Any]:
538538
"""
539539
Generate embeddings for a `input_data` source and push them into an Azure Cognitive Search index.
@@ -630,10 +630,10 @@ def data_index_acs_pipeline(
630630
embeddings_model=build_model_protocol(data_index.embedding.model),
631631
acs_config=json.dumps(acs_config),
632632
acs_connection_id=_resolve_connection_id(ml_client, data_index.index.connection),
633-
chunk_size=data_index.source.chunk_size,
634-
data_source_glob=data_index.source.input_glob,
635-
data_source_url=data_index.source.citation_url,
636-
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict())
633+
chunk_size=data_index.source.chunk_size, # type: ignore[arg-type]
634+
data_source_glob=data_index.source.input_glob, # type: ignore[arg-type]
635+
data_source_url=data_index.source.citation_url, # type: ignore[arg-type]
636+
document_path_replacement_regex=json.dumps(data_index.source.citation_url_replacement_regex._to_dict()) # type: ignore[arg-type]
637637
if data_index.source.citation_url_replacement_regex
638638
else None,
639639
aoai_connection_id=_resolve_connection_id(ml_client, data_index.embedding.connection),
@@ -718,9 +718,9 @@ def get_component_obj(ml_client, component_uri):
718718
return component_obj
719719

720720

721-
def _resolve_connection_id(ml_client, connection: Optional[Union[str, WorkspaceConnection]] = None) -> Optional[str]:
721+
def _resolve_connection_id(ml_client, connection: Optional[Union[str, WorkspaceConnection]] = None) -> str:
722722
if connection is None:
723-
return None
723+
return ""
724724

725725
if isinstance(connection, str):
726726
short_form = re.match(r"azureml:(?P<connection_name>[^/]*)", connection)

sdk/ai/azure-ai-resources/azure/ai/resources/client/_ai_client.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -315,16 +315,16 @@ def build_index_on_cloud(
315315
IndexSource,
316316
IndexStore,
317317
)
318-
from azure.ai.resources._index._embeddings.EmbeddingsContainer import from_uri
318+
from azure.ai.resources._index._embeddings import EmbeddingsContainer
319319
if isinstance(input_source, ACSSource):
320320
from azure.ai.resources._index._utils.connections import get_connection_by_id_v2, get_target_from_connection
321321

322322
# Construct MLIndex object
323323
mlindex_config = {}
324324
connection_args = {"connection_type": "workspace_connection", "connection": {"id": aoai_connection_id}}
325-
mlindex_config["embeddings"] = from_uri(
325+
mlindex_config["embeddings"] = EmbeddingsContainer.from_uri( # type: ignore[attr-defined]
326326
build_open_ai_protocol(embeddings_model), **connection_args
327-
).get_metadata()
327+
).get_metadata() # Bug 2922096
328328
mlindex_config["index"] = {
329329
"kind": "acs",
330330
"connection_type": "workspace_connection",

0 commit comments

Comments (0)