From 1f5744c31e6ef14d2611e1be171eb83855f45239 Mon Sep 17 00:00:00 2001 From: SDKAuto Date: Fri, 14 Mar 2025 15:36:33 +0000 Subject: [PATCH] CodeGen from PR 31869 in Azure/azure-rest-api-specs Merge ab9c5a3b55da0de3e51e6bb9400b223fd0b87ea2 into 91fa01cca22d82bb2823e9238650ebf70e4a83a3 --- sdk/ai/azure-ai-inference/_meta.json | 6 + .../apiview-properties.json | 41 + .../azure/ai/inference/_client.py | 6 +- .../azure/ai/inference/_configuration.py | 12 +- .../azure/ai/inference/_model_base.py | 2 +- .../ai/inference/_operations/_operations.py | 242 +-- .../azure/ai/inference/_patch.py | 1371 +---------------- .../azure/ai/inference/_serialization.py | 6 +- .../azure/ai/inference/_version.py | 2 +- .../azure/ai/inference/aio/_client.py | 3 - .../azure/ai/inference/aio/_configuration.py | 9 - .../inference/aio/_operations/_operations.py | 243 +-- .../azure/ai/inference/aio/_patch.py | 1315 +--------------- .../azure/ai/inference/models/__init__.py | 8 +- .../azure/ai/inference/models/_models.py | 631 ++++++-- .../azure/ai/inference/models/_patch.py | 560 +------ .../azure/ai/inference/prompts/__init__.py | 8 - .../azure/ai/inference/prompts/_core.py | 312 ---- .../azure/ai/inference/prompts/_invoker.py | 295 ---- .../azure/ai/inference/prompts/_mustache.py | 671 -------- .../azure/ai/inference/prompts/_parsers.py | 156 -- .../azure/ai/inference/prompts/_patch.py | 124 -- .../ai/inference/prompts/_prompty_utils.py | 415 ----- .../azure/ai/inference/prompts/_renderers.py | 30 - .../azure/ai/inference/prompts/_tracer.py | 316 ---- .../azure/ai/inference/prompts/_utils.py | 100 -- .../azure/ai/inference/tracing.py | 850 ---------- ..._chat_completions_from_input_dict_async.py | 1 + ...ompletions_streaming_azure_openai_async.py | 1 + .../async_samples/sample_embeddings_async.py | 1 + .../async_samples/sample_load_client_async.py | 1 + .../sample_chat_completions_azure_openai.py | 1 + ...sample_chat_completions_from_input_dict.py | 1 + ...pletions_from_input_dict_with_image_url.py | 1 + ...at_completions_from_input_prompt_string.py | 1 + ...e_chat_completions_streaming_with_tools.py | 1 + .../sample_chat_completions_with_history.py | 1 + .../sample_chat_completions_with_image_url.py | 1 + ...chat_completions_with_structured_output.py | 1 + ...letions_with_structured_output_pydantic.py | 1 + .../sample_chat_completions_with_tools.py | 1 + .../samples/sample_embeddings_azure_openai.py | 1 + sdk/ai/azure-ai-inference/sdk_packaging.toml | 2 + sdk/ai/azure-ai-inference/setup.py | 8 +- .../tests/gen_ai_trace_verifier.py | 1 + .../tests/model_inference_test_base.py | 1 + .../tests/test_chat_completions_client.py | 1 + .../test_chat_completions_client_async.py | 1 + .../tests/test_client_tracing.py | 4 +- .../tests/test_embeddings_client_async.py | 1 + .../test_image_embeddings_client_async.py | 1 + .../azure-ai-inference/tests/test_prompts.py | 1 + .../tests/test_unit_tests.py | 1 + sdk/ai/azure-ai-inference/tsp-location.yaml | 4 +- 54 files changed, 655 insertions(+), 7120 deletions(-) create mode 100644 sdk/ai/azure-ai-inference/_meta.json create mode 100644 sdk/ai/azure-ai-inference/apiview-properties.json delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/__init__.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_core.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_invoker.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_mustache.py delete mode 100644 
sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_parsers.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_patch.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_prompty_utils.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_renderers.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_tracer.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_utils.py delete mode 100644 sdk/ai/azure-ai-inference/azure/ai/inference/tracing.py create mode 100644 sdk/ai/azure-ai-inference/sdk_packaging.toml diff --git a/sdk/ai/azure-ai-inference/_meta.json b/sdk/ai/azure-ai-inference/_meta.json new file mode 100644 index 000000000000..f1310564d353 --- /dev/null +++ b/sdk/ai/azure-ai-inference/_meta.json @@ -0,0 +1,6 @@ +{ + "commit": "4f9bafd8e839c8995dfac592f7e0034a6e231587", + "repository_url": "https://github.com/Azure/azure-rest-api-specs", + "typespec_src": "specification/ai/ModelClient", + "@azure-tools/typespec-python": "0.39.1" +} \ No newline at end of file diff --git a/sdk/ai/azure-ai-inference/apiview-properties.json b/sdk/ai/azure-ai-inference/apiview-properties.json new file mode 100644 index 000000000000..59060f14accf --- /dev/null +++ b/sdk/ai/azure-ai-inference/apiview-properties.json @@ -0,0 +1,41 @@ +{ + "CrossLanguagePackageId": "AI.Model", + "CrossLanguageDefinitionId": { + "azure.ai.inference.models.ContentItem": "AI.Model.ChatMessageContentItem", + "azure.ai.inference.models.AudioDataContentItem": "AI.Model.ChatMessageAudioDataContentItem", + "azure.ai.inference.models.AudioUrlContentItem": "AI.Model.ChatMessageAudioUrlContentItem", + "azure.ai.inference.models.ChatChoice": "AI.Model.ChatChoice", + "azure.ai.inference.models.ChatCompletions": "AI.Model.ChatCompletions", + "azure.ai.inference.models.ChatCompletionsNamedToolChoice": "AI.Model.ChatCompletionsNamedToolChoice", + "azure.ai.inference.models.ChatCompletionsNamedToolChoiceFunction": "AI.Model.ChatCompletionsNamedToolChoiceFunction", + "azure.ai.inference.models.ChatCompletionsToolCall": "AI.Model.ChatCompletionsToolCall", + "azure.ai.inference.models.ChatCompletionsToolDefinition": "AI.Model.ChatCompletionsToolDefinition", + "azure.ai.inference.models.ChatResponseMessage": "AI.Model.ChatResponseMessage", + "azure.ai.inference.models.CompletionsUsage": "AI.Model.CompletionsUsage", + "azure.ai.inference.models.EmbeddingItem": "AI.Model.EmbeddingItem", + "azure.ai.inference.models.EmbeddingsResult": "AI.Model.EmbeddingsResult", + "azure.ai.inference.models.EmbeddingsUsage": "AI.Model.EmbeddingsUsage", + "azure.ai.inference.models.FunctionCall": "AI.Model.FunctionCall", + "azure.ai.inference.models.FunctionDefinition": "AI.Model.FunctionDefinition", + "azure.ai.inference.models.ImageContentItem": "AI.Model.ChatMessageImageContentItem", + "azure.ai.inference.models.ImageEmbeddingInput": "AI.Model.ImageEmbeddingInput", + "azure.ai.inference.models.ImageUrl": "AI.Model.ChatMessageImageUrl", + "azure.ai.inference.models.InputAudio": "AI.Model.ChatMessageInputAudio", + "azure.ai.inference.models.InputAudioUrl": "AI.Model.ChatMessageInputAudioUrl", + "azure.ai.inference.models.JsonSchemaFormat": "AI.Model.ChatCompletionsResponseFormatJsonSchemaDefinition", + "azure.ai.inference.models.ModelInfo": "AI.Model.ModelInfo", + "azure.ai.inference.models.StreamingChatChoiceUpdate": "AI.Model.StreamingChatChoiceUpdate", + "azure.ai.inference.models.StreamingChatCompletionsUpdate": 
"AI.Model.StreamingChatCompletionsUpdate", + "azure.ai.inference.models.StreamingChatResponseMessageUpdate": "AI.Model.StreamingChatResponseMessageUpdate", + "azure.ai.inference.models.StreamingChatResponseToolCallUpdate": "AI.Model.StreamingChatResponseToolCallUpdate", + "azure.ai.inference.models.TextContentItem": "AI.Model.ChatMessageTextContentItem", + "azure.ai.inference.models.CompletionsFinishReason": "AI.Model.CompletionsFinishReason", + "azure.ai.inference.models.ChatRole": "AI.Model.ChatRole", + "azure.ai.inference.models.ImageDetailLevel": "AI.Model.ChatMessageImageDetailLevel", + "azure.ai.inference.models.AudioContentFormat": "AI.Model.AudioContentFormat", + "azure.ai.inference.models.ChatCompletionsToolChoicePreset": "AI.Model.ChatCompletionsToolChoicePreset", + "azure.ai.inference.models.ModelType": "AI.Model.ModelType", + "azure.ai.inference.models.EmbeddingEncodingFormat": "AI.Model.EmbeddingEncodingFormat", + "azure.ai.inference.models.EmbeddingInputType": "AI.Model.EmbeddingInputType" + } +} \ No newline at end of file diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_client.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_client.py index 0cde08ffa7cc..1fc2ee38dca8 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_client.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_client.py @@ -39,7 +39,7 @@ class ChatCompletionsClient(ChatCompletionsClientOperationsMixin): :param credential: Credential used to authenticate requests to the service. Is either a key credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.TokenCredential + ~azure.core.credentials.TokenCredential :keyword api_version: The API version to use for this operation. Default value is "2024-05-01-preview". Note that overriding this default value may result in unsupported behavior. @@ -117,7 +117,7 @@ class EmbeddingsClient(EmbeddingsClientOperationsMixin): :param credential: Credential used to authenticate requests to the service. Is either a key credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.TokenCredential + ~azure.core.credentials.TokenCredential :keyword api_version: The API version to use for this operation. Default value is "2024-05-01-preview". Note that overriding this default value may result in unsupported behavior. @@ -195,7 +195,7 @@ class ImageEmbeddingsClient(ImageEmbeddingsClientOperationsMixin): :param credential: Credential used to authenticate requests to the service. Is either a key credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.TokenCredential + ~azure.core.credentials.TokenCredential :keyword api_version: The API version to use for this operation. Default value is "2024-05-01-preview". Note that overriding this default value may result in unsupported behavior. 
diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_configuration.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_configuration.py index 894ec657140f..8fc56f572a89 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_configuration.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_configuration.py @@ -28,7 +28,7 @@ class ChatCompletionsClientConfiguration: # pylint: disable=too-many-instance-a :param credential: Credential used to authenticate requests to the service. Is either a key credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.TokenCredential + ~azure.core.credentials.TokenCredential :keyword api_version: The API version to use for this operation. Default value is "2024-05-01-preview". Note that overriding this default value may result in unsupported behavior. @@ -54,8 +54,6 @@ def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCr def _infer_policy(self, **kwargs): if isinstance(self.credential, AzureKeyCredential): return policies.AzureKeyCredentialPolicy(self.credential, "Authorization", prefix="Bearer", **kwargs) - if isinstance(self.credential, AzureKeyCredential): - return policies.AzureKeyCredentialPolicy(self.credential, "api-key", **kwargs) if hasattr(self.credential, "get_token"): return policies.BearerTokenCredentialPolicy(self.credential, *self.credential_scopes, **kwargs) raise TypeError(f"Unsupported credential: {self.credential}") @@ -85,7 +83,7 @@ class EmbeddingsClientConfiguration: # pylint: disable=too-many-instance-attrib :param credential: Credential used to authenticate requests to the service. Is either a key credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.TokenCredential + ~azure.core.credentials.TokenCredential :keyword api_version: The API version to use for this operation. Default value is "2024-05-01-preview". Note that overriding this default value may result in unsupported behavior. @@ -111,8 +109,6 @@ def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCr def _infer_policy(self, **kwargs): if isinstance(self.credential, AzureKeyCredential): return policies.AzureKeyCredentialPolicy(self.credential, "Authorization", prefix="Bearer", **kwargs) - if isinstance(self.credential, AzureKeyCredential): - return policies.AzureKeyCredentialPolicy(self.credential, "api-key", **kwargs) if hasattr(self.credential, "get_token"): return policies.BearerTokenCredentialPolicy(self.credential, *self.credential_scopes, **kwargs) raise TypeError(f"Unsupported credential: {self.credential}") @@ -142,7 +138,7 @@ class ImageEmbeddingsClientConfiguration: # pylint: disable=too-many-instance-a :param credential: Credential used to authenticate requests to the service. Is either a key credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials.TokenCredential + ~azure.core.credentials.TokenCredential :keyword api_version: The API version to use for this operation. Default value is "2024-05-01-preview". Note that overriding this default value may result in unsupported behavior. 
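The _configuration.py hunks above (and the one that follows) drop the second, unreachable isinstance(AzureKeyCredential) branch from each client's _infer_policy, leaving one key-credential path and one token-credential path. A consolidated sketch of the surviving selection logic, written here as a hypothetical free function (infer_auth_policy is not a name from the patch) rather than the generated method:

from azure.core.credentials import AzureKeyCredential
from azure.core.pipeline import policies

def infer_auth_policy(credential, credential_scopes, **kwargs):
    # Key credentials are sent as "Authorization: Bearer <key>".
    if isinstance(credential, AzureKeyCredential):
        return policies.AzureKeyCredentialPolicy(credential, "Authorization", prefix="Bearer", **kwargs)
    # Anything exposing get_token (a TokenCredential) uses the standard bearer-token policy.
    if hasattr(credential, "get_token"):
        return policies.BearerTokenCredentialPolicy(credential, *credential_scopes, **kwargs)
    raise TypeError(f"Unsupported credential: {credential}")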
@@ -168,8 +164,6 @@ def __init__(self, endpoint: str, credential: Union[AzureKeyCredential, "TokenCr def _infer_policy(self, **kwargs): if isinstance(self.credential, AzureKeyCredential): return policies.AzureKeyCredentialPolicy(self.credential, "Authorization", prefix="Bearer", **kwargs) - if isinstance(self.credential, AzureKeyCredential): - return policies.AzureKeyCredentialPolicy(self.credential, "api-key", **kwargs) if hasattr(self.credential, "get_token"): return policies.BearerTokenCredentialPolicy(self.credential, *self.credential_scopes, **kwargs) raise TypeError(f"Unsupported credential: {self.credential}") diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py index 359ecebe23f7..3072ee252ed9 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_model_base.py @@ -1,4 +1,4 @@ -# pylint: disable=too-many-lines,arguments-differ,signature-differs,no-member +# pylint: disable=too-many-lines # coding=utf-8 # -------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py index 78e5ee353228..b48a0dc52af5 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_operations/_operations.py @@ -1,4 +1,3 @@ -# pylint: disable=too-many-locals # coding=utf-8 # -------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. @@ -9,7 +8,7 @@ from io import IOBase import json import sys -from typing import Any, Callable, Dict, IO, List, Optional, TypeVar, Union, overload +from typing import Any, Callable, Dict, IO, Optional, TypeVar, Union, overload from azure.core.exceptions import ( ClientAuthenticationError, @@ -36,7 +35,6 @@ else: from typing import MutableMapping # type: ignore JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object -_Unset: Any = object() T = TypeVar("T") ClsType = Optional[Callable[[PipelineResponse[HttpRequest, HttpResponse], T, Dict[str, Any]], Any]] @@ -184,24 +182,10 @@ class ChatCompletionsClientOperationsMixin(ChatCompletionsClientMixinABC): @overload def _complete( self, + body: _models._models.ChatCompletionsOptions, *, - messages: List[_models._models.ChatRequestMessage], extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, content_type: str = "application/json", - frequency_penalty: Optional[float] = None, - stream_parameter: Optional[bool] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[_models._models.ChatCompletionsResponseFormat] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, **kwargs: Any ) -> _models.ChatCompletions: ... 
@overload @@ -226,24 +210,9 @@ def _complete( @distributed_trace def _complete( self, - body: Union[JSON, IO[bytes]] = _Unset, + body: Union[_models._models.ChatCompletionsOptions, JSON, IO[bytes]], *, - messages: List[_models._models.ChatRequestMessage] = _Unset, extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, - frequency_penalty: Optional[float] = None, - stream_parameter: Optional[bool] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[_models._models.ChatCompletionsResponseFormat] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, **kwargs: Any ) -> _models.ChatCompletions: """Gets chat completions for the provided chat messages. @@ -252,93 +221,14 @@ def _complete( provided prompt data. The method makes a REST API call to the ``/chat/completions`` route on the given endpoint. - :param body: Is either a JSON type or a IO[bytes] type. Required. - :type body: JSON or IO[bytes] - :keyword messages: The collection of context messages associated with this chat completions - request. - Typical usage begins with a chat message for the System role that provides instructions for - the behavior of the assistant, followed by alternating messages between the User and - Assistant roles. Required. - :paramtype messages: list[~azure.ai.inference.models._models.ChatRequestMessage] + :param body: The options for chat completions. Is one of the following types: + ChatCompletionsOptions, JSON, IO[bytes] Required. + :type body: ~azure.ai.inference.models._models.ChatCompletionsOptions or JSON or IO[bytes] :keyword extra_params: Controls what happens if extra parameters, undefined by the REST API, are passed in the JSON request payload. This sets the HTTP request header ``extra-parameters``. Known values are: "error", "drop", and "pass-through". Default value is None. :paramtype extra_params: str or ~azure.ai.inference.models.ExtraParameters - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative - frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. Default value is None. - :paramtype frequency_penalty: float - :keyword stream_parameter: A value indicating whether chat completions should be streamed for - this request. Default value is None. - :paramtype stream_parameter: bool - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. - Positive values will make tokens less likely to appear when they already exist and increase - the - model's likelihood to output new topics. - Supported range is [-2, 2]. Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. - Higher values will make output more random while lower values will make results more focused - and deterministic. 
- It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: An object specifying the format that the model must output. - - Setting to ``{ "type": "json_schema", "json_schema": {...} }`` enables Structured Outputs - which ensures the model will match your supplied JSON schema. - - Setting to ``{ "type": "json_object" }`` enables JSON mode, which ensures the message the - model generates is valid JSON. - - **Important:** when using JSON mode, you **must** also instruct the model to produce JSON - yourself via a system or user message. Without this, the model may generate an unending stream - of whitespace until the generation reaches the token limit, resulting in a long-running and - seemingly "stuck" request. Also note that the message content may be partially cut off if - ``finish_reason="length"``\\ , which indicates the generation exceeded ``max_tokens`` or the - conversation exceeded the max context length. Default value is None. - :paramtype response_format: ~azure.ai.inference.models._models.ChatCompletionsResponseFormat - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: A list of tools the model may request to call. Currently, only functions are - supported as a tool. The model - may response with a function call request and provide the input arguments in JSON format for - that function. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. Default - value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str :return: ChatCompletions. 
The ChatCompletions is compatible with MutableMapping :rtype: ~azure.ai.inference.models.ChatCompletions :raises ~azure.core.exceptions.HttpResponseError: @@ -357,25 +247,6 @@ def _complete( content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) cls: ClsType[_models.ChatCompletions] = kwargs.pop("cls", None) - if body is _Unset: - if messages is _Unset: - raise TypeError("missing required argument: messages") - body = { - "frequency_penalty": frequency_penalty, - "max_tokens": max_tokens, - "messages": messages, - "model": model, - "presence_penalty": presence_penalty, - "response_format": response_format, - "seed": seed, - "stop": stop, - "stream": stream_parameter, - "temperature": temperature, - "tool_choice": tool_choice, - "tools": tools, - "top_p": top_p, - } - body = {k: v for k, v in body.items() if v is not None} content_type = content_type or "application/json" _content = None if isinstance(body, (IOBase, bytes)): @@ -488,14 +359,10 @@ class EmbeddingsClientOperationsMixin(EmbeddingsClientMixinABC): @overload def _embed( self, + body: _models._models.EmbeddingsOptions, *, - input: List[str], extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, content_type: str = "application/json", - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, **kwargs: Any ) -> _models.EmbeddingsResult: ... @overload @@ -520,46 +387,22 @@ def _embed( @distributed_trace def _embed( self, - body: Union[JSON, IO[bytes]] = _Unset, + body: Union[_models._models.EmbeddingsOptions, JSON, IO[bytes]], *, - input: List[str] = _Unset, extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, **kwargs: Any ) -> _models.EmbeddingsResult: """Return the embedding vectors for given text prompts. The method makes a REST API call to the ``/embeddings`` route on the given endpoint. - :param body: Is either a JSON type or a IO[bytes] type. Required. - :type body: JSON or IO[bytes] - :keyword input: Input text to embed, encoded as a string or array of tokens. - To embed multiple inputs in a single request, pass an array - of strings or array of token arrays. Required. - :paramtype input: list[str] + :param body: The body of the request containing the options for generating embeddings. Is one + of the following types: EmbeddingsOptions, JSON, IO[bytes] Required. + :type body: ~azure.ai.inference.models._models.EmbeddingsOptions or JSON or IO[bytes] :keyword extra_params: Controls what happens if extra parameters, undefined by the REST API, are passed in the JSON request payload. This sets the HTTP request header ``extra-parameters``. Known values are: "error", "drop", and "pass-through". Default value is None. :paramtype extra_params: str or ~azure.ai.inference.models.ExtraParameters - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. - Passing null causes the model to use its default value. - Returns a 422 error if the model doesn't support the value or parameter. Default value is - None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. 
Known - values are: "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. - Returns a 422 error if the model doesn't support the value or parameter. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping :rtype: ~azure.ai.inference.models.EmbeddingsResult :raises ~azure.core.exceptions.HttpResponseError: @@ -578,17 +421,6 @@ def _embed( content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) cls: ClsType[_models.EmbeddingsResult] = kwargs.pop("cls", None) - if body is _Unset: - if input is _Unset: - raise TypeError("missing required argument: input") - body = { - "dimensions": dimensions, - "encoding_format": encoding_format, - "input": input, - "input_type": input_type, - "model": model, - } - body = {k: v for k, v in body.items() if v is not None} content_type = content_type or "application/json" _content = None if isinstance(body, (IOBase, bytes)): @@ -701,14 +533,10 @@ class ImageEmbeddingsClientOperationsMixin(ImageEmbeddingsClientMixinABC): @overload def _embed( self, + body: _models._models.ImageEmbeddingsOptions, *, - input: List[_models.ImageEmbeddingInput], extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, content_type: str = "application/json", - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, **kwargs: Any ) -> _models.EmbeddingsResult: ... @overload @@ -733,49 +561,22 @@ def _embed( @distributed_trace def _embed( self, - body: Union[JSON, IO[bytes]] = _Unset, + body: Union[_models._models.ImageEmbeddingsOptions, JSON, IO[bytes]], *, - input: List[_models.ImageEmbeddingInput] = _Unset, extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, **kwargs: Any ) -> _models.EmbeddingsResult: """Return the embedding vectors for given images. The method makes a REST API call to the ``/images/embeddings`` route on the given endpoint. - :param body: Is either a JSON type or a IO[bytes] type. Required. - :type body: JSON or IO[bytes] - :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an - array. - The input must not exceed the max input tokens for the model. Required. - :paramtype input: list[~azure.ai.inference.models.ImageEmbeddingInput] + :param body: The body of the request containing options for image embeddings. Is one of the + following types: ImageEmbeddingsOptions, JSON, IO[bytes] Required. + :type body: ~azure.ai.inference.models._models.ImageEmbeddingsOptions or JSON or IO[bytes] :keyword extra_params: Controls what happens if extra parameters, undefined by the REST API, are passed in the JSON request payload. This sets the HTTP request header ``extra-parameters``. 
Known values are: "error", "drop", and "pass-through". Default value is None. :paramtype extra_params: str or ~azure.ai.inference.models.ExtraParameters - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. - Passing null causes the model to use its default value. - Returns a 422 error if the model doesn't support the value or parameter. Default value is - None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The number of dimensions the resulting output embeddings - should have. - Passing null causes the model to use its default value. - Returns a 422 error if the model doesn't support the value or parameter. Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. - Returns a 422 error if the model doesn't support the value or parameter. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping :rtype: ~azure.ai.inference.models.EmbeddingsResult :raises ~azure.core.exceptions.HttpResponseError: @@ -794,17 +595,6 @@ def _embed( content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) cls: ClsType[_models.EmbeddingsResult] = kwargs.pop("cls", None) - if body is _Unset: - if input is _Unset: - raise TypeError("missing required argument: input") - body = { - "dimensions": dimensions, - "encoding_format": encoding_format, - "input": input, - "input_type": input_type, - "model": model, - } - body = {k: v for k, v in body.items() if v is not None} content_type = content_type or "application/json" _content = None if isinstance(body, (IOBase, bytes)): diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py index da95cf93daf9..f7dd32510333 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_patch.py @@ -1,4 +1,3 @@ -# pylint: disable=too-many-lines # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. @@ -6,1376 +5,10 @@ """Customize generated code here. Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize - -Why do we patch auto-generated code? Below is a summary of the changes made in all _patch files (not just this one): -1. Add support for input argument `model_extras` (all clients) -2. Add support for function load_client -3. Add support for setting sticky chat completions/embeddings input arguments in the client constructor -4. Add support for get_model_info, while caching the result (all clients) -5. Add support for chat completion streaming (ChatCompletionsClient client only) -6. Add support for friendly print of result objects (__str__ method) (all clients) -7. Add support for load() method in ImageUrl class (see /models/_patch.py) -8. Add support for sending two auth headers for api-key auth (all clients) -9. Simplify how chat completions "response_format" is set. 
Define "response_format" as a flat Union of strings and - JsonSchemaFormat object, instead of using auto-generated base/derived classes named - ChatCompletionsResponseFormatXxxInternal. -10. Allow UserMessage("my message") in addition to UserMessage(content="my message"). Same applies to -AssistantMessage, SystemMessage, DeveloperMessage and ToolMessage. - """ -import json -import logging -import sys - -from io import IOBase -from typing import Any, Dict, Union, IO, List, Literal, Optional, overload, Type, TYPE_CHECKING, Iterable - -from azure.core.pipeline import PipelineResponse -from azure.core.credentials import AzureKeyCredential -from azure.core.tracing.decorator import distributed_trace -from azure.core.utils import case_insensitive_dict -from azure.core.exceptions import ( - ClientAuthenticationError, - HttpResponseError, - map_error, - ResourceExistsError, - ResourceNotFoundError, - ResourceNotModifiedError, -) -from . import models as _models -from ._model_base import SdkJSONEncoder, _deserialize -from ._serialization import Serializer -from ._operations._operations import ( - build_chat_completions_complete_request, - build_embeddings_embed_request, - build_image_embeddings_embed_request, -) -from ._client import ChatCompletionsClient as ChatCompletionsClientGenerated -from ._client import EmbeddingsClient as EmbeddingsClientGenerated -from ._client import ImageEmbeddingsClient as ImageEmbeddingsClientGenerated - -if sys.version_info >= (3, 9): - from collections.abc import MutableMapping -else: - from typing import MutableMapping # type: ignore # pylint: disable=ungrouped-imports - -if TYPE_CHECKING: - # pylint: disable=unused-import,ungrouped-imports - from azure.core.credentials import TokenCredential - -JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object -_Unset: Any = object() - -_SERIALIZER = Serializer() -_SERIALIZER.client_side_validation = False - -_LOGGER = logging.getLogger(__name__) - - -def _get_internal_response_format( - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] -) -> Optional[_models._models.ChatCompletionsResponseFormat]: - """ - Internal helper method to convert between the public response format type that's supported in the `complete` method, - and the internal response format type that's used in the generated code. - - :param response_format: Response format. Required. - :type response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] - :return: Internal response format. 
- :rtype: ~azure.ai.inference._models._models.ChatCompletionsResponseFormat - """ - if response_format is not None: - - # To make mypy tool happy, start by declaring the type as the base class - internal_response_format: _models._models.ChatCompletionsResponseFormat - - if isinstance(response_format, str) and response_format == "text": - internal_response_format = ( - _models._models.ChatCompletionsResponseFormatText() # pylint: disable=protected-access - ) - elif isinstance(response_format, str) and response_format == "json_object": - internal_response_format = ( - _models._models.ChatCompletionsResponseFormatJsonObject() # pylint: disable=protected-access - ) - elif isinstance(response_format, _models.JsonSchemaFormat): - internal_response_format = ( - _models._models.ChatCompletionsResponseFormatJsonSchema( # pylint: disable=protected-access - json_schema=response_format - ) - ) - else: - raise ValueError(f"Unsupported `response_format` {response_format}") - - return internal_response_format - - return None - - -def load_client( - endpoint: str, credential: Union[AzureKeyCredential, "TokenCredential"], **kwargs: Any -) -> Union["ChatCompletionsClient", "EmbeddingsClient", "ImageEmbeddingsClient"]: - """ - Load a client from a given endpoint URL. The method makes a REST API call to the `/info` route - on the given endpoint, to determine the model type and therefore which client to instantiate. - Keyword arguments are passed to the appropriate client's constructor, so if you need to set things like - `api_version`, `logging_enable`, `user_agent`, etc., you can do so here. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - Keyword arguments are passed through to the client constructor (you can set keywords such as - `api_version`, `user_agent`, `logging_enable` etc. on the client constructor). - - :param endpoint: Service endpoint URL for AI model inference. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a TokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.TokenCredential - :return: The appropriate synchronous client associated with the given endpoint - :rtype: ~azure.ai.inference.ChatCompletionsClient or ~azure.ai.inference.EmbeddingsClient - or ~azure.ai.inference.ImageEmbeddingsClient - :raises ~azure.core.exceptions.HttpResponseError: - """ - - with ChatCompletionsClient( - endpoint, credential, **kwargs - ) as client: # Pick any of the clients, it does not matter. - try: - model_info = client.get_model_info() # type: ignore - except ResourceNotFoundError as error: - error.message = ( - "`load_client` function does not work on this endpoint (`/info` route not supported). " - "Please construct one of the clients (e.g. `ChatCompletionsClient`) directly." - ) - raise error - - _LOGGER.info("model_info=%s", model_info) - if not model_info.model_type: - raise ValueError( - "The AI model information is missing a value for `model type`. Cannot create an appropriate client." 
- ) - - # TODO: Remove "completions", "chat-comletions" and "embedding" once Mistral Large and Cohere fixes their model type - if model_info.model_type in ( - _models.ModelType.CHAT_COMPLETION, - "chat_completions", - "chat", - "completion", - "chat-completion", - "chat-completions", - "chat completion", - "chat completions", - ): - chat_completion_client = ChatCompletionsClient(endpoint, credential, **kwargs) - chat_completion_client._model_info = ( # pylint: disable=protected-access,attribute-defined-outside-init - model_info - ) - return chat_completion_client - - if model_info.model_type in ( - _models.ModelType.EMBEDDINGS, - "embedding", - "text_embedding", - "text-embeddings", - "text embedding", - "text embeddings", - ): - embedding_client = EmbeddingsClient(endpoint, credential, **kwargs) - embedding_client._model_info = model_info # pylint: disable=protected-access,attribute-defined-outside-init - return embedding_client - - if model_info.model_type in ( - _models.ModelType.IMAGE_EMBEDDINGS, - "image_embedding", - "image-embeddings", - "image-embedding", - "image embedding", - "image embeddings", - ): - image_embedding_client = ImageEmbeddingsClient(endpoint, credential, **kwargs) - image_embedding_client._model_info = ( # pylint: disable=protected-access,attribute-defined-outside-init - model_info - ) - return image_embedding_client - - raise ValueError(f"No client available to support AI model type `{model_info.model_type}`") - - -class ChatCompletionsClient(ChatCompletionsClientGenerated): # pylint: disable=too-many-instance-attributes - """ChatCompletionsClient. - - :param endpoint: Service endpoint URL for AI model inference. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a TokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.TokenCredential - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. - Default value is None. - :paramtype frequency_penalty: float - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. - Positive values will make tokens less likely to appear when they already exist and increase - the model's likelihood to output new topics. - Supported range is [-2, 2]. - Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. - Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. 
As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: The format that the AI model must output. AI chat completions models typically output - unformatted text by default. This is equivalent to setting "text" as the response_format. - To output JSON format, without adhering to any schema, set to "json_object". - To output JSON format adhering to a provided schema, set this to an object of the class - ~azure.ai.inference.models.JsonSchemaFormat. Default value is None. - :paramtype response_format: Union[Literal['text', 'json_object'], ~azure.ai.inference.models.JsonSchemaFormat] - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: The available tool definitions that the chat completions request can use, - including caller-defined functions. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. - Default value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :keyword api_version: The API version to use for this operation. Default value is - "2024-05-01-preview". Note that overriding this default value may result in unsupported - behavior. 
- :paramtype api_version: str - """ - - def __init__( - self, - endpoint: str, - credential: Union[AzureKeyCredential, "TokenCredential"], - *, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> None: - - self._model_info: Optional[_models.ModelInfo] = None - - # Store default chat completions settings, to be applied in all future service calls - # unless overridden by arguments in the `complete` method. - self._frequency_penalty = frequency_penalty - self._presence_penalty = presence_penalty - self._temperature = temperature - self._top_p = top_p - self._max_tokens = max_tokens - self._internal_response_format = _get_internal_response_format(response_format) - self._stop = stop - self._tools = tools - self._tool_choice = tool_choice - self._seed = seed - self._model = model - self._model_extras = model_extras - - # For Key auth, we need to send these two auth HTTP request headers simultaneously: - # 1. "Authorization: Bearer " - # 2. "api-key: " - # This is because Serverless API, Managed Compute and GitHub endpoints support the first header, - # and Azure OpenAI and the new Unified Inference endpoints support the second header. - # The first header will be taken care of by auto-generated code. - # The second one is added here. - if isinstance(credential, AzureKeyCredential): - headers = kwargs.pop("headers", {}) - if "api-key" not in headers: - headers["api-key"] = credential.key - kwargs["headers"] = headers - - super().__init__(endpoint, credential, **kwargs) - - @overload - def complete( - self, - *, - messages: Union[List[_models.ChatRequestMessage], List[Dict[str, Any]]], - stream: Literal[False] = False, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.ChatCompletions: ... 
- - @overload - def complete( - self, - *, - messages: Union[List[_models.ChatRequestMessage], List[Dict[str, Any]]], - stream: Literal[True], - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> Iterable[_models.StreamingChatCompletionsUpdate]: ... - - @overload - def complete( - self, - *, - messages: Union[List[_models.ChatRequestMessage], List[Dict[str, Any]]], - stream: Optional[bool] = None, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> Union[Iterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. The method makes a REST API call to the `/chat/completions` route - on the given endpoint. - When using this method with `stream=True`, the response is streamed - back to the client. Iterate over the resulting StreamingChatCompletions - object to get content updates as they arrive. By default, the response is a ChatCompletions object - (non-streaming). - - :keyword messages: The collection of context messages associated with this chat completions - request. - Typical usage begins with a chat message for the System role that provides instructions for - the behavior of the assistant, followed by alternating messages between the User and - Assistant roles. Required. - :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] or list[dict[str, Any]] - :keyword stream: A value indicating whether chat completions should be streamed for this request. - Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions. - Otherwise the response will be a ChatCompletions. - :paramtype stream: bool - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. - Default value is None. 
- :paramtype frequency_penalty: float - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. - Positive values will make tokens less likely to appear when they already exist and increase - the model's likelihood to output new topics. - Supported range is [-2, 2]. - Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. - Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: The format that the AI model must output. AI chat completions models typically output - unformatted text by default. This is equivalent to setting "text" as the response_format. - To output JSON format, without adhering to any schema, set to "json_object". - To output JSON format adhering to a provided schema, set this to an object of the class - ~azure.ai.inference.models.JsonSchemaFormat. Default value is None. - :paramtype response_format: Union[Literal['text', 'json_object'], ~azure.ai.inference.models.JsonSchemaFormat] - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: The available tool definitions that the chat completions request can use, - including caller-defined functions. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. - Default value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. 
- How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: ChatCompletions for non-streaming, or Iterable[StreamingChatCompletionsUpdate] for streaming. - :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - def complete( - self, - body: JSON, - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> Union[Iterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. - - :param body: An object of type MutableMapping[str, Any], such as a dictionary, that - specifies the full request payload. Required. - :type body: JSON - :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. - Default value is "application/json". - :paramtype content_type: str - :return: ChatCompletions for non-streaming, or Iterable[StreamingChatCompletionsUpdate] for streaming. - :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - def complete( - self, - body: IO[bytes], - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> Union[Iterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - # pylint: disable=too-many-locals - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. - - :param body: Specifies the full request payload. Required. - :type body: IO[bytes] - :keyword content_type: Body Parameter content-type. Content type parameter for binary body. - Default value is "application/json". - :paramtype content_type: str - :return: ChatCompletions for non-streaming, or Iterable[StreamingChatCompletionsUpdate] for streaming. 
- :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - - # pylint:disable=client-method-missing-tracing-decorator - def complete( - self, - body: Union[JSON, IO[bytes]] = _Unset, - *, - messages: Union[List[_models.ChatRequestMessage], List[Dict[str, Any]]] = _Unset, - stream: Optional[bool] = None, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> Union[Iterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - # pylint: disable=too-many-locals - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. When using this method with `stream=True`, the response is streamed - back to the client. Iterate over the resulting :class:`~azure.ai.inference.models.StreamingChatCompletions` - object to get content updates as they arrive. - - :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type - that specifies the full request payload. Required. - :type body: JSON or IO[bytes] - :keyword messages: The collection of context messages associated with this chat completions - request. - Typical usage begins with a chat message for the System role that provides instructions for - the behavior of the assistant, followed by alternating messages between the User and - Assistant roles. Required. - :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] or list[dict[str, Any]] - :keyword stream: A value indicating whether chat completions should be streamed for this request. - Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions. - Otherwise the response will be a ChatCompletions. - :paramtype stream: bool - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. - Default value is None. - :paramtype frequency_penalty: float - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. - Positive values will make tokens less likely to appear when they already exist and increase - the model's likelihood to output new topics. - Supported range is [-2, 2]. - Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. 
- Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: The format that the AI model must output. AI chat completions models typically output - unformatted text by default. This is equivalent to setting "text" as the response_format. - To output JSON format, without adhering to any schema, set to "json_object". - To output JSON format adhering to a provided schema, set this to an object of the class - ~azure.ai.inference.models.JsonSchemaFormat. Default value is None. - :paramtype response_format: Union[Literal['text', 'json_object'], ~azure.ai.inference.models.JsonSchemaFormat] - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: The available tool definitions that the chat completions request can use, - including caller-defined functions. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. - Default value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: ChatCompletions for non-streaming, or Iterable[StreamingChatCompletionsUpdate] for streaming. 
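Taken together, the keyword arguments documented here support both a single response and, with `stream=True`, incremental updates. A minimal sketch of the streaming pattern, reusing the `client` from the previous snippet and passing messages as plain dictionaries (which this docstring states are accepted alongside ChatRequestMessage objects):

```python
# Streaming: iterate StreamingChatCompletionsUpdate objects as they arrive.
stream = client.complete(
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Write a haiku about the sea."},
    ],
    temperature=0.7,
    max_tokens=128,
    stream=True,
)
for update in stream:
    # Some updates carry no choices or an empty delta, so guard before printing.
    if update.choices and update.choices[0].delta.content:
        print(update.choices[0].delta.content, end="", flush=True)
```

Per the surrounding text, any `model_extras` entries would be merged into the root of the same JSON payload, with the `extra-parameters` request header set to pass-through.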
- :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.StreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - error_map = { - 401: ClientAuthenticationError, - 404: ResourceNotFoundError, - 409: ResourceExistsError, - 304: ResourceNotModifiedError, - } - error_map.update(kwargs.pop("error_map", {}) or {}) - - _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) - _params = kwargs.pop("params", {}) or {} - _extra_parameters: Union[_models._enums.ExtraParameters, None] = None - - content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) - - internal_response_format = _get_internal_response_format(response_format) - - if body is _Unset: - if messages is _Unset: - raise TypeError("missing required argument: messages") - body = { - "messages": messages, - "stream": stream, - "frequency_penalty": frequency_penalty if frequency_penalty is not None else self._frequency_penalty, - "max_tokens": max_tokens if max_tokens is not None else self._max_tokens, - "model": model if model is not None else self._model, - "presence_penalty": presence_penalty if presence_penalty is not None else self._presence_penalty, - "response_format": ( - internal_response_format if internal_response_format is not None else self._internal_response_format - ), - "seed": seed if seed is not None else self._seed, - "stop": stop if stop is not None else self._stop, - "temperature": temperature if temperature is not None else self._temperature, - "tool_choice": tool_choice if tool_choice is not None else self._tool_choice, - "tools": tools if tools is not None else self._tools, - "top_p": top_p if top_p is not None else self._top_p, - } - if model_extras is not None and bool(model_extras): - body.update(model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - elif self._model_extras is not None and bool(self._model_extras): - body.update(self._model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - body = {k: v for k, v in body.items() if v is not None} - elif isinstance(body, dict) and "stream" in body and isinstance(body["stream"], bool): - stream = body["stream"] - content_type = content_type or "application/json" - _content = None - if isinstance(body, (IOBase, bytes)): - _content = body - else: - _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore - - _request = build_chat_completions_complete_request( - extra_params=_extra_parameters, - content_type=content_type, - api_version=self._config.api_version, - content=_content, - headers=_headers, - params=_params, - ) - path_format_arguments = { - "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), - } - _request.url = self._client.format_url(_request.url, **path_format_arguments) - - _stream = stream or False - pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access - _request, stream=_stream, **kwargs - ) - - response = pipeline_response.http_response - - if response.status_code not in [200]: - if _stream: - response.read() # Load the body in memory and close the socket - map_error(status_code=response.status_code, response=response, error_map=error_map) - raise HttpResponseError(response=response) - - if _stream: - return _models.StreamingChatCompletions(response) - - return _deserialize(_models._patch.ChatCompletions, 
response.json()) # pylint: disable=protected-access - - @distributed_trace - def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: - # pylint: disable=line-too-long - """Returns information about the AI model. - The method makes a REST API call to the ``/info`` route on the given endpoint. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :return: ModelInfo. The ModelInfo is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.ModelInfo - :raises ~azure.core.exceptions.HttpResponseError: - """ - if not self._model_info: - try: - self._model_info = self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init - except ResourceNotFoundError as error: - error.message = "Model information is not available on this endpoint (`/info` route not supported)." - raise error - - return self._model_info - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() - - -class EmbeddingsClient(EmbeddingsClientGenerated): - """EmbeddingsClient. - - :param endpoint: Service endpoint URL for AI model inference. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a TokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.TokenCredential - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :keyword api_version: The API version to use for this operation. Default value is - "2024-05-01-preview". Note that overriding this default value may result in unsupported - behavior. 
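The constructor keywords listed above behave as client-level defaults that apply to every subsequent `embed` call unless overridden per call. A minimal sketch of constructing such a client, with placeholder environment variables and an illustrative `dimensions` value that the target model may or may not support:

```python
import os

from azure.core.credentials import AzureKeyCredential
from azure.ai.inference import EmbeddingsClient

# Placeholder environment variable names; substitute your own configuration.
embeddings_client = EmbeddingsClient(
    endpoint=os.environ["AZURE_AI_EMBEDDINGS_ENDPOINT"],
    credential=AzureKeyCredential(os.environ["AZURE_AI_EMBEDDINGS_KEY"]),
    dimensions=1024,     # illustrative default, applied unless overridden per call
    input_type="text",   # string form of EmbeddingInputType
)
```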
- :paramtype api_version: str - """ - - def __init__( - self, - endpoint: str, - credential: Union[AzureKeyCredential, "TokenCredential"], - *, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> None: - - self._model_info: Optional[_models.ModelInfo] = None - - # Store default embeddings settings, to be applied in all future service calls - # unless overridden by arguments in the `embed` method. - self._dimensions = dimensions - self._encoding_format = encoding_format - self._input_type = input_type - self._model = model - self._model_extras = model_extras - - # For Key auth, we need to send these two auth HTTP request headers simultaneously: - # 1. "Authorization: Bearer " - # 2. "api-key: " - # This is because Serverless API, Managed Compute and GitHub endpoints support the first header, - # and Azure OpenAI and the new Unified Inference endpoints support the second header. - # The first header will be taken care of by auto-generated code. - # The second one is added here. - if isinstance(credential, AzureKeyCredential): - headers = kwargs.pop("headers", {}) - if "api-key" not in headers: - headers["api-key"] = credential.key - kwargs["headers"] = headers - - super().__init__(endpoint, credential, **kwargs) - - @overload - def embed( - self, - *, - input: List[str], - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :keyword input: Input text to embed, encoded as a string or array of tokens. - To embed multiple inputs in a single request, pass an array - of strings or array of token arrays. Required. - :paramtype input: list[str] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. 
The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - def embed( - self, - body: JSON, - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :param body: An object of type MutableMapping[str, Any], such as a dictionary, that - specifies the full request payload. Required. - :type body: JSON - :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. - Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - def embed( - self, - body: IO[bytes], - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :param body: Specifies the full request payload. Required. - :type body: IO[bytes] - :keyword content_type: Body Parameter content-type. Content type parameter for binary body. - Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @distributed_trace - def embed( - self, - body: Union[JSON, IO[bytes]] = _Unset, - *, - input: List[str] = _Unset, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - # pylint: disable=line-too-long - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type - that specifies the full request payload. Required. - :type body: JSON or IO[bytes] - :keyword input: Input text to embed, encoded as a string or array of tokens. - To embed multiple inputs in a single request, pass an array - of strings or array of token arrays. Required. - :paramtype input: list[str] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. 
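A minimal sketch of the keyword-argument form of `embed` described above, reusing `embeddings_client` from the previous snippet:

```python
result = embeddings_client.embed(
    input=["first phrase", "second phrase", "third phrase"],
)
for item in result.data:
    # Each EmbeddingItem carries the vector and the index of the input it belongs to.
    print(f"input {item.index}: {len(item.embedding)} dimensions")
```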
- :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - error_map: MutableMapping[int, Type[HttpResponseError]] = { - 401: ClientAuthenticationError, - 404: ResourceNotFoundError, - 409: ResourceExistsError, - 304: ResourceNotModifiedError, - } - error_map.update(kwargs.pop("error_map", {}) or {}) - - _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) - _params = kwargs.pop("params", {}) or {} - _extra_parameters: Union[_models._enums.ExtraParameters, None] = None - - content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) - - if body is _Unset: - if input is _Unset: - raise TypeError("missing required argument: input") - body = { - "input": input, - "dimensions": dimensions if dimensions is not None else self._dimensions, - "encoding_format": encoding_format if encoding_format is not None else self._encoding_format, - "input_type": input_type if input_type is not None else self._input_type, - "model": model if model is not None else self._model, - } - if model_extras is not None and bool(model_extras): - body.update(model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - elif self._model_extras is not None and bool(self._model_extras): - body.update(self._model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - body = {k: v for k, v in body.items() if v is not None} - content_type = content_type or "application/json" - _content = None - if isinstance(body, (IOBase, bytes)): - _content = body - else: - _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore - - _request = build_embeddings_embed_request( - extra_params=_extra_parameters, - content_type=content_type, - api_version=self._config.api_version, - content=_content, - headers=_headers, - params=_params, - ) - path_format_arguments = { - "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), - } - _request.url = self._client.format_url(_request.url, **path_format_arguments) - - _stream = kwargs.pop("stream", False) - pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access - _request, stream=_stream, **kwargs - ) - - response = pipeline_response.http_response - - if response.status_code not in [200]: - if _stream: - response.read() # Load the body in memory and close the socket - map_error(status_code=response.status_code, response=response, error_map=error_map) - raise HttpResponseError(response=response) - - if _stream: - deserialized = response.iter_bytes() - else: - deserialized = _deserialize( - _models._patch.EmbeddingsResult, response.json() # pylint: disable=protected-access - ) - - return deserialized # type: ignore - - @distributed_trace - def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: - # pylint: disable=line-too-long - """Returns information about the AI model. 
- The method makes a REST API call to the ``/info`` route on the given endpoint. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :return: ModelInfo. The ModelInfo is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.ModelInfo - :raises ~azure.core.exceptions.HttpResponseError: - """ - if not self._model_info: - try: - self._model_info = self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init - except ResourceNotFoundError as error: - error.message = "Model information is not available on this endpoint (`/info` route not supported)." - raise error - - return self._model_info - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() - - -class ImageEmbeddingsClient(ImageEmbeddingsClientGenerated): - """ImageEmbeddingsClient. - - :param endpoint: Service endpoint URL for AI model inference. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a TokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.TokenCredential - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :keyword api_version: The API version to use for this operation. Default value is - "2024-05-01-preview". Note that overriding this default value may result in unsupported - behavior. - :paramtype api_version: str - """ - - def __init__( - self, - endpoint: str, - credential: Union[AzureKeyCredential, "TokenCredential"], - *, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> None: - - self._model_info: Optional[_models.ModelInfo] = None - - # Store default embeddings settings, to be applied in all future service calls - # unless overridden by arguments in the `embed` method. 
- self._dimensions = dimensions - self._encoding_format = encoding_format - self._input_type = input_type - self._model = model - self._model_extras = model_extras - - # For Key auth, we need to send these two auth HTTP request headers simultaneously: - # 1. "Authorization: Bearer " - # 2. "api-key: " - # This is because Serverless API, Managed Compute and GitHub endpoints support the first header, - # and Azure OpenAI and the new Unified Inference endpoints support the second header. - # The first header will be taken care of by auto-generated code. - # The second one is added here. - if isinstance(credential, AzureKeyCredential): - headers = kwargs.pop("headers", {}) - if "api-key" not in headers: - headers["api-key"] = credential.key - kwargs["headers"] = headers - - super().__init__(endpoint, credential, **kwargs) - - @overload - def embed( - self, - *, - input: List[_models.ImageEmbeddingInput], - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an - array. - The input must not exceed the max input tokens for the model. Required. - :paramtype input: list[~azure.ai.inference.models.ImageEmbeddingInput] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - def embed( - self, - body: JSON, - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :param body: An object of type MutableMapping[str, Any], such as a dictionary, that - specifies the full request payload. Required. - :type body: JSON - :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. 
- Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - def embed( - self, - body: IO[bytes], - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :param body: Specifies the full request payload. Required. - :type body: IO[bytes] - :keyword content_type: Body Parameter content-type. Content type parameter for binary body. - Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @distributed_trace - def embed( - self, - body: Union[JSON, IO[bytes]] = _Unset, - *, - input: List[_models.ImageEmbeddingInput] = _Unset, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - # pylint: disable=line-too-long - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type - that specifies the full request payload. Required. - :type body: JSON or IO[bytes] - :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an - array. - The input must not exceed the max input tokens for the model. Required. - :paramtype input: list[~azure.ai.inference.models.ImageEmbeddingInput] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. 
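The image variant takes a list of ImageEmbeddingInput values. A minimal sketch, assuming the `ImageEmbeddingInput.load` convenience helper and a local `sample.png` exist; if that helper is not available, the `image` field can instead be populated directly with a base64 data URL:

```python
import os

from azure.core.credentials import AzureKeyCredential
from azure.ai.inference import ImageEmbeddingsClient
from azure.ai.inference.models import ImageEmbeddingInput

# Placeholder environment variable names; substitute your own configuration.
image_client = ImageEmbeddingsClient(
    endpoint=os.environ["AZURE_AI_IMAGE_EMBEDDINGS_ENDPOINT"],
    credential=AzureKeyCredential(os.environ["AZURE_AI_IMAGE_EMBEDDINGS_KEY"]),
)

# `load` is assumed to read the file and wrap it as a base64 data URL for the request.
result = image_client.embed(
    input=[ImageEmbeddingInput.load(image_file="sample.png", image_format="png")],
)
print(len(result.data[0].embedding))
```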
The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - error_map: MutableMapping[int, Type[HttpResponseError]] = { - 401: ClientAuthenticationError, - 404: ResourceNotFoundError, - 409: ResourceExistsError, - 304: ResourceNotModifiedError, - } - error_map.update(kwargs.pop("error_map", {}) or {}) - - _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) - _params = kwargs.pop("params", {}) or {} - _extra_parameters: Union[_models._enums.ExtraParameters, None] = None - - content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) - - if body is _Unset: - if input is _Unset: - raise TypeError("missing required argument: input") - body = { - "input": input, - "dimensions": dimensions if dimensions is not None else self._dimensions, - "encoding_format": encoding_format if encoding_format is not None else self._encoding_format, - "input_type": input_type if input_type is not None else self._input_type, - "model": model if model is not None else self._model, - } - if model_extras is not None and bool(model_extras): - body.update(model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - elif self._model_extras is not None and bool(self._model_extras): - body.update(self._model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - body = {k: v for k, v in body.items() if v is not None} - content_type = content_type or "application/json" - _content = None - if isinstance(body, (IOBase, bytes)): - _content = body - else: - _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore - - _request = build_image_embeddings_embed_request( - extra_params=_extra_parameters, - content_type=content_type, - api_version=self._config.api_version, - content=_content, - headers=_headers, - params=_params, - ) - path_format_arguments = { - "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), - } - _request.url = self._client.format_url(_request.url, **path_format_arguments) - - _stream = kwargs.pop("stream", False) - pipeline_response: PipelineResponse = self._client._pipeline.run( # pylint: disable=protected-access - _request, stream=_stream, **kwargs - ) - - response = pipeline_response.http_response - - if response.status_code not in [200]: - if _stream: - response.read() # Load the body in memory and close the socket - map_error(status_code=response.status_code, response=response, error_map=error_map) - raise HttpResponseError(response=response) - - if _stream: - deserialized = response.iter_bytes() - else: - deserialized = _deserialize( - _models._patch.EmbeddingsResult, response.json() # pylint: disable=protected-access - ) - - return deserialized # type: ignore - - @distributed_trace - def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: - # pylint: disable=line-too-long - """Returns information about the AI model. - The method makes a REST API call to the ``/info`` route on the given endpoint. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :return: ModelInfo. 
The ModelInfo is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.ModelInfo - :raises ~azure.core.exceptions.HttpResponseError: - """ - if not self._model_info: - try: - self._model_info = self._get_model_info(**kwargs) # pylint: disable=attribute-defined-outside-init - except ResourceNotFoundError as error: - error.message = "Model information is not available on this endpoint (`/info` route not supported)." - raise error - - return self._model_info - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() - +from typing import List -__all__: List[str] = [ - "load_client", - "ChatCompletionsClient", - "EmbeddingsClient", - "ImageEmbeddingsClient", -] # Add all objects you want publicly available to users at this package level +__all__: List[str] = [] # Add all objects you want publicly available to users at this package level def patch_sdk(): diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py index a066e16a64dd..7a0232de5ddc 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_serialization.py @@ -1,4 +1,4 @@ -# pylint: disable=too-many-lines +# pylint: disable=line-too-long,useless-suppression,too-many-lines # -------------------------------------------------------------------------- # # Copyright (c) Microsoft Corporation. All rights reserved. @@ -411,7 +411,7 @@ def from_dict( :param function key_extractors: A key extractor function. :param str content_type: JSON by default, set application/xml if XML. :returns: An instance of this model - :raises: DeserializationError if something went wrong + :raises DeserializationError: if something went wrong :rtype: Self """ deserializer = Deserializer(cls._infer_class_models()) @@ -1361,7 +1361,7 @@ def xml_key_extractor(attr, attr_desc, data): # pylint: disable=unused-argument # Iter and wrapped, should have found one node only (the wrap one) if len(children) != 1: raise DeserializationError( - "Tried to deserialize an array not wrapped, and found several nodes '{}'. Maybe you should declare this array as wrapped?".format( # pylint: disable=line-too-long + "Tried to deserialize an array not wrapped, and found several nodes '{}'. Maybe you should declare this array as wrapped?".format( xml_name ) ) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/_version.py b/sdk/ai/azure-ai-inference/azure/ai/inference/_version.py index b1c2836b6921..be71c81bd282 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/_version.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/_version.py @@ -6,4 +6,4 @@ # Changes may cause incorrect behavior and will be lost if the code is regenerated. # -------------------------------------------------------------------------- -VERSION = "1.0.0b9" +VERSION = "1.0.0b1" diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_client.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_client.py index 88e6773bd8f1..212904c011cf 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_client.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_client.py @@ -39,7 +39,6 @@ class ChatCompletionsClient(ChatCompletionsClientOperationsMixin): :param credential: Credential used to authenticate requests to the service. Is either a key credential type or a token credential type. Required. 
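These async clients accept either an AzureKeyCredential or an async token credential. A minimal sketch of both construction paths, assuming azure-identity is installed for the Entra ID case and using placeholder environment variables:

```python
import asyncio
import os

from azure.core.credentials import AzureKeyCredential
from azure.identity.aio import DefaultAzureCredential
from azure.ai.inference.aio import ChatCompletionsClient


async def main() -> None:
    endpoint = os.environ["AZURE_AI_CHAT_ENDPOINT"]  # placeholder environment variable

    # Key-based authentication.
    async with ChatCompletionsClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(os.environ["AZURE_AI_CHAT_KEY"]),
    ) as client:
        ...  # issue requests here

    # Microsoft Entra ID authentication with an async token credential.
    async with DefaultAzureCredential() as token_credential:
        async with ChatCompletionsClient(endpoint=endpoint, credential=token_credential) as client:
            ...  # issue requests here


asyncio.run(main())
```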
:type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials_async.AsyncTokenCredential :keyword api_version: The API version to use for this operation. Default value is "2024-05-01-preview". Note that overriding this default value may result in unsupported @@ -122,7 +121,6 @@ class EmbeddingsClient(EmbeddingsClientOperationsMixin): :param credential: Credential used to authenticate requests to the service. Is either a key credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials_async.AsyncTokenCredential :keyword api_version: The API version to use for this operation. Default value is "2024-05-01-preview". Note that overriding this default value may result in unsupported @@ -205,7 +203,6 @@ class ImageEmbeddingsClient(ImageEmbeddingsClientOperationsMixin): :param credential: Credential used to authenticate requests to the service. Is either a key credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials_async.AsyncTokenCredential :keyword api_version: The API version to use for this operation. Default value is "2024-05-01-preview". Note that overriding this default value may result in unsupported diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_configuration.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_configuration.py index f60e112599d6..4fbe724f0326 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_configuration.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_configuration.py @@ -28,7 +28,6 @@ class ChatCompletionsClientConfiguration: # pylint: disable=too-many-instance-a :param credential: Credential used to authenticate requests to the service. Is either a key credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials_async.AsyncTokenCredential :keyword api_version: The API version to use for this operation. Default value is "2024-05-01-preview". Note that overriding this default value may result in unsupported @@ -57,8 +56,6 @@ def __init__( def _infer_policy(self, **kwargs): if isinstance(self.credential, AzureKeyCredential): return policies.AzureKeyCredentialPolicy(self.credential, "Authorization", prefix="Bearer", **kwargs) - if isinstance(self.credential, AzureKeyCredential): - return policies.AzureKeyCredentialPolicy(self.credential, "api-key", **kwargs) if hasattr(self.credential, "get_token"): return policies.AsyncBearerTokenCredentialPolicy(self.credential, *self.credential_scopes, **kwargs) raise TypeError(f"Unsupported credential: {self.credential}") @@ -88,7 +85,6 @@ class EmbeddingsClientConfiguration: # pylint: disable=too-many-instance-attrib :param credential: Credential used to authenticate requests to the service. Is either a key credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials_async.AsyncTokenCredential :keyword api_version: The API version to use for this operation. Default value is "2024-05-01-preview". 
Note that overriding this default value may result in unsupported @@ -117,8 +113,6 @@ def __init__( def _infer_policy(self, **kwargs): if isinstance(self.credential, AzureKeyCredential): return policies.AzureKeyCredentialPolicy(self.credential, "Authorization", prefix="Bearer", **kwargs) - if isinstance(self.credential, AzureKeyCredential): - return policies.AzureKeyCredentialPolicy(self.credential, "api-key", **kwargs) if hasattr(self.credential, "get_token"): return policies.AsyncBearerTokenCredentialPolicy(self.credential, *self.credential_scopes, **kwargs) raise TypeError(f"Unsupported credential: {self.credential}") @@ -148,7 +142,6 @@ class ImageEmbeddingsClientConfiguration: # pylint: disable=too-many-instance-a :param credential: Credential used to authenticate requests to the service. Is either a key credential type or a token credential type. Required. :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials.AzureKeyCredential or ~azure.core.credentials_async.AsyncTokenCredential :keyword api_version: The API version to use for this operation. Default value is "2024-05-01-preview". Note that overriding this default value may result in unsupported @@ -177,8 +170,6 @@ def __init__( def _infer_policy(self, **kwargs): if isinstance(self.credential, AzureKeyCredential): return policies.AzureKeyCredentialPolicy(self.credential, "Authorization", prefix="Bearer", **kwargs) - if isinstance(self.credential, AzureKeyCredential): - return policies.AzureKeyCredentialPolicy(self.credential, "api-key", **kwargs) if hasattr(self.credential, "get_token"): return policies.AsyncBearerTokenCredentialPolicy(self.credential, *self.credential_scopes, **kwargs) raise TypeError(f"Unsupported credential: {self.credential}") diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_operations.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_operations.py index 62ec772f6dae..c481e4719835 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_operations.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_operations/_operations.py @@ -1,4 +1,4 @@ -# pylint: disable=too-many-locals +# pylint: disable=line-too-long,useless-suppression # coding=utf-8 # -------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. 
@@ -9,7 +9,7 @@ from io import IOBase import json import sys -from typing import Any, Callable, Dict, IO, List, Optional, TypeVar, Union, overload +from typing import Any, Callable, Dict, IO, Optional, TypeVar, Union, overload from azure.core.exceptions import ( ClientAuthenticationError, @@ -43,7 +43,6 @@ else: from typing import MutableMapping # type: ignore JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object -_Unset: Any = object() T = TypeVar("T") ClsType = Optional[Callable[[PipelineResponse[HttpRequest, AsyncHttpResponse], T, Dict[str, Any]], Any]] @@ -53,24 +52,10 @@ class ChatCompletionsClientOperationsMixin(ChatCompletionsClientMixinABC): @overload async def _complete( self, + body: _models._models.ChatCompletionsOptions, *, - messages: List[_models._models.ChatRequestMessage], extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, content_type: str = "application/json", - frequency_penalty: Optional[float] = None, - stream_parameter: Optional[bool] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[_models._models.ChatCompletionsResponseFormat] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, **kwargs: Any ) -> _models.ChatCompletions: ... @overload @@ -95,24 +80,9 @@ async def _complete( @distributed_trace_async async def _complete( self, - body: Union[JSON, IO[bytes]] = _Unset, + body: Union[_models._models.ChatCompletionsOptions, JSON, IO[bytes]], *, - messages: List[_models._models.ChatRequestMessage] = _Unset, extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, - frequency_penalty: Optional[float] = None, - stream_parameter: Optional[bool] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[_models._models.ChatCompletionsResponseFormat] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, **kwargs: Any ) -> _models.ChatCompletions: """Gets chat completions for the provided chat messages. @@ -121,93 +91,14 @@ async def _complete( provided prompt data. The method makes a REST API call to the ``/chat/completions`` route on the given endpoint. - :param body: Is either a JSON type or a IO[bytes] type. Required. - :type body: JSON or IO[bytes] - :keyword messages: The collection of context messages associated with this chat completions - request. - Typical usage begins with a chat message for the System role that provides instructions for - the behavior of the assistant, followed by alternating messages between the User and - Assistant roles. Required. - :paramtype messages: list[~azure.ai.inference.models._models.ChatRequestMessage] + :param body: The options for chat completions. Is one of the following types: + ChatCompletionsOptions, JSON, IO[bytes] Required. 
+ :type body: ~azure.ai.inference.models._models.ChatCompletionsOptions or JSON or IO[bytes] :keyword extra_params: Controls what happens if extra parameters, undefined by the REST API, are passed in the JSON request payload. This sets the HTTP request header ``extra-parameters``. Known values are: "error", "drop", and "pass-through". Default value is None. :paramtype extra_params: str or ~azure.ai.inference.models.ExtraParameters - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative - frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. Default value is None. - :paramtype frequency_penalty: float - :keyword stream_parameter: A value indicating whether chat completions should be streamed for - this request. Default value is None. - :paramtype stream_parameter: bool - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. - Positive values will make tokens less likely to appear when they already exist and increase - the - model's likelihood to output new topics. - Supported range is [-2, 2]. Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. - Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: An object specifying the format that the model must output. - - Setting to ``{ "type": "json_schema", "json_schema": {...} }`` enables Structured Outputs - which ensures the model will match your supplied JSON schema. - - Setting to ``{ "type": "json_object" }`` enables JSON mode, which ensures the message the - model generates is valid JSON. - - **Important:** when using JSON mode, you **must** also instruct the model to produce JSON - yourself via a system or user message. Without this, the model may generate an unending stream - of whitespace until the generation reaches the token limit, resulting in a long-running and - seemingly "stuck" request. Also note that the message content may be partially cut off if - ``finish_reason="length"``\\ , which indicates the generation exceeded ``max_tokens`` or the - conversation exceeded the max context length. Default value is None. 
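The JSON-mode caveat spelled out above also applies to the convenience form on the public synchronous client described earlier, where `response_format` can simply be the string `"json_object"`. A minimal self-contained sketch with placeholder environment variables, showing the prompt itself asking for JSON:

```python
import json
import os

from azure.core.credentials import AzureKeyCredential
from azure.ai.inference import ChatCompletionsClient

# Placeholder environment variable names; substitute your own configuration.
client = ChatCompletionsClient(
    endpoint=os.environ["AZURE_AI_CHAT_ENDPOINT"],
    credential=AzureKeyCredential(os.environ["AZURE_AI_CHAT_KEY"]),
)

# JSON mode constrains the output shape, but the system message still has to
# instruct the model to produce JSON, as the note above warns.
response = client.complete(
    messages=[
        {"role": "system", "content": "You are a helpful assistant. Reply only with a JSON object."},
        {"role": "user", "content": "Give the capital and approximate population of France."},
    ],
    response_format="json_object",
)
print(json.loads(response.choices[0].message.content))
```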
- :paramtype response_format: ~azure.ai.inference.models._models.ChatCompletionsResponseFormat - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: A list of tools the model may request to call. Currently, only functions are - supported as a tool. The model - may response with a function call request and provide the input arguments in JSON format for - that function. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. Default - value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str :return: ChatCompletions. The ChatCompletions is compatible with MutableMapping :rtype: ~azure.ai.inference.models.ChatCompletions :raises ~azure.core.exceptions.HttpResponseError: @@ -226,25 +117,6 @@ async def _complete( content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) cls: ClsType[_models.ChatCompletions] = kwargs.pop("cls", None) - if body is _Unset: - if messages is _Unset: - raise TypeError("missing required argument: messages") - body = { - "frequency_penalty": frequency_penalty, - "max_tokens": max_tokens, - "messages": messages, - "model": model, - "presence_penalty": presence_penalty, - "response_format": response_format, - "seed": seed, - "stop": stop, - "stream": stream_parameter, - "temperature": temperature, - "tool_choice": tool_choice, - "tools": tools, - "top_p": top_p, - } - body = {k: v for k, v in body.items() if v is not None} content_type = content_type or "application/json" _content = None if isinstance(body, (IOBase, bytes)): @@ -357,14 +229,10 @@ class EmbeddingsClientOperationsMixin(EmbeddingsClientMixinABC): @overload async def _embed( self, + body: _models._models.EmbeddingsOptions, *, - input: List[str], extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, content_type: str = "application/json", - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, **kwargs: Any ) -> _models.EmbeddingsResult: ... 
@overload @@ -389,46 +257,22 @@ async def _embed( @distributed_trace_async async def _embed( self, - body: Union[JSON, IO[bytes]] = _Unset, + body: Union[_models._models.EmbeddingsOptions, JSON, IO[bytes]], *, - input: List[str] = _Unset, extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, **kwargs: Any ) -> _models.EmbeddingsResult: """Return the embedding vectors for given text prompts. The method makes a REST API call to the ``/embeddings`` route on the given endpoint. - :param body: Is either a JSON type or a IO[bytes] type. Required. - :type body: JSON or IO[bytes] - :keyword input: Input text to embed, encoded as a string or array of tokens. - To embed multiple inputs in a single request, pass an array - of strings or array of token arrays. Required. - :paramtype input: list[str] + :param body: The body of the request containing the options for generating embeddings. Is one + of the following types: EmbeddingsOptions, JSON, IO[bytes] Required. + :type body: ~azure.ai.inference.models._models.EmbeddingsOptions or JSON or IO[bytes] :keyword extra_params: Controls what happens if extra parameters, undefined by the REST API, are passed in the JSON request payload. This sets the HTTP request header ``extra-parameters``. Known values are: "error", "drop", and "pass-through". Default value is None. :paramtype extra_params: str or ~azure.ai.inference.models.ExtraParameters - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. - Passing null causes the model to use its default value. - Returns a 422 error if the model doesn't support the value or parameter. Default value is - None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. Known - values are: "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. - Returns a 422 error if the model doesn't support the value or parameter. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str :return: EmbeddingsResult. 
The EmbeddingsResult is compatible with MutableMapping :rtype: ~azure.ai.inference.models.EmbeddingsResult :raises ~azure.core.exceptions.HttpResponseError: @@ -447,17 +291,6 @@ async def _embed( content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) cls: ClsType[_models.EmbeddingsResult] = kwargs.pop("cls", None) - if body is _Unset: - if input is _Unset: - raise TypeError("missing required argument: input") - body = { - "dimensions": dimensions, - "encoding_format": encoding_format, - "input": input, - "input_type": input_type, - "model": model, - } - body = {k: v for k, v in body.items() if v is not None} content_type = content_type or "application/json" _content = None if isinstance(body, (IOBase, bytes)): @@ -570,14 +403,10 @@ class ImageEmbeddingsClientOperationsMixin(ImageEmbeddingsClientMixinABC): @overload async def _embed( self, + body: _models._models.ImageEmbeddingsOptions, *, - input: List[_models.ImageEmbeddingInput], extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, content_type: str = "application/json", - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, **kwargs: Any ) -> _models.EmbeddingsResult: ... @overload @@ -602,49 +431,22 @@ async def _embed( @distributed_trace_async async def _embed( self, - body: Union[JSON, IO[bytes]] = _Unset, + body: Union[_models._models.ImageEmbeddingsOptions, JSON, IO[bytes]], *, - input: List[_models.ImageEmbeddingInput] = _Unset, extra_params: Optional[Union[str, _models._enums.ExtraParameters]] = None, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, **kwargs: Any ) -> _models.EmbeddingsResult: """Return the embedding vectors for given images. The method makes a REST API call to the ``/images/embeddings`` route on the given endpoint. - :param body: Is either a JSON type or a IO[bytes] type. Required. - :type body: JSON or IO[bytes] - :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an - array. - The input must not exceed the max input tokens for the model. Required. - :paramtype input: list[~azure.ai.inference.models.ImageEmbeddingInput] + :param body: The body of the request containing options for image embeddings. Is one of the + following types: ImageEmbeddingsOptions, JSON, IO[bytes] Required. + :type body: ~azure.ai.inference.models._models.ImageEmbeddingsOptions or JSON or IO[bytes] :keyword extra_params: Controls what happens if extra parameters, undefined by the REST API, are passed in the JSON request payload. This sets the HTTP request header ``extra-parameters``. Known values are: "error", "drop", and "pass-through". Default value is None. :paramtype extra_params: str or ~azure.ai.inference.models.ExtraParameters - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. - Passing null causes the model to use its default value. - Returns a 422 error if the model doesn't support the value or parameter. Default value is - None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The number of dimensions the resulting output embeddings - should have. - Passing null causes the model to use its default value. 
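
Note: the embeddings hunks above remove the analogous convenience path that built the /embeddings (and, below, /images/embeddings) payload from keyword arguments. A sketch under the same assumptions; the key names mirror the removed hunk and the helper name is illustrative only:

    from typing import Any, Dict, List, Optional

    def build_embeddings_body(
        input: List[Any],
        *,
        dimensions: Optional[int] = None,
        encoding_format: Optional[str] = None,
        input_type: Optional[str] = None,
        model: Optional[str] = None,
    ) -> Dict[str, Any]:
        # The same five keys were used for both /embeddings and /images/embeddings;
        # only the element type of `input` differs (strings vs. ImageEmbeddingInput).
        if input is None:
            raise TypeError("missing required argument: input")
        body = {
            "dimensions": dimensions,
            "encoding_format": encoding_format,
            "input": input,
            "input_type": input_type,
            "model": model,
        }
        return {k: v for k, v in body.items() if v is not None}
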
- Returns a 422 error if the model doesn't support the value or parameter. Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. - Returns a 422 error if the model doesn't support the value or parameter. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping :rtype: ~azure.ai.inference.models.EmbeddingsResult :raises ~azure.core.exceptions.HttpResponseError: @@ -663,17 +465,6 @@ async def _embed( content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) cls: ClsType[_models.EmbeddingsResult] = kwargs.pop("cls", None) - if body is _Unset: - if input is _Unset: - raise TypeError("missing required argument: input") - body = { - "dimensions": dimensions, - "encoding_format": encoding_format, - "input": input, - "input_type": input_type, - "model": model, - } - body = {k: v for k, v in body.items() if v is not None} content_type = content_type or "application/json" _content = None if isinstance(body, (IOBase, bytes)): diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_patch.py index 2f9873805aa6..f7dd32510333 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_patch.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/aio/_patch.py @@ -1,4 +1,3 @@ -# pylint: disable=too-many-lines # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. @@ -7,1319 +6,9 @@ Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize """ -import json -import logging -import sys +from typing import List -from io import IOBase -from typing import Any, Dict, Union, IO, List, Literal, Optional, overload, Type, TYPE_CHECKING, AsyncIterable - -from azure.core.pipeline import PipelineResponse -from azure.core.credentials import AzureKeyCredential -from azure.core.tracing.decorator_async import distributed_trace_async -from azure.core.utils import case_insensitive_dict -from azure.core.exceptions import ( - ClientAuthenticationError, - HttpResponseError, - map_error, - ResourceExistsError, - ResourceNotFoundError, - ResourceNotModifiedError, -) -from .. 
import models as _models -from .._model_base import SdkJSONEncoder, _deserialize -from ._client import ChatCompletionsClient as ChatCompletionsClientGenerated -from ._client import EmbeddingsClient as EmbeddingsClientGenerated -from ._client import ImageEmbeddingsClient as ImageEmbeddingsClientGenerated -from .._operations._operations import ( - build_chat_completions_complete_request, - build_embeddings_embed_request, - build_image_embeddings_embed_request, -) -from .._patch import _get_internal_response_format - -if TYPE_CHECKING: - # pylint: disable=unused-import,ungrouped-imports - from azure.core.credentials_async import AsyncTokenCredential - -if sys.version_info >= (3, 9): - from collections.abc import MutableMapping -else: - from typing import MutableMapping # type: ignore # pylint: disable=ungrouped-imports - -JSON = MutableMapping[str, Any] # pylint: disable=unsubscriptable-object -_Unset: Any = object() -_LOGGER = logging.getLogger(__name__) - - -async def load_client( - endpoint: str, credential: Union[AzureKeyCredential, "AsyncTokenCredential"], **kwargs: Any -) -> Union["ChatCompletionsClient", "EmbeddingsClient", "ImageEmbeddingsClient"]: - """ - Load a client from a given endpoint URL. The method makes a REST API call to the `/info` route - on the given endpoint, to determine the model type and therefore which client to instantiate. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - Keyword arguments are passed through to the client constructor (you can set keywords such as - `api_version`, `user_agent`, `logging_enable` etc. on the client constructor). - - :param endpoint: Service endpoint URL for AI model inference. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a AsyncTokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials_async.AsyncTokenCredential - :return: The appropriate asynchronous client associated with the given endpoint - :rtype: ~azure.ai.inference.aio.ChatCompletionsClient or ~azure.ai.inference.aio.EmbeddingsClient - or ~azure.ai.inference.aio.ImageEmbeddingsClient - :raises ~azure.core.exceptions.HttpResponseError: - """ - - async with ChatCompletionsClient( - endpoint, credential, **kwargs - ) as client: # Pick any of the clients, it does not matter. - try: - model_info = await client.get_model_info() # type: ignore - except ResourceNotFoundError as error: - error.message = ( - "`load_client` function does not work on this endpoint (`/info` route not supported). " - "Please construct one of the clients (e.g. `ChatCompletionsClient`) directly." - ) - raise error - - _LOGGER.info("model_info=%s", model_info) - if not model_info.model_type: - raise ValueError( - "The AI model information is missing a value for `model type`. Cannot create an appropriate client." 
- ) - - # TODO: Remove "completions", "chat-comletions" and "embedding" once Mistral Large and Cohere fixes their model type - if model_info.model_type in ( - _models.ModelType.CHAT_COMPLETION, - "chat_completions", - "chat", - "completion", - "chat-completion", - "chat-completions", - "chat completion", - "chat completions", - ): - chat_completion_client = ChatCompletionsClient(endpoint, credential, **kwargs) - chat_completion_client._model_info = ( # pylint: disable=protected-access,attribute-defined-outside-init - model_info - ) - return chat_completion_client - - if model_info.model_type in ( - _models.ModelType.EMBEDDINGS, - "embedding", - "text_embedding", - "text-embeddings", - "text embedding", - "text embeddings", - ): - embedding_client = EmbeddingsClient(endpoint, credential, **kwargs) - embedding_client._model_info = model_info # pylint: disable=protected-access,attribute-defined-outside-init - return embedding_client - - if model_info.model_type in ( - _models.ModelType.IMAGE_EMBEDDINGS, - "image_embedding", - "image-embeddings", - "image-embedding", - "image embedding", - "image embeddings", - ): - image_embedding_client = ImageEmbeddingsClient(endpoint, credential, **kwargs) - image_embedding_client._model_info = ( # pylint: disable=protected-access,attribute-defined-outside-init - model_info - ) - return image_embedding_client - - raise ValueError(f"No client available to support AI model type `{model_info.model_type}`") - - -class ChatCompletionsClient(ChatCompletionsClientGenerated): # pylint: disable=too-many-instance-attributes - """ChatCompletionsClient. - - :param endpoint: Service endpoint URL for AI model inference. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a AsyncTokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials_async.AsyncTokenCredential - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. - Default value is None. - :paramtype frequency_penalty: float - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. - Positive values will make tokens less likely to appear when they already exist and increase - the model's likelihood to output new topics. - Supported range is [-2, 2]. - Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. - Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. 
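
Note: the deleted load_client helper and the model-type dispatch above let callers pick a client at runtime from the /info route. A hedged sketch of how it was typically used before this removal (endpoint and key are placeholders; the isinstance dispatch shown is one common pattern, not the only one):

    import asyncio
    from azure.core.credentials import AzureKeyCredential

    async def main() -> None:
        # load_client, ChatCompletionsClient and EmbeddingsClient refer to the
        # customizations removed in this patch from azure.ai.inference.aio.
        from azure.ai.inference.aio import load_client, ChatCompletionsClient, EmbeddingsClient

        client = await load_client("https://<endpoint>", AzureKeyCredential("<api-key>"))
        if isinstance(client, ChatCompletionsClient):
            print("endpoint hosts a chat-completions model")
        elif isinstance(client, EmbeddingsClient):
            print("endpoint hosts an embeddings model")
        await client.close()

    asyncio.run(main())
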
As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: The format that the AI model must output. AI chat completions models typically output - unformatted text by default. This is equivalent to setting "text" as the response_format. - To output JSON format, without adhering to any schema, set to "json_object". - To output JSON format adhering to a provided schema, set this to an object of the class - ~azure.ai.inference.models.JsonSchemaFormat. Default value is None. - :paramtype response_format: Union[Literal['text', 'json_object'], ~azure.ai.inference.models.JsonSchemaFormat] - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: The available tool definitions that the chat completions request can use, - including caller-defined functions. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. - Default value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :keyword api_version: The API version to use for this operation. Default value is - "2024-05-01-preview". Note that overriding this default value may result in unsupported - behavior. 
- :paramtype api_version: str - """ - - def __init__( - self, - endpoint: str, - credential: Union[AzureKeyCredential, "AsyncTokenCredential"], - *, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> None: - - self._model_info: Optional[_models.ModelInfo] = None - - # Store default chat completions settings, to be applied in all future service calls - # unless overridden by arguments in the `complete` method. - self._frequency_penalty = frequency_penalty - self._presence_penalty = presence_penalty - self._temperature = temperature - self._top_p = top_p - self._max_tokens = max_tokens - self._internal_response_format = _get_internal_response_format(response_format) - self._stop = stop - self._tools = tools - self._tool_choice = tool_choice - self._seed = seed - self._model = model - self._model_extras = model_extras - - # For Key auth, we need to send these two auth HTTP request headers simultaneously: - # 1. "Authorization: Bearer " - # 2. "api-key: " - # This is because Serverless API, Managed Compute and GitHub endpoints support the first header, - # and Azure OpenAI and the new Unified Inference endpoints support the second header. - # The first header will be taken care of by auto-generated code. - # The second one is added here. - if isinstance(credential, AzureKeyCredential): - headers = kwargs.pop("headers", {}) - if "api-key" not in headers: - headers["api-key"] = credential.key - kwargs["headers"] = headers - - super().__init__(endpoint, credential, **kwargs) - - @overload - async def complete( - self, - *, - messages: List[_models.ChatRequestMessage], - stream: Literal[False] = False, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.ChatCompletions: ... 
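
Note: the removed __init__ above stored client-level defaults (temperature, max_tokens, response_format, model_extras, and so on) that later complete() calls fell back to, and duplicated the key credential into an `api-key` header. A hedged construction sketch of that removed customized client (all values are placeholders):

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.inference.aio import ChatCompletionsClient  # the customized client removed here

    client = ChatCompletionsClient(
        "https://<endpoint>",
        AzureKeyCredential("<api-key>"),   # also sent as the `api-key` header, per the removed __init__
        temperature=0.7,                   # client-level default, overridable per complete() call
        max_tokens=512,
        model_extras={"example_flag": True},  # illustrative extra; passed through via `extra-parameters`
    )
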
- - @overload - async def complete( - self, - *, - messages: List[_models.ChatRequestMessage], - stream: Literal[True], - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> AsyncIterable[_models.StreamingChatCompletionsUpdate]: ... - - @overload - async def complete( - self, - *, - messages: List[_models.ChatRequestMessage], - stream: Optional[bool] = None, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> Union[AsyncIterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. The method makes a REST API call to the `/chat/completions` route - on the given endpoint. - When using this method with `stream=True`, the response is streamed - back to the client. Iterate over the resulting StreamingChatCompletions - object to get content updates as they arrive. By default, the response is a ChatCompletions object - (non-streaming). - - :keyword messages: The collection of context messages associated with this chat completions - request. - Typical usage begins with a chat message for the System role that provides instructions for - the behavior of the assistant, followed by alternating messages between the User and - Assistant roles. Required. - :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] - :keyword stream: A value indicating whether chat completions should be streamed for this request. - Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions. - Otherwise the response will be a ChatCompletions. - :paramtype stream: bool - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. - Default value is None. - :paramtype frequency_penalty: float - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. 
- Positive values will make tokens less likely to appear when they already exist and increase - the model's likelihood to output new topics. - Supported range is [-2, 2]. - Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. - Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: The format that the AI model must output. AI chat completions models typically output - unformatted text by default. This is equivalent to setting "text" as the response_format. - To output JSON format, without adhering to any schema, set to "json_object". - To output JSON format adhering to a provided schema, set this to an object of the class - ~azure.ai.inference.models.JsonSchemaFormat. Default value is None. - :paramtype response_format: Union[Literal['text', 'json_object'], ~azure.ai.inference.models.JsonSchemaFormat] - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: The available tool definitions that the chat completions request can use, - including caller-defined functions. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. - Default value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. 
- :paramtype model_extras: dict[str, Any] - :return: ChatCompletions for non-streaming, or AsyncIterable[StreamingChatCompletionsUpdate] for streaming. - :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - async def complete( - self, - body: JSON, - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> Union[AsyncIterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. - - :param body: An object of type MutableMapping[str, Any], such as a dictionary, that - specifies the full request payload. Required. - :type body: JSON - :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. - Default value is "application/json". - :paramtype content_type: str - :return: ChatCompletions for non-streaming, or AsyncIterable[StreamingChatCompletionsUpdate] for streaming. - :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - async def complete( - self, - body: IO[bytes], - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> Union[AsyncIterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - """Gets chat completions for the provided chat messages. - Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. - - :param body: Specifies the full request payload. Required. - :type body: IO[bytes] - :keyword content_type: Body Parameter content-type. Content type parameter for binary body. - Default value is "application/json". - :paramtype content_type: str - :return: ChatCompletions for non-streaming, or AsyncIterable[StreamingChatCompletionsUpdate] for streaming. - :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - - # pylint:disable=client-method-missing-tracing-decorator-async - async def complete( - self, - body: Union[JSON, IO[bytes]] = _Unset, - *, - messages: List[_models.ChatRequestMessage] = _Unset, - stream: Optional[bool] = None, - frequency_penalty: Optional[float] = None, - presence_penalty: Optional[float] = None, - temperature: Optional[float] = None, - top_p: Optional[float] = None, - max_tokens: Optional[int] = None, - response_format: Optional[Union[Literal["text", "json_object"], _models.JsonSchemaFormat]] = None, - stop: Optional[List[str]] = None, - tools: Optional[List[_models.ChatCompletionsToolDefinition]] = None, - tool_choice: Optional[ - Union[str, _models.ChatCompletionsToolChoicePreset, _models.ChatCompletionsNamedToolChoice] - ] = None, - seed: Optional[int] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> Union[AsyncIterable[_models.StreamingChatCompletionsUpdate], _models.ChatCompletions]: - # pylint: disable=line-too-long - # pylint: disable=too-many-locals - """Gets chat completions for the provided chat messages. 
- Completions support a wide variety of tasks and generate text that continues from or - "completes" provided prompt data. When using this method with `stream=True`, the response is streamed - back to the client. Iterate over the resulting :class:`~azure.ai.inference.models.StreamingChatCompletions` - object to get content updates as they arrive. - - :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type - that specifies the full request payload. Required. - :type body: JSON or IO[bytes] - :keyword messages: The collection of context messages associated with this chat completions - request. - Typical usage begins with a chat message for the System role that provides instructions for - the behavior of the assistant, followed by alternating messages between the User and - Assistant roles. Required. - :paramtype messages: list[~azure.ai.inference.models.ChatRequestMessage] - :keyword stream: A value indicating whether chat completions should be streamed for this request. - Default value is False. If streaming is enabled, the response will be a StreamingChatCompletions. - Otherwise the response will be a ChatCompletions. - :paramtype stream: bool - :keyword frequency_penalty: A value that influences the probability of generated tokens - appearing based on their cumulative frequency in generated text. - Positive values will make tokens less likely to appear as their frequency increases and - decrease the likelihood of the model repeating the same statements verbatim. - Supported range is [-2, 2]. - Default value is None. - :paramtype frequency_penalty: float - :keyword presence_penalty: A value that influences the probability of generated tokens - appearing based on their existing - presence in generated text. - Positive values will make tokens less likely to appear when they already exist and increase - the model's likelihood to output new topics. - Supported range is [-2, 2]. - Default value is None. - :paramtype presence_penalty: float - :keyword temperature: The sampling temperature to use that controls the apparent creativity of - generated completions. - Higher values will make output more random while lower values will make results more focused - and deterministic. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype temperature: float - :keyword top_p: An alternative to sampling with temperature called nucleus sampling. This value - causes the - model to consider the results of tokens with the provided probability mass. As an example, a - value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be - considered. - It is not recommended to modify temperature and top_p for the same completions request as the - interaction of these two settings is difficult to predict. - Supported range is [0, 1]. - Default value is None. - :paramtype top_p: float - :keyword max_tokens: The maximum number of tokens to generate. Default value is None. - :paramtype max_tokens: int - :keyword response_format: The format that the AI model must output. AI chat completions models typically output - unformatted text by default. This is equivalent to setting "text" as the response_format. - To output JSON format, without adhering to any schema, set to "json_object". 
- To output JSON format adhering to a provided schema, set this to an object of the class - ~azure.ai.inference.models.JsonSchemaFormat. Default value is None. - :paramtype response_format: Union[Literal['text', 'json_object'], ~azure.ai.inference.models.JsonSchemaFormat] - :keyword stop: A collection of textual sequences that will end completions generation. Default - value is None. - :paramtype stop: list[str] - :keyword tools: The available tool definitions that the chat completions request can use, - including caller-defined functions. Default value is None. - :paramtype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] - :keyword tool_choice: If specified, the model will configure which of the provided tools it can - use for the chat completions response. Is either a Union[str, - "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. - Default value is None. - :paramtype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or - ~azure.ai.inference.models.ChatCompletionsNamedToolChoice - :keyword seed: If specified, the system will make a best effort to sample deterministically - such that repeated requests with the - same seed and parameters should return the same result. Determinism is not guaranteed. - Default value is None. - :paramtype seed: int - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: ChatCompletions for non-streaming, or AsyncIterable[StreamingChatCompletionsUpdate] for streaming. 
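
Note: the docstring above covers both streaming and non-streaming behaviour, and the removed implementation below returns AsyncStreamingChatCompletions when stream=True. A hedged usage sketch of the removed method (UserMessage is assumed to be one of the package's chat request message types; the choices/delta/content shape follows the StreamingChatCompletionsUpdate model):

    from azure.ai.inference.models import UserMessage  # assumed convenience message type

    async def stream_reply(client) -> None:
        # With stream=True, complete() returns an async iterable of StreamingChatCompletionsUpdate.
        response = await client.complete(
            messages=[UserMessage(content="How many feet are in a mile?")],
            stream=True,
        )
        async for update in response:
            if update.choices:
                print(update.choices[0].delta.content or "", end="")
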
- :rtype: ~azure.ai.inference.models.ChatCompletions or ~azure.ai.inference.models.AsyncStreamingChatCompletions - :raises ~azure.core.exceptions.HttpResponseError: - """ - error_map = { - 401: ClientAuthenticationError, - 404: ResourceNotFoundError, - 409: ResourceExistsError, - 304: ResourceNotModifiedError, - } - error_map.update(kwargs.pop("error_map", {}) or {}) - - _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) - _params = kwargs.pop("params", {}) or {} - _extra_parameters: Union[_models._enums.ExtraParameters, None] = None - - content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) - - internal_response_format = _get_internal_response_format(response_format) - - if body is _Unset: - if messages is _Unset: - raise TypeError("missing required argument: messages") - body = { - "messages": messages, - "stream": stream, - "frequency_penalty": frequency_penalty if frequency_penalty is not None else self._frequency_penalty, - "max_tokens": max_tokens if max_tokens is not None else self._max_tokens, - "model": model if model is not None else self._model, - "presence_penalty": presence_penalty if presence_penalty is not None else self._presence_penalty, - "response_format": ( - internal_response_format if internal_response_format is not None else self._internal_response_format - ), - "seed": seed if seed is not None else self._seed, - "stop": stop if stop is not None else self._stop, - "temperature": temperature if temperature is not None else self._temperature, - "tool_choice": tool_choice if tool_choice is not None else self._tool_choice, - "tools": tools if tools is not None else self._tools, - "top_p": top_p if top_p is not None else self._top_p, - } - if model_extras is not None and bool(model_extras): - body.update(model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - elif self._model_extras is not None and bool(self._model_extras): - body.update(self._model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - body = {k: v for k, v in body.items() if v is not None} - elif isinstance(body, dict) and "stream" in body and isinstance(body["stream"], bool): - stream = body["stream"] - content_type = content_type or "application/json" - _content = None - if isinstance(body, (IOBase, bytes)): - _content = body - else: - _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore - - _request = build_chat_completions_complete_request( - extra_params=_extra_parameters, - content_type=content_type, - api_version=self._config.api_version, - content=_content, - headers=_headers, - params=_params, - ) - path_format_arguments = { - "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), - } - _request.url = self._client.format_url(_request.url, **path_format_arguments) - - _stream = stream or False - pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access - _request, stream=_stream, **kwargs - ) - - response = pipeline_response.http_response - - if response.status_code not in [200]: - if _stream: - await response.read() # Load the body in memory and close the socket - map_error(status_code=response.status_code, response=response, error_map=error_map) - raise HttpResponseError(response=response) - - if _stream: - return _models.AsyncStreamingChatCompletions(response) - - return 
_deserialize(_models._patch.ChatCompletions, response.json()) # pylint: disable=protected-access - - @distributed_trace_async - async def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: - # pylint: disable=line-too-long - """Returns information about the AI model. - The method makes a REST API call to the ``/info`` route on the given endpoint. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :return: ModelInfo. The ModelInfo is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.ModelInfo - :raises ~azure.core.exceptions.HttpResponseError: - """ - if not self._model_info: - try: - self._model_info = await self._get_model_info( - **kwargs - ) # pylint: disable=attribute-defined-outside-init - except ResourceNotFoundError as error: - error.message = "Model information is not available on this endpoint (`/info` route not supported)." - raise error - - return self._model_info - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() - - -class EmbeddingsClient(EmbeddingsClientGenerated): - """EmbeddingsClient. - - :param endpoint: Service endpoint URL for AI model inference. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a AsyncTokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials_async.AsyncTokenCredential - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :keyword api_version: The API version to use for this operation. Default value is - "2024-05-01-preview". Note that overriding this default value may result in unsupported - behavior. 
- :paramtype api_version: str - """ - - def __init__( - self, - endpoint: str, - credential: Union[AzureKeyCredential, "AsyncTokenCredential"], - *, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> None: - - self._model_info: Optional[_models.ModelInfo] = None - - # Store default embeddings settings, to be applied in all future service calls - # unless overridden by arguments in the `embed` method. - self._dimensions = dimensions - self._encoding_format = encoding_format - self._input_type = input_type - self._model = model - self._model_extras = model_extras - - # For Key auth, we need to send these two auth HTTP request headers simultaneously: - # 1. "Authorization: Bearer " - # 2. "api-key: " - # This is because Serverless API, Managed Compute and GitHub endpoints support the first header, - # and Azure OpenAI and the new Unified Inference endpoints support the second header. - # The first header will be taken care of by auto-generated code. - # The second one is added here. - if isinstance(credential, AzureKeyCredential): - headers = kwargs.pop("headers", {}) - if "api-key" not in headers: - headers["api-key"] = credential.key - kwargs["headers"] = headers - - super().__init__(endpoint, credential, **kwargs) - - @overload - async def embed( - self, - *, - input: List[str], - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :keyword input: Input text to embed, encoded as a string or array of tokens. - To embed multiple inputs in a single request, pass an array - of strings or array of token arrays. Required. - :paramtype input: list[str] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. 
The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - async def embed( - self, - body: JSON, - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :param body: An object of type MutableMapping[str, Any], such as a dictionary, that - specifies the full request payload. Required. - :type body: JSON - :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. - Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - async def embed( - self, - body: IO[bytes], - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :param body: Specifies the full request payload. Required. - :type body: IO[bytes] - :keyword content_type: Body Parameter content-type. Content type parameter for binary body. - Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @distributed_trace_async - async def embed( - self, - body: Union[JSON, IO[bytes]] = _Unset, - *, - input: List[str] = _Unset, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - # pylint: disable=line-too-long - """Return the embedding vectors for given text prompts. - The method makes a REST API call to the `/embeddings` route on the given endpoint. - - :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type - that specifies the full request payload. Required. - :type body: JSON or IO[bytes] - :keyword input: Input text to embed, encoded as a string or array of tokens. - To embed multiple inputs in a single request, pass an array - of strings or array of token arrays. Required. - :paramtype input: list[str] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. 
- :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - error_map: MutableMapping[int, Type[HttpResponseError]] = { - 401: ClientAuthenticationError, - 404: ResourceNotFoundError, - 409: ResourceExistsError, - 304: ResourceNotModifiedError, - } - error_map.update(kwargs.pop("error_map", {}) or {}) - - _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) - _params = kwargs.pop("params", {}) or {} - _extra_parameters: Union[_models._enums.ExtraParameters, None] = None - - content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) - - if body is _Unset: - if input is _Unset: - raise TypeError("missing required argument: input") - body = { - "input": input, - "dimensions": dimensions if dimensions is not None else self._dimensions, - "encoding_format": encoding_format if encoding_format is not None else self._encoding_format, - "input_type": input_type if input_type is not None else self._input_type, - "model": model if model is not None else self._model, - } - if model_extras is not None and bool(model_extras): - body.update(model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - elif self._model_extras is not None and bool(self._model_extras): - body.update(self._model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - body = {k: v for k, v in body.items() if v is not None} - content_type = content_type or "application/json" - _content = None - if isinstance(body, (IOBase, bytes)): - _content = body - else: - _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore - - _request = build_embeddings_embed_request( - extra_params=_extra_parameters, - content_type=content_type, - api_version=self._config.api_version, - content=_content, - headers=_headers, - params=_params, - ) - path_format_arguments = { - "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), - } - _request.url = self._client.format_url(_request.url, **path_format_arguments) - - _stream = kwargs.pop("stream", False) - pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access - _request, stream=_stream, **kwargs - ) - - response = pipeline_response.http_response - - if response.status_code not in [200]: - if _stream: - await response.read() # Load the body in memory and close the socket - map_error(status_code=response.status_code, response=response, error_map=error_map) - raise HttpResponseError(response=response) - - if _stream: - deserialized = response.iter_bytes() - else: - deserialized = _deserialize( - _models._patch.EmbeddingsResult, response.json() # pylint: disable=protected-access - ) - - return deserialized # type: ignore - - @distributed_trace_async - async def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: - # pylint: disable=line-too-long - """Returns information about the AI 
model. - The method makes a REST API call to the ``/info`` route on the given endpoint. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :return: ModelInfo. The ModelInfo is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.ModelInfo - :raises ~azure.core.exceptions.HttpResponseError: - """ - if not self._model_info: - try: - self._model_info = await self._get_model_info( - **kwargs - ) # pylint: disable=attribute-defined-outside-init - except ResourceNotFoundError as error: - error.message = "Model information is not available on this endpoint (`/info` route not supported)." - raise error - - return self._model_info - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() - - -class ImageEmbeddingsClient(ImageEmbeddingsClientGenerated): - """ImageEmbeddingsClient. - - :param endpoint: Service endpoint URL for AI model inference. Required. - :type endpoint: str - :param credential: Credential used to authenticate requests to the service. Is either a - AzureKeyCredential type or a AsyncTokenCredential type. Required. - :type credential: ~azure.core.credentials.AzureKeyCredential or - ~azure.core.credentials_async.AsyncTokenCredential - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :keyword api_version: The API version to use for this operation. Default value is - "2024-05-01-preview". Note that overriding this default value may result in unsupported - behavior. - :paramtype api_version: str - """ - - def __init__( - self, - endpoint: str, - credential: Union[AzureKeyCredential, "AsyncTokenCredential"], - *, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> None: - - self._model_info: Optional[_models.ModelInfo] = None - - # Store default embeddings settings, to be applied in all future service calls - # unless overridden by arguments in the `embed` method. 
- self._dimensions = dimensions - self._encoding_format = encoding_format - self._input_type = input_type - self._model = model - self._model_extras = model_extras - - # For Key auth, we need to send these two auth HTTP request headers simultaneously: - # 1. "Authorization: Bearer " - # 2. "api-key: " - # This is because Serverless API, Managed Compute and GitHub endpoints support the first header, - # and Azure OpenAI and the new Unified Inference endpoints support the second header. - # The first header will be taken care of by auto-generated code. - # The second one is added here. - if isinstance(credential, AzureKeyCredential): - headers = kwargs.pop("headers", {}) - if "api-key" not in headers: - headers["api-key"] = credential.key - kwargs["headers"] = headers - - super().__init__(endpoint, credential, **kwargs) - - @overload - async def embed( - self, - *, - input: List[_models.ImageEmbeddingInput], - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an - array. - The input must not exceed the max input tokens for the model. Required. - :paramtype input: list[~azure.ai.inference.models.ImageEmbeddingInput] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - async def embed( - self, - body: JSON, - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :param body: An object of type MutableMapping[str, Any], such as a dictionary, that - specifies the full request payload. Required. - :type body: JSON - :keyword content_type: Body Parameter content-type. Content type parameter for JSON body. - Default value is "application/json". 
- :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @overload - async def embed( - self, - body: IO[bytes], - *, - content_type: str = "application/json", - **kwargs: Any, - ) -> _models.EmbeddingsResult: - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :param body: Specifies the full request payload. Required. - :type body: IO[bytes] - :keyword content_type: Body Parameter content-type. Content type parameter for binary body. - Default value is "application/json". - :paramtype content_type: str - :return: EmbeddingsResult. The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - - @distributed_trace_async - async def embed( - self, - body: Union[JSON, IO[bytes]] = _Unset, - *, - input: List[_models.ImageEmbeddingInput] = _Unset, - dimensions: Optional[int] = None, - encoding_format: Optional[Union[str, _models.EmbeddingEncodingFormat]] = None, - input_type: Optional[Union[str, _models.EmbeddingInputType]] = None, - model: Optional[str] = None, - model_extras: Optional[Dict[str, Any]] = None, - **kwargs: Any, - ) -> _models.EmbeddingsResult: - # pylint: disable=line-too-long - """Return the embedding vectors for given images. - The method makes a REST API call to the `/images/embeddings` route on the given endpoint. - - :param body: Is either a MutableMapping[str, Any] type (like a dictionary) or a IO[bytes] type - that specifies the full request payload. Required. - :type body: JSON or IO[bytes] - :keyword input: Input image to embed. To embed multiple inputs in a single request, pass an - array. - The input must not exceed the max input tokens for the model. Required. - :paramtype input: list[~azure.ai.inference.models.ImageEmbeddingInput] - :keyword dimensions: Optional. The number of dimensions the resulting output embeddings should - have. Default value is None. - :paramtype dimensions: int - :keyword encoding_format: Optional. The desired format for the returned embeddings. - Known values are: - "base64", "binary", "float", "int8", "ubinary", and "uint8". Default value is None. - :paramtype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat - :keyword input_type: Optional. The type of the input. Known values are: - "text", "query", and "document". Default value is None. - :paramtype input_type: str or ~azure.ai.inference.models.EmbeddingInputType - :keyword model: ID of the specific AI model to use, if more than one model is available on the - endpoint. Default value is None. - :paramtype model: str - :keyword model_extras: Additional, model-specific parameters that are not in the - standard request payload. They will be added as-is to the root of the JSON in the request body. - How the service handles these extra parameters depends on the value of the - ``extra-parameters`` request header. Default value is None. - :paramtype model_extras: dict[str, Any] - :return: EmbeddingsResult. 
The EmbeddingsResult is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.EmbeddingsResult - :raises ~azure.core.exceptions.HttpResponseError: - """ - error_map: MutableMapping[int, Type[HttpResponseError]] = { - 401: ClientAuthenticationError, - 404: ResourceNotFoundError, - 409: ResourceExistsError, - 304: ResourceNotModifiedError, - } - error_map.update(kwargs.pop("error_map", {}) or {}) - - _headers = case_insensitive_dict(kwargs.pop("headers", {}) or {}) - _params = kwargs.pop("params", {}) or {} - _extra_parameters: Union[_models._enums.ExtraParameters, None] = None - - content_type: Optional[str] = kwargs.pop("content_type", _headers.pop("Content-Type", None)) - - if body is _Unset: - if input is _Unset: - raise TypeError("missing required argument: input") - body = { - "input": input, - "dimensions": dimensions if dimensions is not None else self._dimensions, - "encoding_format": encoding_format if encoding_format is not None else self._encoding_format, - "input_type": input_type if input_type is not None else self._input_type, - "model": model if model is not None else self._model, - } - if model_extras is not None and bool(model_extras): - body.update(model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - elif self._model_extras is not None and bool(self._model_extras): - body.update(self._model_extras) - _extra_parameters = _models._enums.ExtraParameters.PASS_THROUGH # pylint: disable=protected-access - body = {k: v for k, v in body.items() if v is not None} - content_type = content_type or "application/json" - _content = None - if isinstance(body, (IOBase, bytes)): - _content = body - else: - _content = json.dumps(body, cls=SdkJSONEncoder, exclude_readonly=True) # type: ignore - - _request = build_image_embeddings_embed_request( - extra_params=_extra_parameters, - content_type=content_type, - api_version=self._config.api_version, - content=_content, - headers=_headers, - params=_params, - ) - path_format_arguments = { - "endpoint": self._serialize.url("self._config.endpoint", self._config.endpoint, "str", skip_quote=True), - } - _request.url = self._client.format_url(_request.url, **path_format_arguments) - - _stream = kwargs.pop("stream", False) - pipeline_response: PipelineResponse = await self._client._pipeline.run( # type: ignore # pylint: disable=protected-access - _request, stream=_stream, **kwargs - ) - - response = pipeline_response.http_response - - if response.status_code not in [200]: - if _stream: - await response.read() # Load the body in memory and close the socket - map_error(status_code=response.status_code, response=response, error_map=error_map) - raise HttpResponseError(response=response) - - if _stream: - deserialized = response.iter_bytes() - else: - deserialized = _deserialize( - _models._patch.EmbeddingsResult, response.json() # pylint: disable=protected-access - ) - - return deserialized # type: ignore - - @distributed_trace_async - async def get_model_info(self, **kwargs: Any) -> _models.ModelInfo: - # pylint: disable=line-too-long - """Returns information about the AI model. - The method makes a REST API call to the ``/info`` route on the given endpoint. - This method will only work when using Serverless API or Managed Compute endpoint. - It will not work for GitHub Models endpoint or Azure OpenAI endpoint. - - :return: ModelInfo. 
The ModelInfo is compatible with MutableMapping - :rtype: ~azure.ai.inference.models.ModelInfo - :raises ~azure.core.exceptions.HttpResponseError: - """ - if not self._model_info: - try: - self._model_info = await self._get_model_info( - **kwargs - ) # pylint: disable=attribute-defined-outside-init - except ResourceNotFoundError as error: - error.message = "Model information is not available on this endpoint (`/info` route not supported)." - raise error - - return self._model_info - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return super().__str__() + f"\n{self._model_info}" if self._model_info else super().__str__() - - -__all__: List[str] = [ - "load_client", - "ChatCompletionsClient", - "EmbeddingsClient", - "ImageEmbeddingsClient", -] # Add all objects you want publicly available to users at this package level +__all__: List[str] = [] # Add all objects you want publicly available to users at this package level def patch_sdk(): diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py index 66e625705c58..5dfafa1a420d 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/__init__.py @@ -14,7 +14,8 @@ from ._models import ( # type: ignore - AudioContentItem, + AudioDataContentItem, + AudioUrlContentItem, ChatChoice, ChatCompletions, ChatCompletionsNamedToolChoice, @@ -33,6 +34,7 @@ ImageEmbeddingInput, ImageUrl, InputAudio, + InputAudioUrl, JsonSchemaFormat, ModelInfo, StreamingChatChoiceUpdate, @@ -57,7 +59,8 @@ from ._patch import patch_sdk as _patch_sdk __all__ = [ - "AudioContentItem", + "AudioDataContentItem", + "AudioUrlContentItem", "ChatChoice", "ChatCompletions", "ChatCompletionsNamedToolChoice", @@ -76,6 +79,7 @@ "ImageEmbeddingInput", "ImageUrl", "InputAudio", + "InputAudioUrl", "JsonSchemaFormat", "ModelInfo", "StreamingChatChoiceUpdate", diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py index 53934528434f..88896f3913c7 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_models.py @@ -1,4 +1,4 @@ -# pylint: disable=too-many-lines +# pylint: disable=line-too-long,useless-suppression,too-many-lines # coding=utf-8 # -------------------------------------------------------------------------- # Copyright (c) Microsoft Corporation. All rights reserved. @@ -23,14 +23,14 @@ class ContentItem(_model_base.Model): """An abstract representation of a structured content item within a chat message. You probably want to use the sub-classes and not this class directly. Known sub-classes are: - ImageContentItem, AudioContentItem, TextContentItem + AudioUrlContentItem, ImageContentItem, AudioDataContentItem, TextContentItem :ivar type: The discriminated object type. Required. Default value is None. :vartype type: str """ __mapping__: Dict[str, _model_base.Model] = {} - type: str = rest_discriminator(name="type") + type: str = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) """The discriminated object type. Required. 
Default value is None.""" @overload @@ -51,21 +51,21 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) -class AudioContentItem(ContentItem, discriminator="input_audio"): - """A structured chat content item containing an audio content. +class AudioDataContentItem(ContentItem, discriminator="input_audio"): + """A structured chat content item for audio content passed as base64 encoded data. :ivar type: The discriminated object type: always 'input_audio' for this type. Required. Default value is "input_audio". :vartype type: str - :ivar input_audio: The details of the input audio. Required. + :ivar input_audio: The details of the input audio data. Required. :vartype input_audio: ~azure.ai.inference.models.InputAudio """ - type: Literal["input_audio"] = rest_discriminator(name="type") # type: ignore + type: Literal["input_audio"] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore """The discriminated object type: always 'input_audio' for this type. Required. Default value is \"input_audio\".""" - input_audio: "_models.InputAudio" = rest_field() - """The details of the input audio. Required.""" + input_audio: "_models.InputAudio" = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The details of the input audio data. Required.""" @overload def __init__( @@ -85,13 +85,46 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, type="input_audio", **kwargs) +class AudioUrlContentItem(ContentItem, discriminator="audio_url"): + """A structured chat content item for audio content passed as a url. + + :ivar type: The discriminated object type: always 'audio_url' for this type. Required. Default + value is "audio_url". + :vartype type: str + :ivar audio_url: The details of the audio url. Required. + :vartype audio_url: ~azure.ai.inference.models.InputAudioUrl + """ + + type: Literal["audio_url"] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore + """The discriminated object type: always 'audio_url' for this type. Required. Default value is + \"audio_url\".""" + audio_url: "_models.InputAudioUrl" = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The details of the audio url. Required.""" + + @overload + def __init__( + self, + *, + audio_url: "_models.InputAudioUrl", + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, type="audio_url", **kwargs) + + class ChatChoice(_model_base.Model): """The representation of a single prompt completion as part of an overall chat completions request. Generally, ``n`` choices are generated per provided prompt with a default value of 1. Token limits and other settings may limit the number of choices generated. - :ivar index: The ordered index associated with this chat completions choice. Required. :vartype index: int :ivar finish_reason: The reason that this chat completions choice completed its generated. @@ -101,12 +134,14 @@ class ChatChoice(_model_base.Model): :vartype message: ~azure.ai.inference.models.ChatResponseMessage """ - index: int = rest_field() + index: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The ordered index associated with this chat completions choice. 
Required.""" - finish_reason: Union[str, "_models.CompletionsFinishReason"] = rest_field() + finish_reason: Union[str, "_models.CompletionsFinishReason"] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) """The reason that this chat completions choice completed its generated. Required. Known values are: \"stop\", \"length\", \"content_filter\", and \"tool_calls\".""" - message: "_models.ChatResponseMessage" = rest_field() + message: "_models.ChatResponseMessage" = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The chat message for a given chat completions prompt. Required.""" @overload @@ -135,7 +170,6 @@ class ChatCompletions(_model_base.Model): "completes" provided prompt data. - :ivar id: A unique identifier associated with this chat completions response. Required. :vartype id: str :ivar created: The first timestamp associated with generation activity for this completions @@ -153,18 +187,20 @@ class ChatCompletions(_model_base.Model): :vartype usage: ~azure.ai.inference.models.CompletionsUsage """ - id: str = rest_field() + id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """A unique identifier associated with this chat completions response. Required.""" - created: datetime.datetime = rest_field(format="unix-timestamp") + created: datetime.datetime = rest_field( + visibility=["read", "create", "update", "delete", "query"], format="unix-timestamp" + ) """The first timestamp associated with generation activity for this completions response, represented as seconds since the beginning of the Unix epoch of 00:00 on 1 Jan 1970. Required.""" - model: str = rest_field() + model: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The model used for the chat completion. Required.""" - choices: List["_models.ChatChoice"] = rest_field() + choices: List["_models.ChatChoice"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The collection of completions choices associated with this completions response. Generally, ``n`` choices are generated per provided prompt with a default value of 1. Token limits and other settings may limit the number of choices generated. Required.""" - usage: "_models.CompletionsUsage" = rest_field() + usage: "_models.CompletionsUsage" = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Usage information for tokens processed and generated as part of this completions operation. Required.""" @@ -201,10 +237,12 @@ class ChatCompletionsNamedToolChoice(_model_base.Model): :vartype function: ~azure.ai.inference.models.ChatCompletionsNamedToolChoiceFunction """ - type: Literal["function"] = rest_field() + type: Literal["function"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The type of the tool. Currently, only ``function`` is supported. Required. Default value is \"function\".""" - function: "_models.ChatCompletionsNamedToolChoiceFunction" = rest_field() + function: "_models.ChatCompletionsNamedToolChoiceFunction" = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) """The function that should be called. Required.""" @overload @@ -234,7 +272,7 @@ class ChatCompletionsNamedToolChoiceFunction(_model_base.Model): :vartype name: str """ - name: str = rest_field() + name: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The name of the function that should be called. 
Required.""" @overload @@ -255,6 +293,198 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) +class ChatCompletionsOptions(_model_base.Model): + """The configuration information for a chat completions request. + Completions support a wide variety of tasks and generate text that continues from or + "completes" + provided prompt data. + + :ivar messages: The collection of context messages associated with this chat completions + request. + Typical usage begins with a chat message for the System role that provides instructions for + the behavior of the assistant, followed by alternating messages between the User and + Assistant roles. Required. + :vartype messages: list[~azure.ai.inference.models._models.ChatRequestMessage] + :ivar frequency_penalty: A value that influences the probability of generated tokens appearing + based on their cumulative + frequency in generated text. + Positive values will make tokens less likely to appear as their frequency increases and + decrease the likelihood of the model repeating the same statements verbatim. + Supported range is [-2, 2]. + :vartype frequency_penalty: float + :ivar stream: A value indicating whether chat completions should be streamed for this request. + :vartype stream: bool + :ivar presence_penalty: A value that influences the probability of generated tokens appearing + based on their existing + presence in generated text. + Positive values will make tokens less likely to appear when they already exist and increase the + model's likelihood to output new topics. + Supported range is [-2, 2]. + :vartype presence_penalty: float + :ivar temperature: The sampling temperature to use that controls the apparent creativity of + generated completions. + Higher values will make output more random while lower values will make results more focused + and deterministic. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. + :vartype temperature: float + :ivar top_p: An alternative to sampling with temperature called nucleus sampling. This value + causes the + model to consider the results of tokens with the provided probability mass. As an example, a + value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be + considered. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1]. + :vartype top_p: float + :ivar max_tokens: The maximum number of tokens to generate. + :vartype max_tokens: int + :ivar response_format: An object specifying the format that the model must output. + + Setting to ``{ "type": "json_schema", "json_schema": {...} }`` enables Structured Outputs which + ensures the model will match your supplied JSON schema. + + Setting to ``{ "type": "json_object" }`` enables JSON mode, which ensures the message the model + generates is valid JSON. + + **Important:** when using JSON mode, you **must** also instruct the model to produce JSON + yourself via a system or user message. Without this, the model may generate an unending stream + of whitespace until the generation reaches the token limit, resulting in a long-running and + seemingly "stuck" request. 
Also note that the message content may be partially cut off if + ``finish_reason="length"``, which indicates the generation exceeded ``max_tokens`` or the + conversation exceeded the max context length. + :vartype response_format: ~azure.ai.inference.models._models.ChatCompletionsResponseFormat + :ivar stop: A collection of textual sequences that will end completions generation. + :vartype stop: list[str] + :ivar tools: A list of tools the model may request to call. Currently, only functions are + supported as a tool. The model + may respond with a function call request and provide the input arguments in JSON format for + that function. + :vartype tools: list[~azure.ai.inference.models.ChatCompletionsToolDefinition] + :ivar tool_choice: If specified, the model will configure which of the provided tools it can + use for the chat completions response. Is either a Union[str, + "_models.ChatCompletionsToolChoicePreset"] type or a ChatCompletionsNamedToolChoice type. + :vartype tool_choice: str or ~azure.ai.inference.models.ChatCompletionsToolChoicePreset or + ~azure.ai.inference.models.ChatCompletionsNamedToolChoice + :ivar seed: If specified, the system will make a best effort to sample deterministically such + that repeated requests with the + same seed and parameters should return the same result. Determinism is not guaranteed. + :vartype seed: int + :ivar model: ID of the specific AI model to use, if more than one model is available on the + endpoint. + :vartype model: str + """ + + messages: List["_models._models.ChatRequestMessage"] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """The collection of context messages associated with this chat completions request. + Typical usage begins with a chat message for the System role that provides instructions for + the behavior of the assistant, followed by alternating messages between the User and + Assistant roles. Required.""" + frequency_penalty: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """A value that influences the probability of generated tokens appearing based on their cumulative + frequency in generated text. + Positive values will make tokens less likely to appear as their frequency increases and + decrease the likelihood of the model repeating the same statements verbatim. + Supported range is [-2, 2].""" + stream: Optional[bool] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """A value indicating whether chat completions should be streamed for this request.""" + presence_penalty: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """A value that influences the probability of generated tokens appearing based on their existing + presence in generated text. + Positive values will make tokens less likely to appear when they already exist and increase the + model's likelihood to output new topics. + Supported range is [-2, 2].""" + temperature: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The sampling temperature to use that controls the apparent creativity of generated completions. + Higher values will make output more random while lower values will make results more focused + and deterministic. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict.
+ Supported range is [0, 1].""" + top_p: Optional[float] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """An alternative to sampling with temperature called nucleus sampling. This value causes the + model to consider the results of tokens with the provided probability mass. As an example, a + value of 0.15 will cause only the tokens comprising the top 15% of probability mass to be + considered. + It is not recommended to modify temperature and top_p for the same completions request as the + interaction of these two settings is difficult to predict. + Supported range is [0, 1].""" + max_tokens: Optional[int] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The maximum number of tokens to generate.""" + response_format: Optional["_models._models.ChatCompletionsResponseFormat"] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """An object specifying the format that the model must output. + + Setting to ``{ \"type\": \"json_schema\", \"json_schema\": {...} }`` enables Structured Outputs + which ensures the model will match your supplied JSON schema. + + Setting to ``{ \"type\": \"json_object\" }`` enables JSON mode, which ensures the message the + model generates is valid JSON. + + **Important:** when using JSON mode, you **must** also instruct the model to produce JSON + yourself via a system or user message. Without this, the model may generate an unending stream + of whitespace until the generation reaches the token limit, resulting in a long-running and + seemingly \"stuck\" request. Also note that the message content may be partially cut off if + ``finish_reason=\"length\"``, which indicates the generation exceeded ``max_tokens`` or the + conversation exceeded the max context length.""" + stop: Optional[List[str]] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """A collection of textual sequences that will end completions generation.""" + tools: Optional[List["_models.ChatCompletionsToolDefinition"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """A list of tools the model may request to call. Currently, only functions are supported as a + tool. The model + may respond with a function call request and provide the input arguments in JSON format for + that function.""" + tool_choice: Optional[ + Union[str, "_models.ChatCompletionsToolChoicePreset", "_models.ChatCompletionsNamedToolChoice"] + ] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """If specified, the model will configure which of the provided tools it can use for the chat + completions response. Is either a Union[str, \"_models.ChatCompletionsToolChoicePreset\"] type + or a ChatCompletionsNamedToolChoice type.""" + seed: Optional[int] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """If specified, the system will make a best effort to sample deterministically such that repeated + requests with the + same seed and parameters should return the same result.
Determinism is not guaranteed.""" + model: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """ID of the specific AI model to use, if more than one model is available on the endpoint.""" + + @overload + def __init__( + self, + *, + messages: List["_models._models.ChatRequestMessage"], + frequency_penalty: Optional[float] = None, + stream: Optional[bool] = None, + presence_penalty: Optional[float] = None, + temperature: Optional[float] = None, + top_p: Optional[float] = None, + max_tokens: Optional[int] = None, + response_format: Optional["_models._models.ChatCompletionsResponseFormat"] = None, + stop: Optional[List[str]] = None, + tools: Optional[List["_models.ChatCompletionsToolDefinition"]] = None, + tool_choice: Optional[ + Union[str, "_models.ChatCompletionsToolChoicePreset", "_models.ChatCompletionsNamedToolChoice"] + ] = None, + seed: Optional[int] = None, + model: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + class ChatCompletionsResponseFormat(_model_base.Model): """Represents the format that the model must output. Use this to enable JSON mode instead of the default text mode. @@ -272,7 +502,7 @@ class ChatCompletionsResponseFormat(_model_base.Model): """ __mapping__: Dict[str, _model_base.Model] = {} - type: str = rest_discriminator(name="type") + type: str = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) """The response format type to use for chat completions. Required. Default value is None.""" @overload @@ -304,7 +534,7 @@ class ChatCompletionsResponseFormatJsonObject(ChatCompletionsResponseFormat, dis :vartype type: str """ - type: Literal["json_object"] = rest_discriminator(name="type") # type: ignore + type: Literal["json_object"] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore """Response format type: always 'json_object' for this object. Required. Default value is \"json_object\".""" @@ -337,10 +567,10 @@ class ChatCompletionsResponseFormatJsonSchema(ChatCompletionsResponseFormat, dis :vartype json_schema: ~azure.ai.inference.models.JsonSchemaFormat """ - type: Literal["json_schema"] = rest_discriminator(name="type") # type: ignore + type: Literal["json_schema"] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore """The type of response format being defined: ``json_schema``. Required. Default value is \"json_schema\".""" - json_schema: "_models.JsonSchemaFormat" = rest_field() + json_schema: "_models.JsonSchemaFormat" = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The definition of the required JSON schema in the response, and associated metadata. Required.""" @overload @@ -370,7 +600,7 @@ class ChatCompletionsResponseFormatText(ChatCompletionsResponseFormat, discrimin :vartype type: str """ - type: Literal["text"] = rest_discriminator(name="type") # type: ignore + type: Literal["text"] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore """Response format type: always 'text' for this object. Required. 
Default value is \"text\".""" @overload @@ -401,12 +631,12 @@ class ChatCompletionsToolCall(_model_base.Model): :vartype function: ~azure.ai.inference.models.FunctionCall """ - id: str = rest_field() + id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The ID of the tool call. Required.""" - type: Literal["function"] = rest_field() + type: Literal["function"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The type of tool call. Currently, only ``function`` is supported. Required. Default value is \"function\".""" - function: "_models.FunctionCall" = rest_field() + function: "_models.FunctionCall" = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The details of the function call requested by the AI model. Required.""" @overload @@ -439,10 +669,10 @@ class ChatCompletionsToolDefinition(_model_base.Model): :vartype function: ~azure.ai.inference.models.FunctionDefinition """ - type: Literal["function"] = rest_field() + type: Literal["function"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The type of the tool. Currently, only ``function`` is supported. Required. Default value is \"function\".""" - function: "_models.FunctionDefinition" = rest_field() + function: "_models.FunctionDefinition" = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The function definition details for the function tool. Required.""" @overload @@ -477,7 +707,7 @@ class ChatRequestMessage(_model_base.Model): """ __mapping__: Dict[str, _model_base.Model] = {} - role: str = rest_discriminator(name="role") + role: str = rest_discriminator(name="role", visibility=["read", "create", "update", "delete", "query"]) """The chat role associated with this message. Required. Known values are: \"system\", \"user\", \"assistant\", \"tool\", and \"developer\".""" @@ -514,12 +744,14 @@ class ChatRequestAssistantMessage(ChatRequestMessage, discriminator="assistant") :vartype tool_calls: list[~azure.ai.inference.models.ChatCompletionsToolCall] """ - role: Literal[ChatRole.ASSISTANT] = rest_discriminator(name="role") # type: ignore + role: Literal[ChatRole.ASSISTANT] = rest_discriminator(name="role", visibility=["read", "create", "update", "delete", "query"]) # type: ignore """The chat role associated with this message, which is always 'assistant' for assistant messages. Required. The role that provides responses to system-instructed, user-prompted input.""" - content: Optional[str] = rest_field() + content: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The content of the message.""" - tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = rest_field() + tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) """The tool calls that must be resolved and have their outputs appended to subsequent input messages for the chat completions request to resolve as configured.""" @@ -544,7 +776,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class ChatRequestDeveloperMessage(ChatRequestMessage, discriminator="developer"): - """A request chat message containing system instructions that influence how the model will + """A request chat message containing developer instructions that influence how the model will generate a chat completions response. Some AI models support a developer message instead of a system message. 
@@ -556,11 +788,11 @@ class ChatRequestDeveloperMessage(ChatRequestMessage, discriminator="developer") :vartype content: str """ - role: Literal[ChatRole.DEVELOPER] = rest_discriminator(name="role") # type: ignore + role: Literal[ChatRole.DEVELOPER] = rest_discriminator(name="role", visibility=["read", "create", "update", "delete", "query"]) # type: ignore """The chat role associated with this message, which is always 'developer' for developer messages. Required. The role that instructs or sets the behavior of the assistant. Some AI models support this role instead of the 'system' role.""" - content: str = rest_field() + content: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The contents of the developer message. Required.""" @overload @@ -593,10 +825,10 @@ class ChatRequestSystemMessage(ChatRequestMessage, discriminator="system"): :vartype content: str """ - role: Literal[ChatRole.SYSTEM] = rest_discriminator(name="role") # type: ignore + role: Literal[ChatRole.SYSTEM] = rest_discriminator(name="role", visibility=["read", "create", "update", "delete", "query"]) # type: ignore """The chat role associated with this message, which is always 'system' for system messages. Required. The role that instructs or sets the behavior of the assistant.""" - content: str = rest_field() + content: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The contents of the system message. Required.""" @overload @@ -630,12 +862,12 @@ class ChatRequestToolMessage(ChatRequestMessage, discriminator="tool"): :vartype tool_call_id: str """ - role: Literal[ChatRole.TOOL] = rest_discriminator(name="role") # type: ignore + role: Literal[ChatRole.TOOL] = rest_discriminator(name="role", visibility=["read", "create", "update", "delete", "query"]) # type: ignore """The chat role associated with this message, which is always 'tool' for tool messages. Required. The role that represents extension tool activity within a chat completions operation.""" - content: Optional[str] = rest_field() + content: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The content of the message.""" - tool_call_id: str = rest_field() + tool_call_id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The ID of the tool call resolved by the provided content. Required.""" @overload @@ -668,10 +900,12 @@ class ChatRequestUserMessage(ChatRequestMessage, discriminator="user"): :vartype content: str or list[~azure.ai.inference.models.ContentItem] """ - role: Literal[ChatRole.USER] = rest_discriminator(name="role") # type: ignore + role: Literal[ChatRole.USER] = rest_discriminator(name="role", visibility=["read", "create", "update", "delete", "query"]) # type: ignore """The chat role associated with this message, which is always 'user' for user messages. Required. The role that provides input for chat completions.""" - content: Union["str", List["_models.ContentItem"]] = rest_field() + content: Union[str, List["_models.ContentItem"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) """The contents of the user message, with available input types varying by selected model. Required. Is either a str type or a [ContentItem] type.""" @@ -696,7 +930,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class ChatResponseMessage(_model_base.Model): """A representation of a chat message as received in a response. - :ivar role: The chat role associated with the message. Required. 
Known values are: "system", "user", "assistant", "tool", and "developer". :vartype role: str or ~azure.ai.inference.models.ChatRole @@ -708,12 +941,14 @@ class ChatResponseMessage(_model_base.Model): :vartype tool_calls: list[~azure.ai.inference.models.ChatCompletionsToolCall] """ - role: Union[str, "_models.ChatRole"] = rest_field() + role: Union[str, "_models.ChatRole"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The chat role associated with the message. Required. Known values are: \"system\", \"user\", \"assistant\", \"tool\", and \"developer\".""" - content: str = rest_field() + content: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The content of the message. Required.""" - tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = rest_field() + tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) """The tool calls that must be resolved and have their outputs appended to subsequent input messages for the chat completions request to resolve as configured.""" @@ -743,7 +978,6 @@ class CompletionsUsage(_model_base.Model): Counts consider all tokens across prompts, choices, choice alternates, best_of generations, and other consumers. - :ivar completion_tokens: The number of tokens generated across all completions emissions. Required. :vartype completion_tokens: int @@ -755,11 +989,11 @@ class CompletionsUsage(_model_base.Model): :vartype total_tokens: int """ - completion_tokens: int = rest_field() + completion_tokens: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The number of tokens generated across all completions emissions. Required.""" - prompt_tokens: int = rest_field() + prompt_tokens: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The number of tokens in the provided prompts for the completions request. Required.""" - total_tokens: int = rest_field() + total_tokens: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The total number of tokens processed for the completions request and response. Required.""" @overload @@ -785,7 +1019,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class EmbeddingItem(_model_base.Model): """Representation of a single embeddings relatedness comparison. - :ivar embedding: List of embedding values for the input prompt. These represent a measurement of the vector-based relatedness of the provided input. Or a base64 encoded string of the embedding @@ -795,11 +1028,11 @@ class EmbeddingItem(_model_base.Model): :vartype index: int """ - embedding: Union["str", List[float]] = rest_field() + embedding: Union[str, List[float]] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """List of embedding values for the input prompt. These represent a measurement of the vector-based relatedness of the provided input. Or a base64 encoded string of the embedding vector. Required. Is either a str type or a [float] type.""" - index: int = rest_field() + index: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Index of the prompt to which the EmbeddingItem corresponds. Required.""" @overload @@ -821,13 +1054,80 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) +class EmbeddingsOptions(_model_base.Model): + """The configuration information for an embeddings request. 
+ + :ivar input: Input text to embed, encoded as a string or array of tokens. + To embed multiple inputs in a single request, pass an array + of strings or array of token arrays. Required. + :vartype input: list[str] + :ivar dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. + :vartype dimensions: int + :ivar encoding_format: Optional. The desired format for the returned embeddings. Known values + are: "base64", "binary", "float", "int8", "ubinary", and "uint8". + :vartype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :ivar input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". + :vartype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :ivar model: ID of the specific AI model to use, if more than one model is available on the + endpoint. + :vartype model: str + """ + + input: List[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Input text to embed, encoded as a string or array of tokens. + To embed multiple inputs in a single request, pass an array + of strings or array of token arrays. Required.""" + dimensions: Optional[int] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Optional. The number of dimensions the resulting output embeddings should have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter.""" + encoding_format: Optional[Union[str, "_models.EmbeddingEncodingFormat"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """Optional. The desired format for the returned embeddings. Known values are: \"base64\", + \"binary\", \"float\", \"int8\", \"ubinary\", and \"uint8\".""" + input_type: Optional[Union[str, "_models.EmbeddingInputType"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + \"text\", \"query\", and \"document\".""" + model: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """ID of the specific AI model to use, if more than one model is available on the endpoint.""" + + @overload + def __init__( + self, + *, + input: List[str], + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, "_models.EmbeddingEncodingFormat"]] = None, + input_type: Optional[Union[str, "_models.EmbeddingInputType"]] = None, + model: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + class EmbeddingsResult(_model_base.Model): """Representation of the response data from an embeddings request. Embeddings measure the relatedness of text strings and are commonly used for search, clustering, recommendations, and other similar scenarios. - :ivar id: Unique identifier for the embeddings result. Required. :vartype id: str :ivar data: Embedding values for the prompts submitted in the request. Required. 
@@ -838,13 +1138,13 @@ class EmbeddingsResult(_model_base.Model): :vartype model: str """ - id: str = rest_field() + id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Unique identifier for the embeddings result. Required.""" - data: List["_models.EmbeddingItem"] = rest_field() + data: List["_models.EmbeddingItem"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Embedding values for the prompts submitted in the request. Required.""" - usage: "_models.EmbeddingsUsage" = rest_field() + usage: "_models.EmbeddingsUsage" = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Usage counts for tokens input using the embeddings API. Required.""" - model: str = rest_field() + model: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The model ID used to generate this result. Required.""" @overload @@ -871,7 +1171,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class EmbeddingsUsage(_model_base.Model): """Measurement of the amount of tokens used in this request and response. - :ivar prompt_tokens: Number of tokens in the request. Required. :vartype prompt_tokens: int :ivar total_tokens: Total number of tokens transacted in this request/response. Should equal @@ -880,9 +1179,9 @@ class EmbeddingsUsage(_model_base.Model): :vartype total_tokens: int """ - prompt_tokens: int = rest_field() + prompt_tokens: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Number of tokens in the request. Required.""" - total_tokens: int = rest_field() + total_tokens: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Total number of tokens transacted in this request/response. Should equal the number of tokens in the request. Required.""" @@ -908,7 +1207,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class FunctionCall(_model_base.Model): """The name and arguments of a function that should be called, as generated by the model. - :ivar name: The name of the function to call. Required. :vartype name: str :ivar arguments: The arguments to call the function with, as generated by the model in JSON @@ -919,9 +1217,9 @@ class FunctionCall(_model_base.Model): :vartype arguments: str """ - name: str = rest_field() + name: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The name of the function to call. Required.""" - arguments: str = rest_field() + arguments: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The arguments to call the function with, as generated by the model in JSON format. Note that the model does not always generate valid JSON, and may hallucinate parameters not defined by your function schema. Validate the arguments in your code before calling @@ -957,16 +1255,16 @@ class FunctionDefinition(_model_base.Model): interpreting its parameters. :vartype description: str :ivar parameters: The parameters the function accepts, described as a JSON Schema object. - :vartype parameters: any + :vartype parameters: dict[str, any] """ - name: str = rest_field() + name: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The name of the function to be called. Required.""" - description: Optional[str] = rest_field() + description: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """A description of what the function does. 
The model will use this description when selecting the function and + interpreting its parameters.""" - parameters: Optional[Any] = rest_field() + parameters: Optional[Dict[str, Any]] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The parameters the function accepts, described as a JSON Schema object.""" @overload @@ -975,7 +1273,7 @@ def __init__( *, name: str, description: Optional[str] = None, - parameters: Optional[Any] = None, + parameters: Optional[Dict[str, Any]] = None, ) -> None: ... @overload @@ -1000,10 +1298,10 @@ class ImageContentItem(ContentItem, discriminator="image_url"): :vartype image_url: ~azure.ai.inference.models.ImageUrl """ - type: Literal["image_url"] = rest_discriminator(name="type") # type: ignore + type: Literal["image_url"] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore """The discriminated object type: always 'image_url' for this type. Required. Default value is \"image_url\".""" - image_url: "_models.ImageUrl" = rest_field() + image_url: "_models.ImageUrl" = rest_field(visibility=["read", "create", "update", "delete", "query"]) """An internet location, which must be accessible to the model,from which the image may be retrieved. Required.""" @@ -1036,10 +1334,10 @@ class ImageEmbeddingInput(_model_base.Model): :vartype text: str """ - image: str = rest_field() + image: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The input image encoded in base64 string as a data URL. Example: ``data:image/{format};base64,{data}``. Required.""" - text: Optional[str] = rest_field() + text: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Optional. The text input to feed into the model (like DINO, CLIP). Returns a 422 error if the model doesn't support the value or parameter.""" @@ -1062,6 +1360,77 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) + +class ImageEmbeddingsOptions(_model_base.Model): + """The configuration information for an image embeddings request. + + :ivar input: Input image to embed. To embed multiple inputs in a single request, pass an array. + The input must not exceed the max input tokens for the model. Required. + :vartype input: list[~azure.ai.inference.models.ImageEmbeddingInput] + :ivar dimensions: Optional. The number of dimensions the resulting output embeddings should + have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. + :vartype dimensions: int + :ivar encoding_format: Optional. The desired format for the returned embeddings. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "base64", "binary", "float", "int8", "ubinary", and "uint8". + :vartype encoding_format: str or ~azure.ai.inference.models.EmbeddingEncodingFormat + :ivar input_type: Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + "text", "query", and "document". + :vartype input_type: str or ~azure.ai.inference.models.EmbeddingInputType + :ivar model: ID of the specific AI model to use, if more than one model is available on the + endpoint.
+ :vartype model: str + """ + + input: List["_models.ImageEmbeddingInput"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Input image to embed. To embed multiple inputs in a single request, pass an array. + The input must not exceed the max input tokens for the model. Required.""" + dimensions: Optional[int] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """Optional. The number of dimensions the resulting output embeddings should have. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter.""" + encoding_format: Optional[Union[str, "_models.EmbeddingEncodingFormat"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """Optional. The desired format for the returned embeddings. + Passing null causes the model to use its default value. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + \"base64\", \"binary\", \"float\", \"int8\", \"ubinary\", and \"uint8\".""" + input_type: Optional[Union[str, "_models.EmbeddingInputType"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) + """Optional. The type of the input. + Returns a 422 error if the model doesn't support the value or parameter. Known values are: + \"text\", \"query\", and \"document\".""" + model: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """ID of the specific AI model to use, if more than one model is available on the endpoint.""" + + @overload + def __init__( + self, + *, + input: List["_models.ImageEmbeddingInput"], + dimensions: Optional[int] = None, + encoding_format: Optional[Union[str, "_models.EmbeddingEncodingFormat"]] = None, + input_type: Optional[Union[str, "_models.EmbeddingInputType"]] = None, + model: Optional[str] = None, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + + class ImageUrl(_model_base.Model): """An internet location from which the model may retrieve an image. @@ -1073,9 +1442,11 @@ class ImageUrl(_model_base.Model): :vartype detail: str or ~azure.ai.inference.models.ImageDetailLevel """ - url: str = rest_field() + url: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The URL of the image. Required.""" - detail: Optional[Union[str, "_models.ImageDetailLevel"]] = rest_field() + detail: Optional[Union[str, "_models.ImageDetailLevel"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) """The evaluation quality setting to use, which controls relative prioritization of speed, token consumption, and accuracy. Known values are: \"auto\", \"low\", and \"high\".""" @@ -1100,7 +1471,7 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class InputAudio(_model_base.Model): - """The details of an audio chat message content part. + """The details of the input audio data. :ivar data: Base64 encoded audio data. Required. :vartype data: str @@ -1109,9 +1480,11 @@ class InputAudio(_model_base.Model): :vartype format: str or ~azure.ai.inference.models.AudioContentFormat """ - data: str = rest_field() + data: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Base64 encoded audio data.
Required.""" - format: Union[str, "_models.AudioContentFormat"] = rest_field() + format: Union[str, "_models.AudioContentFormat"] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) """The audio format of the audio content. Required. Known values are: \"wav\" and \"mp3\".""" @overload @@ -1133,6 +1506,34 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: super().__init__(*args, **kwargs) +class InputAudioUrl(_model_base.Model): + """The details of the audio url. + + :ivar url: The URL of the audio content. Required. + :vartype url: str + """ + + url: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The URL of the audio content. Required.""" + + @overload + def __init__( + self, + *, + url: str, + ) -> None: ... + + @overload + def __init__(self, mapping: Mapping[str, Any]) -> None: + """ + :param mapping: raw JSON to initialize the model. + :type mapping: Mapping[str, Any] + """ + + def __init__(self, *args: Any, **kwargs: Any) -> None: + super().__init__(*args, **kwargs) + + class JsonSchemaFormat(_model_base.Model): """Defines the response format for chat completions as JSON with a given schema. The AI model will need to adhere to this schema when generating completions. @@ -1141,7 +1542,8 @@ class JsonSchemaFormat(_model_base.Model): and dashes, with a maximum length of 64. Required. :vartype name: str :ivar schema: The definition of the JSON schema. See - https://json-schema.org/overview/what-is-jsonschema. + `https://json-schema.org/overview/what-is-jsonschema + `_. Note that AI models usually only support a subset of the keywords defined by JSON schema. Consult your AI model documentation to determine what is supported. Required. :vartype schema: dict[str, any] @@ -1157,17 +1559,18 @@ class JsonSchemaFormat(_model_base.Model): :vartype strict: bool """ - name: str = rest_field() + name: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """A name that labels this JSON schema. Must be a-z, A-Z, 0-9, or contain underscores and dashes, with a maximum length of 64. Required.""" - schema: Dict[str, Any] = rest_field() - """The definition of the JSON schema. See https://json-schema.org/overview/what-is-jsonschema. + schema: Dict[str, Any] = rest_field(visibility=["read", "create", "update", "delete", "query"]) + """The definition of the JSON schema. See `https://json-schema.org/overview/what-is-jsonschema + `_. Note that AI models usually only support a subset of the keywords defined by JSON schema. Consult your AI model documentation to determine what is supported. Required.""" - description: Optional[str] = rest_field() + description: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """A description of the response format, used by the AI model to determine how to generate responses in this format.""" - strict: Optional[bool] = rest_field() + strict: Optional[bool] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """If set to true, the service will error out if the provided JSON schema contains keywords not supported by the AI model. An example of such keyword may be ``maxLength`` for JSON type ``string``. @@ -1198,7 +1601,6 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class ModelInfo(_model_base.Model): """Represents some basic information about the AI model. - :ivar model_name: The name of the AI model. For example: ``Phi21``. Required. :vartype model_name: str :ivar model_type: The type of the AI model. 
A Unique identifier for the profile. Required. @@ -1210,13 +1612,13 @@ class ModelInfo(_model_base.Model): :vartype model_provider_name: str """ - model_name: str = rest_field() + model_name: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The name of the AI model. For example: ``Phi21``. Required.""" - model_type: Union[str, "_models.ModelType"] = rest_field() + model_type: Union[str, "_models.ModelType"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The type of the AI model. A Unique identifier for the profile. Required. Known values are: \"embeddings\", \"image_generation\", \"text_generation\", \"image_embeddings\", \"audio_generation\", and \"chat_completion\".""" - model_provider_name: str = rest_field() + model_provider_name: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The model provider name. For example: ``Microsoft Research``. Required.""" @overload @@ -1245,7 +1647,6 @@ class StreamingChatChoiceUpdate(_model_base.Model): Generally, ``n`` choices are generated per provided prompt with a default value of 1. Token limits and other settings may limit the number of choices generated. - :ivar index: The ordered index associated with this chat completions choice. Required. :vartype index: int :ivar finish_reason: The reason that this chat completions choice completed its generated. @@ -1255,12 +1656,16 @@ class StreamingChatChoiceUpdate(_model_base.Model): :vartype delta: ~azure.ai.inference.models.StreamingChatResponseMessageUpdate """ - index: int = rest_field() + index: int = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The ordered index associated with this chat completions choice. Required.""" - finish_reason: Union[str, "_models.CompletionsFinishReason"] = rest_field() + finish_reason: Union[str, "_models.CompletionsFinishReason"] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) """The reason that this chat completions choice completed its generated. Required. Known values are: \"stop\", \"length\", \"content_filter\", and \"tool_calls\".""" - delta: "_models.StreamingChatResponseMessageUpdate" = rest_field() + delta: "_models.StreamingChatResponseMessageUpdate" = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) """An update to the chat message for a given chat completions prompt. Required.""" @overload @@ -1291,7 +1696,6 @@ class StreamingChatCompletionsUpdate(_model_base.Model): "completes" provided prompt data. - :ivar id: A unique identifier associated with this chat completions response. Required. :vartype id: str :ivar created: The first timestamp associated with generation activity for this completions @@ -1310,18 +1714,22 @@ class StreamingChatCompletionsUpdate(_model_base.Model): :vartype usage: ~azure.ai.inference.models.CompletionsUsage """ - id: str = rest_field() + id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """A unique identifier associated with this chat completions response. Required.""" - created: datetime.datetime = rest_field(format="unix-timestamp") + created: datetime.datetime = rest_field( + visibility=["read", "create", "update", "delete", "query"], format="unix-timestamp" + ) """The first timestamp associated with generation activity for this completions response, represented as seconds since the beginning of the Unix epoch of 00:00 on 1 Jan 1970. 
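# A minimal sketch of how the JsonSchemaFormat model documented above might be
# constructed for structured output. The field names (name, schema, description,
# strict) come from this hunk; the example schema and its values are assumptions,
# and keyword construction follows the same pattern as the other models in this file.
from azure.ai.inference.models import JsonSchemaFormat

movie_schema_format = JsonSchemaFormat(
    name="movie_summary",
    schema={
        "type": "object",
        "properties": {
            "title": {"type": "string"},
            "year": {"type": "integer"},
        },
        "required": ["title", "year"],
        "additionalProperties": False,
    },
    description="A short structured summary of a movie.",
    strict=True,
)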
Required.""" - model: str = rest_field() + model: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The model used for the chat completion. Required.""" - choices: List["_models.StreamingChatChoiceUpdate"] = rest_field() + choices: List["_models.StreamingChatChoiceUpdate"] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) """An update to the collection of completion choices associated with this completions response. Generally, ``n`` choices are generated per provided prompt with a default value of 1. Token limits and other settings may limit the number of choices generated. Required.""" - usage: Optional["_models.CompletionsUsage"] = rest_field() + usage: Optional["_models.CompletionsUsage"] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Usage information for tokens processed and generated as part of this completions operation.""" @overload @@ -1360,12 +1768,16 @@ class StreamingChatResponseMessageUpdate(_model_base.Model): :vartype tool_calls: list[~azure.ai.inference.models.StreamingChatResponseToolCallUpdate] """ - role: Optional[Union[str, "_models.ChatRole"]] = rest_field() + role: Optional[Union[str, "_models.ChatRole"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) """The chat role associated with the message. If present, should always be 'assistant'. Known values are: \"system\", \"user\", \"assistant\", \"tool\", and \"developer\".""" - content: Optional[str] = rest_field() + content: Optional[str] = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The content of the message.""" - tool_calls: Optional[List["_models.StreamingChatResponseToolCallUpdate"]] = rest_field() + tool_calls: Optional[List["_models.StreamingChatResponseToolCallUpdate"]] = rest_field( + visibility=["read", "create", "update", "delete", "query"] + ) """The tool calls that must be resolved and have their outputs appended to subsequent input messages for the chat completions request to resolve as configured.""" @@ -1393,16 +1805,15 @@ def __init__(self, *args: Any, **kwargs: Any) -> None: class StreamingChatResponseToolCallUpdate(_model_base.Model): """An update to the function tool call information requested by the AI model. - :ivar id: The ID of the tool call. Required. :vartype id: str :ivar function: Updates to the function call requested by the AI model. Required. :vartype function: ~azure.ai.inference.models.FunctionCall """ - id: str = rest_field() + id: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The ID of the tool call. Required.""" - function: "_models.FunctionCall" = rest_field() + function: "_models.FunctionCall" = rest_field(visibility=["read", "create", "update", "delete", "query"]) """Updates to the function call requested by the AI model. Required.""" @overload @@ -1434,10 +1845,10 @@ class TextContentItem(ContentItem, discriminator="text"): :vartype text: str """ - type: Literal["text"] = rest_discriminator(name="type") # type: ignore + type: Literal["text"] = rest_discriminator(name="type", visibility=["read", "create", "update", "delete", "query"]) # type: ignore """The discriminated object type: always 'text' for this type. Required. Default value is \"text\".""" - text: str = rest_field() + text: str = rest_field(visibility=["read", "create", "update", "delete", "query"]) """The content of the message. 
Required.""" @overload diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py index 1bc0679964d9..f7dd32510333 100644 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py +++ b/sdk/ai/azure-ai-inference/azure/ai/inference/models/_patch.py @@ -6,565 +6,9 @@ Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize """ -import base64 -import json -import logging -import queue -import re -import sys +from typing import List -from typing import Mapping, Literal, Any, List, AsyncIterator, Iterator, Optional, Union, overload -from azure.core.rest import HttpResponse, AsyncHttpResponse -from ._enums import ChatRole -from .._model_base import rest_discriminator, rest_field -from ._models import ChatRequestMessage -from ._models import ImageUrl as ImageUrlGenerated -from ._models import ChatCompletions as ChatCompletionsGenerated -from ._models import EmbeddingsResult as EmbeddingsResultGenerated -from ._models import ImageEmbeddingInput as EmbeddingInputGenerated -from ._models import InputAudio as InputAudioGenerated -from .. import models as _models - -if sys.version_info >= (3, 11): - from typing import Self -else: - from typing_extensions import Self - -logger = logging.getLogger(__name__) - - -class UserMessage(ChatRequestMessage, discriminator="user"): - """A request chat message representing user input to the assistant. - - :ivar role: The chat role associated with this message, which is always 'user' for user - messages. Required. The role that provides input for chat completions. - :vartype role: str or ~azure.ai.inference.models.USER - :ivar content: The contents of the user message, with available input types varying by selected - model. Required. Is either a str type or a [ContentItem] type. - :vartype content: str or list[~azure.ai.inference.models.ContentItem] - """ - - role: Literal[ChatRole.USER] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'user' for user messages. Required. - The role that provides input for chat completions.""" - content: Union["str", List["_models.ContentItem"]] = rest_field() - """The contents of the user message, with available input types varying by selected model. - Required. Is either a str type or a [ContentItem] type.""" - - @overload - def __init__( - self, - content: Union[str, List["_models.ContentItem"]], - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - if len(args) == 1 and isinstance(args[0], (List, str)): - if kwargs.get("content") is not None: - raise ValueError("content cannot be provided as positional and keyword arguments") - kwargs["content"] = args[0] - args = tuple() - super().__init__(*args, role=ChatRole.USER, **kwargs) - - -class SystemMessage(ChatRequestMessage, discriminator="system"): - """A request chat message containing system instructions that influence how the model will - generate a chat completions response. - - :ivar role: The chat role associated with this message, which is always 'system' for system - messages. Required. - :vartype role: str or ~azure.ai.inference.models.SYSTEM - :ivar content: The contents of the system message. Required. 
- :vartype content: str - """ - - role: Literal[ChatRole.SYSTEM] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'system' for system messages. - Required.""" - content: str = rest_field() - """The contents of the system message. Required.""" - - @overload - def __init__( - self, - content: str, - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - if len(args) == 1 and isinstance(args[0], str): - if kwargs.get("content") is not None: - raise ValueError("content cannot be provided as positional and keyword arguments") - kwargs["content"] = args[0] - args = tuple() - super().__init__(*args, role=ChatRole.SYSTEM, **kwargs) - - -class DeveloperMessage(ChatRequestMessage, discriminator="developer"): - """A request chat message containing developer instructions that influence how the model will - generate a chat completions response. Some AI models support developer messages instead - of system messages. - - :ivar role: The chat role associated with this message, which is always 'developer' for developer - messages. Required. - :vartype role: str or ~azure.ai.inference.models.DEVELOPER - :ivar content: The contents of the developer message. Required. - :vartype content: str - """ - - role: Literal[ChatRole.DEVELOPER] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'developer' for developer messages. - Required.""" - content: str = rest_field() - """The contents of the developer message. Required.""" - - @overload - def __init__( - self, - content: str, - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - if len(args) == 1 and isinstance(args[0], str): - if kwargs.get("content") is not None: - raise ValueError("content cannot be provided as positional and keyword arguments") - kwargs["content"] = args[0] - args = tuple() - super().__init__(*args, role=ChatRole.DEVELOPER, **kwargs) - - -class AssistantMessage(ChatRequestMessage, discriminator="assistant"): - """A request chat message representing response or action from the assistant. - - :ivar role: The chat role associated with this message, which is always 'assistant' for - assistant messages. Required. The role that provides responses to system-instructed, - user-prompted input. - :vartype role: str or ~azure.ai.inference.models.ASSISTANT - :ivar content: The content of the message. - :vartype content: str - :ivar tool_calls: The tool calls that must be resolved and have their outputs appended to - subsequent input messages for the chat - completions request to resolve as configured. - :vartype tool_calls: list[~azure.ai.inference.models.ChatCompletionsToolCall] - """ - - role: Literal[ChatRole.ASSISTANT] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'assistant' for assistant messages. - Required. 
The role that provides responses to system-instructed, user-prompted input.""" - content: Optional[str] = rest_field() - """The content of the message.""" - tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = rest_field() - """The tool calls that must be resolved and have their outputs appended to subsequent input - messages for the chat - completions request to resolve as configured.""" - - @overload - def __init__( - self, - content: Optional[str] = None, - *, - tool_calls: Optional[List["_models.ChatCompletionsToolCall"]] = None, - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - if len(args) == 1 and isinstance(args[0], str): - if kwargs.get("content") is not None: - raise ValueError("content cannot be provided as positional and keyword arguments") - kwargs["content"] = args[0] - args = tuple() - super().__init__(*args, role=ChatRole.ASSISTANT, **kwargs) - - -class ToolMessage(ChatRequestMessage, discriminator="tool"): - """A request chat message representing requested output from a configured tool. - - :ivar role: The chat role associated with this message, which is always 'tool' for tool - messages. Required. The role that represents extension tool activity within a chat completions - operation. - :vartype role: str or ~azure.ai.inference.models.TOOL - :ivar content: The content of the message. - :vartype content: str - :ivar tool_call_id: The ID of the tool call resolved by the provided content. Required. - :vartype tool_call_id: str - """ - - role: Literal[ChatRole.TOOL] = rest_discriminator(name="role") # type: ignore - """The chat role associated with this message, which is always 'tool' for tool messages. Required. - The role that represents extension tool activity within a chat completions operation.""" - content: Optional[str] = rest_field() - """The content of the message.""" - tool_call_id: str = rest_field() - """The ID of the tool call resolved by the provided content. Required.""" - - @overload - def __init__( - self, - content: Optional[str] = None, - *, - tool_call_id: str, - ) -> None: ... - - @overload - def __init__(self, mapping: Mapping[str, Any]) -> None: - """ - :param mapping: raw JSON to initialize the model. - :type mapping: Mapping[str, Any] - """ - - def __init__(self, *args: Any, **kwargs: Any) -> None: - if len(args) == 1 and isinstance(args[0], str): - if kwargs.get("content") is not None: - raise ValueError("content cannot be provided as positional and keyword arguments") - kwargs["content"] = args[0] - args = tuple() - super().__init__(*args, role=ChatRole.TOOL, **kwargs) - - -class ChatCompletions(ChatCompletionsGenerated): - """Representation of the response data from a chat completions request. - Completions support a wide variety of tasks and generate text that continues from or - "completes" - provided prompt data. - - - :ivar id: A unique identifier associated with this chat completions response. Required. - :vartype id: str - :ivar created: The first timestamp associated with generation activity for this completions - response, - represented as seconds since the beginning of the Unix epoch of 00:00 on 1 Jan 1970. Required. - :vartype created: ~datetime.datetime - :ivar model: The model used for the chat completion. Required. 
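# A minimal sketch of the calling convention supported by the chat message classes
# removed from models/_patch.py above: each accepts its content as a single positional
# string (UserMessage also accepts a list of ContentItem). Whether these names remain
# importable after this change depends on the regenerated models, which are not shown here.
from azure.ai.inference.models import AssistantMessage, SystemMessage, UserMessage

messages = [
    SystemMessage("You are a helpful assistant."),
    UserMessage("How many feet are in a mile?"),
    AssistantMessage("There are 5,280 feet in a mile."),
    UserMessage("And how many in a kilometer?"),
]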
- :vartype model: str - :ivar usage: Usage information for tokens processed and generated as part of this completions - operation. Required. - :vartype usage: ~azure.ai.inference.models.CompletionsUsage - :ivar choices: The collection of completions choices associated with this completions response. - Generally, ``n`` choices are generated per provided prompt with a default value of 1. - Token limits and other settings may limit the number of choices generated. Required. - :vartype choices: list[~azure.ai.inference.models.ChatChoice] - """ - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return json.dumps(self.as_dict(), indent=2) - - -class EmbeddingsResult(EmbeddingsResultGenerated): - """Representation of the response data from an embeddings request. - Embeddings measure the relatedness of text strings and are commonly used for search, - clustering, - recommendations, and other similar scenarios. - - - :ivar data: Embedding values for the prompts submitted in the request. Required. - :vartype data: list[~azure.ai.inference.models.EmbeddingItem] - :ivar usage: Usage counts for tokens input using the embeddings API. Required. - :vartype usage: ~azure.ai.inference.models.EmbeddingsUsage - :ivar model: The model ID used to generate this result. Required. - :vartype model: str - """ - - def __str__(self) -> str: - # pylint: disable=client-method-name-no-double-underscore - return json.dumps(self.as_dict(), indent=2) - - -class ImageUrl(ImageUrlGenerated): - - @classmethod - def load( - cls, *, image_file: str, image_format: str, detail: Optional[Union[str, "_models.ImageDetailLevel"]] = None - ) -> Self: - """ - Create an ImageUrl object from a local image file. The method reads the image - file and encodes it as a base64 string, which together with the image format - is then used to format the JSON `url` value passed in the request payload. - - :keyword image_file: The name of the local image file to load. Required. - :paramtype image_file: str - :keyword image_format: The MIME type format of the image. For example: "jpeg", "png". Required. - :paramtype image_format: str - :keyword detail: The evaluation quality setting to use, which controls relative prioritization of - speed, token consumption, and accuracy. Known values are: "auto", "low", and "high". - :paramtype detail: str or ~azure.ai.inference.models.ImageDetailLevel - :return: An ImageUrl object with the image data encoded as a base64 string. - :rtype: ~azure.ai.inference.models.ImageUrl - :raises FileNotFoundError: when the image file could not be opened. - """ - with open(image_file, "rb") as f: - image_data = base64.b64encode(f.read()).decode("utf-8") - url = f"data:image/{image_format};base64,{image_data}" - return cls(url=url, detail=detail) - - -class ImageEmbeddingInput(EmbeddingInputGenerated): - - @classmethod - def load(cls, *, image_file: str, image_format: str, text: Optional[str] = None) -> Self: - """ - Create an ImageEmbeddingInput object from a local image file. The method reads the image - file and encodes it as a base64 string, which together with the image format - is then used to format the JSON `url` value passed in the request payload. - - :keyword image_file: The name of the local image file to load. Required. - :paramtype image_file: str - :keyword image_format: The MIME type format of the image. For example: "jpeg", "png". Required. - :paramtype image_format: str - :keyword text: Optional. The text input to feed into the model (like DINO, CLIP). 
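# A minimal sketch exercising the ImageUrl.load helper shown in the removed
# customization code above: it base64-encodes a local image file into a data URL.
# The file name is hypothetical, and "low" is one of the documented detail levels.
from azure.ai.inference.models import ImageUrl

image_url = ImageUrl.load(
    image_file="sample.jpg",  # hypothetical local file
    image_format="jpeg",
    detail="low",
)
print(image_url.url[:22])  # prints the "data:image/jpeg;base64" prefix of the data URL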
- Returns a 422 error if the model doesn't support the value or parameter. - :paramtype text: str - :return: An ImageEmbeddingInput object with the image data encoded as a base64 string. - :rtype: ~azure.ai.inference.models.EmbeddingsInput - :raises FileNotFoundError: when the image file could not be opened. - """ - with open(image_file, "rb") as f: - image_data = base64.b64encode(f.read()).decode("utf-8") - image_uri = f"data:image/{image_format};base64,{image_data}" - return cls(image=image_uri, text=text) - - -class BaseStreamingChatCompletions: - """A base class for the sync and async streaming chat completions responses, holding any common code - to deserializes the Server Sent Events (SSE) response stream into chat completions updates, each one - represented by a StreamingChatCompletionsUpdate object. - """ - - # Enable detailed logs of SSE parsing. For development only, should be `False` by default. - _ENABLE_CLASS_LOGS = False - - # The prefix of each line in the SSE stream that contains a JSON string - # to deserialize into a StreamingChatCompletionsUpdate object - _SSE_DATA_EVENT_PREFIX = b"data: " - - # The line indicating the end of the SSE stream - _SSE_DATA_EVENT_DONE = b"data: [DONE]" - - def __init__(self): - self._queue: "queue.Queue[_models.StreamingChatCompletionsUpdate]" = queue.Queue() - self._incomplete_line = b"" - self._done = False # Will be set to True when reading 'data: [DONE]' line - - # See https://html.spec.whatwg.org/multipage/server-sent-events.html#parsing-an-event-stream - def _deserialize_and_add_to_queue(self, element: bytes) -> bool: - - if self._ENABLE_CLASS_LOGS: - logger.debug("[Original element] %s", repr(element)) - - # Clear the queue of StreamingChatCompletionsUpdate before processing the next block - self._queue.queue.clear() - - # Split the single input bytes object at new line characters, and get a list of bytes objects, each - # representing a single "line". The bytes object at the end of the list may be a partial "line" that - # does not contain a new line character at the end. - # Note 1: DO NOT try to use something like this here: - # line_list: List[str] = re.split(r"(?<=\n)", element.decode("utf-8")) - # to do full UTF8 decoding of the whole input bytes object, as the last line in the list may be partial, and - # as such may contain a partial UTF8 Chinese character (for example). `decode("utf-8")` will raise an - # exception for such a case. 
See GitHub issue https://github.com/Azure/azure-sdk-for-python/issues/39565 - # Note 2: Consider future re-write and simplifications of this code by using: - # `codecs.getincrementaldecoder("utf-8")` - line_list: List[bytes] = re.split(re.compile(b"(?<=\n)"), element) - for index, line in enumerate(line_list): - - if self._ENABLE_CLASS_LOGS: - logger.debug("[Original line] %s", repr(line)) - - if index == 0: - line = self._incomplete_line + line - self._incomplete_line = b"" - - if index == len(line_list) - 1 and not line.endswith(b"\n"): - self._incomplete_line = line - return False - - if self._ENABLE_CLASS_LOGS: - logger.debug("[Modified line] %s", repr(line)) - - if line == b"\n": # Empty line, indicating flush output to client - continue - - if not line.startswith(self._SSE_DATA_EVENT_PREFIX): - raise ValueError(f"SSE event not supported (line `{repr(line)}`)") - - if line.startswith(self._SSE_DATA_EVENT_DONE): - if self._ENABLE_CLASS_LOGS: - logger.debug("[Done]") - return True - - # If you reached here, the line should contain `data: {...}\n` - # where the curly braces contain a valid JSON object. - # It is now safe to do UTF8 decoding of the line. - line_str = line.decode("utf-8") - - # Deserialize it into a StreamingChatCompletionsUpdate object - # and add it to the queue. - # pylint: disable=W0212 # Access to a protected member _deserialize of a client class - update = _models.StreamingChatCompletionsUpdate._deserialize( - json.loads(line_str[len(self._SSE_DATA_EVENT_PREFIX) : -1]), [] - ) - - # We skip any update that has a None or empty choices list, and does not have token usage info. - # (this is what OpenAI Python SDK does) - if update.choices or update.usage: - self._queue.put(update) - - if self._ENABLE_CLASS_LOGS: - logger.debug("[Added to queue]") - - return False - - -class StreamingChatCompletions(BaseStreamingChatCompletions): - """Represents an interator over StreamingChatCompletionsUpdate objects. It can be used for either synchronous or - asynchronous iterations. The class deserializes the Server Sent Events (SSE) response stream - into chat completions updates, each one represented by a StreamingChatCompletionsUpdate object. - """ - - def __init__(self, response: HttpResponse): - super().__init__() - self._response = response - self._bytes_iterator: Iterator[bytes] = response.iter_bytes() - - def __iter__(self) -> Any: - return self - - def __next__(self) -> "_models.StreamingChatCompletionsUpdate": - while self._queue.empty() and not self._done: - self._done = self._read_next_block() - if self._queue.empty(): - raise StopIteration - return self._queue.get() - - def _read_next_block(self) -> bool: - if self._ENABLE_CLASS_LOGS: - logger.debug("[Reading next block]") - try: - element = self._bytes_iterator.__next__() - except StopIteration: - self.close() - return True - return self._deserialize_and_add_to_queue(element) - - def __enter__(self): - return self - - def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: # type: ignore - self.close() - - def close(self) -> None: - self._response.close() - - -class AsyncStreamingChatCompletions(BaseStreamingChatCompletions): - """Represents an async interator over StreamingChatCompletionsUpdate objects. - It can be used for either synchronous or asynchronous iterations. The class - deserializes the Server Sent Events (SSE) response stream into chat - completions updates, each one represented by a StreamingChatCompletionsUpdate object. 
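# A minimal sketch of the incremental UTF-8 decoding that "Note 2" above suggests as a
# future simplification: unlike bytes.decode("utf-8"), an incremental decoder buffers a
# trailing partial multi-byte character instead of raising, which is exactly the failure
# mode described for SSE chunks that split a character across network reads.
import codecs

payload = "data: 你好\n".encode("utf-8")
chunks = [payload[:8], payload[8:]]  # the first chunk ends in the middle of a character

decoder = codecs.getincrementaldecoder("utf-8")()
text = "".join(decoder.decode(chunk) for chunk in chunks)
print(text == "data: 你好\n")  # True: the split character is reassembled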
- """ - - def __init__(self, response: AsyncHttpResponse): - super().__init__() - self._response = response - self._bytes_iterator: AsyncIterator[bytes] = response.iter_bytes() - - def __aiter__(self) -> Any: - return self - - async def __anext__(self) -> "_models.StreamingChatCompletionsUpdate": - while self._queue.empty() and not self._done: - self._done = await self._read_next_block_async() - if self._queue.empty(): - raise StopAsyncIteration - return self._queue.get() - - async def _read_next_block_async(self) -> bool: - if self._ENABLE_CLASS_LOGS: - logger.debug("[Reading next block]") - try: - element = await self._bytes_iterator.__anext__() - except StopAsyncIteration: - await self.aclose() - return True - return self._deserialize_and_add_to_queue(element) - - async def __aenter__(self): - return self - - async def __aexit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None: # type: ignore - await self.aclose() - - async def aclose(self) -> None: - await self._response.close() - - -class InputAudio(InputAudioGenerated): - - @classmethod - def load( - cls, - *, - audio_file: str, - audio_format: str, - ) -> Self: - """ - Create an InputAudio object from a local audio file. The method reads the audio - file and encodes it as a base64 string, which together with the audio format - is then used to create the InputAudio object passed to the request payload. - - :keyword audio_file: The name of the local audio file to load. Required. - :vartype audio_file: str - :keyword audio_format: The MIME type format of the audio. For example: "wav", "mp3". Required. - :vartype audio_format: str - :return: An InputAudio object with the audio data encoded as a base64 string. - :rtype: ~azure.ai.inference.models.InputAudio - :raises FileNotFoundError: when the image file could not be opened. - """ - with open(audio_file, "rb") as f: - audio_data = base64.b64encode(f.read()).decode("utf-8") - return cls(data=audio_data, format=audio_format) - - -__all__: List[str] = [ - "AssistantMessage", - "AsyncStreamingChatCompletions", - "ChatCompletions", - "ChatRequestMessage", - "EmbeddingsResult", - "ImageEmbeddingInput", - "ImageUrl", - "InputAudio", - "StreamingChatCompletions", - "SystemMessage", - "ToolMessage", - "UserMessage", - "DeveloperMessage", -] # Add all objects you want publicly available to users at this package level +__all__: List[str] = [] # Add all objects you want publicly available to users at this package level def patch_sdk(): diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/__init__.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/__init__.py deleted file mode 100644 index 2e11b31cb6a4..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -# ------------------------------------ -# pylint: disable=unused-import -from ._patch import patch_sdk as _patch_sdk, PromptTemplate - -_patch_sdk() diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_core.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_core.py deleted file mode 100644 index ec6702995149..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_core.py +++ /dev/null @@ -1,312 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
-# ------------------------------------ -# mypy: disable-error-code="assignment,attr-defined,index,arg-type" -# pylint: disable=line-too-long,R,consider-iterating-dictionary,raise-missing-from,dangerous-default-value -from __future__ import annotations -import os -from dataclasses import dataclass, field, asdict -from pathlib import Path -from typing import Any, AsyncIterator, Dict, Iterator, List, Literal, Union -from ._tracer import Tracer, to_dict -from ._utils import load_json - - -@dataclass -class ToolCall: - id: str - name: str - arguments: str - - -@dataclass -class PropertySettings: - """PropertySettings class to define the properties of the model - - Attributes - ---------- - type : str - The type of the property - default : Any - The default value of the property - description : str - The description of the property - """ - - type: Literal["string", "number", "array", "object", "boolean"] - default: Union[str, int, float, List, Dict, bool, None] = field(default=None) - description: str = field(default="") - - -@dataclass -class ModelSettings: - """ModelSettings class to define the model of the prompty - - Attributes - ---------- - api : str - The api of the model - configuration : Dict - The configuration of the model - parameters : Dict - The parameters of the model - response : Dict - The response of the model - """ - - api: str = field(default="") - configuration: Dict = field(default_factory=dict) - parameters: Dict = field(default_factory=dict) - response: Dict = field(default_factory=dict) - - -@dataclass -class TemplateSettings: - """TemplateSettings class to define the template of the prompty - - Attributes - ---------- - type : str - The type of the template - parser : str - The parser of the template - """ - - type: str = field(default="mustache") - parser: str = field(default="") - - -@dataclass -class Prompty: - """Prompty class to define the prompty - - Attributes - ---------- - name : str - The name of the prompty - description : str - The description of the prompty - authors : List[str] - The authors of the prompty - tags : List[str] - The tags of the prompty - version : str - The version of the prompty - base : str - The base of the prompty - basePrompty : Prompty - The base prompty - model : ModelSettings - The model of the prompty - sample : Dict - The sample of the prompty - inputs : Dict[str, PropertySettings] - The inputs of the prompty - outputs : Dict[str, PropertySettings] - The outputs of the prompty - template : TemplateSettings - The template of the prompty - file : FilePath - The file of the prompty - content : Union[str, List[str], Dict] - The content of the prompty - """ - - # metadata - name: str = field(default="") - description: str = field(default="") - authors: List[str] = field(default_factory=list) - tags: List[str] = field(default_factory=list) - version: str = field(default="") - base: str = field(default="") - basePrompty: Union[Prompty, None] = field(default=None) - # model - model: ModelSettings = field(default_factory=ModelSettings) - - # sample - sample: Dict = field(default_factory=dict) - - # input / output - inputs: Dict[str, PropertySettings] = field(default_factory=dict) - outputs: Dict[str, PropertySettings] = field(default_factory=dict) - - # template - template: TemplateSettings = field(default_factory=TemplateSettings) - - file: Union[Path, str] = field(default="") - content: Union[str, List[str], Dict] = field(default="") - - def to_safe_dict(self) -> Dict[str, Any]: - d = {} - if self.model: - d["model"] = asdict(self.model) 
- _mask_secrets(d, ["model", "configuration"]) - if self.template: - d["template"] = asdict(self.template) - if self.inputs: - d["inputs"] = {k: asdict(v) for k, v in self.inputs.items()} - if self.outputs: - d["outputs"] = {k: asdict(v) for k, v in self.outputs.items()} - if self.file: - d["file"] = str(self.file.as_posix()) if isinstance(self.file, Path) else self.file - return d - - @staticmethod - def hoist_base_prompty(top: Prompty, base: Prompty) -> Prompty: - top.name = base.name if top.name == "" else top.name - top.description = base.description if top.description == "" else top.description - top.authors = list(set(base.authors + top.authors)) - top.tags = list(set(base.tags + top.tags)) - top.version = base.version if top.version == "" else top.version - - top.model.api = base.model.api if top.model.api == "" else top.model.api - top.model.configuration = param_hoisting(top.model.configuration, base.model.configuration) - top.model.parameters = param_hoisting(top.model.parameters, base.model.parameters) - top.model.response = param_hoisting(top.model.response, base.model.response) - - top.sample = param_hoisting(top.sample, base.sample) - - top.basePrompty = base - - return top - - @staticmethod - def _process_file(file: str, parent: Path) -> Any: - file_path = Path(parent / Path(file)).resolve().absolute() - if file_path.exists(): - items = load_json(file_path) - if isinstance(items, list): - return [Prompty.normalize(value, parent) for value in items] - elif isinstance(items, Dict): - return {key: Prompty.normalize(value, parent) for key, value in items.items()} - else: - return items - else: - raise FileNotFoundError(f"File {file} not found") - - @staticmethod - def _process_env(variable: str, env_error=True, default: Union[str, None] = None) -> Any: - if variable in os.environ.keys(): - return os.environ[variable] - else: - if default: - return default - if env_error: - raise ValueError(f"Variable {variable} not found in environment") - - return "" - - @staticmethod - def normalize(attribute: Any, parent: Path, env_error=True) -> Any: - if isinstance(attribute, str): - attribute = attribute.strip() - if attribute.startswith("${") and attribute.endswith("}"): - # check if env or file - variable = attribute[2:-1].split(":") - if variable[0] == "env" and len(variable) > 1: - return Prompty._process_env( - variable[1], - env_error, - variable[2] if len(variable) > 2 else None, - ) - elif variable[0] == "file" and len(variable) > 1: - return Prompty._process_file(variable[1], parent) - else: - raise ValueError(f"Invalid attribute format ({attribute})") - else: - return attribute - elif isinstance(attribute, list): - return [Prompty.normalize(value, parent) for value in attribute] - elif isinstance(attribute, Dict): - return {key: Prompty.normalize(value, parent) for key, value in attribute.items()} - else: - return attribute - - -def param_hoisting(top: Dict[str, Any], bottom: Dict[str, Any], top_key: Union[str, None] = None) -> Dict[str, Any]: - if top_key: - new_dict = {**top[top_key]} if top_key in top else {} - else: - new_dict = {**top} - for key, value in bottom.items(): - if not key in new_dict: - new_dict[key] = value - return new_dict - - -class PromptyStream(Iterator): - """PromptyStream class to iterate over LLM stream. 
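# A minimal sketch of the "${...}" substitution implemented by Prompty.normalize in the
# removed _core.py above: "env" references read environment variables, with an optional
# default after a second colon (the default must not itself contain a colon, since the
# expression is split on ":"). The variable name and default below are made up, and the
# import only resolves against the module this patch deletes.
import os
from pathlib import Path

from azure.ai.inference.prompts._core import Prompty

os.environ.pop("EXAMPLE_DEPLOYMENT", None)
value = Prompty.normalize("${env:EXAMPLE_DEPLOYMENT:gpt-default}", Path("."))
print(value)  # "gpt-default", because the environment variable is not set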
- Necessary for Prompty to handle streaming data when tracing.""" - - def __init__(self, name: str, iterator: Iterator): - self.name = name - self.iterator = iterator - self.items: List[Any] = [] - self.__name__ = "PromptyStream" - - def __iter__(self): - return self - - def __next__(self): - try: - # enumerate but add to list - o = self.iterator.__next__() - self.items.append(o) - return o - - except StopIteration: - # StopIteration is raised - # contents are exhausted - if len(self.items) > 0: - with Tracer.start("PromptyStream") as trace: - trace("signature", f"{self.name}.PromptyStream") - trace("inputs", "None") - trace("result", [to_dict(s) for s in self.items]) - - raise StopIteration - - -class AsyncPromptyStream(AsyncIterator): - """AsyncPromptyStream class to iterate over LLM stream. - Necessary for Prompty to handle streaming data when tracing.""" - - def __init__(self, name: str, iterator: AsyncIterator): - self.name = name - self.iterator = iterator - self.items: List[Any] = [] - self.__name__ = "AsyncPromptyStream" - - def __aiter__(self): - return self - - async def __anext__(self): - try: - # enumerate but add to list - o = await self.iterator.__anext__() - self.items.append(o) - return o - - except StopAsyncIteration: - # StopIteration is raised - # contents are exhausted - if len(self.items) > 0: - with Tracer.start("AsyncPromptyStream") as trace: - trace("signature", f"{self.name}.AsyncPromptyStream") - trace("inputs", "None") - trace("result", [to_dict(s) for s in self.items]) - - raise StopAsyncIteration - - -def _mask_secrets(d: Dict[str, Any], path: list[str], patterns: list[str] = ["key", "secret"]) -> bool: - sub_d = d - for key in path: - if key not in sub_d: - return False - sub_d = sub_d[key] - - for k, v in sub_d.items(): - if any([pattern in k.lower() for pattern in patterns]): - sub_d[k] = "*" * len(v) - return True diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_invoker.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_invoker.py deleted file mode 100644 index d682662e7b01..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_invoker.py +++ /dev/null @@ -1,295 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
-# ------------------------------------ -# mypy: disable-error-code="return-value,operator" -# pylint: disable=line-too-long,R,docstring-missing-param,docstring-missing-return,docstring-missing-rtype,unnecessary-pass -import abc -from typing import Any, Callable, Dict, Literal -from ._tracer import trace -from ._core import Prompty - - -class Invoker(abc.ABC): - """Abstract class for Invoker - - Attributes - ---------- - prompty : Prompty - The prompty object - name : str - The name of the invoker - - """ - - def __init__(self, prompty: Prompty) -> None: - self.prompty = prompty - self.name = self.__class__.__name__ - - @abc.abstractmethod - def invoke(self, data: Any) -> Any: - """Abstract method to invoke the invoker - - Parameters - ---------- - data : Any - The data to be invoked - - Returns - ------- - Any - The invoked - """ - pass - - @abc.abstractmethod - async def invoke_async(self, data: Any) -> Any: - """Abstract method to invoke the invoker asynchronously - - Parameters - ---------- - data : Any - The data to be invoked - - Returns - ------- - Any - The invoked - """ - pass - - @trace - def run(self, data: Any) -> Any: - """Method to run the invoker - - Parameters - ---------- - data : Any - The data to be invoked - - Returns - ------- - Any - The invoked - """ - return self.invoke(data) - - @trace - async def run_async(self, data: Any) -> Any: - """Method to run the invoker asynchronously - - Parameters - ---------- - data : Any - The data to be invoked - - Returns - ------- - Any - The invoked - """ - return await self.invoke_async(data) - - -class InvokerFactory: - """Factory class for Invoker""" - - _renderers: Dict[str, Invoker] = {} - _parsers: Dict[str, Invoker] = {} - _executors: Dict[str, Invoker] = {} - _processors: Dict[str, Invoker] = {} - - @classmethod - def add_renderer(cls, name: str, invoker: Invoker) -> None: - cls._renderers[name] = invoker - - @classmethod - def add_parser(cls, name: str, invoker: Invoker) -> None: - cls._parsers[name] = invoker - - @classmethod - def add_executor(cls, name: str, invoker: Invoker) -> None: - cls._executors[name] = invoker - - @classmethod - def add_processor(cls, name: str, invoker: Invoker) -> None: - cls._processors[name] = invoker - - @classmethod - def register_renderer(cls, name: str) -> Callable: - def inner_wrapper(wrapped_class: Invoker) -> Callable: - cls._renderers[name] = wrapped_class - return wrapped_class # type: ignore - - return inner_wrapper - - @classmethod - def register_parser(cls, name: str) -> Callable: - def inner_wrapper(wrapped_class: Invoker) -> Callable: - cls._parsers[name] = wrapped_class - return wrapped_class # type: ignore - - return inner_wrapper - - @classmethod - def register_executor(cls, name: str) -> Callable: - def inner_wrapper(wrapped_class: Invoker) -> Callable: - cls._executors[name] = wrapped_class - return wrapped_class # type: ignore - - return inner_wrapper - - @classmethod - def register_processor(cls, name: str) -> Callable: - def inner_wrapper(wrapped_class: Invoker) -> Callable: - cls._processors[name] = wrapped_class - return wrapped_class # type: ignore - - return inner_wrapper - - @classmethod - def _get_name( - cls, - type: Literal["renderer", "parser", "executor", "processor"], - prompty: Prompty, - ) -> str: - if type == "renderer": - return prompty.template.type - elif type == "parser": - return f"{prompty.template.parser}.{prompty.model.api}" - elif type == "executor": - return prompty.model.configuration["type"] - elif type == "processor": - return 
prompty.model.configuration["type"] - else: - raise ValueError(f"Type {type} not found") - - @classmethod - def _get_invoker( - cls, - type: Literal["renderer", "parser", "executor", "processor"], - prompty: Prompty, - ) -> Invoker: - if type == "renderer": - name = prompty.template.type - if name not in cls._renderers: - raise ValueError(f"Renderer {name} not found") - - return cls._renderers[name](prompty) # type: ignore - - elif type == "parser": - name = f"{prompty.template.parser}.{prompty.model.api}" - if name not in cls._parsers: - raise ValueError(f"Parser {name} not found") - - return cls._parsers[name](prompty) # type: ignore - - elif type == "executor": - name = prompty.model.configuration["type"] - if name not in cls._executors: - raise ValueError(f"Executor {name} not found") - - return cls._executors[name](prompty) # type: ignore - - elif type == "processor": - name = prompty.model.configuration["type"] - if name not in cls._processors: - raise ValueError(f"Processor {name} not found") - - return cls._processors[name](prompty) # type: ignore - - else: - raise ValueError(f"Type {type} not found") - - @classmethod - def run( - cls, - type: Literal["renderer", "parser", "executor", "processor"], - prompty: Prompty, - data: Any, - default: Any = None, - ): - name = cls._get_name(type, prompty) - if name.startswith("NOOP") and default is not None: - return default - elif name.startswith("NOOP"): - return data - - invoker = cls._get_invoker(type, prompty) - value = invoker.run(data) - return value - - @classmethod - async def run_async( - cls, - type: Literal["renderer", "parser", "executor", "processor"], - prompty: Prompty, - data: Any, - default: Any = None, - ): - name = cls._get_name(type, prompty) - if name.startswith("NOOP") and default is not None: - return default - elif name.startswith("NOOP"): - return data - invoker = cls._get_invoker(type, prompty) - value = await invoker.run_async(data) - return value - - @classmethod - def run_renderer(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return cls.run("renderer", prompty, data, default) - - @classmethod - async def run_renderer_async(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return await cls.run_async("renderer", prompty, data, default) - - @classmethod - def run_parser(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return cls.run("parser", prompty, data, default) - - @classmethod - async def run_parser_async(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return await cls.run_async("parser", prompty, data, default) - - @classmethod - def run_executor(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return cls.run("executor", prompty, data, default) - - @classmethod - async def run_executor_async(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return await cls.run_async("executor", prompty, data, default) - - @classmethod - def run_processor(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return cls.run("processor", prompty, data, default) - - @classmethod - async def run_processor_async(cls, prompty: Prompty, data: Any, default: Any = None) -> Any: - return await cls.run_async("processor", prompty, data, default) - - -class InvokerException(Exception): - """Exception class for Invoker""" - - def __init__(self, message: str, type: str) -> None: - super().__init__(message) - self.type = type - - def __str__(self) -> str: - return f"{super().__str__()}. 
Make sure to pip install any necessary package extras (i.e. could be something like `pip install prompty[{self.type}]`) for {self.type} as well as import the appropriate invokers (i.e. could be something like `import prompty.{self.type}`)." - - -@InvokerFactory.register_renderer("NOOP") -@InvokerFactory.register_parser("NOOP") -@InvokerFactory.register_executor("NOOP") -@InvokerFactory.register_processor("NOOP") -@InvokerFactory.register_parser("prompty.embedding") -@InvokerFactory.register_parser("prompty.image") -@InvokerFactory.register_parser("prompty.completion") -class NoOp(Invoker): - def invoke(self, data: Any) -> Any: - return data - - async def invoke_async(self, data: str) -> Any: - return self.invoke(data) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_mustache.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_mustache.py deleted file mode 100644 index f7a0c21d8bb8..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_mustache.py +++ /dev/null @@ -1,671 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -# ------------------------------------ -# pylint: disable=line-too-long,R,consider-using-dict-items,docstring-missing-return,docstring-missing-rtype,docstring-missing-param,global-statement,unused-argument,global-variable-not-assigned,protected-access,logging-fstring-interpolation,deprecated-method -from __future__ import annotations -import logging -from collections.abc import Iterator, Sequence -from types import MappingProxyType -from typing import ( - Any, - Dict, - List, - Literal, - Mapping, - Optional, - Union, - cast, -) -from typing_extensions import TypeAlias - -logger = logging.getLogger(__name__) - - -Scopes: TypeAlias = List[Union[Literal[False, 0], Mapping[str, Any]]] - - -# Globals -_CURRENT_LINE = 1 -_LAST_TAG_LINE = None - - -class ChevronError(SyntaxError): - """Custom exception for Chevron errors.""" - - -# -# Helper functions -# - - -def grab_literal(template: str, l_del: str) -> tuple[str, str]: - """Parse a literal from the template. - - Args: - template: The template to parse. - l_del: The left delimiter. - - Returns: - Tuple[str, str]: The literal and the template. - """ - - global _CURRENT_LINE - - try: - # Look for the next tag and move the template to it - literal, template = template.split(l_del, 1) - _CURRENT_LINE += literal.count("\n") - return (literal, template) - - # There are no more tags in the template? - except ValueError: - # Then the rest of the template is a literal - return (template, "") - - -def l_sa_check(template: str, literal: str, is_standalone: bool) -> bool: - """Do a preliminary check to see if a tag could be a standalone. - - Args: - template: The template. (Not used.) - literal: The literal. - is_standalone: Whether the tag is standalone. - - Returns: - bool: Whether the tag could be a standalone. - """ - - # If there is a newline, or the previous tag was a standalone - if literal.find("\n") != -1 or is_standalone: - padding = literal.split("\n")[-1] - - # If all the characters since the last newline are spaces - # Then the next tag could be a standalone - # Otherwise it can't be - return padding.isspace() or padding == "" - else: - return False - - -def r_sa_check(template: str, tag_type: str, is_standalone: bool) -> bool: - """Do a final check to see if a tag could be a standalone. - - Args: - template: The template. - tag_type: The type of the tag. - is_standalone: Whether the tag is standalone. 
- - Returns: - bool: Whether the tag could be a standalone. - """ - - # Check right side if we might be a standalone - if is_standalone and tag_type not in ["variable", "no escape"]: - on_newline = template.split("\n", 1) - - # If the stuff to the right of us are spaces we're a standalone - return on_newline[0].isspace() or not on_newline[0] - - # If we're a tag can't be a standalone - else: - return False - - -def parse_tag(template: str, l_del: str, r_del: str) -> tuple[tuple[str, str], str]: - """Parse a tag from a template. - - Args: - template: The template. - l_del: The left delimiter. - r_del: The right delimiter. - - Returns: - Tuple[Tuple[str, str], str]: The tag and the template. - - Raises: - ChevronError: If the tag is unclosed. - ChevronError: If the set delimiter tag is unclosed. - """ - global _CURRENT_LINE - global _LAST_TAG_LINE - - tag_types = { - "!": "comment", - "#": "section", - "^": "inverted section", - "/": "end", - ">": "partial", - "=": "set delimiter?", - "{": "no escape?", - "&": "no escape", - } - - # Get the tag - try: - tag, template = template.split(r_del, 1) - except ValueError as e: - msg = "unclosed tag " f"at line {_CURRENT_LINE}" - raise ChevronError(msg) from e - - # Find the type meaning of the first character - tag_type = tag_types.get(tag[0], "variable") - - # If the type is not a variable - if tag_type != "variable": - # Then that first character is not needed - tag = tag[1:] - - # If we might be a set delimiter tag - if tag_type == "set delimiter?": - # Double check to make sure we are - if tag.endswith("="): - tag_type = "set delimiter" - # Remove the equal sign - tag = tag[:-1] - - # Otherwise we should complain - else: - msg = "unclosed set delimiter tag\n" f"at line {_CURRENT_LINE}" - raise ChevronError(msg) - - elif ( - # If we might be a no html escape tag - tag_type == "no escape?" - # And we have a third curly brace - # (And are using curly braces as delimiters) - and l_del == "{{" - and r_del == "}}" - and template.startswith("}") - ): - # Then we are a no html escape tag - template = template[1:] - tag_type = "no escape" - - # Strip the whitespace off the key and return - return ((tag_type, tag.strip()), template) - - -# -# The main tokenizing function -# - - -def tokenize(template: str, def_ldel: str = "{{", def_rdel: str = "}}") -> Iterator[tuple[str, str]]: - """Tokenize a mustache template. - - Tokenizes a mustache template in a generator fashion, - using file-like objects. It also accepts a string containing - the template. - - - Arguments: - - template -- a file-like object, or a string of a mustache template - - def_ldel -- The default left delimiter - ("{{" by default, as in spec compliant mustache) - - def_rdel -- The default right delimiter - ("}}" by default, as in spec compliant mustache) - - - Returns: - - A generator of mustache tags in the form of a tuple - - -- (tag_type, tag_key) - - Where tag_type is one of: - * literal - * section - * inverted section - * end - * partial - * no escape - - And tag_key is either the key or in the case of a literal tag, - the literal itself. 
- """ - - global _CURRENT_LINE, _LAST_TAG_LINE - _CURRENT_LINE = 1 - _LAST_TAG_LINE = None - - is_standalone = True - open_sections = [] - l_del = def_ldel - r_del = def_rdel - - while template: - literal, template = grab_literal(template, l_del) - - # If the template is completed - if not template: - # Then yield the literal and leave - yield ("literal", literal) - break - - # Do the first check to see if we could be a standalone - is_standalone = l_sa_check(template, literal, is_standalone) - - # Parse the tag - tag, template = parse_tag(template, l_del, r_del) - tag_type, tag_key = tag - - # Special tag logic - - # If we are a set delimiter tag - if tag_type == "set delimiter": - # Then get and set the delimiters - dels = tag_key.strip().split(" ") - l_del, r_del = dels[0], dels[-1] - - # If we are a section tag - elif tag_type in ["section", "inverted section"]: - # Then open a new section - open_sections.append(tag_key) - _LAST_TAG_LINE = _CURRENT_LINE - - # If we are an end tag - elif tag_type == "end": - # Then check to see if the last opened section - # is the same as us - try: - last_section = open_sections.pop() - except IndexError as e: - msg = f'Trying to close tag "{tag_key}"\n' "Looks like it was not opened.\n" f"line {_CURRENT_LINE + 1}" - raise ChevronError(msg) from e - if tag_key != last_section: - # Otherwise we need to complain - msg = ( - f'Trying to close tag "{tag_key}"\n' - f'last open tag is "{last_section}"\n' - f"line {_CURRENT_LINE + 1}" - ) - raise ChevronError(msg) - - # Do the second check to see if we're a standalone - is_standalone = r_sa_check(template, tag_type, is_standalone) - - # Which if we are - if is_standalone: - # Remove the stuff before the newline - template = template.split("\n", 1)[-1] - - # Partials need to keep the spaces on their left - if tag_type != "partial": - # But other tags don't - literal = literal.rstrip(" ") - - # Start yielding - # Ignore literals that are empty - if literal != "": - yield ("literal", literal) - - # Ignore comments and set delimiters - if tag_type not in ["comment", "set delimiter?"]: - yield (tag_type, tag_key) - - # If there are any open sections when we're done - if open_sections: - # Then we need to complain - msg = ( - "Unexpected EOF\n" - f'the tag "{open_sections[-1]}" was never closed\n' - f"was opened at line {_LAST_TAG_LINE}" - ) - raise ChevronError(msg) - - -# -# Helper functions -# - - -def _html_escape(string: str) -> str: - """HTML escape all of these " & < >""" - - html_codes = { - '"': """, - "<": "<", - ">": ">", - } - - # & must be handled first - string = string.replace("&", "&") - for char in html_codes: - string = string.replace(char, html_codes[char]) - return string - - -def _get_key( - key: str, - scopes: Scopes, - warn: bool, - keep: bool, - def_ldel: str, - def_rdel: str, -) -> Any: - """Get a key from the current scope""" - - # If the key is a dot - if key == ".": - # Then just return the current scope - return scopes[0] - - # Loop through the scopes - for scope in scopes: - try: - # Return an empty string if falsy, with two exceptions - # 0 should return 0, and False should return False - if scope in (0, False): - return scope - - # For every dot separated key - for child in key.split("."): - # Return an empty string if falsy, with two exceptions - # 0 should return 0, and False should return False - if scope in (0, False): - return scope - # Move into the scope - try: - # Try subscripting (Normal dictionaries) - scope = cast(Dict[str, Any], scope)[child] - except (TypeError, 
AttributeError): - try: - scope = getattr(scope, child) - except (TypeError, AttributeError): - # Try as a list - scope = scope[int(child)] # type: ignore - - try: - # This allows for custom falsy data types - # https://github.com/noahmorrison/chevron/issues/35 - if scope._CHEVRON_return_scope_when_falsy: # type: ignore - return scope - except AttributeError: - if scope in (0, False): - return scope - return scope or "" - except (AttributeError, KeyError, IndexError, ValueError): - # We couldn't find the key in the current scope - # We'll try again on the next pass - pass - - # We couldn't find the key in any of the scopes - - if warn: - logger.warn(f"Could not find key '{key}'") - - if keep: - return f"{def_ldel} {key} {def_rdel}" - - return "" - - -def _get_partial(name: str, partials_dict: Mapping[str, str]) -> str: - """Load a partial""" - try: - # Maybe the partial is in the dictionary - return partials_dict[name] - except KeyError: - return "" - - -# -# The main rendering function -# -g_token_cache: Dict[str, List[tuple[str, str]]] = {} - -EMPTY_DICT: MappingProxyType[str, str] = MappingProxyType({}) - - -def render( - template: Union[str, List[tuple[str, str]]] = "", - data: Mapping[str, Any] = EMPTY_DICT, - partials_dict: Mapping[str, str] = EMPTY_DICT, - padding: str = "", - def_ldel: str = "{{", - def_rdel: str = "}}", - scopes: Optional[Scopes] = None, - warn: bool = False, - keep: bool = False, -) -> str: - """Render a mustache template. - - Renders a mustache template with a data scope and inline partial capability. - - Arguments: - - template -- A file-like object or a string containing the template. - - data -- A python dictionary with your data scope. - - partials_path -- The path to where your partials are stored. - If set to None, then partials won't be loaded from the file system - (defaults to '.'). - - partials_ext -- The extension that you want the parser to look for - (defaults to 'mustache'). - - partials_dict -- A python dictionary which will be search for partials - before the filesystem is. {'include': 'foo'} is the same - as a file called include.mustache - (defaults to {}). - - padding -- This is for padding partials, and shouldn't be used - (but can be if you really want to). - - def_ldel -- The default left delimiter - ("{{" by default, as in spec compliant mustache). - - def_rdel -- The default right delimiter - ("}}" by default, as in spec compliant mustache). - - scopes -- The list of scopes that get_key will look through. - - warn -- Log a warning when a template substitution isn't found in the data - - keep -- Keep unreplaced tags when a substitution isn't found in the data. - - - Returns: - - A string containing the rendered template. 
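# A minimal sketch of the substitution performed by the render() function documented
# above (part of the vendored mustache implementation this patch removes, so the import
# targets the deleted module). Plain {{variable}} tags are HTML-escaped; {{#section}}
# tags iterate over list values, and {{.}} refers to the current item.
from azure.ai.inference.prompts._mustache import render

print(render("Hello, {{name}}!", {"name": "world"}))
# Hello, world!
print(render("{{#items}}* {{.}}\n{{/items}}", {"items": ["a", "b"]}), end="")
# * a
# * b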
- """ - - # If the template is a sequence but not derived from a string - if isinstance(template, Sequence) and not isinstance(template, str): - # Then we don't need to tokenize it - # But it does need to be a generator - tokens: Iterator[tuple[str, str]] = (token for token in template) - else: - if template in g_token_cache: - tokens = (token for token in g_token_cache[template]) - else: - # Otherwise make a generator - tokens = tokenize(template, def_ldel, def_rdel) - - output = "" - - if scopes is None: - scopes = [data] - - # Run through the tokens - for tag, key in tokens: - # Set the current scope - current_scope = scopes[0] - - # If we're an end tag - if tag == "end": - # Pop out of the latest scope - del scopes[0] - - # If the current scope is falsy and not the only scope - elif not current_scope and len(scopes) != 1: - if tag in ["section", "inverted section"]: - # Set the most recent scope to a falsy value - scopes.insert(0, False) - - # If we're a literal tag - elif tag == "literal": - # Add padding to the key and add it to the output - output += key.replace("\n", "\n" + padding) - - # If we're a variable tag - elif tag == "variable": - # Add the html escaped key to the output - thing = _get_key(key, scopes, warn=warn, keep=keep, def_ldel=def_ldel, def_rdel=def_rdel) - if thing is True and key == ".": - # if we've coerced into a boolean by accident - # (inverted tags do this) - # then get the un-coerced object (next in the stack) - thing = scopes[1] - if not isinstance(thing, str): - thing = str(thing) - output += _html_escape(thing) - - # If we're a no html escape tag - elif tag == "no escape": - # Just lookup the key and add it - thing = _get_key(key, scopes, warn=warn, keep=keep, def_ldel=def_ldel, def_rdel=def_rdel) - if not isinstance(thing, str): - thing = str(thing) - output += thing - - # If we're a section tag - elif tag == "section": - # Get the sections scope - scope = _get_key(key, scopes, warn=warn, keep=keep, def_ldel=def_ldel, def_rdel=def_rdel) - - # If the scope is a callable (as described in - # https://mustache.github.io/mustache.5.html) - if callable(scope): - # Generate template text from tags - text = "" - tags: List[tuple[str, str]] = [] - for token in tokens: - if token == ("end", key): - break - - tags.append(token) - tag_type, tag_key = token - if tag_type == "literal": - text += tag_key - elif tag_type == "no escape": - text += f"{def_ldel}& {tag_key} {def_rdel}" - else: - text += "{}{} {}{}".format( - def_ldel, - { - "comment": "!", - "section": "#", - "inverted section": "^", - "end": "/", - "partial": ">", - "set delimiter": "=", - "no escape": "&", - "variable": "", - }[tag_type], - tag_key, - def_rdel, - ) - - g_token_cache[text] = tags - - rend = scope( - text, - lambda template, data=None: render( - template, - data={}, - partials_dict=partials_dict, - padding=padding, - def_ldel=def_ldel, - def_rdel=def_rdel, - scopes=data and [data] + scopes or scopes, - warn=warn, - keep=keep, - ), - ) - - output += rend # type: ignore[reportOperatorIssue] - - # If the scope is a sequence, an iterator or generator but not - # derived from a string - elif isinstance(scope, (Sequence, Iterator)) and not isinstance(scope, str): - # Then we need to do some looping - - # Gather up all the tags inside the section - # (And don't be tricked by nested end tags with the same key) - # TODO: This feels like it still has edge cases, no? 
- tags = [] - tags_with_same_key = 0 - for token in tokens: - if token == ("section", key): - tags_with_same_key += 1 - if token == ("end", key): - tags_with_same_key -= 1 - if tags_with_same_key < 0: - break - tags.append(token) - - # For every item in the scope - for thing in scope: - # Append it as the most recent scope and render - new_scope = [thing] + scopes - rend = render( - template=tags, - scopes=new_scope, - padding=padding, - partials_dict=partials_dict, - def_ldel=def_ldel, - def_rdel=def_rdel, - warn=warn, - keep=keep, - ) - - output += rend - - else: - # Otherwise we're just a scope section - scopes.insert(0, scope) # type: ignore[reportArgumentType] - - # If we're an inverted section - elif tag == "inverted section": - # Add the flipped scope to the scopes - scope = _get_key(key, scopes, warn=warn, keep=keep, def_ldel=def_ldel, def_rdel=def_rdel) - scopes.insert(0, cast(Literal[False], not scope)) - - # If we're a partial - elif tag == "partial": - # Load the partial - partial = _get_partial(key, partials_dict) - - # Find what to pad the partial with - left = output.rpartition("\n")[2] - part_padding = padding - if left.isspace(): - part_padding += left - - # Render the partial - part_out = render( - template=partial, - partials_dict=partials_dict, - def_ldel=def_ldel, - def_rdel=def_rdel, - padding=part_padding, - scopes=scopes, - warn=warn, - keep=keep, - ) - - # If the partial was indented - if left.isspace(): - # then remove the spaces from the end - part_out = part_out.rstrip(" \t") - - # Add the partials output to the output - output += part_out - - return output diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_parsers.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_parsers.py deleted file mode 100644 index de3c570e5c89..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_parsers.py +++ /dev/null @@ -1,156 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
-# ------------------------------------ -# mypy: disable-error-code="union-attr,return-value" -# pylint: disable=line-too-long,R,consider-using-enumerate,docstring-missing-param,docstring-missing-return,docstring-missing-rtype -import re -import base64 -from pathlib import Path -from typing import Any, Union -from ._core import Prompty -from ._invoker import Invoker, InvokerFactory - - -ROLES = ["assistant", "function", "system", "user"] - - -@InvokerFactory.register_parser("prompty.chat") -class PromptyChatParser(Invoker): - """Prompty Chat Parser""" - - def __init__(self, prompty: Prompty) -> None: - super().__init__(prompty) - self.path = Path(self.prompty.file).parent - - def invoke(self, data: str) -> Any: - return invoke_parser(self.path, data) - - async def invoke_async(self, data: str) -> Any: - """Invoke the Prompty Chat Parser (Async) - - Parameters - ---------- - data : str - The data to parse - - Returns - ------- - str - The parsed data - """ - return self.invoke(data) - - -def _inline_image(path: Union[Path, None], image_item: str) -> str: - """Inline Image - - Parameters - ---------- - image_item : str - The image item to inline - - Returns - ------- - str - The inlined image - """ - # pass through if it's a url or base64 encoded or the path is None - if image_item.startswith("http") or image_item.startswith("data") or path is None: - return image_item - # otherwise, it's a local file - need to base64 encode it - else: - image_path = (path if path is not None else Path(".")) / image_item - with open(image_path, "rb") as f: - base64_image = base64.b64encode(f.read()).decode("utf-8") - - if image_path.suffix == ".png": - return f"data:image/png;base64,{base64_image}" - elif image_path.suffix == ".jpg": - return f"data:image/jpeg;base64,{base64_image}" - elif image_path.suffix == ".jpeg": - return f"data:image/jpeg;base64,{base64_image}" - else: - raise ValueError( - f"Invalid image format {image_path.suffix} - currently only .png and .jpg / .jpeg are supported." 
- ) - - -def _parse_content(path: Union[Path, None], content: str): - """for parsing inline images - - Parameters - ---------- - content : str - The content to parse - - Returns - ------- - any - The parsed content - """ - # regular expression to parse markdown images - image = r"(?P<alt>!\[[^\]]*\])\((?P<filename>.*?)(?=\"|\))\)" - matches = re.findall(image, content, flags=re.MULTILINE) - if len(matches) > 0: - content_items = [] - content_chunks = re.split(image, content, flags=re.MULTILINE) - current_chunk = 0 - for i in range(len(content_chunks)): - # image entry - if current_chunk < len(matches) and content_chunks[i] == matches[current_chunk][0]: - content_items.append( - { - "type": "image_url", - "image_url": {"url": _inline_image(path, matches[current_chunk][1].split(" ")[0].strip())}, - } - ) - # second part of image entry - elif current_chunk < len(matches) and content_chunks[i] == matches[current_chunk][1]: - current_chunk += 1 - # text entry - else: - if len(content_chunks[i].strip()) > 0: - content_items.append({"type": "text", "text": content_chunks[i].strip()}) - return content_items - else: - return content - - -def invoke_parser(path: Union[Path, None], data: str) -> Any: - """Invoke the Prompty Chat Parser - - Parameters - ---------- - data : str - The data to parse - - Returns - ------- - str - The parsed data - """ - messages = [] - separator = r"(?i)^\s*#?\s*(" + "|".join(ROLES) + r")\s*:\s*\n" - - # get valid chunks - remove empty items - chunks = [item for item in re.split(separator, data, flags=re.MULTILINE) if len(item.strip()) > 0] - - # if no starter role, then inject system role - if not chunks[0].strip().lower() in ROLES: - chunks.insert(0, "system") - - # if last chunk is role entry, then remove (no content?) - if chunks[-1].strip().lower() in ROLES: - chunks.pop() - - if len(chunks) % 2 != 0: - raise ValueError("Invalid prompt format") - - # create messages - for i in range(0, len(chunks), 2): - role = chunks[i].strip().lower() - content = chunks[i + 1].strip() - messages.append({"role": role, "content": _parse_content(path, content)}) - - return messages diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_patch.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_patch.py deleted file mode 100644 index 14ad4f62b4c1..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_patch.py +++ /dev/null @@ -1,124 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -# ------------------------------------ -# pylint: disable=line-too-long,R -"""Customize generated code here. - -Follow our quickstart for examples: https://aka.ms/azsdk/python/dpcodegen/python/customize -""" - -import traceback -from pathlib import Path -from typing import Any, Dict, List, Optional -from typing_extensions import Self -from ._core import Prompty -from ._mustache import render -from ._parsers import invoke_parser -from ._prompty_utils import load, prepare -from ._utils import remove_leading_empty_space - - -class PromptTemplate: - """The helper class which takes a variety of inputs, e.g. Prompty format or string, and returns the parsed prompt as an array.""" - - @classmethod - def from_prompty(cls, file_path: str) -> Self: - """Initialize a PromptTemplate object from a prompty file. - - :param file_path: The path to the prompty file. - :type file_path: str - :return: The PromptTemplate object.
- :rtype: PromptTemplate - """ - if not file_path: - raise ValueError("Please provide file_path") - - # Get the absolute path of the file by `traceback.extract_stack()`, it's "-2" because: - # In the stack, the last function is the current function. - # The second last function is the caller function, which is the root of the file_path. - stack = traceback.extract_stack() - caller = Path(stack[-2].filename) - abs_file_path = Path(caller.parent / Path(file_path)).resolve().absolute() - - prompty = load(str(abs_file_path)) - return cls(prompty=prompty) - - @classmethod - def from_string(cls, prompt_template: str, api: str = "chat", model_name: Optional[str] = None) -> Self: - """Initialize a PromptTemplate object from a message template. - - :param prompt_template: The prompt template string. - :type prompt_template: str - :param api: The API type, e.g. "chat" or "completion". - :type api: str - :param model_name: The model name, e.g. "gpt-4o-mini". - :type model_name: str - :return: The PromptTemplate object. - :rtype: PromptTemplate - """ - return cls( - api=api, - prompt_template=prompt_template, - model_name=model_name, - prompty=None, - ) - - def __init__( - self, - *, - api: str = "chat", - prompty: Optional[Prompty] = None, - prompt_template: Optional[str] = None, - model_name: Optional[str] = None, - ) -> None: - self.prompty = prompty - if self.prompty is not None: - self.model_name = ( - self.prompty.model.configuration["azure_deployment"] - if "azure_deployment" in self.prompty.model.configuration - else None - ) - self.parameters = self.prompty.model.parameters - self._config = {} - elif prompt_template is not None: - self.model_name = model_name - self.parameters = {} - # _config is a dict to hold the internal configuration - self._config = { - "api": api if api is not None else "chat", - "prompt_template": prompt_template, - } - else: - raise ValueError("Please pass valid arguments for PromptTemplate") - - def create_messages(self, data: Optional[Dict[str, Any]] = None, **kwargs) -> List[Dict[str, Any]]: - """Render the prompt template with the given data. - - :param data: The data to render the prompt template with. - :type data: Optional[Dict[str, Any]] - :return: The rendered prompt template. - :rtype: List[Dict[str, Any]] - """ - if data is None: - data = kwargs - - if self.prompty is not None: - parsed = prepare(self.prompty, data) - return parsed - elif "prompt_template" in self._config: - prompt_template = remove_leading_empty_space(self._config["prompt_template"]) - system_prompt_str = render(prompt_template, data) - parsed = invoke_parser(None, system_prompt_str) - return parsed - else: - raise ValueError("Please provide valid prompt template") - - -def patch_sdk(): - """Do not remove from this file. - - `patch_sdk` is a last resort escape hatch that allows you to do customizations - you can't accomplish using the techniques described in - https://aka.ms/azsdk/python/dpcodegen/python/customize - """ diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_prompty_utils.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_prompty_utils.py deleted file mode 100644 index 5ea38bda6229..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_prompty_utils.py +++ /dev/null @@ -1,415 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
-# ------------------------------------ -# mypy: disable-error-code="assignment" -# pylint: disable=R,docstring-missing-param,docstring-missing-return,docstring-missing-rtype,dangerous-default-value,redefined-outer-name,unused-wildcard-import,wildcard-import,raise-missing-from -import traceback -from pathlib import Path -from typing import Any, Dict, List, Union -from ._tracer import trace -from ._invoker import InvokerFactory -from ._core import ( - ModelSettings, - Prompty, - PropertySettings, - TemplateSettings, - param_hoisting, -) -from ._utils import ( - load_global_config, - load_prompty, -) - -from ._renderers import * -from ._parsers import * - - -@trace(description="Create a headless prompty object for programmatic use.") -def headless( - api: str, - content: Union[str, List[str], dict], - configuration: Dict[str, Any] = {}, - parameters: Dict[str, Any] = {}, - connection: str = "default", -) -> Prompty: - """Create a headless prompty object for programmatic use. - - Parameters - ---------- - api : str - The API to use for the model - content : Union[str, List[str], dict] - The content to process - configuration : Dict[str, Any], optional - The configuration to use, by default {} - parameters : Dict[str, Any], optional - The parameters to use, by default {} - connection : str, optional - The connection to use, by default "default" - - Returns - ------- - Prompty - The headless prompty object - - Example - ------- - >>> import prompty - >>> p = prompty.headless( - api="embedding", - configuration={"type": "azure", "azure_deployment": "text-embedding-ada-002"}, - content="hello world", - ) - >>> emb = prompty.execute(p) - - """ - - # get caller's path (to get relative path for prompty.json) - caller = Path(traceback.extract_stack()[-2].filename) - templateSettings = TemplateSettings(type="NOOP", parser="NOOP") - modelSettings = ModelSettings( - api=api, - configuration=Prompty.normalize( - param_hoisting(configuration, load_global_config(caller.parent, connection)), - caller.parent, - ), - parameters=parameters, - ) - - return Prompty(model=modelSettings, template=templateSettings, content=content) - - -def _load_raw_prompty(attributes: dict, content: str, p: Path, global_config: dict): - if "model" not in attributes: - attributes["model"] = {} - - if "configuration" not in attributes["model"]: - attributes["model"]["configuration"] = global_config - else: - attributes["model"]["configuration"] = param_hoisting( - attributes["model"]["configuration"], - global_config, - ) - - # pull model settings out of attributes - try: - model = ModelSettings(**attributes.pop("model")) - except Exception as e: - raise ValueError(f"Error in model settings: {e}") - - # pull template settings - try: - if "template" in attributes: - t = attributes.pop("template") - if isinstance(t, dict): - template = TemplateSettings(**t) - # has to be a string denoting the type - else: - template = TemplateSettings(type=t, parser="prompty") - else: - template = TemplateSettings(type="mustache", parser="prompty") - except Exception as e: - raise ValueError(f"Error in template loader: {e}") - - # formalize inputs and outputs - if "inputs" in attributes: - try: - inputs = {k: PropertySettings(**v) for (k, v) in attributes.pop("inputs").items()} - except Exception as e: - raise ValueError(f"Error in inputs: {e}") - else: - inputs = {} - if "outputs" in attributes: - try: - outputs = {k: PropertySettings(**v) for (k, v) in attributes.pop("outputs").items()} - except Exception as e: - raise ValueError(f"Error in 
outputs: {e}") - else: - outputs = {} - - prompty = Prompty( - **attributes, - model=model, - inputs=inputs, - outputs=outputs, - template=template, - content=content, - file=p, - ) - - return prompty - - -@trace(description="Load a prompty file.") -def load(prompty_file: Union[str, Path], configuration: str = "default") -> Prompty: - """Load a prompty file. - - Parameters - ---------- - prompty_file : Union[str, Path] - The path to the prompty file - configuration : str, optional - The configuration to use, by default "default" - - Returns - ------- - Prompty - The loaded prompty object - - Example - ------- - >>> import prompty - >>> p = prompty.load("prompts/basic.prompty") - >>> print(p) - """ - - p = Path(prompty_file) - if not p.is_absolute(): - # get caller's path (take into account trace frame) - caller = Path(traceback.extract_stack()[-3].filename) - p = Path(caller.parent / p).resolve().absolute() - - # load dictionary from prompty file - matter = load_prompty(p) - - attributes = matter["attributes"] - content = matter["body"] - - # normalize attribute dictionary resolve keys and files - attributes = Prompty.normalize(attributes, p.parent) - - # load global configuration - global_config = Prompty.normalize(load_global_config(p.parent, configuration), p.parent) - - prompty = _load_raw_prompty(attributes, content, p, global_config) - - # recursive loading of base prompty - if "base" in attributes: - # load the base prompty from the same directory as the current prompty - base = load(p.parent / attributes["base"]) - prompty = Prompty.hoist_base_prompty(prompty, base) - - return prompty - - -@trace(description="Prepare the inputs for the prompt.") -def prepare( - prompt: Prompty, - inputs: Dict[str, Any] = {}, -): - """Prepare the inputs for the prompt. - - Parameters - ---------- - prompt : Prompty - The prompty object - inputs : Dict[str, Any], optional - The inputs to the prompt, by default {} - - Returns - ------- - dict - The prepared and hydrated template shaped to the LLM model - - Example - ------- - >>> import prompty - >>> p = prompty.load("prompts/basic.prompty") - >>> inputs = {"name": "John Doe"} - >>> content = prompty.prepare(p, inputs) - """ - inputs = param_hoisting(inputs, prompt.sample) - - render = InvokerFactory.run_renderer(prompt, inputs, prompt.content) - result = InvokerFactory.run_parser(prompt, render) - - return result - - -@trace(description="Prepare the inputs for the prompt.") -async def prepare_async( - prompt: Prompty, - inputs: Dict[str, Any] = {}, -): - """Prepare the inputs for the prompt. - - Parameters - ---------- - prompt : Prompty - The prompty object - inputs : Dict[str, Any], optional - The inputs to the prompt, by default {} - - Returns - ------- - dict - The prepared and hydrated template shaped to the LLM model - - Example - ------- - >>> import prompty - >>> p = prompty.load("prompts/basic.prompty") - >>> inputs = {"name": "John Doe"} - >>> content = await prompty.prepare_async(p, inputs) - """ - inputs = param_hoisting(inputs, prompt.sample) - - render = await InvokerFactory.run_renderer_async(prompt, inputs, prompt.content) - result = await InvokerFactory.run_parser_async(prompt, render) - - return result - - -@trace(description="Run the prepared Prompty content against the model.") -def run( - prompt: Prompty, - content: Union[dict, list, str], - configuration: Dict[str, Any] = {}, - parameters: Dict[str, Any] = {}, - raw: bool = False, -): - """Run the prepared Prompty content.
- - Parameters - ---------- - prompt : Prompty - The prompty object - content : Union[dict, list, str] - The content to process - configuration : Dict[str, Any], optional - The configuration to use, by default {} - parameters : Dict[str, Any], optional - The parameters to use, by default {} - raw : bool, optional - Whether to skip processing, by default False - - Returns - ------- - Any - The result of the prompt - - Example - ------- - >>> import prompty - >>> p = prompty.load("prompts/basic.prompty") - >>> inputs = {"name": "John Doe"} - >>> content = prompty.prepare(p, inputs) - >>> result = prompty.run(p, content) - """ - - if configuration != {}: - prompt.model.configuration = param_hoisting(configuration, prompt.model.configuration) - - if parameters != {}: - prompt.model.parameters = param_hoisting(parameters, prompt.model.parameters) - - result = InvokerFactory.run_executor(prompt, content) - if not raw: - result = InvokerFactory.run_processor(prompt, result) - - return result - - -@trace(description="Run the prepared Prompty content against the model.") -async def run_async( - prompt: Prompty, - content: Union[dict, list, str], - configuration: Dict[str, Any] = {}, - parameters: Dict[str, Any] = {}, - raw: bool = False, -): - """Run the prepared Prompty content. - - Parameters - ---------- - prompt : Prompty - The prompty object - content : Union[dict, list, str] - The content to process - configuration : Dict[str, Any], optional - The configuration to use, by default {} - parameters : Dict[str, Any], optional - The parameters to use, by default {} - raw : bool, optional - Whether to skip processing, by default False - - Returns - ------- - Any - The result of the prompt - - Example - ------- - >>> import prompty - >>> p = prompty.load("prompts/basic.prompty") - >>> inputs = {"name": "John Doe"} - >>> content = await prompty.prepare_async(p, inputs) - >>> result = await prompty.run_async(p, content) - """ - - if configuration != {}: - prompt.model.configuration = param_hoisting(configuration, prompt.model.configuration) - - if parameters != {}: - prompt.model.parameters = param_hoisting(parameters, prompt.model.parameters) - - result = await InvokerFactory.run_executor_async(prompt, content) - if not raw: - result = await InvokerFactory.run_processor_async(prompt, result) - - return result - - -@trace(description="Execute a prompty") -def execute( - prompt: Union[str, Prompty], - configuration: Dict[str, Any] = {}, - parameters: Dict[str, Any] = {}, - inputs: Dict[str, Any] = {}, - raw: bool = False, - config_name: str = "default", -): - """Execute a prompty. 
- - Parameters - ---------- - prompt : Union[str, Prompty] - The prompty object or path to the prompty file - configuration : Dict[str, Any], optional - The configuration to use, by default {} - parameters : Dict[str, Any], optional - The parameters to use, by default {} - inputs : Dict[str, Any], optional - The inputs to the prompt, by default {} - raw : bool, optional - Whether to skip processing, by default False - connection : str, optional - The connection to use, by default "default" - - Returns - ------- - Any - The result of the prompt - - Example - ------- - >>> import prompty - >>> inputs = {"name": "John Doe"} - >>> result = prompty.execute("prompts/basic.prompty", inputs=inputs) - """ - if isinstance(prompt, str): - path = Path(prompt) - if not path.is_absolute(): - # get caller's path (take into account trace frame) - caller = Path(traceback.extract_stack()[-3].filename) - path = Path(caller.parent / path).resolve().absolute() - prompt = load(path, config_name) - - # prepare content - content = prepare(prompt, inputs) - - # run LLM model - result = run(prompt, content, configuration, parameters, raw) - - return result diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_renderers.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_renderers.py deleted file mode 100644 index 0d682a7fe151..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_renderers.py +++ /dev/null @@ -1,30 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -# ------------------------------------ -# mypy: disable-error-code="union-attr,assignment,arg-type" -from pathlib import Path -from ._core import Prompty -from ._invoker import Invoker, InvokerFactory -from ._mustache import render - - -@InvokerFactory.register_renderer("mustache") -class MustacheRenderer(Invoker): - """Render a mustache template.""" - - def __init__(self, prompty: Prompty) -> None: - super().__init__(prompty) - self.templates = {} - cur_prompt = self.prompty - while cur_prompt: - self.templates[Path(cur_prompt.file).name] = cur_prompt.content - cur_prompt = cur_prompt.basePrompty - self.name = Path(self.prompty.file).name - - def invoke(self, data: str) -> str: - generated = render(self.prompty.content, data) # type: ignore - return generated - - async def invoke_async(self, data: str) -> str: - return self.invoke(data) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_tracer.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_tracer.py deleted file mode 100644 index 24f800b465f4..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_tracer.py +++ /dev/null @@ -1,316 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
-# ------------------------------------ -# mypy: disable-error-code="union-attr,arg-type,misc,return-value,assignment,func-returns-value" -# pylint: disable=R,redefined-outer-name,bare-except,unspecified-encoding -import os -import json -import inspect -import traceback -import importlib -import contextlib -from pathlib import Path -from numbers import Number -from datetime import datetime -from functools import wraps, partial -from typing import Any, Callable, Dict, Iterator, List, Union - - -# clean up key value pairs for sensitive values -def sanitize(key: str, value: Any) -> Any: - if isinstance(value, str) and any([s in key.lower() for s in ["key", "token", "secret", "password", "credential"]]): - return len(str(value)) * "*" - - if isinstance(value, dict): - return {k: sanitize(k, v) for k, v in value.items()} - - return value - - -class Tracer: - _tracers: Dict[str, Callable[[str], Iterator[Callable[[str, Any], None]]]] = {} - - @classmethod - def add(cls, name: str, tracer: Callable[[str], Iterator[Callable[[str, Any], None]]]) -> None: - cls._tracers[name] = tracer - - @classmethod - def clear(cls) -> None: - cls._tracers = {} - - @classmethod - @contextlib.contextmanager - def start(cls, name: str) -> Iterator[Callable[[str, Any], None]]: - with contextlib.ExitStack() as stack: - traces: List[Any] = [stack.enter_context(tracer(name)) for tracer in cls._tracers.values()] # type: ignore - yield lambda key, value: [ # type: ignore - # normalize and sanitize any trace values - trace(key, sanitize(key, to_dict(value))) - for trace in traces - ] - - -def to_dict(obj: Any) -> Union[Dict[str, Any], List[Dict[str, Any]], str, Number, bool]: - # simple json types - if isinstance(obj, str) or isinstance(obj, Number) or isinstance(obj, bool): - return obj - - # datetime - if isinstance(obj, datetime): - return obj.isoformat() - - # safe Prompty obj serialization - if type(obj).__name__ == "Prompty": - return obj.to_safe_dict() - - # safe PromptyStream obj serialization - if type(obj).__name__ == "PromptyStream": - return "PromptyStream" - - if type(obj).__name__ == "AsyncPromptyStream": - return "AsyncPromptyStream" - - # recursive list and dict - if isinstance(obj, List): - return [to_dict(item) for item in obj] # type: ignore - - if isinstance(obj, Dict): - return {k: v if isinstance(v, str) else to_dict(v) for k, v in obj.items()} - - if isinstance(obj, Path): - return str(obj) - - # cast to string otherwise... 
- return str(obj) - - -def _name(func: Callable, args): - if hasattr(func, "__qualname__"): - signature = f"{func.__module__}.{func.__qualname__}" - else: - signature = f"{func.__module__}.{func.__name__}" - - # core invoker gets special treatment prompty.invoker.Invoker - core_invoker = signature.startswith("prompty.invoker.Invoker.run") - if core_invoker: - name = type(args[0]).__name__ - if signature.endswith("async"): - signature = f"{args[0].__module__}.{args[0].__class__.__name__}.invoke_async" - else: - signature = f"{args[0].__module__}.{args[0].__class__.__name__}.invoke" - else: - name = func.__name__ - - return name, signature - - -def _inputs(func: Callable, args, kwargs) -> dict: - ba = inspect.signature(func).bind(*args, **kwargs) - ba.apply_defaults() - - inputs = {k: to_dict(v) for k, v in ba.arguments.items() if k != "self"} - - return inputs - - -def _results(result: Any) -> Union[Dict, List[Dict], str, Number, bool]: - return to_dict(result) if result is not None else "None" - - -def _trace_sync(func: Union[Callable, None] = None, **okwargs: Any) -> Callable: - - @wraps(func) # type: ignore - def wrapper(*args, **kwargs): - name, signature = _name(func, args) # type: ignore - with Tracer.start(name) as trace: - trace("signature", signature) - - # support arbitrary keyword - # arguments for trace decorator - for k, v in okwargs.items(): - trace(k, to_dict(v)) - - inputs = _inputs(func, args, kwargs) # type: ignore - trace("inputs", inputs) - - try: - result = func(*args, **kwargs) # type: ignore - trace("result", _results(result)) - except Exception as e: - trace( - "result", - { - "exception": { - "type": type(e), - "traceback": (traceback.format_tb(tb=e.__traceback__) if e.__traceback__ else None), - "message": str(e), - "args": to_dict(e.args), - } - }, - ) - raise e - - return result - - return wrapper - - -def _trace_async(func: Union[Callable, None] = None, **okwargs: Any) -> Callable: - - @wraps(func) # type: ignore - async def wrapper(*args, **kwargs): - name, signature = _name(func, args) # type: ignore - with Tracer.start(name) as trace: - trace("signature", signature) - - # support arbitrary keyword - # arguments for trace decorator - for k, v in okwargs.items(): - trace(k, to_dict(v)) - - inputs = _inputs(func, args, kwargs) # type: ignore - trace("inputs", inputs) - try: - result = await func(*args, **kwargs) # type: ignore - trace("result", _results(result)) - except Exception as e: - trace( - "result", - { - "exception": { - "type": type(e), - "traceback": (traceback.format_tb(tb=e.__traceback__) if e.__traceback__ else None), - "message": str(e), - "args": to_dict(e.args), - } - }, - ) - raise e - - return result - - return wrapper - - -def trace(func: Union[Callable, None] = None, **kwargs: Any) -> Callable: - if func is None: - return partial(trace, **kwargs) - wrapped_method = _trace_async if inspect.iscoroutinefunction(func) else _trace_sync - return wrapped_method(func, **kwargs) - - -class PromptyTracer: - def __init__(self, output_dir: Union[str, None] = None) -> None: - if output_dir: - self.output = Path(output_dir).resolve().absolute() - else: - self.output = Path(Path(os.getcwd()) / ".runs").resolve().absolute() - - if not self.output.exists(): - self.output.mkdir(parents=True, exist_ok=True) - - self.stack: List[Dict[str, Any]] = [] - - @contextlib.contextmanager - def tracer(self, name: str) -> Iterator[Callable[[str, Any], None]]: - try: - self.stack.append({"name": name}) - frame = self.stack[-1] - frame["__time"] = { - "start": 
datetime.now(), - } - - def add(key: str, value: Any) -> None: - if key not in frame: - frame[key] = value - # multiple values creates list - else: - if isinstance(frame[key], list): - frame[key].append(value) - else: - frame[key] = [frame[key], value] - - yield add - finally: - frame = self.stack.pop() - start: datetime = frame["__time"]["start"] - end: datetime = datetime.now() - - # add duration to frame - frame["__time"] = { - "start": start.strftime("%Y-%m-%dT%H:%M:%S.%f"), - "end": end.strftime("%Y-%m-%dT%H:%M:%S.%f"), - "duration": int((end - start).total_seconds() * 1000), - } - - # hoist usage to parent frame - if "result" in frame and isinstance(frame["result"], dict): - if "usage" in frame["result"]: - frame["__usage"] = self.hoist_item( - frame["result"]["usage"], - frame["__usage"] if "__usage" in frame else {}, - ) - - # streamed results may have usage as well - if "result" in frame and isinstance(frame["result"], list): - for result in frame["result"]: - if isinstance(result, dict) and "usage" in result and isinstance(result["usage"], dict): - frame["__usage"] = self.hoist_item( - result["usage"], - frame["__usage"] if "__usage" in frame else {}, - ) - - # add any usage frames from below - if "__frames" in frame: - for child in frame["__frames"]: - if "__usage" in child: - frame["__usage"] = self.hoist_item( - child["__usage"], - frame["__usage"] if "__usage" in frame else {}, - ) - - # if stack is empty, dump the frame - if len(self.stack) == 0: - self.write_trace(frame) - # otherwise, append the frame to the parent - else: - if "__frames" not in self.stack[-1]: - self.stack[-1]["__frames"] = [] - self.stack[-1]["__frames"].append(frame) - - def hoist_item(self, src: Dict[str, Any], cur: Dict[str, Any]) -> Dict[str, Any]: - for key, value in src.items(): - if value is None or isinstance(value, list) or isinstance(value, dict): - continue - try: - if key not in cur: - cur[key] = value - else: - cur[key] += value - except: - continue - - return cur - - def write_trace(self, frame: Dict[str, Any]) -> None: - trace_file = self.output / f"{frame['name']}.{datetime.now().strftime('%Y%m%d.%H%M%S')}.tracy" - - v = importlib.metadata.version("prompty") # type: ignore - enriched_frame = { - "runtime": "python", - "version": v, - "trace": frame, - } - - with open(trace_file, "w") as f: - json.dump(enriched_frame, f, indent=4) - - -@contextlib.contextmanager -def console_tracer(name: str) -> Iterator[Callable[[str, Any], None]]: - try: - print(f"Starting {name}") - yield lambda key, value: print(f"{key}:\n{json.dumps(to_dict(value), indent=4)}") - finally: - print(f"Ending {name}") diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_utils.py b/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_utils.py deleted file mode 100644 index 22f284180ee1..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/prompts/_utils.py +++ /dev/null @@ -1,100 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. 
-# ------------------------------------ -# mypy: disable-error-code="import-untyped,return-value" -# pylint: disable=line-too-long,R,wrong-import-order,global-variable-not-assigned) -import json -import os -import re -import sys -from typing import Any, Dict -from pathlib import Path - - -_yaml_regex = re.compile( - r"^\s*" + r"(?:---|\+\+\+)" + r"(.*?)" + r"(?:---|\+\+\+)" + r"\s*(.+)$", - re.S | re.M, -) - - -def load_text(file_path, encoding="utf-8"): - with open(file_path, "r", encoding=encoding) as file: - return file.read() - - -def load_json(file_path, encoding="utf-8"): - return json.loads(load_text(file_path, encoding=encoding)) - - -def load_global_config(prompty_path: Path = Path.cwd(), configuration: str = "default") -> Dict[str, Any]: - prompty_config_path = prompty_path.joinpath("prompty.json") - if os.path.exists(prompty_config_path): - c = load_json(prompty_config_path) - if configuration in c: - return c[configuration] - else: - raise ValueError(f'Item "{configuration}" not found in "{prompty_config_path}"') - else: - return {} - - -def load_prompty(file_path, encoding="utf-8") -> Dict[str, Any]: - contents = load_text(file_path, encoding=encoding) - return parse(contents) - - -def parse(contents): - try: - import yaml # type: ignore - except ImportError as exc: - raise ImportError("Please install pyyaml to use this function. Run `pip install pyyaml`.") from exc - - global _yaml_regex - - fmatter = "" - body = "" - result = _yaml_regex.search(contents) - - if result: - fmatter = result.group(1) - body = result.group(2) - return { - "attributes": yaml.load(fmatter, Loader=yaml.SafeLoader), - "body": body, - "frontmatter": fmatter, - } - - -def remove_leading_empty_space(multiline_str: str) -> str: - """ - Processes a multiline string by: - 1. Removing empty lines - 2. Finding the minimum leading spaces - 3. Indenting all lines to the minimum level - - :param multiline_str: The input multiline string. - :type multiline_str: str - :return: The processed multiline string. - :rtype: str - """ - lines = multiline_str.splitlines() - start_index = 0 - while start_index < len(lines) and lines[start_index].strip() == "": - start_index += 1 - - # Find the minimum number of leading spaces - min_spaces = sys.maxsize - for line in lines[start_index:]: - if len(line.strip()) == 0: - continue - spaces = len(line) - len(line.lstrip()) - spaces += line.lstrip().count("\t") * 2 # Count tabs as 2 spaces - min_spaces = min(min_spaces, spaces) - - # Remove leading spaces and indent to the minimum level - processed_lines = [] - for line in lines[start_index:]: - processed_lines.append(line[min_spaces:]) - - return "\n".join(processed_lines) diff --git a/sdk/ai/azure-ai-inference/azure/ai/inference/tracing.py b/sdk/ai/azure-ai-inference/azure/ai/inference/tracing.py deleted file mode 100644 index f7937a99074a..000000000000 --- a/sdk/ai/azure-ai-inference/azure/ai/inference/tracing.py +++ /dev/null @@ -1,850 +0,0 @@ -# ------------------------------------ -# Copyright (c) Microsoft Corporation. -# Licensed under the MIT License. -# ------------------------------------ -import copy -from enum import Enum -import functools -import json -import importlib -import logging -import os -from time import time_ns -from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple, Union -from urllib.parse import urlparse - -# pylint: disable = no-name-in-module -from azure.core import CaseInsensitiveEnumMeta # type: ignore -from azure.core.settings import settings -from . 
import models as _models - -try: - # pylint: disable = no-name-in-module - from azure.core.tracing import AbstractSpan, SpanKind # type: ignore - from opentelemetry.trace import StatusCode, Span - - _tracing_library_available = True -except ModuleNotFoundError: - - _tracing_library_available = False - - -__all__ = [ - "AIInferenceInstrumentor", -] - - -_inference_traces_enabled: bool = False -_trace_inference_content: bool = False -_INFERENCE_GEN_AI_SYSTEM_NAME = "az.ai.inference" - - -class TraceType(str, Enum, metaclass=CaseInsensitiveEnumMeta): # pylint: disable=C4747 - """An enumeration class to represent different types of traces.""" - - INFERENCE = "Inference" - - -class AIInferenceInstrumentor: - """ - A class for managing the trace instrumentation of AI Inference. - - This class allows enabling or disabling tracing for AI Inference. - and provides functionality to check whether instrumentation is active. - - """ - - def __init__(self): - if not _tracing_library_available: - raise ModuleNotFoundError( - "Azure Core Tracing Opentelemetry is not installed. " - "Please install it using 'pip install azure-core-tracing-opentelemetry'" - ) - # In the future we could support different versions from the same library - # and have a parameter that specifies the version to use. - self._impl = _AIInferenceInstrumentorPreview() - - def instrument(self, enable_content_recording: Optional[bool] = None) -> None: - """ - Enable trace instrumentation for AI Inference. - - :param enable_content_recording: Whether content recording is enabled as part - of the traces or not. Content in this context refers to chat message content - and function call tool related function names, function parameter names and - values. True will enable content recording, False will disable it. If no value - s provided, then the value read from environment variable - AZURE_TRACING_GEN_AI_CONTENT_RECORDING_ENABLED is used. If the environment variable - is not found, then the value will default to False. Please note that successive calls - to instrument will always apply the content recording value provided with the most - recent call to instrument (including applying the environment variable if no value is - provided and defaulting to false if the environment variable is not found), even if - instrument was already previously called without uninstrument being called in between - the instrument calls. - - :type enable_content_recording: bool, optional - """ - self._impl.instrument(enable_content_recording=enable_content_recording) - - def uninstrument(self) -> None: - """ - Disable trace instrumentation for AI Inference. - - Raises: - RuntimeError: If instrumentation is not currently enabled. - - This method removes any active instrumentation, stopping the tracing - of AI Inference. - """ - self._impl.uninstrument() - - def is_instrumented(self) -> bool: - """ - Check if trace instrumentation for AI Inference is currently enabled. - - :return: True if instrumentation is active, False otherwise. - :rtype: bool - """ - return self._impl.is_instrumented() - - def is_content_recording_enabled(self) -> bool: - """ - This function gets the content recording value. - - :return: A bool value indicating whether content recording is enabled. - :rtype: bool - """ - return self._impl.is_content_recording_enabled() - - -class _AIInferenceInstrumentorPreview: - """ - A class for managing the trace instrumentation of AI Inference. - - This class allows enabling or disabling tracing for AI Inference. 
- and provides functionality to check whether instrumentation is active. - """ - - def _str_to_bool(self, s): - if s is None: - return False - return str(s).lower() == "true" - - def instrument(self, enable_content_recording: Optional[bool] = None): - """ - Enable trace instrumentation for AI Inference. - - :param enable_content_recording: Whether content recording is enabled as part - of the traces or not. Content in this context refers to chat message content - and function call tool related function names, function parameter names and - values. True will enable content recording, False will disable it. If no value - is provided, then the value read from environment variable - AZURE_TRACING_GEN_AI_CONTENT_RECORDING_ENABLED is used. If the environment variable - is not found, then the value will default to False. - - :type enable_content_recording: bool, optional - """ - if enable_content_recording is None: - var_value = os.environ.get("AZURE_TRACING_GEN_AI_CONTENT_RECORDING_ENABLED") - enable_content_recording = self._str_to_bool(var_value) - if not self.is_instrumented(): - self._instrument_inference(enable_content_recording) - else: - self._set_content_recording_enabled(enable_content_recording=enable_content_recording) - - def uninstrument(self): - """ - Disable trace instrumentation for AI Inference. - - This method removes any active instrumentation, stopping the tracing - of AI Inference. - """ - if self.is_instrumented(): - self._uninstrument_inference() - - def is_instrumented(self): - """ - Check if trace instrumentation for AI Inference is currently enabled. - - :return: True if instrumentation is active, False otherwise. - :rtype: bool - """ - return self._is_instrumented() - - def set_content_recording_enabled(self, enable_content_recording: bool = False) -> None: - """This function sets the content recording value. - - :param enable_content_recording: Indicates whether tracing of message content should be enabled. - This also controls whether function call tool function names, - parameter names and parameter values are traced. - :type enable_content_recording: bool - """ - self._set_content_recording_enabled(enable_content_recording=enable_content_recording) - - def is_content_recording_enabled(self) -> bool: - """This function gets the content recording value. - - :return: A bool value indicating whether content tracing is enabled. 
- :rtype bool - """ - return self._is_content_recording_enabled() - - def _set_attributes(self, span: "AbstractSpan", *attrs: Tuple[str, Any]) -> None: - for attr in attrs: - key, value = attr - if value is not None: - span.add_attribute(key, value) - - def _add_request_chat_message_events(self, span: "AbstractSpan", **kwargs: Any) -> int: - timestamp = 0 - for message in kwargs.get("messages", []): - try: - message = message.as_dict() - except AttributeError: - pass - - if message.get("role"): - timestamp = self._record_event( - span, - f"gen_ai.{message.get('role')}.message", - { - "gen_ai.system": _INFERENCE_GEN_AI_SYSTEM_NAME, - "gen_ai.event.content": json.dumps(message), - }, - timestamp, - ) - - return timestamp - - def _parse_url(self, url): - parsed = urlparse(url) - server_address = parsed.hostname - port = parsed.port - return server_address, port - - def _add_request_chat_attributes(self, span: "AbstractSpan", *args: Any, **kwargs: Any) -> None: - client = args[0] - endpoint = client._config.endpoint # pylint: disable=protected-access - server_address, port = self._parse_url(endpoint) - model = "chat" - if kwargs.get("model") is not None: - model_value = kwargs.get("model") - if model_value is not None: - model = model_value - - self._set_attributes( - span, - ("gen_ai.operation.name", "chat"), - ("gen_ai.system", _INFERENCE_GEN_AI_SYSTEM_NAME), - ("gen_ai.request.model", model), - ("gen_ai.request.max_tokens", kwargs.get("max_tokens")), - ("gen_ai.request.temperature", kwargs.get("temperature")), - ("gen_ai.request.top_p", kwargs.get("top_p")), - ("server.address", server_address), - ) - if port is not None and port != 443: - span.add_attribute("server.port", port) - - def _remove_function_call_names_and_arguments(self, tool_calls: list) -> list: - tool_calls_copy = copy.deepcopy(tool_calls) - for tool_call in tool_calls_copy: - if "function" in tool_call: - if "name" in tool_call["function"]: - del tool_call["function"]["name"] - if "arguments" in tool_call["function"]: - del tool_call["function"]["arguments"] - if not tool_call["function"]: - del tool_call["function"] - return tool_calls_copy - - def _get_finish_reasons(self, result) -> Optional[List[str]]: - if hasattr(result, "choices") and result.choices: - finish_reasons: List[str] = [] - for choice in result.choices: - finish_reason = getattr(choice, "finish_reason", None) - - if finish_reason is None: - # If finish_reason is None, default to "none" - finish_reasons.append("none") - elif hasattr(finish_reason, "value"): - # If finish_reason has a 'value' attribute (i.e., it's an enum), use it - finish_reasons.append(finish_reason.value) - elif isinstance(finish_reason, str): - # If finish_reason is a string, use it directly - finish_reasons.append(finish_reason) - else: - # Default to "none" - finish_reasons.append("none") - - return finish_reasons - return None - - def _get_finish_reason_for_choice(self, choice): - finish_reason = getattr(choice, "finish_reason", None) - if finish_reason is not None: - return finish_reason.value - - return "none" - - def _add_response_chat_message_events( - self, span: "AbstractSpan", result: _models.ChatCompletions, last_event_timestamp_ns: int - ) -> None: - for choice in result.choices: - attributes = {} - if _trace_inference_content: - full_response: Dict[str, Any] = { - "message": {"content": choice.message.content}, - "finish_reason": self._get_finish_reason_for_choice(choice), - "index": choice.index, - } - if choice.message.tool_calls: - full_response["message"]["tool_calls"] = 
[tool.as_dict() for tool in choice.message.tool_calls] - attributes = { - "gen_ai.system": _INFERENCE_GEN_AI_SYSTEM_NAME, - "gen_ai.event.content": json.dumps(full_response), - } - else: - response: Dict[str, Any] = { - "finish_reason": self._get_finish_reason_for_choice(choice), - "index": choice.index, - } - if choice.message.tool_calls: - response["message"] = {} - tool_calls_function_names_and_arguments_removed = self._remove_function_call_names_and_arguments( - choice.message.tool_calls - ) - response["message"]["tool_calls"] = [ - tool.as_dict() for tool in tool_calls_function_names_and_arguments_removed - ] - - attributes = { - "gen_ai.system": _INFERENCE_GEN_AI_SYSTEM_NAME, - "gen_ai.event.content": json.dumps(response), - } - last_event_timestamp_ns = self._record_event(span, "gen_ai.choice", attributes, last_event_timestamp_ns) - - def _add_response_chat_attributes( - self, - span: "AbstractSpan", - result: Union[_models.ChatCompletions, _models.StreamingChatCompletionsUpdate], - ) -> None: - self._set_attributes( - span, - ("gen_ai.response.id", result.id), - ("gen_ai.response.model", result.model), - ( - "gen_ai.usage.input_tokens", - (result.usage.prompt_tokens if hasattr(result, "usage") and result.usage else None), - ), - ( - "gen_ai.usage.output_tokens", - (result.usage.completion_tokens if hasattr(result, "usage") and result.usage else None), - ), - ) - finish_reasons = self._get_finish_reasons(result) - if not finish_reasons is None: - span.add_attribute("gen_ai.response.finish_reasons", finish_reasons) # type: ignore - - def _add_request_details(self, span: "AbstractSpan", args: Any, kwargs: Any) -> int: - self._add_request_chat_attributes(span, *args, **kwargs) - if _trace_inference_content: - return self._add_request_chat_message_events(span, **kwargs) - return 0 - - def _add_response_details(self, span: "AbstractSpan", result: object, last_event_timestamp_ns: int) -> None: - if isinstance(result, _models.ChatCompletions): - self._add_response_chat_attributes(span, result) - self._add_response_chat_message_events(span, result, last_event_timestamp_ns) - # TODO add more models here - - def _accumulate_response(self, item, accumulate: Dict[str, Any]) -> None: - if item.finish_reason: - accumulate["finish_reason"] = item.finish_reason - if item.index: - accumulate["index"] = item.index - if item.delta.content: - accumulate.setdefault("message", {}) - accumulate["message"].setdefault("content", "") - accumulate["message"]["content"] += item.delta.content - if item.delta.tool_calls: - accumulate.setdefault("message", {}) - accumulate["message"].setdefault("tool_calls", []) - if item.delta.tool_calls is not None: - for tool_call in item.delta.tool_calls: - if tool_call.id: - accumulate["message"]["tool_calls"].append( - { - "id": tool_call.id, - "type": "", - "function": {"name": "", "arguments": ""}, - } - ) - if tool_call.function: - accumulate["message"]["tool_calls"][-1]["type"] = "function" - if tool_call.function and tool_call.function.name: - accumulate["message"]["tool_calls"][-1]["function"]["name"] = tool_call.function.name - if tool_call.function and tool_call.function.arguments: - accumulate["message"]["tool_calls"][-1]["function"]["arguments"] += tool_call.function.arguments - - def _accumulate_async_streaming_response(self, item, accumulate: Dict[str, Any]) -> None: - if not "choices" in item: - return - if "finish_reason" in item["choices"][0] and item["choices"][0]["finish_reason"]: - accumulate["finish_reason"] = item["choices"][0]["finish_reason"] - if 
"index" in item["choices"][0] and item["choices"][0]["index"]: - accumulate["index"] = item["choices"][0]["index"] - if not "delta" in item["choices"][0]: - return - if "content" in item["choices"][0]["delta"] and item["choices"][0]["delta"]["content"]: - accumulate.setdefault("message", {}) - accumulate["message"].setdefault("content", "") - accumulate["message"]["content"] += item["choices"][0]["delta"]["content"] - if "tool_calls" in item["choices"][0]["delta"] and item["choices"][0]["delta"]["tool_calls"]: - accumulate.setdefault("message", {}) - accumulate["message"].setdefault("tool_calls", []) - if item["choices"][0]["delta"]["tool_calls"] is not None: - for tool_call in item["choices"][0]["delta"]["tool_calls"]: - if tool_call.id: - accumulate["message"]["tool_calls"].append( - { - "id": tool_call.id, - "type": "", - "function": {"name": "", "arguments": ""}, - } - ) - if tool_call.function: - accumulate["message"]["tool_calls"][-1]["type"] = "function" - if tool_call.function and tool_call.function.name: - accumulate["message"]["tool_calls"][-1]["function"]["name"] = tool_call.function.name - if tool_call.function and tool_call.function.arguments: - accumulate["message"]["tool_calls"][-1]["function"]["arguments"] += tool_call.function.arguments - - def _wrapped_stream( - self, stream_obj: _models.StreamingChatCompletions, span: "AbstractSpan", previous_event_timestamp: int - ) -> _models.StreamingChatCompletions: - class StreamWrapper(_models.StreamingChatCompletions): - def __init__(self, stream_obj, instrumentor): - super().__init__(stream_obj._response) - self._instrumentor = instrumentor - - def __iter__( # pyright: ignore [reportIncompatibleMethodOverride] - self, - ) -> Iterator[_models.StreamingChatCompletionsUpdate]: - accumulate: Dict[str, Any] = {} - try: - chunk = None - for chunk in stream_obj: - for item in chunk.choices: - self._instrumentor._accumulate_response(item, accumulate) - yield chunk - - if chunk is not None: - self._instrumentor._add_response_chat_attributes(span, chunk) - - except Exception as exc: - # Set the span status to error - if isinstance(span.span_instance, Span): # pyright: ignore [reportPossiblyUnboundVariable] - span.span_instance.set_status( - StatusCode.ERROR, # pyright: ignore [reportPossiblyUnboundVariable] - description=str(exc), - ) - module = exc.__module__ if hasattr(exc, "__module__") and exc.__module__ != "builtins" else "" - error_type = f"{module}.{type(exc).__name__}" if module else type(exc).__name__ - self._instrumentor._set_attributes(span, ("error.type", error_type)) - raise - - finally: - if stream_obj._done is False: - if accumulate.get("finish_reason") is None: - accumulate["finish_reason"] = "error" - else: - # Only one choice expected with streaming - accumulate["index"] = 0 - # Delete message if content tracing is not enabled - if not _trace_inference_content: - if "message" in accumulate: - if "content" in accumulate["message"]: - del accumulate["message"]["content"] - if not accumulate["message"]: - del accumulate["message"] - if "message" in accumulate: - if "tool_calls" in accumulate["message"]: - tool_calls_function_names_and_arguments_removed = ( - self._instrumentor._remove_function_call_names_and_arguments( - accumulate["message"]["tool_calls"] - ) - ) - accumulate["message"]["tool_calls"] = list( - tool_calls_function_names_and_arguments_removed - ) - attributes = { - "gen_ai.system": _INFERENCE_GEN_AI_SYSTEM_NAME, - "gen_ai.event.content": json.dumps(accumulate), - } - self._instrumentor._record_event(span, 
"gen_ai.choice", attributes, previous_event_timestamp) - span.finish() - - return StreamWrapper(stream_obj, self) - - def _async_wrapped_stream( - self, stream_obj: _models.AsyncStreamingChatCompletions, span: "AbstractSpan", last_event_timestamp_ns: int - ) -> _models.AsyncStreamingChatCompletions: - class AsyncStreamWrapper(_models.AsyncStreamingChatCompletions): - def __init__(self, stream_obj, instrumentor, span, last_event_timestamp_ns): - super().__init__(stream_obj._response) - self._instrumentor = instrumentor - self._accumulate: Dict[str, Any] = {} - self._stream_obj = stream_obj - self.span = span - self._last_result = None - self._last_event_timestamp_ns = last_event_timestamp_ns - - async def __anext__(self) -> "_models.StreamingChatCompletionsUpdate": - try: - result = await super().__anext__() - self._instrumentor._accumulate_async_streaming_response( # pylint: disable=protected-access, line-too-long # pyright: ignore [reportFunctionMemberAccess] - result, self._accumulate - ) - self._last_result = result - except StopAsyncIteration as exc: - self._trace_stream_content() - raise exc - return result - - def _trace_stream_content(self) -> None: - if self._last_result: - self._instrumentor._add_response_chat_attributes( # pylint: disable=protected-access, line-too-long # pyright: ignore [reportFunctionMemberAccess] - span, self._last_result - ) - # Only one choice expected with streaming - self._accumulate["index"] = 0 - # Delete message if content tracing is not enabled - if not _trace_inference_content: - if "message" in self._accumulate: - if "content" in self._accumulate["message"]: - del self._accumulate["message"]["content"] - if not self._accumulate["message"]: - del self._accumulate["message"] - if "message" in self._accumulate: - if "tool_calls" in self._accumulate["message"]: - tools_no_recording = self._instrumentor._remove_function_call_names_and_arguments( # pylint: disable=protected-access, line-too-long # pyright: ignore [reportFunctionMemberAccess] - self._accumulate["message"]["tool_calls"] - ) - self._accumulate["message"]["tool_calls"] = list(tools_no_recording) - attributes = { - "gen_ai.system": _INFERENCE_GEN_AI_SYSTEM_NAME, - "gen_ai.event.content": json.dumps(self._accumulate), - } - self._last_event_timestamp_ns = self._instrumentor._record_event( # pylint: disable=protected-access, line-too-long # pyright: ignore [reportFunctionMemberAccess] - span, "gen_ai.choice", attributes, self._last_event_timestamp_ns - ) - span.finish() - - async_stream_wrapper = AsyncStreamWrapper(stream_obj, self, span, last_event_timestamp_ns) - return async_stream_wrapper - - def _record_event( - self, span: "AbstractSpan", name: str, attributes: Dict[str, Any], last_event_timestamp_ns: int - ) -> int: - timestamp = time_ns() - - # we're recording multiple events, some of them are emitted within (hundreds of) nanoseconds of each other. - # time.time_ns resolution is not high enough on windows to guarantee unique timestamps for each message. - # Also Azure Monitor truncates resolution to microseconds and some other backends truncate to milliseconds. - # - # But we need to give users a way to restore event order, so we're incrementing the timestamp - # by 1 microsecond for each message. 
- # - # This is a workaround, we'll find a generic and better solution - see - # https://github.com/open-telemetry/semantic-conventions/issues/1701 - if last_event_timestamp_ns > 0 and timestamp <= (last_event_timestamp_ns + 1000): - timestamp = last_event_timestamp_ns + 1000 - - span.span_instance.add_event(name=name, attributes=attributes, timestamp=timestamp) - - return timestamp - - def _trace_sync_function( - self, - function: Callable, - *, - _args_to_ignore: Optional[List[str]] = None, - _trace_type=TraceType.INFERENCE, - _name: Optional[str] = None, - ) -> Callable: - """ - Decorator that adds tracing to a synchronous function. - - :param function: The function to be traced. - :type function: Callable - :param args_to_ignore: A list of argument names to be ignored in the trace. - Defaults to None. - :type: args_to_ignore: [List[str]], optional - :param trace_type: The type of the trace. Defaults to TraceType.INFERENCE. - :type trace_type: TraceType, optional - :param name: The name of the trace, will set to func name if not provided. - :type name: str, optional - :return: The traced function. - :rtype: Callable - """ - - @functools.wraps(function) - def inner(*args, **kwargs): - - span_impl_type = settings.tracing_implementation() - if span_impl_type is None: - return function(*args, **kwargs) - - class_function_name = function.__qualname__ - - if class_function_name.startswith("ChatCompletionsClient.complete"): - if kwargs.get("model") is None: - span_name = "chat" - else: - model = kwargs.get("model") - span_name = f"chat {model}" - - span = span_impl_type( - name=span_name, - kind=SpanKind.CLIENT, # pyright: ignore [reportPossiblyUnboundVariable] - ) - - try: - # tracing events not supported in azure-core-tracing-opentelemetry - # so need to access the span instance directly - with span_impl_type.change_context(span.span_instance): - last_event_timestamp_ns = self._add_request_details(span, args, kwargs) - result = function(*args, **kwargs) - if kwargs.get("stream") is True: - return self._wrapped_stream(result, span, last_event_timestamp_ns) - self._add_response_details(span, result, last_event_timestamp_ns) - except Exception as exc: - # Set the span status to error - if isinstance(span.span_instance, Span): # pyright: ignore [reportPossiblyUnboundVariable] - span.span_instance.set_status( - StatusCode.ERROR, # pyright: ignore [reportPossiblyUnboundVariable] - description=str(exc), - ) - module = getattr(exc, "__module__", "") - module = module if module != "builtins" else "" - error_type = f"{module}.{type(exc).__name__}" if module else type(exc).__name__ - self._set_attributes(span, ("error.type", error_type)) - span.finish() - raise - - span.finish() - return result - - # Handle the default case (if the function name does not match) - return None # Ensure all paths return - - return inner - - def _trace_async_function( - self, - function: Callable, - *, - _args_to_ignore: Optional[List[str]] = None, - _trace_type=TraceType.INFERENCE, - _name: Optional[str] = None, - ) -> Callable: - """ - Decorator that adds tracing to an asynchronous function. - - :param function: The function to be traced. - :type function: Callable - :param args_to_ignore: A list of argument names to be ignored in the trace. - Defaults to None. - :type: args_to_ignore: [List[str]], optional - :param trace_type: The type of the trace. Defaults to TraceType.INFERENCE. - :type trace_type: TraceType, optional - :param name: The name of the trace, will set to func name if not provided. 
- :type name: str, optional - :return: The traced function. - :rtype: Callable - """ - - @functools.wraps(function) - async def inner(*args, **kwargs): - span_impl_type = settings.tracing_implementation() - if span_impl_type is None: - return await function(*args, **kwargs) - - class_function_name = function.__qualname__ - - if class_function_name.startswith("ChatCompletionsClient.complete"): - if kwargs.get("model") is None: - span_name = "chat" - else: - model = kwargs.get("model") - span_name = f"chat {model}" - - span = span_impl_type( - name=span_name, - kind=SpanKind.CLIENT, # pyright: ignore [reportPossiblyUnboundVariable] - ) - try: - # tracing events not supported in azure-core-tracing-opentelemetry - # so need to access the span instance directly - with span_impl_type.change_context(span.span_instance): - last_event_timestamp_ns = self._add_request_details(span, args, kwargs) - result = await function(*args, **kwargs) - if kwargs.get("stream") is True: - return self._async_wrapped_stream(result, span, last_event_timestamp_ns) - self._add_response_details(span, result, last_event_timestamp_ns) - - except Exception as exc: - # Set the span status to error - if isinstance(span.span_instance, Span): # pyright: ignore [reportPossiblyUnboundVariable] - span.span_instance.set_status( - StatusCode.ERROR, # pyright: ignore [reportPossiblyUnboundVariable] - description=str(exc), - ) - module = getattr(exc, "__module__", "") - module = module if module != "builtins" else "" - error_type = f"{module}.{type(exc).__name__}" if module else type(exc).__name__ - self._set_attributes(span, ("error.type", error_type)) - span.finish() - raise - - span.finish() - return result - - # Handle the default case (if the function name does not match) - return None # Ensure all paths return - - return inner - - def _inject_async(self, f, _trace_type, _name): - wrapper_fun = self._trace_async_function(f) - wrapper_fun._original = f # pylint: disable=protected-access # pyright: ignore [reportFunctionMemberAccess] - return wrapper_fun - - def _inject_sync(self, f, _trace_type, _name): - wrapper_fun = self._trace_sync_function(f) - wrapper_fun._original = f # pylint: disable=protected-access # pyright: ignore [reportFunctionMemberAccess] - return wrapper_fun - - def _inference_apis(self): - sync_apis = ( - ( - "azure.ai.inference", - "ChatCompletionsClient", - "complete", - TraceType.INFERENCE, - "inference_chat_completions_complete", - ), - ) - async_apis = ( - ( - "azure.ai.inference.aio", - "ChatCompletionsClient", - "complete", - TraceType.INFERENCE, - "inference_chat_completions_complete", - ), - ) - return sync_apis, async_apis - - def _inference_api_list(self): - sync_apis, async_apis = self._inference_apis() - yield sync_apis, self._inject_sync - yield async_apis, self._inject_async - - def _generate_api_and_injector(self, apis): - for api, injector in apis: - for module_name, class_name, method_name, trace_type, name in api: - try: - module = importlib.import_module(module_name) - api = getattr(module, class_name) - if hasattr(api, method_name): - yield api, method_name, trace_type, injector, name - except AttributeError as e: - # Log the attribute exception with the missing class information - logging.warning( - "AttributeError: The module '%s' does not have the class '%s'. 
%s", - module_name, - class_name, - str(e), - ) - except Exception as e: # pylint: disable=broad-except - # Log other exceptions as a warning, as we're not sure what they might be - logging.warning("An unexpected error occurred: '%s'", str(e)) - - def _available_inference_apis_and_injectors(self): - """ - Generates a sequence of tuples containing Inference API classes, method names, and - corresponding injector functions. - - :return: A generator yielding tuples. - :rtype: tuple - """ - yield from self._generate_api_and_injector(self._inference_api_list()) - - def _instrument_inference(self, enable_content_tracing: bool = False): - """This function modifies the methods of the Inference API classes to - inject logic before calling the original methods. - The original methods are stored as _original attributes of the methods. - - :param enable_content_tracing: Indicates whether tracing of message content should be enabled. - This also controls whether function call tool function names, - parameter names and parameter values are traced. - :type enable_content_tracing: bool - """ - # pylint: disable=W0603 - global _inference_traces_enabled - global _trace_inference_content - if _inference_traces_enabled: - raise RuntimeError("Traces already started for azure.ai.inference") - _inference_traces_enabled = True - _trace_inference_content = enable_content_tracing - for ( - api, - method, - trace_type, - injector, - name, - ) in self._available_inference_apis_and_injectors(): - # Check if the method of the api class has already been modified - if not hasattr(getattr(api, method), "_original"): - setattr(api, method, injector(getattr(api, method), trace_type, name)) - - def _uninstrument_inference(self): - """This function restores the original methods of the Inference API classes - by assigning them back from the _original attributes of the modified methods. - """ - # pylint: disable=W0603 - global _inference_traces_enabled - global _trace_inference_content - _trace_inference_content = False - for api, method, _, _, _ in self._available_inference_apis_and_injectors(): - if hasattr(getattr(api, method), "_original"): - setattr(api, method, getattr(getattr(api, method), "_original")) - _inference_traces_enabled = False - - def _is_instrumented(self): - """This function returns True if Inference libary has already been instrumented - for tracing and False if it has not been instrumented. - - :return: A value indicating whether the Inference library is currently instrumented or not. - :rtype: bool - """ - return _inference_traces_enabled - - def _set_content_recording_enabled(self, enable_content_recording: bool = False) -> None: - """This function sets the content recording value. - - :param enable_content_recording: Indicates whether tracing of message content should be enabled. - This also controls whether function call tool function names, - parameter names and parameter values are traced. - :type enable_content_recording: bool - """ - global _trace_inference_content # pylint: disable=W0603 - _trace_inference_content = enable_content_recording - - def _is_content_recording_enabled(self) -> bool: - """This function gets the content recording value. - - :return: A bool value indicating whether content tracing is enabled. 
- :rtype bool - """ - return _trace_inference_content diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_dict_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_dict_async.py index b908cea2c5ae..53ccd05053e1 100644 --- a/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_dict_async.py +++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_from_input_dict_async.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_streaming_azure_openai_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_streaming_azure_openai_async.py index 5035bc652d8d..974f921e982c 100644 --- a/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_streaming_azure_openai_async.py +++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_chat_completions_streaming_azure_openai_async.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_embeddings_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_embeddings_async.py index cfc9a4372222..b8f846541cb1 100644 --- a/sdk/ai/azure-ai-inference/samples/async_samples/sample_embeddings_async.py +++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_embeddings_async.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/samples/async_samples/sample_load_client_async.py b/sdk/ai/azure-ai-inference/samples/async_samples/sample_load_client_async.py index 311837814607..b975822f1ed6 100644 --- a/sdk/ai/azure-ai-inference/samples/async_samples/sample_load_client_async.py +++ b/sdk/ai/azure-ai-inference/samples/async_samples/sample_load_client_async.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_azure_openai.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_azure_openai.py index e39c2adba790..34689ac27b3f 100644 --- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_azure_openai.py +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_azure_openai.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_dict.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_dict.py index 0bbdea862ab3..6e2c5c4ca8d1 100644 --- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_dict.py +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_dict.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
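The tracing.py code deleted above folds streamed chat-completion deltas (text content plus fragmented tool calls) into one accumulated message before a single gen_ai.choice event is recorded. Below is a minimal, dependency-free sketch of that accumulation pattern; the function name and the plain-dict delta shape are illustrative assumptions, not code from this patch.

from typing import Any, Dict


def accumulate_delta(accumulate: Dict[str, Any], delta: Dict[str, Any]) -> None:
    """Fold one streamed choice delta into the running message (illustrative sketch)."""
    # Concatenate streamed text content.
    if delta.get("content"):
        message = accumulate.setdefault("message", {})
        message["content"] = message.get("content", "") + delta["content"]

    # Tool calls arrive in fragments: a fragment carrying an "id" starts a new call,
    # later fragments append to the arguments of the most recent call.
    for tool_call in delta.get("tool_calls") or []:
        message = accumulate.setdefault("message", {})
        calls = message.setdefault("tool_calls", [])
        if tool_call.get("id"):
            calls.append(
                {"id": tool_call["id"], "type": "function", "function": {"name": "", "arguments": ""}}
            )
        function = tool_call.get("function") or {}
        if calls:
            if function.get("name"):
                calls[-1]["function"]["name"] = function["name"]
            if function.get("arguments"):
                calls[-1]["function"]["arguments"] += function["arguments"]

When the stream ends, the accumulated dict is what the removed instrumentation serialized into the gen_ai.event.content attribute, with content and tool-call names stripped first when content recording is disabled.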
diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_dict_with_image_url.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_dict_with_image_url.py index f1c44431c523..ad78561bcc3e 100644 --- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_dict_with_image_url.py +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_dict_with_image_url.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_prompt_string.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_prompt_string.py index e1ee22f32a9c..78a2315a3384 100644 --- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_prompt_string.py +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_from_input_prompt_string.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming_with_tools.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming_with_tools.py index a6042089946c..3c7b0c7f8279 100644 --- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming_with_tools.py +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_streaming_with_tools.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_history.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_history.py index 6083dd8b9ba2..d229672876ce 100644 --- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_history.py +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_history.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_image_url.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_image_url.py index 18c3925d0326..c148d8d72f30 100644 --- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_image_url.py +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_image_url.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_structured_output.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_structured_output.py index 50b07d63a841..06859cff8beb 100644 --- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_structured_output.py +++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_structured_output.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
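The _record_event helper deleted above documents why event timestamps are bumped: time_ns() resolution on Windows, and truncation to microseconds or milliseconds by some exporters, can make consecutive events share a timestamp, so each event is pushed at least one microsecond past the previous one to keep ordering recoverable. A standalone sketch of that rule, under an assumed helper name:

from time import time_ns


def next_event_timestamp_ns(last_event_timestamp_ns: int) -> int:
    """Return a timestamp at least 1 microsecond after the previous event (sketch)."""
    timestamp = time_ns()
    if last_event_timestamp_ns > 0 and timestamp <= last_event_timestamp_ns + 1000:
        # 1000 ns = 1 microsecond; keeps events ordered even after exporter truncation.
        timestamp = last_event_timestamp_ns + 1000
    return timestamp

The returned value is then passed as the timestamp of the next span event and carried forward as the new last_event_timestamp_ns.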
diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_structured_output_pydantic.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_structured_output_pydantic.py
index e2cba755e8aa..fa58b961e307 100644
--- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_structured_output_pydantic.py
+++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_structured_output_pydantic.py
@@ -1,3 +1,4 @@
+# pylint: disable=line-too-long,useless-suppression
 # ------------------------------------
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
diff --git a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py
index dfe5fd048b51..dfa1ab3eb739 100644
--- a/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py
+++ b/sdk/ai/azure-ai-inference/samples/sample_chat_completions_with_tools.py
@@ -1,3 +1,4 @@
+# pylint: disable=line-too-long,useless-suppression
 # ------------------------------------
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
diff --git a/sdk/ai/azure-ai-inference/samples/sample_embeddings_azure_openai.py b/sdk/ai/azure-ai-inference/samples/sample_embeddings_azure_openai.py
index 923e6410565c..41a606719bb3 100644
--- a/sdk/ai/azure-ai-inference/samples/sample_embeddings_azure_openai.py
+++ b/sdk/ai/azure-ai-inference/samples/sample_embeddings_azure_openai.py
@@ -1,3 +1,4 @@
+# pylint: disable=line-too-long,useless-suppression
 # ------------------------------------
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
diff --git a/sdk/ai/azure-ai-inference/sdk_packaging.toml b/sdk/ai/azure-ai-inference/sdk_packaging.toml
new file mode 100644
index 000000000000..e7687fdae93b
--- /dev/null
+++ b/sdk/ai/azure-ai-inference/sdk_packaging.toml
@@ -0,0 +1,2 @@
+[packaging]
+auto_update = false
\ No newline at end of file
diff --git a/sdk/ai/azure-ai-inference/setup.py b/sdk/ai/azure-ai-inference/setup.py
index 3de5f549efe0..c7b5395a3f9f 100644
--- a/sdk/ai/azure-ai-inference/setup.py
+++ b/sdk/ai/azure-ai-inference/setup.py
@@ -13,7 +13,7 @@
 PACKAGE_NAME = "azure-ai-inference"
-PACKAGE_PPRINT_NAME = "Azure AI Inference"
+PACKAGE_PPRINT_NAME = "Azure Ai Inference"
 # a-b-c => a/b/c
 package_folder_path = PACKAGE_NAME.replace("-", "/")
@@ -35,7 +35,7 @@
     license="MIT License",
     author="Microsoft Corporation",
     author_email="azpysdkhelp@microsoft.com",
-    url="https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/ai/azure-ai-inference",
+    url="https://github.com/Azure/azure-sdk-for-python/tree/main/sdk",
     keywords="azure, azure sdk",
     classifiers=[
         "Development Status :: 4 - Beta",
@@ -68,8 +68,4 @@
         "typing-extensions>=4.6.0",
     ],
     python_requires=">=3.8",
-    extras_require={
-        "opentelemetry": ["azure-core-tracing-opentelemetry"],
-        "prompts": ["pyyaml"],
-    },
 )
diff --git a/sdk/ai/azure-ai-inference/tests/gen_ai_trace_verifier.py b/sdk/ai/azure-ai-inference/tests/gen_ai_trace_verifier.py
index c3d3b34a4406..62dd824a0039 100644
--- a/sdk/ai/azure-ai-inference/tests/gen_ai_trace_verifier.py
+++ b/sdk/ai/azure-ai-inference/tests/gen_ai_trace_verifier.py
@@ -1,3 +1,4 @@
+# pylint: disable=line-too-long,useless-suppression
 # ------------------------------------
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
diff --git a/sdk/ai/azure-ai-inference/tests/model_inference_test_base.py b/sdk/ai/azure-ai-inference/tests/model_inference_test_base.py index eab1fb7418de..9619a2b15cc5 100644 --- a/sdk/ai/azure-ai-inference/tests/model_inference_test_base.py +++ b/sdk/ai/azure-ai-inference/tests/model_inference_test_base.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/tests/test_chat_completions_client.py b/sdk/ai/azure-ai-inference/tests/test_chat_completions_client.py index b347f1d285b1..051a5372cf28 100644 --- a/sdk/ai/azure-ai-inference/tests/test_chat_completions_client.py +++ b/sdk/ai/azure-ai-inference/tests/test_chat_completions_client.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/tests/test_chat_completions_client_async.py b/sdk/ai/azure-ai-inference/tests/test_chat_completions_client_async.py index dc13fc1eba4a..89f06d6da0e5 100644 --- a/sdk/ai/azure-ai-inference/tests/test_chat_completions_client_async.py +++ b/sdk/ai/azure-ai-inference/tests/test_chat_completions_client_async.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/tests/test_client_tracing.py b/sdk/ai/azure-ai-inference/tests/test_client_tracing.py index 997dd117cae9..6205a3d37ccc 100644 --- a/sdk/ai/azure-ai-inference/tests/test_client_tracing.py +++ b/sdk/ai/azure-ai-inference/tests/test_client_tracing.py @@ -1,4 +1,4 @@ -# pylint: disable=too-many-lines +# pylint: disable=too-many-lines,line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. @@ -1120,4 +1120,4 @@ def get_weather(city: str) -> str: events_match = GenAiTraceVerifier().check_span_events(spans[1], expected_events) assert events_match == True - AIInferenceInstrumentor().uninstrument() + AIInferenceInstrumentor().uninstrument() \ No newline at end of file diff --git a/sdk/ai/azure-ai-inference/tests/test_embeddings_client_async.py b/sdk/ai/azure-ai-inference/tests/test_embeddings_client_async.py index 3f1c5ade0057..aff721431109 100644 --- a/sdk/ai/azure-ai-inference/tests/test_embeddings_client_async.py +++ b/sdk/ai/azure-ai-inference/tests/test_embeddings_client_async.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. diff --git a/sdk/ai/azure-ai-inference/tests/test_image_embeddings_client_async.py b/sdk/ai/azure-ai-inference/tests/test_image_embeddings_client_async.py index 58b48c143f59..3553e2863b36 100644 --- a/sdk/ai/azure-ai-inference/tests/test_image_embeddings_client_async.py +++ b/sdk/ai/azure-ai-inference/tests/test_image_embeddings_client_async.py @@ -1,3 +1,4 @@ +# pylint: disable=line-too-long,useless-suppression # ------------------------------------ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. 
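The _instrument_inference and _uninstrument_inference methods deleted above patch ChatCompletionsClient.complete in place, keeping the original callable on an _original attribute so instrumentation is never applied twice and can be undone later. A generic sketch of that patch-and-restore pattern follows; the helper names and the before_call hook are illustrative assumptions, not the SDK surface.

import functools


def instrument_method(cls, method_name: str, before_call) -> None:
    """Wrap cls.method_name, keeping the original on the wrapper's _original attribute."""
    original = getattr(cls, method_name)
    if hasattr(original, "_original"):
        return  # already instrumented, do not double-wrap

    @functools.wraps(original)
    def wrapper(*args, **kwargs):
        before_call(args, kwargs)  # e.g. start a span and record request attributes
        return original(*args, **kwargs)

    wrapper._original = original
    setattr(cls, method_name, wrapper)


def uninstrument_method(cls, method_name: str) -> None:
    """Restore the original method if it was previously wrapped."""
    current = getattr(cls, method_name)
    if hasattr(current, "_original"):
        setattr(cls, method_name, current._original)

Checking for the _original attribute is what makes instrumenting idempotent and uninstrumenting safe to call even when nothing was patched.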
diff --git a/sdk/ai/azure-ai-inference/tests/test_prompts.py b/sdk/ai/azure-ai-inference/tests/test_prompts.py
index 0168fbeb8c01..8f3d76ce4aab 100644
--- a/sdk/ai/azure-ai-inference/tests/test_prompts.py
+++ b/sdk/ai/azure-ai-inference/tests/test_prompts.py
@@ -1,3 +1,4 @@
+# pylint: disable=line-too-long,useless-suppression
 # ------------------------------------
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
diff --git a/sdk/ai/azure-ai-inference/tests/test_unit_tests.py b/sdk/ai/azure-ai-inference/tests/test_unit_tests.py
index 14f1f74dcfbe..d572c32deb54 100644
--- a/sdk/ai/azure-ai-inference/tests/test_unit_tests.py
+++ b/sdk/ai/azure-ai-inference/tests/test_unit_tests.py
@@ -1,3 +1,4 @@
+# pylint: disable=line-too-long,useless-suppression
 # ------------------------------------
 # Copyright (c) Microsoft Corporation.
 # Licensed under the MIT License.
diff --git a/sdk/ai/azure-ai-inference/tsp-location.yaml b/sdk/ai/azure-ai-inference/tsp-location.yaml
index b107d6f6ece8..cd1ed73985d1 100644
--- a/sdk/ai/azure-ai-inference/tsp-location.yaml
+++ b/sdk/ai/azure-ai-inference/tsp-location.yaml
@@ -1,4 +1,4 @@
 directory: specification/ai/ModelClient
-commit: a7a977a1666ad293769bc17fb80309be390b2ba9
+commit: 4f9bafd8e839c8995dfac592f7e0034a6e231587
 repo: Azure/azure-rest-api-specs
-additionalDirectories:
+additionalDirectories:
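When a traced call raises, the decorators deleted earlier set the span status to error and record an error.type attribute built from the module-qualified exception class name, leaving builtins unqualified. That derivation, pulled out into a standalone helper for reference (the helper name is an assumption):

def error_type_name(exc: BaseException) -> str:
    """Return the value recorded as the span's error.type attribute (sketch)."""
    module = getattr(exc, "__module__", "")
    module = module if module != "builtins" else ""  # report "ValueError", not "builtins.ValueError"
    return f"{module}.{type(exc).__name__}" if module else type(exc).__name__


# Example:
#   error_type_name(ValueError("bad input"))  -> "ValueError"
#   an azure.core.exceptions.HttpResponseError -> "azure.core.exceptions.HttpResponseError"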