Skip to content

Commit f31bcc1

Browse files
authored
feat: add Azure OpenAI inference provider support (#3396)
# What does this PR do? Llama-stack now supports a new OpenAI compatible endpoint with Azure OpenAI. The starter distro has been updated to add the new remote inference provider. A few tests have been modified and improved. ## Test Plan Deploy a model in the Azure portal then: ``` $ AZURE_API_KEY=... AZURE_API_BASE=... uv run llama stack build --image-type venv --providers inference=remote::azure --run ... $ LLAMA_STACK_CONFIG=http://localhost:8321 uv run --group test pytest -v -ra --text-model azure/gpt-4.1 tests/integration/inference/test_openai_completion.py ... Results: ``` ============================================= test session starts ============================================== platform darwin -- Python 3.12.8, pytest-8.4.1, pluggy-1.6.0 -- /Users/leseb/Documents/AI/llama-stack/.venv/bin/python3 cachedir: .pytest_cache metadata: {'Python': '3.12.8', 'Platform': 'macOS-15.6.1-arm64-arm-64bit', 'Packages': {'pytest': '8.4.1', 'pluggy': '1.6.0'}, 'Plugins': {'anyio': '4.9.0', 'html': '4.1.1', 'socket': '0.7.0', 'asyncio': '1.1.0', 'json-report': '1.5.0', 'timeout': '2.4.0', 'metadata': '3.1.1', 'cov': '6.2.1', 'nbval': '0.11.0', 'hydra-core': '1.3.2'}} rootdir: /Users/leseb/Documents/AI/llama-stack configfile: pyproject.toml plugins: anyio-4.9.0, html-4.1.1, socket-0.7.0, asyncio-1.1.0, json-report-1.5.0, timeout-2.4.0, metadata-3.1.1, cov-6.2.1, nbval-0.11.0, hydra-core-1.3.2 asyncio: mode=Mode.AUTO, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function collected 27 items tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming[txt=azure/gpt-5-mini-inference:completion:sanity] SKIPPED [ 3%] tests/integration/inference/test_openai_completion.py::test_openai_completion_non_streaming_suffix[txt=azure/gpt-5-mini-inference:completion:suffix] SKIPPED [ 7%] tests/integration/inference/test_openai_completion.py::test_openai_completion_streaming[txt=azure/gpt-5-mini-inference:completion:sanity] SKIPPED [ 
11%] tests/integration/inference/test_openai_completion.py::test_openai_completion_prompt_logprobs[txt=azure/gpt-5-mini-1] SKIPPED [ 14%] tests/integration/inference/test_openai_completion.py::test_openai_completion_guided_choice[txt=azure/gpt-5-mini] SKIPPED [ 18%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_01] PASSED [ 22%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 25%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 29%] tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=azure/gpt-5-mini-True] PASSED [ 33%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=azure/gpt-5-mini-True] PASSED [ 37%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming_with_file[txt=azure/gpt-5-mini] SKIPPED 
[ 40%] tests/integration/inference/test_openai_completion.py::test_openai_completion_prompt_logprobs[txt=azure/gpt-5-mini-0] SKIPPED [ 44%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_02] PASSED [ 48%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 51%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[openai_client-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 55%] tests/integration/inference/test_openai_completion.py::test_inference_store[openai_client-txt=azure/gpt-5-mini-False] PASSED [ 59%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[openai_client-txt=azure/gpt-5-mini-False] PASSED [ 62%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_01] PASSED [ 66%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 70%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_01] PASSED [ 74%] tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=azure/gpt-5-mini-True] PASSED [ 77%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=azure/gpt-5-mini-True] PASSED [ 81%] 
tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_non_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:non_streaming_02] PASSED [ 85%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 88%] tests/integration/inference/test_openai_completion.py::test_openai_chat_completion_streaming_with_n[client_with_models-txt=azure/gpt-5-mini-inference:chat_completion:streaming_02] PASSED [ 92%] tests/integration/inference/test_openai_completion.py::test_inference_store[client_with_models-txt=azure/gpt-5-mini-False] PASSED [ 96%] tests/integration/inference/test_openai_completion.py::test_inference_store_tool_calls[client_with_models-txt=azure/gpt-5-mini-False] PASSED [100%] =========================================== short test summary info ============================================ SKIPPED [3] tests/integration/inference/test_openai_completion.py:63: Model azure/gpt-5-mini hosted by remote::azure doesn't support OpenAI completions. SKIPPED [3] tests/integration/inference/test_openai_completion.py:118: Model azure/gpt-5-mini hosted by remote::azure doesn't support vllm extra_body parameters. SKIPPED [1] tests/integration/inference/test_openai_completion.py:124: Model azure/gpt-5-mini hosted by remote::azure doesn't support chat completion calls with base64 encoded files. ================================== 20 passed, 7 skipped, 2 warnings in 51.77s ================================== ``` Signed-off-by: Sébastien Han <[email protected]>
1 parent c2d281e commit f31bcc1

File tree

26 files changed

+6403
-13
lines changed

26 files changed

+6403
-13
lines changed

docs/source/providers/inference/index.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ This section contains documentation for all available providers for the **infere
1818
inline_meta-reference
1919
inline_sentence-transformers
2020
remote_anthropic
21+
remote_azure
2122
remote_bedrock
2223
remote_cerebras
2324
remote_databricks
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# remote::azure
2+
3+
## Description
4+
5+
6+
Azure OpenAI inference provider for accessing GPT models and other Azure services.
7+
Provider documentation
8+
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
9+
10+
11+
## Configuration
12+
13+
| Field | Type | Required | Default | Description |
14+
|-------|------|----------|---------|-------------|
15+
| `api_key` | `<class 'pydantic.types.SecretStr'>` | No | | Azure API key for Azure |
16+
| `api_base` | `<class 'pydantic.networks.HttpUrl'>` | No | | Azure API base for Azure (e.g., https://your-resource-name.openai.azure.com) |
17+
| `api_version` | `str \| None` | No | | Azure API version for Azure (e.g., 2024-12-01-preview) |
18+
| `api_type` | `str \| None` | No | azure | Azure API type for Azure (e.g., azure) |
19+
20+
## Sample Configuration
21+
22+
```yaml
23+
api_key: ${env.AZURE_API_KEY:=}
24+
api_base: ${env.AZURE_API_BASE:=}
25+
api_version: ${env.AZURE_API_VERSION:=}
26+
api_type: ${env.AZURE_API_TYPE:=}
27+
28+
```
29+

llama_stack/distributions/ci-tests/build.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ distribution_spec:
1717
- provider_type: remote::vertexai
1818
- provider_type: remote::groq
1919
- provider_type: remote::sambanova
20+
- provider_type: remote::azure
2021
- provider_type: inline::sentence-transformers
2122
vector_io:
2223
- provider_type: inline::faiss

llama_stack/distributions/ci-tests/run.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@ providers:
8181
config:
8282
url: https://api.sambanova.ai/v1
8383
api_key: ${env.SAMBANOVA_API_KEY:=}
84+
- provider_id: ${env.AZURE_API_KEY:+azure}
85+
provider_type: remote::azure
86+
config:
87+
api_key: ${env.AZURE_API_KEY:=}
88+
api_base: ${env.AZURE_API_BASE:=}
89+
api_version: ${env.AZURE_API_VERSION:=}
90+
api_type: ${env.AZURE_API_TYPE:=}
8491
- provider_id: sentence-transformers
8592
provider_type: inline::sentence-transformers
8693
vector_io:

llama_stack/distributions/starter-gpu/build.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ distribution_spec:
1818
- provider_type: remote::vertexai
1919
- provider_type: remote::groq
2020
- provider_type: remote::sambanova
21+
- provider_type: remote::azure
2122
- provider_type: inline::sentence-transformers
2223
vector_io:
2324
- provider_type: inline::faiss

llama_stack/distributions/starter-gpu/run.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@ providers:
8181
config:
8282
url: https://api.sambanova.ai/v1
8383
api_key: ${env.SAMBANOVA_API_KEY:=}
84+
- provider_id: ${env.AZURE_API_KEY:+azure}
85+
provider_type: remote::azure
86+
config:
87+
api_key: ${env.AZURE_API_KEY:=}
88+
api_base: ${env.AZURE_API_BASE:=}
89+
api_version: ${env.AZURE_API_VERSION:=}
90+
api_type: ${env.AZURE_API_TYPE:=}
8491
- provider_id: sentence-transformers
8592
provider_type: inline::sentence-transformers
8693
vector_io:

llama_stack/distributions/starter/build.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ distribution_spec:
1818
- provider_type: remote::vertexai
1919
- provider_type: remote::groq
2020
- provider_type: remote::sambanova
21+
- provider_type: remote::azure
2122
- provider_type: inline::sentence-transformers
2223
vector_io:
2324
- provider_type: inline::faiss

llama_stack/distributions/starter/run.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,13 @@ providers:
8181
config:
8282
url: https://api.sambanova.ai/v1
8383
api_key: ${env.SAMBANOVA_API_KEY:=}
84+
- provider_id: ${env.AZURE_API_KEY:+azure}
85+
provider_type: remote::azure
86+
config:
87+
api_key: ${env.AZURE_API_KEY:=}
88+
api_base: ${env.AZURE_API_BASE:=}
89+
api_version: ${env.AZURE_API_VERSION:=}
90+
api_type: ${env.AZURE_API_TYPE:=}
8491
- provider_id: sentence-transformers
8592
provider_type: inline::sentence-transformers
8693
vector_io:

llama_stack/distributions/starter/starter.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@ def _get_config_for_provider(provider_spec: ProviderSpec) -> dict[str, Any]:
5959
"cerebras",
6060
"nvidia",
6161
"bedrock",
62+
"azure",
6263
]
6364

6465
INFERENCE_PROVIDER_IDS = {
@@ -68,6 +69,7 @@ def _get_config_for_provider(provider_spec: ProviderSpec) -> dict[str, Any]:
6869
"cerebras": "${env.CEREBRAS_API_KEY:+cerebras}",
6970
"nvidia": "${env.NVIDIA_API_KEY:+nvidia}",
7071
"vertexai": "${env.VERTEX_AI_PROJECT:+vertexai}",
72+
"azure": "${env.AZURE_API_KEY:+azure}",
7173
}
7274

7375

@@ -277,5 +279,21 @@ def get_distribution_template(name: str = "starter") -> DistributionTemplate:
277279
"http://localhost:11434",
278280
"Ollama URL",
279281
),
282+
"AZURE_API_KEY": (
283+
"",
284+
"Azure API Key",
285+
),
286+
"AZURE_API_BASE": (
287+
"",
288+
"Azure API Base",
289+
),
290+
"AZURE_API_VERSION": (
291+
"",
292+
"Azure API Version",
293+
),
294+
"AZURE_API_TYPE": (
295+
"azure",
296+
"Azure API Type",
297+
),
280298
},
281299
)

llama_stack/providers/registry/inference.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,4 +295,19 @@ def available_providers() -> list[ProviderSpec]:
295295
description="IBM WatsonX inference provider for accessing AI models on IBM's WatsonX platform.",
296296
),
297297
),
298+
remote_provider_spec(
299+
api=Api.inference,
300+
adapter=AdapterSpec(
301+
adapter_type="azure",
302+
pip_packages=["litellm"],
303+
module="llama_stack.providers.remote.inference.azure",
304+
config_class="llama_stack.providers.remote.inference.azure.AzureConfig",
305+
provider_data_validator="llama_stack.providers.remote.inference.azure.config.AzureProviderDataValidator",
306+
description="""
307+
Azure OpenAI inference provider for accessing GPT models and other Azure services.
308+
Provider documentation
309+
https://learn.microsoft.com/en-us/azure/ai-foundry/openai/overview
310+
""",
311+
),
312+
),
298313
]

0 commit comments

Comments
 (0)