diff --git a/.fernignore b/.fernignore index c8666c2..d85561f 100644 --- a/.fernignore +++ b/.fernignore @@ -9,3 +9,8 @@ src/agora_agent/agentkit/ # Documentation - managed manually, not generated by Fern docs/ + +# Dependency manifests/lockfiles are managed manually +pyproject.toml +poetry.lock +requirements.txt diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 16caff3..f46ffcf 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,8 +15,6 @@ jobs: curl -sSL https://install.python-poetry.org | python - -y --version 1.5.1 - name: Install dependencies run: poetry install - - name: Validate docs - run: poetry run python scripts/validate_docs.py - name: Compile run: poetry run mypy . test: @@ -33,8 +31,30 @@ jobs: curl -sSL https://install.python-poetry.org | python - -y --version 1.5.1 - name: Install dependencies run: poetry install - - name: Validate docs - run: poetry run python scripts/validate_docs.py - name: Test run: poetry run pytest -rP . 
+ + publish: + needs: [compile, test] + if: github.event_name == 'push' && contains(github.ref, 'refs/tags/') + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Set up python + uses: actions/setup-python@v4 + with: + python-version: 3.8 + - name: Bootstrap poetry + run: | + curl -sSL https://install.python-poetry.org | python - -y --version 1.5.1 + - name: Install dependencies + run: poetry install + - name: Publish to pypi + run: | + poetry config repositories.remote https://upload.pypi.org/legacy/ + poetry --no-interaction -v publish --build --repository remote --username "$PYPI_USERNAME" --password "$PYPI_PASSWORD" + env: + PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} + PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml deleted file mode 100644 index 58ef110..0000000 --- a/.github/workflows/release.yml +++ /dev/null @@ -1,76 +0,0 @@ -name: release - -on: - push: - tags: - - "v*" - workflow_dispatch: - inputs: - tag_name: - description: "Tag to publish" - required: false - type: string - -jobs: - release: - runs-on: ubuntu-latest - permissions: - contents: write - id-token: write - steps: - - name: Checkout repo - uses: actions/checkout@v4 - - - name: Set up python - uses: actions/setup-python@v4 - with: - python-version: 3.8 - - - name: Bootstrap poetry - run: | - curl -sSL https://install.python-poetry.org | python - -y --version 1.5.1 - - - name: Install dependencies - run: poetry install - - - name: Compile - run: poetry run mypy . - - - name: Test - run: poetry run pytest -rP . 
- - name: Build - run: poetry build - - name: Extract changelog notes - id: changelog - env: - INPUT_TAG_NAME: ${{ github.event.inputs.tag_name }} - run: | - VERSION="${INPUT_TAG_NAME:-${GITHUB_REF_NAME}}" - NOTES=$(awk -v ver="## [${VERSION}]" ' - index($0, ver) == 1 { found=1; next } - found && /^## / { exit } - found { print } - ' changelog.md) - echo "notes<<EOF" >> "$GITHUB_OUTPUT" - echo "$NOTES" >> "$GITHUB_OUTPUT" - echo "EOF" >> "$GITHUB_OUTPUT" - - name: Create GitHub Release - env: - GH_TOKEN: ${{ github.token }} - NOTES: ${{ steps.changelog.outputs.notes }} - INPUT_TAG_NAME: ${{ github.event.inputs.tag_name }} - run: | - VERSION="${INPUT_TAG_NAME:-${GITHUB_REF_NAME}}" - echo "$NOTES" > release_notes.md - gh release create "$VERSION" \ - --title "$VERSION" \ - --notes-file release_notes.md \ - dist/* - - name: Publish to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 - with: - password: ${{ secrets.PYPI_API_TOKEN }} diff --git a/README.md b/README.md index 4c24c8b..f0ecfe5 100644 --- a/README.md +++ b/README.md @@ -1,9 +1,33 @@ -# Agora Agent Server SDK for Python +# Agoraio Python Library [![fern shield](https://img.shields.io/badge/%F0%9F%8C%BF-Built%20with%20Fern-brightgreen)](https://buildwithfern.com?utm_source=github&utm_medium=github&utm_campaign=readme&utm_source=https%3A%2F%2Fgithub.com%2FAgoraIO-Conversational-AI%2Fagent-server-sdk-python) [![pypi](https://img.shields.io/pypi/v/agora-agent-server-sdk)](https://pypi.python.org/pypi/agora-agent-server-sdk) -The Agora Agent Server SDK for Python lets you build real-time voice agents on Agora Conversational AI with a high-level `Agent` / `AgentSession` API and a generated low-level REST client. +The Agora Conversational AI SDK provides convenient access to the Agora Conversational AI APIs, +enabling you to build voice-powered AI agents with support for both cascading flows (ASR -> LLM -> TTS) +and multimodal flows (MLLM) for real-time audio processing. 
+ + +## Table of Contents + +- [Requirements](#requirements) +- [Installation](#installation) +- [Quick Start](#quick-start) +- [BYOK](#byok) +- [MLLM Realtime Multimodal](#mllm-realtime-multimodal) +- [Documentation](#documentation) +- [Reference](#reference) +- [MLLM Flow (Multimodal)](#mllm-flow-multimodal) +- [Usage](#usage) +- [Async Client](#async-client) +- [Exception Handling](#exception-handling) +- [Pagination](#pagination) +- [Advanced](#advanced) + - [Access Raw Response Data](#access-raw-response-data) + - [Retries](#retries) + - [Timeouts](#timeouts) + - [Custom Client](#custom-client) +- [Contributing](#contributing) ## Requirements @@ -183,14 +207,323 @@ See the [MLLM Flow guide](./docs/guides/mllm-flow.md) for full examples with Gem ## Documentation -- [Overview](./docs/index.md) -- [Authentication](./docs/getting-started/authentication.md) -- [Quick Start](./docs/getting-started/quick-start.md) -- [BYOK Guide](./docs/guides/byok.md) -- [MLLM Flow](./docs/guides/mllm-flow.md) -- [Low-Level API](./docs/guides/low-level-api.md) +API reference documentation is available [here](https://docs.agora.io/en/conversational-ai/overview). ## Reference -- [SDK Reference](./reference.md) -- [Agora Conversational AI Docs](https://docs.agora.io/en/conversational-ai/overview) +A full reference for this library is available [here](https://github.com/AgoraIO-Conversational-AI/agent-server-sdk-python/blob/HEAD/./reference.md). + +## MLLM Flow (Multimodal) + +For real-time audio processing using OpenAI's Realtime API or Google Gemini Live, use the MLLM (Multimodal Large Language Model) flow instead of the cascading ASR -> LLM -> TTS flow. See the [MLLM Overview](https://docs.agora.io/en/conversational-ai/models/mllm/overview) for more details. 
+ +```python +from agora_agent import Agora +from agora_agent.agents import ( + StartAgentsRequestProperties, + StartAgentsRequestPropertiesAdvancedFeatures, + StartAgentsRequestPropertiesMllm, + StartAgentsRequestPropertiesMllmVendor, + StartAgentsRequestPropertiesTts, + StartAgentsRequestPropertiesTtsVendor, + StartAgentsRequestPropertiesLlm, + StartAgentsRequestPropertiesTurnDetection, + StartAgentsRequestPropertiesTurnDetectionType, +) + +client = Agora( + customer_id="YOUR_CUSTOMER_ID", + customer_secret="YOUR_CUSTOMER_SECRET", +) + +client.agents.start( + appid="your_app_id", + name="mllm_agent", + properties=StartAgentsRequestProperties( + channel="channel_name", + token="your_token", + agent_rtc_uid="1001", + remote_rtc_uids=["1002"], + idle_timeout=120, + advanced_features=StartAgentsRequestPropertiesAdvancedFeatures( + enable_mllm=True, + ), + mllm=StartAgentsRequestPropertiesMllm( + url="wss://api.openai.com/v1/realtime", + api_key="", + vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, + params={ + "model": "gpt-4o-realtime-preview", + "voice": "alloy", + }, + input_modalities=["audio"], + output_modalities=["text", "audio"], + greeting_message="Hello! 
I'm ready to chat in real-time.", + ), + turn_detection=StartAgentsRequestPropertiesTurnDetection( + type=StartAgentsRequestPropertiesTurnDetectionType.SERVER_VAD, + threshold=0.5, + silence_duration_ms=500, + ), + # TTS and LLM are still required but not used when MLLM is enabled + tts=StartAgentsRequestPropertiesTts( + vendor=StartAgentsRequestPropertiesTtsVendor.MICROSOFT, + params={}, + ), + llm=StartAgentsRequestPropertiesLlm( + url="https://api.openai.com/v1/chat/completions", + ), + ), +) +``` + + +## Usage + +Instantiate and use the client with the following: + +```python +from agora_agent import Agora, MicrosoftTtsParams, Tts_Microsoft +from agora_agent.agents import ( + StartAgentsRequestProperties, + StartAgentsRequestPropertiesAsr, + StartAgentsRequestPropertiesLlm, +) + +client = Agora( + authorization="YOUR_AUTHORIZATION", + username="YOUR_USERNAME", + password="YOUR_PASSWORD", +) +client.agents.start( + appid="appid", + name="unique_name", + properties=StartAgentsRequestProperties( + channel="channel_name", + token="token", + agent_rtc_uid="1001", + remote_rtc_uids=["1002"], + idle_timeout=120, + asr=StartAgentsRequestPropertiesAsr( + language="en-US", + ), + tts=Tts_Microsoft( + params=MicrosoftTtsParams( + key="key", + region="region", + voice_name="voice_name", + ), + ), + llm=StartAgentsRequestPropertiesLlm( + url="https://api.openai.com/v1/chat/completions", + api_key="", + system_messages=[ + {"role": "system", "content": "You are a helpful chatbot."} + ], + params={"model": "gpt-4o-mini"}, + max_history=32, + greeting_message="Hello, how can I assist you today?", + failure_message="Please hold on a second.", + ), + ), +) +``` + +## Async Client + +The SDK also exports an `async` client so that you can make non-blocking calls to our API. Note that if you are constructing an Async httpx client class to pass into this client, use `httpx.AsyncClient()` instead of `httpx.Client()` (e.g. for the `httpx_client` parameter of this client). 
+ +```python +import asyncio + +from agora_agent import AsyncAgora, MicrosoftTtsParams, Tts_Microsoft +from agora_agent.agents import ( + StartAgentsRequestProperties, + StartAgentsRequestPropertiesAsr, + StartAgentsRequestPropertiesLlm, +) + +client = AsyncAgora( + authorization="YOUR_AUTHORIZATION", + username="YOUR_USERNAME", + password="YOUR_PASSWORD", +) + + +async def main() -> None: + await client.agents.start( + appid="appid", + name="unique_name", + properties=StartAgentsRequestProperties( + channel="channel_name", + token="token", + agent_rtc_uid="1001", + remote_rtc_uids=["1002"], + idle_timeout=120, + asr=StartAgentsRequestPropertiesAsr( + language="en-US", + ), + tts=Tts_Microsoft( + params=MicrosoftTtsParams( + key="key", + region="region", + voice_name="voice_name", + ), + ), + llm=StartAgentsRequestPropertiesLlm( + url="https://api.openai.com/v1/chat/completions", + api_key="", + system_messages=[ + {"role": "system", "content": "You are a helpful chatbot."} + ], + params={"model": "gpt-4o-mini"}, + max_history=32, + greeting_message="Hello, how can I assist you today?", + failure_message="Please hold on a second.", + ), + ), + ) + + +asyncio.run(main()) +``` + +## Exception Handling + +When the API returns a non-success status code (4xx or 5xx response), a subclass of the following error +will be raised. + +```python +from agora_agent.core.api_error import ApiError + +try: + client.agents.start(...) +except ApiError as e: + print(e.status_code) + print(e.body) +``` + +## Pagination + +Paginated requests will return a `SyncPager` or `AsyncPager`, which can be used as generators for the underlying object. 
+ +```python +from agora_agent import Agora + +client = Agora( + authorization="YOUR_AUTHORIZATION", + username="YOUR_USERNAME", + password="YOUR_PASSWORD", +) +response = client.agents.list( + appid="appid", +) +for item in response: + yield item +# alternatively, you can paginate page-by-page +for page in response.iter_pages(): + yield page +``` + +```python +# You can also iterate through pages and access the typed response per page +pager = client.agents.list(...) +for page in pager.iter_pages(): + print(page.response) # access the typed response for each page + for item in page: + print(item) +``` + +## Advanced + +### Access Raw Response Data + +The SDK provides access to raw response data, including headers, through the `.with_raw_response` property. +The `.with_raw_response` property returns a "raw" client that can be used to access the `.headers` and `.data` attributes. + +```python +from agora_agent import Agora + +client = Agora( + ..., +) +response = client.agents.with_raw_response.start(...) +print(response.headers) # access the response headers +print(response.data) # access the underlying object +pager = client.agents.list(...) +print(pager.response) # access the typed response for the first page +for item in pager: + print(item) # access the underlying object(s) +for page in pager.iter_pages(): + print(page.response) # access the typed response for each page + for item in page: + print(item) # access the underlying object(s) +``` + +### Retries + +The SDK is instrumented with automatic retries with exponential backoff. A request will be retried as long +as the request is deemed retryable and the number of retry attempts has not grown larger than the configured +retry limit (default: 2). 
+ +A request is deemed retryable when any of the following HTTP status codes is returned: + +- [408](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/408) (Timeout) +- [429](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/429) (Too Many Requests) +- [5XX](https://developer.mozilla.org/en-US/docs/Web/HTTP/Status/500) (Internal Server Errors) + +Use the `max_retries` request option to configure this behavior. + +```python +client.agents.start(..., request_options={ + "max_retries": 1 +}) +``` + +### Timeouts + +The SDK defaults to a 60 second timeout. You can configure this with a timeout option at the client or request level. + +```python + +from agora_agent import Agora + +client = Agora( + ..., + timeout=20.0, +) + + +# Override timeout for a specific method +client.agents.start(..., request_options={ + "timeout_in_seconds": 1 +}) +``` + +### Custom Client + +You can override the `httpx` client to customize it for your use-case. Some common use-cases include support for proxies +and transports. + +```python +import httpx +from agora_agent import Agora + +client = Agora( + ..., + httpx_client=httpx.Client( + proxy="http://my.test.proxy.example.com", + transport=httpx.HTTPTransport(local_address="0.0.0.0"), + ), +) +``` + +## Contributing + +While we value open-source contributions to this SDK, this library is generated programmatically. +Additions made directly to this library would have to be moved over to our generation code, +otherwise they would be overwritten upon the next generated release. Feel free to open a PR as +a proof of concept, but know that we will not be able to merge it as-is. We suggest opening +an issue first to discuss with us! + +On the other hand, contributions to the README are always very welcome! diff --git a/changelog.md b/changelog.md index 1ecf085..dac6d62 100644 --- a/changelog.md +++ b/changelog.md @@ -4,6 +4,27 @@ All notable changes to this project will be documented in this file. 
The format is based on [Keep a Changelog](https://keepachangelog.com/). +## [v1.4.0] — 2026-05-13 + +### Added + +- **`DeepgramTTS`** — New TTS vendor wrapper for Deepgram (Beta). Accepts `api_key`, `model`, `base_url`, `sample_rate`, `params`, and `skip_patterns`. +- **`Agent.with_tools(enabled=True)`** — Dedicated builder method to enable MCP tool invocation (`advanced_features.enable_tools`). Replaces the raw `with_advanced_features(AdvancedFeatures(enable_tools=True))` call. +- **LLM vendors: `headers` field** — All four LLM vendors (`OpenAI`, `AzureOpenAI`, `Anthropic`, `Gemini`) now accept an optional `headers: Dict[str, str]` parameter. Use this to pass custom HTTP headers to the LLM provider (e.g., tenant identifiers, routing headers). +- **`AgentSession.think()` / `AsyncAgentSession.think()`** — Send a custom instruction to a running agent through the `agent_management` API. +- **`Agent.with_interruption()`** — Configure the new top-level `interruption` object for unified interruption control. +- **MLLM turn detection** — `OpenAIRealtime`, `GeminiLive`, and `VertexAI` now accept `turn_detection`, which maps to `mllm.turn_detection` and overrides top-level turn detection for MLLM sessions. +- **`audio_scenario` AgentKit support** — `SessionParams` and AgentKit request construction now expose the top-level `parameters.audio_scenario` field. +- **MLLM vendor parity** — `GeminiLive` is documented and exposed as the direct Google Gemini Live API wrapper. + +### Fixed + +- **MiniMax TTS preset stripping** — When a MiniMax reseller preset is inferred (`minimax_speech_2_6_turbo` or `minimax_speech_2_8_turbo`), the `group_id` and `url` fields are now correctly stripped from `tts.params` alongside `key` and `model`. Previously they were forwarded to the API, causing request failures. +- **MLLM enable flag** — `Agent.with_mllm()` now sets `mllm.enable = True` and removes the deprecated `advanced_features.enable_mllm` flag from generated requests. 
+- **MLLM wrapper shape** — MLLM vendors no longer emit removed fields such as `style`; docs and tests now reflect the v2.6 MLLM contract. +- **Preset-backed OpenAI TTS** — `OpenAITTS` no longer requires `api_key` when a reseller preset supplies credentials server-side. +- **AgentKit parity coverage** — Added regression coverage for interruption, MLLM turn detection, Deepgram TTS, LLM headers, and deprecated MLLM flag cleanup. + ## [v1.3.0] — 2026-04-02 ### Added diff --git a/docs/concepts/agent.md b/docs/concepts/agent.md index a8b80ad..0a8b159 100644 --- a/docs/concepts/agent.md +++ b/docs/concepts/agent.md @@ -32,7 +32,7 @@ agent = Agent( | `max_history` | `int` | No | Maximum conversation history length | | `turn_detection` | `TurnDetectionConfig` | No | Turn detection settings | | `sal` | `SalConfig` | No | SAL (Speech Activity Level) configuration | -| `advanced_features` | `Dict[str, Any]` | No | Advanced features (e.g., `{'enable_mllm': True}`) | +| `advanced_features` | `Dict[str, Any]` | No | Advanced features (e.g., `{'enable_rtm': True}`) | | `parameters` | `SessionParams` | No | Additional session parameters | | `geofence` | `GeofenceConfig` | No | Regional access restriction | | `labels` | `Dict[str, str]` | No | Custom key-value labels (returned in callbacks) | @@ -60,7 +60,7 @@ Each `with_*` method returns a **new** `Agent` instance — the original is unch | `with_instructions(text)` | `str` | Override the system prompt | | `with_greeting(text)` | `str` | Override the greeting message | | `with_name(name)` | `str` | Override the agent name | -| `with_turn_detection(config)` | `TurnDetectionConfig` | Override turn detection (use `config.start_of_speech` / `config.end_of_speech` for SOS/EOS) | +| `with_turn_detection(config)` | `TurnDetectionConfig` | Override cascading-flow SOS/EOS detection; use `with_interruption()` for interruption behavior | | `with_sal(config)` | `SalConfig` | Set SAL configuration | | `with_advanced_features(features)` | 
`Dict[str, Any]` | Set advanced features | | `with_parameters(parameters)` | `SessionParams` | Set session parameters | diff --git a/docs/concepts/vendors.md b/docs/concepts/vendors.md index 9fb02c1..7f22d8e 100644 --- a/docs/concepts/vendors.md +++ b/docs/concepts/vendors.md @@ -12,7 +12,7 @@ All vendor classes are available from `agora_agent.agentkit.vendors`: ```python -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT +from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT ``` ## LLM Vendors @@ -50,6 +50,7 @@ Used with `agent.with_tts()`. Each TTS vendor produces audio at a specific sampl | `FishAudioTTS` | Fish Audio | `key`, `reference_id` | — | | `GroqTTS` | Groq | `key` | — | | `MiniMaxTTS` | MiniMax | `key` | — | +| `DeepgramTTS` | Deepgram | `api_key`, `model` | Configurable | | `SarvamTTS` | Sarvam | `api_key` | — | @@ -78,7 +79,6 @@ Used with `agent.with_stt()`. | `AmazonSTT` | Amazon Transcribe | `access_key`, `secret_key`, `region` | | `AssemblyAISTT` | AssemblyAI | `api_key` | | `AresSTT` | Ares | — (all optional) | -| `SonioxSTT` | Soniox | `api_key`, `language` | | `SarvamSTT` | Sarvam | `api_key`, `language` | @@ -94,8 +94,9 @@ Used with `agent.with_mllm()` for the [MLLM flow](../guides/mllm-flow.md). 
These | Class | Provider | Required Parameters | |---|---|---| -| `OpenAIRealtime` | OpenAI Realtime | `api_key` | -| `VertexAI` | Vertex AI (Gemini Live) | `model`, `project_id`, `location`, `adc_credentials_string` | +| `OpenAIRealtime` | OpenAI Realtime | `api_key`; optional `turn_detection` | +| `GeminiLive` | Google Gemini Live API | `api_key`, `model`; optional `turn_detection` | +| `VertexAI` | Vertex AI (Gemini Live) | `model`, `project_id`, `location`, `adc_credentials_string`; optional `turn_detection` | ```python diff --git a/docs/guides/agent-builder-features.md b/docs/guides/agent-builder-features.md index 07e3f24..16efad6 100644 --- a/docs/guides/agent-builder-features.md +++ b/docs/guides/agent-builder-features.md @@ -16,6 +16,7 @@ For string values with a finite set of options (e.g. `data_channel`, `sal_mode`, |---|---|---| | `sal` | `with_sal(config)` | Selective Attention Locking — speaker recognition and noise suppression | | `advanced_features` | `with_advanced_features(features)` | Enable MLLM, RTM, SAL, tools | +| `tools` | `with_tools(enabled=True)` | Enable MCP tool invocation | | `parameters` | `with_parameters(params)` | Silence config, farewell config, data channel | | `failure_message` | `with_failure_message(msg)` | Message spoken when LLM fails | | `max_history` | `with_max_history(n)` | Max conversation turns in LLM context | @@ -60,13 +61,13 @@ from agora_agent.agentkit import Agent, AdvancedFeatures from agora_agent.agentkit.vendors import OpenAIRealtime # MLLM mode (see mllm-flow guide) -agent = Agent(advanced_features=AdvancedFeatures(enable_mllm=True)).with_mllm(OpenAIRealtime(api_key='...')) +agent = Agent().with_mllm(OpenAIRealtime(api_key='...')) # RTM signaling for custom data delivery agent = Agent(advanced_features=AdvancedFeatures(enable_rtm=True)) # Enable tool invocation via MCP -agent = Agent(advanced_features=AdvancedFeatures(enable_tools=True)) +agent = Agent().with_tools() ``` ## Session Parameters @@ -340,5 +341,5 
@@ agent_id = session.start() - [Agent Reference](../reference/agent.md) — full API signatures - [Cascading Flow](./cascading-flow.md) — ASR → LLM → TTS setup -- [MLLM Flow](./mllm-flow.md) — multimodal flow with `enable_mllm` +- [MLLM Flow](./mllm-flow.md) — multimodal flow with `mllm.enable` - [Regional Routing](./regional-routing.md) — client area and geofence diff --git a/docs/guides/low-level-api.md b/docs/guides/low-level-api.md index 1ac5e05..6677b45 100644 --- a/docs/guides/low-level-api.md +++ b/docs/guides/low-level-api.md @@ -8,6 +8,13 @@ description: Direct client.agents.start() usage without the builder pattern. For full control over request payloads you can call the generated clients directly and pass raw types such as `StartAgentsRequestProperties`, `Tts_Elevenlabs`, and `StartAgentsRequestPropertiesAsr`. Use this when you need vendor or options not exposed by the agentkit, or when integrating with generated types from the API spec. +## Raw telephony and phone-number APIs + +AgentKit focuses on realtime agent session helpers. 
Telephony call status, call hangup, and phone-number management are exposed through the generated low-level clients: + +- `client.telephony` for call status and hangup operations +- `client.phone_numbers` for phone-number list, create, retrieve, update, and delete operations + ## Cascading flow (ASR → LLM → TTS) ```python @@ -129,11 +136,6 @@ For real-time audio with OpenAI Realtime or Google Gemini Live, use the MLLM flo ```python from agora_agent import Agora, Area -from agora_agent.agentkit import ( - AdvancedFeatures, - TurnDetectionConfig, - TurnDetectionTypeValues, -) from agora_agent.agents import ( StartAgentsRequestProperties, StartAgentsRequestPropertiesMllm, @@ -159,8 +161,8 @@ client.agents.start( agent_rtc_uid="1001", remote_rtc_uids=["1002"], idle_timeout=120, - advanced_features=AdvancedFeatures(enable_mllm=True), mllm=StartAgentsRequestPropertiesMllm( + enable=True, url="wss://api.openai.com/v1/realtime", api_key="", vendor=StartAgentsRequestPropertiesMllmVendor.OPENAI, @@ -171,18 +173,12 @@ client.agents.start( input_modalities=["audio"], output_modalities=["text", "audio"], greeting_message="Hello! 
I'm ready to chat in real-time.", - ), - turn_detection=TurnDetectionConfig( - type=TurnDetectionTypeValues.SERVER_VAD, # deprecated; use config.end_of_speech instead - threshold=0.5, - silence_duration_ms=500, - ), - tts=StartAgentsRequestPropertiesTts( - vendor=StartAgentsRequestPropertiesTtsVendor.ELEVENLABS, - params={}, - ), - llm=StartAgentsRequestPropertiesLlm( - url="https://api.openai.com/v1/chat/completions", + turn_detection={ + "mode": "server_vad", + "server_vad_config": { + "idle_timeout_ms": 5000, + }, + }, ), ), ) diff --git a/docs/guides/mllm-flow.md b/docs/guides/mllm-flow.md index 31fa34d..15d7c3e 100644 --- a/docs/guides/mllm-flow.md +++ b/docs/guides/mllm-flow.md @@ -13,29 +13,26 @@ Two MLLM vendors are supported: - **OpenAI Realtime** — `gpt-4o-realtime-preview` and related models - **Gemini Live** — direct Google AI API access for audio-native Gemini models -## Required: Enable MLLM Mode +## Enable MLLM Mode -MLLM mode must be explicitly enabled via `advanced_features`: +Call `agent.with_mllm(vendor)` to enable MLLM mode. The builder sets `mllm.enable = True` automatically. ```python -from agora_agent.agentkit import Agent, AdvancedFeatures +from agora_agent.agentkit import Agent agent = Agent( name='realtime-agent', instructions='You are a voice assistant.', - advanced_features=AdvancedFeatures(enable_mllm=True), ) ``` -Without `AdvancedFeatures(enable_mllm=True)`, the SDK treats the session as a cascading flow and requires LLM + TTS vendors. 
- ## OpenAI Realtime ### Sync ```python from agora_agent import Agora, Area -from agora_agent.agentkit import Agent, AdvancedFeatures +from agora_agent.agentkit import Agent from agora_agent.agentkit.vendors import OpenAIRealtime client = Agora( @@ -48,7 +45,6 @@ agent = ( Agent( name='realtime-agent', instructions='You are a helpful voice assistant.', - advanced_features=AdvancedFeatures(enable_mllm=True), ) .with_mllm(OpenAIRealtime( api_key='your-openai-key', @@ -67,7 +63,7 @@ session.stop() ```python import asyncio from agora_agent import AsyncAgora, Area -from agora_agent.agentkit import Agent, AdvancedFeatures +from agora_agent.agentkit import Agent from agora_agent.agentkit.vendors import OpenAIRealtime async def main(): @@ -81,7 +77,6 @@ async def main(): Agent( name='realtime-agent', instructions='You are a helpful voice assistant.', - advanced_features=AdvancedFeatures(enable_mllm=True), ) .with_mllm(OpenAIRealtime( api_key='your-openai-key', @@ -102,7 +97,7 @@ Gemini Live uses a Google AI API key: ```python from agora_agent import Agora, Area -from agora_agent.agentkit import Agent, AdvancedFeatures +from agora_agent.agentkit import Agent from agora_agent.agentkit.vendors import GeminiLive client = Agora( @@ -115,7 +110,6 @@ agent = ( Agent( name='gemini-agent', instructions='You are a helpful multilingual assistant.', - advanced_features=AdvancedFeatures(enable_mllm=True), ) .with_mllm(GeminiLive( api_key='your-google-ai-api-key', diff --git a/docs/reference/agent.md b/docs/reference/agent.md index 87a3b16..0e7f5e8 100644 --- a/docs/reference/agent.md +++ b/docs/reference/agent.md @@ -16,6 +16,7 @@ Agent( name: Optional[str] = None, instructions: Optional[str] = None, turn_detection: Optional[TurnDetectionConfig] = None, + interruption: Optional[InterruptionConfig] = None, sal: Optional[SalConfig] = None, advanced_features: Optional[Dict[str, Any]] = None, parameters: Optional[SessionParams] = None, @@ -34,8 +35,9 @@ Agent( | `name` | `Optional[str]` | 
`None` | Agent name, used as default session name | | `instructions` | `Optional[str]` | `None` | System prompt for the LLM | | `turn_detection` | `Optional[TurnDetectionConfig]` | `None` | Turn detection configuration | +| `interruption` | `Optional[InterruptionConfig]` | `None` | Unified interruption control configuration | | `sal` | `Optional[SalConfig]` | `None` | Speech Activity Level configuration | -| `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_mllm': True}`) | +| `advanced_features` | `Optional[Dict[str, Any]]` | `None` | Advanced features dict (e.g., `{'enable_rtm': True}`) | | `parameters` | `Optional[SessionParams]` | `None` | Additional session parameters | | `greeting` | `Optional[str]` | `None` | Auto-spoken greeting when agent joins | | `failure_message` | `Optional[str]` | `None` | Spoken on error | @@ -81,13 +83,12 @@ agent = Agent().with_stt(DeepgramSTT(api_key='your-key', language='en-US')) ### `with_mllm(vendor: BaseMLLM) -> Agent` -Set the MLLM vendor for multimodal flow. Requires `AdvancedFeatures(enable_mllm=True)`. +Set the MLLM vendor for multimodal flow. Calling `with_mllm()` automatically sets `mllm.enable = True`. ```python -from agora_agent.agentkit import AdvancedFeatures from agora_agent.agentkit.vendors import OpenAIRealtime -agent = Agent(advanced_features=AdvancedFeatures(enable_mllm=True)).with_mllm(OpenAIRealtime(api_key='your-key')) +agent = Agent().with_mllm(OpenAIRealtime(api_key='your-key')) ``` ### `with_avatar(vendor: BaseAvatar) -> Agent` @@ -104,7 +105,11 @@ agent = agent.with_avatar(HeyGenAvatar(api_key='your-key', quality='medium', ago ### `with_turn_detection(config: TurnDetectionConfig) -> Agent` -Override turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for the preferred SOS/EOS model. +Override cascading-flow turn detection settings. Use `config.start_of_speech` and `config.end_of_speech` for SOS/EOS detection. 
Use `with_interruption()` for interruption behavior and MLLM vendor `turn_detection` for MLLM turn detection. + +### `with_interruption(config: InterruptionConfig) -> Agent` + +Configure unified interruption behavior using the top-level `interruption` object. Use this for `start_of_speech` and `keywords` interruption modes. ### `with_instructions(instructions: str) -> Agent` @@ -124,7 +129,11 @@ Set SAL (Selective Attention Locking) configuration. ### `with_advanced_features(features: AdvancedFeatures) -> Agent` -Set advanced features (e.g. `{'enable_mllm': True}`, `{'enable_rtm': True}`). +Set advanced features (e.g. `{'enable_rtm': True}`). + +### `with_tools(enabled: bool = True) -> Agent` + +Enable or disable MCP tool invocation by setting `advanced_features.enable_tools`. ### `with_parameters(parameters: SessionParams) -> Agent` diff --git a/docs/reference/vendors.md b/docs/reference/vendors.md index af596eb..798a7f3 100644 --- a/docs/reference/vendors.md +++ b/docs/reference/vendors.md @@ -10,7 +10,7 @@ All vendor classes are available from `agora_agent.agentkit.vendors`: ```python -from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT, OpenAIRealtime, HeyGenAvatar +from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramTTS, DeepgramSTT, OpenAIRealtime, GeminiLive, HeyGenAvatar ``` --- @@ -31,7 +31,11 @@ from agora_agent.agentkit.vendors import OpenAI, ElevenLabsTTS, DeepgramSTT, Ope | `greeting_message` | `str` | No | `None` | Greeting message | | `failure_message` | `str` | No | `None` | Failure message | | `input_modalities` | `List[str]` | No | `None` | Input modalities | +| `output_modalities` | `List[str]` | No | `None` | Output modalities | | `params` | `Dict[str, Any]` | No | `None` | Additional model parameters | +| `headers` | `Dict[str, str]` | No | `None` | Custom HTTP headers forwarded to the LLM provider | +| `greeting_configs` | `Dict[str, Any]` | No | `None` | Greeting playback configuration | +| 
`template_variables` | `Dict[str, str]` | No | `None` | Template variables for messages | ```python @@ -55,6 +59,11 @@ llm = OpenAI(api_key='your-key', model='gpt-4o-mini', temperature=0.7) | `greeting_message` | `str` | No | `None` | Greeting message | | `failure_message` | `str` | No | `None` | Failure message | | `input_modalities` | `List[str]` | No | `None` | Input modalities | +| `output_modalities` | `List[str]` | No | `None` | Output modalities | +| `params` | `Dict[str, Any]` | No | `None` | Additional model parameters | +| `headers` | `Dict[str, str]` | No | `None` | Custom HTTP headers forwarded to the LLM provider | +| `greeting_configs` | `Dict[str, Any]` | No | `None` | Greeting playback configuration | +| `template_variables` | `Dict[str, str]` | No | `None` | Template variables for messages | ```python @@ -80,6 +89,11 @@ llm = AzureOpenAI( | `greeting_message` | `str` | No | `None` | Greeting message | | `failure_message` | `str` | No | `None` | Failure message | | `input_modalities` | `List[str]` | No | `None` | Input modalities | +| `output_modalities` | `List[str]` | No | `None` | Output modalities | +| `params` | `Dict[str, Any]` | No | `None` | Additional model parameters | +| `headers` | `Dict[str, str]` | No | `None` | Custom HTTP headers forwarded to the LLM provider | +| `greeting_configs` | `Dict[str, Any]` | No | `None` | Greeting playback configuration | +| `template_variables` | `Dict[str, str]` | No | `None` | Template variables for messages | ```python @@ -102,6 +116,11 @@ llm = Anthropic(api_key='your-anthropic-key', model='claude-3-5-sonnet-20241022' | `greeting_message` | `str` | No | `None` | Greeting message | | `failure_message` | `str` | No | `None` | Failure message | | `input_modalities` | `List[str]` | No | `None` | Input modalities | +| `output_modalities` | `List[str]` | No | `None` | Output modalities | +| `params` | `Dict[str, Any]` | No | `None` | Additional model parameters | +| `headers` | `Dict[str, str]` | No | 
`None` | Custom HTTP headers forwarded to the LLM provider | +| `greeting_configs` | `Dict[str, Any]` | No | `None` | Greeting playback configuration | +| `template_variables` | `Dict[str, str]` | No | `None` | Template variables for messages | ```python @@ -182,6 +201,17 @@ Fixed sample rate: 24000 Hz. | `voice_id` | `str` | Yes | — | Amazon Polly voice ID | | `skip_patterns` | `List[int]` | No | `None` | Skip patterns | +### `DeepgramTTS` + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `api_key` | `str` | Yes | — | Deepgram API key | +| `model` | `str` | Yes | — | Deepgram TTS model (e.g., `aura-2-thalia-en`) | +| `base_url` | `str` | No | `None` | WebSocket endpoint; defaults server-side to `wss://api.deepgram.com/v1/speak` | +| `sample_rate` | `int` | No | `None` | Sample rate in Hz (for example, `24000`) | +| `params` | `Dict[str, Any]` | No | `None` | Additional Deepgram TTS parameters | +| `skip_patterns` | `List[int]` | No | `None` | Skip patterns | + ### `HumeAITTS` | Parameter | Type | Required | Default | Description | @@ -313,14 +343,6 @@ Fixed sample rate: 24000 Hz. | `language` | `str` | No | `None` | Language code | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | -### `SonioxSTT` - -| Parameter | Type | Required | Default | Description | -|---|---|---|---|---| -| `api_key` | `str` | Yes | — | Soniox API key | -| `language` | `str` | Yes | — | Language code (e.g., `en`) | -| `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | - ### `SarvamSTT` | Parameter | Type | Required | Default | Description | @@ -348,6 +370,26 @@ Fixed sample rate: 24000 Hz. 
| `output_modalities` | `List[str]` | No | `None` | Output modalities | | `messages` | `List[Dict]` | No | `None` | Conversation messages | | `params` | `Dict[str, Any]` | No | `None` | Additional parameters | +| `turn_detection` | `MllmTurnDetectionConfig` | No | `None` | MLLM turn detection configuration; overrides top-level `turn_detection` | + +### `GeminiLive` + +| Parameter | Type | Required | Default | Description | +|---|---|---|---|---| +| `api_key` | `str` | Yes | — | Google Gemini API key | +| `model` | `str` | Yes | — | Gemini Live model name | +| `url` | `str` | No | `None` | Custom WebSocket URL | +| `instructions` | `str` | No | `None` | System instructions | +| `voice` | `str` | No | `None` | Voice name | +| `greeting_message` | `str` | No | `None` | Greeting message | +| `failure_message` | `str` | No | `None` | Message played when the model call fails | +| `max_history` | `int` | No | `None` | Maximum conversation history length | +| `predefined_tools` | `List[str]` | No | `None` | Predefined tools (e.g., `["_publish_message"]`) | +| `input_modalities` | `List[str]` | No | `None` | Input modalities | +| `output_modalities` | `List[str]` | No | `None` | Output modalities | +| `messages` | `List[Dict]` | No | `None` | Conversation messages | +| `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | +| `turn_detection` | `MllmTurnDetectionConfig` | No | `None` | MLLM turn detection configuration; overrides top-level `turn_detection` | ### `VertexAI` @@ -367,6 +409,7 @@ Fixed sample rate: 24000 Hz. 
| `output_modalities` | `List[str]` | No | `None` | Output modalities | | `messages` | `List[Dict]` | No | `None` | Conversation messages | | `additional_params` | `Dict[str, Any]` | No | `None` | Additional parameters | +| `turn_detection` | `MllmTurnDetectionConfig` | No | `None` | MLLM turn detection configuration; overrides top-level `turn_detection` | --- diff --git a/poetry.lock b/poetry.lock index 5092d3a..46f7b7b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -38,13 +38,13 @@ trio = ["trio (>=0.26.1)"] [[package]] name = "certifi" -version = "2026.2.25" +version = "2026.4.22" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.7" files = [ - {file = "certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa"}, - {file = "certifi-2026.2.25.tar.gz", hash = "sha256:e887ab5cee78ea814d3472169153c2d12cd43b14bd03329a39a9c6e2e80bfba7"}, + {file = "certifi-2026.4.22-py3-none-any.whl", hash = "sha256:3cb2210c8f88ba2318d29b0388d1023c8492ff72ecdde4ebdaddbb13a31b1c4a"}, + {file = "certifi-2026.4.22.tar.gz", hash = "sha256:8d455352a37b71bf76a79caa83a3d6c25afee4a385d632127b6afb3963f1c580"}, ] [[package]] @@ -133,17 +133,17 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "idna" -version = "3.11" +version = "3.14" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.8" files = [ - {file = "idna-3.11-py3-none-any.whl", hash = "sha256:771a87f49d9defaf64091e6e6fe9c18d4833f140bd19464795bc32d966ca37ea"}, - {file = "idna-3.11.tar.gz", hash = "sha256:795dafcc9c04ed0c1fb032c2aa73654d8e8c5023a7df64a53f39190ada629902"}, + {file = "idna-3.14-py3-none-any.whl", hash = "sha256:e677eaf072e290f7b725f9acf0b3a2bd55f9fd6f7c70abe5f0e34823d0accf69"}, + {file = "idna-3.14.tar.gz", hash = "sha256:466d810d7a2cc1022bea9b037c39728d51ae7dad40d480fc9b7d7ecf98ba8ee3"}, ] [package.extras] -all = ["flake8 (>=7.1.1)", "mypy 
(>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] +all = ["mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] [[package]] name = "iniconfig" @@ -222,13 +222,13 @@ files = [ [[package]] name = "packaging" -version = "26.0" +version = "26.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.8" files = [ - {file = "packaging-26.0-py3-none-any.whl", hash = "sha256:b36f1fef9334a5588b4166f8bcd26a14e521f2b55e6b9de3aaa80d3ff7a37529"}, - {file = "packaging-26.0.tar.gz", hash = "sha256:00243ae351a257117b6a241061796684b084ed1c516a08c48a3f7e147a9d80b4"}, + {file = "packaging-26.2-py3-none-any.whl", hash = "sha256:5fc45236b9446107ff2415ce77c807cee2862cb6fac22b8a73826d0693b0980e"}, + {file = "packaging-26.2.tar.gz", hash = "sha256:ff452ff5a3e828ce110190feff1178bb1f2ea2281fa2075aadb987c2fb221661"}, ] [[package]] diff --git a/pyproject.toml b/pyproject.toml index b2149d7..6104af4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ name = "agora-agent-server-sdk" [tool.poetry] name = "agora-agent-server-sdk" -version = "1.3.0" +version = "v1.4.0" description = "" readme = "README.md" authors = [] diff --git a/reference.md b/reference.md index 43c3768..356b016 100644 --- a/reference.md +++ b/reference.md @@ -916,6 +916,165 @@ client.agents.interrupt( + + + + +## Agent Management +
client.agent_management.agent_think(...) +
+
+ +#### 📝 Description + +
+
+ +
+
+ +Send a custom text instruction to the specified conversational AI agent instance. + +The instruction is injected into the current conversation pipeline as user input, and the agent processes and responds to it following the standard user input logic. + +Use this endpoint for the following scenarios: +- **Implicit instruction injection**: Inject hidden context or directives into the conversation. +- **Client-side event triggering**: Notify the agent of client-side events, such as a user clicking a button. +- **Voice and text collaboration**: Combine text instructions with voice input for richer interaction. +
+
+
+
+ +#### 🔌 Usage + +
+
+ +
+
+ +```python +from agora_agent import Agora + +client = Agora( + authorization="YOUR_AUTHORIZATION", + username="YOUR_USERNAME", + password="YOUR_PASSWORD", +) +client.agent_management.agent_think( + appid="appid", + agent_id="agentId", + text="The user just clicked the purchase button.", + on_listening_action="inject", + on_thinking_action="interrupt", + on_speaking_action="ignore", + interruptable=True, + metadata={"publisher": "user123", "model": "deepseek-r1"}, +) + +``` +
+
+
+
+ +#### ⚙️ Parameters + +
+
+ +
+
+ +**appid:** `str` — The App ID of the project. + +
+
+ +
+
+ +**agent_id:** `str` — The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + +
+
+ +
+
+ +**text:** `str` — The custom instruction text to inject into the current conversation pipeline. The system processes this as user input. + +
+
+ +
+
+ +**on_listening_action:** `typing.Optional[AgentThinkAgentManagementRequestOnListeningAction]` + +The action to take when the agent is in a listening state: +- `inject`: Inject the custom text instruction into the current turn without interrupting it. +- `ignore`: Ignore the request. + +
+
+ +
+
+ +**on_thinking_action:** `typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction]` + +The action to take when the agent is in a thinking state: +- `interrupt`: Interrupt the current state and start a new conversation turn. +- `ignore`: Ignore the request. + +
+
+ +
+
+ +**on_speaking_action:** `typing.Optional[AgentThinkAgentManagementRequestOnSpeakingAction]` + +The action to take when the agent is in a speaking state: +- `interrupt`: Interrupt the current state and start a new conversation turn. +- `ignore`: Ignore the request. + +
+
+ +
+
+ +**interruptable:** `typing.Optional[bool]` + +Whether user speech can interrupt the injected instruction: +- `true`: User speech can interrupt the instruction. +- `false`: User speech cannot interrupt the instruction. + +
+
+ +
+
+ +**metadata:** `typing.Optional[typing.Dict[str, str]]` — Custom metadata in key-value pair format. Use this field to pass additional business information such as identifiers or model references. + +
+
+ +
+
+ +**request_options:** `typing.Optional[RequestOptions]` — Request-specific configuration. + +
+
+
+
+ +
diff --git a/scripts/validate_docs.py b/scripts/validate_docs.py deleted file mode 100644 index 4d4e3d3..0000000 --- a/scripts/validate_docs.py +++ /dev/null @@ -1,79 +0,0 @@ -import ast -import pathlib -import re -import sys -from typing import List, Optional - -ROOT = pathlib.Path.cwd() -BANNED_PATTERNS = [ - re.compile(r'\{\{\s*owner\s*\}\}'), - re.compile(r'\{\{\s*repo\s*\}\}'), - re.compile(r'from agora-agent-server-sdk'), -] -# `concepts` and `reference` snippets must declare whether they are runnable examples or API fragments. -CODE_BLOCK_RE = re.compile( - r'(?:()[ \t]*\n)?```python\n([\s\S]*?)```' -) - - -def collect_markdown_files() -> List[pathlib.Path]: - return [ROOT / 'README.md', *sorted((ROOT / 'docs').rglob('*.md'))] - - -def is_annotated_section(file: pathlib.Path) -> bool: - relative = file.relative_to(ROOT).as_posix() - return '/docs/concepts/' in f'/{relative}' or '/docs/reference/' in f'/{relative}' - - -def snippet_mode(code: str, annotation: Optional[str]) -> str: - if annotation == 'fragment': - return 'fragment' - if annotation == 'executable': - return 'executable' - return 'executable' - - -MARKDOWN_FILES = collect_markdown_files() - -failures: List[str] = [] -snippet_count = 0 -fragment_count = 0 - -for file in MARKDOWN_FILES: - content = file.read_text(encoding='utf-8') - - for pattern in BANNED_PATTERNS: - if pattern.search(content): - failures.append(f"{file.relative_to(ROOT)} contains banned pattern: {pattern.pattern}") - - for match in CODE_BLOCK_RE.finditer(content): - annotation = match.group(2) - code = match.group(3) - if is_annotated_section(file) and not annotation: - failures.append(f"{file.relative_to(ROOT)} contains an unannotated python snippet") - continue - - mode = snippet_mode(code, annotation) - if mode == 'fragment': - fragment_count += 1 - continue - - snippet_count += 1 - try: - ast.parse(code, filename=str(file)) - except SyntaxError as exc: - failures.append(f"{file.relative_to(ROOT)}:{exc.lineno}: {exc.msg}") - 
-if snippet_count == 0: - failures.append('No Python code blocks found in README/docs markdown.') - -if failures: - print('Documentation validation failed:', file=sys.stderr) - for failure in failures: - print(f'- {failure}', file=sys.stderr) - raise SystemExit(1) - -print( - f'Validated {snippet_count} executable and {fragment_count} fragment Python snippets across ' - f'{len(MARKDOWN_FILES)} markdown files.' -) diff --git a/src/agora_agent/agent_management/__init__.py b/src/agora_agent/agent_management/__init__.py new file mode 100644 index 0000000..5cde020 --- /dev/null +++ b/src/agora_agent/agent_management/__init__.py @@ -0,0 +1,4 @@ +# This file was auto-generated by Fern from our API Definition. + +# isort: skip_file + diff --git a/src/agora_agent/agent_management/client.py b/src/agora_agent/agent_management/client.py new file mode 100644 index 0000000..71b3c62 --- /dev/null +++ b/src/agora_agent/agent_management/client.py @@ -0,0 +1,256 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper +from ..core.request_options import RequestOptions +from .raw_client import AsyncRawAgentManagementClient, RawAgentManagementClient +from .types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, +) +from .types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, +) +from .types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, +) +from .types.agent_think_agent_management_response import AgentThinkAgentManagementResponse + +# this is used as the default value for optional parameters +OMIT = typing.cast(typing.Any, ...) 
+ + +class AgentManagementClient: + def __init__(self, *, client_wrapper: SyncClientWrapper): + self._raw_client = RawAgentManagementClient(client_wrapper=client_wrapper) + + @property + def with_raw_response(self) -> RawAgentManagementClient: + """ + Retrieves a raw implementation of this client that returns raw responses. + + Returns + ------- + RawAgentManagementClient + """ + return self._raw_client + + def agent_think( + self, + appid: str, + agent_id: str, + *, + text: str, + on_listening_action: typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] = OMIT, + on_thinking_action: typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] = OMIT, + on_speaking_action: typing.Optional[AgentThinkAgentManagementRequestOnSpeakingAction] = OMIT, + interruptable: typing.Optional[bool] = OMIT, + metadata: typing.Optional[typing.Dict[str, str]] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> AgentThinkAgentManagementResponse: + """ + Send a custom text instruction to the specified conversational AI agent instance. + + The instruction is injected into the current conversation pipeline as user input, and the agent processes and responds to it following the standard user input logic. + + Use this endpoint for the following scenarios: + - **Implicit instruction injection**: Inject hidden context or directives into the conversation. + - **Client-side event triggering**: Notify the agent of client-side events, such as a user clicking a button. + - **Voice and text collaboration**: Combine text instructions with voice input for richer interaction. + + Parameters + ---------- + appid : str + The App ID of the project. + + agent_id : str + The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + + text : str + The custom instruction text to inject into the current conversation pipeline. The system processes this as user input. 
+ + on_listening_action : typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] + The action to take when the agent is in a listening state: + - `inject`: Inject the custom text instruction into the current turn without interrupting it. + - `ignore`: Ignore the request. + + on_thinking_action : typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] + The action to take when the agent is in a thinking state: + - `interrupt`: Interrupt the current state and start a new conversation turn. + - `ignore`: Ignore the request. + + on_speaking_action : typing.Optional[AgentThinkAgentManagementRequestOnSpeakingAction] + The action to take when the agent is in a speaking state: + - `interrupt`: Interrupt the current state and start a new conversation turn. + - `ignore`: Ignore the request. + + interruptable : typing.Optional[bool] + Whether user speech can interrupt the injected instruction: + - `true`: User speech can interrupt the instruction. + - `false`: User speech cannot interrupt the instruction. + + metadata : typing.Optional[typing.Dict[str, str]] + Custom metadata in key-value pair format. Use this field to pass additional business information such as identifiers or model references. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + AgentThinkAgentManagementResponse + Request was successful. The response body contains the result of the request. 
+ + Examples + -------- + from agora_agent import Agora + + client = Agora( + authorization="YOUR_AUTHORIZATION", + username="YOUR_USERNAME", + password="YOUR_PASSWORD", + ) + client.agent_management.agent_think( + appid="appid", + agent_id="agentId", + text="The user just clicked the purchase button.", + on_listening_action="inject", + on_thinking_action="interrupt", + on_speaking_action="ignore", + interruptable=True, + metadata={"publisher": "user123", "model": "deepseek-r1"}, + ) + """ + _response = self._raw_client.agent_think( + appid, + agent_id, + text=text, + on_listening_action=on_listening_action, + on_thinking_action=on_thinking_action, + on_speaking_action=on_speaking_action, + interruptable=interruptable, + metadata=metadata, + request_options=request_options, + ) + return _response.data + + +class AsyncAgentManagementClient: + def __init__(self, *, client_wrapper: AsyncClientWrapper): + self._raw_client = AsyncRawAgentManagementClient(client_wrapper=client_wrapper) + + @property + def with_raw_response(self) -> AsyncRawAgentManagementClient: + """ + Retrieves a raw implementation of this client that returns raw responses. + + Returns + ------- + AsyncRawAgentManagementClient + """ + return self._raw_client + + async def agent_think( + self, + appid: str, + agent_id: str, + *, + text: str, + on_listening_action: typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] = OMIT, + on_thinking_action: typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] = OMIT, + on_speaking_action: typing.Optional[AgentThinkAgentManagementRequestOnSpeakingAction] = OMIT, + interruptable: typing.Optional[bool] = OMIT, + metadata: typing.Optional[typing.Dict[str, str]] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> AgentThinkAgentManagementResponse: + """ + Send a custom text instruction to the specified conversational AI agent instance. 
+ + The instruction is injected into the current conversation pipeline as user input, and the agent processes and responds to it following the standard user input logic. + + Use this endpoint for the following scenarios: + - **Implicit instruction injection**: Inject hidden context or directives into the conversation. + - **Client-side event triggering**: Notify the agent of client-side events, such as a user clicking a button. + - **Voice and text collaboration**: Combine text instructions with voice input for richer interaction. + + Parameters + ---------- + appid : str + The App ID of the project. + + agent_id : str + The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + + text : str + The custom instruction text to inject into the current conversation pipeline. The system processes this as user input. + + on_listening_action : typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] + The action to take when the agent is in a listening state: + - `inject`: Inject the custom text instruction into the current turn without interrupting it. + - `ignore`: Ignore the request. + + on_thinking_action : typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] + The action to take when the agent is in a thinking state: + - `interrupt`: Interrupt the current state and start a new conversation turn. + - `ignore`: Ignore the request. + + on_speaking_action : typing.Optional[AgentThinkAgentManagementRequestOnSpeakingAction] + The action to take when the agent is in a speaking state: + - `interrupt`: Interrupt the current state and start a new conversation turn. + - `ignore`: Ignore the request. + + interruptable : typing.Optional[bool] + Whether user speech can interrupt the injected instruction: + - `true`: User speech can interrupt the instruction. + - `false`: User speech cannot interrupt the instruction. + + metadata : typing.Optional[typing.Dict[str, str]] + Custom metadata in key-value pair format. 
Use this field to pass additional business information such as identifiers or model references. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + AgentThinkAgentManagementResponse + Request was successful. The response body contains the result of the request. + + Examples + -------- + import asyncio + + from agora_agent import AsyncAgora + + client = AsyncAgora( + authorization="YOUR_AUTHORIZATION", + username="YOUR_USERNAME", + password="YOUR_PASSWORD", + ) + + + async def main() -> None: + await client.agent_management.agent_think( + appid="appid", + agent_id="agentId", + text="The user just clicked the purchase button.", + on_listening_action="inject", + on_thinking_action="interrupt", + on_speaking_action="ignore", + interruptable=True, + metadata={"publisher": "user123", "model": "deepseek-r1"}, + ) + + + asyncio.run(main()) + """ + _response = await self._raw_client.agent_think( + appid, + agent_id, + text=text, + on_listening_action=on_listening_action, + on_thinking_action=on_thinking_action, + on_speaking_action=on_speaking_action, + interruptable=interruptable, + metadata=metadata, + request_options=request_options, + ) + return _response.data diff --git a/src/agora_agent/agent_management/raw_client.py b/src/agora_agent/agent_management/raw_client.py new file mode 100644 index 0000000..03a0838 --- /dev/null +++ b/src/agora_agent/agent_management/raw_client.py @@ -0,0 +1,228 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing +from json.decoder import JSONDecodeError + +from ..core.api_error import ApiError +from ..core.client_wrapper import AsyncClientWrapper, SyncClientWrapper +from ..core.http_response import AsyncHttpResponse, HttpResponse +from ..core.jsonable_encoder import jsonable_encoder +from ..core.request_options import RequestOptions +from ..core.unchecked_base_model import construct_type +from .types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction, +) +from .types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction, +) +from .types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction, +) +from .types.agent_think_agent_management_response import AgentThinkAgentManagementResponse + +# this is used as the default value for optional parameters +OMIT = typing.cast(typing.Any, ...) + + +class RawAgentManagementClient: + def __init__(self, *, client_wrapper: SyncClientWrapper): + self._client_wrapper = client_wrapper + + def agent_think( + self, + appid: str, + agent_id: str, + *, + text: str, + on_listening_action: typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] = OMIT, + on_thinking_action: typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] = OMIT, + on_speaking_action: typing.Optional[AgentThinkAgentManagementRequestOnSpeakingAction] = OMIT, + interruptable: typing.Optional[bool] = OMIT, + metadata: typing.Optional[typing.Dict[str, str]] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> HttpResponse[AgentThinkAgentManagementResponse]: + """ + Send a custom text instruction to the specified conversational AI agent instance. + + The instruction is injected into the current conversation pipeline as user input, and the agent processes and responds to it following the standard user input logic. 
+ + Use this endpoint for the following scenarios: + - **Implicit instruction injection**: Inject hidden context or directives into the conversation. + - **Client-side event triggering**: Notify the agent of client-side events, such as a user clicking a button. + - **Voice and text collaboration**: Combine text instructions with voice input for richer interaction. + + Parameters + ---------- + appid : str + The App ID of the project. + + agent_id : str + The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + + text : str + The custom instruction text to inject into the current conversation pipeline. The system processes this as user input. + + on_listening_action : typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] + The action to take when the agent is in a listening state: + - `inject`: Inject the custom text instruction into the current turn without interrupting it. + - `ignore`: Ignore the request. + + on_thinking_action : typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] + The action to take when the agent is in a thinking state: + - `interrupt`: Interrupt the current state and start a new conversation turn. + - `ignore`: Ignore the request. + + on_speaking_action : typing.Optional[AgentThinkAgentManagementRequestOnSpeakingAction] + The action to take when the agent is in a speaking state: + - `interrupt`: Interrupt the current state and start a new conversation turn. + - `ignore`: Ignore the request. + + interruptable : typing.Optional[bool] + Whether user speech can interrupt the injected instruction: + - `true`: User speech can interrupt the instruction. + - `false`: User speech cannot interrupt the instruction. + + metadata : typing.Optional[typing.Dict[str, str]] + Custom metadata in key-value pair format. Use this field to pass additional business information such as identifiers or model references. 
+ + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + HttpResponse[AgentThinkAgentManagementResponse] + Request was successful. The response body contains the result of the request. + """ + _response = self._client_wrapper.httpx_client.request( + f"v2/projects/{jsonable_encoder(appid)}/agents/{jsonable_encoder(agent_id)}/think", + method="POST", + json={ + "text": text, + "on_listening_action": on_listening_action, + "on_thinking_action": on_thinking_action, + "on_speaking_action": on_speaking_action, + "interruptable": interruptable, + "metadata": metadata, + }, + headers={ + "content-type": "application/json", + }, + request_options=request_options, + omit=OMIT, + ) + try: + if 200 <= _response.status_code < 300: + _data = typing.cast( + AgentThinkAgentManagementResponse, + construct_type( + type_=AgentThinkAgentManagementResponse, # type: ignore + object_=_response.json(), + ), + ) + return HttpResponse(response=_response, data=_data) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text) + raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json) + + +class AsyncRawAgentManagementClient: + def __init__(self, *, client_wrapper: AsyncClientWrapper): + self._client_wrapper = client_wrapper + + async def agent_think( + self, + appid: str, + agent_id: str, + *, + text: str, + on_listening_action: typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] = OMIT, + on_thinking_action: typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] = OMIT, + on_speaking_action: typing.Optional[AgentThinkAgentManagementRequestOnSpeakingAction] = OMIT, + interruptable: typing.Optional[bool] = OMIT, + metadata: typing.Optional[typing.Dict[str, str]] = OMIT, + request_options: typing.Optional[RequestOptions] = None, + ) -> 
AsyncHttpResponse[AgentThinkAgentManagementResponse]: + """ + Send a custom text instruction to the specified conversational AI agent instance. + + The instruction is injected into the current conversation pipeline as user input, and the agent processes and responds to it following the standard user input logic. + + Use this endpoint for the following scenarios: + - **Implicit instruction injection**: Inject hidden context or directives into the conversation. + - **Client-side event triggering**: Notify the agent of client-side events, such as a user clicking a button. + - **Voice and text collaboration**: Combine text instructions with voice input for richer interaction. + + Parameters + ---------- + appid : str + The App ID of the project. + + agent_id : str + The agent instance ID you obtained after successfully calling `join` to start a conversational AI agent. + + text : str + The custom instruction text to inject into the current conversation pipeline. The system processes this as user input. + + on_listening_action : typing.Optional[AgentThinkAgentManagementRequestOnListeningAction] + The action to take when the agent is in a listening state: + - `inject`: Inject the custom text instruction into the current turn without interrupting it. + - `ignore`: Ignore the request. + + on_thinking_action : typing.Optional[AgentThinkAgentManagementRequestOnThinkingAction] + The action to take when the agent is in a thinking state: + - `interrupt`: Interrupt the current state and start a new conversation turn. + - `ignore`: Ignore the request. + + on_speaking_action : typing.Optional[AgentThinkAgentManagementRequestOnSpeakingAction] + The action to take when the agent is in a speaking state: + - `interrupt`: Interrupt the current state and start a new conversation turn. + - `ignore`: Ignore the request. + + interruptable : typing.Optional[bool] + Whether user speech can interrupt the injected instruction: + - `true`: User speech can interrupt the instruction. 
+ - `false`: User speech cannot interrupt the instruction. + + metadata : typing.Optional[typing.Dict[str, str]] + Custom metadata in key-value pair format. Use this field to pass additional business information such as identifiers or model references. + + request_options : typing.Optional[RequestOptions] + Request-specific configuration. + + Returns + ------- + AsyncHttpResponse[AgentThinkAgentManagementResponse] + Request was successful. The response body contains the result of the request. + """ + _response = await self._client_wrapper.httpx_client.request( + f"v2/projects/{jsonable_encoder(appid)}/agents/{jsonable_encoder(agent_id)}/think", + method="POST", + json={ + "text": text, + "on_listening_action": on_listening_action, + "on_thinking_action": on_thinking_action, + "on_speaking_action": on_speaking_action, + "interruptable": interruptable, + "metadata": metadata, + }, + headers={ + "content-type": "application/json", + }, + request_options=request_options, + omit=OMIT, + ) + try: + if 200 <= _response.status_code < 300: + _data = typing.cast( + AgentThinkAgentManagementResponse, + construct_type( + type_=AgentThinkAgentManagementResponse, # type: ignore + object_=_response.json(), + ), + ) + return AsyncHttpResponse(response=_response, data=_data) + _response_json = _response.json() + except JSONDecodeError: + raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response.text) + raise ApiError(status_code=_response.status_code, headers=dict(_response.headers), body=_response_json) diff --git a/src/agora_agent/agent_management/types/__init__.py b/src/agora_agent/agent_management/types/__init__.py new file mode 100644 index 0000000..5cde020 --- /dev/null +++ b/src/agora_agent/agent_management/types/__init__.py @@ -0,0 +1,4 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +# isort: skip_file + diff --git a/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_listening_action.py b/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_listening_action.py new file mode 100644 index 0000000..117d8cc --- /dev/null +++ b/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_listening_action.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AgentThinkAgentManagementRequestOnListeningAction = typing.Union[typing.Literal["inject", "ignore"], typing.Any] diff --git a/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_speaking_action.py b/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_speaking_action.py new file mode 100644 index 0000000..3cfe7a4 --- /dev/null +++ b/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_speaking_action.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AgentThinkAgentManagementRequestOnSpeakingAction = typing.Union[typing.Literal["interrupt", "ignore"], typing.Any] diff --git a/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_thinking_action.py b/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_thinking_action.py new file mode 100644 index 0000000..996272f --- /dev/null +++ b/src/agora_agent/agent_management/types/agent_think_agent_management_request_on_thinking_action.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +AgentThinkAgentManagementRequestOnThinkingAction = typing.Union[typing.Literal["interrupt", "ignore"], typing.Any] diff --git a/src/agora_agent/agent_management/types/agent_think_agent_management_response.py b/src/agora_agent/agent_management/types/agent_think_agent_management_response.py new file mode 100644 index 0000000..7e512eb --- /dev/null +++ b/src/agora_agent/agent_management/types/agent_think_agent_management_response.py @@ -0,0 +1,33 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ...core.pydantic_utilities import IS_PYDANTIC_V2 +from ...core.unchecked_base_model import UncheckedBaseModel + + +class AgentThinkAgentManagementResponse(UncheckedBaseModel): + agent_id: typing.Optional[str] = pydantic.Field(default=None) + """ + Unique identifier of the agent instance. + """ + + channel: typing.Optional[str] = pydantic.Field(default=None) + """ + The name of the RTC channel where the agent is located. + """ + + start_ts: typing.Optional[int] = pydantic.Field(default=None) + """ + Timestamp indicating when the agent was created. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agent_management/types/agent_think_request_on_listening_action.py b/src/agora_agent/agent_management/types/agent_think_request_on_listening_action.py new file mode 100644 index 0000000..54cca4c --- /dev/null +++ b/src/agora_agent/agent_management/types/agent_think_request_on_listening_action.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +AgentThinkRequestOnListeningAction = typing.Union[typing.Literal["inject", "ignore"], typing.Any] diff --git a/src/agora_agent/agent_management/types/agent_think_request_on_speaking_action.py b/src/agora_agent/agent_management/types/agent_think_request_on_speaking_action.py new file mode 100644 index 0000000..8329197 --- /dev/null +++ b/src/agora_agent/agent_management/types/agent_think_request_on_speaking_action.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AgentThinkRequestOnSpeakingAction = typing.Union[typing.Literal["interrupt", "ignore"], typing.Any] diff --git a/src/agora_agent/agent_management/types/agent_think_request_on_thinking_action.py b/src/agora_agent/agent_management/types/agent_think_request_on_thinking_action.py new file mode 100644 index 0000000..ee50877 --- /dev/null +++ b/src/agora_agent/agent_management/types/agent_think_request_on_thinking_action.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +AgentThinkRequestOnThinkingAction = typing.Union[typing.Literal["interrupt", "ignore"], typing.Any] diff --git a/src/agora_agent/agent_management/types/agent_think_response.py b/src/agora_agent/agent_management/types/agent_think_response.py new file mode 100644 index 0000000..3a3c646 --- /dev/null +++ b/src/agora_agent/agent_management/types/agent_think_response.py @@ -0,0 +1,33 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ...core.pydantic_utilities import IS_PYDANTIC_V2 +from ...core.unchecked_base_model import UncheckedBaseModel + + +class AgentThinkResponse(UncheckedBaseModel): + agent_id: typing.Optional[str] = pydantic.Field(default=None) + """ + Unique identifier of the agent instance. + """ + + channel: typing.Optional[str] = pydantic.Field(default=None) + """ + The name of the RTC channel where the agent is located. 
+ """ + + start_ts: typing.Optional[int] = pydantic.Field(default=None) + """ + Timestamp indicating when the agent was created. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agentkit/__init__.py b/src/agora_agent/agentkit/__init__.py index ed95b7c..1942bce 100644 --- a/src/agora_agent/agentkit/__init__.py +++ b/src/agora_agent/agentkit/__init__.py @@ -27,15 +27,33 @@ SalMode, AdvancedFeatures, SessionParams, + SessionParamsInput, SilenceConfig, SilenceAction, FarewellConfig, ParametersDataChannel, + ParametersAudioScenario, + InterruptionConfig, + InterruptionMode, + MllmTurnDetectionConfig, + MllmTurnDetectionMode, LlmGreetingConfigs, LlmGreetingConfigsMode, McpServersItem, ) from .agent_session import AgentSession, AgentSessionOptions, AsyncAgentSession +from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, +) +from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, +) +from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, +) +from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, +) from .avatar_types import ( is_akool_avatar, is_anam_avatar, @@ -46,6 +64,7 @@ ) from .constants import ( DataChannel, + AudioScenario, SilenceActionValues, SalModeValues, GeofenceArea, @@ -87,6 +106,7 @@ CartesiaSampleRate, CartesiaTTS, DeepgramSTT, + DeepgramTTS, ElevenLabsSampleRate, 
ElevenLabsTTS, FishAudioTTS, @@ -151,12 +171,19 @@ "AdvancedFeatures", # Session parameters types "SessionParams", + "SessionParamsInput", "SilenceConfig", "SilenceAction", "FarewellConfig", "ParametersDataChannel", + "ParametersAudioScenario", + "InterruptionConfig", + "InterruptionMode", + "MllmTurnDetectionConfig", + "MllmTurnDetectionMode", # Type-safe constants "DataChannel", + "AudioScenario", "SilenceActionValues", "SalModeValues", "GeofenceArea", @@ -170,6 +197,10 @@ "AgentSession", "AsyncAgentSession", "AgentSessionOptions", + "AgentThinkResponse", + "AgentThinkRequestOnListeningAction", + "AgentThinkRequestOnThinkingAction", + "AgentThinkRequestOnSpeakingAction", "AgentPresets", "DeepgramPresetModels", "OpenAIPresetModels", @@ -201,6 +232,7 @@ "MicrosoftTTS", "OpenAITTS", "CartesiaTTS", + "DeepgramTTS", "GoogleTTS", "AmazonTTS", "HumeAITTS", diff --git a/src/agora_agent/agentkit/agent.py b/src/agora_agent/agentkit/agent.py index 1e256fa..70a1bdd 100644 --- a/src/agora_agent/agentkit/agent.py +++ b/src/agora_agent/agentkit/agent.py @@ -2,6 +2,7 @@ import time import typing +import typing_extensions if typing.TYPE_CHECKING: from .agent_session import AgentSession, AsyncAgentSession @@ -29,6 +30,11 @@ from ..agents.types.start_agents_request_properties_parameters_silence_config_action import StartAgentsRequestPropertiesParametersSilenceConfigAction from ..agents.types.start_agents_request_properties_parameters_farewell_config import StartAgentsRequestPropertiesParametersFarewellConfig from ..agents.types.start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel +from ..agents.types.start_agents_request_properties_parameters_audio_scenario import StartAgentsRequestPropertiesParametersAudioScenario +from ..agents.types.start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption +from ..agents.types.start_agents_request_properties_interruption_mode import 
StartAgentsRequestPropertiesInterruptionMode +from ..agents.types.start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection +from ..agents.types.start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode from ..agents.types.start_agents_request_properties_llm_greeting_configs import StartAgentsRequestPropertiesLlmGreetingConfigs from ..agents.types.start_agents_request_properties_llm_greeting_configs_mode import StartAgentsRequestPropertiesLlmGreetingConfigsMode from ..agents.types.start_agents_request_properties_llm_mcp_servers_item import StartAgentsRequestPropertiesLlmMcpServersItem @@ -82,6 +88,20 @@ SilenceAction = StartAgentsRequestPropertiesParametersSilenceConfigAction FarewellConfig = StartAgentsRequestPropertiesParametersFarewellConfig ParametersDataChannel = StartAgentsRequestPropertiesParametersDataChannel +ParametersAudioScenario = StartAgentsRequestPropertiesParametersAudioScenario +InterruptionConfig = StartAgentsRequestPropertiesInterruption +InterruptionMode = StartAgentsRequestPropertiesInterruptionMode +MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection +MllmTurnDetectionMode = StartAgentsRequestPropertiesMllmTurnDetectionMode + + +class SessionParamsInput(typing_extensions.TypedDict, total=False): + silence_config: StartAgentsRequestPropertiesParametersSilenceConfig + farewell_config: StartAgentsRequestPropertiesParametersFarewellConfig + data_channel: StartAgentsRequestPropertiesParametersDataChannel + enable_metrics: bool + enable_error_message: bool + audio_scenario: ParametersAudioScenario # LLM sub-type aliases LlmGreetingConfigs = StartAgentsRequestPropertiesLlmGreetingConfigs @@ -123,9 +143,10 @@ def __init__( name: typing.Optional[str] = None, instructions: typing.Optional[str] = None, turn_detection: typing.Optional[TurnDetectionConfig] = None, + interruption: typing.Optional[InterruptionConfig] = None, sal: 
typing.Optional[SalConfig] = None, advanced_features: typing.Optional[AdvancedFeatures] = None, - parameters: typing.Optional[SessionParams] = None, + parameters: typing.Optional[typing.Union[SessionParams, SessionParamsInput]] = None, greeting: typing.Optional[str] = None, failure_message: typing.Optional[str] = None, max_history: typing.Optional[int] = None, @@ -147,6 +168,7 @@ def __init__( self._avatar: typing.Optional[typing.Dict[str, typing.Any]] = None self._avatar_required_sample_rate: typing.Optional[int] = None self._turn_detection = turn_detection + self._interruption = interruption self._sal = sal self._advanced_features = advanced_features self._parameters = parameters @@ -174,6 +196,21 @@ def with_stt(self, vendor: BaseSTT) -> "Agent": def with_mllm(self, vendor: BaseMLLM) -> "Agent": new_agent = self._clone() new_agent._mllm = vendor.to_config() + if isinstance(new_agent._mllm, dict): + new_agent._mllm["enable"] = True + if isinstance(new_agent._advanced_features, dict): + advanced_features = {key: value for key, value in new_agent._advanced_features.items() if key != "enable_mllm"} + new_agent._advanced_features = typing.cast(AdvancedFeatures, advanced_features) if advanced_features else None + elif isinstance(new_agent._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures): + advanced_features_model = new_agent._advanced_features.model_copy(update={"enable_mllm": None}) + if ( + advanced_features_model.enable_rtm is None + and advanced_features_model.enable_sal is None + and advanced_features_model.enable_tools is None + ): + new_agent._advanced_features = None + else: + new_agent._advanced_features = advanced_features_model return new_agent def with_avatar(self, vendor: BaseAvatar) -> "Agent": @@ -198,6 +235,12 @@ def with_turn_detection(self, config: TurnDetectionConfig) -> "Agent": new_agent._turn_detection = config return new_agent + def with_interruption(self, config: InterruptionConfig) -> "Agent": + """Returns a new Agent with 
unified interruption control configured.""" + new_agent = self._clone() + new_agent._interruption = config + return new_agent + def with_instructions(self, instructions: str) -> "Agent": new_agent = self._clone() new_agent._instructions = instructions @@ -222,13 +265,27 @@ def with_sal(self, config: SalConfig) -> "Agent": def with_advanced_features(self, features: AdvancedFeatures) -> "Agent": """Returns a new Agent with the specified advanced features configuration. - Use this to enable MLLM mode (``{"enable_mllm": True}``), RTM, and other features. + Use this to enable RTM and other advanced features. """ new_agent = self._clone() new_agent._advanced_features = features return new_agent - def with_parameters(self, parameters: SessionParams) -> "Agent": + def with_tools(self, enabled: bool = True) -> "Agent": + """Returns a new Agent with MCP tool invocation enabled or disabled.""" + new_agent = self._clone() + if new_agent._advanced_features is None: + new_agent._advanced_features = StartAgentsRequestPropertiesAdvancedFeatures(enable_tools=enabled) + elif isinstance(new_agent._advanced_features, dict): + new_agent._advanced_features = typing.cast( + AdvancedFeatures, + {**new_agent._advanced_features, "enable_tools": enabled}, + ) + else: + new_agent._advanced_features = new_agent._advanced_features.model_copy(update={"enable_tools": enabled}) + return new_agent + + def with_parameters(self, parameters: typing.Union[SessionParams, SessionParamsInput]) -> "Agent": """Returns a new Agent with the specified session parameters. Use this to configure silence behaviour, graceful hang-up, data channel, and more. 
@@ -309,6 +366,10 @@ def mllm(self) -> typing.Optional[typing.Dict[str, typing.Any]]: def turn_detection(self) -> typing.Optional[TurnDetectionConfig]: return self._turn_detection + @property + def interruption(self) -> typing.Optional[InterruptionConfig]: + return self._interruption + @property def instructions(self) -> typing.Optional[str]: return self._instructions @@ -338,7 +399,7 @@ def advanced_features(self) -> typing.Optional[AdvancedFeatures]: return self._advanced_features @property - def parameters(self) -> typing.Optional[SessionParams]: + def parameters(self) -> typing.Optional[typing.Union[SessionParams, SessionParamsInput]]: return self._parameters @property @@ -370,6 +431,7 @@ def config(self) -> typing.Dict[str, typing.Any]: "stt": self._stt, "mllm": self._mllm, "turn_detection": self._turn_detection, + "interruption": self._interruption, "sal": self._sal, "avatar": self._avatar, "advanced_features": self._advanced_features, @@ -491,13 +553,8 @@ def to_properties( **token_kwargs, ) - is_mllm_mode = ( - self._advanced_features is not None - and ( - (isinstance(self._advanced_features, dict) and self._advanced_features.get("enable_mllm") is True) - or (isinstance(self._advanced_features, StartAgentsRequestPropertiesAdvancedFeatures) and self._advanced_features.enable_mllm is True) - ) - ) + mllm_flag = isinstance(self._mllm, dict) and self._mllm.get("enable") is True + is_mllm_mode = bool(mllm_flag or self._mllm is not None) base_kwargs: typing.Dict[str, typing.Any] = { "channel": channel, @@ -514,6 +571,8 @@ def to_properties( base_kwargs["mllm"] = self._mllm if self._turn_detection is not None: base_kwargs["turn_detection"] = self._turn_detection + if self._interruption is not None: + base_kwargs["interruption"] = self._interruption if self._sal is not None: base_kwargs["sal"] = self._sal if self._avatar is not None: @@ -521,7 +580,10 @@ def to_properties( if self._advanced_features is not None: base_kwargs["advanced_features"] = 
self._advanced_features if self._parameters is not None: - base_kwargs["parameters"] = self._parameters + if isinstance(self._parameters, dict): + base_kwargs["parameters"] = StartAgentsRequestPropertiesParameters(**self._parameters) + else: + base_kwargs["parameters"] = self._parameters if self._geofence is not None: base_kwargs["geofence"] = self._geofence if self._labels is not None: @@ -582,6 +644,7 @@ def _clone(self) -> "Agent": new_agent._avatar = self._avatar new_agent._avatar_required_sample_rate = self._avatar_required_sample_rate new_agent._turn_detection = self._turn_detection + new_agent._interruption = self._interruption new_agent._sal = self._sal new_agent._advanced_features = self._advanced_features new_agent._parameters = self._parameters diff --git a/src/agora_agent/agentkit/agent_session.py b/src/agora_agent/agentkit/agent_session.py index 7af4cf2..2408659 100644 --- a/src/agora_agent/agentkit/agent_session.py +++ b/src/agora_agent/agentkit/agent_session.py @@ -2,6 +2,18 @@ import warnings from ..core.api_error import ApiError +from ..agent_management.types.agent_think_agent_management_request_on_listening_action import ( + AgentThinkAgentManagementRequestOnListeningAction as AgentThinkRequestOnListeningAction, +) +from ..agent_management.types.agent_think_agent_management_request_on_speaking_action import ( + AgentThinkAgentManagementRequestOnSpeakingAction as AgentThinkRequestOnSpeakingAction, +) +from ..agent_management.types.agent_think_agent_management_request_on_thinking_action import ( + AgentThinkAgentManagementRequestOnThinkingAction as AgentThinkRequestOnThinkingAction, +) +from ..agent_management.types.agent_think_agent_management_response import ( + AgentThinkAgentManagementResponse as AgentThinkResponse, +) from ..agents.types.start_agents_request_properties import StartAgentsRequestProperties from .agent import Agent from .avatar_types import ( @@ -126,6 +138,11 @@ def raw(self) -> typing.Any: """ return self._client.agents + 
@property + def raw_agent_management(self) -> typing.Any: + """Direct access to the underlying Fern-generated AgentManagement client.""" + return self._client.agent_management + # ------------------------------------------------------------------ # Internal helpers # ------------------------------------------------------------------ @@ -205,10 +222,10 @@ def _dump_model(value: typing.Any) -> typing.Any: return value def _is_mllm_mode(self) -> bool: - advanced_features = self._agent.advanced_features - if isinstance(advanced_features, dict): - return advanced_features.get("enable_mllm") is True - return bool(getattr(advanced_features, "enable_mllm", False)) + mllm = self._agent.mllm + if isinstance(mllm, dict) and mllm.get("enable") is True: + return True + return mllm is not None def _build_start_properties(self, token_opts: typing.Dict[str, typing.Any]) -> typing.Dict[str, typing.Any]: base_properties = self._agent.to_properties( @@ -457,6 +474,41 @@ def interrupt(self) -> None: self._app_id, self._agent_id, request_options=self._request_options() ) + def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline.""" + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not 
None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + def update(self, properties: typing.Any) -> None: """Update the agent configuration at runtime. @@ -672,6 +724,41 @@ async def interrupt(self) -> None: self._app_id, self._agent_id, request_options=self._request_options() ) + async def think( + self, + text: str, + *, + on_listening_action: typing.Optional[AgentThinkRequestOnListeningAction] = None, + on_thinking_action: typing.Optional[AgentThinkRequestOnThinkingAction] = None, + on_speaking_action: typing.Optional[AgentThinkRequestOnSpeakingAction] = None, + interruptable: typing.Optional[bool] = None, + metadata: typing.Optional[typing.Dict[str, str]] = None, + ) -> AgentThinkResponse: + """Inject a custom text instruction into the current session pipeline.""" + if self._status != "running": + raise RuntimeError(f"Cannot think in {self._status} state") + if not self._agent_id: + raise RuntimeError("No agent ID available") + + kwargs: typing.Dict[str, typing.Any] = {"text": text} + if on_listening_action is not None: + kwargs["on_listening_action"] = on_listening_action + if on_thinking_action is not None: + kwargs["on_thinking_action"] = on_thinking_action + if on_speaking_action is not None: + kwargs["on_speaking_action"] = on_speaking_action + if interruptable is not None: + kwargs["interruptable"] = interruptable + if metadata is not None: + kwargs["metadata"] = metadata + + return await self._client.agent_management.agent_think( + self._app_id, + self._agent_id, + request_options=self._request_options(), + **kwargs, + ) + async def update(self, properties: typing.Any) -> None: """Update the agent configuration at runtime. 
diff --git a/src/agora_agent/agentkit/constants.py b/src/agora_agent/agentkit/constants.py index eb63ee0..f86e4d3 100644 --- a/src/agora_agent/agentkit/constants.py +++ b/src/agora_agent/agentkit/constants.py @@ -8,6 +8,11 @@ class DataChannel: RTM = "rtm" DATASTREAM = "datastream" +class AudioScenario: + DEFAULT = "default" + CHORUS = "chorus" + AISERVER = "aiserver" + # Silence action when timeout elapses: "speak" | "think" # (Use for parameters.silence_config.action — avoids shadowing SilenceAction type) diff --git a/src/agora_agent/agentkit/presets.py b/src/agora_agent/agentkit/presets.py index d73f15c..dcd9680 100644 --- a/src/agora_agent/agentkit/presets.py +++ b/src/agora_agent/agentkit/presets.py @@ -169,6 +169,8 @@ def strip_inferred_preset_fields(properties: typing.Dict[str, typing.Any], infer if inferred_preset == _MINIMAX_MODEL_TO_PRESET.get(_normalize_model_name(params.get("model")) or ""): params["model"] = None params["key"] = None + params["group_id"] = None + params["url"] = None tts = {k: v for k, v in {**tts, "params": _omit_none(params)}.items() if v is not None} return {**properties, "asr": asr, "llm": llm, "tts": tts} diff --git a/src/agora_agent/agentkit/vendors/__init__.py b/src/agora_agent/agentkit/vendors/__init__.py index 589c979..0320843 100644 --- a/src/agora_agent/agentkit/vendors/__init__.py +++ b/src/agora_agent/agentkit/vendors/__init__.py @@ -28,6 +28,7 @@ from .tts import ( AmazonTTS, CartesiaTTS, + DeepgramTTS, ElevenLabsTTS, FishAudioTTS, GoogleTTS, @@ -60,6 +61,7 @@ "MicrosoftTTS", "OpenAITTS", "CartesiaTTS", + "DeepgramTTS", "GoogleTTS", "AmazonTTS", "HumeAITTS", diff --git a/src/agora_agent/agentkit/vendors/avatar.py b/src/agora_agent/agentkit/vendors/avatar.py index 74f85ad..b83a356 100644 --- a/src/agora_agent/agentkit/vendors/avatar.py +++ b/src/agora_agent/agentkit/vendors/avatar.py @@ -1,3 +1,4 @@ +import warnings from typing import Any, Dict, Optional from pydantic import BaseModel, ConfigDict, Field, field_validator @@ 
-31,7 +32,14 @@ def validate_quality(cls, v: str) -> str: return v class HeyGenAvatar(BaseAvatar): + """Deprecated: HeyGen has been renamed to LiveAvatar. Use LiveAvatarAvatar instead.""" + def __init__(self, **kwargs: Any): + warnings.warn( + "HeyGenAvatar is deprecated; use LiveAvatarAvatar instead.", + DeprecationWarning, + stacklevel=2, + ) self.options = HeyGenAvatarOptions(**kwargs) @property diff --git a/src/agora_agent/agentkit/vendors/base.py b/src/agora_agent/agentkit/vendors/base.py index 7b8a5ca..f4c4ce0 100644 --- a/src/agora_agent/agentkit/vendors/base.py +++ b/src/agora_agent/agentkit/vendors/base.py @@ -65,8 +65,8 @@ class BaseMLLM(ABC): """Abstract base class for all MLLM (multimodal LLM) vendor implementations. When an MLLM is configured via :meth:`~agora_agent.agentkit.Agent.with_mllm`, - the ``enable_mllm`` flag is set on the request and the ``llm``/``tts`` fields - are omitted. Subclasses must implement :meth:`to_config` to return a dict + the ``mllm.enable`` flag is set on the request and the ``llm``/``tts`` fields + are omitted. Subclasses must implement :meth:`to_config` to return a dict that maps to the ``mllm`` field of the payload. 
""" diff --git a/src/agora_agent/agentkit/vendors/llm.py b/src/agora_agent/agentkit/vendors/llm.py index 34c4ce2..7465c9f 100644 --- a/src/agora_agent/agentkit/vendors/llm.py +++ b/src/agora_agent/agentkit/vendors/llm.py @@ -29,6 +29,7 @@ class OpenAIOptions(BaseModel): failure_message: Optional[str] = Field(default=None) input_modalities: Optional[List[str]] = Field(default=None) params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) output_modalities: Optional[List[str]] = Field(default=None) greeting_configs: Optional[Dict[str, Any]] = Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) @@ -61,6 +62,8 @@ def to_config(self) -> Dict[str, Any]: } if self.options.api_key is not None: config["api_key"] = self.options.api_key + if self.options.headers is not None: + config["headers"] = self.options.headers if self.options.system_messages is not None: config["system_messages"] = self.options.system_messages @@ -99,6 +102,7 @@ class AzureOpenAIOptions(BaseModel): failure_message: Optional[str] = Field(default=None) input_modalities: Optional[List[str]] = Field(default=None) params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) output_modalities: Optional[List[str]] = Field(default=None) greeting_configs: Optional[Dict[str, Any]] = Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) @@ -134,6 +138,8 @@ def to_config(self) -> Dict[str, Any]: params["max_tokens"] = self.options.max_tokens if params: config["params"] = params + if self.options.headers is not None: + config["headers"] = self.options.headers if self.options.system_messages is not None: config["system_messages"] = self.options.system_messages @@ -169,6 +175,7 @@ class AnthropicOptions(BaseModel): failure_message: Optional[str] = Field(default=None) input_modalities: Optional[List[str]] = Field(default=None) params: 
Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) output_modalities: Optional[List[str]] = Field(default=None) greeting_configs: Optional[Dict[str, Any]] = Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) @@ -200,6 +207,8 @@ def to_config(self) -> Dict[str, Any]: if self.options.system_messages is not None: config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers if self.options.greeting_message is not None: config["greeting_message"] = self.options.greeting_message if self.options.failure_message is not None: @@ -235,6 +244,7 @@ class GeminiOptions(BaseModel): failure_message: Optional[str] = Field(default=None) input_modalities: Optional[List[str]] = Field(default=None) params: Optional[Dict[str, Any]] = Field(default=None) + headers: Optional[Dict[str, str]] = Field(default=None) output_modalities: Optional[List[str]] = Field(default=None) greeting_configs: Optional[Dict[str, Any]] = Field(default=None) template_variables: Optional[Dict[str, str]] = Field(default=None) @@ -268,6 +278,8 @@ def to_config(self) -> Dict[str, Any]: if self.options.system_messages is not None: config["system_messages"] = self.options.system_messages + if self.options.headers is not None: + config["headers"] = self.options.headers if self.options.greeting_message is not None: config["greeting_message"] = self.options.greeting_message if self.options.failure_message is not None: diff --git a/src/agora_agent/agentkit/vendors/mllm.py b/src/agora_agent/agentkit/vendors/mllm.py index 8deb5df..5f6f940 100644 --- a/src/agora_agent/agentkit/vendors/mllm.py +++ b/src/agora_agent/agentkit/vendors/mllm.py @@ -2,8 +2,13 @@ from pydantic import BaseModel, ConfigDict, Field +from ...agents.types.start_agents_request_properties_mllm_turn_detection import ( + StartAgentsRequestPropertiesMllmTurnDetection, +) from .base import 
BaseMLLM +MllmTurnDetectionConfig = StartAgentsRequestPropertiesMllmTurnDetection + class OpenAIRealtimeOptions(BaseModel): model_config = ConfigDict(extra="forbid") @@ -16,6 +21,7 @@ class OpenAIRealtimeOptions(BaseModel): output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") failure_message: Optional[str] = Field(default=None, description="Message played on failure") max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") @@ -27,7 +33,6 @@ def __init__(self, **kwargs: Any): def to_config(self) -> Dict[str, Any]: config: Dict[str, Any] = { "vendor": "openai", - "style": "openai", "api_key": self.options.api_key, } @@ -54,6 +59,8 @@ def to_config(self) -> Dict[str, Any]: config["failure_message"] = self.options.failure_message if self.options.max_history is not None: config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection return config @@ -73,6 +80,7 @@ class VertexAIOptions(BaseModel): output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") 
failure_message: Optional[str] = Field(default=None, description="Message played on failure") max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") @@ -98,7 +106,6 @@ def to_config(self) -> Dict[str, Any]: config: Dict[str, Any] = { "vendor": "vertexai", - "style": "openai", "params": params, } @@ -118,6 +125,8 @@ def to_config(self) -> Dict[str, Any]: config["failure_message"] = self.options.failure_message if self.options.max_history is not None: config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection return config @@ -135,6 +144,7 @@ class GeminiLiveOptions(BaseModel): output_modalities: Optional[List[str]] = Field(default=None, description="Output modalities") messages: Optional[List[Dict[str, Any]]] = Field(default=None, description="Conversation messages") additional_params: Optional[Dict[str, Any]] = Field(default=None, description="Additional parameters") + turn_detection: Optional[MllmTurnDetectionConfig] = Field(default=None, description="MLLM turn detection configuration") predefined_tools: Optional[List[str]] = Field(default=None, description="Predefined tools") failure_message: Optional[str] = Field(default=None, description="Message played on failure") max_history: Optional[int] = Field(default=None, description="Maximum conversation history length") @@ -155,7 +165,6 @@ def to_config(self) -> Dict[str, Any]: config: Dict[str, Any] = { "vendor": "gemini", - "style": "openai", "api_key": self.options.api_key, "params": params, } @@ -176,5 +185,7 @@ def to_config(self) -> Dict[str, Any]: config["failure_message"] = self.options.failure_message if self.options.max_history is not None: config["max_history"] = self.options.max_history + if self.options.turn_detection is not None: + config["turn_detection"] = self.options.turn_detection return config diff --git a/src/agora_agent/agentkit/vendors/tts.py 
b/src/agora_agent/agentkit/vendors/tts.py index 935479e..557ea56 100644 --- a/src/agora_agent/agentkit/vendors/tts.py +++ b/src/agora_agent/agentkit/vendors/tts.py @@ -226,6 +226,42 @@ def to_config(self) -> Dict[str, Any]: return result +class DeepgramTTSOptions(BaseModel): + model_config = ConfigDict(extra="forbid") + + api_key: str = Field(..., description="Deepgram API key") + model: str = Field(..., description="Deepgram TTS model (e.g., 'aura-2-thalia-en')") + base_url: Optional[str] = Field(default=None, description="WebSocket endpoint") + sample_rate: Optional[int] = Field(default=None, description="Sample rate in Hz") + params: Optional[Dict[str, Any]] = Field(default=None, description="Additional Deepgram TTS parameters") + skip_patterns: Optional[List[int]] = Field(default=None) + +class DeepgramTTS(BaseTTS): + def __init__(self, **kwargs: Any): + self.options = DeepgramTTSOptions(**kwargs) + + @property + def sample_rate(self) -> Optional[int]: + return self.options.sample_rate + + def to_config(self) -> Dict[str, Any]: + params: Dict[str, Any] = { + "api_key": self.options.api_key, + "model": self.options.model, + **(self.options.params or {}), + } + + if self.options.base_url is not None: + params["base_url"] = self.options.base_url + if self.options.sample_rate is not None: + params["sample_rate"] = self.options.sample_rate + + result: Dict[str, Any] = {"vendor": "deepgram", "params": params} + if self.options.skip_patterns is not None: + result["skip_patterns"] = self.options.skip_patterns + return result + + class HumeAITTSOptions(BaseModel): model_config = ConfigDict(extra="forbid") diff --git a/src/agora_agent/agents/types/start_agents_request_properties.py b/src/agora_agent/agents/types/start_agents_request_properties.py index e6bb1d1..06c3482 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties.py +++ b/src/agora_agent/agents/types/start_agents_request_properties.py @@ -11,6 +11,7 @@ from 
.start_agents_request_properties_avatar import StartAgentsRequestPropertiesAvatar from .start_agents_request_properties_filler_words import StartAgentsRequestPropertiesFillerWords from .start_agents_request_properties_geofence import StartAgentsRequestPropertiesGeofence +from .start_agents_request_properties_interruption import StartAgentsRequestPropertiesInterruption from .start_agents_request_properties_llm import StartAgentsRequestPropertiesLlm from .start_agents_request_properties_mllm import StartAgentsRequestPropertiesMllm from .start_agents_request_properties_parameters import StartAgentsRequestPropertiesParameters @@ -36,7 +37,7 @@ class StartAgentsRequestProperties(UncheckedBaseModel): agent_rtc_uid: str = pydantic.Field() """ - The user ID of the agent in the channel. A value of `0` means that a random UID is generated and assigned. Set the `token` accordingly. + The user ID of the agent in the channel. All UIDs within an RTC channel must be unique. Ensure no other user or service bot is using this UID. A value of `0` means that a unique random UID is generated and assigned. Set the `token` accordingly. """ remote_rtc_uids: typing.List[str] = pydantic.Field() @@ -93,7 +94,12 @@ class StartAgentsRequestProperties(UncheckedBaseModel): turn_detection: typing.Optional[StartAgentsRequestPropertiesTurnDetection] = pydantic.Field(default=None) """ - Conversation turn detection settings. Controls the logic for voice activity detection and conversation turn determination. + Conversation turn detection settings. Controls the logic for voice activity detection and conversation turn determination. This object has no effect when `mllm.enable` is true; use `mllm.turn_detection` instead. + """ + + interruption: typing.Optional[StartAgentsRequestPropertiesInterruption] = pydantic.Field(default=None) + """ + Interruption control configuration. Provides unified management of the agent's behavior when interrupted by the user. 
""" sal: typing.Optional[StartAgentsRequestPropertiesSal] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_advanced_features.py b/src/agora_agent/agents/types/start_agents_request_properties_advanced_features.py index 55480a4..78250d7 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_advanced_features.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_advanced_features.py @@ -14,7 +14,7 @@ class StartAgentsRequestPropertiesAdvancedFeatures(UncheckedBaseModel): enable_mllm: typing.Optional[bool] = pydantic.Field(default=None) """ - Enable Multimodal Large Language Model for voice-to-voice processing. Enabling MLLM automatically disables ASR, LLM, and TTS since the MLLM handles end-to-end voice processing directly. See `turn_detection.type` for turn detection options available with MLLM. + Use `mllm.enable` instead. Enable Multimodal Large Language Model for voice-to-voice processing. Enabling MLLM automatically disables ASR, LLM, and TTS since the MLLM handles end-to-end voice processing directly. See `turn_detection.mode` for turn detection options available with MLLM. """ enable_rtm: typing.Optional[bool] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_interruption.py b/src/agora_agent/agents/types/start_agents_request_properties_interruption.py new file mode 100644 index 0000000..4807e56 --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_interruption.py @@ -0,0 +1,57 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ...core.pydantic_utilities import IS_PYDANTIC_V2 +from ...core.unchecked_base_model import UncheckedBaseModel +from .start_agents_request_properties_interruption_disabled_config import ( + StartAgentsRequestPropertiesInterruptionDisabledConfig, +) +from .start_agents_request_properties_interruption_keywords_config import ( + StartAgentsRequestPropertiesInterruptionKeywordsConfig, +) +from .start_agents_request_properties_interruption_mode import StartAgentsRequestPropertiesInterruptionMode + + +class StartAgentsRequestPropertiesInterruption(UncheckedBaseModel): + """ + Interruption control configuration. Provides unified management of the agent's behavior when interrupted by the user. + """ + + enable: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to enable agent interruption: + - `true`: Enable interruption. + - `false`: Disable interruption. When disabled, the agent cannot be interrupted mid-response. + """ + + mode: typing.Optional[StartAgentsRequestPropertiesInterruptionMode] = pydantic.Field(default=None) + """ + The interruption trigger mode: + - `start_of_speech`: Trigger interruption when the user starts speaking. + - `keywords`: Trigger interruption when the user speaks a specified keyword. Configure the trigger keywords in `keywords_config`. + """ + + keywords_config: typing.Optional[StartAgentsRequestPropertiesInterruptionKeywordsConfig] = pydantic.Field( + default=None + ) + """ + Configuration for keyword-based interruption triggering. Applicable only when `mode` is `keywords`. + """ + + disabled_config: typing.Optional[StartAgentsRequestPropertiesInterruptionDisabledConfig] = pydantic.Field( + default=None + ) + """ + Configuration for agent behavior when interruption is disabled. Applicable only when `interruption.enable` is `false`. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_interruption_disabled_config.py b/src/agora_agent/agents/types/start_agents_request_properties_interruption_disabled_config.py new file mode 100644 index 0000000..fb476dd --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_interruption_disabled_config.py @@ -0,0 +1,34 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ...core.pydantic_utilities import IS_PYDANTIC_V2 +from ...core.unchecked_base_model import UncheckedBaseModel +from .start_agents_request_properties_interruption_disabled_config_strategy import ( + StartAgentsRequestPropertiesInterruptionDisabledConfigStrategy, +) + + +class StartAgentsRequestPropertiesInterruptionDisabledConfig(UncheckedBaseModel): + """ + Configuration for agent behavior when interruption is disabled. Applicable only when `interruption.enable` is `false`. + """ + + strategy: typing.Optional[StartAgentsRequestPropertiesInterruptionDisabledConfigStrategy] = pydantic.Field( + default=None + ) + """ + The processing strategy when interruption is disabled: + - `append`: User speech does not interrupt the agent. The agent processes the user's input after the current interaction ends. + - `ignore`: The agent ignores user speech. If the agent receives user speech while speaking or thinking, it discards the input without storing it in context. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_interruption_disabled_config_strategy.py b/src/agora_agent/agents/types/start_agents_request_properties_interruption_disabled_config_strategy.py new file mode 100644 index 0000000..dd4195c --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_interruption_disabled_config_strategy.py @@ -0,0 +1,7 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +StartAgentsRequestPropertiesInterruptionDisabledConfigStrategy = typing.Union[ + typing.Literal["append", "ignore"], typing.Any +] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_interruption_keywords_config.py b/src/agora_agent/agents/types/start_agents_request_properties_interruption_keywords_config.py new file mode 100644 index 0000000..b4c89fc --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_interruption_keywords_config.py @@ -0,0 +1,27 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ...core.pydantic_utilities import IS_PYDANTIC_V2 +from ...core.unchecked_base_model import UncheckedBaseModel + + +class StartAgentsRequestPropertiesInterruptionKeywordsConfig(UncheckedBaseModel): + """ + Configuration for keyword-based interruption triggering. Applicable only when `mode` is `keywords`. + """ + + trigger_keywords: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + The list of keywords that trigger an interruption. A maximum of 128 keywords is supported. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_interruption_mode.py b/src/agora_agent/agents/types/start_agents_request_properties_interruption_mode.py new file mode 100644 index 0000000..7f19451 --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_interruption_mode.py @@ -0,0 +1,5 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +StartAgentsRequestPropertiesInterruptionMode = typing.Union[typing.Literal["start_of_speech", "keywords"], typing.Any] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm.py b/src/agora_agent/agents/types/start_agents_request_properties_llm.py index 41ced33..20c391e 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_llm.py @@ -100,6 +100,11 @@ class StartAgentsRequestPropertiesLlm(UncheckedBaseModel): MCP (Model Context Protocol) server configuration. By configuring MCP servers, agents can call tools provided by external services to implement advanced functionality. """ + headers: typing.Optional[typing.Dict[str, str]] = pydantic.Field(default=None) + """ + Custom headers to include in requests to the LLM. Use this field to pass business-specific information such as custom fields or tenant identifiers. These headers are merged with the headers generated by the Conversational AI Engine. If a key conflict occurs, the engine-generated header takes precedence. 
+ """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py index 46355b7..a8594ee 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_llm_greeting_configs.py @@ -22,6 +22,11 @@ class StartAgentsRequestPropertiesLlmGreetingConfigs(UncheckedBaseModel): - `single_first`: Broadcasts a greeting only once to the first user who joins the channel. """ + delay_ms: typing.Optional[int] = pydantic.Field(default=None) + """ + The delay in milliseconds before the agent plays the greeting message after a user joins the channel. + """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py index d0693f6..e84422c 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_mllm.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm.py @@ -5,6 +5,7 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from .start_agents_request_properties_mllm_turn_detection import StartAgentsRequestPropertiesMllmTurnDetection from .start_agents_request_properties_mllm_vendor import StartAgentsRequestPropertiesMllmVendor @@ -13,6 +14,11 @@ class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): Multimodal Large Language Model (MLLM) configuration for real-time audio and text processing. 
`mllm` is an exclusive alternative to the standard `asr` + `llm` + `tts` pipeline. """ + enable: typing.Optional[bool] = pydantic.Field(default=None) + """ + Enable Multimodal Large Language Model for voice-to-voice processing. Enabling MLLM automatically disables ASR, LLM, and TTS since the MLLM handles end-to-end voice processing directly. Replaces the deprecated `advanced_features.enable_mllm`. + """ + url: typing.Optional[str] = pydantic.Field(default=None) """ The MLLM WebSocket URL for real-time communication. @@ -30,7 +36,7 @@ class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) """ - Additional MLLM configuration parameters. The `modalities` setting is overridden by `input_modalities` and `output_modalities`. The `turn_detection` setting is overridden by the `turn_detection` section outside of `mllm`. + Additional MLLM configuration parameters. The `modalities` setting is overridden by `input_modalities` and `output_modalities`. The `turn_detection` setting is overridden by `mllm.turn_detection`. """ input_modalities: typing.Optional[typing.List[str]] = pydantic.Field(default=None) @@ -51,6 +57,21 @@ class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): Agent greeting message. If provided, the first user in the channel is automatically greeted with this message upon joining. """ + failure_message: typing.Optional[str] = pydantic.Field(default=None) + """ + Message played when the MLLM call fails. + """ + + max_history: typing.Optional[int] = pydantic.Field(default=None) + """ + Maximum number of conversation history messages cached for the MLLM session. + """ + + predefined_tools: typing.Optional[typing.List[str]] = pydantic.Field(default=None) + """ + Predefined tools available to the MLLM provider. + """ + vendor: typing.Optional[StartAgentsRequestPropertiesMllmVendor] = pydantic.Field(default=None) """ MLLM provider. 
Currently supports: @@ -59,10 +80,9 @@ class StartAgentsRequestPropertiesMllm(UncheckedBaseModel): - `vertexai`: Google Gemini Live (Vertex AI) """ - style: typing.Optional[typing.Literal["openai"]] = pydantic.Field(default=None) + turn_detection: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetection] = pydantic.Field(default=None) """ - The request style for MLLM completion: - - `openai`: For OpenAI Realtime API format + Turn detection configuration for the MLLM module. When defined, the top-level `turn_detection` object has no effect. """ if IS_PYDANTIC_V2: diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py new file mode 100644 index 0000000..9298a0c --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection.py @@ -0,0 +1,61 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ...core.pydantic_utilities import IS_PYDANTIC_V2 +from ...core.unchecked_base_model import UncheckedBaseModel +from .start_agents_request_properties_mllm_turn_detection_agora_vad_config import ( + StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig, +) +from .start_agents_request_properties_mllm_turn_detection_mode import StartAgentsRequestPropertiesMllmTurnDetectionMode +from .start_agents_request_properties_mllm_turn_detection_semantic_vad_config import ( + StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig, +) +from .start_agents_request_properties_mllm_turn_detection_server_vad_config import ( + StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig, +) + + +class StartAgentsRequestPropertiesMllmTurnDetection(UncheckedBaseModel): + """ + Turn detection configuration for the MLLM module. When defined, the top-level `turn_detection` object has no effect. 
+ """ + + mode: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionMode] = pydantic.Field(default=None) + """ + Turn detection mode for MLLM: + - `agora_vad`: Agora VAD-based detection. + - `server_vad`: Vendor-side VAD-based detection. Supported by OpenAI Realtime API and Gemini Live. + - `semantic_vad`: Semantic-based detection. Supported by OpenAI Realtime API only. + """ + + agora_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig] = pydantic.Field( + default=None + ) + """ + Configuration for Agora VAD-based turn detection. Applicable when `mode` is `agora_vad`. + """ + + server_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig] = pydantic.Field( + default=None + ) + """ + Configuration for vendor-side VAD-based turn detection. Applicable when `mode` is `server_vad`. Parameters are passed through to the vendor. + """ + + semantic_vad_config: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig] = ( + pydantic.Field(default=None) + ) + """ + Configuration for semantic-based turn detection. Applicable when `mode` is `semantic_vad`. Supported by OpenAI Realtime API only. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py new file mode 100644 index 0000000..ec30215 --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_agora_vad_config.py @@ -0,0 +1,42 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ...core.pydantic_utilities import IS_PYDANTIC_V2 +from ...core.unchecked_base_model import UncheckedBaseModel + + +class StartAgentsRequestPropertiesMllmTurnDetectionAgoraVadConfig(UncheckedBaseModel): + """ + Configuration for Agora VAD-based turn detection. Applicable when `mode` is `agora_vad`. + """ + + interrupt_duration_ms: typing.Optional[int] = pydantic.Field(default=None) + """ + Minimum duration of speech in milliseconds required to trigger an interruption. + """ + + prefix_padding_ms: typing.Optional[int] = pydantic.Field(default=None) + """ + Duration of audio in milliseconds to include before the detected speech start. + """ + + silence_duration_ms: typing.Optional[int] = pydantic.Field(default=None) + """ + Duration of silence in milliseconds required to determine end of speech. + """ + + threshold: typing.Optional[float] = pydantic.Field(default=None) + """ + VAD sensitivity threshold. A higher value reduces false positives. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py new file mode 100644 index 0000000..0d004e8 --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_mode.py @@ -0,0 +1,7 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +StartAgentsRequestPropertiesMllmTurnDetectionMode = typing.Union[ + typing.Literal["agora_vad", "server_vad", "semantic_vad"], typing.Any +] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py new file mode 100644 index 0000000..1e310f0 --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ...core.pydantic_utilities import IS_PYDANTIC_V2 +from ...core.unchecked_base_model import UncheckedBaseModel +from .start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness import ( + StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness, +) + + +class StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfig(UncheckedBaseModel): + """ + Configuration for semantic-based turn detection. Applicable when `mode` is `semantic_vad`. Supported by OpenAI Realtime API only. + """ + + eagerness: typing.Optional[StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness] = ( + pydantic.Field(default=None) + ) + """ + Controls how eagerly the model ends its turn. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py new file mode 100644 index 0000000..8b67b1d --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_semantic_vad_config_eagerness.py @@ -0,0 +1,7 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +StartAgentsRequestPropertiesMllmTurnDetectionSemanticVadConfigEagerness = typing.Union[ + typing.Literal["auto", "low", "medium", "high"], typing.Any +] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py new file mode 100644 index 0000000..d27b76e --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config.py @@ -0,0 +1,62 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +import pydantic +from ...core.pydantic_utilities import IS_PYDANTIC_V2 +from ...core.unchecked_base_model import UncheckedBaseModel +from .start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity import ( + StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity, +) +from .start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity import ( + StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity, +) + + +class StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfig(UncheckedBaseModel): + """ + Configuration for vendor-side VAD-based turn detection. Applicable when `mode` is `server_vad`. Parameters are passed through to the vendor. + """ + + prefix_padding_ms: typing.Optional[int] = pydantic.Field(default=None) + """ + Duration of audio in milliseconds to include before the detected speech start. + """ + + silence_duration_ms: typing.Optional[int] = pydantic.Field(default=None) + """ + Duration of silence in milliseconds required to determine end of speech. + """ + + threshold: typing.Optional[float] = pydantic.Field(default=None) + """ + VAD sensitivity threshold. Applicable to OpenAI Realtime API only. + """ + + idle_timeout_ms: typing.Optional[int] = pydantic.Field(default=None) + """ + Idle timeout in milliseconds. Applicable to OpenAI Realtime API only. + """ + + start_of_speech_sensitivity: typing.Optional[ + StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity + ] = pydantic.Field(default=None) + """ + Sensitivity for start of speech detection. Applicable to Gemini Live only. + """ + + end_of_speech_sensitivity: typing.Optional[ + StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity + ] = pydantic.Field(default=None) + """ + Sensitivity for end of speech detection. Applicable to Gemini Live only. 
+ """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py new file mode 100644 index 0000000..e92d3f1 --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_end_of_speech_sensitivity.py @@ -0,0 +1,7 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigEndOfSpeechSensitivity = typing.Union[ + typing.Literal["END_SENSITIVITY_HIGH", "END_SENSITIVITY_LOW"], typing.Any +] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py new file mode 100644 index 0000000..25860c1 --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_mllm_turn_detection_server_vad_config_start_of_speech_sensitivity.py @@ -0,0 +1,7 @@ +# This file was auto-generated by Fern from our API Definition. 
+ +import typing + +StartAgentsRequestPropertiesMllmTurnDetectionServerVadConfigStartOfSpeechSensitivity = typing.Union[ + typing.Literal["START_SENSITIVITY_HIGH", "START_SENSITIVITY_LOW"], typing.Any +] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_parameters.py b/src/agora_agent/agents/types/start_agents_request_properties_parameters.py index aee1492..9106f2c 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_parameters.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_parameters.py @@ -5,6 +5,9 @@ import pydantic from ...core.pydantic_utilities import IS_PYDANTIC_V2 from ...core.unchecked_base_model import UncheckedBaseModel +from .start_agents_request_properties_parameters_audio_scenario import ( + StartAgentsRequestPropertiesParametersAudioScenario, +) from .start_agents_request_properties_parameters_data_channel import StartAgentsRequestPropertiesParametersDataChannel from .start_agents_request_properties_parameters_farewell_config import ( StartAgentsRequestPropertiesParametersFarewellConfig, @@ -48,6 +51,14 @@ class StartAgentsRequestPropertiesParameters(UncheckedBaseModel): Whether to receive agent error events. This setting only takes effect when `advanced_features.enable_rtm` is `true`. """ + audio_scenario: typing.Optional[StartAgentsRequestPropertiesParametersAudioScenario] = pydantic.Field(default=None) + """ + The audio scenario for the RTC channel. + - `default`: Maps to `aiserver`. + - `chorus`: Real-time chorus scenario, where users have good network conditions and require ultra-low latency. + - `aiserver`: Optimized for interactions between the user and the conversational AI agent in terms of latency and network resilience. 
+ """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/agents/types/start_agents_request_properties_parameters_audio_scenario.py b/src/agora_agent/agents/types/start_agents_request_properties_parameters_audio_scenario.py new file mode 100644 index 0000000..8e14e90 --- /dev/null +++ b/src/agora_agent/agents/types/start_agents_request_properties_parameters_audio_scenario.py @@ -0,0 +1,7 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +StartAgentsRequestPropertiesParametersAudioScenario = typing.Union[ + typing.Literal["default", "chorus", "aiserver"], typing.Any +] diff --git a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py index a768af1..40dbb02 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection.py @@ -15,7 +15,7 @@ class StartAgentsRequestPropertiesTurnDetection(UncheckedBaseModel): """ - Conversation turn detection settings. Controls the logic for voice activity detection and conversation turn determination. + Conversation turn detection settings. Controls the logic for voice activity detection and conversation turn determination. This object has no effect when `mllm.enable` is true; use `mllm.turn_detection` instead. 
""" mode: typing.Optional[typing.Literal["default"]] = pydantic.Field(default=None) diff --git a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config.py b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config.py index 3c45e22..71e7722 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection_config_end_of_speech_semantic_config.py @@ -18,6 +18,13 @@ class StartAgentsRequestPropertiesTurnDetectionConfigEndOfSpeechSemanticConfig(U Maximum wait time in milliseconds. Use `-1` for no timeout. The maximum time to wait for semantic determination. After timeout, the conversation end is determined based on the current state. """ + pause_state_enabled: typing.Optional[bool] = pydantic.Field(default=None) + """ + Whether to detect user intent to pause the conversation: + - `true`: The agent uses semantic understanding to determine if the user intends to pause the conversation. For example, when the user's input ends with phrases such as "hold on" or "just a moment", the agent waits for further input rather than treating the utterance as complete and sending it to the LLM. + - `false`: The agent does not detect intent to pause the conversation. 
+ """ + if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 else: diff --git a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection_config_start_of_speech.py b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection_config_start_of_speech.py index 6d72018..cfd7a8a 100644 --- a/src/agora_agent/agents/types/start_agents_request_properties_turn_detection_config_start_of_speech.py +++ b/src/agora_agent/agents/types/start_agents_request_properties_turn_detection_config_start_of_speech.py @@ -28,8 +28,8 @@ class StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeech(UncheckedBase """ Start of speech detection mode: - `vad`: Based on VAD (Voice Activity Detection). Uses audio signal detection. - - `keywords`: (Beta) Based on keyword trigger. Conversation begins when the agent detects a specified keyword. - - `disabled`: Disables start of speech detection. Does not actively trigger new conversation turns. + - `keywords`: Deprecated. Use `interruption.mode = "keywords"` instead. + - `disabled`: Deprecated. Use `interruption.enable = false` with `interruption.disabled_config.strategy` to configure the handling strategy. 
""" vad_config: typing.Optional[StartAgentsRequestPropertiesTurnDetectionConfigStartOfSpeechVadConfig] = pydantic.Field( diff --git a/src/agora_agent/client.py b/src/agora_agent/client.py index 3f2290c..8a981af 100644 --- a/src/agora_agent/client.py +++ b/src/agora_agent/client.py @@ -9,6 +9,7 @@ from .environment import AgoraEnvironment if typing.TYPE_CHECKING: + from .agent_management.client import AgentManagementClient, AsyncAgentManagementClient from .agents.client import AgentsClient, AsyncAgentsClient from .phone_numbers.client import AsyncPhoneNumbersClient, PhoneNumbersClient from .telephony.client import AsyncTelephonyClient, TelephonyClient @@ -88,6 +89,7 @@ def __init__( timeout=_defaulted_timeout, ) self._agents: typing.Optional[AgentsClient] = None + self._agent_management: typing.Optional[AgentManagementClient] = None self._telephony: typing.Optional[TelephonyClient] = None self._phone_numbers: typing.Optional[PhoneNumbersClient] = None @@ -99,6 +101,14 @@ def agents(self): self._agents = AgentsClient(client_wrapper=self._client_wrapper) return self._agents + @property + def agent_management(self): + if self._agent_management is None: + from .agent_management.client import AgentManagementClient # noqa: E402 + + self._agent_management = AgentManagementClient(client_wrapper=self._client_wrapper) + return self._agent_management + @property def telephony(self): if self._telephony is None: @@ -190,6 +200,7 @@ def __init__( timeout=_defaulted_timeout, ) self._agents: typing.Optional[AsyncAgentsClient] = None + self._agent_management: typing.Optional[AsyncAgentManagementClient] = None self._telephony: typing.Optional[AsyncTelephonyClient] = None self._phone_numbers: typing.Optional[AsyncPhoneNumbersClient] = None @@ -201,6 +212,14 @@ def agents(self): self._agents = AsyncAgentsClient(client_wrapper=self._client_wrapper) return self._agents + @property + def agent_management(self): + if self._agent_management is None: + from .agent_management.client import 
AsyncAgentManagementClient # noqa: E402 + + self._agent_management = AsyncAgentManagementClient(client_wrapper=self._client_wrapper) + return self._agent_management + @property def telephony(self): if self._telephony is None: diff --git a/src/agora_agent/core/client_wrapper.py b/src/agora_agent/core/client_wrapper.py index e632923..c5a0e03 100644 --- a/src/agora_agent/core/client_wrapper.py +++ b/src/agora_agent/core/client_wrapper.py @@ -26,10 +26,10 @@ def __init__( def get_headers(self) -> typing.Dict[str, str]: headers: typing.Dict[str, str] = { - "User-Agent": "agora-agent-server-sdk/1.3.0", + "User-Agent": "agora-agent-server-sdk/v1.4.0", "X-Fern-Language": "Python", "X-Fern-SDK-Name": "agora-agent-server-sdk", - "X-Fern-SDK-Version": "1.3.0", + "X-Fern-SDK-Version": "v1.4.0", **(self.get_custom_headers() or {}), } headers["Authorization"] = httpx.BasicAuth(self._get_username(), self._get_password())._auth_header diff --git a/src/agora_agent/core/pydantic_utilities.py b/src/agora_agent/core/pydantic_utilities.py index e586456..185e5c4 100644 --- a/src/agora_agent/core/pydantic_utilities.py +++ b/src/agora_agent/core/pydantic_utilities.py @@ -2,43 +2,22 @@ # nopycln: file import datetime as dt -import types -import typing from collections import defaultdict from typing import Any, Callable, ClassVar, Dict, List, Mapping, Optional, Set, Tuple, Type, TypeVar, Union, cast import pydantic -import typing_extensions -from .datetime_utils import serialize_datetime -from .serialization import convert_and_respect_annotation_metadata -from typing_extensions import TypeAlias IS_PYDANTIC_V2 = pydantic.VERSION.startswith("2.") if IS_PYDANTIC_V2: - UnionType: Any = getattr(types, "UnionType", None) - ModelField = Any - encoders_by_type = { - dt.date: str, - dt.datetime: serialize_datetime, - } - get_args = typing_extensions.get_args - get_origin = typing_extensions.get_origin - - _DATE_ADAPTER = pydantic.TypeAdapter(dt.date) # type: ignore[attr-defined] - _DATETIME_ADAPTER 
= pydantic.TypeAdapter(dt.datetime) # type: ignore[attr-defined] - - def parse_date(value: Any) -> dt.date: - return _DATE_ADAPTER.validate_python(value) - - def parse_datetime(value: Any) -> dt.datetime: - return _DATETIME_ADAPTER.validate_python(value) - - def is_literal_type(type_: Any) -> bool: - return get_origin(type_) in (typing.Literal, typing_extensions.Literal) - - def is_union(type_: Any) -> bool: - return get_origin(type_) in (Union, UnionType) + from pydantic.v1.datetime_parse import parse_date as parse_date + from pydantic.v1.datetime_parse import parse_datetime as parse_datetime + from pydantic.v1.fields import ModelField as ModelField + from pydantic.v1.json import ENCODERS_BY_TYPE as encoders_by_type # type: ignore[attr-defined] + from pydantic.v1.typing import get_args as get_args + from pydantic.v1.typing import get_origin as get_origin + from pydantic.v1.typing import is_literal_type as is_literal_type + from pydantic.v1.typing import is_union as is_union else: from pydantic.datetime_parse import parse_date as parse_date # type: ignore[no-redef] from pydantic.datetime_parse import parse_datetime as parse_datetime # type: ignore[no-redef] @@ -49,6 +28,10 @@ def is_union(type_: Any) -> bool: from pydantic.typing import is_literal_type as is_literal_type # type: ignore[no-redef] from pydantic.typing import is_union as is_union # type: ignore[no-redef] +from .datetime_utils import serialize_datetime +from .serialization import convert_and_respect_annotation_metadata +from typing_extensions import TypeAlias + T = TypeVar("T") Model = TypeVar("Model", bound=pydantic.BaseModel) @@ -210,15 +193,12 @@ class V2RootModel(UniversalBaseModel, pydantic.RootModel): # type: ignore[misc, def encode_by_type(o: Any) -> Any: - encoders_by_class_tuples: Dict[Callable[[Any], Any], Tuple[Type[Any], ...]] = {} + encoders_by_class_tuples: Dict[Callable[[Any], Any], Tuple[Any, ...]] = defaultdict(tuple) for type_, encoder in encoders_by_type.items(): - typed_encoder = 
cast(Callable[[Any], Any], encoder) - typed_type = cast(Type[Any], type_) - encoders_by_class_tuples[typed_encoder] = encoders_by_class_tuples.get(typed_encoder, ()) + (typed_type,) + encoders_by_class_tuples[encoder] += (type_,) if type(o) in encoders_by_type: - encoder = cast(Callable[[Any], Any], encoders_by_type[type(o)]) - return encoder(o) + return encoders_by_type[type(o)](o) for encoder, classes_tuple in encoders_by_class_tuples.items(): if isinstance(o, classes_tuple): return encoder(o) diff --git a/src/agora_agent/types/cartesia_tts_params.py b/src/agora_agent/types/cartesia_tts_params.py index ed3aa8f..2aaf069 100644 --- a/src/agora_agent/types/cartesia_tts_params.py +++ b/src/agora_agent/types/cartesia_tts_params.py @@ -5,6 +5,7 @@ import pydantic from ..core.pydantic_utilities import IS_PYDANTIC_V2 from ..core.unchecked_base_model import UncheckedBaseModel +from .cartesia_tts_voice import CartesiaTtsVoice class CartesiaTtsParams(UncheckedBaseModel): @@ -12,16 +13,12 @@ class CartesiaTtsParams(UncheckedBaseModel): Cartesia TTS configuration parameters. """ - key: str = pydantic.Field() + api_key: str = pydantic.Field() """ Cartesia API key """ - voice_id: str = pydantic.Field() - """ - Cartesia voice ID - """ - + voice: CartesiaTtsVoice model_id: typing.Optional[str] = pydantic.Field(default=None) """ Model ID (optional) diff --git a/src/agora_agent/types/cartesia_tts_voice.py b/src/agora_agent/types/cartesia_tts_voice.py new file mode 100644 index 0000000..f49ee45 --- /dev/null +++ b/src/agora_agent/types/cartesia_tts_voice.py @@ -0,0 +1,32 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class CartesiaTtsVoice(UncheckedBaseModel): + """ + Cartesia voice selection. + """ + + mode: typing.Literal["id"] = pydantic.Field(default="id") + """ + Cartesia voice selection mode. 
+ """ + + id: str = pydantic.Field() + """ + Cartesia voice ID + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/deepgram_tts.py b/src/agora_agent/types/deepgram_tts.py new file mode 100644 index 0000000..ce38d9e --- /dev/null +++ b/src/agora_agent/types/deepgram_tts.py @@ -0,0 +1,29 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel +from .deepgram_tts_params import DeepgramTtsParams + + +class DeepgramTts(UncheckedBaseModel): + """ + Deepgram Text-to-Speech configuration (Beta). + """ + + params: DeepgramTtsParams + skip_patterns: typing.Optional[typing.List[int]] = pydantic.Field(default=None) + """ + Controls whether the TTS module skips bracketed content when reading LLM response text. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/deepgram_tts_params.py b/src/agora_agent/types/deepgram_tts_params.py new file mode 100644 index 0000000..e858291 --- /dev/null +++ b/src/agora_agent/types/deepgram_tts_params.py @@ -0,0 +1,52 @@ +# This file was auto-generated by Fern from our API Definition. + +import typing + +import pydantic +from ..core.pydantic_utilities import IS_PYDANTIC_V2 +from ..core.unchecked_base_model import UncheckedBaseModel + + +class DeepgramTtsParams(UncheckedBaseModel): + """ + Deepgram TTS configuration parameters. 
+ """ + + api_key: str = pydantic.Field() + """ + Deepgram API key + """ + + model: str = pydantic.Field() + """ + Deepgram TTS model (for example, "aura-2-thalia-en") + """ + + base_url: typing.Optional[str] = pydantic.Field(default=None) + """ + Deepgram WebSocket endpoint override + """ + + sample_rate: typing.Optional[int] = pydantic.Field(default=None) + """ + Audio sampling rate in Hz + """ + + params: typing.Optional[typing.Dict[str, typing.Any]] = pydantic.Field(default=None) + """ + Additional Deepgram TTS parameters + """ + + skip_patterns: typing.Optional[typing.List[int]] = pydantic.Field(default=None) + """ + Controls whether the TTS module skips bracketed content when reading LLM response text. + """ + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow diff --git a/src/agora_agent/types/open_ai_tts_params.py b/src/agora_agent/types/open_ai_tts_params.py index 458292d..67a6e1d 100644 --- a/src/agora_agent/types/open_ai_tts_params.py +++ b/src/agora_agent/types/open_ai_tts_params.py @@ -12,7 +12,7 @@ class OpenAiTtsParams(UncheckedBaseModel): OpenAI TTS configuration parameters. 
""" - key: str = pydantic.Field() + api_key: typing.Optional[str] = pydantic.Field(default=None) """ OpenAI API key """ diff --git a/src/agora_agent/types/tts.py b/src/agora_agent/types/tts.py index 56a36fd..85761fd 100644 --- a/src/agora_agent/types/tts.py +++ b/src/agora_agent/types/tts.py @@ -10,6 +10,7 @@ from ..core.unchecked_base_model import UncheckedBaseModel, UnionMetadata from .amazon_tts_params import AmazonTtsParams from .cartesia_tts_params import CartesiaTtsParams +from .deepgram_tts_params import DeepgramTtsParams from .eleven_labs_tts_params import ElevenLabsTtsParams from .fish_audio_tts_params import FishAudioTtsParams from .google_tts_params import GoogleTtsParams @@ -202,6 +203,21 @@ class Config: extra = pydantic.Extra.allow +class Tts_Deepgram(UncheckedBaseModel): + vendor: typing.Literal["deepgram"] = "deepgram" + params: DeepgramTtsParams + skip_patterns: typing.Optional[typing.List[int]] = None + + if IS_PYDANTIC_V2: + model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 + else: + + class Config: + frozen = True + smart_union = True + extra = pydantic.Extra.allow + + Tts = typing_extensions.Annotated[ typing.Union[ Tts_Microsoft, @@ -216,6 +232,7 @@ class Config: Tts_Google, Tts_Amazon, Tts_Sarvam, + Tts_Deepgram, ], UnionMetadata(discriminant="vendor"), ] diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index f452943..0000000 --- a/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# This file makes the test suite an explicit package for mypy module resolution. diff --git a/tests/agentkit/__init__.py b/tests/agentkit/__init__.py deleted file mode 100644 index 394ea77..0000000 --- a/tests/agentkit/__init__.py +++ /dev/null @@ -1 +0,0 @@ -# This file makes the AgentKit test suite an explicit package for mypy module resolution. 
diff --git a/tests/agentkit/helpers.py b/tests/agentkit/helpers.py deleted file mode 100644 index 3936836..0000000 --- a/tests/agentkit/helpers.py +++ /dev/null @@ -1,112 +0,0 @@ -from __future__ import annotations - -from types import SimpleNamespace -from typing import Any, Dict, List, Optional - - -def dump_model(value: Any) -> Any: - if hasattr(value, "model_dump"): - return value.model_dump(exclude_none=True) - if isinstance(value, dict): - return {k: dump_model(v) for k, v in value.items()} - if isinstance(value, list): - return [dump_model(v) for v in value] - return value - - -class DummyAgents: - def __init__(self) -> None: - self.start_calls: List[Any] = [] - self.stop_calls: List[Any] = [] - self.speak_calls: List[Any] = [] - self.interrupt_calls: List[Any] = [] - self.update_calls: List[Any] = [] - self.history_calls: List[Any] = [] - self.turn_calls: List[Any] = [] - self.get_calls: List[Any] = [] - - self.start_result: Any = SimpleNamespace(agent_id="agent-1") - self.start_error: Optional[Exception] = None - self.stop_error: Optional[Exception] = None - - def start(self, app_id, **kwargs): - self.start_calls.append((app_id, kwargs)) - if self.start_error is not None: - raise self.start_error - return self.start_result - - def stop(self, app_id, agent_id, request_options=None): - self.stop_calls.append((app_id, agent_id, request_options)) - if self.stop_error is not None: - raise self.stop_error - return None - - def speak(self, app_id, agent_id, request_options=None, **kwargs): - self.speak_calls.append((app_id, agent_id, request_options, kwargs)) - return None - - def interrupt(self, app_id, agent_id, request_options=None): - self.interrupt_calls.append((app_id, agent_id, request_options)) - return None - - def update(self, app_id, agent_id, properties=None, request_options=None): - self.update_calls.append((app_id, agent_id, properties, request_options)) - return None - - def get_history(self, app_id, agent_id, request_options=None): - 
self.history_calls.append((app_id, agent_id, request_options)) - return {"contents": []} - - def get_turns(self, app_id, agent_id, request_options=None): - self.turn_calls.append((app_id, agent_id, request_options)) - return {"turns": [{"agent_id": agent_id}]} - - def get(self, app_id, agent_id, request_options=None): - self.get_calls.append((app_id, agent_id, request_options)) - return {"agent_id": agent_id} - - -class DummyAsyncAgents(DummyAgents): - async def start(self, app_id, **kwargs): - return super().start(app_id, **kwargs) - - async def stop(self, app_id, agent_id, request_options=None): - return super().stop(app_id, agent_id, request_options) - - async def speak(self, app_id, agent_id, request_options=None, **kwargs): - return super().speak(app_id, agent_id, request_options, **kwargs) - - async def interrupt(self, app_id, agent_id, request_options=None): - return super().interrupt(app_id, agent_id, request_options) - - async def update(self, app_id, agent_id, properties=None, request_options=None): - return super().update(app_id, agent_id, properties, request_options) - - async def get_history(self, app_id, agent_id, request_options=None): - return super().get_history(app_id, agent_id, request_options) - - async def get_turns(self, app_id, agent_id, request_options=None): - return super().get_turns(app_id, agent_id, request_options) - - async def get(self, app_id, agent_id, request_options=None): - return super().get(app_id, agent_id, request_options) - - -class DummyClient: - def __init__( - self, - *, - auth_mode: str = "basic", - app_id: str = "app-id", - app_certificate: Optional[str] = "app-cert", - ) -> None: - self.app_id = app_id - self.app_certificate = app_certificate - self.auth_mode = auth_mode - self.agents = DummyAgents() - - -class DummyAsyncClient(DummyClient): - def __init__(self, **kwargs: Any) -> None: - super().__init__(**kwargs) - self.agents = DummyAsyncAgents() diff --git a/tests/agentkit/test_agent.py 
b/tests/agentkit/test_agent.py deleted file mode 100644 index 4bd0030..0000000 --- a/tests/agentkit/test_agent.py +++ /dev/null @@ -1,130 +0,0 @@ -from unittest import mock - -import pytest - -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import ( - DeepgramSTT, - ElevenLabsTTS, - OpenAI, - OpenAIRealtime, -) -from tests.agentkit.helpers import DummyClient, dump_model - - -def test_builder_methods_are_immutable_and_reflected_in_config_and_getters(): - agent = Agent(name="base", instructions="helpful") - llm = OpenAI(api_key="key", model="gpt-4o-mini") - tts = ElevenLabsTTS(key="tts", model_id="model", voice_id="voice", sample_rate=24000) - stt = DeepgramSTT(api_key="dg", model="nova-3") - - configured = agent.with_llm(llm).with_tts(tts).with_stt(stt).with_greeting("hi").with_max_history(10) - - assert agent.llm is None - assert configured.llm == llm.to_config() - assert configured.tts == tts.to_config() - assert configured.stt == stt.to_config() - assert configured.greeting == "hi" - assert configured.max_history == 10 - assert configured.config["name"] == "base" - - -def test_create_session_resolves_name_from_option_agent_or_timestamp(): - client = DummyClient() - named = Agent(name="from-agent") - explicit = named.create_session(client, channel="c", agent_uid="1", remote_uids=["2"], name="explicit") - assert explicit.agent.name == "from-agent" - assert explicit._name == "explicit" - - defaulted = named.create_session(client, channel="c", agent_uid="1", remote_uids=["2"]) - assert defaulted._name == "from-agent" - - with mock.patch("agora_agent.agentkit.agent.time.time", return_value=123456): - generated = Agent().create_session(client, channel="c", agent_uid="1", remote_uids=["2"]) - assert generated._name == "agent-123456" - - -def test_to_properties_throws_when_llm_or_tts_missing_outside_preset_or_pipeline_flow(): - with pytest.raises(ValueError, match="TTS configuration is required"): - Agent().with_llm(OpenAI(api_key="key", 
model="gpt-4o-mini")).to_properties( - channel="room", - agent_uid="1", - remote_uids=["2"], - token="token", - ) - - with pytest.raises(ValueError, match="LLM configuration is required"): - Agent().with_tts(ElevenLabsTTS(key="tts", model_id="model", voice_id="voice", sample_rate=24000)).to_properties( - channel="room", - agent_uid="1", - remote_uids=["2"], - token="token", - ) - - -def test_to_properties_applies_defaults_and_overrides_for_standard_pipeline(): - agent = ( - Agent(instructions="top-level instructions", greeting="hello", failure_message="retry", max_history=7) - .with_llm( - OpenAI( - api_key="key", - model="gpt-4o-mini", - greeting_message="vendor greeting", - failure_message="vendor failure", - ) - ) - .with_tts(ElevenLabsTTS(key="tts", model_id="model", voice_id="voice", sample_rate=24000)) - .with_stt(DeepgramSTT(api_key="dg", model="nova-3")) - ) - - props = dump_model( - agent.to_properties(channel="room", agent_uid="1", remote_uids=["2"], token="token") - ) - - assert props["llm"]["system_messages"] == [{"role": "system", "content": "top-level instructions"}] - assert props["llm"]["greeting_message"] == "vendor greeting" - assert props["llm"]["failure_message"] == "vendor failure" - assert props["llm"]["max_history"] == 7 - assert props["tts"]["vendor"] == "elevenlabs" - assert props["asr"]["vendor"] == "deepgram" - - -def test_to_properties_supports_preset_or_pipeline_backed_sessions_without_llm_or_tts(): - props = dump_model( - Agent(instructions="preset-backed").to_properties( - channel="room", - agent_uid="1", - remote_uids=["2"], - token="token", - skip_vendor_validation=True, - ) - ) - assert props["channel"] == "room" - assert "llm" not in props - assert "tts" not in props - - -def test_to_properties_generates_token_and_respects_mllm_vendor_precedence(): - agent = Agent(greeting="top hello", failure_message="top fail", max_history=9).with_mllm( - OpenAIRealtime( - api_key="key", - url="wss://openai.example.com/realtime", - 
greeting_message="vendor hello", - ) - ).with_advanced_features({"enable_mllm": True}) - - props = dump_model( - agent.to_properties( - channel="room", - agent_uid="1", - remote_uids=["2"], - app_id="app-id", - app_certificate="app-cert", - ) - ) - - assert props["mllm"]["greeting_message"] == "vendor hello" - assert props["mllm"]["failure_message"] == "top fail" - assert props["mllm"]["max_history"] == 9 - assert props["mllm"]["url"] == "wss://openai.example.com/realtime" - assert isinstance(props["token"], str) and props["token"] diff --git a/tests/agentkit/test_agent_session.py b/tests/agentkit/test_agent_session.py deleted file mode 100644 index f91722a..0000000 --- a/tests/agentkit/test_agent_session.py +++ /dev/null @@ -1,171 +0,0 @@ -import asyncio -from unittest import mock - -import pytest - -from agora_agent.agentkit import Agent -from agora_agent.agentkit.vendors import AkoolAvatar, DeepgramSTT, ElevenLabsTTS, OpenAI -from agora_agent.core.api_error import ApiError -from tests.agentkit.helpers import DummyAsyncClient, DummyClient, dump_model - - -def build_standard_agent(): - return ( - Agent(name="assistant", instructions="be helpful") - .with_stt(DeepgramSTT(api_key="dg", model="nova-3")) - .with_llm(OpenAI(api_key="key", model="gpt-4o-mini")) - .with_tts(ElevenLabsTTS(key="tts", model_id="model", voice_id="voice", sample_rate=24000)) - ) - - -def test_start_accepts_preset_arrays_and_normalizes_them(): - client = DummyClient() - session = Agent(name="assistant").create_session( - client, - channel="room", - agent_uid="1", - remote_uids=["2"], - preset=["deepgram_nova_3", "openai_gpt_5_mini", "openai_tts_1"], - ) - session.start() - _, kwargs = client.agents.start_calls[0] - assert kwargs["preset"] == "deepgram_nova_3,openai_gpt_5_mini,openai_tts_1" - - -def test_session_methods_enforce_state_and_id_guards(): - session = build_standard_agent().create_session(DummyClient(), channel="room", agent_uid="1", remote_uids=["2"]) - - for method_name in ["stop", 
"interrupt", "update", "say"]: - with pytest.raises(RuntimeError): - if method_name == "say": - getattr(session, method_name)("hello") - elif method_name == "update": - getattr(session, method_name)({}) - else: - getattr(session, method_name)() - - with pytest.raises(RuntimeError): - session.get_history() - with pytest.raises(RuntimeError): - session.get_info() - with pytest.raises(RuntimeError): - session.get_turns() - - -def test_app_credentials_mode_adds_auth_headers_and_exposes_getters_and_raw_client(): - client = DummyClient(auth_mode="app-credentials") - session = build_standard_agent().create_session(client, channel="room", agent_uid="1", remote_uids=["2"]) - - headers = session._convo_ai_headers() - assert headers is not None - assert headers["Authorization"].startswith("agora token=") - assert session.app_id == "app-id" - assert session.agent.name == "assistant" - assert session.raw is client.agents - - -def test_event_handlers_can_be_added_removed_and_warning_path_exercised(): - client = DummyClient() - session = build_standard_agent().create_session(client, channel="room", agent_uid="1", remote_uids=["2"]) - received = [] - - def started(payload): - received.append(payload) - - def failing(_payload): - raise RuntimeError("boom") - - session.on("started", started) - session.on("started", failing) - - with pytest.warns(UserWarning): - session.start() - - assert received == [{"agent_id": "agent-1"}] - session.off("started", started) - session._emit("started", {"agent_id": "agent-2"}) - assert received == [{"agent_id": "agent-1"}] - - -def test_running_session_methods_call_underlying_client_helpers(): - client = DummyClient() - session = build_standard_agent().create_session(client, channel="room", agent_uid="1", remote_uids=["2"]) - session.start() - session.say("hello", priority="APPEND", interruptable=True) - session.interrupt() - session.update({"greeting_message": "updated"}) - assert session.get_history() == {"contents": []} - assert session.get_info() 
== {"agent_id": "agent-1"} - assert session.get_turns() == {"turns": [{"agent_id": "agent-1"}]} - session.stop() - - assert client.agents.speak_calls - assert client.agents.interrupt_calls - assert client.agents.update_calls - assert client.agents.history_calls - assert client.agents.get_calls - assert client.agents.turn_calls - assert client.agents.stop_calls - - -def test_start_sets_status_to_error_and_emits_error_event_on_failure(): - client = DummyClient() - client.agents.start_error = RuntimeError("start failed") - session = build_standard_agent().create_session(client, channel="room", agent_uid="1", remote_uids=["2"]) - errors = [] - session.on("error", errors.append) - - with pytest.raises(RuntimeError, match="start failed"): - session.start() - - assert session.status == "error" - assert len(errors) == 1 - - -def test_stop_swallows_404_and_non_404_moves_to_error(): - client = DummyClient() - session = build_standard_agent().create_session(client, channel="room", agent_uid="1", remote_uids=["2"]) - session.start() - client.agents.stop_error = ApiError(status_code=404) - session.stop() - assert session.status == "stopped" - - client2 = DummyClient() - session2 = build_standard_agent().create_session(client2, channel="room", agent_uid="1", remote_uids=["2"]) - session2.start() - client2.agents.stop_error = ApiError(status_code=500) - with pytest.raises(ApiError): - session2.stop() - assert session2.status == "error" - - -def test_avatar_validation_warning_branch_and_async_session_methods(): - agent = ( - Agent(name="avatar") - .with_llm(OpenAI(api_key="key", model="gpt-4o-mini")) - .with_tts(ElevenLabsTTS(key="tts", model_id="model", voice_id="voice")) - .with_avatar(AkoolAvatar(api_key="akool", avatar_id="avatar-1")) - ) - client = DummyClient() - session = agent.create_session(client, channel="room", agent_uid="1", remote_uids=["2"]) - with pytest.warns(UserWarning): - session._validate_avatar_config() - - async def run_async_case(): - async_client = 
DummyAsyncClient() - async_session = build_standard_agent().create_async_session( - async_client, - channel="room", - agent_uid="1", - remote_uids=["2"], - ) - await async_session.start() - await async_session.say("hello") - await async_session.interrupt() - await async_session.update({"greeting_message": "updated"}) - assert await async_session.get_history() == {"contents": []} - assert await async_session.get_info() == {"agent_id": "agent-1"} - assert await async_session.get_turns() == {"turns": [{"agent_id": "agent-1"}]} - await async_session.stop() - - asyncio.run(run_async_case()) diff --git a/tests/agentkit/test_agentkit_parity.py b/tests/agentkit/test_agentkit_parity.py deleted file mode 100644 index a486449..0000000 --- a/tests/agentkit/test_agentkit_parity.py +++ /dev/null @@ -1,187 +0,0 @@ -from typing import Any, Dict, List, Tuple -from types import SimpleNamespace -import unittest - -from agora_agent.agentkit import ( - Agent, - AnamAvatar, - GeminiLive, - LiveAvatarAvatar, - OpenAI, - OpenAITTS, - validate_avatar_config, - validate_tts_sample_rate, -) -from agora_agent.agentkit.vendors import DeepgramSTT - - -class DummyAgents: - def __init__(self) -> None: - self.start_calls: List[Tuple[Any, Dict[str, Any]]] = [] - self.turn_calls: List[Tuple[Any, Any, Any]] = [] - - def start(self, app_id, **kwargs): - self.start_calls.append((app_id, kwargs)) - return SimpleNamespace(agent_id="agent-1") - - def get_turns(self, app_id, agent_id, request_options=None): - self.turn_calls.append((app_id, agent_id, request_options)) - return {"turns": [{"agent_id": agent_id}]} - - def stop(self, *args, **kwargs): - return None - - def speak(self, *args, **kwargs): - return None - - def interrupt(self, *args, **kwargs): - return None - - def update(self, *args, **kwargs): - return None - - def get_history(self, *args, **kwargs): - return {"contents": []} - - def get(self, *args, **kwargs): - return {"agent_id": "agent-1"} - - -class DummyClient: - def __init__(self) -> 
None: - self.app_id = "app-id" - self.app_certificate = "app-cert" - self.auth_mode = "basic" - self.agents = DummyAgents() - - -def dump_properties(properties): - if hasattr(properties, "model_dump"): - return properties.model_dump(exclude_none=True) - return properties - - -class AgentKitParityTests(unittest.TestCase): - def test_start_supports_preset_and_pipeline_without_explicit_llm_or_tts(self): - client = DummyClient() - agent = Agent(name="preset-agent", instructions="Use preset defaults.") - - session = agent.create_session( - client, - channel="room-1", - agent_uid="1", - remote_uids=["2"], - preset="deepgram_nova_3,openai_gpt_4o_mini,openai_tts_1", - pipeline_id="pipeline_123", - ) - - agent_id = session.start() - - self.assertEqual(agent_id, "agent-1") - _, kwargs = client.agents.start_calls[0] - self.assertEqual(kwargs["preset"], "deepgram_nova_3,openai_gpt_4o_mini,openai_tts_1") - self.assertEqual(kwargs["pipeline_id"], "pipeline_123") - dumped = dump_properties(kwargs["properties"]) - self.assertEqual(dumped["channel"], "room-1") - self.assertEqual(dumped["agent_rtc_uid"], "1") - self.assertNotIn("llm", dumped) - self.assertNotIn("tts", dumped) - - def test_start_infers_reseller_presets_and_strips_credential_fields(self): - client = DummyClient() - agent = ( - Agent(name="auto-preset", instructions="Use reseller defaults.") - .with_stt(DeepgramSTT(model="nova-3")) - .with_llm(OpenAI(model="gpt-5-mini")) - .with_tts(OpenAITTS(voice="alloy")) - ) - - session = agent.create_session( - client, - channel="room-2", - agent_uid="1", - remote_uids=["2"], - ) - - session.start() - - _, kwargs = client.agents.start_calls[0] - self.assertEqual(kwargs["preset"], "deepgram_nova_3,openai_gpt_5_mini,openai_tts_1") - dumped = dump_properties(kwargs["properties"]) - self.assertFalse(dumped["asr"].get("params")) - self.assertEqual( - dumped["llm"]["system_messages"], - [{"role": "system", "content": "Use reseller defaults."}], - ) - 
self.assertEqual(dumped["llm"]["input_modalities"], ["text"]) - self.assertFalse(dumped["llm"].get("api_key")) - self.assertEqual(dumped["tts"].get("params"), {"voice": "alloy"}) - - def test_session_get_turns_proxies_to_agents_client(self): - client = DummyClient() - session = Agent(name="assistant").create_session( - client, - channel="room-3", - agent_uid="1", - remote_uids=["2"], - preset="deepgram_nova_3,openai_gpt_4o_mini,openai_tts_1", - ) - session.start() - - turns = session.get_turns() - - self.assertEqual(turns, {"turns": [{"agent_id": "agent-1"}]}) - self.assertEqual(client.agents.turn_calls, [("app-id", "agent-1", None)]) - - def test_gemini_live_matches_low_level_shape(self): - config = GeminiLive( - api_key="google-key", - model="gemini-live-2.5-flash", - url="wss://generativelanguage.googleapis.com/ws", - instructions="You are concise.", - voice="Aoede", - greeting_message="Hello", - predefined_tools=["_publish_message"], - failure_message="Please try again.", - max_history=8, - additional_params={"temperature": 0.2}, - messages=[{"role": "user", "content": "Hi"}], - ).to_config() - - self.assertEqual( - config, - { - "vendor": "gemini", - "style": "openai", - "api_key": "google-key", - "url": "wss://generativelanguage.googleapis.com/ws", - "params": { - "temperature": 0.2, - "model": "gemini-live-2.5-flash", - "instructions": "You are concise.", - "voice": "Aoede", - }, - "messages": [{"role": "user", "content": "Hi"}], - "greeting_message": "Hello", - "predefined_tools": ["_publish_message"], - "failure_message": "Please try again.", - "max_history": 8, - }, - ) - - def test_liveavatar_and_anam_avatar_support_matches_typescript(self): - liveavatar = LiveAvatarAvatar(api_key="live-key", quality="high", agora_uid="42").to_config() - validate_avatar_config(liveavatar) - with self.assertRaisesRegex(ValueError, "LiveAvatar"): - validate_tts_sample_rate(liveavatar, 16000) - - anam = AnamAvatar(api_key="anam-key", persona_id="persona-1").to_config() - 
validate_avatar_config(anam) - agent = Agent().with_tts(OpenAITTS(api_key="openai-key", voice="alloy")).with_avatar( - AnamAvatar(api_key="anam-key", persona_id="persona-1") - ) - self.assertEqual(agent.avatar, anam) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/agentkit/test_constants.py b/tests/agentkit/test_constants.py deleted file mode 100644 index 10a12b6..0000000 --- a/tests/agentkit/test_constants.py +++ /dev/null @@ -1,26 +0,0 @@ -from agora_agent.agentkit import ( - DataChannel, - FillerWordsSelectionRule, - GeofenceArea, - GeofenceExcludeArea, - SalModeValues, - SilenceActionValues, - TurnDetectionTypeValues, -) - - -def test_constants_match_expected_values(): - assert DataChannel.RTM == "rtm" - assert DataChannel.DATASTREAM == "datastream" - assert SilenceActionValues.SPEAK == "speak" - assert SilenceActionValues.THINK == "think" - assert SalModeValues.LOCKING == "locking" - assert SalModeValues.RECOGNITION == "recognition" - assert GeofenceArea.GLOBAL == "GLOBAL" - assert GeofenceArea.NORTH_AMERICA == "NORTH_AMERICA" - assert GeofenceExcludeArea.JAPAN == "JAPAN" - assert FillerWordsSelectionRule.SHUFFLE == "shuffle" - assert FillerWordsSelectionRule.ROUND_ROBIN == "round_robin" - assert TurnDetectionTypeValues.AGORA_VAD == "agora_vad" - assert TurnDetectionTypeValues.SERVER_VAD == "server_vad" - assert TurnDetectionTypeValues.SEMANTIC_VAD == "semantic_vad" diff --git a/tests/agentkit/test_presets.py b/tests/agentkit/test_presets.py deleted file mode 100644 index 51f38eb..0000000 --- a/tests/agentkit/test_presets.py +++ /dev/null @@ -1,86 +0,0 @@ -from agora_agent.agentkit import AgentPresets, normalize_preset_input -from agora_agent.agentkit.presets import resolve_session_presets - - -def test_preset_values_match_expected_strings(): - assert AgentPresets.asr.deepgram_nova_2 == "deepgram_nova_2" - assert AgentPresets.asr.deepgram_nova_3 == "deepgram_nova_3" - assert AgentPresets.llm.openai_gpt_4o_mini == "openai_gpt_4o_mini" - 
assert AgentPresets.llm.openai_gpt_4_1_mini == "openai_gpt_4_1_mini" - assert AgentPresets.llm.openai_gpt_5_nano == "openai_gpt_5_nano" - assert AgentPresets.llm.openai_gpt_5_mini == "openai_gpt_5_mini" - assert AgentPresets.tts.minimax_speech_2_6_turbo == "minimax_speech_2_6_turbo" - assert AgentPresets.tts.minimax_speech_2_8_turbo == "minimax_speech_2_8_turbo" - assert AgentPresets.tts.openai_tts_1 == "openai_tts_1" - - -def test_normalize_preset_input_variants(): - assert normalize_preset_input(None) is None - assert normalize_preset_input("deepgram_nova_3") == "deepgram_nova_3" - assert ( - normalize_preset_input(["deepgram_nova_3", "openai_gpt_5_mini"]) - == "deepgram_nova_3,openai_gpt_5_mini" - ) - assert ( - normalize_preset_input(" deepgram_nova_3, , openai_gpt_5_mini ") - == "deepgram_nova_3,openai_gpt_5_mini" - ) - - -def test_resolve_session_presets_returns_none_when_nothing_inferrable(): - preset, properties = resolve_session_presets(None, {"llm": {"vendor": "custom"}}) - assert preset is None - assert properties["llm"] == {"vendor": "custom"} - assert properties["asr"] is None - assert properties["tts"] is None - - -def test_resolve_session_presets_inferrs_and_strips_fields(): - preset, properties = resolve_session_presets( - None, - { - "asr": {"vendor": "deepgram", "params": {"model": "nova-3"}}, - "llm": { - "vendor": "openai", - "url": "https://api.openai.com/v1/chat/completions", - "params": {"model": "gpt-5-mini"}, - }, - "tts": {"vendor": "openai", "params": {"model": "tts-1", "voice": "alloy"}}, - }, - ) - assert preset == "deepgram_nova_3,openai_gpt_5_mini,openai_tts_1" - assert properties["asr"] == {"vendor": "deepgram"} - assert properties["llm"] == {"vendor": "openai"} - assert properties["tts"] == {"vendor": "openai", "params": {"voice": "alloy"}} - - -def test_resolve_session_presets_minimax_and_explicit_precedence(): - preset, properties = resolve_session_presets( - "deepgram_nova_2", - { - "asr": {"vendor": "deepgram", "params": 
{"model": "nova-3"}}, - "tts": {"vendor": "minimax", "params": {"model": "speech-2.8-turbo"}}, - }, - ) - assert preset == "deepgram_nova_2,minimax_speech_2_8_turbo" - assert properties["asr"]["params"] == {"model": "nova-3"} - assert properties["tts"] == {"vendor": "minimax"} - - -def test_resolve_session_presets_skips_inference_when_credentials_or_nonstandard_values_present(): - assert resolve_session_presets( - None, {"asr": {"vendor": "deepgram", "params": {"model": "nova-3", "api_key": "key"}}} - )[0] is None - assert resolve_session_presets( - None, - { - "llm": { - "vendor": "openai", - "url": "https://example.com/chat/completions", - "params": {"model": "gpt-5-mini"}, - } - }, - )[0] is None - assert resolve_session_presets( - None, {"tts": {"vendor": "minimax", "params": {"model": "speech-2.8-turbo", "key": "secret"}}} - )[0] is None diff --git a/tests/agentkit/test_token.py b/tests/agentkit/test_token.py deleted file mode 100644 index cb4b37a..0000000 --- a/tests/agentkit/test_token.py +++ /dev/null @@ -1,69 +0,0 @@ -from unittest import mock - -import pytest - -from agora_agent.agentkit.token import ( - MAX_EXPIRY_SECONDS, - expires_in_hours, - expires_in_minutes, - generate_convo_ai_token, - generate_rtc_token, -) - - -def test_expires_in_helpers_validate_and_cap_values(): - with pytest.raises(ValueError): - expires_in_hours(0) - with pytest.raises(ValueError): - expires_in_minutes(-1) - - with pytest.warns(UserWarning): - assert expires_in_hours(30) == MAX_EXPIRY_SECONDS - with pytest.warns(UserWarning): - assert expires_in_minutes(60 * 30) == MAX_EXPIRY_SECONDS - - assert expires_in_hours(1.5) == 5400 - assert expires_in_minutes(2.5) == 150 - - -def test_token_generators_return_non_empty_strings(): - rtc = generate_rtc_token( - app_id="a" * 32, - app_certificate="b" * 32, - channel="demo", - uid=1, - ) - convo = generate_convo_ai_token( - app_id="a" * 32, - app_certificate="b" * 32, - channel_name="demo", - account="1", - ) - assert isinstance(rtc, 
str) and rtc - assert isinstance(convo, str) and convo - - -def test_generate_convo_ai_token_uses_builder_when_available_and_defaults_privilege_expire(): - fake_builder = mock.Mock() - fake_builder.buildTokenWithRtm.return_value = "token-123" - - with mock.patch.dict("sys.modules", {"agora_token_builder": mock.Mock(RtcTokenBuilder=fake_builder)}): - token = generate_convo_ai_token( - app_id="app-id", - app_certificate="app-cert", - channel_name="demo", - account="1", - token_expire=120, - privilege_expire=0, - ) - - assert token == "token-123" - fake_builder.buildTokenWithRtm.assert_called_once_with( - "app-id", - "app-cert", - "demo", - "1", - 1, - 120, - 0, - ) diff --git a/tests/agentkit/test_vendors.py b/tests/agentkit/test_vendors.py deleted file mode 100644 index 70a923e..0000000 --- a/tests/agentkit/test_vendors.py +++ /dev/null @@ -1,160 +0,0 @@ -import pytest - -from agora_agent.agentkit import ( - AnamAvatar, - GeminiLive, - LiveAvatarAvatar, - validate_avatar_config, - validate_tts_sample_rate, -) -from agora_agent.agentkit.vendors import ( - AmazonSTT, - AmazonTTS, - Anthropic, - AresSTT, - AssemblyAISTT, - AzureOpenAI, - CartesiaTTS, - DeepgramSTT, - ElevenLabsTTS, - FishAudioTTS, - Gemini, - GoogleSTT, - GoogleTTS, - HeyGenAvatar, - HumeAITTS, - MicrosoftSTT, - MicrosoftTTS, - MiniMaxTTS, - MurfTTS, - OpenAI, - OpenAIRealtime, - OpenAISTT, - OpenAITTS, - RimeTTS, - SarvamSTT, - SarvamTTS, - SpeechmaticsSTT, - VertexAI, -) - - -def test_llm_vendor_mappings_cover_core_shapes_and_defaults(): - assert OpenAI(api_key="key", model="gpt-4o-mini").to_config()["url"] == "https://api.openai.com/v1/chat/completions" - assert "api_key" not in OpenAI(model="gpt-5-mini").to_config() - assert "params" not in AzureOpenAI( - api_key="key", - endpoint="https://azure.example.com", - deployment_name="deploy", - ).to_config() - anthropic = Anthropic(api_key="key", model="claude", temperature=0.3, top_p=0.7).to_config() - assert anthropic["params"]["temperature"] == 0.3 - 
assert anthropic["params"]["top_p"] == 0.7 - gemini = Gemini(api_key="key", model="gemini", temperature=0.2, top_p=0.8, top_k=10).to_config() - assert gemini["style"] == "gemini" - assert gemini["params"]["top_k"] == 10 - - -def test_mllm_vendor_mappings_cover_optional_branches(): - realtime = OpenAIRealtime( - api_key="key", - url="wss://openai.example.com/realtime", - predefined_tools=["_publish_message"], - failure_message="Retry", - max_history=3, - ).to_config() - assert realtime == { - "vendor": "openai", - "style": "openai", - "api_key": "key", - "url": "wss://openai.example.com/realtime", - "predefined_tools": ["_publish_message"], - "failure_message": "Retry", - "max_history": 3, - } - - vertex = VertexAI( - model="gemini-live", - url="wss://vertex.example.com/realtime", - project_id="project", - location="us-central1", - adc_credentials_string="creds", - additional_params={"temperature": 0.2}, - predefined_tools=["_publish_message"], - failure_message="Try again", - max_history=5, - ).to_config() - assert vertex["vendor"] == "vertexai" - assert vertex["url"] == "wss://vertex.example.com/realtime" - assert vertex["params"]["temperature"] == 0.2 - assert vertex["predefined_tools"] == ["_publish_message"] - assert vertex["failure_message"] == "Try again" - assert vertex["max_history"] == 5 - - gemini_live = GeminiLive( - api_key="key", - model="gemini-live", - url="wss://gemini.example.com/realtime", - voice="Aoede", - predefined_tools=["_publish_message"], - failure_message="Please try again.", - max_history=8, - ).to_config() - assert gemini_live["vendor"] == "gemini" - assert gemini_live["url"] == "wss://gemini.example.com/realtime" - assert gemini_live["params"]["voice"] == "Aoede" - assert gemini_live["predefined_tools"] == ["_publish_message"] - assert gemini_live["failure_message"] == "Please try again." 
- assert gemini_live["max_history"] == 8 - - -def test_stt_vendor_mappings_cover_all_wrappers(): - assert SpeechmaticsSTT(api_key="key", language="en").to_config()["vendor"] == "speechmatics" - assert DeepgramSTT(api_key="key", model="nova-3", smart_format=True, punctuation=True).to_config()["params"][ - "smart_format" - ] - assert MicrosoftSTT(key="key", region="eastus").to_config()["vendor"] == "microsoft" - assert OpenAISTT(api_key="key", model="whisper-1").to_config()["vendor"] == "openai" - assert GoogleSTT(api_key="key", language="en-US").to_config()["vendor"] == "google" - assert AmazonSTT(access_key="a", secret_key="b", region="us-east-1").to_config()["vendor"] == "amazon" - assert AssemblyAISTT(api_key="key").to_config()["vendor"] == "assemblyai" - assert AresSTT(language="en").to_config()["vendor"] == "ares" - assert SarvamSTT(api_key="key", language="en").to_config()["vendor"] == "sarvam" - - -def test_tts_vendor_mappings_cover_all_wrappers_and_skip_patterns(): - assert ElevenLabsTTS(key="key", model_id="model", voice_id="voice", skip_patterns=[1]).to_config()["skip_patterns"] == [1] - assert MicrosoftTTS(key="key", region="eastus", voice_name="voice").to_config()["vendor"] == "microsoft" - assert OpenAITTS(voice="alloy").to_config()["params"] == {"voice": "alloy"} - assert CartesiaTTS(api_key="key", voice_id="voice").to_config()["params"]["voice"]["id"] == "voice" - assert GoogleTTS(key="key", voice_name="voice").to_config()["vendor"] == "google" - assert AmazonTTS(access_key="a", secret_key="b", region="us-east-1", voice_id="voice").to_config()["vendor"] == "amazon" - assert HumeAITTS(key="key").to_config()["vendor"] == "humeai" - assert RimeTTS(key="key", speaker="speaker").to_config()["vendor"] == "rime" - assert FishAudioTTS(key="key", reference_id="ref").to_config()["vendor"] == "fishaudio" - assert MiniMaxTTS(model="speech-2.8-turbo").to_config()["params"] == {"model": "speech-2.8-turbo"} - assert SarvamTTS(key="key", speaker="speaker", 
target_language_code="en-IN").to_config()["vendor"] == "sarvam" - assert MurfTTS(key="key", voice_id="voice").to_config()["vendor"] == "murf" - - -def test_avatar_vendor_mappings_and_validators_cover_failure_branches(): - with pytest.raises(ValueError, match="quality"): - HeyGenAvatar(api_key="key", quality="bad", agora_uid="1") - - liveavatar = LiveAvatarAvatar( - api_key="key", - quality="high", - agora_uid="1", - avatar_id="avatar", - disable_idle_timeout=True, - activity_idle_timeout=30, - ).to_config() - assert liveavatar["vendor"] == "liveavatar" - validate_avatar_config(liveavatar) - - anam = AnamAvatar(api_key="key", persona_id="persona").to_config() - assert anam["vendor"] == "anam" - validate_avatar_config(anam) - - with pytest.raises(ValueError, match="HeyGen"): - validate_tts_sample_rate(HeyGenAvatar(api_key="key", quality="high", agora_uid="1").to_config(), 16000) diff --git a/tests/custom/test_client.py b/tests/custom/test_client.py index ab04ce6..77fae36 100644 --- a/tests/custom/test_client.py +++ b/tests/custom/test_client.py @@ -1,7 +1,265 @@ -import pytest +from agora_agent.agentkit.agent import Agent, AdvancedFeatures, InterruptionConfig, MllmTurnDetectionConfig, TurnDetectionConfig +from agora_agent.agentkit.constants import TurnDetectionTypeValues +import asyncio +import warnings +from agora_agent.agentkit.agent_session import AgentSession, AsyncAgentSession +from agora_agent.agentkit.vendors import DeepgramTTS, HeyGenAvatar, MicrosoftTTS, OpenAI, OpenAIRealtime +from agora_agent.agentkit import AgentThinkResponse +from typing import Any, Dict, List, Tuple -# Get started with writing tests with pytest at https://docs.pytest.org -@pytest.mark.skip(reason="Unimplemented") -def test_client() -> None: - assert True +class _AgentManagementStub: + def __init__(self) -> None: + self.calls: List[Tuple[str, str, Dict[str, Any]]] = [] + + def agent_think(self, appid, agent_id, **kwargs): # noqa: ANN001 + self.calls.append((appid, agent_id, kwargs)) + 
return AgentThinkResponse(agent_id=agent_id, channel="room", start_ts=1) + + +class _ClientStub: + auth_mode = "basic" + + def __init__(self) -> None: + self.agents = object() + self.agent_management = _AgentManagementStub() + + +class _AsyncAgentManagementStub: + def __init__(self) -> None: + self.calls: List[Tuple[str, str, Dict[str, Any]]] = [] + + async def agent_think(self, appid, agent_id, **kwargs): # noqa: ANN001 + self.calls.append((appid, agent_id, kwargs)) + return AgentThinkResponse(agent_id=agent_id, channel="room", start_ts=1) + + +class _AsyncClientStub: + auth_mode = "basic" + + def __init__(self) -> None: + self.agents = object() + self.agent_management = _AsyncAgentManagementStub() + + +def test_agentkit_think_routes_to_agent_management() -> None: + client = _ClientStub() + session = AgentSession( + client=client, + agent=Agent(), + app_id="appid", + name="agent", + channel="room", + token="token", + agent_uid="1", + remote_uids=["2"], + ) + session._status = "running" + session._agent_id = "agent-1" + + response = session.think("Injected instruction", on_thinking_action="interrupt") + assert response.agent_id == "agent-1" + assert len(client.agent_management.calls) == 1 + appid, agent_id, kwargs = client.agent_management.calls[0] + assert appid == "appid" + assert agent_id == "agent-1" + assert kwargs["text"] == "Injected instruction" + assert kwargs["on_thinking_action"] == "interrupt" + + +def test_async_agentkit_think_routes_to_agent_management() -> None: + async def _run() -> None: + client = _AsyncClientStub() + session = AsyncAgentSession( + client=client, + agent=Agent(), + app_id="appid", + name="agent", + channel="room", + token="token", + agent_uid="1", + remote_uids=["2"], + ) + session._status = "running" + session._agent_id = "agent-1" + + response = await session.think("Injected instruction", on_thinking_action="interrupt") + assert response.agent_id == "agent-1" + assert len(client.agent_management.calls) == 1 + appid, agent_id, 
kwargs = client.agent_management.calls[0] + assert appid == "appid" + assert agent_id == "agent-1" + assert kwargs["text"] == "Injected instruction" + assert kwargs["on_thinking_action"] == "interrupt" + + asyncio.run(_run()) + + +def test_llm_vendor_headers_are_forwarded_to_properties() -> None: + agent = Agent().with_llm( + OpenAI( + api_key="openai-key", + model="gpt-4o-mini", + headers={"X-Trace-Id": "trace-123"}, + output_modalities=["text", "audio"], + greeting_configs={"mode": "single_first"}, + template_variables={"caller_name": "Ada"}, + ) + ).with_tts(MicrosoftTTS(key="tts-key", region="eastus", voice_name="en-US-JennyNeural")) + + props = agent.to_properties( + channel="room", + token="rtc-token", + agent_uid="1", + remote_uids=["2"], + ) + + assert props.llm is not None + assert props.llm.headers == {"X-Trace-Id": "trace-123"} + assert props.llm.output_modalities == ["text", "audio"] + assert props.llm.greeting_configs is not None + assert props.llm.greeting_configs.mode == "single_first" + assert props.llm.template_variables == {"caller_name": "Ada"} + + +def test_with_turn_detection_forwards_config() -> None: + turn_detection = TurnDetectionConfig( + type=TurnDetectionTypeValues.AGORA_VAD, + threshold=0.5, + ) + + props = Agent().with_turn_detection(turn_detection).to_properties( + channel="room", + token="rtc-token", + agent_uid="1", + remote_uids=["2"], + skip_vendor_validation=True, + ) + + assert props.turn_detection == turn_detection + + +def test_with_interruption_forwards_config() -> None: + interruption = InterruptionConfig( + enable=False, + disabled_config={"strategy": "ignore"}, + ) + + props = Agent().with_interruption(interruption).to_properties( + channel="room", + token="rtc-token", + agent_uid="1", + remote_uids=["2"], + skip_vendor_validation=True, + ) + + assert props.interruption == interruption + + +def test_mllm_turn_detection_is_forwarded_without_legacy_style() -> None: + mllm_turn_detection = MllmTurnDetectionConfig( + 
mode="server_vad", + server_vad_config={"idle_timeout_ms": 5000}, + ) + props = Agent().with_mllm( + OpenAIRealtime(api_key="openai-key", turn_detection=mllm_turn_detection) + ).to_properties( + channel="room", + token="rtc-token", + agent_uid="1", + remote_uids=["2"], + ) + + assert props.mllm is not None + assert props.mllm.vendor == "openai" + assert "style" not in props.mllm.dict() + assert props.mllm.turn_detection == mllm_turn_detection + + +def test_with_mllm_sets_mllm_enable_without_legacy_flag() -> None: + agent = Agent().with_mllm(OpenAIRealtime(api_key="openai-key")) + + props = agent.to_properties( + channel="room", + token="rtc-token", + agent_uid="1", + remote_uids=["2"], + ) + + assert props.mllm is not None + assert props.mllm.enable is True + assert props.advanced_features is None + + +def test_with_mllm_removes_deprecated_enable_mllm_from_existing_advanced_features() -> None: + agent = Agent( + advanced_features=AdvancedFeatures(enable_mllm=True, enable_rtm=True) + ).with_mllm(OpenAIRealtime(api_key="openai-key")) + + props = agent.to_properties( + channel="room", + token="rtc-token", + agent_uid="1", + remote_uids=["2"], + ) + + assert props.mllm is not None + assert props.mllm.enable is True + assert props.advanced_features is not None + assert props.advanced_features.enable_mllm is None + assert props.advanced_features.enable_rtm is True + + +def test_with_mllm_drops_advanced_features_when_only_deprecated_enable_mllm_was_set() -> None: + props = Agent( + advanced_features=AdvancedFeatures(enable_mllm=True) + ).with_mllm(OpenAIRealtime(api_key="openai-key")).to_properties( + channel="room", + token="rtc-token", + agent_uid="1", + remote_uids=["2"], + ) + + assert props.mllm is not None + assert props.mllm.enable is True + assert props.advanced_features is None + + +def test_with_tools_sets_enable_tools() -> None: + props = Agent().with_tools().to_properties( + channel="room", + token="rtc-token", + agent_uid="1", + remote_uids=["2"], + 
skip_vendor_validation=True, + ) + + assert props.advanced_features is not None + assert props.advanced_features.enable_tools is True + + +def test_heygen_avatar_emits_deprecation_warning() -> None: + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + HeyGenAvatar(api_key="heygen-key", quality="high", agora_uid="42") + + assert any("HeyGenAvatar is deprecated" in str(warning.message) for warning in caught) + + +def test_deepgram_tts_vendor_config() -> None: + tts = DeepgramTTS( + api_key="deepgram-key", + model="aura-2-thalia-en", + base_url="wss://api.deepgram.com/v1/speak", + sample_rate=24000, + params={"encoding": "linear16"}, + ).to_config() + + assert tts["vendor"] == "deepgram" + assert tts["params"] == { + "api_key": "deepgram-key", + "model": "aura-2-thalia-en", + "base_url": "wss://api.deepgram.com/v1/speak", + "sample_rate": 24000, + "encoding": "linear16", + } diff --git a/tests/custom/test_presets.py b/tests/custom/test_presets.py new file mode 100644 index 0000000..c05c477 --- /dev/null +++ b/tests/custom/test_presets.py @@ -0,0 +1,135 @@ +from agora_agent.agentkit.presets import resolve_session_presets + + +def test_minimax_preset_strips_group_id_and_url_when_no_key() -> None: + """When no key is provided, preset is inferred and credential fields are stripped.""" + properties = { + "tts": { + "vendor": "minimax", + "params": { + "group_id": "my-group", + "model": "speech-2.6-turbo", + "url": "wss://api-uw.minimax.io/ws/v1/t2a_v2", + "voice_setting": {"voice_id": "English_captivating_female1"}, + }, + } + } + preset, resolved = resolve_session_presets(None, properties) + assert preset == "minimax_speech_2_6_turbo" + params = resolved["tts"]["params"] + assert "group_id" not in params + assert "url" not in params + assert "model" not in params + assert params["voice_setting"]["voice_id"] == "English_captivating_female1" + + +def test_minimax_preset_strips_group_id_and_url_for_28_turbo() -> None: + properties = { + 
"tts": { + "vendor": "minimax", + "params": { + "group_id": "org-123", + "model": "speech-2.8-turbo", + "url": "wss://api.minimax.io/ws/v1/t2a_v2", + "voice_setting": {"voice_id": "some-voice"}, + }, + } + } + preset, resolved = resolve_session_presets(None, properties) + assert preset == "minimax_speech_2_8_turbo" + params = resolved["tts"]["params"] + assert "group_id" not in params + assert "url" not in params + assert "model" not in params + + +def test_minimax_preset_strips_group_id_and_url_with_underscore_model_name() -> None: + properties = { + "tts": { + "vendor": "minimax", + "params": { + "group_id": "my-group", + "model": "speech_2_6_turbo", + "url": "wss://api-uw.minimax.io/ws/v1/t2a_v2", + }, + } + } + preset, resolved = resolve_session_presets(None, properties) + assert preset == "minimax_speech_2_6_turbo" + params = resolved["tts"].get("params") or {} + assert "group_id" not in params + assert "url" not in params + assert "model" not in params + + +def test_minimax_preset_not_inferred_when_key_present() -> None: + """When user provides their own key, preset is NOT inferred and nothing is stripped.""" + properties = { + "tts": { + "vendor": "minimax", + "params": { + "key": "user-secret", + "group_id": "my-group", + "model": "speech-2.6-turbo", + }, + } + } + preset, resolved = resolve_session_presets(None, properties) + assert preset is None + params = resolved["tts"]["params"] + assert params.get("key") == "user-secret" + assert params.get("group_id") == "my-group" + + +def test_minimax_preset_not_inferred_when_explicit_preset_given() -> None: + """When an explicit tts preset is provided, tts inference is skipped.""" + properties = { + "tts": { + "vendor": "minimax", + "params": { + "group_id": "my-group", + "model": "speech-2.6-turbo", + }, + } + } + preset, resolved = resolve_session_presets("minimax_speech_2_6_turbo", properties) + assert preset == "minimax_speech_2_6_turbo" + # Explicit preset: tts inference is skipped, params are NOT stripped + 
params = resolved["tts"]["params"] + assert params.get("group_id") == "my-group" + + +def test_deepgram_preset_strips_model_and_api_key() -> None: + properties = { + "asr": { + "vendor": "deepgram", + "params": { + "model": "nova-3", + "language": "en-US", + }, + } + } + preset, resolved = resolve_session_presets(None, properties) + assert preset == "deepgram_nova_3" + params = resolved["asr"]["params"] + assert "model" not in params + assert "api_key" not in params + assert params.get("language") == "en-US" + + +def test_openai_llm_preset_strips_model_api_key_and_default_url() -> None: + properties = { + "llm": { + "vendor": "openai", + "url": "https://api.openai.com/v1/chat/completions", + "params": { + "model": "gpt-4o-mini", + }, + } + } + preset, resolved = resolve_session_presets(None, properties) + assert preset == "openai_gpt_4o_mini" + llm = resolved["llm"] + assert "api_key" not in llm + assert "url" not in llm + assert "model" not in (llm.get("params") or {})