55import queue
66import ssl
77import threading
8- from typing import Any , Dict , List , Optional , Union
8+ from typing import Any , Dict , List , Literal , Optional , Union
99
1010import websockets
1111from asgiref .sync import sync_to_async
1212from pydantic import BaseModel , ConfigDict
1313
14+ from llmstack .apps .types .agent import AgentConfigSchema
15+ from llmstack .apps .types .voice_agent import VoiceAgentConfigSchema
1416from llmstack .common .blocks .base .schema import StrEnum
1517from llmstack .common .utils .liquid import render_template
1618from llmstack .common .utils .provider_config import get_matched_provider_config
1719from llmstack .common .utils .sslr .types .chat .chat_completion import ChatCompletion
1820from llmstack .common .utils .sslr .types .chat .chat_completion_chunk import (
1921 ChatCompletionChunk ,
2022)
21- from llmstack .processors .providers .config import ProviderConfig
2223from llmstack .processors .providers .promptly import get_llm_client_from_provider_config
2324
2425logger = logging .getLogger (__name__ )
2526
2627
class AgentControllerConfig(BaseModel):
    """Configuration handed to AgentController.

    Bundles the resolved provider configurations, the agent's own config
    schema (voice or regular flavor), the tool definitions exposed to the
    model, and per-request metadata.
    """

    provider_configs: Dict[str, Any]
    agent_config: Union[AgentConfigSchema, VoiceAgentConfigSchema]
    is_voice_agent: bool = False
    tools: List[Dict]
    metadata: Dict[str, Any]

    model_config = ConfigDict(arbitrary_types_allowed=True)

    def __init__(self, **data):
        raw_config = data.get("agent_config")
        if isinstance(raw_config, dict):
            # A plain dict is coerced into the schema matching the agent
            # flavor before pydantic sees it, so the Union resolves correctly.
            schema_cls = VoiceAgentConfigSchema if data.get("is_voice_agent", False) else AgentConfigSchema
            data["agent_config"] = schema_cls(**raw_config)

        super().__init__(**data)

        # Post-validation: the concrete schema type must agree with the flag.
        expected_cls = VoiceAgentConfigSchema if self.is_voice_agent else AgentConfigSchema
        if not isinstance(self.agent_config, expected_cls):
            raise ValueError(
                f"agent_config must be {expected_cls.__name__} when is_voice_agent is {self.is_voice_agent}"
            )
3953
4054
4155class AgentControllerDataType (StrEnum ):
@@ -54,6 +68,8 @@ class AgentUsageData(BaseModel):
5468 prompt_tokens : int = 0
5569 completion_tokens : int = 0
5670 total_tokens : int = 0
71+ provider : str = ""
72+ source : str = ""
5773
5874
5975class AgentMessageRole (StrEnum ):
@@ -116,19 +132,10 @@ class AgentController:
116132 def __init__ (self , output_queue : asyncio .Queue , config : AgentControllerConfig ):
117133 self ._output_queue = output_queue
118134 self ._config = config
119- self ._messages : List [AgentMessage ] = [
120- AgentSystemMessage (
121- role = AgentMessageRole .SYSTEM ,
122- content = [
123- AgentMessageContent (
124- type = AgentMessageContentType .TEXT ,
125- data = render_template (self ._config .system_message , {}),
126- )
127- ],
128- )
129- ]
135+ self ._messages : List [AgentMessage ] = []
130136 self ._llm_client = None
131137 self ._websocket = None
138+ self ._provider_config = None
132139
133140 self ._input_text_stream = None
134141 self ._input_audio_stream = None
@@ -154,11 +161,16 @@ async def _handle_websocket_messages(self):
154161 if event ["type" ] == "session.created" :
155162 logger .info (f"Session created: { event ['session' ]['id' ]} " )
156163 session = {}
157- session ["instructions" ] = self ._config .system_message
164+ session ["instructions" ] = self ._config .agent_config . system_message
158165 session ["tools" ] = [
159166 {"type" : "function" , ** t ["function" ]} for t in self ._config .tools if t ["type" ] == "function"
160167 ]
161168
169+ if self ._config .agent_config .input_audio_format :
170+ session ["input_audio_format" ] = self ._config .agent_config .input_audio_format
171+ if self ._config .agent_config .output_audio_format :
172+ session ["output_audio_format" ] = self ._config .agent_config .output_audio_format
173+
162174 updated_session = {
163175 "type" : "session.update" ,
164176 "session" : session ,
@@ -173,6 +185,12 @@ async def _init_websocket_connection(self):
173185 from llmstack .apps .models import AppSessionFiles
174186 from llmstack .assets .stream import AssetStream
175187
188+ self ._provider_config = get_matched_provider_config (
189+ provider_configs = self ._config .provider_configs ,
190+ provider_slug = self ._config .agent_config .backend .provider ,
191+ model_slug = self ._config .agent_config .backend .model ,
192+ )
193+
176194 # Create the output streams
177195 self ._output_audio_stream = AssetStream (
178196 await sync_to_async (AppSessionFiles .create_streaming_asset )(
@@ -191,9 +209,9 @@ async def _init_websocket_connection(self):
191209 ssl_context .check_hostname = False
192210 ssl_context .verify_mode = ssl .CERT_NONE
193211
194- websocket_url = f"wss://api.openai.com/v1/realtime?model={ self ._config .model_slug } "
212+ websocket_url = f"wss://api.openai.com/v1/realtime?model={ self ._config .agent_config . backend . model } "
195213 headers = {
196- "Authorization" : f"Bearer { self ._config . provider_config .api_key } " ,
214+ "Authorization" : f"Bearer { self ._provider_config .api_key } " ,
197215 "OpenAI-Beta" : "realtime=v1" ,
198216 }
199217
@@ -208,16 +226,34 @@ async def _init_websocket_connection(self):
208226 self ._loop .create_task (self ._handle_websocket_messages ())
209227
210228 def _init_llm_client (self ):
229+ self ._provider_config = get_matched_provider_config (
230+ provider_configs = self ._config .provider_configs ,
231+ provider_slug = self ._config .agent_config .provider ,
232+ model_slug = self ._config .agent_config .model ,
233+ )
234+
211235 self ._llm_client = get_llm_client_from_provider_config (
212- self ._config .provider_slug ,
213- self ._config .model_slug ,
236+ self ._config .agent_config . provider ,
237+ self ._config .agent_config . model ,
214238 lambda provider_slug , model_slug : get_matched_provider_config (
215239 provider_configs = self ._config .provider_configs ,
216240 provider_slug = provider_slug ,
217241 model_slug = model_slug ,
218242 ),
219243 )
220244
245+ self ._messages .append (
246+ AgentSystemMessage (
247+ role = AgentMessageRole .SYSTEM ,
248+ content = [
249+ AgentMessageContent (
250+ type = AgentMessageContentType .TEXT ,
251+ data = render_template (self ._config .agent_config .system_message , {}),
252+ )
253+ ],
254+ )
255+ )
256+
221257 async def _process_input_audio_stream (self ):
222258 if self ._input_audio_stream :
223259 async for chunk in self ._input_audio_stream .read_async ():
@@ -317,8 +353,8 @@ def process(self, data: AgentControllerData):
317353 self ._messages .append (data .data )
318354
319355 try :
320- if len (self ._messages ) > self ._config .max_steps :
321- raise Exception (f"Max steps ({ self ._config .max_steps } ) exceeded: { len (self ._messages )} " )
356+ if len (self ._messages ) > self ._config .agent_config . max_steps :
357+ raise Exception (f"Max steps ({ self ._config .agent_config . max_steps } ) exceeded: { len (self ._messages )} " )
322358
323359 if data .type != AgentControllerDataType .AGENT_OUTPUT :
324360 self ._input_messages_queue .put (data )
@@ -334,7 +370,7 @@ def process(self, data: AgentControllerData):
334370 )
335371
336372 async def process_messages (self , data : AgentControllerData ):
337- if self ._config .realtime :
373+         if self._config.is_voice_agent and self._config.agent_config.backend.backend_type == "multi_modal":
338374 if not self ._websocket :
339375 await self ._init_websocket_connection ()
340376
@@ -391,14 +427,15 @@ async def process_messages(self, data: AgentControllerData):
391427 self ._init_llm_client ()
392428
393429 client_messages = self ._convert_messages_to_llm_client_format ()
430+ stream = True if self ._config .agent_config .stream is None else self ._config .agent_config .stream
394431 response = self ._llm_client .chat .completions .create (
395- model = self ._config .model_slug ,
432+ model = self ._config .agent_config . model ,
396433 messages = client_messages ,
397- stream = self . _config . stream ,
434+ stream = stream ,
398435 tools = self ._config .tools ,
399436 )
400437
401- if self . _config . stream :
438+ if stream :
402439 for chunk in response :
403440 self .add_llm_client_response_to_output_queue (chunk )
404441 else :
@@ -419,6 +456,8 @@ def add_llm_client_response_to_output_queue(self, response: Any):
419456 prompt_tokens = response .usage .input_tokens ,
420457 completion_tokens = response .usage .output_tokens ,
421458 total_tokens = response .usage .total_tokens ,
459+ source = self ._provider_config .provider_config_source ,
460+ provider = str (self ._provider_config ),
422461 ),
423462 )
424463 )
@@ -621,6 +660,10 @@ async def add_ws_event_to_output_queue(self, event: Any):
621660 type = AgentControllerDataType .INPUT_STREAM ,
622661 )
623662 )
663+ elif event_type == "input_audio_buffer.speech_stopped" :
664+ pass
665+ elif event_type == "conversation.item.input_audio_transcription.completed" :
666+ pass
624667 elif event_type == "error" :
625668 logger .error (f"WebSocket error: { event } " )
626669
0 commit comments