4 changes: 4 additions & 0 deletions .env.example
@@ -25,3 +25,7 @@ CARTESIA_API_KEY=your_cartesia_api_key_here
 
 # Anthropic API credentials
 ANTHROPIC_API_KEY=your_anthropic_api_key_here
+
+# Baseten API credentials
+BASETEN_API_KEY=your_baseten_api_key_here
+BASETEN_BASE_URL=your_baseten_base_url_here
2 changes: 2 additions & 0 deletions agents-core/vision_agents/core/agents/agents.py
@@ -551,6 +551,8 @@ async def join(self, call: Call) -> "AgentSessionContextManager":
 
         # wait for conversation creation coro at the very end of the join flow
         self.conversation = await create_conversation_coro
+        # Provide conversation to the LLM so it can access the chat history.
+        self.llm.set_conversation(self.conversation)
         return AgentSessionContextManager(self, self._connection)
 
     async def finish(self):
19 changes: 16 additions & 3 deletions agents-core/vision_agents/core/llm/llm.py
@@ -26,7 +26,7 @@
 from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import Participant
 from getstream.video.rtc import AudioStreamTrack, PcmData
 from vision_agents.core.processors import Processor
-from vision_agents.core.utils.utils import parse_instructions
+from vision_agents.core.utils.utils import Instructions, parse_instructions
 from vision_agents.core.events.manager import EventManager
 from .function_registry import FunctionRegistry
 from .llm_types import ToolSchema, NormalizedToolCallItem
@@ -50,7 +50,6 @@ class LLM(abc.ABC):
     before_response_listener: BeforeCb
     after_response_listener: AfterCb
     agent: Optional["Agent"]
-    _conversation: Optional["Conversation"]
     function_registry: FunctionRegistry
 
     def __init__(self):
@@ -59,6 +58,9 @@ def __init__(self):
         self.events = EventManager()
         self.events.register_events_from_module(events)
         self.function_registry = FunctionRegistry()
+        self.instructions: Optional[str] = None
+        self.parsed_instructions: Optional[Instructions] = None
+        self._conversation: Optional[Conversation] = None
 
     async def warmup(self) -> None:
         """
@@ -187,9 +189,20 @@ def _attach_agent(self, agent: Agent):
         Attach agent to the llm
         """
         self.agent = agent
-        self._conversation = agent.conversation
         self._set_instructions(agent.instructions)
 
+    def set_conversation(self, conversation: Conversation):
+        """
+        Provide the Conversation object to the LLM to access the chat history.
+        To be called by the Agent after it joins the call.
+
+        Args:
+            conversation: a Conversation object
+
+        Returns:
+        """
+        self._conversation = conversation
+
     def _set_instructions(self, instructions: str):
         self.instructions = instructions
 
5 changes: 2 additions & 3 deletions plugins/anthropic/tests/test_anthropic_llm.py
@@ -18,7 +18,7 @@ class TestClaudeLLM:
     async def llm(self) -> ClaudeLLM:
         """Test ClaudeLLM initialization with a provided client."""
         llm = ClaudeLLM(model="claude-sonnet-4-20250514")
-        llm._conversation = InMemoryConversation("be friendly", [])
+        llm.set_conversation(InMemoryConversation("be friendly", []))
         return llm
 
     @pytest.mark.asyncio
@@ -58,7 +58,7 @@ async def test_native_api(self, llm: ClaudeLLM):
     @pytest.mark.integration
     async def test_stream(self, llm: ClaudeLLM):
         streamingWorks = False
-
+
         @llm.events.subscribe
         async def passed(event: LLMResponseChunkEvent):
             nonlocal streamingWorks
@@ -70,7 +70,6 @@ async def passed(event: LLMResponseChunkEvent):
 
         assert streamingWorks
 
-
     @pytest.mark.integration
     async def test_memory(self, llm: ClaudeLLM):
         await llm.simple_response(
2 changes: 1 addition & 1 deletion plugins/aws/tests/test_aws.py
@@ -35,7 +35,7 @@ def assert_response_successful(self, response):
     async def llm(self) -> BedrockLLM:
         """Test BedrockLLM initialization with a provided client."""
         llm = BedrockLLM(model="qwen.qwen3-32b-v1:0", region_name="us-east-1")
-        llm._conversation = InMemoryConversation("be friendly", [])
+        llm.set_conversation(InMemoryConversation("be friendly", []))
         return llm
 
     @pytest.mark.asyncio
117 changes: 117 additions & 0 deletions plugins/baseten/README.md
@@ -0,0 +1,117 @@
# Qwen3-VL hosted on Baseten

Qwen3-VL is the latest open-source vision-language model (VLM) from Alibaba. This plugin lets developers run the model hosted on [Baseten](https://www.baseten.co/) with Vision Agents. The model accepts text and video and responds with text, which is vocalised by the TTS service of your choice.

## Features

- **Video understanding**: Automatically buffers and forwards video frames to Baseten-hosted VLM models
- **Streaming responses**: Supports streaming text responses with real-time chunk events
- **Frame buffering**: Configurable frame rate and buffer duration for optimal performance
- **Event-driven**: Emits LLM events (chunks, completion, errors) for integration with other components

## Installation

```bash
uv add "vision-agents[baseten]"
```

## Quick Start

```python
from vision_agents.core import Agent, User
from vision_agents.plugins import baseten, getstream, deepgram, elevenlabs, vogent

async def create_agent(**kwargs) -> Agent:
# Initialize the Baseten VLM
llm = baseten.VLM(model="qwen3vl")

# Create an agent with video understanding capabilities
agent = Agent(
edge=getstream.Edge(),
agent_user=User(name="Video Assistant", id="agent"),
instructions="You're a helpful video AI assistant. Analyze the video frames and respond to user questions about what you see.",
llm=llm,
stt=deepgram.STT(),
tts=elevenlabs.TTS(),
turn_detection=vogent.TurnDetection(),
processors=[],
)
return agent

async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
await agent.create_user()
call = await agent.create_call(call_type, call_id)

with await agent.join(call):
# The agent will automatically process video frames and respond to user input
await agent.finish()
```

## Configuration

### Environment Variables

- **`BASETEN_API_KEY`**: Your Baseten API key (required)
- **`BASETEN_BASE_URL`**: The base URL for your Baseten API endpoint (required)
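
For example, in your shell (or a `.env` file, mirroring this PR's `.env.example`):

```bash
export BASETEN_API_KEY=your_baseten_api_key_here
export BASETEN_BASE_URL=your_baseten_base_url_here
```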

### Initialization Parameters

```python
baseten.VLM(
model: str, # Baseten model name (e.g., "qwen3vl")
api_key: Optional[str] = None, # API key (defaults to BASETEN_API_KEY env var)
base_url: Optional[str] = None, # Base URL (defaults to BASETEN_BASE_URL env var)
fps: int = 1, # Frames per second to process (default: 1)
frame_buffer_seconds: int = 10, # Seconds of video to buffer (default: 10)
client: Optional[AsyncOpenAI] = None, # Custom OpenAI client (optional)
)
```

### Parameters

- **`model`**: The name of the Baseten-hosted model to use. Must be a vision-capable model.
- **`api_key`**: Your Baseten API key. If not provided, reads from `BASETEN_API_KEY` environment variable.
- **`base_url`**: The base URL for Baseten API. If not provided, reads from `BASETEN_BASE_URL` environment variable.
- **`fps`**: Number of video frames per second to capture and send to the model. Lower values reduce API costs but may miss fast-moving content. Default is 1 fps.
- **`frame_buffer_seconds`**: How many seconds of video to buffer. Total buffer size = `fps * frame_buffer_seconds`. Default is 10 seconds.
- **`client`**: Optional pre-configured `AsyncOpenAI` client. If provided, `api_key` and `base_url` are ignored.
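
As a concrete illustration of this trade-off, the snippet below buffers roughly 30 frames per request (3 fps × 10 s) instead of the default ~10. The parameter names follow the signature above, but the right values depend on your cost and latency budget:

```python
from vision_agents.plugins import baseten

# Denser visual context: ~30 frames per request instead of the default ~10.
# Higher fps captures faster motion but increases payload size and cost.
llm = baseten.VLM(
    model="qwen3vl",
    fps=3,
    frame_buffer_seconds=10,
)
```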

## How It Works

1. **Video Frame Buffering**: The plugin automatically subscribes to video tracks when the agent joins a call. It buffers frames at the specified FPS for the configured duration.

2. **Frame Processing**: When responding to user input, the plugin (see the sketch after this list):
- Converts buffered video frames to JPEG format
- Resizes frames to 800x600 (maintaining aspect ratio)
- Encodes frames as base64 data URLs

3. **API Request**: Sends the conversation history (including system instructions) along with all buffered frames to the Baseten model.

4. **Streaming Response**: Processes the streaming response and emits events for each chunk and completion.
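
A minimal, hypothetical sketch of the frame-processing step (step 2), assuming PIL-style `Image` frames; the plugin's actual internals may differ:

```python
import base64
import io

from PIL import Image


def frame_to_data_url(frame: Image.Image) -> str:
    """JPEG-encode a buffered video frame as a base64 data URL (illustrative only)."""
    img = frame.copy().convert("RGB")
    img.thumbnail((800, 600))  # resizes in place, preserving aspect ratio
    buf = io.BytesIO()
    img.save(buf, format="JPEG")
    encoded = base64.b64encode(buf.getvalue()).decode("ascii")
    return f"data:image/jpeg;base64,{encoded}"
```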

## Events

The plugin emits the following events:

- **`LLMResponseChunkEvent`**: Emitted for each text chunk in the streaming response
- **`LLMResponseCompletedEvent`**: Emitted when the response stream completes
- **`LLMErrorEvent`**: Emitted if an API request fails
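
As an illustration, chunk events can be consumed with the same `@...events.subscribe` decorator used elsewhere in Vision Agents. The import path and the `delta` field name below are assumptions and may differ in your version:

```python
from vision_agents.core.llm.events import LLMResponseChunkEvent  # assumed import path

@llm.events.subscribe
async def on_chunk(event: LLMResponseChunkEvent):
    # Print streamed text as it arrives; `delta` is an assumed field name.
    print(event.delta, end="", flush=True)
```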

## Requirements

- Python 3.10+
- `openai>=2.5.0`
- `vision-agents` (core framework)
- Baseten API key and base URL

## Notes

- **Frame Rate**: The default FPS of 1 is optimized for VLM use cases. Higher FPS values will increase API costs and latency.
- **Frame Size**: Frames are automatically resized to 800x600 pixels while maintaining aspect ratio to optimize API payload size.
- **Buffer Duration**: The 10-second default buffer provides context for the model while keeping memory usage reasonable.
- **Tool Calling**: Tool/function calling support is not yet implemented (see TODOs in code).

## Troubleshooting

- **No video processing**: Ensure the agent has joined a call with video tracks available. The plugin automatically subscribes to video when tracks are added.
- **API errors**: Verify your `BASETEN_API_KEY` and `BASETEN_BASE_URL` are set correctly and the model name is valid.
- **High latency**: Consider reducing `fps` or `frame_buffer_seconds` to decrease the number of frames sent per request.
1 change: 1 addition & 0 deletions plugins/baseten/example/README.md
@@ -0,0 +1 @@
Please see the root plugin README.
21 changes: 21 additions & 0 deletions plugins/baseten/example/pyproject.toml
@@ -0,0 +1,21 @@
[project]
name = "qwen3-vl-example"
version = "0.1.0"
description = "Example using Qwen3 VL hosted on Baseten with Vision Agents"
requires-python = ">=3.10"
dependencies = [
"vision-agents",
"vision-agents-plugins-baseten",
"vision-agents-plugins-getstream",
"vision-agents-plugins-deepgram",
"vision-agents-plugins-elevenlabs",
"python-dotenv",
]

[tool.uv.sources]
vision-agents = { workspace = true }
vision-agents-plugins-baseten = { workspace = true }
vision-agents-plugins-elevenlabs = { workspace = true }
vision-agents-plugins-getstream = { workspace = true }
vision-agents-plugins-deepgram = { workspace = true }

46 changes: 46 additions & 0 deletions plugins/baseten/example/qwen_vl_example.py
@@ -0,0 +1,46 @@
import asyncio

from dotenv import load_dotenv

from vision_agents.core import Agent, User, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import baseten, getstream, deepgram, elevenlabs
from vision_agents.core.events import CallSessionParticipantJoinedEvent


load_dotenv()


async def create_agent(**kwargs) -> Agent:
# Initialize the Baseten VLM
llm = baseten.VLM(model="qwen3vl")

# Create an agent with video understanding capabilities
agent = Agent(
edge=getstream.Edge(),
agent_user=User(name="Video Assistant", id="agent"),
instructions="You're a helpful video AI assistant. Analyze the video frames and respond to user questions about what you see.",
llm=llm,
stt=deepgram.STT(),
tts=elevenlabs.TTS(),
processors=[],
)
return agent

async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
await agent.create_user()
call = await agent.create_call(call_type, call_id)

@agent.events.subscribe
async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
if event.participant.user.id != "agent":
await asyncio.sleep(2)
await agent.simple_response("Describe what you currently see")

with await agent.join(call):
await agent.edge.open_demo(call)
# The agent will automatically process video frames and respond to user input
await agent.finish()

if __name__ == "__main__":
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
Empty file added plugins/baseten/py.typed
36 changes: 36 additions & 0 deletions plugins/baseten/pyproject.toml
@@ -0,0 +1,36 @@
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[project]
name = "vision-agents-plugins-baseten"
dynamic = ["version"]
description = "Baseten plugin for vision agents"
readme = "README.md"
requires-python = ">=3.10"
license = "MIT"
dependencies = [
"vision-agents",
"openai>=2.5.0",
]

[project.urls]
Documentation = "https://visionagents.ai/"
Website = "https://visionagents.ai/"
Source = "https://github.com/GetStream/Vision-Agents"

[tool.hatch.version]
source = "vcs"
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }

[tool.hatch.build.targets.wheel]
packages = ["."]

[tool.uv.sources]
vision-agents = { workspace = true }

[dependency-groups]
dev = [
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",
]
4 changes: 4 additions & 0 deletions plugins/baseten/vision_agents/plugins/baseten/__init__.py
@@ -0,0 +1,4 @@
from .baseten_vlm import BasetenVLM as VLM


__all__ = ["VLM"]