4 changes: 4 additions & 0 deletions .env.example
@@ -25,3 +25,7 @@ CARTESIA_API_KEY=your_cartesia_api_key_here

# Anthropic API credentials
ANTHROPIC_API_KEY=your_anthropic_api_key_here

# Baseten API credentials
BASETEN_API_KEY=your_baseten_api_key_here
BASETEN_BASE_URL=your_baseten_base_url_here
2 changes: 2 additions & 0 deletions agents-core/vision_agents/core/agents/agents.py
@@ -551,6 +551,8 @@ async def join(self, call: Call) -> "AgentSessionContextManager":

# wait for conversation creation coro at the very end of the join flow
self.conversation = await create_conversation_coro
# Provide conversation to the LLM so it can access the chat history.
self.llm.set_conversation(self.conversation)
return AgentSessionContextManager(self, self._connection)

async def finish(self):
19 changes: 16 additions & 3 deletions agents-core/vision_agents/core/llm/llm.py
@@ -26,7 +26,7 @@
from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import Participant
from getstream.video.rtc import AudioStreamTrack, PcmData
from vision_agents.core.processors import Processor
from vision_agents.core.utils.utils import parse_instructions
from vision_agents.core.utils.utils import Instructions, parse_instructions
from vision_agents.core.events.manager import EventManager
from .function_registry import FunctionRegistry
from .llm_types import ToolSchema, NormalizedToolCallItem
@@ -50,7 +50,6 @@ class LLM(abc.ABC):
before_response_listener: BeforeCb
after_response_listener: AfterCb
agent: Optional["Agent"]
_conversation: Optional["Conversation"]
function_registry: FunctionRegistry

def __init__(self):
@@ -59,6 +58,9 @@ def __init__(self):
self.events = EventManager()
self.events.register_events_from_module(events)
self.function_registry = FunctionRegistry()
self.instructions: Optional[str] = None
self.parsed_instructions: Optional[Instructions] = None
self._conversation: Optional[Conversation] = None

async def warmup(self) -> None:
"""
@@ -187,9 +189,20 @@ def _attach_agent(self, agent: Agent):
Attach agent to the llm
"""
self.agent = agent
self._conversation = agent.conversation
self._set_instructions(agent.instructions)

def set_conversation(self, conversation: Conversation):
"""
Provide the Conversation object to the LLM to access the chat history.
To be called by the Agent after it joins the call.

Args:
conversation: the Conversation object whose chat history the LLM should use.
"""
self._conversation = conversation

def _set_instructions(self, instructions: str):
self.instructions = instructions

5 changes: 2 additions & 3 deletions plugins/anthropic/tests/test_anthropic_llm.py
@@ -18,7 +18,7 @@ class TestClaudeLLM:
async def llm(self) -> ClaudeLLM:
"""Test ClaudeLLM initialization with a provided client."""
llm = ClaudeLLM(model="claude-sonnet-4-20250514")
llm._conversation = InMemoryConversation("be friendly", [])
llm.set_conversation(InMemoryConversation("be friendly", []))
return llm

@pytest.mark.asyncio
@@ -58,7 +58,7 @@ async def test_native_api(self, llm: ClaudeLLM):
@pytest.mark.integration
async def test_stream(self, llm: ClaudeLLM):
streamingWorks = False

@llm.events.subscribe
async def passed(event: LLMResponseChunkEvent):
nonlocal streamingWorks
@@ -70,7 +70,6 @@ async def passed(event: LLMResponseChunkEvent):

assert streamingWorks


@pytest.mark.integration
async def test_memory(self, llm: ClaudeLLM):
await llm.simple_response(
2 changes: 1 addition & 1 deletion plugins/aws/tests/test_aws.py
@@ -35,7 +35,7 @@ def assert_response_successful(self, response):
async def llm(self) -> BedrockLLM:
"""Test BedrockLLM initialization with a provided client."""
llm = BedrockLLM(model="qwen.qwen3-32b-v1:0", region_name="us-east-1")
llm._conversation = InMemoryConversation("be friendly", [])
llm.set_conversation(InMemoryConversation("be friendly", []))
return llm

@pytest.mark.asyncio
117 changes: 117 additions & 0 deletions plugins/baseten/README.md
@@ -0,0 +1,117 @@
# Qwen3-VL hosted on Baseten
Qwen3-VL is the latest open-source vision-language model (VLM) from Alibaba. This plugin lets developers run the model hosted on [Baseten](https://www.baseten.co/) with Vision Agents. The model accepts text and video and responds with text, which can be vocalised by the TTS service of your choice.

## Features

- **Video understanding**: Automatically buffers and forwards video frames to Baseten-hosted VLM models
- **Streaming responses**: Supports streaming text responses with real-time chunk events
- **Frame buffering**: Configurable frame rate and buffer duration for optimal performance
- **Event-driven**: Emits LLM events (chunks, completion, errors) for integration with other components

## Installation

```bash
uv add "vision-agents[baseten]"
```

## Quick Start

```python
from vision_agents.core import Agent, User
from vision_agents.plugins import baseten, getstream, deepgram, elevenlabs, vogent

async def create_agent(**kwargs) -> Agent:
    # Initialize the Baseten VLM
    llm = baseten.VLM(model="qwen3vl")

    # Create an agent with video understanding capabilities
    agent = Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Video Assistant", id="agent"),
        instructions="You're a helpful video AI assistant. Analyze the video frames and respond to user questions about what you see.",
        llm=llm,
        stt=deepgram.STT(),
        tts=elevenlabs.TTS(),
        turn_detection=vogent.TurnDetection(),
        processors=[],
    )
    return agent

async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    await agent.create_user()
    call = await agent.create_call(call_type, call_id)

    with await agent.join(call):
        # The agent will automatically process video frames and respond to user input
        await agent.finish()
```

## Configuration

### Environment Variables

- **`BASETEN_API_KEY`**: Your Baseten API key (required)
- **`BASETEN_BASE_URL`**: The base URL for your Baseten API endpoint (required)

### Initialization Parameters

```python
baseten.VLM(
    model: str,                            # Baseten model name (e.g., "qwen3vl")
    api_key: Optional[str] = None,         # API key (defaults to BASETEN_API_KEY env var)
    base_url: Optional[str] = None,        # Base URL (defaults to BASETEN_BASE_URL env var)
    fps: int = 1,                          # Frames per second to process (default: 1)
    frame_buffer_seconds: int = 10,        # Seconds of video to buffer (default: 10)
    client: Optional[AsyncOpenAI] = None,  # Custom OpenAI client (optional)
)
```

### Parameters

- **`model`**: The name of the Baseten-hosted model to use. Must be a vision-capable model.
- **`api_key`**: Your Baseten API key. If not provided, reads from `BASETEN_API_KEY` environment variable.
- **`base_url`**: The base URL for Baseten API. If not provided, reads from `BASETEN_BASE_URL` environment variable.
- **`fps`**: Number of video frames per second to capture and send to the model. Lower values reduce API costs but may miss fast-moving content. Default is 1 fps.
- **`frame_buffer_seconds`**: How many seconds of video to buffer. The buffer holds `fps * frame_buffer_seconds` frames in total (with the defaults, 1 × 10 = 10 frames per request). Default is 10 seconds.
- **`client`**: Optional pre-configured `AsyncOpenAI` client. If provided, `api_key` and `base_url` are ignored.
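
For example, you can pass a pre-configured client instead of `api_key` and `base_url`. A minimal sketch; the endpoint URL below is a placeholder, not a real Baseten address:

```python
from openai import AsyncOpenAI

from vision_agents.plugins import baseten

client = AsyncOpenAI(
    api_key="your_baseten_api_key_here",
    base_url="https://your-baseten-endpoint/v1",  # placeholder URL
)

llm = baseten.VLM(model="qwen3vl", client=client)
```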

## How It Works

1. **Video Frame Buffering**: The plugin automatically subscribes to video tracks when the agent joins a call. It buffers frames at the specified FPS for the configured duration.

2. **Frame Processing**: When responding to user input, the plugin (see the sketch after this list):
   - Converts buffered video frames to JPEG format
   - Resizes frames to fit within 800x600 (preserving aspect ratio)
   - Encodes frames as base64 data URLs

3. **API Request**: Sends the conversation history (including system instructions) along with all buffered frames to the Baseten model.

4. **Streaming Response**: Processes the streaming response and emits events for each chunk and completion.
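
Conceptually, steps 2 and 3 turn each buffered frame into an OpenAI-style `image_url` content part. A minimal sketch of that conversion, assuming Pillow frames (illustrative only, not the plugin's actual internals):

```python
import base64
import io

from PIL import Image


def frame_to_data_url(frame: Image.Image) -> str:
    # Downscale in place to fit within 800x600, preserving aspect ratio.
    frame.thumbnail((800, 600))
    buf = io.BytesIO()
    frame.convert("RGB").save(buf, format="JPEG")
    return "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode("ascii")


def build_user_content(frames: list[Image.Image], prompt: str) -> list[dict]:
    # One image part per buffered frame, followed by the user's text.
    parts = [{"type": "image_url", "image_url": {"url": frame_to_data_url(f)}} for f in frames]
    parts.append({"type": "text", "text": prompt})
    return parts
```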

## Events

The plugin emits the following events:

- **`LLMResponseChunkEvent`**: Emitted for each text chunk in the streaming response
- **`LLMResponseCompletedEvent`**: Emitted when the response stream completes
- **`LLMErrorEvent`**: Emitted if an API request fails
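
You can subscribe to these with the same `events` API used in the plugin tests. A sketch; the event import path and the `delta` field name are assumptions, not confirmed by this PR:

```python
from vision_agents.core.llm.events import LLMResponseChunkEvent  # import path assumed

from vision_agents.plugins import baseten

llm = baseten.VLM(model="qwen3vl")


@llm.events.subscribe
async def on_chunk(event: LLMResponseChunkEvent):
    # Stream partial text somewhere useful, e.g. a console or UI.
    print(event.delta, end="")  # `delta` field name assumed
```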

## Requirements

- Python 3.10+
- `openai>=2.5.0`
- `vision-agents` (core framework)
- Baseten API key and base URL

## Notes

- **Frame Rate**: The default FPS of 1 is optimized for VLM use cases. Higher FPS values will increase API costs and latency.
- **Frame Size**: Frames are automatically resized to fit within 800x600 pixels, preserving aspect ratio, to keep the API payload size down.
- **Buffer Duration**: The 10-second default buffer provides context for the model while keeping memory usage reasonable.
- **Tool Calling**: Tool/function calling support is not yet implemented (see TODOs in code).

## Troubleshooting

- **No video processing**: Ensure the agent has joined a call with video tracks available. The plugin automatically subscribes to video when tracks are added.
- **API errors**: Verify your `BASETEN_API_KEY` and `BASETEN_BASE_URL` are set correctly and the model name is valid.
- **High latency**: Consider reducing `fps` or `frame_buffer_seconds` to decrease the number of frames sent per request.
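
For instance, this sketch halves the frames sent per request from the default 10 to 5:

```python
llm = baseten.VLM(model="qwen3vl", fps=1, frame_buffer_seconds=5)
```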
Empty file added plugins/baseten/py.typed
36 changes: 36 additions & 0 deletions plugins/baseten/pyproject.toml
@@ -0,0 +1,36 @@
[build-system]
requires = ["hatchling", "hatch-vcs"]
build-backend = "hatchling.build"

[project]
name = "vision-agents-plugins-baseten"
dynamic = ["version"]
description = "Baseten plugin for vision agents"
readme = "README.md"
requires-python = ">=3.10"
license = "MIT"
dependencies = [
"vision-agents",
"openai>=2.5.0",
]

[project.urls]
Documentation = "https://visionagents.ai/"
Website = "https://visionagents.ai/"
Source = "https://github.com/GetStream/Vision-Agents"

[tool.hatch.version]
source = "vcs"
raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }

[tool.hatch.build.targets.wheel]
packages = ["."]

[tool.uv.sources]
vision-agents = { workspace = true }

[dependency-groups]
dev = [
"pytest>=8.4.1",
"pytest-asyncio>=1.0.0",
]
4 changes: 4 additions & 0 deletions plugins/baseten/vision_agents/plugins/baseten/__init__.py
@@ -0,0 +1,4 @@
from .baseten_vlm import BasetenVLM as VLM


__all__ = ["VLM"]