diff --git a/.env.example b/.env.example
index 21bd47eb..c27a7b57 100644
--- a/.env.example
+++ b/.env.example
@@ -25,3 +25,7 @@ CARTESIA_API_KEY=your_cartesia_api_key_here
 
 # Anthropic API credentials
 ANTHROPIC_API_KEY=your_anthropic_api_key_here
+
+# Baseten API credentials
+BASETEN_API_KEY=your_baseten_api_key_here
+BASETEN_BASE_URL=your_baseten_base_url_here
diff --git a/agents-core/vision_agents/core/agents/agents.py b/agents-core/vision_agents/core/agents/agents.py
index 545840bd..6d8ae358 100644
--- a/agents-core/vision_agents/core/agents/agents.py
+++ b/agents-core/vision_agents/core/agents/agents.py
@@ -551,6 +551,8 @@ async def join(self, call: Call) -> "AgentSessionContextManager":
         # wait for conversation creation coro at the very end of the join flow
         self.conversation = await create_conversation_coro
 
+        # Provide conversation to the LLM so it can access the chat history.
+        self.llm.set_conversation(self.conversation)
         return AgentSessionContextManager(self, self._connection)
 
     async def finish(self):
diff --git a/agents-core/vision_agents/core/llm/llm.py b/agents-core/vision_agents/core/llm/llm.py
index 1871bcd7..3c25ffc4 100644
--- a/agents-core/vision_agents/core/llm/llm.py
+++ b/agents-core/vision_agents/core/llm/llm.py
@@ -26,7 +26,7 @@ from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import Participant
 from getstream.video.rtc import AudioStreamTrack, PcmData
 from vision_agents.core.processors import Processor
-from vision_agents.core.utils.utils import parse_instructions
+from vision_agents.core.utils.utils import Instructions, parse_instructions
 from vision_agents.core.events.manager import EventManager
 from .function_registry import FunctionRegistry
 from .llm_types import ToolSchema, NormalizedToolCallItem
@@ -50,7 +50,6 @@ class LLM(abc.ABC):
     before_response_listener: BeforeCb
     after_response_listener: AfterCb
     agent: Optional["Agent"]
-    _conversation: Optional["Conversation"]
     function_registry: FunctionRegistry
 
     def __init__(self):
@@ -59,6 +58,9 @@ def __init__(self):
         self.events = EventManager()
         self.events.register_events_from_module(events)
         self.function_registry = FunctionRegistry()
+        self.instructions: Optional[str] = None
+        self.parsed_instructions: Optional[Instructions] = None
+        self._conversation: Optional[Conversation] = None
 
     async def warmup(self) -> None:
         """
@@ -187,9 +189,20 @@ def _attach_agent(self, agent: Agent):
         Attach agent to the llm
         """
         self.agent = agent
-        self._conversation = agent.conversation
         self._set_instructions(agent.instructions)
 
+    def set_conversation(self, conversation: Conversation):
+        """
+        Provide the Conversation object to the LLM to access the chat history.
+        To be called by the Agent after it joins the call.
+
+        Args:
+            conversation: a Conversation object
+
+        Returns:
+        """
+        self._conversation = conversation
+
     def _set_instructions(self, instructions: str):
         self.instructions = instructions
diff --git a/plugins/anthropic/tests/test_anthropic_llm.py b/plugins/anthropic/tests/test_anthropic_llm.py
index 7bb9918d..7791b474 100644
--- a/plugins/anthropic/tests/test_anthropic_llm.py
+++ b/plugins/anthropic/tests/test_anthropic_llm.py
@@ -18,7 +18,7 @@ class TestClaudeLLM:
     async def llm(self) -> ClaudeLLM:
         """Test ClaudeLLM initialization with a provided client."""
         llm = ClaudeLLM(model="claude-sonnet-4-20250514")
-        llm._conversation = InMemoryConversation("be friendly", [])
+        llm.set_conversation(InMemoryConversation("be friendly", []))
         return llm
 
     @pytest.mark.asyncio
@@ -58,7 +58,7 @@ async def test_native_api(self, llm: ClaudeLLM):
     @pytest.mark.integration
     async def test_stream(self, llm: ClaudeLLM):
         streamingWorks = False
-        
+
         @llm.events.subscribe
         async def passed(event: LLMResponseChunkEvent):
             nonlocal streamingWorks
@@ -70,7 +70,6 @@ async def passed(event: LLMResponseChunkEvent):
 
         assert streamingWorks
 
-
     @pytest.mark.integration
     async def test_memory(self, llm: ClaudeLLM):
         await llm.simple_response(
diff --git a/plugins/aws/tests/test_aws.py b/plugins/aws/tests/test_aws.py
index 47d9df9c..1c31ed25 100644
--- a/plugins/aws/tests/test_aws.py
+++ b/plugins/aws/tests/test_aws.py
@@ -35,7 +35,7 @@ def assert_response_successful(self, response):
     async def llm(self) -> BedrockLLM:
         """Test BedrockLLM initialization with a provided client."""
         llm = BedrockLLM(model="qwen.qwen3-32b-v1:0", region_name="us-east-1")
-        llm._conversation = InMemoryConversation("be friendly", [])
+        llm.set_conversation(InMemoryConversation("be friendly", []))
         return llm
 
     @pytest.mark.asyncio
diff --git a/plugins/baseten/README.md b/plugins/baseten/README.md
new file mode 100644
index 00000000..92409564
--- /dev/null
+++ b/plugins/baseten/README.md
@@ -0,0 +1,117 @@
+# Qwen3-VL hosted on Baseten
+Qwen3-VL is the latest open-source vision-language model (VLM) from Alibaba. This plugin lets developers run the model hosted on [Baseten](https://www.baseten.co/) with Vision Agents. The model accepts text and video and responds with text, vocalised with the TTS service of your choice.
+
+## Features
+
+- **Video understanding**: Automatically buffers and forwards video frames to Baseten-hosted VLM models
+- **Streaming responses**: Supports streaming text responses with real-time chunk events
+- **Frame buffering**: Configurable frame rate and buffer duration for optimal performance
+- **Event-driven**: Emits LLM events (chunks, completion, errors) for integration with other components
+
+## Installation
+
+```bash
+uv add vision-agents[baseten]
+```
+
+## Quick Start
+
+```python
+from vision_agents.core import Agent, User
+from vision_agents.plugins import baseten, getstream, deepgram, elevenlabs, vogent
+
+async def create_agent(**kwargs) -> Agent:
+    # Initialize the Baseten VLM
+    llm = baseten.VLM(model="qwen3vl")
+
+    # Create an agent with video understanding capabilities
+    agent = Agent(
+        edge=getstream.Edge(),
+        agent_user=User(name="Video Assistant", id="agent"),
+        instructions="You're a helpful video AI assistant. Analyze the video frames and respond to user questions about what you see.",
+        llm=llm,
+        stt=deepgram.STT(),
+        tts=elevenlabs.TTS(),
+        turn_detection=vogent.TurnDetection(),
+        processors=[],
+    )
+    return agent
+
+async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
+    await agent.create_user()
+    call = await agent.create_call(call_type, call_id)
+
+    with await agent.join(call):
+        # The agent will automatically process video frames and respond to user input
+        await agent.finish()
+```
+
+## Configuration
+
+### Environment Variables
+
+- **`BASETEN_API_KEY`**: Your Baseten API key (required)
+- **`BASETEN_BASE_URL`**: The base URL for your Baseten API endpoint (required)
+
+### Initialization Parameters
+
+```python
+baseten.VLM(
+    model: str,                            # Baseten model name (e.g., "qwen3vl")
+    api_key: Optional[str] = None,         # API key (defaults to BASETEN_API_KEY env var)
+    base_url: Optional[str] = None,        # Base URL (defaults to BASETEN_BASE_URL env var)
+    fps: int = 1,                          # Frames per second to process (default: 1)
+    frame_buffer_seconds: int = 10,        # Seconds of video to buffer (default: 10)
+    client: Optional[AsyncOpenAI] = None,  # Custom OpenAI client (optional)
+)
+```
+
+### Parameters
+
+- **`model`**: The name of the Baseten-hosted model to use. Must be a vision-capable model.
+- **`api_key`**: Your Baseten API key. If not provided, reads from `BASETEN_API_KEY` environment variable.
+- **`base_url`**: The base URL for Baseten API. If not provided, reads from `BASETEN_BASE_URL` environment variable.
+- **`fps`**: Number of video frames per second to capture and send to the model. Lower values reduce API costs but may miss fast-moving content. Default is 1 fps.
+- **`frame_buffer_seconds`**: How many seconds of video to buffer. Total buffer size = `fps * frame_buffer_seconds`. Default is 10 seconds.
+- **`client`**: Optional pre-configured `AsyncOpenAI` client. If provided, `api_key` and `base_url` are ignored.
+
+## How It Works
+
+1. **Video Frame Buffering**: The plugin automatically subscribes to video tracks when the agent joins a call. It buffers frames at the specified FPS for the configured duration.
+
+2. **Frame Processing**: When responding to user input, the plugin:
+   - Converts buffered video frames to JPEG format
+   - Resizes frames to fit within 800x600 (maintaining aspect ratio)
+   - Encodes frames as base64 data URLs
+
+3. **API Request**: Sends the conversation history (including system instructions) along with all buffered frames to the Baseten model.
+
+4. **Streaming Response**: Processes the streaming response and emits events for each chunk and completion.
+
+## Events
+
+The plugin emits the following events:
+
+- **`LLMResponseChunkEvent`**: Emitted for each text chunk in the streaming response
+- **`LLMResponseCompletedEvent`**: Emitted when the response stream completes
+- **`LLMErrorEvent`**: Emitted if an API request fails
+
+## Requirements
+
+- Python 3.10+
+- `openai>=2.5.0`
+- `vision-agents` (core framework)
+- Baseten API key and base URL
+
+## Notes
+
+- **Frame Rate**: The default FPS of 1 is optimized for VLM use cases. Higher FPS values will increase API costs and latency.
+- **Frame Size**: Frames are automatically resized to fit within 800x600 pixels while maintaining aspect ratio to optimize API payload size.
+- **Buffer Duration**: The 10-second default buffer provides context for the model while keeping memory usage reasonable.
+- **Tool Calling**: Tool/function calling support is not yet implemented (see TODOs in code).
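+
+## Example: Subscribing to Response Events
+
+As a quick illustration of the events listed above, the sketch below subscribes to the chunk and completion events on a `baseten.VLM` instance. It is a minimal, standalone example (the handler names are placeholders), not part of the plugin itself.
+
+```python
+from vision_agents.core.llm.events import (
+    LLMResponseChunkEvent,
+    LLMResponseCompletedEvent,
+)
+from vision_agents.plugins import baseten
+
+llm = baseten.VLM(model="qwen3vl")
+
+@llm.events.subscribe
+async def on_chunk(event: LLMResponseChunkEvent):
+    # Each chunk carries the incremental text delta of the streaming response.
+    print(event.delta, end="", flush=True)
+
+@llm.events.subscribe
+async def on_completed(event: LLMResponseCompletedEvent):
+    # The completed event carries the full response text.
+    print(f"\nCompleted response ({len(event.text)} characters)")
+```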
+
+## Troubleshooting
+
+- **No video processing**: Ensure the agent has joined a call with video tracks available. The plugin automatically subscribes to video when tracks are added.
+- **API errors**: Verify your `BASETEN_API_KEY` and `BASETEN_BASE_URL` are set correctly and the model name is valid.
+- **High latency**: Consider reducing `fps` or `frame_buffer_seconds` to decrease the number of frames sent per request.
diff --git a/plugins/baseten/example/README.md b/plugins/baseten/example/README.md
new file mode 100644
index 00000000..69c2bf0a
--- /dev/null
+++ b/plugins/baseten/example/README.md
@@ -0,0 +1 @@
+Please see the root plugin README.
\ No newline at end of file
diff --git a/plugins/baseten/example/__init__.py b/plugins/baseten/example/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/plugins/baseten/example/pyproject.toml b/plugins/baseten/example/pyproject.toml
new file mode 100644
index 00000000..d34229b4
--- /dev/null
+++ b/plugins/baseten/example/pyproject.toml
@@ -0,0 +1,21 @@
+[project]
+name = "qwen3-vl-example"
+version = "0.1.0"
+description = "Example using Qwen3 VL hosted on Baseten with Vision Agents"
+requires-python = ">=3.10"
+dependencies = [
+    "vision-agents",
+    "vision-agents-plugins-baseten",
+    "vision-agents-plugins-getstream",
+    "vision-agents-plugins-deepgram",
+    "vision-agents-plugins-elevenlabs",
+    "python-dotenv",
+]
+
+[tool.uv.sources]
+vision-agents = { workspace = true }
+vision-agents-plugins-baseten = { workspace = true }
+vision-agents-plugins-elevenlabs = { workspace = true }
+vision-agents-plugins-getstream = { workspace = true }
+vision-agents-plugins-deepgram = { workspace = true }
+
diff --git a/plugins/baseten/example/qwen_vl_example.py b/plugins/baseten/example/qwen_vl_example.py
new file mode 100644
index 00000000..7db371f2
--- /dev/null
+++ b/plugins/baseten/example/qwen_vl_example.py
@@ -0,0 +1,46 @@
+import asyncio
+
+from dotenv import load_dotenv
+
+from vision_agents.core import Agent, User, cli
+from vision_agents.core.agents import AgentLauncher
+from vision_agents.plugins import baseten, getstream, deepgram, elevenlabs
+from vision_agents.core.events import CallSessionParticipantJoinedEvent
+
+
+load_dotenv()
+
+
+async def create_agent(**kwargs) -> Agent:
+    # Initialize the Baseten VLM
+    llm = baseten.VLM(model="qwen3vl")
+
+    # Create an agent with video understanding capabilities
+    agent = Agent(
+        edge=getstream.Edge(),
+        agent_user=User(name="Video Assistant", id="agent"),
+        instructions="You're a helpful video AI assistant. Analyze the video frames and respond to user questions about what you see.",
+        llm=llm,
+        stt=deepgram.STT(),
+        tts=elevenlabs.TTS(),
+        processors=[],
+    )
+    return agent
+
+async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
+    await agent.create_user()
+    call = await agent.create_call(call_type, call_id)
+
+    @agent.events.subscribe
+    async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
+        if event.participant.user.id != "agent":
+            await asyncio.sleep(2)
+            await agent.simple_response("Describe what you currently see")
+
+    with await agent.join(call):
+        await agent.edge.open_demo(call)
+        # The agent will automatically process video frames and respond to user input
+        await agent.finish()
+
+if __name__ == "__main__":
+    cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
\ No newline at end of file
diff --git a/plugins/baseten/py.typed b/plugins/baseten/py.typed
new file mode 100644
index 00000000..e69de29b
diff --git a/plugins/baseten/pyproject.toml b/plugins/baseten/pyproject.toml
new file mode 100644
index 00000000..35af0d8c
--- /dev/null
+++ b/plugins/baseten/pyproject.toml
@@ -0,0 +1,36 @@
+[build-system]
+requires = ["hatchling", "hatch-vcs"]
+build-backend = "hatchling.build"
+
+[project]
+name = "vision-agents-plugins-baseten"
+dynamic = ["version"]
+description = "Baseten plugin for vision agents"
+readme = "README.md"
+requires-python = ">=3.10"
+license = "MIT"
+dependencies = [
+    "vision-agents",
+    "openai>=2.5.0",
+]
+
+[project.urls]
+Documentation = "https://visionagents.ai/"
+Website = "https://visionagents.ai/"
+Source = "https://github.com/GetStream/Vision-Agents"
+
+[tool.hatch.version]
+source = "vcs"
+raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" }
+
+[tool.hatch.build.targets.wheel]
+packages = ["."]
+
+[tool.uv.sources]
+vision-agents = { workspace = true }
+
+[dependency-groups]
+dev = [
+    "pytest>=8.4.1",
+    "pytest-asyncio>=1.0.0",
+]
diff --git a/plugins/baseten/vision_agents/plugins/baseten/__init__.py b/plugins/baseten/vision_agents/plugins/baseten/__init__.py
new file mode 100644
index 00000000..19a94314
--- /dev/null
+++ b/plugins/baseten/vision_agents/plugins/baseten/__init__.py
@@ -0,0 +1,4 @@
+from .baseten_vlm import BasetenVLM as VLM
+
+
+__all__ = ["VLM"]
diff --git a/plugins/baseten/vision_agents/plugins/baseten/baseten_vlm.py b/plugins/baseten/vision_agents/plugins/baseten/baseten_vlm.py
new file mode 100644
index 00000000..7e84acc1
--- /dev/null
+++ b/plugins/baseten/vision_agents/plugins/baseten/baseten_vlm.py
@@ -0,0 +1,307 @@
+import base64
+import io
+import logging
+import os
+from collections import deque
+from typing import Iterator, Optional, cast
+
+import av
+from aiortc.mediastreams import MediaStreamTrack, VideoStreamTrack
+from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import Participant
+from openai import AsyncOpenAI, AsyncStream
+from openai.types.chat import ChatCompletionChunk
+from PIL.Image import Resampling
+from vision_agents.core.llm.events import (
+    LLMResponseChunkEvent,
+    LLMResponseCompletedEvent,
+)
+from vision_agents.core.llm.llm import LLMResponseEvent, VideoLLM
+from vision_agents.core.processors import Processor
+from vision_agents.core.utils.video_forwarder import VideoForwarder
+
+from . import events
+
+logger = logging.getLogger(__name__)
+
+
+PLUGIN_NAME = "baseten_vlm"
+
+
+class BasetenVLM(VideoLLM):
+    """
+    TODO: Docs
+
+    Examples:
+
+        from vision_agents.plugins import baseten
+        llm = baseten.VLM(model="qwen3vl")
+
+    """
+
+    def __init__(
+        self,
+        model: str,
+        api_key: Optional[str] = None,
+        base_url: Optional[str] = None,
+        fps: int = 1,
+        frame_buffer_seconds: int = 10,
+        client: Optional[AsyncOpenAI] = None,
+    ):
+        """
+        Initialize the BasetenVLM class.
+
+        Args:
+            model (str): The Baseten-hosted model to use.
+            api_key: optional API key. By default, loads from the BASETEN_API_KEY environment variable.
+            base_url: optional base URL. By default, loads from the BASETEN_BASE_URL environment variable.
+            fps: the number of video frames per second to handle.
+            frame_buffer_seconds: the number of seconds to buffer for the model's input.
+                Total buffer size = fps * frame_buffer_seconds.
+            client: optional `AsyncOpenAI` client. By default, creates a new client object.
+        """
+        super().__init__()
+        self.model = model
+        self.events.register_events_from_module(events)
+
+        api_key = api_key or os.getenv("BASETEN_API_KEY")
+        base_url = base_url or os.getenv("BASETEN_BASE_URL")
+        if client is not None:
+            self._client = client
+        elif not api_key:
+            raise ValueError("api_key must be provided")
+        elif not base_url:
+            raise ValueError("base_url must be provided")
+        else:
+            self._client = AsyncOpenAI(api_key=api_key, base_url=base_url)
+
+        self._fps = fps
+        self._video_forwarder: Optional[VideoForwarder] = None
+
+        # Buffer latest 10s of the video track to forward it to the model
+        # together with the user transcripts
+        self._frame_buffer: deque[av.VideoFrame] = deque(
+            maxlen=fps * frame_buffer_seconds
+        )
+        self._frame_width = 800
+        self._frame_height = 600
+
+    async def simple_response(
+        self,
+        text: str,
+        processors: Optional[list[Processor]] = None,
+        participant: Optional[Participant] = None,
+    ) -> LLMResponseEvent:
+        """
+        simple_response is a standardized way to create an LLM response.
+
+        This method is also called every time a new STT transcript is received.
+
+        Args:
+            text: The text to respond to.
+            processors: list of processors (which contain state) about the video/voice AI.
+            participant: the Participant object, optional.
+
+        Examples:
+
+            llm.simple_response("say hi to the user, be nice")
+        """
+
+        # TODO: Clean up `_build_enhanced_instructions` and use that. These should probably be compiled at the agent.
+
+        if self._conversation is None:
+            # The agent hasn't joined the call yet.
+            logger.warning(
+                "Cannot create an LLM response - the conversation has not been initialized yet."
+            )
+            return LLMResponseEvent(original=None, text="")
+
+        messages: list[dict] = []
+        # Add Agent's instructions as system prompt.
+        if self.instructions:
+            messages.append(
+                {
+                    "role": "system",
+                    "content": self.instructions,
+                }
+            )
+
+        # TODO: Do we need to limit how many messages we send?
+        # Add all messages from the conversation to the prompt
+        for message in self._conversation.messages:
+            messages.append(
+                {
+                    "role": message.role,
+                    "content": message.content,
+                }
+            )
+
+        # Attach the latest buffered frames to the request
+        frames_data = []
+        for frame_bytes in self._get_frames_bytes():
+            frame_b64 = base64.b64encode(frame_bytes).decode("utf-8")
+            frame_msg = {
+                "type": "image_url",
+                "image_url": {"url": f"data:image/jpeg;base64,{frame_b64}"},
+            }
+            frames_data.append(frame_msg)
+
+        logger.debug(
+            f'Forwarding {len(frames_data)} frames to the Baseten model "{self.model}"'
+        )
+
+        messages.append(
+            {
+                "role": "user",
+                "content": frames_data,
+            }
+        )
+
+        # TODO: Maybe move it to a method, too much code
+        try:
+            response = await self._client.chat.completions.create(  # type: ignore[arg-type]
+                messages=messages,  # type: ignore[arg-type]
+                model=self.model,
+                stream=True,
+            )
+        except Exception as e:
+            # Send an error event if the request failed
+            logger.exception(
+                f'Failed to get a response from the Baseten model "{self.model}"'
+            )
+            self.events.send(
+                events.LLMErrorEvent(
+                    plugin_name=PLUGIN_NAME,
+                    error_message=str(e),
+                    event_data=e,
+                )
+            )
+            return LLMResponseEvent(original=None, text="")
+
+        i = 0
+        llm_response_event: LLMResponseEvent[Optional[ChatCompletionChunk]] = (
+            LLMResponseEvent(original=None, text="")
+        )
+        text_chunks: list[str] = []
+        total_text = ""
+        async for chunk in cast(AsyncStream[ChatCompletionChunk], response):
+            if not chunk.choices:
+                continue
+
+            choice = chunk.choices[0]
+            content = choice.delta.content
+            finish_reason = choice.finish_reason
+
+            if content:
+                text_chunks.append(content)
+                # Emit delta events for each response chunk.
+                self.events.send(
+                    LLMResponseChunkEvent(
+                        plugin_name=PLUGIN_NAME,
+                        content_index=None,
+                        item_id=chunk.id,
+                        output_index=0,
+                        sequence_number=i,
+                        delta=content,
+                    )
+                )
+
+            elif finish_reason:
+                # Emit the completion event when the response stream is finished.
+                total_text = "".join(text_chunks)
+                self.events.send(
+                    LLMResponseCompletedEvent(
+                        plugin_name=PLUGIN_NAME,
+                        original=chunk,
+                        text=total_text,
+                        item_id=chunk.id,
+                    )
+                )
+
+                llm_response_event = LLMResponseEvent(original=chunk, text=total_text)
+            i += 1
+
+        return llm_response_event
+
+    async def watch_video_track(
+        self,
+        track: MediaStreamTrack,
+        shared_forwarder: Optional[VideoForwarder] = None,
+    ) -> None:
+        """
+        Set up video forwarding and start buffering video frames.
+        This method is called by the `Agent`.
+
+        Args:
+            track: instance of VideoStreamTrack.
+            shared_forwarder: a shared VideoForwarder instance if present. Defaults to None.
+
+        Returns: None
+        """
+
+        if self._video_forwarder is not None and shared_forwarder is None:
+            logger.warning("Video forwarder already running, stopping the previous one")
+            await self._video_forwarder.stop()
+            self._video_forwarder = None
+            logger.info("Stopped video forwarding")
+
+        logger.info("🎥 BasetenVLM subscribing to VideoForwarder")
+        if not shared_forwarder:
+            self._video_forwarder = VideoForwarder(
+                cast(VideoStreamTrack, track),
+                max_buffer=10,
+                fps=1.0,  # Low FPS for VLM
+                name="baseten_vlm_forwarder",
+            )
+            await self._video_forwarder.start()
+        else:
+            self._video_forwarder = shared_forwarder
+
+        # Start buffering video frames
+        await self._video_forwarder.start_event_consumer(self._frame_buffer.append)
+
+    def _get_frames_bytes(self) -> Iterator[bytes]:
+        """
+        Iterate over all buffered video frames.
+        """
+        for frame in self._frame_buffer:
+            yield _frame_to_jpeg_bytes(
+                frame=frame,
+                target_width=self._frame_width,
+                target_height=self._frame_height,
+                quality=85,
+            )
+
+
+# TODO: Move it to some core utils
+def _frame_to_jpeg_bytes(
+    frame: av.VideoFrame, target_width: int, target_height: int, quality: int = 85
+) -> bytes:
+    """
+    Convert a frame to JPEG bytes with resizing.
+
+    Args:
+        frame: an instance of `av.VideoFrame`
+        target_width: target width in pixels
+        target_height: target height in pixels
+        quality: JPEG quality. Default is 85.
+
+    Returns: frame as JPEG bytes.
+
+    """
+    # Convert frame to a PIL image
+    img = frame.to_image()
+
+    # Calculate scaling to maintain aspect ratio
+    src_width, src_height = img.size
+    # Calculate scale factor (fit within target dimensions)
+    scale = min(target_width / src_width, target_height / src_height)
+    new_width = int(src_width * scale)
+    new_height = int(src_height * scale)
+
+    # Resize with aspect ratio maintained
+    resized = img.resize((new_width, new_height), Resampling.LANCZOS)
+
+    # Save as JPEG with quality control
+    buf = io.BytesIO()
+    resized.save(buf, "JPEG", quality=quality, optimize=True)
+    return buf.getvalue()
diff --git a/plugins/baseten/vision_agents/plugins/baseten/events.py b/plugins/baseten/vision_agents/plugins/baseten/events.py
new file mode 100644
index 00000000..a521661e
--- /dev/null
+++ b/plugins/baseten/vision_agents/plugins/baseten/events.py
@@ -0,0 +1,12 @@
+from dataclasses import dataclass, field
+from vision_agents.core.events import PluginBaseEvent
+from typing import Optional, Any
+
+
+@dataclass
+class LLMErrorEvent(PluginBaseEvent):
+    """Event emitted when an LLM encounters an error."""
+
+    type: str = field(default="plugin.llm.error", init=False)
+    error_message: Optional[str] = None
+    event_data: Optional[Any] = None
diff --git a/plugins/gemini/tests/test_gemini_llm.py b/plugins/gemini/tests/test_gemini_llm.py
index 7210fff2..f683a56e 100644
--- a/plugins/gemini/tests/test_gemini_llm.py
+++ b/plugins/gemini/tests/test_gemini_llm.py
@@ -14,9 +14,7 @@
 load_dotenv()
 
-
 class TestGeminiLLM:
-
     def test_message(self):
         messages = GeminiLLM._normalize_message("say hi")
         assert isinstance(messages[0], Message)
@@ -32,7 +30,7 @@ def test_advanced_message(self):
     @pytest.fixture
     async def llm(self) -> GeminiLLM:
         llm = GeminiLLM(model="gemini-2.0-flash-exp")
-        llm._conversation = InMemoryConversation("be friendly", [])
+        llm.set_conversation(InMemoryConversation("be friendly", []))
         return llm
 
     @pytest.mark.integration
@@ -51,14 +49,14 @@ async def test_native_api(self, llm: GeminiLLM):
     @pytest.mark.integration
     async def test_stream(self, llm: GeminiLLM):
         streamingWorks = False
-        
+
         @llm.events.subscribe
         async def passed(event: LLMResponseChunkEvent):
             nonlocal streamingWorks
             streamingWorks = True
-        
+
         await llm.simple_response("Explain magma to a 5 year old")
-        
+
         # Wait for all events in queue to be processed
         await llm.events.wait()
@@ -67,7 +65,9 @@ async def passed(event: LLMResponseChunkEvent):
     @pytest.mark.integration
     async def test_memory(self, llm: GeminiLLM):
         await llm.simple_response(text="There are 2 dogs in the room")
-        response = await llm.simple_response(text="How many paws are there in the room?")
+        response = await llm.simple_response(
+            text="How many paws are there in the room?"
+        )
 
         assert "8" in response.text or "eight" in response.text
 
@@ -82,7 +82,7 @@ async def test_native_memory(self, llm: GeminiLLM):
     @pytest.mark.integration
     async def test_instruction_following(self):
         llm = GeminiLLM(model="gemini-2.0-flash-exp")
-        llm._conversation = InMemoryConversation("be friendly", [])
+        llm.set_conversation(InMemoryConversation("be friendly", []))
 
         llm._set_instructions("only reply in 2 letter country shortcuts")
 
@@ -165,11 +165,11 @@ async def handle_error_event(event: events.GeminiErrorEvent):
                 )
                 chunk_item_ids.add(chunk_event.item_id)
                 total_delta_text += chunk_event.delta
-                
+
                 # Validate content_index: should be sequential (0, 1, 2, ...) or None
                 if chunk_event.content_index is not None:
                     content_indices.append(chunk_event.content_index)
-        
+
         # Verify content_index sequencing if any are provided
         if content_indices:
             # Should be sequential starting from 0
diff --git a/plugins/openrouter/tests/test_openrouter_llm.py b/plugins/openrouter/tests/test_openrouter_llm.py
index 77c586cc..2962d36c 100644
--- a/plugins/openrouter/tests/test_openrouter_llm.py
+++ b/plugins/openrouter/tests/test_openrouter_llm.py
@@ -62,9 +62,9 @@ async def llm(self) -> LLM:
         """Fixture for OpenRouter LLM with z-ai/glm-4.6 model."""
         if not os.environ.get("OPENROUTER_API_KEY"):
             pytest.skip("OPENROUTER_API_KEY environment variable not set")
-        
+
         llm = LLM(model="anthropic/claude-haiku-4.5")
-        llm._conversation = InMemoryConversation("be friendly", [])
+        llm.set_conversation(InMemoryConversation("be friendly", []))
         return llm
 
     @pytest.mark.integration
@@ -114,7 +114,7 @@ async def test_memory(self, llm: LLM):
         response = await llm.simple_response(
             text="How many paws are there in the room?",
         )
-        
+
         self.assert_response_successful(response)
         assert "8" in response.text or "eight" in response.text.lower(), (
             f"Expected '8' or 'eight' in response, got: {response.text}"
         )
@@ -129,7 +129,7 @@ async def test_native_memory(self, llm: LLM):
         response = await llm.create_response(
             input="How many paws are there in the room?",
         )
-        
+
         self.assert_response_successful(response)
         assert "8" in response.text or "eight" in response.text.lower(), (
             f"Expected '8' or 'eight' in response, got: {response.text}"
         )
@@ -153,4 +153,3 @@ async def test_instruction_following(self):
         assert "nl" in response.text.lower(), (
             f"Expected 'NL' in response, got: {response.text}"
         )
-
diff --git a/pyproject.toml b/pyproject.toml
index 6eef8063..9a8a73a5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,6 +6,7 @@ build-backend = "hatchling.build"
 vision-agents = { workspace = true }
 vision-agents-plugins-anthropic = { workspace = true }
 vision-agents-plugins-aws = { workspace = true }
+vision-agents-plugins-baseten = { workspace = true }
 vision-agents-plugins-cartesia = { workspace = true }
 vision-agents-plugins-deepgram = { workspace = true }
 vision-agents-plugins-elevenlabs = { workspace = true }
@@ -37,6 +38,7 @@ members = [
     "agents-core",
     "plugins/anthropic",
     "plugins/aws",
+    "plugins/baseten",
     "plugins/cartesia",
     "plugins/deepgram",
     "plugins/elevenlabs",
diff --git a/uv.lock b/uv.lock
index 9a8bfe28..0d0c374a 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1,5 +1,5 @@
 version = 1
-revision = 2
+revision = 3
 requires-python = ">=3.12"
 resolution-markers = [
     "python_full_version >= '3.13' and sys_platform == 'win32'",
@@ -13,6 +13,7 @@ members = [
     "vision-agents",
     "vision-agents-plugins-anthropic",
     "vision-agents-plugins-aws",
+    "vision-agents-plugins-baseten",
     "vision-agents-plugins-cartesia",
     "vision-agents-plugins-deepgram",
     "vision-agents-plugins-elevenlabs",
@@ -5532,6 +5533,32 @@ dev = [
     { name = "pytest-asyncio", specifier = ">=1.0.0" },
 ]
 
+[[package]]
+name = "vision-agents-plugins-baseten"
+source = { editable = "plugins/baseten" }
+dependencies = [
+    { name = "openai" },
+    { name = "vision-agents" },
+]
+
+[package.dev-dependencies]
+dev = [
+    { name = "pytest" },
+    { name = "pytest-asyncio" },
+]
+
+[package.metadata]
+requires-dist = [
+    { name = "openai", specifier = ">=2.5.0" },
+    { name = "vision-agents", editable = "agents-core" },
+]
+
+[package.metadata.requires-dev]
+dev = [
+    { name = "pytest", specifier = ">=8.4.1" },
+    { name = "pytest-asyncio", specifier = ">=1.0.0" },
+]
+
 [[package]]
 name = "vision-agents-plugins-cartesia"
 source = { editable = "plugins/cartesia" }