Skip to content
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions agents-core/vision_agents/core/agents/agent_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,14 @@ async def warmup(self, **kwargs) -> None:
if agent.turn_detection and hasattr(agent.turn_detection, 'warmup'):
logger.debug("Warming up turn detection: %s", agent.turn_detection.__class__.__name__)
warmup_tasks.append(agent.turn_detection.warmup())

# Warmup processors
if agent.processors and hasattr(agent.processors, 'warmup'):
logger.debug("Warming up processors")
for processor in agent.processors:
if hasattr(processor, 'warmup'):
logger.debug("Warming up processor: %s", processor.__class__.__name__)
warmup_tasks.append(processor.warmup())

# Run all warmups in parallel
if warmup_tasks:
Expand Down
5 changes: 2 additions & 3 deletions agents-core/vision_agents/core/agents/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
import time
import uuid
from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard, Coroutine
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard
from uuid import uuid4

import getstream.models
from aiortc import VideoStreamTrack
from getstream.video.async_call import Call
from getstream.video.rtc import Call

from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import TrackType
Expand Down Expand Up @@ -697,7 +696,7 @@ async def create_user(self) -> None:
async def create_call(self, call_type: str, call_id: str) -> Call:
"""Shortcut for creating a call/room etc."""
call = self.edge.client.video.call(call_type, call_id)
response = await call.get_or_create(data={"created_by_id": self.agent_user.id})
await call.get_or_create(data={"created_by_id": self.agent_user.id})

return call

Expand Down
Empty file.
Empty file.
55 changes: 55 additions & 0 deletions plugins/moondream/example/moondream_vlm_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import asyncio
import logging
from dotenv import load_dotenv

from vision_agents.core import User, Agent, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import deepgram, getstream, vogent, elevenlabs, moondream, gemini
from vision_agents.core.events import CallSessionParticipantJoinedEvent

logger = logging.getLogger(__name__)

load_dotenv()

async def create_agent(**kwargs) -> Agent:
    """Build the agent used by this example.

    The agent runs on Stream's edge with a Gemini LLM, ElevenLabs TTS,
    Deepgram STT and Vogent turn detection. A local Moondream detection
    processor is attached through the ``processors`` list.

    Args:
        **kwargs: Accepted for launcher compatibility; not used here.

    Returns:
        The configured ``Agent`` instance.
    """
    # Moondream local detector. It is passed to the agent as a processor,
    # not as its language model.
    detection_processor = moondream.LocalDetectionProcessor()

    return Agent(
        edge=getstream.Edge(),  # low latency edge. clients for React, iOS, Android, RN, Flutter etc.
        agent_user=User(name="My happy AI friend", id="agent"),
        llm=gemini.LLM("gemini-2.0-flash"),
        tts=elevenlabs.TTS(),
        stt=deepgram.STT(),
        turn_detection=vogent.TurnDetection(),
        processors=[detection_processor],
    )


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    """Create the call, join it with the agent and stay until it finishes.

    Args:
        agent: The agent that should join the call.
        call_type: Call type forwarded to ``agent.create_call``.
        call_id: Call identifier forwarded to ``agent.create_call``.
        **kwargs: Accepted for launcher compatibility; not used here.
    """
    # The agent user has to exist before a call is created on its behalf.
    await agent.create_user()
    call = await agent.create_call(call_type, call_id)

    @agent.events.subscribe
    async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
        # Skip join events whose user id is "agent" (the id given to the agent user).
        if event.participant.user.id == "agent":
            return
        # Wait two seconds after the join before prompting the agent.
        await asyncio.sleep(2)
        await agent.simple_response("Describe what you currently see")

    # Join the call/room; the context manager scopes the agent's session.
    with await agent.join(call):
        # Open the demo UI for this call.
        await agent.edge.open_demo(call)
        # Block until the call ends.
        await agent.finish()


if __name__ == "__main__":
    # Script entry point: wrap both callbacks in an AgentLauncher and hand it to the CLI.
    cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
22 changes: 22 additions & 0 deletions plugins/moondream/example/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
[project]
name = "moondream-example"
version = "0.1.0"
description = "Example using Moondream Detect and VLM with Vision Agents"
requires-python = ">=3.10"
dependencies = [
    "vision-agents",
    "vision-agents-plugins-moondream",
    "vision-agents-plugins-getstream",
    "vision-agents-plugins-deepgram",
    "vision-agents-plugins-elevenlabs",
    "vision-agents-plugins-vogent",
    # moondream_vlm_example.py imports and uses gemini.LLM, so the Gemini plugin is required.
    "vision-agents-plugins-gemini",
    "python-dotenv",
]

[tool.uv.sources]
vision-agents = { workspace = true }
vision-agents-plugins-moondream = { workspace = true }
vision-agents-plugins-getstream = { workspace = true }
vision-agents-plugins-deepgram = { workspace = true }
vision-agents-plugins-elevenlabs = { workspace = true }
vision-agents-plugins-vogent = { workspace = true }
vision-agents-plugins-gemini = { workspace = true }
8 changes: 4 additions & 4 deletions plugins/moondream/tests/test_moondream_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def golf_image(self, assets_dir) -> Iterator[Image.Image]:
@pytest.fixture
def moondream_processor(self) -> Iterator[LocalDetectionProcessor]:
"""Create and manage MoondreamLocalProcessor lifecycle."""
processor = LocalDetectionProcessor(device="cpu")
processor = LocalDetectionProcessor(force_cpu=True)
try:
yield processor
finally:
Expand Down Expand Up @@ -261,7 +261,7 @@ def is_available():
processor.close()

# Also test explicit MPS parameter
processor2 = LocalDetectionProcessor(device="mps")
processor2 = LocalDetectionProcessor(force_cpu=True)
try:
# Verify explicit MPS is also converted to CPU
assert processor2.device == "cpu"
Expand All @@ -270,7 +270,7 @@ def is_available():

def test_device_explicit_cpu(self):
"""Test explicit CPU device selection."""
processor = LocalDetectionProcessor(device="cpu")
processor = LocalDetectionProcessor(force_cpu=True)
try:
assert processor.device == "cpu"
finally:
Expand All @@ -282,7 +282,7 @@ def test_device_explicit_cpu(self):
)
def test_device_explicit_cuda(self):
"""Test explicit CUDA device selection (only if CUDA available)."""
processor = LocalDetectionProcessor(device="cuda")
processor = LocalDetectionProcessor()
try:
assert processor.device == "cuda"
finally:
Expand Down
102 changes: 102 additions & 0 deletions plugins/moondream/tests/test_moondream_local_vlm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
"""
Tests for the Moondream LocalVLM plugin.

Integration tests require HF_TOKEN environment variable (for gated model access):

export HF_TOKEN="your-token-here"
uv run pytest plugins/moondream/tests/test_moondream_local_vlm.py -m integration -v
"""
import os
from pathlib import Path
from typing import Iterator

import pytest
import av
from PIL import Image

from vision_agents.plugins.moondream import LocalVLM


@pytest.fixture(scope="session")
def golf_image(assets_dir) -> Iterator[Image.Image]:
    """Yield the golf swing test image from the assets directory, converted to RGB."""
    image_path = Path(assets_dir).joinpath("golf_swing.png")
    # The source file stays open until the fixture is torn down.
    with Image.open(image_path) as source:
        rgb_image = source.convert("RGB")
        yield rgb_image


@pytest.fixture
def golf_frame(golf_image: Image.Image) -> av.VideoFrame:
    """Wrap the golf test image in an av.VideoFrame."""
    video_frame = av.VideoFrame.from_image(golf_image)
    return video_frame


@pytest.fixture
async def local_vlm_vqa() -> LocalVLM:
    """Yield a warmed-up LocalVLM in VQA mode.

    Skips the requesting test when HF_TOKEN is not set. The instance is
    closed on teardown, including when warmup raises.
    """
    if not os.getenv("HF_TOKEN"):
        pytest.skip("HF_TOKEN not set")

    model = LocalVLM(mode="vqa")
    try:
        await model.warmup()
        yield model
    finally:
        model.close()


@pytest.fixture
async def local_vlm_caption() -> LocalVLM:
    """Yield a warmed-up LocalVLM in caption mode.

    Skips the requesting test when HF_TOKEN is not set. The instance is
    closed on teardown, including when warmup raises.
    """
    if not os.getenv("HF_TOKEN"):
        pytest.skip("HF_TOKEN not set")

    model = LocalVLM(mode="caption")
    try:
        await model.warmup()
        yield model
    finally:
        model.close()


@pytest.mark.integration
@pytest.mark.skipif(not os.getenv("HF_TOKEN"), reason="HF_TOKEN not set")
async def test_local_vqa_mode(golf_frame: av.VideoFrame, local_vlm_vqa: LocalVLM):
    """Test LocalVLM VQA mode with a question about the image."""
    # No warmup() call here: the local_vlm_vqa fixture awaits it before
    # yielding, so repeating it would only duplicate that work.
    assert local_vlm_vqa.model is not None, "Model must be loaded before test"

    # Provide the frame directly so the VLM has an image to answer about.
    local_vlm_vqa._latest_frame = golf_frame

    question = "What sport is being played in this image?"
    response = await local_vlm_vqa.simple_response(question)

    assert response is not None
    # Check the exception first so a failed call is reported as such,
    # rather than as a missing or empty text.
    assert response.exception is None
    assert response.text is not None
    assert len(response.text) > 0

    # The test image shows a golf swing, so the answer should mention golf.
    assert "golf" in response.text.lower()


@pytest.mark.integration
@pytest.mark.skipif(not os.getenv("HF_TOKEN"), reason="HF_TOKEN not set")
async def test_local_caption_mode(golf_frame: av.VideoFrame, local_vlm_caption: LocalVLM):
    """Test LocalVLM caption mode to generate a description of the image."""
    # No warmup() call here: the local_vlm_caption fixture awaits it before
    # yielding, so repeating it would only duplicate that work.
    assert local_vlm_caption.model is not None, "Model must be loaded before test"

    # Provide the frame directly so the VLM has an image to caption.
    local_vlm_caption._latest_frame = golf_frame

    response = await local_vlm_caption.simple_response("")

    assert response is not None
    # Check the exception first so a failed call is reported as such,
    # rather than as a missing or empty text.
    assert response.exception is None
    assert response.text is not None
    # A single whitespace-insensitive check covers the plain non-empty check as well.
    assert len(response.text.strip()) > 0
105 changes: 105 additions & 0 deletions plugins/moondream/tests/test_moondream_vlm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
"""
Tests for the Moondream CloudVLM plugin.

Integration tests require MOONDREAM_API_KEY environment variable:

export MOONDREAM_API_KEY="your-key-here"
uv run pytest plugins/moondream/tests/test_moondream_vlm.py -m integration -v

To run only unit tests (no API key needed):

uv run pytest plugins/moondream/tests/test_moondream_vlm.py -m "not integration" -v
"""
import os
from pathlib import Path
from typing import Iterator

import pytest
import av
from PIL import Image

from vision_agents.plugins.moondream import CloudVLM


@pytest.fixture(scope="session")
def golf_image(assets_dir) -> Iterator[Image.Image]:
    """Yield the golf swing test image from the assets directory, converted to RGB."""
    image_path = Path(assets_dir).joinpath("golf_swing.png")
    # The source file stays open until the fixture is torn down.
    with Image.open(image_path) as source:
        rgb_image = source.convert("RGB")
        yield rgb_image


@pytest.fixture
def golf_frame(golf_image: Image.Image) -> av.VideoFrame:
    """Wrap the golf test image in an av.VideoFrame."""
    video_frame = av.VideoFrame.from_image(golf_image)
    return video_frame


@pytest.fixture
async def vlm_vqa() -> CloudVLM:
    """Yield a CloudVLM in VQA mode, closing it on teardown.

    Skips the requesting test when MOONDREAM_API_KEY is not set.
    """
    key = os.getenv("MOONDREAM_API_KEY")
    if not key:
        pytest.skip("MOONDREAM_API_KEY not set")

    client = CloudVLM(api_key=key, mode="vqa")
    try:
        yield client
    finally:
        client.close()


@pytest.fixture
async def vlm_caption() -> CloudVLM:
    """Yield a CloudVLM in caption mode, closing it on teardown.

    Skips the requesting test when MOONDREAM_API_KEY is not set.
    """
    key = os.getenv("MOONDREAM_API_KEY")
    if not key:
        pytest.skip("MOONDREAM_API_KEY not set")

    client = CloudVLM(api_key=key, mode="caption")
    try:
        yield client
    finally:
        client.close()


@pytest.mark.integration
@pytest.mark.skipif(not os.getenv("MOONDREAM_API_KEY"), reason="MOONDREAM_API_KEY not set")
async def test_vqa_mode(golf_frame: av.VideoFrame, vlm_vqa: CloudVLM):
    """Ask CloudVLM in VQA mode about the golf frame and expect "golf" in the answer."""
    # Provide the frame directly so the VLM has an image to answer about.
    vlm_vqa._latest_frame = golf_frame

    reply = await vlm_vqa.simple_response("What sport is being played in this image?")

    # A response with non-empty text and no recorded exception is required.
    assert reply is not None
    answer = reply.text
    assert answer is not None
    assert len(answer) > 0
    assert reply.exception is None

    # The test image shows a golf swing, so the answer should mention golf.
    assert "golf" in answer.lower()


@pytest.mark.integration
@pytest.mark.skipif(not os.getenv("MOONDREAM_API_KEY"), reason="MOONDREAM_API_KEY not set")
async def test_caption_mode(golf_frame: av.VideoFrame, vlm_caption: CloudVLM):
    """Request a caption for the golf frame from CloudVLM in caption mode."""
    # Provide the frame directly so the VLM has an image to caption.
    vlm_caption._latest_frame = golf_frame

    # Caption mode does not need prompt text, so an empty string is passed.
    caption_reply = await vlm_caption.simple_response("")

    # A response with non-empty text and no recorded exception is required.
    assert caption_reply is not None
    caption = caption_reply.text
    assert caption is not None
    assert len(caption) > 0
    assert caption_reply.exception is None

    # The caption must contain more than whitespace.
    assert len(caption.strip()) > 0

20 changes: 9 additions & 11 deletions plugins/moondream/vision_agents/plugins/moondream/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,24 +2,22 @@
Moondream plugin for vision-agents.

This plugin provides Moondream 3 vision capabilities including object detection,
visual question answering, counting, and captioning.
visual question answering, and captioning.
"""

from .moondream_cloud_processor import (
CloudDetectionProcessor,
)
from .moondream_local_processor import (
LocalDetectionProcessor,
)
from .moondream_video_track import (
MoondreamVideoTrack,
)
from vision_agents.plugins.moondream.detection.moondream_cloud_processor import CloudDetectionProcessor
from vision_agents.plugins.moondream.detection.moondream_local_processor import LocalDetectionProcessor
from vision_agents.plugins.moondream.detection.moondream_video_track import MoondreamVideoTrack
from vision_agents.plugins.moondream.vlm.moondream_cloud_vlm import CloudVLM
from vision_agents.plugins.moondream.vlm.moondream_local_vlm import LocalVLM


__path__ = __import__("pkgutil").extend_path(__path__, __name__)

__all__ = [
"CloudDetectionProcessor",
"CloudVLM",
"LocalVLM",
"LocalDetectionProcessor",
"MoondreamVideoTrack",
]

Loading