GetStream · Nash0x7E2 · Nov 7, 2025 · Nov 5, 2025 · Nov 5, 2025 · Nov 5, 2025
diff --git a/agents-core/vision_agents/core/agents/agent_launcher.py b/agents-core/vision_agents/core/agents/agent_launcher.py
@@ -92,6 +92,14 @@ async def warmup(self, **kwargs) -> None:
             if agent.turn_detection and hasattr(agent.turn_detection, 'warmup'):
                 logger.debug("Warming up turn detection: %s", agent.turn_detection.__class__.__name__)
                 warmup_tasks.append(agent.turn_detection.warmup())
+
+            # Warmup processors; warmup support is checked per processor, not on the collection
+            if agent.processors:
+                logger.debug("Warming up processors")
+                for processor in agent.processors:
+                    if hasattr(processor, 'warmup'):
+                        logger.debug("Warming up processor: %s", processor.__class__.__name__)
+                        warmup_tasks.append(processor.warmup())
 
             # Run all warmups in parallel
             if warmup_tasks:

diff --git a/agents-core/vision_agents/core/agents/agents.py b/agents-core/vision_agents/core/agents/agents.py
@@ -5,12 +5,11 @@
 import time
 import uuid
 from dataclasses import asdict
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard, Coroutine
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard
 from uuid import uuid4
 
 import getstream.models
 from aiortc import VideoStreamTrack
-from getstream.video.async_call import Call
 from getstream.video.rtc import Call
 
 from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import TrackType
@@ -697,7 +696,7 @@ async def create_user(self) -> None:
     async def create_call(self, call_type: str, call_id: str) -> Call:
         """Shortcut for creating a call/room etc."""
         call = self.edge.client.video.call(call_type, call_id)
-        response = await call.get_or_create(data={"created_by_id": self.agent_user.id})
+        await call.get_or_create(data={"created_by_id": self.agent_user.id})
 
         return call
 

diff --git a/plugins/moondream/example/README.md b/plugins/moondream/example/README.md
diff --git a/plugins/moondream/example/__init__.py b/plugins/moondream/example/__init__.py
diff --git a/plugins/moondream/example/moondream_vlm_example.py b/plugins/moondream/example/moondream_vlm_example.py
@@ -0,0 +1,55 @@
+import asyncio
+import logging
+from dotenv import load_dotenv
+
+from vision_agents.core import User, Agent, cli
+from vision_agents.core.agents import AgentLauncher
+from vision_agents.plugins import deepgram, getstream, vogent, elevenlabs, moondream, gemini
+from vision_agents.core.events import CallSessionParticipantJoinedEvent
+
+logger = logging.getLogger(__name__)
+
+load_dotenv()
+
+async def create_agent(**kwargs) -> Agent:
+    """Build the demo agent with a local Moondream detection processor attached.
+
+    Speech and language are handled by the Gemini LLM, ElevenLabs TTS, Deepgram STT
+    and Vogent turn detection passed to ``Agent`` below; ``kwargs`` is unused.
+    """
+    # Detection processor; distinct from the LLM passed as llm= below
+    detector = moondream.LocalDetectionProcessor()
+    # create an agent to run with Stream's edge, Gemini llm
+    return Agent(
+        edge=getstream.Edge(),  # low latency edge. clients for React, iOS, Android, RN, Flutter etc.
+        agent_user=User(name="My happy AI friend", id="agent"),
+        llm=gemini.LLM("gemini-2.0-flash"),
+        tts=elevenlabs.TTS(),
+        stt=deepgram.STT(),
+        turn_detection=vogent.TurnDetection(),
+        processors=[detector],
+    )
+
+
+async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
+    """Create the agent user and call, join it, and run until the call ends."""
+    await agent.create_user()
+    call = await agent.create_call(call_type, call_id)
+
+    @agent.events.subscribe
+    async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
+        # Ignore the agent's own join; compare against the configured id rather
+        # than repeating the "agent" literal from create_agent.
+        if event.participant.user.id == agent.agent_user.id:
+            return
+        await asyncio.sleep(2)  # short pause before prompting (presumably lets video start)
+        await agent.simple_response("Describe what you currently see")
+
+    # Have the agent join the call/room
+    with await agent.join(call):
+        await agent.edge.open_demo(call)  # open the demo UI
+        await agent.finish()  # run till the call ends
+
+
+if __name__ == "__main__":
+    cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
diff --git a/plugins/moondream/example/pyproject.toml b/plugins/moondream/example/pyproject.toml
@@ -0,0 +1,25 @@
+[project]
+name = "moondream-example"
+version = "0.1.0"
+description = "Example using Moondream Detect and VLM with Vision Agents"
+requires-python = ">=3.10"
+dependencies = [
+    "vision-agents",
+    "vision-agents-plugins-moondream",
+    "vision-agents-plugins-getstream",
+    "vision-agents-plugins-deepgram",
+    "vision-agents-plugins-elevenlabs",
+    "vision-agents-plugins-vogent",
+    # moondream_vlm_example.py imports gemini and uses gemini.LLM
+    "vision-agents-plugins-gemini",
+    "python-dotenv",
+]
+
+[tool.uv.sources]
+vision-agents = { workspace = true }
+vision-agents-plugins-moondream = { workspace = true }
+vision-agents-plugins-getstream = { workspace = true }
+vision-agents-plugins-deepgram = { workspace = true }
+vision-agents-plugins-elevenlabs = { workspace = true }
+vision-agents-plugins-vogent = { workspace = true }
+vision-agents-plugins-gemini = { workspace = true }
diff --git a/plugins/moondream/tests/test_moondream_local.py b/plugins/moondream/tests/test_moondream_local.py
@@ -41,7 +41,7 @@ def golf_image(self, assets_dir) -> Iterator[Image.Image]:
     @pytest.fixture
     def moondream_processor(self) -> Iterator[LocalDetectionProcessor]:
         """Create and manage MoondreamLocalProcessor lifecycle."""
-        processor = LocalDetectionProcessor(device="cpu")
+        processor = LocalDetectionProcessor(force_cpu=True)
         try:
             yield processor
         finally:
@@ -261,7 +261,7 @@ def is_available():
             processor.close()
 
-        # Also test explicit MPS parameter
+        # Also test the explicit force_cpu=True override
-        processor2 = LocalDetectionProcessor(device="mps")
+        processor2 = LocalDetectionProcessor(force_cpu=True)
         try:
-            # Verify explicit MPS is also converted to CPU
+            # Verify force_cpu=True also results in the CPU device
             assert processor2.device == "cpu"
@@ -270,7 +270,7 @@ def is_available():
 
     def test_device_explicit_cpu(self):
         """Test explicit CPU device selection."""
-        processor = LocalDetectionProcessor(device="cpu")
+        processor = LocalDetectionProcessor(force_cpu=True)
         try:
             assert processor.device == "cpu"
         finally:
@@ -282,7 +282,7 @@ def test_device_explicit_cpu(self):
     )
     def test_device_explicit_cuda(self):
-        """Test explicit CUDA device selection (only if CUDA available)."""
+        """Test that CUDA is selected by default when it is available."""
-        processor = LocalDetectionProcessor(device="cuda")
+        processor = LocalDetectionProcessor()
         try:
             assert processor.device == "cuda"
         finally:

diff --git a/plugins/moondream/tests/test_moondream_local_vlm.py b/plugins/moondream/tests/test_moondream_local_vlm.py
@@ -0,0 +1,102 @@
+"""
+Tests for the Moondream LocalVLM plugin.
+
+Integration tests require HF_TOKEN environment variable (for gated model access):
+
+    export HF_TOKEN="your-token-here"
+    uv run pytest plugins/moondream/tests/test_moondream_local_vlm.py -m integration -v
+"""
+import os
+from pathlib import Path
+from typing import Iterator
+
+import pytest
+import av
+from PIL import Image
+
+from vision_agents.plugins.moondream import LocalVLM
+
+
+@pytest.fixture(scope="session")
+def golf_image(assets_dir) -> Iterator[Image.Image]:
+    """Yield the golf swing asset from tests/test_assets as an RGB image."""
+    with Image.open(Path(assets_dir, "golf_swing.png")) as source:
+        rgb_image = source.convert("RGB")
+        yield rgb_image
+
+
+@pytest.fixture
+def golf_frame(golf_image: Image.Image) -> av.VideoFrame:
+    """Convert the golf_image fixture into an av.VideoFrame via VideoFrame.from_image."""
+    return av.VideoFrame.from_image(golf_image)
+
+
+@pytest.fixture
+async def local_vlm_vqa() -> LocalVLM:
+    """Yield a warmed-up LocalVLM in VQA mode; skip when HF_TOKEN is unset."""
+    # An unset or empty token means the test is skipped
+    if not os.environ.get("HF_TOKEN"):
+        pytest.skip("HF_TOKEN not set")
+
+    model = LocalVLM(mode="vqa")
+    try:
+        await model.warmup()
+        yield model
+    finally:
+        model.close()  # runs even if warmup or the test body raised
+
+
+@pytest.fixture
+async def local_vlm_caption() -> LocalVLM:
+    """Yield a warmed-up LocalVLM in caption mode; skip when HF_TOKEN is unset."""
+    # An unset or empty token means the test is skipped
+    if not os.environ.get("HF_TOKEN"):
+        pytest.skip("HF_TOKEN not set")
+
+    model = LocalVLM(mode="caption")
+    try:
+        await model.warmup()
+        yield model
+    finally:
+        model.close()  # runs even if warmup or the test body raised
+
+
+@pytest.mark.integration
+@pytest.mark.skipif(not os.getenv("HF_TOKEN"), reason="HF_TOKEN not set")
+async def test_local_vqa_mode(golf_frame: av.VideoFrame, local_vlm_vqa: LocalVLM):
+    """Ask LocalVLM (VQA mode) about the golf image and expect "golf" in the answer."""
+    # The local_vlm_vqa fixture has already awaited warmup(); no second call needed.
+    assert local_vlm_vqa.model is not None, "Model must be loaded before test"
+
+    # Provide the image through the private _latest_frame attribute
+    local_vlm_vqa._latest_frame = golf_frame
+
+    question = "What sport is being played in this image?"
+    response = await local_vlm_vqa.simple_response(question)
+
+    # A successful call returns non-empty text and records no exception
+    assert response is not None
+    assert response.exception is None
+    assert response.text, "expected a non-empty answer"
+
+    assert "golf" in response.text.lower()
+
+
+@pytest.mark.integration
+@pytest.mark.skipif(not os.getenv("HF_TOKEN"), reason="HF_TOKEN not set")
+async def test_local_caption_mode(golf_frame: av.VideoFrame, local_vlm_caption: LocalVLM):
+    """Caption the golf image with LocalVLM and expect non-blank text."""
+    # The local_vlm_caption fixture has already awaited warmup(); no second call needed.
+    assert local_vlm_caption.model is not None, "Model must be loaded before test"
+
+    # Provide the image through the private _latest_frame attribute
+    local_vlm_caption._latest_frame = golf_frame
+
+    # The prompt text is left empty in caption mode
+    response = await local_vlm_caption.simple_response("")
+
+    assert response is not None
+    assert response.exception is None
+    assert response.text is not None
+
+    assert response.text.strip(), "caption must contain non-whitespace text"
diff --git a/plugins/moondream/tests/test_moondream_vlm.py b/plugins/moondream/tests/test_moondream_vlm.py
@@ -0,0 +1,105 @@
+"""
+Tests for the Moondream CloudVLM plugin.
+
+Integration tests require MOONDREAM_API_KEY environment variable:
+
+    export MOONDREAM_API_KEY="your-key-here"
+    uv run pytest plugins/moondream/tests/test_moondream_vlm.py -m integration -v
+
+To run only unit tests (no API key needed):
+
+    uv run pytest plugins/moondream/tests/test_moondream_vlm.py -m "not integration" -v
+"""
+import os
+from pathlib import Path
+from typing import Iterator
+
+import pytest
+import av
+from PIL import Image
+
+from vision_agents.plugins.moondream import CloudVLM
+
+
+@pytest.fixture(scope="session")
+def golf_image(assets_dir) -> Iterator[Image.Image]:
+    """Yield the golf swing asset from tests/test_assets as an RGB image."""
+    with Image.open(Path(assets_dir, "golf_swing.png")) as source:
+        rgb_image = source.convert("RGB")
+        yield rgb_image
+
+
+@pytest.fixture
+def golf_frame(golf_image: Image.Image) -> av.VideoFrame:
+    """Convert the golf_image fixture into an av.VideoFrame via VideoFrame.from_image."""
+    return av.VideoFrame.from_image(golf_image)
+
+
+@pytest.fixture
+async def vlm_vqa() -> CloudVLM:
+    """Yield a CloudVLM in VQA mode; skip when MOONDREAM_API_KEY is unset."""
+    key = os.environ.get("MOONDREAM_API_KEY")
+    if not key:
+        pytest.skip("MOONDREAM_API_KEY not set")
+
+    client = CloudVLM(mode="vqa", api_key=key)
+    try:
+        yield client
+    finally:
+        client.close()  # always release the VLM once the test is done
+
+
+@pytest.fixture
+async def vlm_caption() -> CloudVLM:
+    """Yield a CloudVLM in caption mode; skip when MOONDREAM_API_KEY is unset."""
+    key = os.environ.get("MOONDREAM_API_KEY")
+    if not key:
+        pytest.skip("MOONDREAM_API_KEY not set")
+
+    client = CloudVLM(mode="caption", api_key=key)
+    try:
+        yield client
+    finally:
+        client.close()  # always release the VLM once the test is done
+
+
+@pytest.mark.integration
+@pytest.mark.skipif(not os.getenv("MOONDREAM_API_KEY"), reason="MOONDREAM_API_KEY not set")
+async def test_vqa_mode(golf_frame: av.VideoFrame, vlm_vqa: CloudVLM):
+    """Ask CloudVLM (VQA mode) about the golf image and expect "golf" in the answer."""
+    # Provide the image through the private _latest_frame attribute
+    vlm_vqa._latest_frame = golf_frame
+
+    # Query the model with a question whose answer is visible in the image
+    response = await vlm_vqa.simple_response(
+        "What sport is being played in this image?"
+    )
+    # The call must produce a response with non-empty text and no exception
+    assert response is not None
+    answer = response.text
+    assert answer is not None
+    assert len(answer) > 0
+    assert response.exception is None
+
+    assert "golf" in answer.lower()
+
+
+@pytest.mark.integration
+@pytest.mark.skipif(not os.getenv("MOONDREAM_API_KEY"), reason="MOONDREAM_API_KEY not set")
+async def test_caption_mode(golf_frame: av.VideoFrame, vlm_caption: CloudVLM):
+    """Caption the golf image with CloudVLM and expect non-blank text."""
+    # Provide the image through the private _latest_frame attribute
+    vlm_caption._latest_frame = golf_frame
+
+    # The prompt text is left empty in caption mode
+    prompt = ""
+    response = await vlm_caption.simple_response(prompt)
+
+    # The call must produce a response with non-empty text and no exception
+    assert response is not None
+    caption = response.text
+    assert caption is not None
+    assert len(caption) > 0
+    assert response.exception is None
+    assert len(caption.strip()) > 0  # whitespace-only captions do not count
+
diff --git a/plugins/moondream/vision_agents/plugins/moondream/__init__.py b/plugins/moondream/vision_agents/plugins/moondream/__init__.py
@@ -2,24 +2,22 @@
 Moondream plugin for vision-agents.
 
 This plugin provides Moondream 3 vision capabilities including object detection,
-visual question answering, counting, and captioning.
+visual question answering, and captioning.
 """
 
-from .moondream_cloud_processor import (
-    CloudDetectionProcessor,
-)
-from .moondream_local_processor import (
-    LocalDetectionProcessor,
-)
-from .moondream_video_track import (
-    MoondreamVideoTrack,
-)
+from vision_agents.plugins.moondream.detection.moondream_cloud_processor import CloudDetectionProcessor
+from vision_agents.plugins.moondream.detection.moondream_local_processor import LocalDetectionProcessor
+from vision_agents.plugins.moondream.detection.moondream_video_track import MoondreamVideoTrack
+from vision_agents.plugins.moondream.vlm.moondream_cloud_vlm import CloudVLM
+from vision_agents.plugins.moondream.vlm.moondream_local_vlm import LocalVLM
+
 
 __path__ = __import__("pkgutil").extend_path(__path__, __name__)
 
 __all__ = [
     "CloudDetectionProcessor",
+    "CloudVLM",
+    "LocalVLM",
     "LocalDetectionProcessor",
     "MoondreamVideoTrack",
 ]
-