From 9675a1449a7a31c289398e5d2a74498e47503661 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Mon, 27 Oct 2025 15:40:28 +0100 Subject: [PATCH 01/20] implemented heygen avatars --- agents-core/pyproject.toml | 2 + plugins/heygen/README.md | 181 ++++++++++++++ plugins/heygen/example/README.md | 126 ++++++++++ plugins/heygen/example/__init__.py | 0 plugins/heygen/example/avatar_example.py | 74 ++++++ plugins/heygen/example/pyproject.toml | 25 ++ plugins/heygen/py.typed | 0 plugins/heygen/pyproject.toml | 41 +++ plugins/heygen/tests/__init__.py | 0 plugins/heygen/tests/test_heygen_plugin.py | 120 +++++++++ .../vision_agents/plugins/heygen/__init__.py | 12 + .../plugins/heygen/heygen_avatar_publisher.py | 171 +++++++++++++ .../plugins/heygen/heygen_rtc_manager.py | 235 ++++++++++++++++++ .../plugins/heygen/heygen_session.py | 181 ++++++++++++++ .../plugins/heygen/heygen_video_track.py | 161 ++++++++++++ pyproject.toml | 4 +- uv.lock | 56 ++++- 17 files changed, 1387 insertions(+), 2 deletions(-) create mode 100644 plugins/heygen/README.md create mode 100644 plugins/heygen/example/README.md create mode 100644 plugins/heygen/example/__init__.py create mode 100644 plugins/heygen/example/avatar_example.py create mode 100644 plugins/heygen/example/pyproject.toml create mode 100644 plugins/heygen/py.typed create mode 100644 plugins/heygen/pyproject.toml create mode 100644 plugins/heygen/tests/__init__.py create mode 100644 plugins/heygen/tests/test_heygen_plugin.py create mode 100644 plugins/heygen/vision_agents/plugins/heygen/__init__.py create mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py create mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py create mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_session.py create mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py diff --git a/agents-core/pyproject.toml b/agents-core/pyproject.toml index b817ceb2..f66da592 100644 
--- a/agents-core/pyproject.toml +++ b/agents-core/pyproject.toml @@ -41,6 +41,7 @@ deepgram = ["vision-agents-plugins-deepgram"] elevenlabs = ["vision-agents-plugins-elevenlabs"] gemini = ["vision-agents-plugins-gemini"] getstream = ["vision-agents-plugins-getstream"] +heygen = ["vision-agents-plugins-heygen"] kokoro = ["vision-agents-plugins-kokoro"] krisp = ["vision-agents-plugins-krisp"] moonshine = ["vision-agents-plugins-moonshine"] @@ -57,6 +58,7 @@ all-plugins = [ "vision-agents-plugins-elevenlabs", "vision-agents-plugins-gemini", "vision-agents-plugins-getstream", + "vision-agents-plugins-heygen", "vision-agents-plugins-kokoro", "vision-agents-plugins-krisp", "vision-agents-plugins-moonshine", diff --git a/plugins/heygen/README.md b/plugins/heygen/README.md new file mode 100644 index 00000000..2dc19e34 --- /dev/null +++ b/plugins/heygen/README.md @@ -0,0 +1,181 @@ +# HeyGen Avatar Plugin for Vision Agents + +Add realistic avatar video to your AI agents using HeyGen's streaming avatar API. 
+ +## Features + +- 🎭 **Realistic Avatars**: Use HeyGen's high-quality avatars with natural movements +- 🎤 **Automatic Lip-Sync**: Avatar automatically syncs with audio from any TTS provider +- 🚀 **WebRTC Streaming**: Low-latency real-time video streaming via WebRTC +- 🔌 **Easy Integration**: Works seamlessly with Vision Agents framework +- 🎨 **Customizable**: Configure avatar, quality, resolution, and more + +## Installation + +```bash +pip install vision-agents-plugins-heygen +``` + +Or with uv: + +```bash +uv pip install vision-agents-plugins-heygen +``` + +## Quick Start + +```python +import asyncio +from uuid import uuid4 +from dotenv import load_dotenv + +from vision_agents.core import User, Agent +from vision_agents.plugins import cartesia, deepgram, getstream, gemini, heygen + +load_dotenv() + +async def start_avatar_agent(): + agent = Agent( + edge=getstream.Edge(), + agent_user=User(name="AI Assistant with Avatar", id="agent"), + instructions="You're a friendly AI assistant.", + + llm=gemini.LLM("gemini-2.0-flash"), + tts=cartesia.TTS(), + stt=deepgram.STT(), + + # Add HeyGen avatar + processors=[ + heygen.AvatarPublisher( + avatar_id="default", + quality="high" + ) + ] + ) + + call = agent.edge.client.video.call("default", str(uuid4())) + + with await agent.join(call): + await agent.edge.open_demo(call) + await agent.simple_response("Hello! 
I'm your AI assistant with an avatar.") + await agent.finish() + +if __name__ == "__main__": + asyncio.run(start_avatar_agent()) +``` + +## Configuration + +### Environment Variables + +Set your HeyGen API key: + +```bash +HEYGEN_API_KEY=your_heygen_api_key_here +``` + +### AvatarPublisher Options + +```python +heygen.AvatarPublisher( + avatar_id="default", # HeyGen avatar ID + quality="high", # Video quality: "low", "medium", "high" + resolution=(1920, 1080), # Output resolution (width, height) + api_key=None, # Optional: override env var +) +``` + +## Usage Examples + +### With Realtime LLM + +```python +from vision_agents.plugins import gemini, heygen, getstream + +agent = Agent( + edge=getstream.Edge(), + agent_user=User(name="Realtime Avatar AI"), + instructions="Be conversational and responsive.", + + llm=gemini.Realtime(fps=2), # No separate TTS needed + + processors=[ + heygen.AvatarPublisher(avatar_id="professional_presenter") + ] +) +``` + +### With Multiple Processors + +```python +from vision_agents.plugins import ultralytics, heygen + +agent = Agent( + edge=getstream.Edge(), + agent_user=User(name="Fitness Coach"), + instructions="Analyze user poses and provide feedback.", + + llm=gemini.Realtime(fps=3), + + processors=[ + # Process incoming user video + ultralytics.YOLOPoseProcessor(model_path="yolo11n-pose.pt"), + # Publish avatar video + heygen.AvatarPublisher(avatar_id="fitness_trainer") + ] +) +``` + +## How It Works + +1. **Connection**: Establishes WebRTC connection to HeyGen's streaming API +2. **Audio Input**: Receives audio from your TTS provider or Realtime LLM +3. **Avatar Generation**: HeyGen generates avatar video with lip-sync +4. **Video Streaming**: Streams avatar video to call participants via GetStream Edge + +## Requirements + +- Python 3.10+ +- HeyGen API key (get one at [heygen.com](https://heygen.com)) +- GetStream account for video calls +- TTS provider (Cartesia, ElevenLabs, etc.) 
or Realtime LLM + +## Troubleshooting + +### Connection Issues + +If you experience connection problems: + +1. Check your HeyGen API key is valid +2. Ensure you have network access to HeyGen's servers +3. Check firewall settings for WebRTC traffic + +### Video Quality + +To optimize video quality: + +- Use `quality="high"` for best results +- Increase resolution if bandwidth allows +- Ensure stable internet connection + +## API Reference + +### AvatarPublisher + +Main class for publishing HeyGen avatar video. + +**Methods:** +- `publish_video_track()`: Returns video track for streaming +- `state()`: Returns current state information +- `close()`: Clean up resources + +## License + +MIT + +## Links + +- [Documentation](https://visionagents.ai/) +- [GitHub](https://github.com/GetStream/Vision-Agents) +- [HeyGen API Docs](https://docs.heygen.com/docs/streaming-api) + diff --git a/plugins/heygen/example/README.md b/plugins/heygen/example/README.md new file mode 100644 index 00000000..631d3309 --- /dev/null +++ b/plugins/heygen/example/README.md @@ -0,0 +1,126 @@ +# HeyGen Avatar Example + +This example demonstrates how to use the HeyGen plugin to add realistic avatar video to your AI agent. + +## Setup + +1. **Install dependencies:** + +```bash +cd plugins/heygen/example +uv pip install -e . +``` + +2. **Configure environment variables:** + +Copy `.env.example` to `.env` and fill in your API keys: + +```bash +cp .env.example .env +``` + +Required API keys: +- `HEYGEN_API_KEY` - Get from [HeyGen](https://heygen.com) +- `STREAM_API_KEY` and `STREAM_SECRET` - Get from [GetStream](https://getstream.io) +- `CARTESIA_API_KEY` - Get from [Cartesia](https://cartesia.ai) +- `DEEPGRAM_API_KEY` - Get from [Deepgram](https://deepgram.com) +- `GOOGLE_API_KEY` - Get from [Google AI Studio](https://makersuite.google.com/app/apikey) + +## Running the Example + +```bash +uv run avatar_example.py +``` + +This will: +1. Start an AI agent with a HeyGen avatar +2. 
Open a demo UI in your browser +3. The avatar will greet you and be ready to chat + +## What's Happening + +1. **Agent Setup**: The agent is configured with: + - Gemini LLM for generating responses + - Cartesia TTS for speech synthesis + - Deepgram STT for speech recognition + - HeyGen AvatarPublisher for avatar video + +2. **Avatar Streaming**: When the agent speaks: + - Text is generated by Gemini LLM + - Audio is synthesized by Cartesia TTS + - Audio is sent to HeyGen via WebRTC + - HeyGen generates avatar video with lip-sync + - Avatar video is streamed to the call + +3. **User Interaction**: When you speak: + - Audio is captured from your microphone + - Transcribed to text by Deepgram + - Sent to Gemini LLM for processing + - Response is generated and spoken through the avatar + +## Customization + +### Using a Different Avatar + +Get your avatar ID from HeyGen dashboard and update: + +```python +heygen.AvatarPublisher( + avatar_id="your_avatar_id_here", + quality="high" +) +``` + +### Adjusting Video Quality + +Choose quality based on your bandwidth: + +```python +heygen.AvatarPublisher( + avatar_id="default", + quality="low", # Options: "low", "medium", "high" + resolution=(1280, 720) # Lower resolution for better performance +) +``` + +### Using a Different LLM + +Switch to OpenAI's Realtime API: + +```python +from vision_agents.plugins import openai + +agent = Agent( + # ... other config ... + llm=openai.Realtime(model="gpt-realtime", voice="alloy"), + # No need for separate TTS/STT with Realtime LLM + processors=[ + heygen.AvatarPublisher(avatar_id="default") + ] +) +``` + +## Troubleshooting + +### "HeyGen API key required" Error + +Make sure `HEYGEN_API_KEY` is set in your `.env` file. 
+ +### Connection Timeout + +- Check your internet connection +- Verify HeyGen API key is valid +- Ensure firewall allows WebRTC traffic + +### No Video Appearing + +- Check browser console for errors +- Verify GetStream credentials are correct +- Try lowering video quality settings + +## Learn More + +- [HeyGen API Documentation](https://docs.heygen.com/docs/streaming-api) +- [Vision Agents Documentation](https://visionagents.ai/) +- [GetStream Video Documentation](https://getstream.io/video/docs/) + diff --git a/plugins/heygen/example/__init__.py b/plugins/heygen/example/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py new file mode 100644 index 00000000..5d021b41 --- /dev/null +++ b/plugins/heygen/example/avatar_example.py @@ -0,0 +1,74 @@ +import asyncio +from uuid import uuid4 +from dotenv import load_dotenv + +from vision_agents.core import User, Agent +from vision_agents.plugins import kokoro, deepgram, getstream, smart_turn, gemini, heygen + +load_dotenv() + + +async def start_avatar_agent() -> None: + """Start an agent with HeyGen avatar. + + This example demonstrates how to use HeyGen's avatar streaming + to add realistic avatar video to your AI agent. The avatar will + lip-sync with the audio generated by the TTS. + """ + + # Create LLM + llm = gemini.LLM("gemini-2.0-flash") + + # Create agent with HeyGen avatar + agent = Agent( + edge=getstream.Edge(), + agent_user=User( + name="AI Assistant with Avatar", + id="agent" + ), + instructions=( + "You're a friendly and helpful AI assistant. " + "Keep your responses conversational and engaging. " + "Don't use special characters or formatting." 
+ ), + + # LLM and speech components + llm=llm, + tts=kokoro.TTS(), # Using Kokoro (free, local TTS) + stt=deepgram.STT(), + turn_detection=smart_turn.TurnDetection( + buffer_duration=2.0, + confidence_threshold=0.5 + ), + + # Add HeyGen avatar as a video publisher + processors=[ + heygen.AvatarPublisher( + avatar_id="default", # Use your HeyGen avatar ID + quality="high", # Video quality: "low", "medium", "high" + resolution=(1920, 1080), # Output resolution + ) + ] + ) + + # Create a call + call = agent.edge.client.video.call("default", str(uuid4())) + + # Join the call + with await agent.join(call): + # Open demo UI + await agent.edge.open_demo(call) + + # Greet the user through the avatar + await agent.simple_response( + "Hello! I'm your AI assistant with an avatar. " + "How can I help you today?" + ) + + # Keep the call running + await agent.finish() + + +if __name__ == "__main__": + asyncio.run(start_avatar_agent()) + diff --git a/plugins/heygen/example/pyproject.toml b/plugins/heygen/example/pyproject.toml new file mode 100644 index 00000000..83fd9bdd --- /dev/null +++ b/plugins/heygen/example/pyproject.toml @@ -0,0 +1,25 @@ +[project] +name = "heygen-avatar-example" +version = "0.1.0" +description = "Example using HeyGen avatar with Vision Agents" +requires-python = ">=3.10" +dependencies = [ + "vision-agents", + "vision-agents-plugins-heygen", + "vision-agents-plugins-kokoro", + "vision-agents-plugins-deepgram", + "vision-agents-plugins-gemini", + "vision-agents-plugins-getstream", + "vision-agents-plugins-smart-turn", + "python-dotenv", +] + +[tool.uv.sources] +vision-agents = { workspace = true } +vision-agents-plugins-heygen = { workspace = true } +vision-agents-plugins-kokoro = { workspace = true } +vision-agents-plugins-deepgram = { workspace = true } +vision-agents-plugins-gemini = { workspace = true } +vision-agents-plugins-getstream = { workspace = true } +vision-agents-plugins-smart-turn = { workspace = true } + diff --git 
a/plugins/heygen/py.typed b/plugins/heygen/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/plugins/heygen/pyproject.toml b/plugins/heygen/pyproject.toml new file mode 100644 index 00000000..b152460d --- /dev/null +++ b/plugins/heygen/pyproject.toml @@ -0,0 +1,41 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "vision-agents-plugins-heygen" +version = "0.1.0" +description = "HeyGen avatar plugin for Vision Agents" +readme = "README.md" +requires-python = ">=3.10" +license = "MIT" +dependencies = [ + "vision-agents", + "aiortc>=1.9.0", + "aiohttp>=3.9.0", +] + +[project.urls] +Documentation = "https://visionagents.ai/" +Website = "https://visionagents.ai/" +Source = "https://github.com/GetStream/Vision-Agents" + +[tool.hatch.version] +source = "vcs" +raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" } + +[tool.hatch.build.targets.wheel] +packages = ["vision_agents"] + +[tool.hatch.build.targets.sdist] +include = ["/vision_agents"] + +[tool.uv.sources] +vision-agents = { workspace = true } + +[dependency-groups] +dev = [ + "pytest>=8.4.1", + "pytest-asyncio>=1.0.0", +] + diff --git a/plugins/heygen/tests/__init__.py b/plugins/heygen/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/plugins/heygen/tests/test_heygen_plugin.py b/plugins/heygen/tests/test_heygen_plugin.py new file mode 100644 index 00000000..3be160d6 --- /dev/null +++ b/plugins/heygen/tests/test_heygen_plugin.py @@ -0,0 +1,120 @@ +import pytest +from unittest.mock import Mock, AsyncMock, patch +from vision_agents.plugins.heygen import ( + AvatarPublisher, + HeyGenVideoTrack, + HeyGenRTCManager, + HeyGenSession, +) + + +class TestHeyGenSession: + """Tests for HeyGenSession.""" + + def test_init_with_api_key(self): + """Test initialization with explicit API key.""" + session = HeyGenSession( + avatar_id="test_avatar", + quality="high", + api_key="test_key", + ) + + 
assert session.avatar_id == "test_avatar" + assert session.quality == "high" + assert session.api_key == "test_key" + + def test_init_without_api_key_raises(self): + """Test initialization without API key raises error.""" + with patch.dict("os.environ", {}, clear=True): + with pytest.raises(ValueError, match="HeyGen API key required"): + HeyGenSession(avatar_id="test_avatar") + + +class TestHeyGenVideoTrack: + """Tests for HeyGenVideoTrack.""" + + def test_init(self): + """Test video track initialization.""" + track = HeyGenVideoTrack(width=1920, height=1080) + + assert track.width == 1920 + assert track.height == 1080 + assert not track._stopped + + def test_stop(self): + """Test stopping the video track.""" + track = HeyGenVideoTrack() + track.stop() + + assert track._stopped + + +class TestHeyGenRTCManager: + """Tests for HeyGenRTCManager.""" + + def test_init(self): + """Test RTC manager initialization.""" + with patch.object(HeyGenSession, "__init__", return_value=None): + manager = HeyGenRTCManager( + avatar_id="test_avatar", + quality="medium", + api_key="test_key", + ) + + assert manager.pc is None + assert not manager._connected + + def test_is_connected_property(self): + """Test is_connected property.""" + with patch.object(HeyGenSession, "__init__", return_value=None): + manager = HeyGenRTCManager(api_key="test_key") + + assert not manager.is_connected + + manager._connected = True + assert manager.is_connected + + +class TestAvatarPublisher: + """Tests for AvatarPublisher.""" + + def test_init(self): + """Test avatar publisher initialization.""" + with patch.object(HeyGenRTCManager, "__init__", return_value=None): + publisher = AvatarPublisher( + avatar_id="test_avatar", + quality="high", + resolution=(1920, 1080), + api_key="test_key", + ) + + assert publisher.avatar_id == "test_avatar" + assert publisher.quality == "high" + assert publisher.resolution == (1920, 1080) + assert not publisher._connected + + def test_publish_video_track(self): + """Test 
import asyncio
import logging
from typing import Optional, Any, Tuple

from vision_agents.core.processors.base_processor import (
    AudioVideoProcessor,
    VideoPublisherMixin,
)

from .heygen_rtc_manager import HeyGenRTCManager
from .heygen_video_track import HeyGenVideoTrack

logger = logging.getLogger(__name__)


class AvatarPublisher(AudioVideoProcessor, VideoPublisherMixin):
    """HeyGen avatar video publisher.

    Publishes video of a HeyGen avatar that lip-syncs to audio input.
    Can be used as a processor in the Vision Agents framework to add
    realistic avatar video to AI agents.

    Example:
        agent = Agent(
            edge=getstream.Edge(),
            agent_user=User(name="Avatar AI"),
            instructions="Be helpful and friendly",
            llm=gemini.LLM("gemini-2.0-flash"),
            tts=cartesia.TTS(),
            stt=deepgram.STT(),
            processors=[
                heygen.AvatarPublisher(
                    avatar_id="default",
                    quality="high"
                )
            ]
        )
    """

    def __init__(
        self,
        avatar_id: str = "default",
        quality: str = "high",
        resolution: Tuple[int, int] = (1920, 1080),
        api_key: Optional[str] = None,
        interval: int = 0,
        **kwargs,
    ):
        """Initialize the HeyGen avatar publisher.

        Args:
            avatar_id: HeyGen avatar ID to use for streaming.
            quality: Video quality ("low", "medium", "high").
            resolution: Output video resolution (width, height).
            api_key: HeyGen API key. Uses HEYGEN_API_KEY env var if not provided.
            interval: Processing interval (not used, kept for compatibility).
            **kwargs: Additional arguments passed to parent class.
        """
        super().__init__(
            interval=interval,
            receive_audio=False,
            receive_video=False,
            **kwargs
        )

        self.avatar_id = avatar_id
        self.quality = quality
        self.resolution = resolution
        self.api_key = api_key

        # WebRTC manager for HeyGen connection
        self.rtc_manager = HeyGenRTCManager(
            avatar_id=avatar_id,
            quality=quality,
            api_key=api_key,
        )

        # Video track for publishing avatar frames
        self._video_track = HeyGenVideoTrack(
            width=resolution[0],
            height=resolution[1],
        )

        # Connection state
        self._connected = False
        self._connection_task: Optional[asyncio.Task] = None

        logger.info(
            f"🎭 HeyGen AvatarPublisher initialized "
            f"(avatar: {avatar_id}, quality: {quality}, resolution: {resolution})"
        )

    async def _connect_to_heygen(self) -> None:
        """Establish connection to HeyGen and start receiving video.

        Raises:
            Exception: Propagates any failure from the RTC manager so the
                task's done-callback can log it.
        """
        try:
            # Set up video callback before connecting
            self.rtc_manager.set_video_callback(self._on_video_track)

            # Connect to HeyGen
            await self.rtc_manager.connect()

            self._connected = True
            logger.info("✅ Connected to HeyGen, avatar streaming active")

        except Exception as e:
            logger.error(f"❌ Failed to connect to HeyGen: {e}")
            self._connected = False
            raise

    def _on_connection_task_done(self, task: "asyncio.Task[None]") -> None:
        """Observe the background connection task.

        Without this callback a failed connect would leave an unretrieved
        task exception (silent failure) and `_connection_task` would stay
        set forever, so `publish_video_track()` could never retry.
        """
        # Clear the handle so a later publish_video_track() may reconnect.
        self._connection_task = None
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            logger.error(f"❌ HeyGen connection task failed: {exc}")

    async def _on_video_track(self, track: Any) -> None:
        """Callback when video track is received from HeyGen.

        Args:
            track: Incoming video track from HeyGen's WebRTC connection.
        """
        logger.info("📹 Received video track from HeyGen, starting frame forwarding")
        await self._video_track.start_receiving(track)

    def publish_video_track(self):
        """Publish the HeyGen avatar video track.

        This method is called by the Agent to get the video track
        for publishing to the call. Kicks off the HeyGen connection in
        the background on first call (and again after a failed attempt).

        Returns:
            HeyGenVideoTrack instance for streaming avatar video.
        """
        # Start connection if not already connected
        if not self._connected and not self._connection_task:
            self._connection_task = asyncio.create_task(self._connect_to_heygen())
            # Ensure failures are logged instead of silently dropped.
            self._connection_task.add_done_callback(self._on_connection_task_done)

        logger.info("đŸŽĨ Publishing HeyGen avatar video track")
        return self._video_track

    def state(self) -> dict:
        """Get current state of the avatar publisher.

        Returns:
            Dictionary containing current state information.
        """
        return {
            "avatar_id": self.avatar_id,
            "quality": self.quality,
            "resolution": self.resolution,
            "connected": self._connected,
            "rtc_connected": self.rtc_manager.is_connected,
        }

    async def close(self) -> None:
        """Clean up resources and close connections."""
        logger.info("🔌 Closing HeyGen avatar publisher")

        # Cancel the connection task first so it cannot race with the
        # teardown of the track / RTC manager below.
        if self._connection_task:
            self._connection_task.cancel()
            try:
                await self._connection_task
            except asyncio.CancelledError:
                pass
            self._connection_task = None

        # Stop video track
        if self._video_track:
            self._video_track.stop()

        # Close RTC connection
        if self.rtc_manager:
            await self.rtc_manager.close()

        self._connected = False
        logger.info("✅ HeyGen avatar publisher closed")
+ """ + # Start connection if not already connected + if not self._connected and not self._connection_task: + self._connection_task = asyncio.create_task(self._connect_to_heygen()) + + logger.info("đŸŽĨ Publishing HeyGen avatar video track") + return self._video_track + + def state(self) -> dict: + """Get current state of the avatar publisher. + + Returns: + Dictionary containing current state information. + """ + return { + "avatar_id": self.avatar_id, + "quality": self.quality, + "resolution": self.resolution, + "connected": self._connected, + "rtc_connected": self.rtc_manager.is_connected, + } + + async def close(self) -> None: + """Clean up resources and close connections.""" + logger.info("🔌 Closing HeyGen avatar publisher") + + # Stop video track + if self._video_track: + self._video_track.stop() + + # Close RTC connection + if self.rtc_manager: + await self.rtc_manager.close() + + # Cancel connection task if running + if self._connection_task: + self._connection_task.cancel() + try: + await self._connection_task + except asyncio.CancelledError: + pass + + self._connected = False + logger.info("✅ HeyGen avatar publisher closed") + diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py new file mode 100644 index 00000000..8e5d340b --- /dev/null +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -0,0 +1,235 @@ +import asyncio +import logging +from typing import Optional, Callable, Any + +from aiortc import ( + RTCPeerConnection, + RTCSessionDescription, + RTCIceServer, + RTCConfiguration, + MediaStreamTrack, +) + +from .heygen_session import HeyGenSession + +logger = logging.getLogger(__name__) + + +class HeyGenRTCManager: + """Manages WebRTC connection to HeyGen's Streaming Avatar API. + + Handles the low-level WebRTC peer connection, audio/video streaming, + and communication with HeyGen's servers. 
+ """ + + def __init__( + self, + avatar_id: str = "default", + quality: str = "high", + api_key: Optional[str] = None, + ): + """Initialize the RTC manager. + + Args: + avatar_id: HeyGen avatar ID to use. + quality: Video quality setting ("low", "medium", "high"). + api_key: HeyGen API key (uses HEYGEN_API_KEY env var if not provided). + """ + self.session_manager = HeyGenSession( + avatar_id=avatar_id, + quality=quality, + api_key=api_key, + ) + + self.pc: Optional[RTCPeerConnection] = None + + # Video track callback for receiving avatar video + self._video_callback: Optional[Callable[[MediaStreamTrack], Any]] = None + + self._connected = False + self._connection_ready = asyncio.Event() + + async def connect(self) -> None: + """Establish WebRTC connection to HeyGen's Streaming API. + + Sets up the peer connection, negotiates tracks, and establishes + the connection for real-time avatar streaming. + + HeyGen flow: + 1. Create session -> HeyGen provides SDP offer and ICE servers + 2. Set HeyGen's offer as remote description + 3. Create answer + 4. Send answer to HeyGen + 5. Start session + """ + try: + # Create HeyGen session - they provide the SDP offer + session_info = await self.session_manager.create_session() + + # Extract ICE servers and SDP offer from session info + ice_servers = self._parse_ice_servers(session_info) + + # HeyGen's sdp field - check the actual structure + sdp_data = session_info.get("sdp") + + if isinstance(sdp_data, dict): + # Standard WebRTC format: {'type': 'offer', 'sdp': 'v=0...'} + offer_sdp = sdp_data.get("sdp") + sdp_type = sdp_data.get("type") + logger.debug(f"Got SDP dict from HeyGen (type: {sdp_type})") + elif isinstance(sdp_data, str) and sdp_data.startswith("v=0"): + # Raw SDP string (less common) + offer_sdp = sdp_data + logger.debug("Got raw SDP string from HeyGen") + else: + offer_sdp = None + + if not offer_sdp: + logger.error(f"❌ Unexpected SDP format. 
Type: {type(sdp_data)}") + if isinstance(sdp_data, dict): + logger.error(f"SDP dict keys: {list(sdp_data.keys())}") + logger.error(f"SDP data: {str(sdp_data)[:200] if sdp_data else 'None'}") + raise RuntimeError("No valid SDP offer received from HeyGen") + + # Create RTCPeerConnection with ICE servers + config = RTCConfiguration(iceServers=ice_servers) + self.pc = RTCPeerConnection(configuration=config) + + # Set up track handlers + @self.pc.on("track") + async def on_track(track: MediaStreamTrack): + await self._handle_track(track) + + @self.pc.on("connectionstatechange") + async def on_connection_state_change(): + logger.info(f"🔗 HeyGen connection state: {self.pc.connectionState}") + if self.pc.connectionState == "connected": + self._connected = True + self._connection_ready.set() + elif self.pc.connectionState in ["failed", "closed"]: + self._connected = False + self._connection_ready.clear() + + # Set HeyGen's offer as remote description + offer = RTCSessionDescription(sdp=offer_sdp, type="offer") + await self.pc.setRemoteDescription(offer) + + # HeyGen's offer already includes tracks, so transceivers are auto-created + # We just need to create our answer + logger.debug(f"Transceivers after setRemoteDescription: {len(self.pc.getTransceivers())}") + + # Create our answer + answer = await self.pc.createAnswer() + await self.pc.setLocalDescription(answer) + + # Start the session with our SDP answer + # HeyGen expects the answer in the start_session call + await self.session_manager.start_session(sdp_answer=self.pc.localDescription.sdp) + + # Wait for connection to be established + await asyncio.wait_for(self._connection_ready.wait(), timeout=10.0) + + logger.info("✅ HeyGen WebRTC connection established") + + except Exception as e: + logger.error(f"❌ Failed to connect to HeyGen: {e}") + raise + + def _parse_ice_servers(self, session_info: dict) -> list: + """Parse ICE servers from HeyGen session info. 
+ + HeyGen may provide ice_servers, ice_servers2, or rely on LiveKit's embedded servers. + + Args: + session_info: Session information from HeyGen API. + + Returns: + List of RTCIceServer objects. + """ + ice_servers = [] + + # Try ice_servers first, then ice_servers2 as backup + ice_server_configs = ( + session_info.get("ice_servers") or + session_info.get("ice_servers2") or + session_info.get("iceServers", []) + ) + + if ice_server_configs and not isinstance(ice_server_configs, list): + logger.warning(f"âš ī¸ Unexpected ice_servers format: {type(ice_server_configs)}") + ice_server_configs = [] + + for server_config in ice_server_configs: + if not isinstance(server_config, dict): + continue + + urls = server_config.get("urls", []) + if isinstance(urls, str): + urls = [urls] # Convert single URL to list + + username = server_config.get("username") + credential = server_config.get("credential") + + if urls: + ice_servers.append( + RTCIceServer( + urls=urls, + username=username, + credential=credential, + ) + ) + logger.info(f"🧊 Added ICE server: {urls[0]}") + + # When using LiveKit, ICE servers may be embedded in SDP + # In that case, use public STUN as fallback + if not ice_servers: + logger.info("â„šī¸ Using default STUN servers (LiveKit may provide its own via SDP)") + ice_servers.append( + RTCIceServer(urls=["stun:stun.l.google.com:19302"]) + ) + + return ice_servers + + async def _handle_track(self, track: MediaStreamTrack) -> None: + """Handle incoming media track from HeyGen. + + Args: + track: Incoming media track (audio or video). 
+ """ + logger.info(f"📡 Received track from HeyGen: {track.kind}") + + if track.kind == "video": + if self._video_callback: + await self._video_callback(track) + else: + logger.warning("Video track received but no callback registered") + elif track.kind == "audio": + # Audio track from HeyGen (avatar speech) - currently not used + logger.debug("Audio track received from HeyGen (ignored)") + + def set_video_callback(self, callback: Callable[[MediaStreamTrack], Any]) -> None: + """Set callback for handling incoming video track. + + Args: + callback: Async function to handle video track. + """ + self._video_callback = callback + + @property + def is_connected(self) -> bool: + """Check if WebRTC connection is established.""" + return self._connected + + async def close(self) -> None: + """Close the WebRTC connection and clean up resources.""" + if self.pc: + await self.pc.close() + self.pc = None + + await self.session_manager.close() + + self._connected = False + self._connection_ready.clear() + + logger.info("🔌 HeyGen RTC connection closed") + diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py new file mode 100644 index 00000000..917a4a52 --- /dev/null +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py @@ -0,0 +1,181 @@ +import logging +from typing import Optional, Dict, Any +from os import getenv +import aiohttp + +logger = logging.getLogger(__name__) + + +class HeyGenSession: + """Manages HeyGen API session lifecycle and configuration. + + Handles authentication, session creation, and API communication + with HeyGen's Streaming API. + """ + + def __init__( + self, + avatar_id: str = "default", + quality: str = "high", + api_key: Optional[str] = None, + ): + """Initialize HeyGen session manager. + + Args: + avatar_id: HeyGen avatar ID to use for streaming. + quality: Video quality setting ("low", "medium", "high"). + api_key: HeyGen API key. 
Uses HEYGEN_API_KEY env var if not provided. + """ + self.avatar_id = avatar_id + self.quality = quality + self.api_key = api_key or getenv("HEYGEN_API_KEY") + + if not self.api_key: + raise ValueError( + "HeyGen API key required. Set HEYGEN_API_KEY environment variable " + "or pass api_key parameter." + ) + + self.base_url = "https://api.heygen.com/v1" + self.session_id: Optional[str] = None + self.session_info: Optional[Dict[str, Any]] = None + self._http_session: Optional[aiohttp.ClientSession] = None + + async def create_session(self) -> Dict[str, Any]: + """Create a new HeyGen streaming session. + + Returns: + Session information including session_id, ICE servers, and SDP offer. + """ + if not self._http_session: + self._http_session = aiohttp.ClientSession() + + headers = { + "X-Api-Key": self.api_key, + "Content-Type": "application/json", + } + + payload = { + "avatar_id": self.avatar_id, + "quality": self.quality, + } + + try: + async with self._http_session.post( + f"{self.base_url}/streaming.new", + json=payload, + headers=headers, + ) as response: + if response.status != 200: + error_text = await response.text() + raise RuntimeError( + f"Failed to create HeyGen session: {response.status} - {error_text}" + ) + + data = await response.json() + self.session_info = data.get("data", {}) + self.session_id = self.session_info.get("session_id") + + logger.info(f"✅ HeyGen session created: {self.session_id}") + return self.session_info + + except Exception as e: + logger.error(f"❌ Failed to create HeyGen session: {e}") + raise + + async def start_session(self, sdp_answer: Optional[str] = None) -> Dict[str, Any]: + """Start the HeyGen streaming session. + + Args: + sdp_answer: Optional SDP answer to include in the start request. + + Returns: + Start confirmation with session details. + """ + if not self.session_id: + raise RuntimeError("Session not created. 
Call create_session() first.") + + if not self._http_session: + self._http_session = aiohttp.ClientSession() + + headers = { + "X-Api-Key": self.api_key, + "Content-Type": "application/json", + } + + payload: Dict[str, Any] = { + "session_id": self.session_id, + } + + # Include SDP answer if provided + if sdp_answer: + payload["sdp"] = { + "type": "answer", + "sdp": sdp_answer + } + + try: + async with self._http_session.post( + f"{self.base_url}/streaming.start", + json=payload, + headers=headers, + ) as response: + if response.status != 200: + error_text = await response.text() + raise RuntimeError( + f"Failed to start HeyGen session: {response.status} - {error_text}" + ) + + data = await response.json() + logger.info(f"✅ HeyGen session started: {self.session_id}") + return data + + except Exception as e: + logger.error(f"❌ Failed to start HeyGen session: {e}") + raise + + async def stop_session(self) -> None: + """Stop the HeyGen streaming session.""" + if not self.session_id: + logger.warning("No active session to stop") + return + + if not self._http_session: + return + + headers = { + "X-Api-Key": self.api_key, + "Content-Type": "application/json", + } + + payload = { + "session_id": self.session_id, + } + + try: + async with self._http_session.post( + f"{self.base_url}/streaming.stop", + json=payload, + headers=headers, + ) as response: + if response.status == 200: + logger.info(f"✅ HeyGen session stopped: {self.session_id}") + else: + logger.warning( + f"Failed to stop HeyGen session: {response.status}" + ) + except Exception as e: + logger.error(f"❌ Error stopping HeyGen session: {e}") + + async def close(self) -> None: + """Clean up session resources.""" + await self.stop_session() + + if self._http_session: + await self._http_session.close() + self._http_session = None + + self.session_id = None + self.session_info = None + logger.info("HeyGen session cleaned up") + diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py 
b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py new file mode 100644 index 00000000..1fcbc39b --- /dev/null +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py @@ -0,0 +1,161 @@ +import asyncio +import logging +from typing import Optional + +import av +from aiortc import MediaStreamTrack, VideoStreamTrack +from PIL import Image + +from vision_agents.core.utils.queue import LatestNQueue + +logger = logging.getLogger(__name__) + + +class HeyGenVideoTrack(VideoStreamTrack): + """Video track that forwards HeyGen avatar video frames. + + Receives video frames from HeyGen's WebRTC connection and provides + them through the standard VideoStreamTrack interface for publishing + to the call. + """ + + def __init__(self, width: int = 1920, height: int = 1080): + """Initialize the HeyGen video track. + + Args: + width: Video frame width. + height: Video frame height. + """ + super().__init__() + + self.width = width + self.height = height + + # Queue for incoming frames from HeyGen + self.frame_queue: LatestNQueue[av.VideoFrame] = LatestNQueue(maxlen=30) + + # Create placeholder frame for when no frames are available + placeholder = Image.new("RGB", (self.width, self.height), color=(30, 30, 40)) + self.placeholder_frame = av.VideoFrame.from_image(placeholder) + self.last_frame: av.VideoFrame = self.placeholder_frame + + self._stopped = False + self._receiving_task: Optional[asyncio.Task] = None + self._source_track: Optional[MediaStreamTrack] = None + + logger.info(f"đŸŽŦ HeyGenVideoTrack initialized ({width}x{height})") + + async def start_receiving(self, source_track: MediaStreamTrack) -> None: + """Start receiving frames from HeyGen's video track. + + Args: + source_track: The incoming video track from HeyGen's WebRTC connection. 
+ """ + if self._receiving_task: + logger.warning("Already receiving frames from HeyGen") + return + + self._source_track = source_track + self._receiving_task = asyncio.create_task(self._receive_frames()) + logger.info("đŸ“Ĩ Started receiving frames from HeyGen") + + async def _receive_frames(self) -> None: + """Continuously receive frames from HeyGen and add to queue.""" + if not self._source_track: + logger.error("No source track set") + return + + try: + while not self._stopped: + try: + # Receive frame from HeyGen + frame = await self._source_track.recv() + + if frame: + # Resize if needed + if frame.width != self.width or frame.height != self.height: + frame = self._resize_frame(frame) + + # Add to queue (will replace oldest if full) + self.frame_queue.put_latest_nowait(frame) + + logger.debug( + f"đŸ“Ĩ Received frame from HeyGen: {frame.width}x{frame.height}" + ) + + except Exception as e: + if not self._stopped: + logger.warning(f"Error receiving frame from HeyGen: {e}") + await asyncio.sleep(0.01) + + except asyncio.CancelledError: + logger.info("Frame receiving task cancelled") + except Exception as e: + logger.error(f"Fatal error in frame receiving: {e}") + + def _resize_frame(self, frame: av.VideoFrame) -> av.VideoFrame: + """Resize a video frame to match the track dimensions. + + Args: + frame: Input video frame. + + Returns: + Resized video frame. + """ + try: + img = frame.to_image() + resized = img.resize((self.width, self.height), Image.LANCZOS) + return av.VideoFrame.from_image(resized) + + except Exception as e: + logger.error(f"Error resizing frame: {e}") + return frame + + async def recv(self) -> av.VideoFrame: + """Receive the next video frame. + + This is called by the WebRTC stack to get frames for transmission. + + Returns: + Video frame to transmit. 
+ """ + if self._stopped: + raise Exception("Track stopped") + + try: + # Try to get a new frame from queue with short timeout + frame = await asyncio.wait_for( + self.frame_queue.get(), + timeout=0.033 # ~30 FPS + ) + if frame: + self.last_frame = frame + + except asyncio.TimeoutError: + # No new frame, use last frame + pass + + except Exception as e: + logger.warning(f"Error getting frame from queue: {e}") + + # Get timestamp for the frame + pts, time_base = await self.next_timestamp() + + # Create a copy of the frame with updated timestamp + output_frame = self.last_frame + output_frame.pts = pts + output_frame.time_base = time_base + + return output_frame + + def stop(self) -> None: + """Stop the video track.""" + self._stopped = True + + if self._receiving_task: + self._receiving_task.cancel() + self._receiving_task = None + + super().stop() + logger.info("🛑 HeyGenVideoTrack stopped") + diff --git a/pyproject.toml b/pyproject.toml index b6e6d93e..26f6c0a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ vision-agents-plugins-ultralytics = { workspace = true } vision-agents-plugins-krisp = { workspace = true } vision-agents-plugins-smart-turn = { workspace = true } vision-agents-plugins-wizper = { workspace = true } +vision-agents-plugins-heygen = { workspace = true } [tool.uv.workspace] members = [ @@ -44,7 +45,8 @@ members = [ "plugins/ultralytics", "plugins/krisp", "plugins/smart_turn", - "plugins/wizper" + "plugins/wizper", + "plugins/heygen" ] exclude = [ "**/__pycache__", diff --git a/uv.lock b/uv.lock index 0ef7c3c5..61c02726 100644 --- a/uv.lock +++ b/uv.lock @@ -19,6 +19,7 @@ members = [ "vision-agents-plugins-fish", "vision-agents-plugins-gemini", "vision-agents-plugins-getstream", + "vision-agents-plugins-heygen", "vision-agents-plugins-kokoro", "vision-agents-plugins-krisp", "vision-agents-plugins-moonshine", @@ -165,6 +166,24 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/3b/58/af07dda649c22a1ae954ffb7aaaf4d4a57f1bf00ebdf62307affc0b8552f/aioice-0.10.1-py3-none-any.whl", hash = "sha256:f31ae2abc8608b1283ed5f21aebd7b6bd472b152ff9551e9b559b2d8efed79e9", size = 24872, upload-time = "2025-04-13T08:15:24.044Z" }, ] +[[package]] +name = "aiortc" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aioice" }, + { name = "av" }, + { name = "cryptography" }, + { name = "google-crc32c" }, + { name = "pyee" }, + { name = "pylibsrtp" }, + { name = "pyopenssl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/51/9c/4e027bfe0195de0442da301e2389329496745d40ae44d2d7c4571c4290ce/aiortc-1.14.0.tar.gz", hash = "sha256:adc8a67ace10a085721e588e06a00358ed8eaf5f6b62f0a95358ff45628dd762", size = 1180864, upload-time = "2025-10-13T21:40:37.905Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/57/ab/31646a49209568cde3b97eeade0d28bb78b400e6645c56422c101df68932/aiortc-1.14.0-py3-none-any.whl", hash = "sha256:4b244d7e482f4e1f67e685b3468269628eca1ec91fa5b329ab517738cfca086e", size = 93183, upload-time = "2025-10-13T21:40:36.59Z" }, +] + [[package]] name = "aiortc-getstream" version = "1.13.0.post1" @@ -4855,6 +4874,7 @@ all-plugins = [ { name = "vision-agents-plugins-elevenlabs" }, { name = "vision-agents-plugins-gemini" }, { name = "vision-agents-plugins-getstream" }, + { name = "vision-agents-plugins-heygen" }, { name = "vision-agents-plugins-kokoro" }, { name = "vision-agents-plugins-krisp" }, { name = "vision-agents-plugins-moonshine" }, @@ -4889,6 +4909,9 @@ gemini = [ getstream = [ { name = "vision-agents-plugins-getstream" }, ] +heygen = [ + { name = "vision-agents-plugins-heygen" }, +] kokoro = [ { name = "vision-agents-plugins-kokoro" }, ] @@ -4940,6 +4963,8 @@ requires-dist = [ { name = "vision-agents-plugins-gemini", marker = "extra == 'gemini'", editable = "plugins/gemini" }, { name = "vision-agents-plugins-getstream", marker = 
"extra == 'all-plugins'", editable = "plugins/getstream" }, { name = "vision-agents-plugins-getstream", marker = "extra == 'getstream'", editable = "plugins/getstream" }, + { name = "vision-agents-plugins-heygen", marker = "extra == 'all-plugins'", editable = "plugins/heygen" }, + { name = "vision-agents-plugins-heygen", marker = "extra == 'heygen'", editable = "plugins/heygen" }, { name = "vision-agents-plugins-kokoro", marker = "extra == 'all-plugins'", editable = "plugins/kokoro" }, { name = "vision-agents-plugins-kokoro", marker = "extra == 'kokoro'", editable = "plugins/kokoro" }, { name = "vision-agents-plugins-krisp", marker = "extra == 'all-plugins'", editable = "plugins/krisp" }, @@ -4959,7 +4984,7 @@ requires-dist = [ { name = "vision-agents-plugins-xai", marker = "extra == 'all-plugins'", editable = "plugins/xai" }, { name = "vision-agents-plugins-xai", marker = "extra == 'xai'", editable = "plugins/xai" }, ] -provides-extras = ["all-plugins", "anthropic", "cartesia", "deepgram", "dev", "elevenlabs", "gemini", "getstream", "kokoro", "krisp", "moonshine", "openai", "silero", "smart-turn", "ultralytics", "wizper", "xai"] +provides-extras = ["all-plugins", "anthropic", "cartesia", "deepgram", "dev", "elevenlabs", "gemini", "getstream", "heygen", "kokoro", "krisp", "moonshine", "openai", "silero", "smart-turn", "ultralytics", "wizper", "xai"] [[package]] name = "vision-agents-plugins-anthropic" @@ -5183,6 +5208,35 @@ dev = [ { name = "pytest-asyncio", specifier = ">=1.0.0" }, ] +[[package]] +name = "vision-agents-plugins-heygen" +version = "0.1.0" +source = { editable = "plugins/heygen" } +dependencies = [ + { name = "aiohttp" }, + { name = "aiortc" }, + { name = "vision-agents" }, +] + +[package.dev-dependencies] +dev = [ + { name = "pytest" }, + { name = "pytest-asyncio" }, +] + +[package.metadata] +requires-dist = [ + { name = "aiohttp", specifier = ">=3.9.0" }, + { name = "aiortc", specifier = ">=1.9.0" }, + { name = "vision-agents", editable = 
"agents-core" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "pytest", specifier = ">=8.4.1" }, + { name = "pytest-asyncio", specifier = ">=1.0.0" }, +] + [[package]] name = "vision-agents-plugins-kokoro" source = { editable = "plugins/kokoro" } From 4f3a6e4d8e6df8008c3ea75c1f49be94d2ea6d60 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 28 Oct 2025 10:46:20 +0100 Subject: [PATCH 02/20] add lip-sync support by forwarding agent audio to heygen --- plugins/heygen/README.md | 15 +++++++ plugins/heygen/example/avatar_example.py | 5 +++ .../plugins/heygen/heygen_avatar_publisher.py | 40 ++++++++++++++++++- .../plugins/heygen/heygen_rtc_manager.py | 26 ++++++++++++ 4 files changed, 85 insertions(+), 1 deletion(-) diff --git a/plugins/heygen/README.md b/plugins/heygen/README.md index 2dc19e34..2d4cdd10 100644 --- a/plugins/heygen/README.md +++ b/plugins/heygen/README.md @@ -56,6 +56,11 @@ async def start_avatar_agent(): call = agent.edge.client.video.call("default", str(uuid4())) with await agent.join(call): + # Enable lip-sync by forwarding agent's audio to HeyGen + avatar_publisher = agent.video_publishers[0] + if hasattr(avatar_publisher, 'set_agent_audio_track') and agent._audio_track: + avatar_publisher.set_agent_audio_track(agent._audio_track) + await agent.edge.open_demo(call) await agent.simple_response("Hello! 
I'm your AI assistant with an avatar.") await agent.finish() @@ -103,6 +108,16 @@ agent = Agent( heygen.AvatarPublisher(avatar_id="professional_presenter") ] ) + +call = agent.edge.client.video.call("default", str(uuid4())) + +with await agent.join(call): + # Enable lip-sync + avatar_publisher = agent.video_publishers[0] + if hasattr(avatar_publisher, 'set_agent_audio_track') and agent._audio_track: + avatar_publisher.set_agent_audio_track(agent._audio_track) + + await agent.finish() ``` ### With Multiple Processors diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index 5d021b41..2a81c5a7 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -56,6 +56,11 @@ async def start_avatar_agent() -> None: # Join the call with await agent.join(call): + # Forward agent's audio to HeyGen for lip-sync + avatar_publisher = agent.video_publishers[0] + if hasattr(avatar_publisher, 'set_agent_audio_track') and agent._audio_track: + avatar_publisher.set_agent_audio_track(agent._audio_track) + # Open demo UI await agent.edge.open_demo(call) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index 786b9421..d4ef079f 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -58,7 +58,7 @@ def __init__( """ super().__init__( interval=interval, - receive_audio=False, + receive_audio=True, # Receive audio to forward to HeyGen for lip-sync receive_video=False, **kwargs ) @@ -84,6 +84,7 @@ def __init__( # Connection state self._connected = False self._connection_task: Optional[asyncio.Task] = None + self._audio_track_set = False logger.info( f"🎭 HeyGen AvatarPublisher initialized " @@ -116,6 +117,43 @@ async def _on_video_track(self, track: Any) -> None: logger.info("📹 Received video 
track from HeyGen, starting frame forwarding") await self._video_track.start_receiving(track) + async def _forward_audio_track(self, audio_track: Any) -> None: + """Forward agent's audio track to HeyGen for lip-sync. + + Args: + audio_track: The agent's audio output track. + """ + if self._audio_track_set: + return # Already forwarded + + logger.info("🎤 Forwarding agent's audio output to HeyGen for lip-sync") + + # Wait for HeyGen connection + if not self._connected: + if self._connection_task: + try: + await asyncio.wait_for(self._connection_task, timeout=10.0) + except asyncio.TimeoutError: + logger.error("Timeout waiting for HeyGen connection") + return + else: + logger.error("HeyGen connection not started") + return + + # Forward the agent's audio track to HeyGen + await self.rtc_manager.send_audio_track(audio_track) + self._audio_track_set = True + + def set_agent_audio_track(self, audio_track: Any) -> None: + """Set the agent's audio track for forwarding to HeyGen. + + This should be called by the agent after audio track is created. + + Args: + audio_track: The agent's audio output track for TTS/Realtime. + """ + asyncio.create_task(self._forward_audio_track(audio_track)) + def publish_video_track(self): """Publish the HeyGen avatar video track. 
diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index 8e5d340b..ee1344d0 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -46,6 +46,9 @@ def __init__( # Video track callback for receiving avatar video self._video_callback: Optional[Callable[[MediaStreamTrack], Any]] = None + # Audio track for sending to HeyGen + self._audio_sender: Optional[Any] = None + self._connected = False self._connection_ready = asyncio.Event() @@ -118,6 +121,13 @@ async def on_connection_state_change(): # We just need to create our answer logger.debug(f"Transceivers after setRemoteDescription: {len(self.pc.getTransceivers())}") + # Find and store the audio sender so we can send audio to HeyGen later + for sender in self.pc.getSenders(): + if sender.track and sender.track.kind == "audio": + self._audio_sender = sender + logger.debug("Found audio sender for HeyGen") + break + # Create our answer answer = await self.pc.createAnswer() await self.pc.setLocalDescription(answer) @@ -215,6 +225,22 @@ def set_video_callback(self, callback: Callable[[MediaStreamTrack], Any]) -> Non """ self._video_callback = callback + async def send_audio_track(self, audio_track: MediaStreamTrack) -> None: + """Send audio track to HeyGen for lip-sync. + + Args: + audio_track: Audio track containing agent's speech. 
+ """ + if not self._audio_sender: + logger.warning("No audio sender available - connection may not be established") + return + + try: + await self._audio_sender.replaceTrack(audio_track) + logger.info("🎤 Audio track sent to HeyGen for lip-sync") + except Exception as e: + logger.error(f"Failed to send audio track to HeyGen: {e}") + @property def is_connected(self) -> bool: """Check if WebRTC connection is established.""" From 680f5d776b6007a1b371593b5870c9763bb1d7b1 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 28 Oct 2025 10:48:34 +0100 Subject: [PATCH 03/20] switch avatar example to use gemini realtime for better lip-sync testing --- plugins/heygen/example/avatar_example.py | 23 +++++++---------------- plugins/heygen/example/pyproject.toml | 6 ------ 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index 2a81c5a7..19bbbfdc 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -3,23 +3,20 @@ from dotenv import load_dotenv from vision_agents.core import User, Agent -from vision_agents.plugins import kokoro, deepgram, getstream, smart_turn, gemini, heygen +from vision_agents.plugins import getstream, gemini, heygen load_dotenv() async def start_avatar_agent() -> None: - """Start an agent with HeyGen avatar. + """Start an agent with HeyGen avatar using Realtime LLM. This example demonstrates how to use HeyGen's avatar streaming - to add realistic avatar video to your AI agent. The avatar will - lip-sync with the audio generated by the TTS. + with Gemini Realtime. The avatar will lip-sync with the audio + generated by the Realtime LLM. 
""" - # Create LLM - llm = gemini.LLM("gemini-2.0-flash") - - # Create agent with HeyGen avatar + # Create agent with HeyGen avatar and Realtime LLM agent = Agent( edge=getstream.Edge(), agent_user=User( @@ -32,14 +29,8 @@ async def start_avatar_agent() -> None: "Don't use special characters or formatting." ), - # LLM and speech components - llm=llm, - tts=kokoro.TTS(), # Using Kokoro (free, local TTS) - stt=deepgram.STT(), - turn_detection=smart_turn.TurnDetection( - buffer_duration=2.0, - confidence_threshold=0.5 - ), + # Use Gemini Realtime (includes built-in TTS and STT) + llm=gemini.Realtime(fps=2), # Add HeyGen avatar as a video publisher processors=[ diff --git a/plugins/heygen/example/pyproject.toml b/plugins/heygen/example/pyproject.toml index 83fd9bdd..4e1fdf61 100644 --- a/plugins/heygen/example/pyproject.toml +++ b/plugins/heygen/example/pyproject.toml @@ -6,20 +6,14 @@ requires-python = ">=3.10" dependencies = [ "vision-agents", "vision-agents-plugins-heygen", - "vision-agents-plugins-kokoro", - "vision-agents-plugins-deepgram", "vision-agents-plugins-gemini", "vision-agents-plugins-getstream", - "vision-agents-plugins-smart-turn", "python-dotenv", ] [tool.uv.sources] vision-agents = { workspace = true } vision-agents-plugins-heygen = { workspace = true } -vision-agents-plugins-kokoro = { workspace = true } -vision-agents-plugins-deepgram = { workspace = true } vision-agents-plugins-gemini = { workspace = true } vision-agents-plugins-getstream = { workspace = true } -vision-agents-plugins-smart-turn = { workspace = true } From 6eb638fd5590764dd88a4ca5c138d828a3039294 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Wed, 29 Oct 2025 13:33:59 +0100 Subject: [PATCH 04/20] WIP: audio track approach for lip-sync (audio flows but no lip movement) --- plugins/heygen/example/avatar_example.py | 14 +-- .../plugins/heygen/heygen_audio_track.py | 98 +++++++++++++++++++ .../plugins/heygen/heygen_avatar_publisher.py | 85 +++++++++++++--- 
.../plugins/heygen/heygen_rtc_manager.py | 45 ++++++--- 4 files changed, 208 insertions(+), 34 deletions(-) create mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index 19bbbfdc..2d95421f 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -47,21 +47,15 @@ async def start_avatar_agent() -> None: # Join the call with await agent.join(call): - # Forward agent's audio to HeyGen for lip-sync + # Set agent reference on avatar publisher for audio event subscription avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent_audio_track') and agent._audio_track: - avatar_publisher.set_agent_audio_track(agent._audio_track) + if hasattr(avatar_publisher, 'set_agent'): + avatar_publisher.set_agent(agent) # Open demo UI await agent.edge.open_demo(call) - # Greet the user through the avatar - await agent.simple_response( - "Hello! I'm your AI assistant with an avatar. " - "How can I help you today?" - ) - - # Keep the call running + # Keep the call running - Realtime mode handles conversation automatically await agent.finish() diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py new file mode 100644 index 00000000..f53a5399 --- /dev/null +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py @@ -0,0 +1,98 @@ +"""Custom audio track for sending audio to HeyGen for lip-sync.""" + +import asyncio +import logging +from typing import Optional +from fractions import Fraction + +import av +import numpy as np +from aiortc import AudioStreamTrack + +logger = logging.getLogger(__name__) + + +class HeyGenAudioTrack(AudioStreamTrack): + """Audio track that accepts PCM data and produces frames for WebRTC. 
+ + This track receives audio data from the Realtime LLM and produces + audio frames that can be sent to HeyGen via WebRTC for lip-sync. + """ + + kind = "audio" + + def __init__(self, sample_rate: int = 24000): + """Initialize the audio track. + + Args: + sample_rate: Sample rate for audio frames (default: 24000 for Gemini). + """ + super().__init__() + self._sample_rate = sample_rate + self._ts = 0 + self._latest_chunk: Optional[bytes] = None + self._silence_cache: dict[int, np.ndarray] = {} + logger.info(f"🎤 HeyGenAudioTrack initialized at {sample_rate}Hz") + + def write_audio(self, pcm_data: bytes) -> None: + """Write PCM audio data to be sent to HeyGen. + + Args: + pcm_data: Raw PCM16 audio data from the LLM. + """ + if not pcm_data: + return + self._latest_chunk = bytes(pcm_data) + logger.debug(f"âœī¸ Audio data written: {len(pcm_data)} bytes") + + async def recv(self) -> av.AudioFrame: + """Receive the next audio frame for WebRTC transmission. + + Returns: + Audio frame to send to HeyGen. 
+ """ + # Pace at 20ms per frame (50 fps) + await asyncio.sleep(0.02) + + sr = self._sample_rate + samples_per_frame = int(0.02 * sr) # 20ms worth of samples + + chunk = self._latest_chunk + if chunk: + logger.debug(f"đŸŽ™ī¸ recv() producing frame with audio data ({len(chunk)} bytes)") + if chunk: + # Consume and clear the latest pushed chunk + self._latest_chunk = None + arr = np.frombuffer(chunk, dtype=np.int16) + + # Ensure mono channel + if arr.ndim == 1: + samples = arr.reshape(1, -1) + else: + samples = arr[:1, :] + + # Pad or truncate to exactly one 20ms frame + needed = samples_per_frame + have = samples.shape[1] + if have < needed: + pad = np.zeros((1, needed - have), dtype=np.int16) + samples = np.concatenate([samples, pad], axis=1) + elif have > needed: + samples = samples[:, :needed] + else: + # Generate silence when no audio data is available + cached = self._silence_cache.get(sr) + if cached is None: + cached = np.zeros((1, samples_per_frame), dtype=np.int16) + self._silence_cache[sr] = cached + samples = cached + + # Create audio frame + frame = av.AudioFrame.from_ndarray(samples, format="s16", layout="mono") + frame.sample_rate = sr + frame.pts = self._ts + frame.time_base = Fraction(1, sr) + self._ts += samples.shape[1] + + return frame + diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index d4ef079f..16c1a9bc 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -85,11 +85,27 @@ def __init__( self._connected = False self._connection_task: Optional[asyncio.Task] = None self._audio_track_set = False + self._agent = None # Will be set by the agent + + # Create a custom audio track for HeyGen that we can write to + from .heygen_audio_track import HeyGenAudioTrack + self._heygen_audio_track = HeyGenAudioTrack(sample_rate=24000) 
logger.info( f"🎭 HeyGen AvatarPublisher initialized " f"(avatar: {avatar_id}, quality: {quality}, resolution: {resolution})" ) + + def set_agent(self, agent: Any) -> None: + """Set the agent reference for event subscription. + + This is called by the agent when the processor is attached. + + Args: + agent: The agent instance. + """ + self._agent = agent + logger.info("🔗 Agent reference set for HeyGen avatar publisher") async def _connect_to_heygen(self) -> None: """Establish connection to HeyGen and start receiving video.""" @@ -102,11 +118,38 @@ async def _connect_to_heygen(self) -> None: self._connected = True logger.info("✅ Connected to HeyGen, avatar streaming active") + + # Subscribe to audio output events from the LLM for lip-sync + self._subscribe_to_audio_events() except Exception as e: logger.error(f"❌ Failed to connect to HeyGen: {e}") self._connected = False raise + + def _subscribe_to_audio_events(self) -> None: + """Subscribe to audio output events from the LLM.""" + try: + # Import the event type + from vision_agents.core.llm.events import RealtimeAudioOutputEvent + + # Get the agent's event manager + # Note: This will be set when the processor is attached to an agent + if hasattr(self, '_agent') and self._agent: + @self._agent.events.subscribe + async def on_audio_output(event: RealtimeAudioOutputEvent): + logger.debug(f"đŸ“ĸ Received audio output event: {len(event.audio_data)} bytes at {event.sample_rate}Hz") + await self._on_audio_output(event.audio_data, event.sample_rate) + logger.info("🎧 Subscribed to LLM audio output events for lip-sync") + + # Also log what events are registered + logger.info(f" Event manager has {len(self._agent.events._handlers)} event handlers") + else: + logger.warning("âš ī¸ Cannot subscribe to audio events - no agent attached yet") + except Exception as e: + logger.error(f"Failed to subscribe to audio events: {e}") + import traceback + logger.error(traceback.format_exc()) async def _on_video_track(self, track: Any) -> 
None: """Callback when video track is received from HeyGen. @@ -117,16 +160,12 @@ async def _on_video_track(self, track: Any) -> None: logger.info("📹 Received video track from HeyGen, starting frame forwarding") await self._video_track.start_receiving(track) - async def _forward_audio_track(self, audio_track: Any) -> None: - """Forward agent's audio track to HeyGen for lip-sync. - - Args: - audio_track: The agent's audio output track. - """ + async def _setup_audio_forwarding(self) -> None: + """Set up audio forwarding from agent to HeyGen for lip-sync.""" if self._audio_track_set: - return # Already forwarded + return # Already set up - logger.info("🎤 Forwarding agent's audio output to HeyGen for lip-sync") + logger.info("🎤 Setting up audio forwarding to HeyGen for lip-sync") # Wait for HeyGen connection if not self._connected: @@ -140,19 +179,39 @@ async def _forward_audio_track(self, audio_track: Any) -> None: logger.error("HeyGen connection not started") return - # Forward the agent's audio track to HeyGen - await self.rtc_manager.send_audio_track(audio_track) + # Set our custom audio track on the HeyGen sender + await self.rtc_manager.send_audio_track(self._heygen_audio_track) self._audio_track_set = True + logger.info("✅ Audio track set up for HeyGen lip-sync") + + async def _on_audio_output(self, audio_data: bytes, sample_rate: int) -> None: + """Handle audio output from the LLM and forward to HeyGen. + + Args: + audio_data: Raw PCM audio data from the LLM. + sample_rate: Sample rate of the audio data. 
+ """ + logger.debug(f"đŸŽĩ _on_audio_output called: {len(audio_data)} bytes at {sample_rate}Hz") + + if not self._audio_track_set: + # Set up audio forwarding on first audio output + logger.info("🔧 Setting up audio forwarding on first audio output") + await self._setup_audio_forwarding() + + # Write audio data to our custom track for HeyGen + logger.info(f"âœī¸ Writing {len(audio_data)} bytes to HeyGen audio track") + self._heygen_audio_track.write_audio(audio_data) def set_agent_audio_track(self, audio_track: Any) -> None: """Set the agent's audio track for forwarding to HeyGen. - This should be called by the agent after audio track is created. + DEPRECATED: This method is no longer needed. Audio is now forwarded + via event listening instead of track sharing. Args: - audio_track: The agent's audio output track for TTS/Realtime. + audio_track: The agent's audio output track (unused). """ - asyncio.create_task(self._forward_audio_track(audio_track)) + logger.warning("set_agent_audio_track is deprecated - audio forwarding is automatic via events") def publish_video_track(self): """Publish the HeyGen avatar video track. 
diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index ee1344d0..278ea9b5 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -117,16 +117,27 @@ async def on_connection_state_change(): offer = RTCSessionDescription(sdp=offer_sdp, type="offer") await self.pc.setRemoteDescription(offer) - # HeyGen's offer already includes tracks, so transceivers are auto-created - # We just need to create our answer + # HeyGen's offer includes tracks for video/audio they send us + # Check transceivers to see if we have an audio sender logger.debug(f"Transceivers after setRemoteDescription: {len(self.pc.getTransceivers())}") - # Find and store the audio sender so we can send audio to HeyGen later - for sender in self.pc.getSenders(): - if sender.track and sender.track.kind == "audio": - self._audio_sender = sender - logger.debug("Found audio sender for HeyGen") - break + # Find the audio transceiver and modify it to allow sending + logger.info(f"🔍 Checking {len(self.pc.getTransceivers())} transceivers for audio") + for idx, transceiver in enumerate(self.pc.getTransceivers()): + logger.info(f" Transceiver {idx}: mid={transceiver.mid}, direction={transceiver.direction}") + if transceiver.receiver and transceiver.receiver.track: + logger.info(f" Receiver track: kind={transceiver.receiver.track.kind}") + if transceiver.receiver.track.kind == "audio": + # Found the audio transceiver - modify its direction to allow sending + logger.info(f" 🔧 Modifying audio transceiver direction from {transceiver.direction} to sendrecv") + transceiver.direction = "sendrecv" + self._audio_sender = transceiver.sender + logger.info("✅ Audio transceiver modified for lip-sync") + break + + # If no audio transceiver found, log warning + if not self._audio_sender: + logger.warning("âš ī¸ No audio transceiver found - 
lip-sync may not work") # Create our answer answer = await self.pc.createAnswer() @@ -236,10 +247,22 @@ async def send_audio_track(self, audio_track: MediaStreamTrack) -> None: return try: - await self._audio_sender.replaceTrack(audio_track) - logger.info("🎤 Audio track sent to HeyGen for lip-sync") + logger.info(f"🎤 Attempting to send audio track to HeyGen: {audio_track}") + logger.info(f" Audio track kind: {audio_track.kind if hasattr(audio_track, 'kind') else 'unknown'}") + logger.info(f" Current sender track: {self._audio_sender.track}") + + # replaceTrack is not async in aiortc + result = self._audio_sender.replaceTrack(audio_track) + # If it returns a coroutine, await it; otherwise just use the result + if hasattr(result, '__await__'): + await result + + logger.info(f"✅ Audio track successfully set on sender") + logger.info(f" New sender track: {self._audio_sender.track}") except Exception as e: - logger.error(f"Failed to send audio track to HeyGen: {e}") + logger.error(f"❌ Failed to send audio track to HeyGen: {e}") + import traceback + logger.error(traceback.format_exc()) @property def is_connected(self) -> bool: From 7f2983a66ec102be8ed06a2fa655aae3e84a35bf Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Thu, 30 Oct 2025 15:51:54 +0100 Subject: [PATCH 05/20] Clean up HeyGen implementation and fix duplicate text sending - Removed obsolete heygen_audio_track.py (from old audio-based approach) - Removed unused _audio_sender field and transceiver logic - Removed unused _original_audio_write field - Simplified audio track management - Moved imports to top of file - Updated docstrings to reflect text-based lip-sync approach Fixed duplicate text sending issue: - Added deduplication tracking with _sent_texts set - Added minimum length filter (>15 chars) to prevent tiny fragments - Simplified event handling to avoid duplicate subscriptions - Proper buffer management between chunk and complete events Known limitation: ~3-4 second audio delay is inherent to HeyGen 
platform --- .../vision_agents/core/agents/agents.py | 10 +- plugins/heygen/example/avatar_example.py | 26 +- .../example/avatar_streaming_llm_example.py | 73 ++++ .../plugins/heygen/heygen_audio_track.py | 98 ------ .../plugins/heygen/heygen_avatar_publisher.py | 318 ++++++++++++++---- .../plugins/heygen/heygen_rtc_manager.py | 71 ++-- .../plugins/heygen/heygen_session.py | 52 +++ .../plugins/heygen/heygen_video_track.py | 4 +- 8 files changed, 425 insertions(+), 227 deletions(-) create mode 100644 plugins/heygen/example/avatar_streaming_llm_example.py delete mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py diff --git a/agents-core/vision_agents/core/agents/agents.py b/agents-core/vision_agents/core/agents/agents.py index 04519143..13853dd3 100644 --- a/agents-core/vision_agents/core/agents/agents.py +++ b/agents-core/vision_agents/core/agents/agents.py @@ -1008,10 +1008,13 @@ def publish_audio(self) -> bool: """Whether the agent should publish an outbound audio track. Returns: - True if TTS is configured or when in Realtime mode. + True if TTS is configured, when in Realtime mode, or if there are audio publishers. 
""" if self.tts is not None or self.realtime_mode: return True + # Also publish audio if there are audio publishers (e.g., HeyGen avatar) + if self.audio_publishers: + return True return False @property @@ -1137,6 +1140,11 @@ def _prepare_rtc(self): if self.realtime_mode and isinstance(self.llm, Realtime): self._audio_track = self.llm.output_track self.logger.info("đŸŽĩ Using Realtime provider output track for audio") + elif self.audio_publishers: + # Get the first audio publisher to create the track + audio_publisher = self.audio_publishers[0] + self._audio_track = audio_publisher.publish_audio_track() + self.logger.info("đŸŽĩ Audio track initialized from audio publisher") else: # Default to WebRTC-friendly format unless configured differently framerate = 48000 diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index 2d95421f..b3beef83 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -3,20 +3,23 @@ from dotenv import load_dotenv from vision_agents.core import User, Agent -from vision_agents.plugins import getstream, gemini, heygen +from vision_agents.plugins import getstream, gemini, heygen, deepgram load_dotenv() async def start_avatar_agent() -> None: - """Start an agent with HeyGen avatar using Realtime LLM. + """Start an agent with HeyGen avatar using streaming LLM. This example demonstrates how to use HeyGen's avatar streaming - with Gemini Realtime. The avatar will lip-sync with the audio - generated by the Realtime LLM. + with a regular streaming LLM. This approach has much lower latency + than using Realtime LLMs because text goes directly to HeyGen + without any transcription round-trip. + + HeyGen handles all TTS and lip-sync based on the LLM's text output. 
""" - # Create agent with HeyGen avatar and Realtime LLM + # Create agent with HeyGen avatar and streaming LLM agent = Agent( edge=getstream.Edge(), agent_user=User( @@ -29,15 +32,20 @@ async def start_avatar_agent() -> None: "Don't use special characters or formatting." ), - # Use Gemini Realtime (includes built-in TTS and STT) - llm=gemini.Realtime(fps=2), + # Use regular streaming LLM (not Realtime) for lower latency + llm=gemini.LLM("gemini-2.0-flash-exp"), + + # Add STT for speech input + stt=deepgram.STT(), # Add HeyGen avatar as a video publisher + # Note: mute_llm_audio is not needed since streaming LLM doesn't produce audio processors=[ heygen.AvatarPublisher( avatar_id="default", # Use your HeyGen avatar ID quality="high", # Video quality: "low", "medium", "high" resolution=(1920, 1080), # Output resolution + mute_llm_audio=False, # Not needed for streaming LLM ) ] ) @@ -47,7 +55,7 @@ async def start_avatar_agent() -> None: # Join the call with await agent.join(call): - # Set agent reference on avatar publisher for audio event subscription + # Set agent reference on avatar publisher for text event subscription avatar_publisher = agent.video_publishers[0] if hasattr(avatar_publisher, 'set_agent'): avatar_publisher.set_agent(agent) @@ -55,7 +63,7 @@ async def start_avatar_agent() -> None: # Open demo UI await agent.edge.open_demo(call) - # Keep the call running - Realtime mode handles conversation automatically + # Keep the call running await agent.finish() diff --git a/plugins/heygen/example/avatar_streaming_llm_example.py b/plugins/heygen/example/avatar_streaming_llm_example.py new file mode 100644 index 00000000..8bd68998 --- /dev/null +++ b/plugins/heygen/example/avatar_streaming_llm_example.py @@ -0,0 +1,73 @@ +import asyncio +from uuid import uuid4 +from dotenv import load_dotenv + +from vision_agents.core import User, Agent +from vision_agents.plugins import getstream, gemini, heygen, deepgram + +load_dotenv() + + +async def 
start_avatar_agent_streaming() -> None: + """Start an agent with HeyGen avatar using streaming (non-Realtime) LLM. + + This example demonstrates how to use HeyGen's avatar streaming + with a regular streaming LLM (gemini.LLM) + STT. HeyGen will handle + both TTS and video generation based on the LLM's text output. + + This approach has lower latency than Realtime LLMs because: + - Text is sent to HeyGen immediately as it's generated + - No transcription round-trip (LLM → audio → transcription → HeyGen) + - HeyGen handles TTS and lip-sync simultaneously + """ + + # Create agent with HeyGen avatar and streaming LLM + agent = Agent( + edge=getstream.Edge(), + agent_user=User( + name="AI Assistant with Avatar", + id="agent" + ), + instructions=( + "You're a friendly and helpful AI assistant. " + "Keep your responses conversational and engaging. " + "Don't use special characters or formatting." + ), + + # Use regular streaming LLM (not Realtime) + llm=gemini.LLM("gemini-2.0-flash-exp"), + + # Add STT for speech input + stt=deepgram.STT(), + + # Add HeyGen avatar as a video publisher + # Note: mute_llm_audio is not needed here since gemini.LLM doesn't produce audio + processors=[ + heygen.AvatarPublisher( + avatar_id="default", # Use your HeyGen avatar ID + quality="high", # Video quality: "low", "medium", "high" + resolution=(1920, 1080), # Output resolution + ) + ] + ) + + # Create a call + call = agent.edge.client.video.call("default", str(uuid4())) + + # Join the call + with await agent.join(call): + # Set agent reference on avatar publisher for text event subscription + avatar_publisher = agent.video_publishers[0] + if hasattr(avatar_publisher, 'set_agent'): + avatar_publisher.set_agent(agent) + + # Open demo UI + await agent.edge.open_demo(call) + + # Keep the call running + await agent.finish() + + +if __name__ == "__main__": + asyncio.run(start_avatar_agent_streaming()) + diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py 
b/plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py deleted file mode 100644 index f53a5399..00000000 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Custom audio track for sending audio to HeyGen for lip-sync.""" - -import asyncio -import logging -from typing import Optional -from fractions import Fraction - -import av -import numpy as np -from aiortc import AudioStreamTrack - -logger = logging.getLogger(__name__) - - -class HeyGenAudioTrack(AudioStreamTrack): - """Audio track that accepts PCM data and produces frames for WebRTC. - - This track receives audio data from the Realtime LLM and produces - audio frames that can be sent to HeyGen via WebRTC for lip-sync. - """ - - kind = "audio" - - def __init__(self, sample_rate: int = 24000): - """Initialize the audio track. - - Args: - sample_rate: Sample rate for audio frames (default: 24000 for Gemini). - """ - super().__init__() - self._sample_rate = sample_rate - self._ts = 0 - self._latest_chunk: Optional[bytes] = None - self._silence_cache: dict[int, np.ndarray] = {} - logger.info(f"🎤 HeyGenAudioTrack initialized at {sample_rate}Hz") - - def write_audio(self, pcm_data: bytes) -> None: - """Write PCM audio data to be sent to HeyGen. - - Args: - pcm_data: Raw PCM16 audio data from the LLM. - """ - if not pcm_data: - return - self._latest_chunk = bytes(pcm_data) - logger.debug(f"âœī¸ Audio data written: {len(pcm_data)} bytes") - - async def recv(self) -> av.AudioFrame: - """Receive the next audio frame for WebRTC transmission. - - Returns: - Audio frame to send to HeyGen. 
- """ - # Pace at 20ms per frame (50 fps) - await asyncio.sleep(0.02) - - sr = self._sample_rate - samples_per_frame = int(0.02 * sr) # 20ms worth of samples - - chunk = self._latest_chunk - if chunk: - logger.debug(f"đŸŽ™ī¸ recv() producing frame with audio data ({len(chunk)} bytes)") - if chunk: - # Consume and clear the latest pushed chunk - self._latest_chunk = None - arr = np.frombuffer(chunk, dtype=np.int16) - - # Ensure mono channel - if arr.ndim == 1: - samples = arr.reshape(1, -1) - else: - samples = arr[:1, :] - - # Pad or truncate to exactly one 20ms frame - needed = samples_per_frame - have = samples.shape[1] - if have < needed: - pad = np.zeros((1, needed - have), dtype=np.int16) - samples = np.concatenate([samples, pad], axis=1) - elif have > needed: - samples = samples[:, :needed] - else: - # Generate silence when no audio data is available - cached = self._silence_cache.get(sr) - if cached is None: - cached = np.zeros((1, samples_per_frame), dtype=np.int16) - self._silence_cache[sr] = cached - samples = cached - - # Create audio frame - frame = av.AudioFrame.from_ndarray(samples, format="s16", layout="mono") - frame.sample_rate = sr - frame.pts = self._ts - frame.time_base = Fraction(1, sr) - self._ts += samples.shape[1] - - return frame - diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index 16c1a9bc..e9da7551 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -2,9 +2,13 @@ import logging from typing import Optional, Any, Tuple +import numpy as np +from getstream.video.rtc import audio_track + from vision_agents.core.processors.base_processor import ( AudioVideoProcessor, VideoPublisherMixin, + AudioPublisherMixin, ) from .heygen_rtc_manager import HeyGenRTCManager @@ -13,20 +17,21 @@ logger = logging.getLogger(__name__) -class 
AvatarPublisher(AudioVideoProcessor, VideoPublisherMixin): - """HeyGen avatar video publisher. +class AvatarPublisher(AudioVideoProcessor, VideoPublisherMixin, AudioPublisherMixin): + """HeyGen avatar video and audio publisher. - Publishes video of a HeyGen avatar that lip-syncs to audio input. + Publishes video of a HeyGen avatar that lip-syncs based on LLM text output. Can be used as a processor in the Vision Agents framework to add realistic avatar video to AI agents. + HeyGen handles TTS internally, so no separate TTS is needed. + Example: agent = Agent( edge=getstream.Edge(), agent_user=User(name="Avatar AI"), instructions="Be helpful and friendly", llm=gemini.LLM("gemini-2.0-flash"), - tts=cartesia.TTS(), stt=deepgram.STT(), processors=[ heygen.AvatarPublisher( @@ -44,6 +49,7 @@ def __init__( resolution: Tuple[int, int] = (1920, 1080), api_key: Optional[str] = None, interval: int = 0, + mute_llm_audio: bool = True, **kwargs, ): """Initialize the HeyGen avatar publisher. @@ -54,11 +60,13 @@ def __init__( resolution: Output video resolution (width, height). api_key: HeyGen API key. Uses HEYGEN_API_KEY env var if not provided. interval: Processing interval (not used, kept for compatibility). + mute_llm_audio: If True, mutes the Realtime LLM's audio output so only + HeyGen's video (with audio) is heard. Default: True. **kwargs: Additional arguments passed to parent class. 
""" super().__init__( interval=interval, - receive_audio=True, # Receive audio to forward to HeyGen for lip-sync + receive_audio=False, # We send text to HeyGen, not audio receive_video=False, **kwargs ) @@ -67,6 +75,7 @@ def __init__( self.quality = quality self.resolution = resolution self.api_key = api_key + self.mute_llm_audio = mute_llm_audio # WebRTC manager for HeyGen connection self.rtc_manager = HeyGenRTCManager( @@ -81,21 +90,38 @@ def __init__( height=resolution[1], ) + # Audio track for publishing HeyGen's audio + # Create it immediately so the agent can detect it during initialization + self._audio_track = audio_track.AudioStreamTrack( + framerate=48000, stereo=True + ) + # Connection state self._connected = False self._connection_task: Optional[asyncio.Task] = None - self._audio_track_set = False self._agent = None # Will be set by the agent - # Create a custom audio track for HeyGen that we can write to - from .heygen_audio_track import HeyGenAudioTrack - self._heygen_audio_track = HeyGenAudioTrack(sample_rate=24000) + # Text buffer for accumulating LLM response chunks before sending to HeyGen + self._text_buffer = "" + self._current_response_id: Optional[str] = None + self._sent_texts: set = set() # Track sent texts to avoid duplicates + + # Audio forwarding state (for selective muting of Realtime LLM audio) + self._forwarding_audio = False logger.info( f"🎭 HeyGen AvatarPublisher initialized " f"(avatar: {avatar_id}, quality: {quality}, resolution: {resolution})" ) + def publish_audio_track(self): + """Return the audio track for publishing HeyGen's audio. + + This method is called by the Agent to get the audio track that will + be published to the call. HeyGen's audio will be forwarded to this track. + """ + return self._audio_track + def set_agent(self, agent: Any) -> None: """Set the agent reference for event subscription. 
@@ -106,48 +132,127 @@ def set_agent(self, agent: Any) -> None: """ self._agent = agent logger.info("🔗 Agent reference set for HeyGen avatar publisher") + + # Mute the Realtime LLM's audio if requested + if self.mute_llm_audio: + self._mute_realtime_llm_audio() + + # Subscribe to text events immediately when agent is set + self._subscribe_to_text_events() async def _connect_to_heygen(self) -> None: - """Establish connection to HeyGen and start receiving video.""" + """Establish connection to HeyGen and start receiving video and audio.""" try: - # Set up video callback before connecting + # Set up video and audio callbacks before connecting self.rtc_manager.set_video_callback(self._on_video_track) + self.rtc_manager.set_audio_callback(self._on_audio_track) # Connect to HeyGen await self.rtc_manager.connect() self._connected = True logger.info("✅ Connected to HeyGen, avatar streaming active") - - # Subscribe to audio output events from the LLM for lip-sync - self._subscribe_to_audio_events() except Exception as e: logger.error(f"❌ Failed to connect to HeyGen: {e}") self._connected = False raise - def _subscribe_to_audio_events(self) -> None: - """Subscribe to audio output events from the LLM.""" + def _subscribe_to_text_events(self) -> None: + """Subscribe to text output events from the LLM. + + HeyGen requires text input (not audio) for proper lip-sync. + We listen to the LLM's text output and send it to HeyGen's task API. 
+ """ try: - # Import the event type - from vision_agents.core.llm.events import RealtimeAudioOutputEvent + # Import the event types + from vision_agents.core.llm.events import ( + LLMResponseChunkEvent, + LLMResponseCompletedEvent, + RealtimeAgentSpeechTranscriptionEvent, + ) - # Get the agent's event manager - # Note: This will be set when the processor is attached to an agent - if hasattr(self, '_agent') and self._agent: - @self._agent.events.subscribe - async def on_audio_output(event: RealtimeAudioOutputEvent): - logger.debug(f"đŸ“ĸ Received audio output event: {len(event.audio_data)} bytes at {event.sample_rate}Hz") - await self._on_audio_output(event.audio_data, event.sample_rate) - logger.info("🎧 Subscribed to LLM audio output events for lip-sync") + # Get the LLM's event manager (events are emitted by the LLM, not the agent) + if hasattr(self, '_agent') and self._agent and hasattr(self._agent, 'llm'): + @self._agent.llm.events.subscribe + async def on_text_chunk(event: LLMResponseChunkEvent): + """Handle streaming text chunks from the LLM.""" + logger.debug(f"📝 HeyGen received text chunk: delta='{event.delta}'") + if event.delta: + await self._on_text_chunk(event.delta, event.item_id) + + @self._agent.llm.events.subscribe + async def on_text_complete(event: LLMResponseCompletedEvent): + """Handle end of LLM response - send any remaining buffered text.""" + # Send any remaining buffered text + if self._text_buffer.strip(): + text_to_send = self._text_buffer.strip() + if text_to_send not in self._sent_texts: + await self._send_text_to_heygen(text_to_send) + self._sent_texts.add(text_to_send) + self._text_buffer = "" + # Reset for next response + self._current_response_id = None + self._sent_texts.clear() + + @self._agent.llm.events.subscribe + async def on_agent_speech(event: RealtimeAgentSpeechTranscriptionEvent): + """Handle agent speech transcription from Realtime LLMs. 
+ + This is the primary path for Gemini Realtime which transcribes + the agent's speech output as text. + """ + logger.debug(f"📝 HeyGen received agent speech: text='{event.text}'") + if event.text: + # Send directly to HeyGen - this is the complete utterance + await self._send_text_to_heygen(event.text) - # Also log what events are registered - logger.info(f" Event manager has {len(self._agent.events._handlers)} event handlers") + logger.info("📝 Subscribed to LLM text output events for HeyGen lip-sync") else: - logger.warning("âš ī¸ Cannot subscribe to audio events - no agent attached yet") + logger.warning("âš ī¸ Cannot subscribe to text events - no agent or LLM attached yet") + except Exception as e: + logger.error(f"Failed to subscribe to text events: {e}") + import traceback + logger.error(traceback.format_exc()) + + def _mute_realtime_llm_audio(self) -> None: + """Mute the Realtime LLM's audio output. + + When using HeyGen, we want HeyGen to handle all audio (with lip-sync), + so we mute the LLM's native audio output to avoid duplicated/overlapping audio. + + This works by intercepting writes to the LLM's output_track and only blocking + writes that come from the LLM itself (not from HeyGen forwarding). 
+ """ + try: + from vision_agents.core.llm.realtime import Realtime + + if not hasattr(self, '_agent') or not self._agent: + logger.warning("âš ī¸ Cannot mute LLM audio - no agent set") + return + + if not hasattr(self._agent, 'llm') or not isinstance(self._agent.llm, Realtime): + logger.info("â„šī¸ LLM is not a Realtime LLM - no audio to mute") + return + + # Store the original write method + original_write = self._agent.llm.output_track.write + + # Create a selective write method + async def selective_write(audio_data: bytes) -> None: + """Only allow writes from HeyGen forwarding, block LLM writes.""" + if self._forwarding_audio: + # This is from HeyGen - allow it + await original_write(audio_data) + # else: This is from the Realtime LLM - block it + + # Replace the write method + self._agent.llm.output_track.write = selective_write + + logger.info("🔇 Muted Realtime LLM audio output (HeyGen will provide audio)") + except Exception as e: - logger.error(f"Failed to subscribe to audio events: {e}") + logger.error(f"Failed to mute LLM audio: {e}") import traceback logger.error(traceback.format_exc()) @@ -160,58 +265,131 @@ async def _on_video_track(self, track: Any) -> None: logger.info("📹 Received video track from HeyGen, starting frame forwarding") await self._video_track.start_receiving(track) - async def _setup_audio_forwarding(self) -> None: - """Set up audio forwarding from agent to HeyGen for lip-sync.""" - if self._audio_track_set: - return # Already set up + async def _on_audio_track(self, track: Any) -> None: + """Callback when audio track is received from HeyGen. - logger.info("🎤 Setting up audio forwarding to HeyGen for lip-sync") + HeyGen provides audio with lip-synced TTS. We forward this audio + to the agent's audio track so it gets published to the call. 
- # Wait for HeyGen connection - if not self._connected: - if self._connection_task: - try: - await asyncio.wait_for(self._connection_task, timeout=10.0) - except asyncio.TimeoutError: - logger.error("Timeout waiting for HeyGen connection") - return - else: - logger.error("HeyGen connection not started") - return + Args: + track: Incoming audio track from HeyGen's WebRTC connection. + """ + logger.info("🔊 Received audio track from HeyGen, starting audio forwarding") - # Set our custom audio track on the HeyGen sender - await self.rtc_manager.send_audio_track(self._heygen_audio_track) - self._audio_track_set = True - logger.info("✅ Audio track set up for HeyGen lip-sync") - - async def _on_audio_output(self, audio_data: bytes, sample_rate: int) -> None: - """Handle audio output from the LLM and forward to HeyGen. + # Forward audio frames from HeyGen to our audio track + asyncio.create_task(self._forward_audio_frames(track, self._audio_track)) + + async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> None: + """Forward audio frames from HeyGen to agent's audio track. Args: - audio_data: Raw PCM audio data from the LLM. - sample_rate: Sample rate of the audio data. + source_track: Audio track from HeyGen. + dest_track: Agent's audio track to write to. 
""" - logger.debug(f"đŸŽĩ _on_audio_output called: {len(audio_data)} bytes at {sample_rate}Hz") + try: + logger.info("🔊 Starting HeyGen audio frame forwarding") + frame_count = 0 + while True: + try: + # Read audio frame from HeyGen + frame = await source_track.recv() + frame_count += 1 + + # Convert frame to bytes and write to agent's audio track + if hasattr(frame, 'to_ndarray'): + audio_array = frame.to_ndarray() + + # Convert mono to stereo if needed (agent track expects stereo) + # HeyGen sends mono (shape=(1, samples)), we need interleaved stereo + if audio_array.shape[0] == 1: + # Flatten to 1D array of samples + mono_samples = audio_array.flatten() + + # Create stereo by interleaving each mono sample + stereo_samples = np.repeat(mono_samples, 2) + audio_bytes = stereo_samples.tobytes() + else: + # Already multi-channel, just flatten and convert + audio_bytes = audio_array.flatten().tobytes() + + # Set flag to allow HeyGen audio through the muted track + self._forwarding_audio = True + await dest_track.write(audio_bytes) + self._forwarding_audio = False + else: + logger.warning("âš ī¸ Received frame without to_ndarray() method") + + except Exception as e: + if "ended" in str(e).lower() or "closed" in str(e).lower(): + logger.info(f"🔊 HeyGen audio track ended (forwarded {frame_count} frames)") + break + else: + logger.error(f"❌ Error forwarding audio frame #{frame_count}: {e}") + import traceback + logger.error(traceback.format_exc()) + break + + except Exception as e: + logger.error(f"❌ Error in audio forwarding loop: {e}") + import traceback + logger.error(traceback.format_exc()) + + async def _on_text_chunk(self, text_delta: str, item_id: Optional[str]) -> None: + """Handle text chunk from the LLM. 
- if not self._audio_track_set: - # Set up audio forwarding on first audio output - logger.info("🔧 Setting up audio forwarding on first audio output") - await self._setup_audio_forwarding() + Accumulates text chunks until a complete sentence or response is ready, + then sends to HeyGen for lip-sync. - # Write audio data to our custom track for HeyGen - logger.info(f"âœī¸ Writing {len(audio_data)} bytes to HeyGen audio track") - self._heygen_audio_track.write_audio(audio_data) - - def set_agent_audio_track(self, audio_track: Any) -> None: - """Set the agent's audio track for forwarding to HeyGen. + Args: + text_delta: The text chunk/delta from the LLM. + item_id: The response item ID. + """ + # If this is a new response, reset the buffer and sent tracking + if item_id != self._current_response_id: + if self._text_buffer: + # Send any accumulated text from previous response + await self._send_text_to_heygen(self._text_buffer.strip()) + self._text_buffer = "" + self._current_response_id = item_id + self._sent_texts.clear() + + # Accumulate text + self._text_buffer += text_delta - DEPRECATED: This method is no longer needed. Audio is now forwarded - via event listening instead of track sharing. + # Send when we have a complete sentence (ending with period, !, or ?) 
+ # But only if it's substantial enough (> 15 chars) to avoid sending tiny fragments + # Don't send on commas/semicolons to reduce repetition + if any(self._text_buffer.rstrip().endswith(p) for p in ['.', '!', '?']): + text_to_send = self._text_buffer.strip() + # Only send if it's substantial (>15 chars) and not already sent + if text_to_send and len(text_to_send) > 15 and text_to_send not in self._sent_texts: + await self._send_text_to_heygen(text_to_send) + self._sent_texts.add(text_to_send) + self._text_buffer = "" # Clear buffer after sending + elif text_to_send in self._sent_texts: + self._text_buffer = "" # Clear buffer to avoid re-sending + + async def _send_text_to_heygen(self, text: str) -> None: + """Send text to HeyGen for the avatar to speak with lip-sync. Args: - audio_track: The agent's audio output track (unused). + text: The text for the avatar to speak. """ - logger.warning("set_agent_audio_track is deprecated - audio forwarding is automatic via events") + if not text: + return + + if not self._connected: + logger.warning("Cannot send text to HeyGen - not connected") + return + + try: + logger.info(f"📤 Sending text to HeyGen: '{text[:50]}...'") + await self.rtc_manager.send_text(text, task_type="repeat") + logger.debug("✅ Text sent to HeyGen successfully") + except Exception as e: + logger.error(f"❌ Failed to send text to HeyGen: {e}") + import traceback + logger.error(traceback.format_exc()) def publish_video_track(self): """Publish the HeyGen avatar video track. 
diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index 278ea9b5..3e61674a 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -46,8 +46,8 @@ def __init__( # Video track callback for receiving avatar video self._video_callback: Optional[Callable[[MediaStreamTrack], Any]] = None - # Audio track for sending to HeyGen - self._audio_sender: Optional[Any] = None + # Audio track callback for receiving avatar audio + self._audio_callback: Optional[Callable[[MediaStreamTrack], Any]] = None self._connected = False self._connection_ready = asyncio.Event() @@ -117,28 +117,9 @@ async def on_connection_state_change(): offer = RTCSessionDescription(sdp=offer_sdp, type="offer") await self.pc.setRemoteDescription(offer) - # HeyGen's offer includes tracks for video/audio they send us - # Check transceivers to see if we have an audio sender + # Log transceivers for debugging logger.debug(f"Transceivers after setRemoteDescription: {len(self.pc.getTransceivers())}") - # Find the audio transceiver and modify it to allow sending - logger.info(f"🔍 Checking {len(self.pc.getTransceivers())} transceivers for audio") - for idx, transceiver in enumerate(self.pc.getTransceivers()): - logger.info(f" Transceiver {idx}: mid={transceiver.mid}, direction={transceiver.direction}") - if transceiver.receiver and transceiver.receiver.track: - logger.info(f" Receiver track: kind={transceiver.receiver.track.kind}") - if transceiver.receiver.track.kind == "audio": - # Found the audio transceiver - modify its direction to allow sending - logger.info(f" 🔧 Modifying audio transceiver direction from {transceiver.direction} to sendrecv") - transceiver.direction = "sendrecv" - self._audio_sender = transceiver.sender - logger.info("✅ Audio transceiver modified for lip-sync") - break - - # If no audio transceiver 
found, log warning - if not self._audio_sender: - logger.warning("âš ī¸ No audio transceiver found - lip-sync may not work") - # Create our answer answer = await self.pc.createAnswer() await self.pc.setLocalDescription(answer) @@ -225,8 +206,12 @@ async def _handle_track(self, track: MediaStreamTrack) -> None: else: logger.warning("Video track received but no callback registered") elif track.kind == "audio": - # Audio track from HeyGen (avatar speech) - currently not used - logger.debug("Audio track received from HeyGen (ignored)") + # Audio track from HeyGen (avatar speech with lip-synced TTS) + logger.info("🔊 Audio track received from HeyGen") + if self._audio_callback: + await self._audio_callback(track) + else: + logger.warning("âš ī¸ Audio track received but no callback registered") def set_video_callback(self, callback: Callable[[MediaStreamTrack], Any]) -> None: """Set callback for handling incoming video track. @@ -236,33 +221,25 @@ def set_video_callback(self, callback: Callable[[MediaStreamTrack], Any]) -> Non """ self._video_callback = callback - async def send_audio_track(self, audio_track: MediaStreamTrack) -> None: - """Send audio track to HeyGen for lip-sync. + def set_audio_callback(self, callback: Callable[[MediaStreamTrack], Any]) -> None: + """Set callback for handling incoming audio track. Args: - audio_track: Audio track containing agent's speech. + callback: Async function to handle audio track. """ - if not self._audio_sender: - logger.warning("No audio sender available - connection may not be established") - return + self._audio_callback = callback + + async def send_text(self, text: str, task_type: str = "repeat") -> None: + """Send text to HeyGen for the avatar to speak with lip-sync. 
- try: - logger.info(f"🎤 Attempting to send audio track to HeyGen: {audio_track}") - logger.info(f" Audio track kind: {audio_track.kind if hasattr(audio_track, 'kind') else 'unknown'}") - logger.info(f" Current sender track: {self._audio_sender.track}") - - # replaceTrack is not async in aiortc - result = self._audio_sender.replaceTrack(audio_track) - # If it returns a coroutine, await it; otherwise just use the result - if hasattr(result, '__await__'): - await result - - logger.info(f"✅ Audio track successfully set on sender") - logger.info(f" New sender track: {self._audio_sender.track}") - except Exception as e: - logger.error(f"❌ Failed to send audio track to HeyGen: {e}") - import traceback - logger.error(traceback.format_exc()) + This is the correct way to achieve lip-sync with HeyGen - they handle + TTS and lip-sync server-side based on the text input. + + Args: + text: The text for the avatar to speak. + task_type: Either "repeat" or "talk" (default: "repeat"). + """ + await self.session_manager.send_task(text, task_type) @property def is_connected(self) -> bool: diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py index 917a4a52..aca8caa3 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py @@ -134,6 +134,58 @@ async def start_session(self, sdp_answer: Optional[str] = None) -> Dict[str, Any logger.error(f"❌ Failed to start HeyGen session: {e}") raise + async def send_task(self, text: str, task_type: str = "repeat") -> Dict[str, Any]: + """Send a text task to HeyGen for the avatar to speak. + + This is the proper way to achieve lip-sync with HeyGen - send text, + and HeyGen handles TTS and lip-sync server-side. + + Args: + text: The text for the avatar to speak. + task_type: Either "repeat" (avatar repeats text exactly) or + "talk" (processes through HeyGen's LLM first). 
+ + Returns: + Task response from HeyGen. + """ + if not self.session_id: + raise RuntimeError("Session not created. Call create_session() first.") + + if not self._http_session: + self._http_session = aiohttp.ClientSession() + + headers = { + "X-Api-Key": self.api_key, + "Content-Type": "application/json", + } + + payload = { + "session_id": self.session_id, + "text": text, + "task_type": task_type, + } + + try: + async with self._http_session.post( + f"{self.base_url}/streaming.task", + json=payload, + headers=headers, + ) as response: + if response.status != 200: + error_text = await response.text() + logger.warning( + f"Failed to send task to HeyGen: {response.status} - {error_text}" + ) + return {} + + data = await response.json() + logger.debug(f"📤 Sent text to HeyGen: '{text[:50]}...'") + return data + + except Exception as e: + logger.error(f"❌ Error sending task to HeyGen: {e}") + return {} + async def stop_session(self) -> None: """Stop the HeyGen streaming session.""" if not self.session_id: diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py index 1fcbc39b..ace06d5e 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py @@ -31,8 +31,8 @@ def __init__(self, width: int = 1920, height: int = 1080): self.width = width self.height = height - # Queue for incoming frames from HeyGen - self.frame_queue: LatestNQueue[av.VideoFrame] = LatestNQueue(maxlen=30) + # Queue for incoming frames from HeyGen - keep minimal for low latency + self.frame_queue: LatestNQueue[av.VideoFrame] = LatestNQueue(maxlen=2) # Create placeholder frame for when no frames are available placeholder = Image.new("RGB", (self.width, self.height), color=(30, 30, 40)) From 96f1cc94b9bb5784a6c78c6e5c784237edb2f2aa Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Mon, 3 Nov 2025 17:17:09 +0100 Subject: [PATCH 
06/20] PR cleanup --- aiortc | 1 - plugins/heygen/README.md | 12 +-- .../example/avatar_streaming_llm_example.py | 73 ------------------- .../plugins/heygen/heygen_avatar_publisher.py | 48 ++++++------ .../plugins/heygen/heygen_rtc_manager.py | 22 +++--- .../plugins/heygen/heygen_session.py | 16 ++-- .../plugins/heygen/heygen_video_track.py | 8 +- 7 files changed, 53 insertions(+), 127 deletions(-) delete mode 160000 aiortc delete mode 100644 plugins/heygen/example/avatar_streaming_llm_example.py diff --git a/aiortc b/aiortc deleted file mode 160000 index f84800ce..00000000 --- a/aiortc +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f84800ce052de7d81a62b07c6f6094504c19b65f diff --git a/plugins/heygen/README.md b/plugins/heygen/README.md index 2d4cdd10..b7360c3a 100644 --- a/plugins/heygen/README.md +++ b/plugins/heygen/README.md @@ -56,10 +56,10 @@ async def start_avatar_agent(): call = agent.edge.client.video.call("default", str(uuid4())) with await agent.join(call): - # Enable lip-sync by forwarding agent's audio to HeyGen + # Set agent reference for event subscription avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent_audio_track') and agent._audio_track: - avatar_publisher.set_agent_audio_track(agent._audio_track) + if hasattr(avatar_publisher, 'set_agent'): + avatar_publisher.set_agent(agent) await agent.edge.open_demo(call) await agent.simple_response("Hello! 
I'm your AI assistant with an avatar.") @@ -112,10 +112,10 @@ agent = Agent( call = agent.edge.client.video.call("default", str(uuid4())) with await agent.join(call): - # Enable lip-sync + # Set agent reference for event subscription avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent_audio_track') and agent._audio_track: - avatar_publisher.set_agent_audio_track(agent._audio_track) + if hasattr(avatar_publisher, 'set_agent'): + avatar_publisher.set_agent(agent) await agent.finish() ``` diff --git a/plugins/heygen/example/avatar_streaming_llm_example.py b/plugins/heygen/example/avatar_streaming_llm_example.py deleted file mode 100644 index 8bd68998..00000000 --- a/plugins/heygen/example/avatar_streaming_llm_example.py +++ /dev/null @@ -1,73 +0,0 @@ -import asyncio -from uuid import uuid4 -from dotenv import load_dotenv - -from vision_agents.core import User, Agent -from vision_agents.plugins import getstream, gemini, heygen, deepgram - -load_dotenv() - - -async def start_avatar_agent_streaming() -> None: - """Start an agent with HeyGen avatar using streaming (non-Realtime) LLM. - - This example demonstrates how to use HeyGen's avatar streaming - with a regular streaming LLM (gemini.LLM) + STT. HeyGen will handle - both TTS and video generation based on the LLM's text output. - - This approach has lower latency than Realtime LLMs because: - - Text is sent to HeyGen immediately as it's generated - - No transcription round-trip (LLM → audio → transcription → HeyGen) - - HeyGen handles TTS and lip-sync simultaneously - """ - - # Create agent with HeyGen avatar and streaming LLM - agent = Agent( - edge=getstream.Edge(), - agent_user=User( - name="AI Assistant with Avatar", - id="agent" - ), - instructions=( - "You're a friendly and helpful AI assistant. " - "Keep your responses conversational and engaging. " - "Don't use special characters or formatting." 
- ), - - # Use regular streaming LLM (not Realtime) - llm=gemini.LLM("gemini-2.0-flash-exp"), - - # Add STT for speech input - stt=deepgram.STT(), - - # Add HeyGen avatar as a video publisher - # Note: mute_llm_audio is not needed here since gemini.LLM doesn't produce audio - processors=[ - heygen.AvatarPublisher( - avatar_id="default", # Use your HeyGen avatar ID - quality="high", # Video quality: "low", "medium", "high" - resolution=(1920, 1080), # Output resolution - ) - ] - ) - - # Create a call - call = agent.edge.client.video.call("default", str(uuid4())) - - # Join the call - with await agent.join(call): - # Set agent reference on avatar publisher for text event subscription - avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent'): - avatar_publisher.set_agent(agent) - - # Open demo UI - await agent.edge.open_demo(call) - - # Keep the call running - await agent.finish() - - -if __name__ == "__main__": - asyncio.run(start_avatar_agent_streaming()) - diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index e9da7551..f78e538b 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -110,7 +110,7 @@ def __init__( self._forwarding_audio = False logger.info( - f"🎭 HeyGen AvatarPublisher initialized " + f"HeyGen AvatarPublisher initialized " f"(avatar: {avatar_id}, quality: {quality}, resolution: {resolution})" ) @@ -131,7 +131,7 @@ def set_agent(self, agent: Any) -> None: agent: The agent instance. 
""" self._agent = agent - logger.info("🔗 Agent reference set for HeyGen avatar publisher") + logger.info("Agent reference set for HeyGen avatar publisher") # Mute the Realtime LLM's audio if requested if self.mute_llm_audio: @@ -151,10 +151,10 @@ async def _connect_to_heygen(self) -> None: await self.rtc_manager.connect() self._connected = True - logger.info("✅ Connected to HeyGen, avatar streaming active") + logger.info("Connected to HeyGen, avatar streaming active") except Exception as e: - logger.error(f"❌ Failed to connect to HeyGen: {e}") + logger.error(f"Failed to connect to HeyGen: {e}") self._connected = False raise @@ -177,7 +177,7 @@ def _subscribe_to_text_events(self) -> None: @self._agent.llm.events.subscribe async def on_text_chunk(event: LLMResponseChunkEvent): """Handle streaming text chunks from the LLM.""" - logger.debug(f"📝 HeyGen received text chunk: delta='{event.delta}'") + logger.debug(f"HeyGen received text chunk: delta='{event.delta}'") if event.delta: await self._on_text_chunk(event.delta, event.item_id) @@ -202,14 +202,14 @@ async def on_agent_speech(event: RealtimeAgentSpeechTranscriptionEvent): This is the primary path for Gemini Realtime which transcribes the agent's speech output as text. 
""" - logger.debug(f"📝 HeyGen received agent speech: text='{event.text}'") + logger.debug(f"HeyGen received agent speech: text='{event.text}'") if event.text: # Send directly to HeyGen - this is the complete utterance await self._send_text_to_heygen(event.text) - logger.info("📝 Subscribed to LLM text output events for HeyGen lip-sync") + logger.info("Subscribed to LLM text output events for HeyGen lip-sync") else: - logger.warning("âš ī¸ Cannot subscribe to text events - no agent or LLM attached yet") + logger.warning("Cannot subscribe to text events - no agent or LLM attached yet") except Exception as e: logger.error(f"Failed to subscribe to text events: {e}") import traceback @@ -228,11 +228,11 @@ def _mute_realtime_llm_audio(self) -> None: from vision_agents.core.llm.realtime import Realtime if not hasattr(self, '_agent') or not self._agent: - logger.warning("âš ī¸ Cannot mute LLM audio - no agent set") + logger.warning("Cannot mute LLM audio - no agent set") return if not hasattr(self._agent, 'llm') or not isinstance(self._agent.llm, Realtime): - logger.info("â„šī¸ LLM is not a Realtime LLM - no audio to mute") + logger.info("LLM is not a Realtime LLM - no audio to mute") return # Store the original write method @@ -249,7 +249,7 @@ async def selective_write(audio_data: bytes) -> None: # Replace the write method self._agent.llm.output_track.write = selective_write - logger.info("🔇 Muted Realtime LLM audio output (HeyGen will provide audio)") + logger.info("Muted Realtime LLM audio output (HeyGen will provide audio)") except Exception as e: logger.error(f"Failed to mute LLM audio: {e}") @@ -262,7 +262,7 @@ async def _on_video_track(self, track: Any) -> None: Args: track: Incoming video track from HeyGen's WebRTC connection. 
""" - logger.info("📹 Received video track from HeyGen, starting frame forwarding") + logger.info("Received video track from HeyGen, starting frame forwarding") await self._video_track.start_receiving(track) async def _on_audio_track(self, track: Any) -> None: @@ -274,7 +274,7 @@ async def _on_audio_track(self, track: Any) -> None: Args: track: Incoming audio track from HeyGen's WebRTC connection. """ - logger.info("🔊 Received audio track from HeyGen, starting audio forwarding") + logger.info("Received audio track from HeyGen, starting audio forwarding") # Forward audio frames from HeyGen to our audio track asyncio.create_task(self._forward_audio_frames(track, self._audio_track)) @@ -287,7 +287,7 @@ async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> Non dest_track: Agent's audio track to write to. """ try: - logger.info("🔊 Starting HeyGen audio frame forwarding") + logger.info("Starting HeyGen audio frame forwarding") frame_count = 0 while True: try: @@ -317,20 +317,20 @@ async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> Non await dest_track.write(audio_bytes) self._forwarding_audio = False else: - logger.warning("âš ī¸ Received frame without to_ndarray() method") + logger.warning("Received frame without to_ndarray() method") except Exception as e: if "ended" in str(e).lower() or "closed" in str(e).lower(): - logger.info(f"🔊 HeyGen audio track ended (forwarded {frame_count} frames)") + logger.info(f"HeyGen audio track ended (forwarded {frame_count} frames)") break else: - logger.error(f"❌ Error forwarding audio frame #{frame_count}: {e}") + logger.error(f"Error forwarding audio frame #{frame_count}: {e}") import traceback logger.error(traceback.format_exc()) break except Exception as e: - logger.error(f"❌ Error in audio forwarding loop: {e}") + logger.error(f"Error in audio forwarding loop: {e}") import traceback logger.error(traceback.format_exc()) @@ -383,11 +383,11 @@ async def _send_text_to_heygen(self, text: 
str) -> None: return try: - logger.info(f"📤 Sending text to HeyGen: '{text[:50]}...'") + logger.info(f"Sending text to HeyGen: '{text[:50]}...'") await self.rtc_manager.send_text(text, task_type="repeat") - logger.debug("✅ Text sent to HeyGen successfully") + logger.debug("Text sent to HeyGen successfully") except Exception as e: - logger.error(f"❌ Failed to send text to HeyGen: {e}") + logger.error(f"Failed to send text to HeyGen: {e}") import traceback logger.error(traceback.format_exc()) @@ -404,7 +404,7 @@ def publish_video_track(self): if not self._connected and not self._connection_task: self._connection_task = asyncio.create_task(self._connect_to_heygen()) - logger.info("đŸŽĨ Publishing HeyGen avatar video track") + logger.info("Publishing HeyGen avatar video track") return self._video_track def state(self) -> dict: @@ -423,7 +423,7 @@ def state(self) -> dict: async def close(self) -> None: """Clean up resources and close connections.""" - logger.info("🔌 Closing HeyGen avatar publisher") + logger.info("Closing HeyGen avatar publisher") # Stop video track if self._video_track: @@ -442,5 +442,5 @@ async def close(self) -> None: pass self._connected = False - logger.info("✅ HeyGen avatar publisher closed") + logger.info("HeyGen avatar publisher closed") diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index 3e61674a..6e9876ac 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -88,7 +88,7 @@ async def connect(self) -> None: offer_sdp = None if not offer_sdp: - logger.error(f"❌ Unexpected SDP format. Type: {type(sdp_data)}") + logger.error(f"Unexpected SDP format. 
Type: {type(sdp_data)}") if isinstance(sdp_data, dict): logger.error(f"SDP dict keys: {list(sdp_data.keys())}") logger.error(f"SDP data: {str(sdp_data)[:200] if sdp_data else 'None'}") @@ -105,7 +105,7 @@ async def on_track(track: MediaStreamTrack): @self.pc.on("connectionstatechange") async def on_connection_state_change(): - logger.info(f"🔗 HeyGen connection state: {self.pc.connectionState}") + logger.info(f"HeyGen connection state: {self.pc.connectionState}") if self.pc.connectionState == "connected": self._connected = True self._connection_ready.set() @@ -131,10 +131,10 @@ async def on_connection_state_change(): # Wait for connection to be established await asyncio.wait_for(self._connection_ready.wait(), timeout=10.0) - logger.info("✅ HeyGen WebRTC connection established") + logger.info("HeyGen WebRTC connection established") except Exception as e: - logger.error(f"❌ Failed to connect to HeyGen: {e}") + logger.error(f"Failed to connect to HeyGen: {e}") raise def _parse_ice_servers(self, session_info: dict) -> list: @@ -158,7 +158,7 @@ def _parse_ice_servers(self, session_info: dict) -> list: ) if ice_server_configs and not isinstance(ice_server_configs, list): - logger.warning(f"âš ī¸ Unexpected ice_servers format: {type(ice_server_configs)}") + logger.warning(f"Unexpected ice_servers format: {type(ice_server_configs)}") ice_server_configs = [] for server_config in ice_server_configs: @@ -180,12 +180,12 @@ def _parse_ice_servers(self, session_info: dict) -> list: credential=credential, ) ) - logger.info(f"🧊 Added ICE server: {urls[0]}") + logger.info(f"Added ICE server: {urls[0]}") # When using LiveKit, ICE servers may be embedded in SDP # In that case, use public STUN as fallback if not ice_servers: - logger.info("â„šī¸ Using default STUN servers (LiveKit may provide its own via SDP)") + logger.info("Using default STUN servers (LiveKit may provide its own via SDP)") ice_servers.append( RTCIceServer(urls=["stun:stun.l.google.com:19302"]) ) @@ -198,7 +198,7 @@ 
async def _handle_track(self, track: MediaStreamTrack) -> None: Args: track: Incoming media track (audio or video). """ - logger.info(f"📡 Received track from HeyGen: {track.kind}") + logger.info(f"Received track from HeyGen: {track.kind}") if track.kind == "video": if self._video_callback: @@ -207,11 +207,11 @@ async def _handle_track(self, track: MediaStreamTrack) -> None: logger.warning("Video track received but no callback registered") elif track.kind == "audio": # Audio track from HeyGen (avatar speech with lip-synced TTS) - logger.info("🔊 Audio track received from HeyGen") + logger.info("Audio track received from HeyGen") if self._audio_callback: await self._audio_callback(track) else: - logger.warning("âš ī¸ Audio track received but no callback registered") + logger.warning("Audio track received but no callback registered") def set_video_callback(self, callback: Callable[[MediaStreamTrack], Any]) -> None: """Set callback for handling incoming video track. @@ -257,5 +257,5 @@ async def close(self) -> None: self._connected = False self._connection_ready.clear() - logger.info("🔌 HeyGen RTC connection closed") + logger.info("HeyGen RTC connection closed") diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py index aca8caa3..3565c7a6 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py @@ -76,11 +76,11 @@ async def create_session(self) -> Dict[str, Any]: self.session_info = data.get("data", {}) self.session_id = self.session_info.get("session_id") - logger.info(f"✅ HeyGen session created: {self.session_id}") + logger.info(f"HeyGen session created: {self.session_id}") return self.session_info except Exception as e: - logger.error(f"❌ Failed to create HeyGen session: {e}") + logger.error(f"Failed to create HeyGen session: {e}") raise async def start_session(self, sdp_answer: Optional[str] = 
None) -> Dict[str, Any]: @@ -127,11 +127,11 @@ async def start_session(self, sdp_answer: Optional[str] = None) -> Dict[str, Any ) data = await response.json() - logger.info(f"✅ HeyGen session started: {self.session_id}") + logger.info(f"HeyGen session started: {self.session_id}") return data except Exception as e: - logger.error(f"❌ Failed to start HeyGen session: {e}") + logger.error(f"Failed to start HeyGen session: {e}") raise async def send_task(self, text: str, task_type: str = "repeat") -> Dict[str, Any]: @@ -179,11 +179,11 @@ async def send_task(self, text: str, task_type: str = "repeat") -> Dict[str, Any return {} data = await response.json() - logger.debug(f"📤 Sent text to HeyGen: '{text[:50]}...'") + logger.debug(f"Sent text to HeyGen: '{text[:50]}...'") return data except Exception as e: - logger.error(f"❌ Error sending task to HeyGen: {e}") + logger.error(f"Error sending task to HeyGen: {e}") return {} async def stop_session(self) -> None: @@ -211,13 +211,13 @@ async def stop_session(self) -> None: headers=headers, ) as response: if response.status == 200: - logger.info(f"✅ HeyGen session stopped: {self.session_id}") + logger.info(f"HeyGen session stopped: {self.session_id}") else: logger.warning( f"Failed to stop HeyGen session: {response.status}" ) except Exception as e: - logger.error(f"❌ Error stopping HeyGen session: {e}") + logger.error(f"Error stopping HeyGen session: {e}") async def close(self) -> None: """Clean up session resources.""" diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py index ace06d5e..38f707ca 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py @@ -43,7 +43,7 @@ def __init__(self, width: int = 1920, height: int = 1080): self._receiving_task: Optional[asyncio.Task] = None self._source_track: Optional[MediaStreamTrack] = None - 
logger.info(f"đŸŽŦ HeyGenVideoTrack initialized ({width}x{height})") + logger.info(f"HeyGenVideoTrack initialized ({width}x{height})") async def start_receiving(self, source_track: MediaStreamTrack) -> None: """Start receiving frames from HeyGen's video track. @@ -57,7 +57,7 @@ async def start_receiving(self, source_track: MediaStreamTrack) -> None: self._source_track = source_track self._receiving_task = asyncio.create_task(self._receive_frames()) - logger.info("đŸ“Ĩ Started receiving frames from HeyGen") + logger.info("Started receiving frames from HeyGen") async def _receive_frames(self) -> None: """Continuously receive frames from HeyGen and add to queue.""" @@ -80,7 +80,7 @@ async def _receive_frames(self) -> None: self.frame_queue.put_latest_nowait(frame) logger.debug( - f"đŸ“Ĩ Received frame from HeyGen: {frame.width}x{frame.height}" + f"Received frame from HeyGen: {frame.width}x{frame.height}" ) except Exception as e: @@ -157,5 +157,5 @@ def stop(self) -> None: self._receiving_task = None super().stop() - logger.info("🛑 HeyGenVideoTrack stopped") + logger.info("HeyGenVideoTrack stopped") From c14b98c459aa84a96cdcac23bd9c6599089bcd83 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Mon, 3 Nov 2025 18:41:06 +0100 Subject: [PATCH 07/20] Auto-attach processors to agent (no more manual set_agent calls) - Add processor._attach_agent() lifecycle hook to Agent.__init__ - Rename HeyGen set_agent() -> _attach_agent() for consistency with LLM - Remove manual agent attachment from examples and docs - HeyGen now works like YOLO - just add to processors list Examples are now much cleaner: agent = Agent(processors=[heygen.AvatarPublisher()]) # That's it! No manual wiring needed. 
--- agents-core/vision_agents/core/agents/agents.py | 5 +++++ plugins/aws/example/uv.lock | 4 +++- plugins/heygen/README.md | 10 ---------- plugins/heygen/example/README.md | 6 ++++-- plugins/heygen/example/avatar_example.py | 5 ----- plugins/heygen/example/pyproject.toml | 2 ++ .../plugins/heygen/heygen_avatar_publisher.py | 6 +++--- 7 files changed, 17 insertions(+), 21 deletions(-) diff --git a/agents-core/vision_agents/core/agents/agents.py b/agents-core/vision_agents/core/agents/agents.py index 38a99bda..e537c7cc 100644 --- a/agents-core/vision_agents/core/agents/agents.py +++ b/agents-core/vision_agents/core/agents/agents.py @@ -215,6 +215,11 @@ def __init__( self.llm._attach_agent(self) + # Attach processors that need agent reference + for processor in self.processors: + if hasattr(processor, '_attach_agent'): + processor._attach_agent(self) + self.events.subscribe(self._on_vad_audio) self.events.subscribe(self._on_agent_say) # Initialize state variables diff --git a/plugins/aws/example/uv.lock b/plugins/aws/example/uv.lock index fad869b7..5c0123ac 100644 --- a/plugins/aws/example/uv.lock +++ b/plugins/aws/example/uv.lock @@ -2648,6 +2648,8 @@ requires-dist = [ { name = "vision-agents-plugins-gemini", marker = "extra == 'gemini'", editable = "../../gemini" }, { name = "vision-agents-plugins-getstream", marker = "extra == 'all-plugins'", editable = "../../getstream" }, { name = "vision-agents-plugins-getstream", marker = "extra == 'getstream'", editable = "../../getstream" }, + { name = "vision-agents-plugins-heygen", marker = "extra == 'all-plugins'", editable = "../../heygen" }, + { name = "vision-agents-plugins-heygen", marker = "extra == 'heygen'", editable = "../../heygen" }, { name = "vision-agents-plugins-kokoro", marker = "extra == 'all-plugins'", editable = "../../kokoro" }, { name = "vision-agents-plugins-kokoro", marker = "extra == 'kokoro'", editable = "../../kokoro" }, { name = "vision-agents-plugins-krisp", marker = "extra == 'all-plugins'", 
editable = "../../krisp" }, @@ -2665,7 +2667,7 @@ requires-dist = [ { name = "vision-agents-plugins-xai", marker = "extra == 'all-plugins'", editable = "../../xai" }, { name = "vision-agents-plugins-xai", marker = "extra == 'xai'", editable = "../../xai" }, ] -provides-extras = ["all-plugins", "anthropic", "cartesia", "deepgram", "dev", "elevenlabs", "gemini", "getstream", "kokoro", "krisp", "moonshine", "openai", "smart-turn", "ultralytics", "wizper", "xai"] +provides-extras = ["all-plugins", "anthropic", "cartesia", "deepgram", "dev", "elevenlabs", "gemini", "getstream", "heygen", "kokoro", "krisp", "moonshine", "openai", "smart-turn", "ultralytics", "wizper", "xai"] [[package]] name = "vision-agents-plugins-aws" diff --git a/plugins/heygen/README.md b/plugins/heygen/README.md index b7360c3a..0ae26514 100644 --- a/plugins/heygen/README.md +++ b/plugins/heygen/README.md @@ -56,11 +56,6 @@ async def start_avatar_agent(): call = agent.edge.client.video.call("default", str(uuid4())) with await agent.join(call): - # Set agent reference for event subscription - avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent'): - avatar_publisher.set_agent(agent) - await agent.edge.open_demo(call) await agent.simple_response("Hello! 
I'm your AI assistant with an avatar.") await agent.finish() @@ -112,11 +107,6 @@ agent = Agent( call = agent.edge.client.video.call("default", str(uuid4())) with await agent.join(call): - # Set agent reference for event subscription - avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent'): - avatar_publisher.set_agent(agent) - await agent.finish() ``` diff --git a/plugins/heygen/example/README.md b/plugins/heygen/example/README.md index 631d3309..830dd75c 100644 --- a/plugins/heygen/example/README.md +++ b/plugins/heygen/example/README.md @@ -28,14 +28,16 @@ Required API keys: ## Running the Example +From the project root: + ```bash -uv run avatar_example.py +uv run plugins/heygen/example/avatar_example.py ``` This will: 1. Start an AI agent with a HeyGen avatar 2. Open a demo UI in your browser -3. The avatar will greet you and be ready to chat +3. The avatar will speak and be ready to chat ## What's Happening diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index b3beef83..1683491b 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -55,11 +55,6 @@ async def start_avatar_agent() -> None: # Join the call with await agent.join(call): - # Set agent reference on avatar publisher for text event subscription - avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent'): - avatar_publisher.set_agent(agent) - # Open demo UI await agent.edge.open_demo(call) diff --git a/plugins/heygen/example/pyproject.toml b/plugins/heygen/example/pyproject.toml index 4e1fdf61..ffdd3922 100644 --- a/plugins/heygen/example/pyproject.toml +++ b/plugins/heygen/example/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "vision-agents-plugins-heygen", "vision-agents-plugins-gemini", "vision-agents-plugins-getstream", + "vision-agents-plugins-deepgram", "python-dotenv", ] @@ -16,4 +17,5 @@ vision-agents = { workspace = true } 
vision-agents-plugins-heygen = { workspace = true } vision-agents-plugins-gemini = { workspace = true } vision-agents-plugins-getstream = { workspace = true } +vision-agents-plugins-deepgram = { workspace = true } diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index f78e538b..52249183 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -122,10 +122,10 @@ def publish_audio_track(self): """ return self._audio_track - def set_agent(self, agent: Any) -> None: - """Set the agent reference for event subscription. + def _attach_agent(self, agent: Any) -> None: + """Attach the agent reference for event subscription. - This is called by the agent when the processor is attached. + This is called automatically by the Agent during initialization. Args: agent: The agent instance. From 6188ed38fb48767536b6149841879a557d7622d3 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Mon, 3 Nov 2025 19:06:22 +0100 Subject: [PATCH 08/20] fixed audio duplication and sluggishness --- .../plugins/heygen/heygen_avatar_publisher.py | 82 +++++++++---------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index 52249183..6d7467fe 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -104,7 +104,7 @@ def __init__( # Text buffer for accumulating LLM response chunks before sending to HeyGen self._text_buffer = "" self._current_response_id: Optional[str] = None - self._sent_texts: set = set() # Track sent texts to avoid duplicates + self._all_sent_texts: set = set() # Track all sent texts to prevent duplicates # Audio 
forwarding state (for selective muting of Realtime LLM audio) self._forwarding_audio = False @@ -183,17 +183,37 @@ async def on_text_chunk(event: LLMResponseChunkEvent): @self._agent.llm.events.subscribe async def on_text_complete(event: LLMResponseCompletedEvent): - """Handle end of LLM response - send any remaining buffered text.""" - # Send any remaining buffered text - if self._text_buffer.strip(): - text_to_send = self._text_buffer.strip() - if text_to_send not in self._sent_texts: - await self._send_text_to_heygen(text_to_send) - self._sent_texts.add(text_to_send) - self._text_buffer = "" + """Handle end of LLM response - split into sentences and send each once.""" + if not self._text_buffer.strip(): + return + + # Split the complete response into sentences + import re + text = self._text_buffer.strip() + # Split on sentence boundaries but keep the punctuation + sentences = re.split(r'([.!?]+\s*)', text) + # Recombine sentences with their punctuation + full_sentences = [] + for i in range(0, len(sentences)-1, 2): + if sentences[i].strip(): + sentence = (sentences[i] + sentences[i+1] if i+1 < len(sentences) else sentences[i]).strip() + full_sentences.append(sentence) + # Handle last part if no punctuation + if sentences and sentences[-1].strip() and not any(sentences[-1].strip().endswith(p) for p in ['.', '!', '?']): + full_sentences.append(sentences[-1].strip()) + + # Send each sentence once if not already sent + for sentence in full_sentences: + if sentence and len(sentence) > 5: + if sentence not in self._all_sent_texts: + await self._send_text_to_heygen(sentence) + self._all_sent_texts.add(sentence) + else: + logger.debug(f"Skipping duplicate: '{sentence[:30]}...'") + # Reset for next response + self._text_buffer = "" self._current_response_id = None - self._sent_texts.clear() @self._agent.llm.events.subscribe async def on_agent_speech(event: RealtimeAgentSpeechTranscriptionEvent): @@ -298,19 +318,8 @@ async def _forward_audio_frames(self, source_track: 
Any, dest_track: Any) -> Non # Convert frame to bytes and write to agent's audio track if hasattr(frame, 'to_ndarray'): audio_array = frame.to_ndarray() - - # Convert mono to stereo if needed (agent track expects stereo) - # HeyGen sends mono (shape=(1, samples)), we need interleaved stereo - if audio_array.shape[0] == 1: - # Flatten to 1D array of samples - mono_samples = audio_array.flatten() - - # Create stereo by interleaving each mono sample - stereo_samples = np.repeat(mono_samples, 2) - audio_bytes = stereo_samples.tobytes() - else: - # Already multi-channel, just flatten and convert - audio_bytes = audio_array.flatten().tobytes() + # Pass raw audio data - AudioStreamTrack handles format conversion + audio_bytes = audio_array.tobytes() # Set flag to allow HeyGen audio through the muted track self._forwarding_audio = True @@ -337,8 +346,8 @@ async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> Non async def _on_text_chunk(self, text_delta: str, item_id: Optional[str]) -> None: """Handle text chunk from the LLM. - Accumulates text chunks until a complete sentence or response is ready, - then sends to HeyGen for lip-sync. + Accumulates text chunks. Does NOT send immediately - waits for completion event + to avoid sending partial/duplicate sentences. Args: text_delta: The text chunk/delta from the LLM. @@ -348,26 +357,16 @@ async def _on_text_chunk(self, text_delta: str, item_id: Optional[str]) -> None: if item_id != self._current_response_id: if self._text_buffer: # Send any accumulated text from previous response - await self._send_text_to_heygen(self._text_buffer.strip()) + text_to_send = self._text_buffer.strip() + if text_to_send and text_to_send not in self._all_sent_texts: + await self._send_text_to_heygen(text_to_send) + self._all_sent_texts.add(text_to_send) self._text_buffer = "" self._current_response_id = item_id - self._sent_texts.clear() - # Accumulate text + # Just accumulate text - don't send yet! 
+ # Wait for completion event to avoid sending partial sentences self._text_buffer += text_delta - - # Send when we have a complete sentence (ending with period, !, or ?) - # But only if it's substantial enough (> 15 chars) to avoid sending tiny fragments - # Don't send on commas/semicolons to reduce repetition - if any(self._text_buffer.rstrip().endswith(p) for p in ['.', '!', '?']): - text_to_send = self._text_buffer.strip() - # Only send if it's substantial (>15 chars) and not already sent - if text_to_send and len(text_to_send) > 15 and text_to_send not in self._sent_texts: - await self._send_text_to_heygen(text_to_send) - self._sent_texts.add(text_to_send) - self._text_buffer = "" # Clear buffer after sending - elif text_to_send in self._sent_texts: - self._text_buffer = "" # Clear buffer to avoid re-sending async def _send_text_to_heygen(self, text: str) -> None: """Send text to HeyGen for the avatar to speak with lip-sync. @@ -385,7 +384,6 @@ async def _send_text_to_heygen(self, text: str) -> None: try: logger.info(f"Sending text to HeyGen: '{text[:50]}...'") await self.rtc_manager.send_text(text, task_type="repeat") - logger.debug("Text sent to HeyGen successfully") except Exception as e: logger.error(f"Failed to send text to HeyGen: {e}") import traceback From 74aa6ff9bfd448a1313121ab5075d72b4c7aeaf3 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Mon, 3 Nov 2025 19:15:16 +0100 Subject: [PATCH 09/20] Fix video aspect ratio stretching - add letterboxing --- .../plugins/heygen/heygen_video_track.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py index 38f707ca..9339e1da 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py @@ -94,18 +94,38 @@ async def _receive_frames(self) -> None: 
logger.error(f"Fatal error in frame receiving: {e}") def _resize_frame(self, frame: av.VideoFrame) -> av.VideoFrame: - """Resize a video frame to match the track dimensions. + """Resize a video frame to match the track dimensions while maintaining aspect ratio. Args: frame: Input video frame. Returns: - Resized video frame. + Resized video frame with letterboxing if needed. """ try: img = frame.to_image() - resized = img.resize((self.width, self.height), Image.LANCZOS) - return av.VideoFrame.from_image(resized) + + # Calculate scaling to maintain aspect ratio + src_width, src_height = img.size + target_width, target_height = self.width, self.height + + # Calculate scale factor (fit within target dimensions) + scale = min(target_width / src_width, target_height / src_height) + new_width = int(src_width * scale) + new_height = int(src_height * scale) + + # Resize with aspect ratio maintained + resized = img.resize((new_width, new_height), Image.LANCZOS) + + # Create black background at target resolution + result = Image.new('RGB', (target_width, target_height), (0, 0, 0)) + + # Paste resized image centered + x_offset = (target_width - new_width) // 2 + y_offset = (target_height - new_height) // 2 + result.paste(resized, (x_offset, y_offset)) + + return av.VideoFrame.from_image(result) except Exception as e: logger.error(f"Error resizing frame: {e}") From f54c372786b28aaa199a8596a5bc3991742f0404 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 4 Nov 2025 10:26:47 +0100 Subject: [PATCH 10/20] fixed and simplified both implementations --- plugins/heygen/example/README.md | 86 ++++++++++++--- .../heygen/example/avatar_realtime_example.py | 65 +++++++++++ .../plugins/heygen/heygen_avatar_publisher.py | 104 +++++------------- 3 files changed, 164 insertions(+), 91 deletions(-) create mode 100644 plugins/heygen/example/avatar_realtime_example.py diff --git a/plugins/heygen/example/README.md b/plugins/heygen/example/README.md index 830dd75c..b922e7e2 100644 --- 
a/plugins/heygen/example/README.md +++ b/plugins/heygen/example/README.md @@ -1,6 +1,16 @@ -# HeyGen Avatar Example +# HeyGen Avatar Examples -This example demonstrates how to use the HeyGen plugin to add realistic avatar video to your AI agent. +This directory contains examples of how to use the HeyGen plugin to add realistic avatar video to your AI agent. + +## Examples + +### 1. Standard Streaming LLM (`avatar_example.py`) + +Uses a standard streaming LLM (Gemini) with separate TTS/STT components. Best for traditional text-based LLMs. + +### 2. Realtime LLM (`avatar_realtime_example.py`) + +Uses Gemini Realtime with native audio input/output. The avatar lip-syncs to the transcribed text while Gemini handles voice processing. ## Setup @@ -19,27 +29,40 @@ Copy `.env.example` to `.env` and fill in your API keys: cp .env.example .env ``` -Required API keys: +**For Standard Example** (`avatar_example.py`): - `HEYGEN_API_KEY` - Get from [HeyGen](https://heygen.com) - `STREAM_API_KEY` and `STREAM_SECRET` - Get from [GetStream](https://getstream.io) - `CARTESIA_API_KEY` - Get from [Cartesia](https://cartesia.ai) - `DEEPGRAM_API_KEY` - Get from [Deepgram](https://deepgram.com) - `GOOGLE_API_KEY` - Get from [Google AI Studio](https://makersuite.google.com/app/apikey) -## Running the Example +**For Realtime Example** (`avatar_realtime_example.py`): +- `HEYGEN_API_KEY` - Get from [HeyGen](https://heygen.com) +- `STREAM_API_KEY` and `STREAM_SECRET` - Get from [GetStream](https://getstream.io) +- `GOOGLE_API_KEY` - Get from [Google AI Studio](https://makersuite.google.com/app/apikey) + +## Running the Examples From the project root: +**Standard Streaming LLM:** ```bash uv run plugins/heygen/example/avatar_example.py ``` -This will: +**Realtime LLM:** +```bash +uv run plugins/heygen/example/avatar_realtime_example.py +``` + +Both will: 1. Start an AI agent with a HeyGen avatar 2. Open a demo UI in your browser 3. 
The avatar will speak and be ready to chat -## What's Happening +## How It Works + +### Standard Streaming LLM (`avatar_example.py`) 1. **Agent Setup**: The agent is configured with: - Gemini LLM for generating responses @@ -49,10 +72,10 @@ This will: 2. **Avatar Streaming**: When the agent speaks: - Text is generated by Gemini LLM + - Text is sent to HeyGen for lip-sync - Audio is synthesized by Cartesia TTS - - Audio is sent to HeyGen via WebRTC - HeyGen generates avatar video with lip-sync - - Avatar video is streamed to the call + - Avatar video and audio are streamed to the call 3. **User Interaction**: When you speak: - Audio is captured from your microphone @@ -60,6 +83,24 @@ This will: - Sent to Gemini LLM for processing - Response is generated and spoken through the avatar +### Realtime LLM (`avatar_realtime_example.py`) + +1. **Agent Setup**: The agent is configured with: + - Gemini Realtime for native audio processing + - HeyGen AvatarPublisher for avatar video + +2. **Avatar Streaming**: When the agent speaks: + - Gemini Realtime generates audio directly (24kHz PCM) + - Text transcription is sent to HeyGen for lip-sync + - HeyGen generates avatar video with lip-sync + - Gemini's audio is used (HeyGen audio is not forwarded for Realtime LLMs) + - Avatar video and Gemini audio are streamed to the call + +3. 
**User Interaction**: When you speak: + - Audio is captured and sent directly to Gemini Realtime + - Gemini processes audio natively (no separate STT needed) + - Response is generated and spoken through the avatar + ## Customization ### Using a Different Avatar @@ -87,17 +128,36 @@ heygen.AvatarPublisher( ### Using a Different LLM -Switch to OpenAI's Realtime API: +**With Standard Streaming LLM:** +```python +from vision_agents.plugins import openai, elevenlabs + +agent = Agent( + edge=getstream.Edge(), + agent_user=User(name="Avatar AI"), + instructions="Your instructions here", + llm=openai.LLM("gpt-4"), + tts=elevenlabs.TTS(), + stt=deepgram.STT(), + processors=[ + heygen.AvatarPublisher(avatar_id="default") + ] +) +``` +**With Realtime LLM:** ```python from vision_agents.plugins import openai agent = Agent( - # ... other config ... - llm=openai.Realtime(model="gpt-realtime", voice="alloy"), - # No need for separate TTS/STT with Realtime LLM + edge=getstream.Edge(), + agent_user=User(name="Avatar AI"), + instructions="Your instructions here", + llm=openai.Realtime(model="gpt-4o-realtime-preview"), processors=[ - heygen.AvatarPublisher(avatar_id="default") + heygen.AvatarPublisher( + avatar_id="default" + ) ] ) ``` diff --git a/plugins/heygen/example/avatar_realtime_example.py b/plugins/heygen/example/avatar_realtime_example.py new file mode 100644 index 00000000..695f7b55 --- /dev/null +++ b/plugins/heygen/example/avatar_realtime_example.py @@ -0,0 +1,65 @@ +import asyncio +import logging +from uuid import uuid4 + +from dotenv import load_dotenv + +from vision_agents.core import User, Agent +from vision_agents.plugins import getstream, gemini, heygen + +load_dotenv() + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) + + +async def start_avatar_agent() -> None: + """Start a HeyGen avatar agent with Gemini Realtime LLM. + + This example demonstrates using a HeyGen avatar with a Realtime LLM. 
+ HeyGen provides the lip-synced avatar video based on text transcriptions, + while Gemini Realtime provides the audio directly. + """ + + # Create agent with Gemini Realtime and HeyGen avatar + agent = Agent( + edge=getstream.Edge(), + agent_user=User(name="Avatar AI Assistant"), + instructions=( + "You are a helpful AI assistant with a virtual avatar. " + "Keep responses conversational and natural. " + "Be friendly and engaging." + ), + llm=gemini.Realtime( + model="gemini-2.5-flash-native-audio-preview-09-2025" + ), + processors=[ + heygen.AvatarPublisher( + avatar_id="default", + quality="high", + ) + ], + ) + + # Create a call + call = agent.edge.client.video.call("default", str(uuid4())) + + # Join call first + with await agent.join(call): + # Open demo UI after joining + await agent.edge.open_demo(call) + + # Start the conversation + await agent.llm.simple_response( + text="Hello! I'm your AI assistant. How can I help you today?" + ) + + # Keep running until the call ends + await agent.finish() + + +if __name__ == "__main__": + asyncio.run(start_avatar_agent()) + diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index 6d7467fe..e5e12f03 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -2,7 +2,6 @@ import logging from typing import Optional, Any, Tuple -import numpy as np from getstream.video.rtc import audio_track from vision_agents.core.processors.base_processor import ( @@ -21,10 +20,9 @@ class AvatarPublisher(AudioVideoProcessor, VideoPublisherMixin, AudioPublisherMi """HeyGen avatar video and audio publisher. Publishes video of a HeyGen avatar that lip-syncs based on LLM text output. - Can be used as a processor in the Vision Agents framework to add - realistic avatar video to AI agents. 
- HeyGen handles TTS internally, so no separate TTS is needed. + For standard LLMs: HeyGen provides both video and audio (with TTS). + For Realtime LLMs: HeyGen provides video only; LLM provides audio. Example: agent = Agent( @@ -49,7 +47,6 @@ def __init__( resolution: Tuple[int, int] = (1920, 1080), api_key: Optional[str] = None, interval: int = 0, - mute_llm_audio: bool = True, **kwargs, ): """Initialize the HeyGen avatar publisher. @@ -60,8 +57,6 @@ def __init__( resolution: Output video resolution (width, height). api_key: HeyGen API key. Uses HEYGEN_API_KEY env var if not provided. interval: Processing interval (not used, kept for compatibility). - mute_llm_audio: If True, mutes the Realtime LLM's audio output so only - HeyGen's video (with audio) is heard. Default: True. **kwargs: Additional arguments passed to parent class. """ super().__init__( @@ -75,7 +70,6 @@ def __init__( self.quality = quality self.resolution = resolution self.api_key = api_key - self.mute_llm_audio = mute_llm_audio # WebRTC manager for HeyGen connection self.rtc_manager = HeyGenRTCManager( @@ -106,9 +100,6 @@ def __init__( self._current_response_id: Optional[str] = None self._all_sent_texts: set = set() # Track all sent texts to prevent duplicates - # Audio forwarding state (for selective muting of Realtime LLM audio) - self._forwarding_audio = False - logger.info( f"HeyGen AvatarPublisher initialized " f"(avatar: {avatar_id}, quality: {quality}, resolution: {resolution})" @@ -133,10 +124,6 @@ def _attach_agent(self, agent: Any) -> None: self._agent = agent logger.info("Agent reference set for HeyGen avatar publisher") - # Mute the Realtime LLM's audio if requested - if self.mute_llm_audio: - self._mute_realtime_llm_audio() - # Subscribe to text events immediately when agent is set self._subscribe_to_text_events() @@ -231,50 +218,7 @@ async def on_agent_speech(event: RealtimeAgentSpeechTranscriptionEvent): else: logger.warning("Cannot subscribe to text events - no agent or LLM 
attached yet") except Exception as e: - logger.error(f"Failed to subscribe to text events: {e}") - import traceback - logger.error(traceback.format_exc()) - - def _mute_realtime_llm_audio(self) -> None: - """Mute the Realtime LLM's audio output. - - When using HeyGen, we want HeyGen to handle all audio (with lip-sync), - so we mute the LLM's native audio output to avoid duplicated/overlapping audio. - - This works by intercepting writes to the LLM's output_track and only blocking - writes that come from the LLM itself (not from HeyGen forwarding). - """ - try: - from vision_agents.core.llm.realtime import Realtime - - if not hasattr(self, '_agent') or not self._agent: - logger.warning("Cannot mute LLM audio - no agent set") - return - - if not hasattr(self._agent, 'llm') or not isinstance(self._agent.llm, Realtime): - logger.info("LLM is not a Realtime LLM - no audio to mute") - return - - # Store the original write method - original_write = self._agent.llm.output_track.write - - # Create a selective write method - async def selective_write(audio_data: bytes) -> None: - """Only allow writes from HeyGen forwarding, block LLM writes.""" - if self._forwarding_audio: - # This is from HeyGen - allow it - await original_write(audio_data) - # else: This is from the Realtime LLM - block it - - # Replace the write method - self._agent.llm.output_track.write = selective_write - - logger.info("Muted Realtime LLM audio output (HeyGen will provide audio)") - - except Exception as e: - logger.error(f"Failed to mute LLM audio: {e}") - import traceback - logger.error(traceback.format_exc()) + logger.error(f"Failed to subscribe to text events: {e}", exc_info=True) async def _on_video_track(self, track: Any) -> None: """Callback when video track is received from HeyGen. @@ -291,12 +235,29 @@ async def _on_audio_track(self, track: Any) -> None: HeyGen provides audio with lip-synced TTS. We forward this audio to the agent's audio track so it gets published to the call. 
+ For Realtime LLMs: We DON'T forward HeyGen audio - the LLM generates its own audio. + HeyGen is only used for video lip-sync based on text transcriptions. + Args: track: Incoming audio track from HeyGen's WebRTC connection. """ - logger.info("Received audio track from HeyGen, starting audio forwarding") + logger.info("Received audio track from HeyGen") + + # Check if we're using a Realtime LLM + using_realtime_llm = False + if hasattr(self, '_agent') and self._agent: + from vision_agents.core.llm.realtime import Realtime + if hasattr(self._agent, 'llm') and isinstance(self._agent.llm, Realtime): + using_realtime_llm = True + + if using_realtime_llm: + # For Realtime LLMs, don't forward HeyGen audio - use the LLM's native audio + # HeyGen is only used for lip-synced video based on text transcriptions + logger.info("Using Realtime LLM - skipping HeyGen audio forwarding (using LLM's native audio)") + return - # Forward audio frames from HeyGen to our audio track + # For standard LLMs, forward HeyGen's audio to our audio track + logger.info("Forwarding HeyGen audio to audio track") asyncio.create_task(self._forward_audio_frames(track, self._audio_track)) async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> None: @@ -315,16 +276,10 @@ async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> Non frame = await source_track.recv() frame_count += 1 - # Convert frame to bytes and write to agent's audio track if hasattr(frame, 'to_ndarray'): audio_array = frame.to_ndarray() - # Pass raw audio data - AudioStreamTrack handles format conversion audio_bytes = audio_array.tobytes() - - # Set flag to allow HeyGen audio through the muted track - self._forwarding_audio = True await dest_track.write(audio_bytes) - self._forwarding_audio = False else: logger.warning("Received frame without to_ndarray() method") @@ -332,16 +287,11 @@ async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> Non if "ended" in str(e).lower() 
or "closed" in str(e).lower(): logger.info(f"HeyGen audio track ended (forwarded {frame_count} frames)") break - else: - logger.error(f"Error forwarding audio frame #{frame_count}: {e}") - import traceback - logger.error(traceback.format_exc()) - break + logger.error(f"Error forwarding audio frame: {e}", exc_info=True) + break except Exception as e: - logger.error(f"Error in audio forwarding loop: {e}") - import traceback - logger.error(traceback.format_exc()) + logger.error(f"Error in audio forwarding loop: {e}", exc_info=True) async def _on_text_chunk(self, text_delta: str, item_id: Optional[str]) -> None: """Handle text chunk from the LLM. @@ -385,9 +335,7 @@ async def _send_text_to_heygen(self, text: str) -> None: logger.info(f"Sending text to HeyGen: '{text[:50]}...'") await self.rtc_manager.send_text(text, task_type="repeat") except Exception as e: - logger.error(f"Failed to send text to HeyGen: {e}") - import traceback - logger.error(traceback.format_exc()) + logger.error(f"Failed to send text to HeyGen: {e}", exc_info=True) def publish_video_track(self): """Publish the HeyGen avatar video track. 
From fad9f49b4eb24bb129aeb7d6f4f1bd0920d095f8 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 4 Nov 2025 10:30:11 +0100 Subject: [PATCH 11/20] Fix ruff linting - remove unused imports --- plugins/heygen/tests/test_heygen_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/heygen/tests/test_heygen_plugin.py b/plugins/heygen/tests/test_heygen_plugin.py index 3be160d6..0c73a127 100644 --- a/plugins/heygen/tests/test_heygen_plugin.py +++ b/plugins/heygen/tests/test_heygen_plugin.py @@ -1,5 +1,5 @@ import pytest -from unittest.mock import Mock, AsyncMock, patch +from unittest.mock import patch from vision_agents.plugins.heygen import ( AvatarPublisher, HeyGenVideoTrack, From f03c81d1a335a9fb13f886cd398fc21763bf8eb8 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 4 Nov 2025 10:32:46 +0100 Subject: [PATCH 12/20] Fix HeyGen plugin tests - import paths and mocking --- plugins/heygen/tests/test_heygen_plugin.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/plugins/heygen/tests/test_heygen_plugin.py b/plugins/heygen/tests/test_heygen_plugin.py index 0c73a127..cfb754a8 100644 --- a/plugins/heygen/tests/test_heygen_plugin.py +++ b/plugins/heygen/tests/test_heygen_plugin.py @@ -1,11 +1,9 @@ import pytest from unittest.mock import patch -from vision_agents.plugins.heygen import ( - AvatarPublisher, - HeyGenVideoTrack, - HeyGenRTCManager, - HeyGenSession, -) +from vision_agents.plugins.heygen import AvatarPublisher +from vision_agents.plugins.heygen.heygen_video_track import HeyGenVideoTrack +from vision_agents.plugins.heygen.heygen_rtc_manager import HeyGenRTCManager +from vision_agents.plugins.heygen.heygen_session import HeyGenSession class TestHeyGenSession: @@ -97,6 +95,9 @@ def test_publish_video_track(self): """Test publishing video track.""" with patch.object(HeyGenRTCManager, "__init__", return_value=None): publisher = AvatarPublisher(api_key="test_key") + # Set _connected to True to avoid 
creating async task + publisher._connected = True + publisher._connection_task = None track = publisher.publish_video_track() @@ -110,6 +111,8 @@ def test_state(self): quality="medium", api_key="test_key", ) + # Mock the _connected attribute on the RTC manager + publisher.rtc_manager._connected = False state = publisher.state() From a5be2065202e3d3d0e9f339e75ed12882a9a85b8 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 4 Nov 2025 10:48:42 +0100 Subject: [PATCH 13/20] Fix mypy type errors in HeyGen plugin --- .../vision_agents/plugins/heygen/heygen_rtc_manager.py | 2 ++ .../vision_agents/plugins/heygen/heygen_session.py | 10 +++++----- .../vision_agents/plugins/heygen/heygen_video_track.py | 5 +++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index 6e9876ac..1572512e 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -105,6 +105,8 @@ async def on_track(track: MediaStreamTrack): @self.pc.on("connectionstatechange") async def on_connection_state_change(): + if self.pc is None: + return logger.info(f"HeyGen connection state: {self.pc.connectionState}") if self.pc.connectionState == "connected": self._connected = True diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py index 3565c7a6..1d332891 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py @@ -28,7 +28,7 @@ def __init__( """ self.avatar_id = avatar_id self.quality = quality - self.api_key = api_key or getenv("HEYGEN_API_KEY") + self.api_key: str = api_key or getenv("HEYGEN_API_KEY") or "" if not self.api_key: raise ValueError( @@ -50,7 +50,7 @@ async def create_session(self) -> 
Dict[str, Any]: if not self._http_session: self._http_session = aiohttp.ClientSession() - headers = { + headers: dict[str, str] = { "X-Api-Key": self.api_key, "Content-Type": "application/json", } @@ -98,7 +98,7 @@ async def start_session(self, sdp_answer: Optional[str] = None) -> Dict[str, Any if not self._http_session: self._http_session = aiohttp.ClientSession() - headers = { + headers: dict[str, str] = { "X-Api-Key": self.api_key, "Content-Type": "application/json", } @@ -154,7 +154,7 @@ async def send_task(self, text: str, task_type: str = "repeat") -> Dict[str, Any if not self._http_session: self._http_session = aiohttp.ClientSession() - headers = { + headers: dict[str, str] = { "X-Api-Key": self.api_key, "Content-Type": "application/json", } @@ -195,7 +195,7 @@ async def stop_session(self) -> None: if not self._http_session: return - headers = { + headers: dict[str, str] = { "X-Api-Key": self.api_key, "Content-Type": "application/json", } diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py index 9339e1da..9f890b95 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py @@ -71,7 +71,8 @@ async def _receive_frames(self) -> None: # Receive frame from HeyGen frame = await self._source_track.recv() - if frame: + # Type check: ensure we have a VideoFrame + if frame and isinstance(frame, av.VideoFrame): # Resize if needed if frame.width != self.width or frame.height != self.height: frame = self._resize_frame(frame) @@ -115,7 +116,7 @@ def _resize_frame(self, frame: av.VideoFrame) -> av.VideoFrame: new_height = int(src_height * scale) # Resize with aspect ratio maintained - resized = img.resize((new_width, new_height), Image.LANCZOS) + resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS) # Create black background at target resolution result = Image.new('RGB', 
(target_width, target_height), (0, 0, 0)) From d6d66bf74ed0a92b89e0be41c36f0705ab5031f3 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 4 Nov 2025 10:50:51 +0100 Subject: [PATCH 14/20] Allow reattaching to new HeyGen video tracks on renegotiation --- .../vision_agents/plugins/heygen/heygen_video_track.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py index 9f890b95..e74a4c23 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py @@ -52,8 +52,14 @@ async def start_receiving(self, source_track: MediaStreamTrack) -> None: source_track: The incoming video track from HeyGen's WebRTC connection. """ if self._receiving_task: - logger.warning("Already receiving frames from HeyGen") - return + logger.info("Restarting HeyGen video receiver with new source track") + self._receiving_task.cancel() + try: + await self._receiving_task + except asyncio.CancelledError: + pass + self._receiving_task = None + self._source_track = None self._source_track = source_track self._receiving_task = asyncio.create_task(self._receive_frames()) From f7a2f37184330a3ff934af75280f7b7b9c6916df Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 4 Nov 2025 14:59:42 -0700 Subject: [PATCH 15/20] Migrate quality to enum --- plugins/heygen/README.md | 9 ++++++--- plugins/heygen/example/README.md | 8 ++++++-- plugins/heygen/example/avatar_example.py | 3 ++- .../heygen/example/avatar_realtime_example.py | 3 ++- plugins/heygen/tests/test_heygen_plugin.py | 16 ++++++++-------- .../vision_agents/plugins/heygen/__init__.py | 3 ++- .../plugins/heygen/heygen_avatar_publisher.py | 16 +++++++++++++--- .../plugins/heygen/heygen_rtc_manager.py | 11 +++++++++-- .../plugins/heygen/heygen_session.py | 10 ++++++++-- 9 files changed, 56 
insertions(+), 23 deletions(-) diff --git a/plugins/heygen/README.md b/plugins/heygen/README.md index 0ae26514..f8b53b77 100644 --- a/plugins/heygen/README.md +++ b/plugins/heygen/README.md @@ -31,6 +31,7 @@ from dotenv import load_dotenv from vision_agents.core import User, Agent from vision_agents.plugins import cartesia, deepgram, getstream, gemini, heygen +from vision_agents.plugins.heygen import VideoQuality load_dotenv() @@ -48,7 +49,7 @@ async def start_avatar_agent(): processors=[ heygen.AvatarPublisher( avatar_id="default", - quality="high" + quality=VideoQuality.HIGH ) ] ) @@ -77,9 +78,11 @@ HEYGEN_API_KEY=your_heygen_api_key_here ### AvatarPublisher Options ```python +from vision_agents.plugins.heygen import VideoQuality + heygen.AvatarPublisher( avatar_id="default", # HeyGen avatar ID - quality="high", # Video quality: "low", "medium", "high" + quality=VideoQuality.HIGH, # Video quality: VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH resolution=(1920, 1080), # Output resolution (width, height) api_key=None, # Optional: override env var ) @@ -159,7 +162,7 @@ If you experience connection problems: To optimize video quality: -- Use `quality="high"` for best results +- Use `quality=VideoQuality.HIGH` for best results - Increase resolution if bandwidth allows - Ensure stable internet connection diff --git a/plugins/heygen/example/README.md b/plugins/heygen/example/README.md index b922e7e2..a9206171 100644 --- a/plugins/heygen/example/README.md +++ b/plugins/heygen/example/README.md @@ -108,9 +108,11 @@ Both will: Get your avatar ID from HeyGen dashboard and update: ```python +from vision_agents.plugins.heygen import VideoQuality + heygen.AvatarPublisher( avatar_id="your_avatar_id_here", - quality="high" + quality=VideoQuality.HIGH ) ``` @@ -119,9 +121,11 @@ heygen.AvatarPublisher( Choose quality based on your bandwidth: ```python +from vision_agents.plugins.heygen import VideoQuality + heygen.AvatarPublisher( avatar_id="default", - 
quality="low", # Options: "low", "medium", "high" + quality=VideoQuality.LOW, # Options: VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH resolution=(1280, 720) # Lower resolution for better performance ) ``` diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index 1683491b..a07e77cd 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -4,6 +4,7 @@ from vision_agents.core import User, Agent from vision_agents.plugins import getstream, gemini, heygen, deepgram +from vision_agents.plugins.heygen import VideoQuality load_dotenv() @@ -43,7 +44,7 @@ async def start_avatar_agent() -> None: processors=[ heygen.AvatarPublisher( avatar_id="default", # Use your HeyGen avatar ID - quality="high", # Video quality: "low", "medium", "high" + quality=VideoQuality.HIGH, # Video quality: VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH resolution=(1920, 1080), # Output resolution mute_llm_audio=False, # Not needed for streaming LLM ) diff --git a/plugins/heygen/example/avatar_realtime_example.py b/plugins/heygen/example/avatar_realtime_example.py index 695f7b55..a851064b 100644 --- a/plugins/heygen/example/avatar_realtime_example.py +++ b/plugins/heygen/example/avatar_realtime_example.py @@ -6,6 +6,7 @@ from vision_agents.core import User, Agent from vision_agents.plugins import getstream, gemini, heygen +from vision_agents.plugins.heygen import VideoQuality load_dotenv() @@ -38,7 +39,7 @@ async def start_avatar_agent() -> None: processors=[ heygen.AvatarPublisher( avatar_id="default", - quality="high", + quality=VideoQuality.HIGH, ) ], ) diff --git a/plugins/heygen/tests/test_heygen_plugin.py b/plugins/heygen/tests/test_heygen_plugin.py index cfb754a8..d3aa6c2c 100644 --- a/plugins/heygen/tests/test_heygen_plugin.py +++ b/plugins/heygen/tests/test_heygen_plugin.py @@ -1,6 +1,6 @@ import pytest from unittest.mock import patch -from vision_agents.plugins.heygen 
import AvatarPublisher +from vision_agents.plugins.heygen import AvatarPublisher, VideoQuality from vision_agents.plugins.heygen.heygen_video_track import HeyGenVideoTrack from vision_agents.plugins.heygen.heygen_rtc_manager import HeyGenRTCManager from vision_agents.plugins.heygen.heygen_session import HeyGenSession @@ -13,12 +13,12 @@ def test_init_with_api_key(self): """Test initialization with explicit API key.""" session = HeyGenSession( avatar_id="test_avatar", - quality="high", + quality=VideoQuality.HIGH, api_key="test_key", ) assert session.avatar_id == "test_avatar" - assert session.quality == "high" + assert session.quality == VideoQuality.HIGH assert session.api_key == "test_key" def test_init_without_api_key_raises(self): @@ -55,7 +55,7 @@ def test_init(self): with patch.object(HeyGenSession, "__init__", return_value=None): manager = HeyGenRTCManager( avatar_id="test_avatar", - quality="medium", + quality=VideoQuality.MEDIUM, api_key="test_key", ) @@ -81,13 +81,13 @@ def test_init(self): with patch.object(HeyGenRTCManager, "__init__", return_value=None): publisher = AvatarPublisher( avatar_id="test_avatar", - quality="high", + quality=VideoQuality.HIGH, resolution=(1920, 1080), api_key="test_key", ) assert publisher.avatar_id == "test_avatar" - assert publisher.quality == "high" + assert publisher.quality == VideoQuality.HIGH assert publisher.resolution == (1920, 1080) assert not publisher._connected @@ -108,7 +108,7 @@ def test_state(self): with patch.object(HeyGenRTCManager, "__init__", return_value=None): publisher = AvatarPublisher( avatar_id="test_avatar", - quality="medium", + quality=VideoQuality.MEDIUM, api_key="test_key", ) # Mock the _connected attribute on the RTC manager @@ -117,7 +117,7 @@ def test_state(self): state = publisher.state() assert state["avatar_id"] == "test_avatar" - assert state["quality"] == "medium" + assert state["quality"] == VideoQuality.MEDIUM assert "connected" in state assert "rtc_connected" in state diff --git 
a/plugins/heygen/vision_agents/plugins/heygen/__init__.py b/plugins/heygen/vision_agents/plugins/heygen/__init__.py index ef7db7ba..98d608cd 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/__init__.py +++ b/plugins/heygen/vision_agents/plugins/heygen/__init__.py @@ -4,9 +4,10 @@ allowing AI agents to have realistic avatar video output with lip-sync. """ -from .heygen_avatar_publisher import AvatarPublisher +from .heygen_avatar_publisher import AvatarPublisher, VideoQuality __all__ = [ "AvatarPublisher", + "VideoQuality", ] diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index e5e12f03..fc2ffe00 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -1,5 +1,15 @@ import asyncio import logging +from enum import Enum + +# Define VideoQuality enum FIRST before any other imports to avoid circular import issues +class VideoQuality(str, Enum): + """Video quality options for HeyGen avatar streaming.""" + + LOW = "low" + MEDIUM = "medium" + HIGH = "high" + from typing import Optional, Any, Tuple from getstream.video.rtc import audio_track @@ -34,7 +44,7 @@ class AvatarPublisher(AudioVideoProcessor, VideoPublisherMixin, AudioPublisherMi processors=[ heygen.AvatarPublisher( avatar_id="default", - quality="high" + quality=heygen.VideoQuality.HIGH ) ] ) @@ -43,7 +53,7 @@ class AvatarPublisher(AudioVideoProcessor, VideoPublisherMixin, AudioPublisherMi def __init__( self, avatar_id: str = "default", - quality: str = "high", + quality: VideoQuality = VideoQuality.HIGH, resolution: Tuple[int, int] = (1920, 1080), api_key: Optional[str] = None, interval: int = 0, @@ -53,7 +63,7 @@ def __init__( Args: avatar_id: HeyGen avatar ID to use for streaming. - quality: Video quality ("low", "medium", "high"). 
+ quality: Video quality (VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH). resolution: Output video resolution (width, height). api_key: HeyGen API key. Uses HEYGEN_API_KEY env var if not provided. interval: Processing interval (not used, kept for compatibility). diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index 1572512e..67fc9e19 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -10,7 +10,10 @@ MediaStreamTrack, ) +from typing import Optional, Callable, Any + from .heygen_session import HeyGenSession +from .heygen_avatar_publisher import VideoQuality logger = logging.getLogger(__name__) @@ -25,16 +28,20 @@ class HeyGenRTCManager: def __init__( self, avatar_id: str = "default", - quality: str = "high", + quality: "VideoQuality" = None, api_key: Optional[str] = None, ): """Initialize the RTC manager. Args: avatar_id: HeyGen avatar ID to use. - quality: Video quality setting ("low", "medium", "high"). + quality: Video quality setting (VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH). api_key: HeyGen API key (uses HEYGEN_API_KEY env var if not provided). 
""" + # Default to HIGH if not provided + if quality is None: + quality = VideoQuality.HIGH + self.session_manager = HeyGenSession( avatar_id=avatar_id, quality=quality, diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py index 1d332891..9aa5187d 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py @@ -3,6 +3,8 @@ from os import getenv import aiohttp +from .heygen_avatar_publisher import VideoQuality + logger = logging.getLogger(__name__) @@ -16,16 +18,20 @@ class HeyGenSession: def __init__( self, avatar_id: str = "default", - quality: str = "high", + quality: "VideoQuality" = None, api_key: Optional[str] = None, ): """Initialize HeyGen session manager. Args: avatar_id: HeyGen avatar ID to use for streaming. - quality: Video quality setting ("low", "medium", "high"). + quality: Video quality setting (VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH). api_key: HeyGen API key. Uses HEYGEN_API_KEY env var if not provided. 
""" + # Default to HIGH if not provided + if quality is None: + quality = VideoQuality.HIGH + self.avatar_id = avatar_id self.quality = quality self.api_key: str = api_key or getenv("HEYGEN_API_KEY") or "" From 0b4894af5bae82c9455ae82ec341d24117b67b82 Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 4 Nov 2025 15:09:02 -0700 Subject: [PATCH 16/20] Ruff and Mypy --- .../heygen/vision_agents/plugins/heygen/__init__.py | 3 ++- .../plugins/heygen/heygen_avatar_publisher.py | 11 +---------- .../plugins/heygen/heygen_rtc_manager.py | 6 ++---- .../vision_agents/plugins/heygen/heygen_session.py | 8 ++------ .../vision_agents/plugins/heygen/heygen_types.py | 12 ++++++++++++ 5 files changed, 19 insertions(+), 21 deletions(-) create mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_types.py diff --git a/plugins/heygen/vision_agents/plugins/heygen/__init__.py b/plugins/heygen/vision_agents/plugins/heygen/__init__.py index 98d608cd..e5dd68f6 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/__init__.py +++ b/plugins/heygen/vision_agents/plugins/heygen/__init__.py @@ -4,7 +4,8 @@ allowing AI agents to have realistic avatar video output with lip-sync. 
""" -from .heygen_avatar_publisher import AvatarPublisher, VideoQuality +from .heygen_avatar_publisher import AvatarPublisher +from .heygen_types import VideoQuality __all__ = [ "AvatarPublisher", diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index fc2ffe00..fd571fbf 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -1,15 +1,5 @@ import asyncio import logging -from enum import Enum - -# Define VideoQuality enum FIRST before any other imports to avoid circular import issues -class VideoQuality(str, Enum): - """Video quality options for HeyGen avatar streaming.""" - - LOW = "low" - MEDIUM = "medium" - HIGH = "high" - from typing import Optional, Any, Tuple from getstream.video.rtc import audio_track @@ -21,6 +11,7 @@ class VideoQuality(str, Enum): ) from .heygen_rtc_manager import HeyGenRTCManager +from .heygen_types import VideoQuality from .heygen_video_track import HeyGenVideoTrack logger = logging.getLogger(__name__) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index 67fc9e19..e91ba6c8 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -10,10 +10,8 @@ MediaStreamTrack, ) -from typing import Optional, Callable, Any - from .heygen_session import HeyGenSession -from .heygen_avatar_publisher import VideoQuality +from .heygen_types import VideoQuality logger = logging.getLogger(__name__) @@ -28,7 +26,7 @@ class HeyGenRTCManager: def __init__( self, avatar_id: str = "default", - quality: "VideoQuality" = None, + quality: Optional["VideoQuality"] = VideoQuality.HIGH, api_key: Optional[str] = None, ): """Initialize the RTC manager. 
diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py index 9aa5187d..c73c8648 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py @@ -3,7 +3,7 @@ from os import getenv import aiohttp -from .heygen_avatar_publisher import VideoQuality +from .heygen_types import VideoQuality logger = logging.getLogger(__name__) @@ -18,7 +18,7 @@ class HeyGenSession: def __init__( self, avatar_id: str = "default", - quality: "VideoQuality" = None, + quality: VideoQuality = VideoQuality.HIGH, api_key: Optional[str] = None, ): """Initialize HeyGen session manager. @@ -28,10 +28,6 @@ def __init__( quality: Video quality setting (VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH). api_key: HeyGen API key. Uses HEYGEN_API_KEY env var if not provided. """ - # Default to HIGH if not provided - if quality is None: - quality = VideoQuality.HIGH - self.avatar_id = avatar_id self.quality = quality self.api_key: str = api_key or getenv("HEYGEN_API_KEY") or "" diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_types.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_types.py new file mode 100644 index 00000000..f7981db9 --- /dev/null +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_types.py @@ -0,0 +1,12 @@ +"""Type definitions for HeyGen plugin.""" + +from enum import Enum + + +class VideoQuality(str, Enum): + """Video quality options for HeyGen avatar streaming.""" + + LOW = "low" + MEDIUM = "medium" + HIGH = "high" + From 4bafa667685892dd7dba8822c5c9e7d017056b5d Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 4 Nov 2025 15:21:42 -0700 Subject: [PATCH 17/20] More ruff issues --- plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py | 2 +- plugins/openai/vision_agents/plugins/openai/openai_realtime.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) 
diff --git a/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py b/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py index 3106e596..14d54713 100644 --- a/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py +++ b/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py @@ -152,7 +152,7 @@ async def simple_audio_response( audio_bytes = pcm.resample( target_sample_rate=16000, target_channels=1 ).samples.tobytes() - mime = f"audio/pcm;rate=16000" + mime = "audio/pcm;rate=16000" blob = Blob(data=audio_bytes, mime_type=mime) await self._require_session().send_realtime_input(audio=blob) diff --git a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py index dfa30f75..fa2d67d0 100644 --- a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py +++ b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py @@ -6,7 +6,7 @@ RealtimeSessionCreateRequestParam, ResponseAudioTranscriptDoneEvent, InputAudioBufferSpeechStartedEvent, - ConversationItemInputAudioTranscriptionCompletedEvent, SessionUpdatedEvent, ResponseCreatedEvent, ResponseDoneEvent, + ConversationItemInputAudioTranscriptionCompletedEvent, ResponseCreatedEvent, ResponseDoneEvent, ) from vision_agents.core.llm import realtime From f5a1aaa14644f113655e9086a3653b568166ad77 Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 4 Nov 2025 15:23:35 -0700 Subject: [PATCH 18/20] Fix broken method sigs --- .../vision_agents/plugins/heygen/heygen_avatar_publisher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index fd571fbf..7fa18c26 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -88,7 +88,7 @@ def 
__init__( # Audio track for publishing HeyGen's audio # Create it immediately so the agent can detect it during initialization self._audio_track = audio_track.AudioStreamTrack( - framerate=48000, stereo=True + sample_rate=48000, channels=2, format="s16" ) # Connection state From 3f5e2037377682d006734da067a819d14094dd08 Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 4 Nov 2025 15:52:17 -0700 Subject: [PATCH 19/20] Unused var --- plugins/openai/vision_agents/plugins/openai/openai_realtime.py | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py index fa2d67d0..c6249bf5 100644 --- a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py +++ b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py @@ -241,7 +241,6 @@ async def _handle_openai_event(self, event: dict) -> None: # Handle tool calls from OpenAI realtime await self._handle_tool_call_event(event) elif et == "response.created": - e = ResponseCreatedEvent(**event) pass elif et == "response.done": logger.info("OpenAI response done %s", event) From 12cad153dc62f9e9d38b0dd054ed02e93790d59a Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 4 Nov 2025 16:21:41 -0700 Subject: [PATCH 20/20] final ruff error --- plugins/openai/vision_agents/plugins/openai/openai_realtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py index c6249bf5..fb1efcb2 100644 --- a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py +++ b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py @@ -6,7 +6,7 @@ RealtimeSessionCreateRequestParam, ResponseAudioTranscriptDoneEvent, InputAudioBufferSpeechStartedEvent, - ConversationItemInputAudioTranscriptionCompletedEvent, ResponseCreatedEvent, ResponseDoneEvent, 
+ ConversationItemInputAudioTranscriptionCompletedEvent, ResponseDoneEvent, ) from vision_agents.core.llm import realtime