From 9675a1449a7a31c289398e5d2a74498e47503661 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Mon, 27 Oct 2025 15:40:28 +0100 Subject: [PATCH 01/20] implemented heygen avatars --- agents-core/pyproject.toml | 2 + plugins/heygen/README.md | 181 ++++++++++++++ plugins/heygen/example/README.md | 126 ++++++++++ plugins/heygen/example/__init__.py | 0 plugins/heygen/example/avatar_example.py | 74 ++++++ plugins/heygen/example/pyproject.toml | 25 ++ plugins/heygen/py.typed | 0 plugins/heygen/pyproject.toml | 41 +++ plugins/heygen/tests/__init__.py | 0 plugins/heygen/tests/test_heygen_plugin.py | 120 +++++++++ .../vision_agents/plugins/heygen/__init__.py | 12 + .../plugins/heygen/heygen_avatar_publisher.py | 171 +++++++++++++ .../plugins/heygen/heygen_rtc_manager.py | 235 ++++++++++++++++++ .../plugins/heygen/heygen_session.py | 181 ++++++++++++++ .../plugins/heygen/heygen_video_track.py | 161 ++++++++++++ pyproject.toml | 4 +- uv.lock | 56 ++++- 17 files changed, 1387 insertions(+), 2 deletions(-) create mode 100644 plugins/heygen/README.md create mode 100644 plugins/heygen/example/README.md create mode 100644 plugins/heygen/example/__init__.py create mode 100644 plugins/heygen/example/avatar_example.py create mode 100644 plugins/heygen/example/pyproject.toml create mode 100644 plugins/heygen/py.typed create mode 100644 plugins/heygen/pyproject.toml create mode 100644 plugins/heygen/tests/__init__.py create mode 100644 plugins/heygen/tests/test_heygen_plugin.py create mode 100644 plugins/heygen/vision_agents/plugins/heygen/__init__.py create mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py create mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py create mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_session.py create mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py diff --git a/agents-core/pyproject.toml b/agents-core/pyproject.toml index b817ceb2..f66da592 100644 
--- a/agents-core/pyproject.toml +++ b/agents-core/pyproject.toml @@ -41,6 +41,7 @@ deepgram = ["vision-agents-plugins-deepgram"] elevenlabs = ["vision-agents-plugins-elevenlabs"] gemini = ["vision-agents-plugins-gemini"] getstream = ["vision-agents-plugins-getstream"] +heygen = ["vision-agents-plugins-heygen"] kokoro = ["vision-agents-plugins-kokoro"] krisp = ["vision-agents-plugins-krisp"] moonshine = ["vision-agents-plugins-moonshine"] @@ -57,6 +58,7 @@ all-plugins = [ "vision-agents-plugins-elevenlabs", "vision-agents-plugins-gemini", "vision-agents-plugins-getstream", + "vision-agents-plugins-heygen", "vision-agents-plugins-kokoro", "vision-agents-plugins-krisp", "vision-agents-plugins-moonshine", diff --git a/plugins/heygen/README.md b/plugins/heygen/README.md new file mode 100644 index 00000000..2dc19e34 --- /dev/null +++ b/plugins/heygen/README.md @@ -0,0 +1,181 @@ +# HeyGen Avatar Plugin for Vision Agents + +Add realistic avatar video to your AI agents using HeyGen's streaming avatar API. 
+ +## Features + +- 🎭 **Realistic Avatars**: Use HeyGen's high-quality avatars with natural movements +- 🎤 **Automatic Lip-Sync**: Avatar automatically syncs with audio from any TTS provider +- 🚀 **WebRTC Streaming**: Low-latency real-time video streaming via WebRTC +- 🔌 **Easy Integration**: Works seamlessly with Vision Agents framework +- 🎨 **Customizable**: Configure avatar, quality, resolution, and more + +## Installation + +```bash +pip install vision-agents-plugins-heygen +``` + +Or with uv: + +```bash +uv pip install vision-agents-plugins-heygen +``` + +## Quick Start + +```python +import asyncio +from uuid import uuid4 +from dotenv import load_dotenv + +from vision_agents.core import User, Agent +from vision_agents.plugins import cartesia, deepgram, getstream, gemini, heygen + +load_dotenv() + +async def start_avatar_agent(): + agent = Agent( + edge=getstream.Edge(), + agent_user=User(name="AI Assistant with Avatar", id="agent"), + instructions="You're a friendly AI assistant.", + + llm=gemini.LLM("gemini-2.0-flash"), + tts=cartesia.TTS(), + stt=deepgram.STT(), + + # Add HeyGen avatar + processors=[ + heygen.AvatarPublisher( + avatar_id="default", + quality="high" + ) + ] + ) + + call = agent.edge.client.video.call("default", str(uuid4())) + + with await agent.join(call): + await agent.edge.open_demo(call) + await agent.simple_response("Hello! 
I'm your AI assistant with an avatar.") + await agent.finish() + +if __name__ == "__main__": + asyncio.run(start_avatar_agent()) +``` + +## Configuration + +### Environment Variables + +Set your HeyGen API key: + +```bash +HEYGEN_API_KEY=your_heygen_api_key_here +``` + +### AvatarPublisher Options + +```python +heygen.AvatarPublisher( + avatar_id="default", # HeyGen avatar ID + quality="high", # Video quality: "low", "medium", "high" + resolution=(1920, 1080), # Output resolution (width, height) + api_key=None, # Optional: override env var +) +``` + +## Usage Examples + +### With Realtime LLM + +```python +from vision_agents.plugins import gemini, heygen, getstream + +agent = Agent( + edge=getstream.Edge(), + agent_user=User(name="Realtime Avatar AI"), + instructions="Be conversational and responsive.", + + llm=gemini.Realtime(fps=2), # No separate TTS needed + + processors=[ + heygen.AvatarPublisher(avatar_id="professional_presenter") + ] +) +``` + +### With Multiple Processors + +```python +from vision_agents.plugins import ultralytics, heygen + +agent = Agent( + edge=getstream.Edge(), + agent_user=User(name="Fitness Coach"), + instructions="Analyze user poses and provide feedback.", + + llm=gemini.Realtime(fps=3), + + processors=[ + # Process incoming user video + ultralytics.YOLOPoseProcessor(model_path="yolo11n-pose.pt"), + # Publish avatar video + heygen.AvatarPublisher(avatar_id="fitness_trainer") + ] +) +``` + +## How It Works + +1. **Connection**: Establishes WebRTC connection to HeyGen's streaming API +2. **Audio Input**: Receives audio from your TTS provider or Realtime LLM +3. **Avatar Generation**: HeyGen generates avatar video with lip-sync +4. **Video Streaming**: Streams avatar video to call participants via GetStream Edge + +## Requirements + +- Python 3.10+ +- HeyGen API key (get one at [heygen.com](https://heygen.com)) +- GetStream account for video calls +- TTS provider (Cartesia, ElevenLabs, etc.) 
or Realtime LLM + +## Troubleshooting + +### Connection Issues + +If you experience connection problems: + +1. Check your HeyGen API key is valid +2. Ensure you have network access to HeyGen's servers +3. Check firewall settings for WebRTC traffic + +### Video Quality + +To optimize video quality: + +- Use `quality="high"` for best results +- Increase resolution if bandwidth allows +- Ensure stable internet connection + +## API Reference + +### AvatarPublisher + +Main class for publishing HeyGen avatar video. + +**Methods:** +- `publish_video_track()`: Returns video track for streaming +- `state()`: Returns current state information +- `close()`: Clean up resources + +## License + +MIT + +## Links + +- [Documentation](https://visionagents.ai/) +- [GitHub](https://github.com/GetStream/Vision-Agents) +- [HeyGen API Docs](https://docs.heygen.com/docs/streaming-api) + diff --git a/plugins/heygen/example/README.md b/plugins/heygen/example/README.md new file mode 100644 index 00000000..631d3309 --- /dev/null +++ b/plugins/heygen/example/README.md @@ -0,0 +1,126 @@ +# HeyGen Avatar Example + +This example demonstrates how to use the HeyGen plugin to add realistic avatar video to your AI agent. + +## Setup + +1. **Install dependencies:** + +```bash +cd plugins/heygen/example +uv pip install -e . +``` + +2. **Configure environment variables:** + +Copy `.env.example` to `.env` and fill in your API keys: + +```bash +cp .env.example .env +``` + +Required API keys: +- `HEYGEN_API_KEY` - Get from [HeyGen](https://heygen.com) +- `STREAM_API_KEY` and `STREAM_SECRET` - Get from [GetStream](https://getstream.io) +- `CARTESIA_API_KEY` - Get from [Cartesia](https://cartesia.ai) +- `DEEPGRAM_API_KEY` - Get from [Deepgram](https://deepgram.com) +- `GOOGLE_API_KEY` - Get from [Google AI Studio](https://makersuite.google.com/app/apikey) + +## Running the Example + +```bash +uv run avatar_example.py +``` + +This will: +1. Start an AI agent with a HeyGen avatar +2. 
Open a demo UI in your browser +3. The avatar will greet you and be ready to chat + +## What's Happening + +1. **Agent Setup**: The agent is configured with: + - Gemini LLM for generating responses + - Cartesia TTS for speech synthesis + - Deepgram STT for speech recognition + - HeyGen AvatarPublisher for avatar video + +2. **Avatar Streaming**: When the agent speaks: + - Text is generated by Gemini LLM + - Audio is synthesized by Cartesia TTS + - Audio is sent to HeyGen via WebRTC + - HeyGen generates avatar video with lip-sync + - Avatar video is streamed to the call + +3. **User Interaction**: When you speak: + - Audio is captured from your microphone + - Transcribed to text by Deepgram + - Sent to Gemini LLM for processing + - Response is generated and spoken through the avatar + +## Customization + +### Using a Different Avatar + +Get your avatar ID from HeyGen dashboard and update: + +```python +heygen.AvatarPublisher( + avatar_id="your_avatar_id_here", + quality="high" +) +``` + +### Adjusting Video Quality + +Choose quality based on your bandwidth: + +```python +heygen.AvatarPublisher( + avatar_id="default", + quality="low", # Options: "low", "medium", "high" + resolution=(1280, 720) # Lower resolution for better performance +) +``` + +### Using a Different LLM + +Switch to OpenAI's Realtime API: + +```python +from vision_agents.plugins import openai + +agent = Agent( + # ... other config ... + llm=openai.Realtime(model="gpt-realtime", voice="alloy"), + # No need for separate TTS/STT with Realtime LLM + processors=[ + heygen.AvatarPublisher(avatar_id="default") + ] +) +``` + +## Troubleshooting + +### "HeyGen API key required" Error + +Make sure `HEYGEN_API_KEY` is set in your `.env` file. 
+ +### Connection Timeout + +- Check your internet connection +- Verify HeyGen API key is valid +- Ensure firewall allows WebRTC traffic + +### No Video Appearing + +- Check browser console for errors +- Verify GetStream credentials are correct +- Try lowering video quality settings + +## Learn More + +- [HeyGen API Documentation](https://docs.heygen.com/docs/streaming-api) +- [Vision Agents Documentation](https://visionagents.ai/) +- [GetStream Video Documentation](https://getstream.io/video/docs/) + diff --git a/plugins/heygen/example/__init__.py b/plugins/heygen/example/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py new file mode 100644 index 00000000..5d021b41 --- /dev/null +++ b/plugins/heygen/example/avatar_example.py @@ -0,0 +1,74 @@ +import asyncio +from uuid import uuid4 +from dotenv import load_dotenv + +from vision_agents.core import User, Agent +from vision_agents.plugins import kokoro, deepgram, getstream, smart_turn, gemini, heygen + +load_dotenv() + + +async def start_avatar_agent() -> None: + """Start an agent with HeyGen avatar. + + This example demonstrates how to use HeyGen's avatar streaming + to add realistic avatar video to your AI agent. The avatar will + lip-sync with the audio generated by the TTS. + """ + + # Create LLM + llm = gemini.LLM("gemini-2.0-flash") + + # Create agent with HeyGen avatar + agent = Agent( + edge=getstream.Edge(), + agent_user=User( + name="AI Assistant with Avatar", + id="agent" + ), + instructions=( + "You're a friendly and helpful AI assistant. " + "Keep your responses conversational and engaging. " + "Don't use special characters or formatting." 
+ ), + + # LLM and speech components + llm=llm, + tts=kokoro.TTS(), # Using Kokoro (free, local TTS) + stt=deepgram.STT(), + turn_detection=smart_turn.TurnDetection( + buffer_duration=2.0, + confidence_threshold=0.5 + ), + + # Add HeyGen avatar as a video publisher + processors=[ + heygen.AvatarPublisher( + avatar_id="default", # Use your HeyGen avatar ID + quality="high", # Video quality: "low", "medium", "high" + resolution=(1920, 1080), # Output resolution + ) + ] + ) + + # Create a call + call = agent.edge.client.video.call("default", str(uuid4())) + + # Join the call + with await agent.join(call): + # Open demo UI + await agent.edge.open_demo(call) + + # Greet the user through the avatar + await agent.simple_response( + "Hello! I'm your AI assistant with an avatar. " + "How can I help you today?" + ) + + # Keep the call running + await agent.finish() + + +if __name__ == "__main__": + asyncio.run(start_avatar_agent()) + diff --git a/plugins/heygen/example/pyproject.toml b/plugins/heygen/example/pyproject.toml new file mode 100644 index 00000000..83fd9bdd --- /dev/null +++ b/plugins/heygen/example/pyproject.toml @@ -0,0 +1,25 @@ +[project] +name = "heygen-avatar-example" +version = "0.1.0" +description = "Example using HeyGen avatar with Vision Agents" +requires-python = ">=3.10" +dependencies = [ + "vision-agents", + "vision-agents-plugins-heygen", + "vision-agents-plugins-kokoro", + "vision-agents-plugins-deepgram", + "vision-agents-plugins-gemini", + "vision-agents-plugins-getstream", + "vision-agents-plugins-smart-turn", + "python-dotenv", +] + +[tool.uv.sources] +vision-agents = { workspace = true } +vision-agents-plugins-heygen = { workspace = true } +vision-agents-plugins-kokoro = { workspace = true } +vision-agents-plugins-deepgram = { workspace = true } +vision-agents-plugins-gemini = { workspace = true } +vision-agents-plugins-getstream = { workspace = true } +vision-agents-plugins-smart-turn = { workspace = true } + diff --git 
a/plugins/heygen/py.typed b/plugins/heygen/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/plugins/heygen/pyproject.toml b/plugins/heygen/pyproject.toml new file mode 100644 index 00000000..b152460d --- /dev/null +++ b/plugins/heygen/pyproject.toml @@ -0,0 +1,41 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "vision-agents-plugins-heygen" +version = "0.1.0" +description = "HeyGen avatar plugin for Vision Agents" +readme = "README.md" +requires-python = ">=3.10" +license = "MIT" +dependencies = [ + "vision-agents", + "aiortc>=1.9.0", + "aiohttp>=3.9.0", +] + +[project.urls] +Documentation = "https://visionagents.ai/" +Website = "https://visionagents.ai/" +Source = "https://github.com/GetStream/Vision-Agents" + +[tool.hatch.version] +source = "vcs" +raw-options = { root = "..", search_parent_directories = true, fallback_version = "0.0.0" } + +[tool.hatch.build.targets.wheel] +packages = ["vision_agents"] + +[tool.hatch.build.targets.sdist] +include = ["/vision_agents"] + +[tool.uv.sources] +vision-agents = { workspace = true } + +[dependency-groups] +dev = [ + "pytest>=8.4.1", + "pytest-asyncio>=1.0.0", +] + diff --git a/plugins/heygen/tests/__init__.py b/plugins/heygen/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/plugins/heygen/tests/test_heygen_plugin.py b/plugins/heygen/tests/test_heygen_plugin.py new file mode 100644 index 00000000..3be160d6 --- /dev/null +++ b/plugins/heygen/tests/test_heygen_plugin.py @@ -0,0 +1,120 @@ +import pytest +from unittest.mock import Mock, AsyncMock, patch +from vision_agents.plugins.heygen import ( + AvatarPublisher, + HeyGenVideoTrack, + HeyGenRTCManager, + HeyGenSession, +) + + +class TestHeyGenSession: + """Tests for HeyGenSession.""" + + def test_init_with_api_key(self): + """Test initialization with explicit API key.""" + session = HeyGenSession( + avatar_id="test_avatar", + quality="high", + api_key="test_key", + ) + + 
assert session.avatar_id == "test_avatar" + assert session.quality == "high" + assert session.api_key == "test_key" + + def test_init_without_api_key_raises(self): + """Test initialization without API key raises error.""" + with patch.dict("os.environ", {}, clear=True): + with pytest.raises(ValueError, match="HeyGen API key required"): + HeyGenSession(avatar_id="test_avatar") + + +class TestHeyGenVideoTrack: + """Tests for HeyGenVideoTrack.""" + + def test_init(self): + """Test video track initialization.""" + track = HeyGenVideoTrack(width=1920, height=1080) + + assert track.width == 1920 + assert track.height == 1080 + assert not track._stopped + + def test_stop(self): + """Test stopping the video track.""" + track = HeyGenVideoTrack() + track.stop() + + assert track._stopped + + +class TestHeyGenRTCManager: + """Tests for HeyGenRTCManager.""" + + def test_init(self): + """Test RTC manager initialization.""" + with patch.object(HeyGenSession, "__init__", return_value=None): + manager = HeyGenRTCManager( + avatar_id="test_avatar", + quality="medium", + api_key="test_key", + ) + + assert manager.pc is None + assert not manager._connected + + def test_is_connected_property(self): + """Test is_connected property.""" + with patch.object(HeyGenSession, "__init__", return_value=None): + manager = HeyGenRTCManager(api_key="test_key") + + assert not manager.is_connected + + manager._connected = True + assert manager.is_connected + + +class TestAvatarPublisher: + """Tests for AvatarPublisher.""" + + def test_init(self): + """Test avatar publisher initialization.""" + with patch.object(HeyGenRTCManager, "__init__", return_value=None): + publisher = AvatarPublisher( + avatar_id="test_avatar", + quality="high", + resolution=(1920, 1080), + api_key="test_key", + ) + + assert publisher.avatar_id == "test_avatar" + assert publisher.quality == "high" + assert publisher.resolution == (1920, 1080) + assert not publisher._connected + + def test_publish_video_track(self): + """Test 
import asyncio
import logging
from typing import Optional, Any, Tuple

from vision_agents.core.processors.base_processor import (
    AudioVideoProcessor,
    VideoPublisherMixin,
)

from .heygen_rtc_manager import HeyGenRTCManager
from .heygen_video_track import HeyGenVideoTrack

logger = logging.getLogger(__name__)


class AvatarPublisher(AudioVideoProcessor, VideoPublisherMixin):
    """HeyGen avatar video publisher.

    Publishes video of a HeyGen avatar that lip-syncs to audio input.
    Can be used as a processor in the Vision Agents framework to add
    realistic avatar video to AI agents.

    Example:
        agent = Agent(
            edge=getstream.Edge(),
            agent_user=User(name="Avatar AI"),
            instructions="Be helpful and friendly",
            llm=gemini.LLM("gemini-2.0-flash"),
            tts=cartesia.TTS(),
            stt=deepgram.STT(),
            processors=[
                heygen.AvatarPublisher(
                    avatar_id="default",
                    quality="high"
                )
            ]
        )
    """

    def __init__(
        self,
        avatar_id: str = "default",
        quality: str = "high",
        resolution: Tuple[int, int] = (1920, 1080),
        api_key: Optional[str] = None,
        interval: int = 0,
        **kwargs,
    ):
        """Initialize the HeyGen avatar publisher.

        Args:
            avatar_id: HeyGen avatar ID to use for streaming.
            quality: Video quality ("low", "medium", "high").
            resolution: Output video resolution (width, height).
            api_key: HeyGen API key. Uses HEYGEN_API_KEY env var if not provided.
            interval: Processing interval (not used, kept for compatibility).
            **kwargs: Additional arguments passed to parent class.
        """
        super().__init__(
            interval=interval,
            receive_audio=False,
            receive_video=False,
            **kwargs
        )

        self.avatar_id = avatar_id
        self.quality = quality
        self.resolution = resolution
        self.api_key = api_key

        # WebRTC manager for HeyGen connection
        self.rtc_manager = HeyGenRTCManager(
            avatar_id=avatar_id,
            quality=quality,
            api_key=api_key,
        )

        # Video track for publishing avatar frames
        self._video_track = HeyGenVideoTrack(
            width=resolution[0],
            height=resolution[1],
        )

        # Connection state
        self._connected = False
        self._connection_task: Optional[asyncio.Task] = None

        logger.info(
            f"🎭 HeyGen AvatarPublisher initialized "
            f"(avatar: {avatar_id}, quality: {quality}, resolution: {resolution})"
        )

    async def _connect_to_heygen(self) -> None:
        """Establish connection to HeyGen and start receiving video.

        Raises:
            Exception: Propagates any failure from the RTC manager so the
                task's done-callback can log it.
        """
        try:
            # Set up video callback before connecting
            self.rtc_manager.set_video_callback(self._on_video_track)

            # Connect to HeyGen
            await self.rtc_manager.connect()

            self._connected = True
            logger.info("✅ Connected to HeyGen, avatar streaming active")

        except Exception as e:
            logger.error(f"❌ Failed to connect to HeyGen: {e}")
            self._connected = False
            raise

    def _on_connection_task_done(self, task: "asyncio.Task[None]") -> None:
        """Observe the background connection task.

        Without this callback a failed connect would leave an unretrieved
        task exception (silent failure) and `_connection_task` would stay
        set forever, so `publish_video_track()` could never retry.
        """
        # Clear the handle so a later publish_video_track() may reconnect.
        self._connection_task = None
        if task.cancelled():
            return
        exc = task.exception()
        if exc is not None:
            logger.error(f"❌ HeyGen connection task failed: {exc}")

    async def _on_video_track(self, track: Any) -> None:
        """Callback when video track is received from HeyGen.

        Args:
            track: Incoming video track from HeyGen's WebRTC connection.
        """
        logger.info("📹 Received video track from HeyGen, starting frame forwarding")
        await self._video_track.start_receiving(track)

    def publish_video_track(self):
        """Publish the HeyGen avatar video track.

        This method is called by the Agent to get the video track
        for publishing to the call. Kicks off the HeyGen connection in
        the background on first call (and again after a failed attempt).

        Returns:
            HeyGenVideoTrack instance for streaming avatar video.
        """
        # Start connection if not already connected
        if not self._connected and not self._connection_task:
            self._connection_task = asyncio.create_task(self._connect_to_heygen())
            # Ensure failures are logged instead of silently dropped.
            self._connection_task.add_done_callback(self._on_connection_task_done)

        logger.info("đŸŽĨ Publishing HeyGen avatar video track")
        return self._video_track

    def state(self) -> dict:
        """Get current state of the avatar publisher.

        Returns:
            Dictionary containing current state information.
        """
        return {
            "avatar_id": self.avatar_id,
            "quality": self.quality,
            "resolution": self.resolution,
            "connected": self._connected,
            "rtc_connected": self.rtc_manager.is_connected,
        }

    async def close(self) -> None:
        """Clean up resources and close connections."""
        logger.info("🔌 Closing HeyGen avatar publisher")

        # Cancel the connection task first so it cannot race with the
        # teardown of the track / RTC manager below.
        if self._connection_task:
            self._connection_task.cancel()
            try:
                await self._connection_task
            except asyncio.CancelledError:
                pass
            self._connection_task = None

        # Stop video track
        if self._video_track:
            self._video_track.stop()

        # Close RTC connection
        if self.rtc_manager:
            await self.rtc_manager.close()

        self._connected = False
        logger.info("✅ HeyGen avatar publisher closed")
+ """ + # Start connection if not already connected + if not self._connected and not self._connection_task: + self._connection_task = asyncio.create_task(self._connect_to_heygen()) + + logger.info("đŸŽĨ Publishing HeyGen avatar video track") + return self._video_track + + def state(self) -> dict: + """Get current state of the avatar publisher. + + Returns: + Dictionary containing current state information. + """ + return { + "avatar_id": self.avatar_id, + "quality": self.quality, + "resolution": self.resolution, + "connected": self._connected, + "rtc_connected": self.rtc_manager.is_connected, + } + + async def close(self) -> None: + """Clean up resources and close connections.""" + logger.info("🔌 Closing HeyGen avatar publisher") + + # Stop video track + if self._video_track: + self._video_track.stop() + + # Close RTC connection + if self.rtc_manager: + await self.rtc_manager.close() + + # Cancel connection task if running + if self._connection_task: + self._connection_task.cancel() + try: + await self._connection_task + except asyncio.CancelledError: + pass + + self._connected = False + logger.info("✅ HeyGen avatar publisher closed") + diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py new file mode 100644 index 00000000..8e5d340b --- /dev/null +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -0,0 +1,235 @@ +import asyncio +import logging +from typing import Optional, Callable, Any + +from aiortc import ( + RTCPeerConnection, + RTCSessionDescription, + RTCIceServer, + RTCConfiguration, + MediaStreamTrack, +) + +from .heygen_session import HeyGenSession + +logger = logging.getLogger(__name__) + + +class HeyGenRTCManager: + """Manages WebRTC connection to HeyGen's Streaming Avatar API. + + Handles the low-level WebRTC peer connection, audio/video streaming, + and communication with HeyGen's servers. 
+ """ + + def __init__( + self, + avatar_id: str = "default", + quality: str = "high", + api_key: Optional[str] = None, + ): + """Initialize the RTC manager. + + Args: + avatar_id: HeyGen avatar ID to use. + quality: Video quality setting ("low", "medium", "high"). + api_key: HeyGen API key (uses HEYGEN_API_KEY env var if not provided). + """ + self.session_manager = HeyGenSession( + avatar_id=avatar_id, + quality=quality, + api_key=api_key, + ) + + self.pc: Optional[RTCPeerConnection] = None + + # Video track callback for receiving avatar video + self._video_callback: Optional[Callable[[MediaStreamTrack], Any]] = None + + self._connected = False + self._connection_ready = asyncio.Event() + + async def connect(self) -> None: + """Establish WebRTC connection to HeyGen's Streaming API. + + Sets up the peer connection, negotiates tracks, and establishes + the connection for real-time avatar streaming. + + HeyGen flow: + 1. Create session -> HeyGen provides SDP offer and ICE servers + 2. Set HeyGen's offer as remote description + 3. Create answer + 4. Send answer to HeyGen + 5. Start session + """ + try: + # Create HeyGen session - they provide the SDP offer + session_info = await self.session_manager.create_session() + + # Extract ICE servers and SDP offer from session info + ice_servers = self._parse_ice_servers(session_info) + + # HeyGen's sdp field - check the actual structure + sdp_data = session_info.get("sdp") + + if isinstance(sdp_data, dict): + # Standard WebRTC format: {'type': 'offer', 'sdp': 'v=0...'} + offer_sdp = sdp_data.get("sdp") + sdp_type = sdp_data.get("type") + logger.debug(f"Got SDP dict from HeyGen (type: {sdp_type})") + elif isinstance(sdp_data, str) and sdp_data.startswith("v=0"): + # Raw SDP string (less common) + offer_sdp = sdp_data + logger.debug("Got raw SDP string from HeyGen") + else: + offer_sdp = None + + if not offer_sdp: + logger.error(f"❌ Unexpected SDP format. 
Type: {type(sdp_data)}") + if isinstance(sdp_data, dict): + logger.error(f"SDP dict keys: {list(sdp_data.keys())}") + logger.error(f"SDP data: {str(sdp_data)[:200] if sdp_data else 'None'}") + raise RuntimeError("No valid SDP offer received from HeyGen") + + # Create RTCPeerConnection with ICE servers + config = RTCConfiguration(iceServers=ice_servers) + self.pc = RTCPeerConnection(configuration=config) + + # Set up track handlers + @self.pc.on("track") + async def on_track(track: MediaStreamTrack): + await self._handle_track(track) + + @self.pc.on("connectionstatechange") + async def on_connection_state_change(): + logger.info(f"🔗 HeyGen connection state: {self.pc.connectionState}") + if self.pc.connectionState == "connected": + self._connected = True + self._connection_ready.set() + elif self.pc.connectionState in ["failed", "closed"]: + self._connected = False + self._connection_ready.clear() + + # Set HeyGen's offer as remote description + offer = RTCSessionDescription(sdp=offer_sdp, type="offer") + await self.pc.setRemoteDescription(offer) + + # HeyGen's offer already includes tracks, so transceivers are auto-created + # We just need to create our answer + logger.debug(f"Transceivers after setRemoteDescription: {len(self.pc.getTransceivers())}") + + # Create our answer + answer = await self.pc.createAnswer() + await self.pc.setLocalDescription(answer) + + # Start the session with our SDP answer + # HeyGen expects the answer in the start_session call + await self.session_manager.start_session(sdp_answer=self.pc.localDescription.sdp) + + # Wait for connection to be established + await asyncio.wait_for(self._connection_ready.wait(), timeout=10.0) + + logger.info("✅ HeyGen WebRTC connection established") + + except Exception as e: + logger.error(f"❌ Failed to connect to HeyGen: {e}") + raise + + def _parse_ice_servers(self, session_info: dict) -> list: + """Parse ICE servers from HeyGen session info. 
+ + HeyGen may provide ice_servers, ice_servers2, or rely on LiveKit's embedded servers. + + Args: + session_info: Session information from HeyGen API. + + Returns: + List of RTCIceServer objects. + """ + ice_servers = [] + + # Try ice_servers first, then ice_servers2 as backup + ice_server_configs = ( + session_info.get("ice_servers") or + session_info.get("ice_servers2") or + session_info.get("iceServers", []) + ) + + if ice_server_configs and not isinstance(ice_server_configs, list): + logger.warning(f"âš ī¸ Unexpected ice_servers format: {type(ice_server_configs)}") + ice_server_configs = [] + + for server_config in ice_server_configs: + if not isinstance(server_config, dict): + continue + + urls = server_config.get("urls", []) + if isinstance(urls, str): + urls = [urls] # Convert single URL to list + + username = server_config.get("username") + credential = server_config.get("credential") + + if urls: + ice_servers.append( + RTCIceServer( + urls=urls, + username=username, + credential=credential, + ) + ) + logger.info(f"🧊 Added ICE server: {urls[0]}") + + # When using LiveKit, ICE servers may be embedded in SDP + # In that case, use public STUN as fallback + if not ice_servers: + logger.info("â„šī¸ Using default STUN servers (LiveKit may provide its own via SDP)") + ice_servers.append( + RTCIceServer(urls=["stun:stun.l.google.com:19302"]) + ) + + return ice_servers + + async def _handle_track(self, track: MediaStreamTrack) -> None: + """Handle incoming media track from HeyGen. + + Args: + track: Incoming media track (audio or video). 
+ """ + logger.info(f"📡 Received track from HeyGen: {track.kind}") + + if track.kind == "video": + if self._video_callback: + await self._video_callback(track) + else: + logger.warning("Video track received but no callback registered") + elif track.kind == "audio": + # Audio track from HeyGen (avatar speech) - currently not used + logger.debug("Audio track received from HeyGen (ignored)") + + def set_video_callback(self, callback: Callable[[MediaStreamTrack], Any]) -> None: + """Set callback for handling incoming video track. + + Args: + callback: Async function to handle video track. + """ + self._video_callback = callback + + @property + def is_connected(self) -> bool: + """Check if WebRTC connection is established.""" + return self._connected + + async def close(self) -> None: + """Close the WebRTC connection and clean up resources.""" + if self.pc: + await self.pc.close() + self.pc = None + + await self.session_manager.close() + + self._connected = False + self._connection_ready.clear() + + logger.info("🔌 HeyGen RTC connection closed") + diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py new file mode 100644 index 00000000..917a4a52 --- /dev/null +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py @@ -0,0 +1,181 @@ +import logging +from typing import Optional, Dict, Any +from os import getenv +import aiohttp + +logger = logging.getLogger(__name__) + + +class HeyGenSession: + """Manages HeyGen API session lifecycle and configuration. + + Handles authentication, session creation, and API communication + with HeyGen's Streaming API. + """ + + def __init__( + self, + avatar_id: str = "default", + quality: str = "high", + api_key: Optional[str] = None, + ): + """Initialize HeyGen session manager. + + Args: + avatar_id: HeyGen avatar ID to use for streaming. + quality: Video quality setting ("low", "medium", "high"). + api_key: HeyGen API key. 
Uses HEYGEN_API_KEY env var if not provided. + """ + self.avatar_id = avatar_id + self.quality = quality + self.api_key = api_key or getenv("HEYGEN_API_KEY") + + if not self.api_key: + raise ValueError( + "HeyGen API key required. Set HEYGEN_API_KEY environment variable " + "or pass api_key parameter." + ) + + self.base_url = "https://api.heygen.com/v1" + self.session_id: Optional[str] = None + self.session_info: Optional[Dict[str, Any]] = None + self._http_session: Optional[aiohttp.ClientSession] = None + + async def create_session(self) -> Dict[str, Any]: + """Create a new HeyGen streaming session. + + Returns: + Session information including session_id, ICE servers, and SDP offer. + """ + if not self._http_session: + self._http_session = aiohttp.ClientSession() + + headers = { + "X-Api-Key": self.api_key, + "Content-Type": "application/json", + } + + payload = { + "avatar_id": self.avatar_id, + "quality": self.quality, + } + + try: + async with self._http_session.post( + f"{self.base_url}/streaming.new", + json=payload, + headers=headers, + ) as response: + if response.status != 200: + error_text = await response.text() + raise RuntimeError( + f"Failed to create HeyGen session: {response.status} - {error_text}" + ) + + data = await response.json() + self.session_info = data.get("data", {}) + self.session_id = self.session_info.get("session_id") + + logger.info(f"✅ HeyGen session created: {self.session_id}") + return self.session_info + + except Exception as e: + logger.error(f"❌ Failed to create HeyGen session: {e}") + raise + + async def start_session(self, sdp_answer: Optional[str] = None) -> Dict[str, Any]: + """Start the HeyGen streaming session. + + Args: + sdp_answer: Optional SDP answer to include in the start request. + + Returns: + Start confirmation with session details. + """ + if not self.session_id: + raise RuntimeError("Session not created. 
Call create_session() first.") + + if not self._http_session: + self._http_session = aiohttp.ClientSession() + + headers = { + "X-Api-Key": self.api_key, + "Content-Type": "application/json", + } + + payload: Dict[str, Any] = { + "session_id": self.session_id, + } + + # Include SDP answer if provided + if sdp_answer: + payload["sdp"] = { + "type": "answer", + "sdp": sdp_answer + } + + try: + async with self._http_session.post( + f"{self.base_url}/streaming.start", + json=payload, + headers=headers, + ) as response: + if response.status != 200: + error_text = await response.text() + raise RuntimeError( + f"Failed to start HeyGen session: {response.status} - {error_text}" + ) + + data = await response.json() + logger.info(f"✅ HeyGen session started: {self.session_id}") + return data + + except Exception as e: + logger.error(f"❌ Failed to start HeyGen session: {e}") + raise + + async def stop_session(self) -> None: + """Stop the HeyGen streaming session.""" + if not self.session_id: + logger.warning("No active session to stop") + return + + if not self._http_session: + return + + headers = { + "X-Api-Key": self.api_key, + "Content-Type": "application/json", + } + + payload = { + "session_id": self.session_id, + } + + try: + async with self._http_session.post( + f"{self.base_url}/streaming.stop", + json=payload, + headers=headers, + ) as response: + if response.status == 200: + logger.info(f"✅ HeyGen session stopped: {self.session_id}") + else: + logger.warning( + f"Failed to stop HeyGen session: {response.status}" + ) + except Exception as e: + logger.error(f"❌ Error stopping HeyGen session: {e}") + + async def close(self) -> None: + """Clean up session resources.""" + await self.stop_session() + + if self._http_session: + await self._http_session.close() + self._http_session = None + + self.session_id = None + self.session_info = None + logger.info("HeyGen session cleaned up") + diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py 
b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py new file mode 100644 index 00000000..1fcbc39b --- /dev/null +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py @@ -0,0 +1,161 @@ +import asyncio +import logging +from typing import Optional + +import av +from aiortc import MediaStreamTrack, VideoStreamTrack +from PIL import Image + +from vision_agents.core.utils.queue import LatestNQueue + +logger = logging.getLogger(__name__) + + +class HeyGenVideoTrack(VideoStreamTrack): + """Video track that forwards HeyGen avatar video frames. + + Receives video frames from HeyGen's WebRTC connection and provides + them through the standard VideoStreamTrack interface for publishing + to the call. + """ + + def __init__(self, width: int = 1920, height: int = 1080): + """Initialize the HeyGen video track. + + Args: + width: Video frame width. + height: Video frame height. + """ + super().__init__() + + self.width = width + self.height = height + + # Queue for incoming frames from HeyGen + self.frame_queue: LatestNQueue[av.VideoFrame] = LatestNQueue(maxlen=30) + + # Create placeholder frame for when no frames are available + placeholder = Image.new("RGB", (self.width, self.height), color=(30, 30, 40)) + self.placeholder_frame = av.VideoFrame.from_image(placeholder) + self.last_frame: av.VideoFrame = self.placeholder_frame + + self._stopped = False + self._receiving_task: Optional[asyncio.Task] = None + self._source_track: Optional[MediaStreamTrack] = None + + logger.info(f"đŸŽŦ HeyGenVideoTrack initialized ({width}x{height})") + + async def start_receiving(self, source_track: MediaStreamTrack) -> None: + """Start receiving frames from HeyGen's video track. + + Args: + source_track: The incoming video track from HeyGen's WebRTC connection. 
+ """ + if self._receiving_task: + logger.warning("Already receiving frames from HeyGen") + return + + self._source_track = source_track + self._receiving_task = asyncio.create_task(self._receive_frames()) + logger.info("đŸ“Ĩ Started receiving frames from HeyGen") + + async def _receive_frames(self) -> None: + """Continuously receive frames from HeyGen and add to queue.""" + if not self._source_track: + logger.error("No source track set") + return + + try: + while not self._stopped: + try: + # Receive frame from HeyGen + frame = await self._source_track.recv() + + if frame: + # Resize if needed + if frame.width != self.width or frame.height != self.height: + frame = self._resize_frame(frame) + + # Add to queue (will replace oldest if full) + self.frame_queue.put_latest_nowait(frame) + + logger.debug( + f"đŸ“Ĩ Received frame from HeyGen: {frame.width}x{frame.height}" + ) + + except Exception as e: + if not self._stopped: + logger.warning(f"Error receiving frame from HeyGen: {e}") + await asyncio.sleep(0.01) + + except asyncio.CancelledError: + logger.info("Frame receiving task cancelled") + except Exception as e: + logger.error(f"Fatal error in frame receiving: {e}") + + def _resize_frame(self, frame: av.VideoFrame) -> av.VideoFrame: + """Resize a video frame to match the track dimensions. + + Args: + frame: Input video frame. + + Returns: + Resized video frame. + """ + try: + img = frame.to_image() + resized = img.resize((self.width, self.height), Image.LANCZOS) + return av.VideoFrame.from_image(resized) + + except Exception as e: + logger.error(f"Error resizing frame: {e}") + return frame + + async def recv(self) -> av.VideoFrame: + """Receive the next video frame. + + This is called by the WebRTC stack to get frames for transmission. + + Returns: + Video frame to transmit. 
+ """ + if self._stopped: + raise Exception("Track stopped") + + try: + # Try to get a new frame from queue with short timeout + frame = await asyncio.wait_for( + self.frame_queue.get(), + timeout=0.033 # ~30 FPS + ) + if frame: + self.last_frame = frame + + except asyncio.TimeoutError: + # No new frame, use last frame + pass + + except Exception as e: + logger.warning(f"Error getting frame from queue: {e}") + + # Get timestamp for the frame + pts, time_base = await self.next_timestamp() + + # Create a copy of the frame with updated timestamp + output_frame = self.last_frame + output_frame.pts = pts + output_frame.time_base = time_base + + return output_frame + + def stop(self) -> None: + """Stop the video track.""" + self._stopped = True + + if self._receiving_task: + self._receiving_task.cancel() + self._receiving_task = None + + super().stop() + logger.info("🛑 HeyGenVideoTrack stopped") + diff --git a/pyproject.toml b/pyproject.toml index b6e6d93e..26f6c0a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ vision-agents-plugins-ultralytics = { workspace = true } vision-agents-plugins-krisp = { workspace = true } vision-agents-plugins-smart-turn = { workspace = true } vision-agents-plugins-wizper = { workspace = true } +vision-agents-plugins-heygen = { workspace = true } [tool.uv.workspace] members = [ @@ -44,7 +45,8 @@ members = [ "plugins/ultralytics", "plugins/krisp", "plugins/smart_turn", - "plugins/wizper" + "plugins/wizper", + "plugins/heygen" ] exclude = [ "**/__pycache__", diff --git a/uv.lock b/uv.lock index 0ef7c3c5..61c02726 100644 --- a/uv.lock +++ b/uv.lock @@ -19,6 +19,7 @@ members = [ "vision-agents-plugins-fish", "vision-agents-plugins-gemini", "vision-agents-plugins-getstream", + "vision-agents-plugins-heygen", "vision-agents-plugins-kokoro", "vision-agents-plugins-krisp", "vision-agents-plugins-moonshine", @@ -165,6 +166,24 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/3b/58/af07dda649c22a1ae954ffb7aaaf4d4a57f1bf00ebdf62307affc0b8552f/aioice-0.10.1-py3-none-any.whl", hash = "sha256:f31ae2abc8608b1283ed5f21aebd7b6bd472b152ff9551e9b559b2d8efed79e9", size = 24872, upload-time = "2025-04-13T08:15:24.044Z" }, ] +[[package]] +name = "aiortc" +version = "1.14.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "aioice" }, + { name = "av" }, + { name = "cryptography" }, + { name = "google-crc32c" }, + { name = "pyee" }, + { name = "pylibsrtp" }, + { name = "pyopenssl" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/51/9c/4e027bfe0195de0442da301e2389329496745d40ae44d2d7c4571c4290ce/aiortc-1.14.0.tar.gz", hash = "sha256:adc8a67ace10a085721e588e06a00358ed8eaf5f6b62f0a95358ff45628dd762", size = 1180864, upload-time = "2025-10-13T21:40:37.905Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/57/ab/31646a49209568cde3b97eeade0d28bb78b400e6645c56422c101df68932/aiortc-1.14.0-py3-none-any.whl", hash = "sha256:4b244d7e482f4e1f67e685b3468269628eca1ec91fa5b329ab517738cfca086e", size = 93183, upload-time = "2025-10-13T21:40:36.59Z" }, +] + [[package]] name = "aiortc-getstream" version = "1.13.0.post1" @@ -4855,6 +4874,7 @@ all-plugins = [ { name = "vision-agents-plugins-elevenlabs" }, { name = "vision-agents-plugins-gemini" }, { name = "vision-agents-plugins-getstream" }, + { name = "vision-agents-plugins-heygen" }, { name = "vision-agents-plugins-kokoro" }, { name = "vision-agents-plugins-krisp" }, { name = "vision-agents-plugins-moonshine" }, @@ -4889,6 +4909,9 @@ gemini = [ getstream = [ { name = "vision-agents-plugins-getstream" }, ] +heygen = [ + { name = "vision-agents-plugins-heygen" }, +] kokoro = [ { name = "vision-agents-plugins-kokoro" }, ] @@ -4940,6 +4963,8 @@ requires-dist = [ { name = "vision-agents-plugins-gemini", marker = "extra == 'gemini'", editable = "plugins/gemini" }, { name = "vision-agents-plugins-getstream", marker = 
"extra == 'all-plugins'", editable = "plugins/getstream" }, { name = "vision-agents-plugins-getstream", marker = "extra == 'getstream'", editable = "plugins/getstream" }, + { name = "vision-agents-plugins-heygen", marker = "extra == 'all-plugins'", editable = "plugins/heygen" }, + { name = "vision-agents-plugins-heygen", marker = "extra == 'heygen'", editable = "plugins/heygen" }, { name = "vision-agents-plugins-kokoro", marker = "extra == 'all-plugins'", editable = "plugins/kokoro" }, { name = "vision-agents-plugins-kokoro", marker = "extra == 'kokoro'", editable = "plugins/kokoro" }, { name = "vision-agents-plugins-krisp", marker = "extra == 'all-plugins'", editable = "plugins/krisp" }, @@ -4959,7 +4984,7 @@ requires-dist = [ { name = "vision-agents-plugins-xai", marker = "extra == 'all-plugins'", editable = "plugins/xai" }, { name = "vision-agents-plugins-xai", marker = "extra == 'xai'", editable = "plugins/xai" }, ] -provides-extras = ["all-plugins", "anthropic", "cartesia", "deepgram", "dev", "elevenlabs", "gemini", "getstream", "kokoro", "krisp", "moonshine", "openai", "silero", "smart-turn", "ultralytics", "wizper", "xai"] +provides-extras = ["all-plugins", "anthropic", "cartesia", "deepgram", "dev", "elevenlabs", "gemini", "getstream", "heygen", "kokoro", "krisp", "moonshine", "openai", "silero", "smart-turn", "ultralytics", "wizper", "xai"] [[package]] name = "vision-agents-plugins-anthropic" @@ -5183,6 +5208,35 @@ dev = [ { name = "pytest-asyncio", specifier = ">=1.0.0" }, ] +[[package]] +name = "vision-agents-plugins-heygen" +version = "0.1.0" +source = { editable = "plugins/heygen" } +dependencies = [ + { name = "aiohttp" }, + { name = "aiortc" }, + { name = "vision-agents" }, +] + +[package.dev-dependencies] +dev = [ + { name = "pytest" }, + { name = "pytest-asyncio" }, +] + +[package.metadata] +requires-dist = [ + { name = "aiohttp", specifier = ">=3.9.0" }, + { name = "aiortc", specifier = ">=1.9.0" }, + { name = "vision-agents", editable = 
"agents-core" }, +] + +[package.metadata.requires-dev] +dev = [ + { name = "pytest", specifier = ">=8.4.1" }, + { name = "pytest-asyncio", specifier = ">=1.0.0" }, +] + [[package]] name = "vision-agents-plugins-kokoro" source = { editable = "plugins/kokoro" } From 4f3a6e4d8e6df8008c3ea75c1f49be94d2ea6d60 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 28 Oct 2025 10:46:20 +0100 Subject: [PATCH 02/20] add lip-sync support by forwarding agent audio to heygen --- plugins/heygen/README.md | 15 +++++++ plugins/heygen/example/avatar_example.py | 5 +++ .../plugins/heygen/heygen_avatar_publisher.py | 40 ++++++++++++++++++- .../plugins/heygen/heygen_rtc_manager.py | 26 ++++++++++++ 4 files changed, 85 insertions(+), 1 deletion(-) diff --git a/plugins/heygen/README.md b/plugins/heygen/README.md index 2dc19e34..2d4cdd10 100644 --- a/plugins/heygen/README.md +++ b/plugins/heygen/README.md @@ -56,6 +56,11 @@ async def start_avatar_agent(): call = agent.edge.client.video.call("default", str(uuid4())) with await agent.join(call): + # Enable lip-sync by forwarding agent's audio to HeyGen + avatar_publisher = agent.video_publishers[0] + if hasattr(avatar_publisher, 'set_agent_audio_track') and agent._audio_track: + avatar_publisher.set_agent_audio_track(agent._audio_track) + await agent.edge.open_demo(call) await agent.simple_response("Hello! 
I'm your AI assistant with an avatar.") await agent.finish() @@ -103,6 +108,16 @@ agent = Agent( heygen.AvatarPublisher(avatar_id="professional_presenter") ] ) + +call = agent.edge.client.video.call("default", str(uuid4())) + +with await agent.join(call): + # Enable lip-sync + avatar_publisher = agent.video_publishers[0] + if hasattr(avatar_publisher, 'set_agent_audio_track') and agent._audio_track: + avatar_publisher.set_agent_audio_track(agent._audio_track) + + await agent.finish() ``` ### With Multiple Processors diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index 5d021b41..2a81c5a7 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -56,6 +56,11 @@ async def start_avatar_agent() -> None: # Join the call with await agent.join(call): + # Forward agent's audio to HeyGen for lip-sync + avatar_publisher = agent.video_publishers[0] + if hasattr(avatar_publisher, 'set_agent_audio_track') and agent._audio_track: + avatar_publisher.set_agent_audio_track(agent._audio_track) + # Open demo UI await agent.edge.open_demo(call) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index 786b9421..d4ef079f 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -58,7 +58,7 @@ def __init__( """ super().__init__( interval=interval, - receive_audio=False, + receive_audio=True, # Receive audio to forward to HeyGen for lip-sync receive_video=False, **kwargs ) @@ -84,6 +84,7 @@ def __init__( # Connection state self._connected = False self._connection_task: Optional[asyncio.Task] = None + self._audio_track_set = False logger.info( f"🎭 HeyGen AvatarPublisher initialized " @@ -116,6 +117,43 @@ async def _on_video_track(self, track: Any) -> None: logger.info("📹 Received video 
track from HeyGen, starting frame forwarding") await self._video_track.start_receiving(track) + async def _forward_audio_track(self, audio_track: Any) -> None: + """Forward agent's audio track to HeyGen for lip-sync. + + Args: + audio_track: The agent's audio output track. + """ + if self._audio_track_set: + return # Already forwarded + + logger.info("🎤 Forwarding agent's audio output to HeyGen for lip-sync") + + # Wait for HeyGen connection + if not self._connected: + if self._connection_task: + try: + await asyncio.wait_for(self._connection_task, timeout=10.0) + except asyncio.TimeoutError: + logger.error("Timeout waiting for HeyGen connection") + return + else: + logger.error("HeyGen connection not started") + return + + # Forward the agent's audio track to HeyGen + await self.rtc_manager.send_audio_track(audio_track) + self._audio_track_set = True + + def set_agent_audio_track(self, audio_track: Any) -> None: + """Set the agent's audio track for forwarding to HeyGen. + + This should be called by the agent after audio track is created. + + Args: + audio_track: The agent's audio output track for TTS/Realtime. + """ + asyncio.create_task(self._forward_audio_track(audio_track)) + def publish_video_track(self): """Publish the HeyGen avatar video track. 
diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index 8e5d340b..ee1344d0 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -46,6 +46,9 @@ def __init__( # Video track callback for receiving avatar video self._video_callback: Optional[Callable[[MediaStreamTrack], Any]] = None + # Audio track for sending to HeyGen + self._audio_sender: Optional[Any] = None + self._connected = False self._connection_ready = asyncio.Event() @@ -118,6 +121,13 @@ async def on_connection_state_change(): # We just need to create our answer logger.debug(f"Transceivers after setRemoteDescription: {len(self.pc.getTransceivers())}") + # Find and store the audio sender so we can send audio to HeyGen later + for sender in self.pc.getSenders(): + if sender.track and sender.track.kind == "audio": + self._audio_sender = sender + logger.debug("Found audio sender for HeyGen") + break + # Create our answer answer = await self.pc.createAnswer() await self.pc.setLocalDescription(answer) @@ -215,6 +225,22 @@ def set_video_callback(self, callback: Callable[[MediaStreamTrack], Any]) -> Non """ self._video_callback = callback + async def send_audio_track(self, audio_track: MediaStreamTrack) -> None: + """Send audio track to HeyGen for lip-sync. + + Args: + audio_track: Audio track containing agent's speech. 
+ """ + if not self._audio_sender: + logger.warning("No audio sender available - connection may not be established") + return + + try: + await self._audio_sender.replaceTrack(audio_track) + logger.info("🎤 Audio track sent to HeyGen for lip-sync") + except Exception as e: + logger.error(f"Failed to send audio track to HeyGen: {e}") + @property def is_connected(self) -> bool: """Check if WebRTC connection is established.""" From 680f5d776b6007a1b371593b5870c9763bb1d7b1 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 28 Oct 2025 10:48:34 +0100 Subject: [PATCH 03/20] switch avatar example to use gemini realtime for better lip-sync testing --- plugins/heygen/example/avatar_example.py | 23 +++++++---------------- plugins/heygen/example/pyproject.toml | 6 ------ 2 files changed, 7 insertions(+), 22 deletions(-) diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index 2a81c5a7..19bbbfdc 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -3,23 +3,20 @@ from dotenv import load_dotenv from vision_agents.core import User, Agent -from vision_agents.plugins import kokoro, deepgram, getstream, smart_turn, gemini, heygen +from vision_agents.plugins import getstream, gemini, heygen load_dotenv() async def start_avatar_agent() -> None: - """Start an agent with HeyGen avatar. + """Start an agent with HeyGen avatar using Realtime LLM. This example demonstrates how to use HeyGen's avatar streaming - to add realistic avatar video to your AI agent. The avatar will - lip-sync with the audio generated by the TTS. + with Gemini Realtime. The avatar will lip-sync with the audio + generated by the Realtime LLM. 
""" - # Create LLM - llm = gemini.LLM("gemini-2.0-flash") - - # Create agent with HeyGen avatar + # Create agent with HeyGen avatar and Realtime LLM agent = Agent( edge=getstream.Edge(), agent_user=User( @@ -32,14 +29,8 @@ async def start_avatar_agent() -> None: "Don't use special characters or formatting." ), - # LLM and speech components - llm=llm, - tts=kokoro.TTS(), # Using Kokoro (free, local TTS) - stt=deepgram.STT(), - turn_detection=smart_turn.TurnDetection( - buffer_duration=2.0, - confidence_threshold=0.5 - ), + # Use Gemini Realtime (includes built-in TTS and STT) + llm=gemini.Realtime(fps=2), # Add HeyGen avatar as a video publisher processors=[ diff --git a/plugins/heygen/example/pyproject.toml b/plugins/heygen/example/pyproject.toml index 83fd9bdd..4e1fdf61 100644 --- a/plugins/heygen/example/pyproject.toml +++ b/plugins/heygen/example/pyproject.toml @@ -6,20 +6,14 @@ requires-python = ">=3.10" dependencies = [ "vision-agents", "vision-agents-plugins-heygen", - "vision-agents-plugins-kokoro", - "vision-agents-plugins-deepgram", "vision-agents-plugins-gemini", "vision-agents-plugins-getstream", - "vision-agents-plugins-smart-turn", "python-dotenv", ] [tool.uv.sources] vision-agents = { workspace = true } vision-agents-plugins-heygen = { workspace = true } -vision-agents-plugins-kokoro = { workspace = true } -vision-agents-plugins-deepgram = { workspace = true } vision-agents-plugins-gemini = { workspace = true } vision-agents-plugins-getstream = { workspace = true } -vision-agents-plugins-smart-turn = { workspace = true } From 6eb638fd5590764dd88a4ca5c138d828a3039294 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Wed, 29 Oct 2025 13:33:59 +0100 Subject: [PATCH 04/20] WIP: audio track approach for lip-sync (audio flows but no lip movement) --- plugins/heygen/example/avatar_example.py | 14 +-- .../plugins/heygen/heygen_audio_track.py | 98 +++++++++++++++++++ .../plugins/heygen/heygen_avatar_publisher.py | 85 +++++++++++++--- 
.../plugins/heygen/heygen_rtc_manager.py | 45 ++++++--- 4 files changed, 208 insertions(+), 34 deletions(-) create mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index 19bbbfdc..2d95421f 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -47,21 +47,15 @@ async def start_avatar_agent() -> None: # Join the call with await agent.join(call): - # Forward agent's audio to HeyGen for lip-sync + # Set agent reference on avatar publisher for audio event subscription avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent_audio_track') and agent._audio_track: - avatar_publisher.set_agent_audio_track(agent._audio_track) + if hasattr(avatar_publisher, 'set_agent'): + avatar_publisher.set_agent(agent) # Open demo UI await agent.edge.open_demo(call) - # Greet the user through the avatar - await agent.simple_response( - "Hello! I'm your AI assistant with an avatar. " - "How can I help you today?" - ) - - # Keep the call running + # Keep the call running - Realtime mode handles conversation automatically await agent.finish() diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py new file mode 100644 index 00000000..f53a5399 --- /dev/null +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py @@ -0,0 +1,98 @@ +"""Custom audio track for sending audio to HeyGen for lip-sync.""" + +import asyncio +import logging +from typing import Optional +from fractions import Fraction + +import av +import numpy as np +from aiortc import AudioStreamTrack + +logger = logging.getLogger(__name__) + + +class HeyGenAudioTrack(AudioStreamTrack): + """Audio track that accepts PCM data and produces frames for WebRTC. 
+ + This track receives audio data from the Realtime LLM and produces + audio frames that can be sent to HeyGen via WebRTC for lip-sync. + """ + + kind = "audio" + + def __init__(self, sample_rate: int = 24000): + """Initialize the audio track. + + Args: + sample_rate: Sample rate for audio frames (default: 24000 for Gemini). + """ + super().__init__() + self._sample_rate = sample_rate + self._ts = 0 + self._latest_chunk: Optional[bytes] = None + self._silence_cache: dict[int, np.ndarray] = {} + logger.info(f"🎤 HeyGenAudioTrack initialized at {sample_rate}Hz") + + def write_audio(self, pcm_data: bytes) -> None: + """Write PCM audio data to be sent to HeyGen. + + Args: + pcm_data: Raw PCM16 audio data from the LLM. + """ + if not pcm_data: + return + self._latest_chunk = bytes(pcm_data) + logger.debug(f"âœī¸ Audio data written: {len(pcm_data)} bytes") + + async def recv(self) -> av.AudioFrame: + """Receive the next audio frame for WebRTC transmission. + + Returns: + Audio frame to send to HeyGen. 
+ """ + # Pace at 20ms per frame (50 fps) + await asyncio.sleep(0.02) + + sr = self._sample_rate + samples_per_frame = int(0.02 * sr) # 20ms worth of samples + + chunk = self._latest_chunk + if chunk: + logger.debug(f"đŸŽ™ī¸ recv() producing frame with audio data ({len(chunk)} bytes)") + if chunk: + # Consume and clear the latest pushed chunk + self._latest_chunk = None + arr = np.frombuffer(chunk, dtype=np.int16) + + # Ensure mono channel + if arr.ndim == 1: + samples = arr.reshape(1, -1) + else: + samples = arr[:1, :] + + # Pad or truncate to exactly one 20ms frame + needed = samples_per_frame + have = samples.shape[1] + if have < needed: + pad = np.zeros((1, needed - have), dtype=np.int16) + samples = np.concatenate([samples, pad], axis=1) + elif have > needed: + samples = samples[:, :needed] + else: + # Generate silence when no audio data is available + cached = self._silence_cache.get(sr) + if cached is None: + cached = np.zeros((1, samples_per_frame), dtype=np.int16) + self._silence_cache[sr] = cached + samples = cached + + # Create audio frame + frame = av.AudioFrame.from_ndarray(samples, format="s16", layout="mono") + frame.sample_rate = sr + frame.pts = self._ts + frame.time_base = Fraction(1, sr) + self._ts += samples.shape[1] + + return frame + diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index d4ef079f..16c1a9bc 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -85,11 +85,27 @@ def __init__( self._connected = False self._connection_task: Optional[asyncio.Task] = None self._audio_track_set = False + self._agent = None # Will be set by the agent + + # Create a custom audio track for HeyGen that we can write to + from .heygen_audio_track import HeyGenAudioTrack + self._heygen_audio_track = HeyGenAudioTrack(sample_rate=24000) 
logger.info( f"🎭 HeyGen AvatarPublisher initialized " f"(avatar: {avatar_id}, quality: {quality}, resolution: {resolution})" ) + + def set_agent(self, agent: Any) -> None: + """Set the agent reference for event subscription. + + This is called by the agent when the processor is attached. + + Args: + agent: The agent instance. + """ + self._agent = agent + logger.info("🔗 Agent reference set for HeyGen avatar publisher") async def _connect_to_heygen(self) -> None: """Establish connection to HeyGen and start receiving video.""" @@ -102,11 +118,38 @@ async def _connect_to_heygen(self) -> None: self._connected = True logger.info("✅ Connected to HeyGen, avatar streaming active") + + # Subscribe to audio output events from the LLM for lip-sync + self._subscribe_to_audio_events() except Exception as e: logger.error(f"❌ Failed to connect to HeyGen: {e}") self._connected = False raise + + def _subscribe_to_audio_events(self) -> None: + """Subscribe to audio output events from the LLM.""" + try: + # Import the event type + from vision_agents.core.llm.events import RealtimeAudioOutputEvent + + # Get the agent's event manager + # Note: This will be set when the processor is attached to an agent + if hasattr(self, '_agent') and self._agent: + @self._agent.events.subscribe + async def on_audio_output(event: RealtimeAudioOutputEvent): + logger.debug(f"đŸ“ĸ Received audio output event: {len(event.audio_data)} bytes at {event.sample_rate}Hz") + await self._on_audio_output(event.audio_data, event.sample_rate) + logger.info("🎧 Subscribed to LLM audio output events for lip-sync") + + # Also log what events are registered + logger.info(f" Event manager has {len(self._agent.events._handlers)} event handlers") + else: + logger.warning("âš ī¸ Cannot subscribe to audio events - no agent attached yet") + except Exception as e: + logger.error(f"Failed to subscribe to audio events: {e}") + import traceback + logger.error(traceback.format_exc()) async def _on_video_track(self, track: Any) -> 
None: """Callback when video track is received from HeyGen. @@ -117,16 +160,12 @@ async def _on_video_track(self, track: Any) -> None: logger.info("📹 Received video track from HeyGen, starting frame forwarding") await self._video_track.start_receiving(track) - async def _forward_audio_track(self, audio_track: Any) -> None: - """Forward agent's audio track to HeyGen for lip-sync. - - Args: - audio_track: The agent's audio output track. - """ + async def _setup_audio_forwarding(self) -> None: + """Set up audio forwarding from agent to HeyGen for lip-sync.""" if self._audio_track_set: - return # Already forwarded + return # Already set up - logger.info("🎤 Forwarding agent's audio output to HeyGen for lip-sync") + logger.info("🎤 Setting up audio forwarding to HeyGen for lip-sync") # Wait for HeyGen connection if not self._connected: @@ -140,19 +179,39 @@ async def _forward_audio_track(self, audio_track: Any) -> None: logger.error("HeyGen connection not started") return - # Forward the agent's audio track to HeyGen - await self.rtc_manager.send_audio_track(audio_track) + # Set our custom audio track on the HeyGen sender + await self.rtc_manager.send_audio_track(self._heygen_audio_track) self._audio_track_set = True + logger.info("✅ Audio track set up for HeyGen lip-sync") + + async def _on_audio_output(self, audio_data: bytes, sample_rate: int) -> None: + """Handle audio output from the LLM and forward to HeyGen. + + Args: + audio_data: Raw PCM audio data from the LLM. + sample_rate: Sample rate of the audio data. 
+ """ + logger.debug(f"đŸŽĩ _on_audio_output called: {len(audio_data)} bytes at {sample_rate}Hz") + + if not self._audio_track_set: + # Set up audio forwarding on first audio output + logger.info("🔧 Setting up audio forwarding on first audio output") + await self._setup_audio_forwarding() + + # Write audio data to our custom track for HeyGen + logger.info(f"âœī¸ Writing {len(audio_data)} bytes to HeyGen audio track") + self._heygen_audio_track.write_audio(audio_data) def set_agent_audio_track(self, audio_track: Any) -> None: """Set the agent's audio track for forwarding to HeyGen. - This should be called by the agent after audio track is created. + DEPRECATED: This method is no longer needed. Audio is now forwarded + via event listening instead of track sharing. Args: - audio_track: The agent's audio output track for TTS/Realtime. + audio_track: The agent's audio output track (unused). """ - asyncio.create_task(self._forward_audio_track(audio_track)) + logger.warning("set_agent_audio_track is deprecated - audio forwarding is automatic via events") def publish_video_track(self): """Publish the HeyGen avatar video track. 
diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index ee1344d0..278ea9b5 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -117,16 +117,27 @@ async def on_connection_state_change(): offer = RTCSessionDescription(sdp=offer_sdp, type="offer") await self.pc.setRemoteDescription(offer) - # HeyGen's offer already includes tracks, so transceivers are auto-created - # We just need to create our answer + # HeyGen's offer includes tracks for video/audio they send us + # Check transceivers to see if we have an audio sender logger.debug(f"Transceivers after setRemoteDescription: {len(self.pc.getTransceivers())}") - # Find and store the audio sender so we can send audio to HeyGen later - for sender in self.pc.getSenders(): - if sender.track and sender.track.kind == "audio": - self._audio_sender = sender - logger.debug("Found audio sender for HeyGen") - break + # Find the audio transceiver and modify it to allow sending + logger.info(f"🔍 Checking {len(self.pc.getTransceivers())} transceivers for audio") + for idx, transceiver in enumerate(self.pc.getTransceivers()): + logger.info(f" Transceiver {idx}: mid={transceiver.mid}, direction={transceiver.direction}") + if transceiver.receiver and transceiver.receiver.track: + logger.info(f" Receiver track: kind={transceiver.receiver.track.kind}") + if transceiver.receiver.track.kind == "audio": + # Found the audio transceiver - modify its direction to allow sending + logger.info(f" 🔧 Modifying audio transceiver direction from {transceiver.direction} to sendrecv") + transceiver.direction = "sendrecv" + self._audio_sender = transceiver.sender + logger.info("✅ Audio transceiver modified for lip-sync") + break + + # If no audio transceiver found, log warning + if not self._audio_sender: + logger.warning("âš ī¸ No audio transceiver found - 
lip-sync may not work") # Create our answer answer = await self.pc.createAnswer() @@ -236,10 +247,22 @@ async def send_audio_track(self, audio_track: MediaStreamTrack) -> None: return try: - await self._audio_sender.replaceTrack(audio_track) - logger.info("🎤 Audio track sent to HeyGen for lip-sync") + logger.info(f"🎤 Attempting to send audio track to HeyGen: {audio_track}") + logger.info(f" Audio track kind: {audio_track.kind if hasattr(audio_track, 'kind') else 'unknown'}") + logger.info(f" Current sender track: {self._audio_sender.track}") + + # replaceTrack is not async in aiortc + result = self._audio_sender.replaceTrack(audio_track) + # If it returns a coroutine, await it; otherwise just use the result + if hasattr(result, '__await__'): + await result + + logger.info(f"✅ Audio track successfully set on sender") + logger.info(f" New sender track: {self._audio_sender.track}") except Exception as e: - logger.error(f"Failed to send audio track to HeyGen: {e}") + logger.error(f"❌ Failed to send audio track to HeyGen: {e}") + import traceback + logger.error(traceback.format_exc()) @property def is_connected(self) -> bool: From 7f2983a66ec102be8ed06a2fa655aae3e84a35bf Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Thu, 30 Oct 2025 15:51:54 +0100 Subject: [PATCH 05/20] Clean up HeyGen implementation and fix duplicate text sending - Removed obsolete heygen_audio_track.py (from old audio-based approach) - Removed unused _audio_sender field and transceiver logic - Removed unused _original_audio_write field - Simplified audio track management - Moved imports to top of file - Updated docstrings to reflect text-based lip-sync approach Fixed duplicate text sending issue: - Added deduplication tracking with _sent_texts set - Added minimum length filter (>15 chars) to prevent tiny fragments - Simplified event handling to avoid duplicate subscriptions - Proper buffer management between chunk and complete events Known limitation: ~3-4 second audio delay is inherent to HeyGen 
platform --- .../vision_agents/core/agents/agents.py | 10 +- plugins/heygen/example/avatar_example.py | 26 +- .../example/avatar_streaming_llm_example.py | 73 ++++ .../plugins/heygen/heygen_audio_track.py | 98 ------ .../plugins/heygen/heygen_avatar_publisher.py | 318 ++++++++++++++---- .../plugins/heygen/heygen_rtc_manager.py | 71 ++-- .../plugins/heygen/heygen_session.py | 52 +++ .../plugins/heygen/heygen_video_track.py | 4 +- 8 files changed, 425 insertions(+), 227 deletions(-) create mode 100644 plugins/heygen/example/avatar_streaming_llm_example.py delete mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py diff --git a/agents-core/vision_agents/core/agents/agents.py b/agents-core/vision_agents/core/agents/agents.py index 04519143..13853dd3 100644 --- a/agents-core/vision_agents/core/agents/agents.py +++ b/agents-core/vision_agents/core/agents/agents.py @@ -1008,10 +1008,13 @@ def publish_audio(self) -> bool: """Whether the agent should publish an outbound audio track. Returns: - True if TTS is configured or when in Realtime mode. + True if TTS is configured, when in Realtime mode, or if there are audio publishers. 
""" if self.tts is not None or self.realtime_mode: return True + # Also publish audio if there are audio publishers (e.g., HeyGen avatar) + if self.audio_publishers: + return True return False @property @@ -1137,6 +1140,11 @@ def _prepare_rtc(self): if self.realtime_mode and isinstance(self.llm, Realtime): self._audio_track = self.llm.output_track self.logger.info("đŸŽĩ Using Realtime provider output track for audio") + elif self.audio_publishers: + # Get the first audio publisher to create the track + audio_publisher = self.audio_publishers[0] + self._audio_track = audio_publisher.publish_audio_track() + self.logger.info("đŸŽĩ Audio track initialized from audio publisher") else: # Default to WebRTC-friendly format unless configured differently framerate = 48000 diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index 2d95421f..b3beef83 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -3,20 +3,23 @@ from dotenv import load_dotenv from vision_agents.core import User, Agent -from vision_agents.plugins import getstream, gemini, heygen +from vision_agents.plugins import getstream, gemini, heygen, deepgram load_dotenv() async def start_avatar_agent() -> None: - """Start an agent with HeyGen avatar using Realtime LLM. + """Start an agent with HeyGen avatar using streaming LLM. This example demonstrates how to use HeyGen's avatar streaming - with Gemini Realtime. The avatar will lip-sync with the audio - generated by the Realtime LLM. + with a regular streaming LLM. This approach has much lower latency + than using Realtime LLMs because text goes directly to HeyGen + without any transcription round-trip. + + HeyGen handles all TTS and lip-sync based on the LLM's text output. 
""" - # Create agent with HeyGen avatar and Realtime LLM + # Create agent with HeyGen avatar and streaming LLM agent = Agent( edge=getstream.Edge(), agent_user=User( @@ -29,15 +32,20 @@ async def start_avatar_agent() -> None: "Don't use special characters or formatting." ), - # Use Gemini Realtime (includes built-in TTS and STT) - llm=gemini.Realtime(fps=2), + # Use regular streaming LLM (not Realtime) for lower latency + llm=gemini.LLM("gemini-2.0-flash-exp"), + + # Add STT for speech input + stt=deepgram.STT(), # Add HeyGen avatar as a video publisher + # Note: mute_llm_audio is not needed since streaming LLM doesn't produce audio processors=[ heygen.AvatarPublisher( avatar_id="default", # Use your HeyGen avatar ID quality="high", # Video quality: "low", "medium", "high" resolution=(1920, 1080), # Output resolution + mute_llm_audio=False, # Not needed for streaming LLM ) ] ) @@ -47,7 +55,7 @@ async def start_avatar_agent() -> None: # Join the call with await agent.join(call): - # Set agent reference on avatar publisher for audio event subscription + # Set agent reference on avatar publisher for text event subscription avatar_publisher = agent.video_publishers[0] if hasattr(avatar_publisher, 'set_agent'): avatar_publisher.set_agent(agent) @@ -55,7 +63,7 @@ async def start_avatar_agent() -> None: # Open demo UI await agent.edge.open_demo(call) - # Keep the call running - Realtime mode handles conversation automatically + # Keep the call running await agent.finish() diff --git a/plugins/heygen/example/avatar_streaming_llm_example.py b/plugins/heygen/example/avatar_streaming_llm_example.py new file mode 100644 index 00000000..8bd68998 --- /dev/null +++ b/plugins/heygen/example/avatar_streaming_llm_example.py @@ -0,0 +1,73 @@ +import asyncio +from uuid import uuid4 +from dotenv import load_dotenv + +from vision_agents.core import User, Agent +from vision_agents.plugins import getstream, gemini, heygen, deepgram + +load_dotenv() + + +async def 
start_avatar_agent_streaming() -> None: + """Start an agent with HeyGen avatar using streaming (non-Realtime) LLM. + + This example demonstrates how to use HeyGen's avatar streaming + with a regular streaming LLM (gemini.LLM) + STT. HeyGen will handle + both TTS and video generation based on the LLM's text output. + + This approach has lower latency than Realtime LLMs because: + - Text is sent to HeyGen immediately as it's generated + - No transcription round-trip (LLM → audio → transcription → HeyGen) + - HeyGen handles TTS and lip-sync simultaneously + """ + + # Create agent with HeyGen avatar and streaming LLM + agent = Agent( + edge=getstream.Edge(), + agent_user=User( + name="AI Assistant with Avatar", + id="agent" + ), + instructions=( + "You're a friendly and helpful AI assistant. " + "Keep your responses conversational and engaging. " + "Don't use special characters or formatting." + ), + + # Use regular streaming LLM (not Realtime) + llm=gemini.LLM("gemini-2.0-flash-exp"), + + # Add STT for speech input + stt=deepgram.STT(), + + # Add HeyGen avatar as a video publisher + # Note: mute_llm_audio is not needed here since gemini.LLM doesn't produce audio + processors=[ + heygen.AvatarPublisher( + avatar_id="default", # Use your HeyGen avatar ID + quality="high", # Video quality: "low", "medium", "high" + resolution=(1920, 1080), # Output resolution + ) + ] + ) + + # Create a call + call = agent.edge.client.video.call("default", str(uuid4())) + + # Join the call + with await agent.join(call): + # Set agent reference on avatar publisher for text event subscription + avatar_publisher = agent.video_publishers[0] + if hasattr(avatar_publisher, 'set_agent'): + avatar_publisher.set_agent(agent) + + # Open demo UI + await agent.edge.open_demo(call) + + # Keep the call running + await agent.finish() + + +if __name__ == "__main__": + asyncio.run(start_avatar_agent_streaming()) + diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py 
b/plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py deleted file mode 100644 index f53a5399..00000000 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_audio_track.py +++ /dev/null @@ -1,98 +0,0 @@ -"""Custom audio track for sending audio to HeyGen for lip-sync.""" - -import asyncio -import logging -from typing import Optional -from fractions import Fraction - -import av -import numpy as np -from aiortc import AudioStreamTrack - -logger = logging.getLogger(__name__) - - -class HeyGenAudioTrack(AudioStreamTrack): - """Audio track that accepts PCM data and produces frames for WebRTC. - - This track receives audio data from the Realtime LLM and produces - audio frames that can be sent to HeyGen via WebRTC for lip-sync. - """ - - kind = "audio" - - def __init__(self, sample_rate: int = 24000): - """Initialize the audio track. - - Args: - sample_rate: Sample rate for audio frames (default: 24000 for Gemini). - """ - super().__init__() - self._sample_rate = sample_rate - self._ts = 0 - self._latest_chunk: Optional[bytes] = None - self._silence_cache: dict[int, np.ndarray] = {} - logger.info(f"🎤 HeyGenAudioTrack initialized at {sample_rate}Hz") - - def write_audio(self, pcm_data: bytes) -> None: - """Write PCM audio data to be sent to HeyGen. - - Args: - pcm_data: Raw PCM16 audio data from the LLM. - """ - if not pcm_data: - return - self._latest_chunk = bytes(pcm_data) - logger.debug(f"âœī¸ Audio data written: {len(pcm_data)} bytes") - - async def recv(self) -> av.AudioFrame: - """Receive the next audio frame for WebRTC transmission. - - Returns: - Audio frame to send to HeyGen. 
- """ - # Pace at 20ms per frame (50 fps) - await asyncio.sleep(0.02) - - sr = self._sample_rate - samples_per_frame = int(0.02 * sr) # 20ms worth of samples - - chunk = self._latest_chunk - if chunk: - logger.debug(f"đŸŽ™ī¸ recv() producing frame with audio data ({len(chunk)} bytes)") - if chunk: - # Consume and clear the latest pushed chunk - self._latest_chunk = None - arr = np.frombuffer(chunk, dtype=np.int16) - - # Ensure mono channel - if arr.ndim == 1: - samples = arr.reshape(1, -1) - else: - samples = arr[:1, :] - - # Pad or truncate to exactly one 20ms frame - needed = samples_per_frame - have = samples.shape[1] - if have < needed: - pad = np.zeros((1, needed - have), dtype=np.int16) - samples = np.concatenate([samples, pad], axis=1) - elif have > needed: - samples = samples[:, :needed] - else: - # Generate silence when no audio data is available - cached = self._silence_cache.get(sr) - if cached is None: - cached = np.zeros((1, samples_per_frame), dtype=np.int16) - self._silence_cache[sr] = cached - samples = cached - - # Create audio frame - frame = av.AudioFrame.from_ndarray(samples, format="s16", layout="mono") - frame.sample_rate = sr - frame.pts = self._ts - frame.time_base = Fraction(1, sr) - self._ts += samples.shape[1] - - return frame - diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index 16c1a9bc..e9da7551 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -2,9 +2,13 @@ import logging from typing import Optional, Any, Tuple +import numpy as np +from getstream.video.rtc import audio_track + from vision_agents.core.processors.base_processor import ( AudioVideoProcessor, VideoPublisherMixin, + AudioPublisherMixin, ) from .heygen_rtc_manager import HeyGenRTCManager @@ -13,20 +17,21 @@ logger = logging.getLogger(__name__) -class 
AvatarPublisher(AudioVideoProcessor, VideoPublisherMixin): - """HeyGen avatar video publisher. +class AvatarPublisher(AudioVideoProcessor, VideoPublisherMixin, AudioPublisherMixin): + """HeyGen avatar video and audio publisher. - Publishes video of a HeyGen avatar that lip-syncs to audio input. + Publishes video of a HeyGen avatar that lip-syncs based on LLM text output. Can be used as a processor in the Vision Agents framework to add realistic avatar video to AI agents. + HeyGen handles TTS internally, so no separate TTS is needed. + Example: agent = Agent( edge=getstream.Edge(), agent_user=User(name="Avatar AI"), instructions="Be helpful and friendly", llm=gemini.LLM("gemini-2.0-flash"), - tts=cartesia.TTS(), stt=deepgram.STT(), processors=[ heygen.AvatarPublisher( @@ -44,6 +49,7 @@ def __init__( resolution: Tuple[int, int] = (1920, 1080), api_key: Optional[str] = None, interval: int = 0, + mute_llm_audio: bool = True, **kwargs, ): """Initialize the HeyGen avatar publisher. @@ -54,11 +60,13 @@ def __init__( resolution: Output video resolution (width, height). api_key: HeyGen API key. Uses HEYGEN_API_KEY env var if not provided. interval: Processing interval (not used, kept for compatibility). + mute_llm_audio: If True, mutes the Realtime LLM's audio output so only + HeyGen's video (with audio) is heard. Default: True. **kwargs: Additional arguments passed to parent class. 
""" super().__init__( interval=interval, - receive_audio=True, # Receive audio to forward to HeyGen for lip-sync + receive_audio=False, # We send text to HeyGen, not audio receive_video=False, **kwargs ) @@ -67,6 +75,7 @@ def __init__( self.quality = quality self.resolution = resolution self.api_key = api_key + self.mute_llm_audio = mute_llm_audio # WebRTC manager for HeyGen connection self.rtc_manager = HeyGenRTCManager( @@ -81,21 +90,38 @@ def __init__( height=resolution[1], ) + # Audio track for publishing HeyGen's audio + # Create it immediately so the agent can detect it during initialization + self._audio_track = audio_track.AudioStreamTrack( + framerate=48000, stereo=True + ) + # Connection state self._connected = False self._connection_task: Optional[asyncio.Task] = None - self._audio_track_set = False self._agent = None # Will be set by the agent - # Create a custom audio track for HeyGen that we can write to - from .heygen_audio_track import HeyGenAudioTrack - self._heygen_audio_track = HeyGenAudioTrack(sample_rate=24000) + # Text buffer for accumulating LLM response chunks before sending to HeyGen + self._text_buffer = "" + self._current_response_id: Optional[str] = None + self._sent_texts: set = set() # Track sent texts to avoid duplicates + + # Audio forwarding state (for selective muting of Realtime LLM audio) + self._forwarding_audio = False logger.info( f"🎭 HeyGen AvatarPublisher initialized " f"(avatar: {avatar_id}, quality: {quality}, resolution: {resolution})" ) + def publish_audio_track(self): + """Return the audio track for publishing HeyGen's audio. + + This method is called by the Agent to get the audio track that will + be published to the call. HeyGen's audio will be forwarded to this track. + """ + return self._audio_track + def set_agent(self, agent: Any) -> None: """Set the agent reference for event subscription. 
@@ -106,48 +132,127 @@ def set_agent(self, agent: Any) -> None: """ self._agent = agent logger.info("🔗 Agent reference set for HeyGen avatar publisher") + + # Mute the Realtime LLM's audio if requested + if self.mute_llm_audio: + self._mute_realtime_llm_audio() + + # Subscribe to text events immediately when agent is set + self._subscribe_to_text_events() async def _connect_to_heygen(self) -> None: - """Establish connection to HeyGen and start receiving video.""" + """Establish connection to HeyGen and start receiving video and audio.""" try: - # Set up video callback before connecting + # Set up video and audio callbacks before connecting self.rtc_manager.set_video_callback(self._on_video_track) + self.rtc_manager.set_audio_callback(self._on_audio_track) # Connect to HeyGen await self.rtc_manager.connect() self._connected = True logger.info("✅ Connected to HeyGen, avatar streaming active") - - # Subscribe to audio output events from the LLM for lip-sync - self._subscribe_to_audio_events() except Exception as e: logger.error(f"❌ Failed to connect to HeyGen: {e}") self._connected = False raise - def _subscribe_to_audio_events(self) -> None: - """Subscribe to audio output events from the LLM.""" + def _subscribe_to_text_events(self) -> None: + """Subscribe to text output events from the LLM. + + HeyGen requires text input (not audio) for proper lip-sync. + We listen to the LLM's text output and send it to HeyGen's task API. 
+ """ try: - # Import the event type - from vision_agents.core.llm.events import RealtimeAudioOutputEvent + # Import the event types + from vision_agents.core.llm.events import ( + LLMResponseChunkEvent, + LLMResponseCompletedEvent, + RealtimeAgentSpeechTranscriptionEvent, + ) - # Get the agent's event manager - # Note: This will be set when the processor is attached to an agent - if hasattr(self, '_agent') and self._agent: - @self._agent.events.subscribe - async def on_audio_output(event: RealtimeAudioOutputEvent): - logger.debug(f"đŸ“ĸ Received audio output event: {len(event.audio_data)} bytes at {event.sample_rate}Hz") - await self._on_audio_output(event.audio_data, event.sample_rate) - logger.info("🎧 Subscribed to LLM audio output events for lip-sync") + # Get the LLM's event manager (events are emitted by the LLM, not the agent) + if hasattr(self, '_agent') and self._agent and hasattr(self._agent, 'llm'): + @self._agent.llm.events.subscribe + async def on_text_chunk(event: LLMResponseChunkEvent): + """Handle streaming text chunks from the LLM.""" + logger.debug(f"📝 HeyGen received text chunk: delta='{event.delta}'") + if event.delta: + await self._on_text_chunk(event.delta, event.item_id) + + @self._agent.llm.events.subscribe + async def on_text_complete(event: LLMResponseCompletedEvent): + """Handle end of LLM response - send any remaining buffered text.""" + # Send any remaining buffered text + if self._text_buffer.strip(): + text_to_send = self._text_buffer.strip() + if text_to_send not in self._sent_texts: + await self._send_text_to_heygen(text_to_send) + self._sent_texts.add(text_to_send) + self._text_buffer = "" + # Reset for next response + self._current_response_id = None + self._sent_texts.clear() + + @self._agent.llm.events.subscribe + async def on_agent_speech(event: RealtimeAgentSpeechTranscriptionEvent): + """Handle agent speech transcription from Realtime LLMs. 
+ + This is the primary path for Gemini Realtime which transcribes + the agent's speech output as text. + """ + logger.debug(f"📝 HeyGen received agent speech: text='{event.text}'") + if event.text: + # Send directly to HeyGen - this is the complete utterance + await self._send_text_to_heygen(event.text) - # Also log what events are registered - logger.info(f" Event manager has {len(self._agent.events._handlers)} event handlers") + logger.info("📝 Subscribed to LLM text output events for HeyGen lip-sync") else: - logger.warning("âš ī¸ Cannot subscribe to audio events - no agent attached yet") + logger.warning("âš ī¸ Cannot subscribe to text events - no agent or LLM attached yet") + except Exception as e: + logger.error(f"Failed to subscribe to text events: {e}") + import traceback + logger.error(traceback.format_exc()) + + def _mute_realtime_llm_audio(self) -> None: + """Mute the Realtime LLM's audio output. + + When using HeyGen, we want HeyGen to handle all audio (with lip-sync), + so we mute the LLM's native audio output to avoid duplicated/overlapping audio. + + This works by intercepting writes to the LLM's output_track and only blocking + writes that come from the LLM itself (not from HeyGen forwarding). 
+ """ + try: + from vision_agents.core.llm.realtime import Realtime + + if not hasattr(self, '_agent') or not self._agent: + logger.warning("âš ī¸ Cannot mute LLM audio - no agent set") + return + + if not hasattr(self._agent, 'llm') or not isinstance(self._agent.llm, Realtime): + logger.info("â„šī¸ LLM is not a Realtime LLM - no audio to mute") + return + + # Store the original write method + original_write = self._agent.llm.output_track.write + + # Create a selective write method + async def selective_write(audio_data: bytes) -> None: + """Only allow writes from HeyGen forwarding, block LLM writes.""" + if self._forwarding_audio: + # This is from HeyGen - allow it + await original_write(audio_data) + # else: This is from the Realtime LLM - block it + + # Replace the write method + self._agent.llm.output_track.write = selective_write + + logger.info("🔇 Muted Realtime LLM audio output (HeyGen will provide audio)") + except Exception as e: - logger.error(f"Failed to subscribe to audio events: {e}") + logger.error(f"Failed to mute LLM audio: {e}") import traceback logger.error(traceback.format_exc()) @@ -160,58 +265,131 @@ async def _on_video_track(self, track: Any) -> None: logger.info("📹 Received video track from HeyGen, starting frame forwarding") await self._video_track.start_receiving(track) - async def _setup_audio_forwarding(self) -> None: - """Set up audio forwarding from agent to HeyGen for lip-sync.""" - if self._audio_track_set: - return # Already set up + async def _on_audio_track(self, track: Any) -> None: + """Callback when audio track is received from HeyGen. - logger.info("🎤 Setting up audio forwarding to HeyGen for lip-sync") + HeyGen provides audio with lip-synced TTS. We forward this audio + to the agent's audio track so it gets published to the call. 
- # Wait for HeyGen connection - if not self._connected: - if self._connection_task: - try: - await asyncio.wait_for(self._connection_task, timeout=10.0) - except asyncio.TimeoutError: - logger.error("Timeout waiting for HeyGen connection") - return - else: - logger.error("HeyGen connection not started") - return + Args: + track: Incoming audio track from HeyGen's WebRTC connection. + """ + logger.info("🔊 Received audio track from HeyGen, starting audio forwarding") - # Set our custom audio track on the HeyGen sender - await self.rtc_manager.send_audio_track(self._heygen_audio_track) - self._audio_track_set = True - logger.info("✅ Audio track set up for HeyGen lip-sync") - - async def _on_audio_output(self, audio_data: bytes, sample_rate: int) -> None: - """Handle audio output from the LLM and forward to HeyGen. + # Forward audio frames from HeyGen to our audio track + asyncio.create_task(self._forward_audio_frames(track, self._audio_track)) + + async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> None: + """Forward audio frames from HeyGen to agent's audio track. Args: - audio_data: Raw PCM audio data from the LLM. - sample_rate: Sample rate of the audio data. + source_track: Audio track from HeyGen. + dest_track: Agent's audio track to write to. 
""" - logger.debug(f"đŸŽĩ _on_audio_output called: {len(audio_data)} bytes at {sample_rate}Hz") + try: + logger.info("🔊 Starting HeyGen audio frame forwarding") + frame_count = 0 + while True: + try: + # Read audio frame from HeyGen + frame = await source_track.recv() + frame_count += 1 + + # Convert frame to bytes and write to agent's audio track + if hasattr(frame, 'to_ndarray'): + audio_array = frame.to_ndarray() + + # Convert mono to stereo if needed (agent track expects stereo) + # HeyGen sends mono (shape=(1, samples)), we need interleaved stereo + if audio_array.shape[0] == 1: + # Flatten to 1D array of samples + mono_samples = audio_array.flatten() + + # Create stereo by interleaving each mono sample + stereo_samples = np.repeat(mono_samples, 2) + audio_bytes = stereo_samples.tobytes() + else: + # Already multi-channel, just flatten and convert + audio_bytes = audio_array.flatten().tobytes() + + # Set flag to allow HeyGen audio through the muted track + self._forwarding_audio = True + await dest_track.write(audio_bytes) + self._forwarding_audio = False + else: + logger.warning("âš ī¸ Received frame without to_ndarray() method") + + except Exception as e: + if "ended" in str(e).lower() or "closed" in str(e).lower(): + logger.info(f"🔊 HeyGen audio track ended (forwarded {frame_count} frames)") + break + else: + logger.error(f"❌ Error forwarding audio frame #{frame_count}: {e}") + import traceback + logger.error(traceback.format_exc()) + break + + except Exception as e: + logger.error(f"❌ Error in audio forwarding loop: {e}") + import traceback + logger.error(traceback.format_exc()) + + async def _on_text_chunk(self, text_delta: str, item_id: Optional[str]) -> None: + """Handle text chunk from the LLM. 
- if not self._audio_track_set: - # Set up audio forwarding on first audio output - logger.info("🔧 Setting up audio forwarding on first audio output") - await self._setup_audio_forwarding() + Accumulates text chunks until a complete sentence or response is ready, + then sends to HeyGen for lip-sync. - # Write audio data to our custom track for HeyGen - logger.info(f"âœī¸ Writing {len(audio_data)} bytes to HeyGen audio track") - self._heygen_audio_track.write_audio(audio_data) - - def set_agent_audio_track(self, audio_track: Any) -> None: - """Set the agent's audio track for forwarding to HeyGen. + Args: + text_delta: The text chunk/delta from the LLM. + item_id: The response item ID. + """ + # If this is a new response, reset the buffer and sent tracking + if item_id != self._current_response_id: + if self._text_buffer: + # Send any accumulated text from previous response + await self._send_text_to_heygen(self._text_buffer.strip()) + self._text_buffer = "" + self._current_response_id = item_id + self._sent_texts.clear() + + # Accumulate text + self._text_buffer += text_delta - DEPRECATED: This method is no longer needed. Audio is now forwarded - via event listening instead of track sharing. + # Send when we have a complete sentence (ending with period, !, or ?) 
+ # But only if it's substantial enough (> 15 chars) to avoid sending tiny fragments + # Don't send on commas/semicolons to reduce repetition + if any(self._text_buffer.rstrip().endswith(p) for p in ['.', '!', '?']): + text_to_send = self._text_buffer.strip() + # Only send if it's substantial (>15 chars) and not already sent + if text_to_send and len(text_to_send) > 15 and text_to_send not in self._sent_texts: + await self._send_text_to_heygen(text_to_send) + self._sent_texts.add(text_to_send) + self._text_buffer = "" # Clear buffer after sending + elif text_to_send in self._sent_texts: + self._text_buffer = "" # Clear buffer to avoid re-sending + + async def _send_text_to_heygen(self, text: str) -> None: + """Send text to HeyGen for the avatar to speak with lip-sync. Args: - audio_track: The agent's audio output track (unused). + text: The text for the avatar to speak. """ - logger.warning("set_agent_audio_track is deprecated - audio forwarding is automatic via events") + if not text: + return + + if not self._connected: + logger.warning("Cannot send text to HeyGen - not connected") + return + + try: + logger.info(f"📤 Sending text to HeyGen: '{text[:50]}...'") + await self.rtc_manager.send_text(text, task_type="repeat") + logger.debug("✅ Text sent to HeyGen successfully") + except Exception as e: + logger.error(f"❌ Failed to send text to HeyGen: {e}") + import traceback + logger.error(traceback.format_exc()) def publish_video_track(self): """Publish the HeyGen avatar video track. 
diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index 278ea9b5..3e61674a 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -46,8 +46,8 @@ def __init__( # Video track callback for receiving avatar video self._video_callback: Optional[Callable[[MediaStreamTrack], Any]] = None - # Audio track for sending to HeyGen - self._audio_sender: Optional[Any] = None + # Audio track callback for receiving avatar audio + self._audio_callback: Optional[Callable[[MediaStreamTrack], Any]] = None self._connected = False self._connection_ready = asyncio.Event() @@ -117,28 +117,9 @@ async def on_connection_state_change(): offer = RTCSessionDescription(sdp=offer_sdp, type="offer") await self.pc.setRemoteDescription(offer) - # HeyGen's offer includes tracks for video/audio they send us - # Check transceivers to see if we have an audio sender + # Log transceivers for debugging logger.debug(f"Transceivers after setRemoteDescription: {len(self.pc.getTransceivers())}") - # Find the audio transceiver and modify it to allow sending - logger.info(f"🔍 Checking {len(self.pc.getTransceivers())} transceivers for audio") - for idx, transceiver in enumerate(self.pc.getTransceivers()): - logger.info(f" Transceiver {idx}: mid={transceiver.mid}, direction={transceiver.direction}") - if transceiver.receiver and transceiver.receiver.track: - logger.info(f" Receiver track: kind={transceiver.receiver.track.kind}") - if transceiver.receiver.track.kind == "audio": - # Found the audio transceiver - modify its direction to allow sending - logger.info(f" 🔧 Modifying audio transceiver direction from {transceiver.direction} to sendrecv") - transceiver.direction = "sendrecv" - self._audio_sender = transceiver.sender - logger.info("✅ Audio transceiver modified for lip-sync") - break - - # If no audio transceiver 
found, log warning - if not self._audio_sender: - logger.warning("âš ī¸ No audio transceiver found - lip-sync may not work") - # Create our answer answer = await self.pc.createAnswer() await self.pc.setLocalDescription(answer) @@ -225,8 +206,12 @@ async def _handle_track(self, track: MediaStreamTrack) -> None: else: logger.warning("Video track received but no callback registered") elif track.kind == "audio": - # Audio track from HeyGen (avatar speech) - currently not used - logger.debug("Audio track received from HeyGen (ignored)") + # Audio track from HeyGen (avatar speech with lip-synced TTS) + logger.info("🔊 Audio track received from HeyGen") + if self._audio_callback: + await self._audio_callback(track) + else: + logger.warning("âš ī¸ Audio track received but no callback registered") def set_video_callback(self, callback: Callable[[MediaStreamTrack], Any]) -> None: """Set callback for handling incoming video track. @@ -236,33 +221,25 @@ def set_video_callback(self, callback: Callable[[MediaStreamTrack], Any]) -> Non """ self._video_callback = callback - async def send_audio_track(self, audio_track: MediaStreamTrack) -> None: - """Send audio track to HeyGen for lip-sync. + def set_audio_callback(self, callback: Callable[[MediaStreamTrack], Any]) -> None: + """Set callback for handling incoming audio track. Args: - audio_track: Audio track containing agent's speech. + callback: Async function to handle audio track. """ - if not self._audio_sender: - logger.warning("No audio sender available - connection may not be established") - return + self._audio_callback = callback + + async def send_text(self, text: str, task_type: str = "repeat") -> None: + """Send text to HeyGen for the avatar to speak with lip-sync. 
- try: - logger.info(f"🎤 Attempting to send audio track to HeyGen: {audio_track}") - logger.info(f" Audio track kind: {audio_track.kind if hasattr(audio_track, 'kind') else 'unknown'}") - logger.info(f" Current sender track: {self._audio_sender.track}") - - # replaceTrack is not async in aiortc - result = self._audio_sender.replaceTrack(audio_track) - # If it returns a coroutine, await it; otherwise just use the result - if hasattr(result, '__await__'): - await result - - logger.info(f"✅ Audio track successfully set on sender") - logger.info(f" New sender track: {self._audio_sender.track}") - except Exception as e: - logger.error(f"❌ Failed to send audio track to HeyGen: {e}") - import traceback - logger.error(traceback.format_exc()) + This is the correct way to achieve lip-sync with HeyGen - they handle + TTS and lip-sync server-side based on the text input. + + Args: + text: The text for the avatar to speak. + task_type: Either "repeat" or "talk" (default: "repeat"). + """ + await self.session_manager.send_task(text, task_type) @property def is_connected(self) -> bool: diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py index 917a4a52..aca8caa3 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py @@ -134,6 +134,58 @@ async def start_session(self, sdp_answer: Optional[str] = None) -> Dict[str, Any logger.error(f"❌ Failed to start HeyGen session: {e}") raise + async def send_task(self, text: str, task_type: str = "repeat") -> Dict[str, Any]: + """Send a text task to HeyGen for the avatar to speak. + + This is the proper way to achieve lip-sync with HeyGen - send text, + and HeyGen handles TTS and lip-sync server-side. + + Args: + text: The text for the avatar to speak. + task_type: Either "repeat" (avatar repeats text exactly) or + "talk" (processes through HeyGen's LLM first). 
+ + Returns: + Task response from HeyGen. + """ + if not self.session_id: + raise RuntimeError("Session not created. Call create_session() first.") + + if not self._http_session: + self._http_session = aiohttp.ClientSession() + + headers = { + "X-Api-Key": self.api_key, + "Content-Type": "application/json", + } + + payload = { + "session_id": self.session_id, + "text": text, + "task_type": task_type, + } + + try: + async with self._http_session.post( + f"{self.base_url}/streaming.task", + json=payload, + headers=headers, + ) as response: + if response.status != 200: + error_text = await response.text() + logger.warning( + f"Failed to send task to HeyGen: {response.status} - {error_text}" + ) + return {} + + data = await response.json() + logger.debug(f"📤 Sent text to HeyGen: '{text[:50]}...'") + return data + + except Exception as e: + logger.error(f"❌ Error sending task to HeyGen: {e}") + return {} + async def stop_session(self) -> None: """Stop the HeyGen streaming session.""" if not self.session_id: diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py index 1fcbc39b..ace06d5e 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py @@ -31,8 +31,8 @@ def __init__(self, width: int = 1920, height: int = 1080): self.width = width self.height = height - # Queue for incoming frames from HeyGen - self.frame_queue: LatestNQueue[av.VideoFrame] = LatestNQueue(maxlen=30) + # Queue for incoming frames from HeyGen - keep minimal for low latency + self.frame_queue: LatestNQueue[av.VideoFrame] = LatestNQueue(maxlen=2) # Create placeholder frame for when no frames are available placeholder = Image.new("RGB", (self.width, self.height), color=(30, 30, 40)) From 96f1cc94b9bb5784a6c78c6e5c784237edb2f2aa Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Mon, 3 Nov 2025 17:17:09 +0100 Subject: [PATCH 
06/20] PR cleanup --- aiortc | 1 - plugins/heygen/README.md | 12 +-- .../example/avatar_streaming_llm_example.py | 73 ------------------- .../plugins/heygen/heygen_avatar_publisher.py | 48 ++++++------ .../plugins/heygen/heygen_rtc_manager.py | 22 +++--- .../plugins/heygen/heygen_session.py | 16 ++-- .../plugins/heygen/heygen_video_track.py | 8 +- 7 files changed, 53 insertions(+), 127 deletions(-) delete mode 160000 aiortc delete mode 100644 plugins/heygen/example/avatar_streaming_llm_example.py diff --git a/aiortc b/aiortc deleted file mode 160000 index f84800ce..00000000 --- a/aiortc +++ /dev/null @@ -1 +0,0 @@ -Subproject commit f84800ce052de7d81a62b07c6f6094504c19b65f diff --git a/plugins/heygen/README.md b/plugins/heygen/README.md index 2d4cdd10..b7360c3a 100644 --- a/plugins/heygen/README.md +++ b/plugins/heygen/README.md @@ -56,10 +56,10 @@ async def start_avatar_agent(): call = agent.edge.client.video.call("default", str(uuid4())) with await agent.join(call): - # Enable lip-sync by forwarding agent's audio to HeyGen + # Set agent reference for event subscription avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent_audio_track') and agent._audio_track: - avatar_publisher.set_agent_audio_track(agent._audio_track) + if hasattr(avatar_publisher, 'set_agent'): + avatar_publisher.set_agent(agent) await agent.edge.open_demo(call) await agent.simple_response("Hello! 
I'm your AI assistant with an avatar.") @@ -112,10 +112,10 @@ agent = Agent( call = agent.edge.client.video.call("default", str(uuid4())) with await agent.join(call): - # Enable lip-sync + # Set agent reference for event subscription avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent_audio_track') and agent._audio_track: - avatar_publisher.set_agent_audio_track(agent._audio_track) + if hasattr(avatar_publisher, 'set_agent'): + avatar_publisher.set_agent(agent) await agent.finish() ``` diff --git a/plugins/heygen/example/avatar_streaming_llm_example.py b/plugins/heygen/example/avatar_streaming_llm_example.py deleted file mode 100644 index 8bd68998..00000000 --- a/plugins/heygen/example/avatar_streaming_llm_example.py +++ /dev/null @@ -1,73 +0,0 @@ -import asyncio -from uuid import uuid4 -from dotenv import load_dotenv - -from vision_agents.core import User, Agent -from vision_agents.plugins import getstream, gemini, heygen, deepgram - -load_dotenv() - - -async def start_avatar_agent_streaming() -> None: - """Start an agent with HeyGen avatar using streaming (non-Realtime) LLM. - - This example demonstrates how to use HeyGen's avatar streaming - with a regular streaming LLM (gemini.LLM) + STT. HeyGen will handle - both TTS and video generation based on the LLM's text output. - - This approach has lower latency than Realtime LLMs because: - - Text is sent to HeyGen immediately as it's generated - - No transcription round-trip (LLM → audio → transcription → HeyGen) - - HeyGen handles TTS and lip-sync simultaneously - """ - - # Create agent with HeyGen avatar and streaming LLM - agent = Agent( - edge=getstream.Edge(), - agent_user=User( - name="AI Assistant with Avatar", - id="agent" - ), - instructions=( - "You're a friendly and helpful AI assistant. " - "Keep your responses conversational and engaging. " - "Don't use special characters or formatting." 
- ), - - # Use regular streaming LLM (not Realtime) - llm=gemini.LLM("gemini-2.0-flash-exp"), - - # Add STT for speech input - stt=deepgram.STT(), - - # Add HeyGen avatar as a video publisher - # Note: mute_llm_audio is not needed here since gemini.LLM doesn't produce audio - processors=[ - heygen.AvatarPublisher( - avatar_id="default", # Use your HeyGen avatar ID - quality="high", # Video quality: "low", "medium", "high" - resolution=(1920, 1080), # Output resolution - ) - ] - ) - - # Create a call - call = agent.edge.client.video.call("default", str(uuid4())) - - # Join the call - with await agent.join(call): - # Set agent reference on avatar publisher for text event subscription - avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent'): - avatar_publisher.set_agent(agent) - - # Open demo UI - await agent.edge.open_demo(call) - - # Keep the call running - await agent.finish() - - -if __name__ == "__main__": - asyncio.run(start_avatar_agent_streaming()) - diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index e9da7551..f78e538b 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -110,7 +110,7 @@ def __init__( self._forwarding_audio = False logger.info( - f"🎭 HeyGen AvatarPublisher initialized " + f"HeyGen AvatarPublisher initialized " f"(avatar: {avatar_id}, quality: {quality}, resolution: {resolution})" ) @@ -131,7 +131,7 @@ def set_agent(self, agent: Any) -> None: agent: The agent instance. 
""" self._agent = agent - logger.info("🔗 Agent reference set for HeyGen avatar publisher") + logger.info("Agent reference set for HeyGen avatar publisher") # Mute the Realtime LLM's audio if requested if self.mute_llm_audio: @@ -151,10 +151,10 @@ async def _connect_to_heygen(self) -> None: await self.rtc_manager.connect() self._connected = True - logger.info("✅ Connected to HeyGen, avatar streaming active") + logger.info("Connected to HeyGen, avatar streaming active") except Exception as e: - logger.error(f"❌ Failed to connect to HeyGen: {e}") + logger.error(f"Failed to connect to HeyGen: {e}") self._connected = False raise @@ -177,7 +177,7 @@ def _subscribe_to_text_events(self) -> None: @self._agent.llm.events.subscribe async def on_text_chunk(event: LLMResponseChunkEvent): """Handle streaming text chunks from the LLM.""" - logger.debug(f"📝 HeyGen received text chunk: delta='{event.delta}'") + logger.debug(f"HeyGen received text chunk: delta='{event.delta}'") if event.delta: await self._on_text_chunk(event.delta, event.item_id) @@ -202,14 +202,14 @@ async def on_agent_speech(event: RealtimeAgentSpeechTranscriptionEvent): This is the primary path for Gemini Realtime which transcribes the agent's speech output as text. 
""" - logger.debug(f"📝 HeyGen received agent speech: text='{event.text}'") + logger.debug(f"HeyGen received agent speech: text='{event.text}'") if event.text: # Send directly to HeyGen - this is the complete utterance await self._send_text_to_heygen(event.text) - logger.info("📝 Subscribed to LLM text output events for HeyGen lip-sync") + logger.info("Subscribed to LLM text output events for HeyGen lip-sync") else: - logger.warning("âš ī¸ Cannot subscribe to text events - no agent or LLM attached yet") + logger.warning("Cannot subscribe to text events - no agent or LLM attached yet") except Exception as e: logger.error(f"Failed to subscribe to text events: {e}") import traceback @@ -228,11 +228,11 @@ def _mute_realtime_llm_audio(self) -> None: from vision_agents.core.llm.realtime import Realtime if not hasattr(self, '_agent') or not self._agent: - logger.warning("âš ī¸ Cannot mute LLM audio - no agent set") + logger.warning("Cannot mute LLM audio - no agent set") return if not hasattr(self._agent, 'llm') or not isinstance(self._agent.llm, Realtime): - logger.info("â„šī¸ LLM is not a Realtime LLM - no audio to mute") + logger.info("LLM is not a Realtime LLM - no audio to mute") return # Store the original write method @@ -249,7 +249,7 @@ async def selective_write(audio_data: bytes) -> None: # Replace the write method self._agent.llm.output_track.write = selective_write - logger.info("🔇 Muted Realtime LLM audio output (HeyGen will provide audio)") + logger.info("Muted Realtime LLM audio output (HeyGen will provide audio)") except Exception as e: logger.error(f"Failed to mute LLM audio: {e}") @@ -262,7 +262,7 @@ async def _on_video_track(self, track: Any) -> None: Args: track: Incoming video track from HeyGen's WebRTC connection. 
""" - logger.info("📹 Received video track from HeyGen, starting frame forwarding") + logger.info("Received video track from HeyGen, starting frame forwarding") await self._video_track.start_receiving(track) async def _on_audio_track(self, track: Any) -> None: @@ -274,7 +274,7 @@ async def _on_audio_track(self, track: Any) -> None: Args: track: Incoming audio track from HeyGen's WebRTC connection. """ - logger.info("🔊 Received audio track from HeyGen, starting audio forwarding") + logger.info("Received audio track from HeyGen, starting audio forwarding") # Forward audio frames from HeyGen to our audio track asyncio.create_task(self._forward_audio_frames(track, self._audio_track)) @@ -287,7 +287,7 @@ async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> Non dest_track: Agent's audio track to write to. """ try: - logger.info("🔊 Starting HeyGen audio frame forwarding") + logger.info("Starting HeyGen audio frame forwarding") frame_count = 0 while True: try: @@ -317,20 +317,20 @@ async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> Non await dest_track.write(audio_bytes) self._forwarding_audio = False else: - logger.warning("âš ī¸ Received frame without to_ndarray() method") + logger.warning("Received frame without to_ndarray() method") except Exception as e: if "ended" in str(e).lower() or "closed" in str(e).lower(): - logger.info(f"🔊 HeyGen audio track ended (forwarded {frame_count} frames)") + logger.info(f"HeyGen audio track ended (forwarded {frame_count} frames)") break else: - logger.error(f"❌ Error forwarding audio frame #{frame_count}: {e}") + logger.error(f"Error forwarding audio frame #{frame_count}: {e}") import traceback logger.error(traceback.format_exc()) break except Exception as e: - logger.error(f"❌ Error in audio forwarding loop: {e}") + logger.error(f"Error in audio forwarding loop: {e}") import traceback logger.error(traceback.format_exc()) @@ -383,11 +383,11 @@ async def _send_text_to_heygen(self, text: 
str) -> None: return try: - logger.info(f"📤 Sending text to HeyGen: '{text[:50]}...'") + logger.info(f"Sending text to HeyGen: '{text[:50]}...'") await self.rtc_manager.send_text(text, task_type="repeat") - logger.debug("✅ Text sent to HeyGen successfully") + logger.debug("Text sent to HeyGen successfully") except Exception as e: - logger.error(f"❌ Failed to send text to HeyGen: {e}") + logger.error(f"Failed to send text to HeyGen: {e}") import traceback logger.error(traceback.format_exc()) @@ -404,7 +404,7 @@ def publish_video_track(self): if not self._connected and not self._connection_task: self._connection_task = asyncio.create_task(self._connect_to_heygen()) - logger.info("đŸŽĨ Publishing HeyGen avatar video track") + logger.info("Publishing HeyGen avatar video track") return self._video_track def state(self) -> dict: @@ -423,7 +423,7 @@ def state(self) -> dict: async def close(self) -> None: """Clean up resources and close connections.""" - logger.info("🔌 Closing HeyGen avatar publisher") + logger.info("Closing HeyGen avatar publisher") # Stop video track if self._video_track: @@ -442,5 +442,5 @@ async def close(self) -> None: pass self._connected = False - logger.info("✅ HeyGen avatar publisher closed") + logger.info("HeyGen avatar publisher closed") diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index 3e61674a..6e9876ac 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -88,7 +88,7 @@ async def connect(self) -> None: offer_sdp = None if not offer_sdp: - logger.error(f"❌ Unexpected SDP format. Type: {type(sdp_data)}") + logger.error(f"Unexpected SDP format. 
Type: {type(sdp_data)}") if isinstance(sdp_data, dict): logger.error(f"SDP dict keys: {list(sdp_data.keys())}") logger.error(f"SDP data: {str(sdp_data)[:200] if sdp_data else 'None'}") @@ -105,7 +105,7 @@ async def on_track(track: MediaStreamTrack): @self.pc.on("connectionstatechange") async def on_connection_state_change(): - logger.info(f"🔗 HeyGen connection state: {self.pc.connectionState}") + logger.info(f"HeyGen connection state: {self.pc.connectionState}") if self.pc.connectionState == "connected": self._connected = True self._connection_ready.set() @@ -131,10 +131,10 @@ async def on_connection_state_change(): # Wait for connection to be established await asyncio.wait_for(self._connection_ready.wait(), timeout=10.0) - logger.info("✅ HeyGen WebRTC connection established") + logger.info("HeyGen WebRTC connection established") except Exception as e: - logger.error(f"❌ Failed to connect to HeyGen: {e}") + logger.error(f"Failed to connect to HeyGen: {e}") raise def _parse_ice_servers(self, session_info: dict) -> list: @@ -158,7 +158,7 @@ def _parse_ice_servers(self, session_info: dict) -> list: ) if ice_server_configs and not isinstance(ice_server_configs, list): - logger.warning(f"âš ī¸ Unexpected ice_servers format: {type(ice_server_configs)}") + logger.warning(f"Unexpected ice_servers format: {type(ice_server_configs)}") ice_server_configs = [] for server_config in ice_server_configs: @@ -180,12 +180,12 @@ def _parse_ice_servers(self, session_info: dict) -> list: credential=credential, ) ) - logger.info(f"🧊 Added ICE server: {urls[0]}") + logger.info(f"Added ICE server: {urls[0]}") # When using LiveKit, ICE servers may be embedded in SDP # In that case, use public STUN as fallback if not ice_servers: - logger.info("â„šī¸ Using default STUN servers (LiveKit may provide its own via SDP)") + logger.info("Using default STUN servers (LiveKit may provide its own via SDP)") ice_servers.append( RTCIceServer(urls=["stun:stun.l.google.com:19302"]) ) @@ -198,7 +198,7 @@ 
async def _handle_track(self, track: MediaStreamTrack) -> None: Args: track: Incoming media track (audio or video). """ - logger.info(f"📡 Received track from HeyGen: {track.kind}") + logger.info(f"Received track from HeyGen: {track.kind}") if track.kind == "video": if self._video_callback: @@ -207,11 +207,11 @@ async def _handle_track(self, track: MediaStreamTrack) -> None: logger.warning("Video track received but no callback registered") elif track.kind == "audio": # Audio track from HeyGen (avatar speech with lip-synced TTS) - logger.info("🔊 Audio track received from HeyGen") + logger.info("Audio track received from HeyGen") if self._audio_callback: await self._audio_callback(track) else: - logger.warning("âš ī¸ Audio track received but no callback registered") + logger.warning("Audio track received but no callback registered") def set_video_callback(self, callback: Callable[[MediaStreamTrack], Any]) -> None: """Set callback for handling incoming video track. @@ -257,5 +257,5 @@ async def close(self) -> None: self._connected = False self._connection_ready.clear() - logger.info("🔌 HeyGen RTC connection closed") + logger.info("HeyGen RTC connection closed") diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py index aca8caa3..3565c7a6 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py @@ -76,11 +76,11 @@ async def create_session(self) -> Dict[str, Any]: self.session_info = data.get("data", {}) self.session_id = self.session_info.get("session_id") - logger.info(f"✅ HeyGen session created: {self.session_id}") + logger.info(f"HeyGen session created: {self.session_id}") return self.session_info except Exception as e: - logger.error(f"❌ Failed to create HeyGen session: {e}") + logger.error(f"Failed to create HeyGen session: {e}") raise async def start_session(self, sdp_answer: Optional[str] = 
None) -> Dict[str, Any]: @@ -127,11 +127,11 @@ async def start_session(self, sdp_answer: Optional[str] = None) -> Dict[str, Any ) data = await response.json() - logger.info(f"✅ HeyGen session started: {self.session_id}") + logger.info(f"HeyGen session started: {self.session_id}") return data except Exception as e: - logger.error(f"❌ Failed to start HeyGen session: {e}") + logger.error(f"Failed to start HeyGen session: {e}") raise async def send_task(self, text: str, task_type: str = "repeat") -> Dict[str, Any]: @@ -179,11 +179,11 @@ async def send_task(self, text: str, task_type: str = "repeat") -> Dict[str, Any return {} data = await response.json() - logger.debug(f"📤 Sent text to HeyGen: '{text[:50]}...'") + logger.debug(f"Sent text to HeyGen: '{text[:50]}...'") return data except Exception as e: - logger.error(f"❌ Error sending task to HeyGen: {e}") + logger.error(f"Error sending task to HeyGen: {e}") return {} async def stop_session(self) -> None: @@ -211,13 +211,13 @@ async def stop_session(self) -> None: headers=headers, ) as response: if response.status == 200: - logger.info(f"✅ HeyGen session stopped: {self.session_id}") + logger.info(f"HeyGen session stopped: {self.session_id}") else: logger.warning( f"Failed to stop HeyGen session: {response.status}" ) except Exception as e: - logger.error(f"❌ Error stopping HeyGen session: {e}") + logger.error(f"Error stopping HeyGen session: {e}") async def close(self) -> None: """Clean up session resources.""" diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py index ace06d5e..38f707ca 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py @@ -43,7 +43,7 @@ def __init__(self, width: int = 1920, height: int = 1080): self._receiving_task: Optional[asyncio.Task] = None self._source_track: Optional[MediaStreamTrack] = None - 
logger.info(f"đŸŽŦ HeyGenVideoTrack initialized ({width}x{height})") + logger.info(f"HeyGenVideoTrack initialized ({width}x{height})") async def start_receiving(self, source_track: MediaStreamTrack) -> None: """Start receiving frames from HeyGen's video track. @@ -57,7 +57,7 @@ async def start_receiving(self, source_track: MediaStreamTrack) -> None: self._source_track = source_track self._receiving_task = asyncio.create_task(self._receive_frames()) - logger.info("đŸ“Ĩ Started receiving frames from HeyGen") + logger.info("Started receiving frames from HeyGen") async def _receive_frames(self) -> None: """Continuously receive frames from HeyGen and add to queue.""" @@ -80,7 +80,7 @@ async def _receive_frames(self) -> None: self.frame_queue.put_latest_nowait(frame) logger.debug( - f"đŸ“Ĩ Received frame from HeyGen: {frame.width}x{frame.height}" + f"Received frame from HeyGen: {frame.width}x{frame.height}" ) except Exception as e: @@ -157,5 +157,5 @@ def stop(self) -> None: self._receiving_task = None super().stop() - logger.info("🛑 HeyGenVideoTrack stopped") + logger.info("HeyGenVideoTrack stopped") From c14b98c459aa84a96cdcac23bd9c6599089bcd83 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Mon, 3 Nov 2025 18:41:06 +0100 Subject: [PATCH 07/20] Auto-attach processors to agent (no more manual set_agent calls) - Add processor._attach_agent() lifecycle hook to Agent.__init__ - Rename HeyGen set_agent() -> _attach_agent() for consistency with LLM - Remove manual agent attachment from examples and docs - HeyGen now works like YOLO - just add to processors list Examples are now much cleaner: agent = Agent(processors=[heygen.AvatarPublisher()]) # That's it! No manual wiring needed. 
--- agents-core/vision_agents/core/agents/agents.py | 5 +++++ plugins/aws/example/uv.lock | 4 +++- plugins/heygen/README.md | 10 ---------- plugins/heygen/example/README.md | 6 ++++-- plugins/heygen/example/avatar_example.py | 5 ----- plugins/heygen/example/pyproject.toml | 2 ++ .../plugins/heygen/heygen_avatar_publisher.py | 6 +++--- 7 files changed, 17 insertions(+), 21 deletions(-) diff --git a/agents-core/vision_agents/core/agents/agents.py b/agents-core/vision_agents/core/agents/agents.py index 38a99bda..e537c7cc 100644 --- a/agents-core/vision_agents/core/agents/agents.py +++ b/agents-core/vision_agents/core/agents/agents.py @@ -215,6 +215,11 @@ def __init__( self.llm._attach_agent(self) + # Attach processors that need agent reference + for processor in self.processors: + if hasattr(processor, '_attach_agent'): + processor._attach_agent(self) + self.events.subscribe(self._on_vad_audio) self.events.subscribe(self._on_agent_say) # Initialize state variables diff --git a/plugins/aws/example/uv.lock b/plugins/aws/example/uv.lock index fad869b7..5c0123ac 100644 --- a/plugins/aws/example/uv.lock +++ b/plugins/aws/example/uv.lock @@ -2648,6 +2648,8 @@ requires-dist = [ { name = "vision-agents-plugins-gemini", marker = "extra == 'gemini'", editable = "../../gemini" }, { name = "vision-agents-plugins-getstream", marker = "extra == 'all-plugins'", editable = "../../getstream" }, { name = "vision-agents-plugins-getstream", marker = "extra == 'getstream'", editable = "../../getstream" }, + { name = "vision-agents-plugins-heygen", marker = "extra == 'all-plugins'", editable = "../../heygen" }, + { name = "vision-agents-plugins-heygen", marker = "extra == 'heygen'", editable = "../../heygen" }, { name = "vision-agents-plugins-kokoro", marker = "extra == 'all-plugins'", editable = "../../kokoro" }, { name = "vision-agents-plugins-kokoro", marker = "extra == 'kokoro'", editable = "../../kokoro" }, { name = "vision-agents-plugins-krisp", marker = "extra == 'all-plugins'", 
editable = "../../krisp" }, @@ -2665,7 +2667,7 @@ requires-dist = [ { name = "vision-agents-plugins-xai", marker = "extra == 'all-plugins'", editable = "../../xai" }, { name = "vision-agents-plugins-xai", marker = "extra == 'xai'", editable = "../../xai" }, ] -provides-extras = ["all-plugins", "anthropic", "cartesia", "deepgram", "dev", "elevenlabs", "gemini", "getstream", "kokoro", "krisp", "moonshine", "openai", "smart-turn", "ultralytics", "wizper", "xai"] +provides-extras = ["all-plugins", "anthropic", "cartesia", "deepgram", "dev", "elevenlabs", "gemini", "getstream", "heygen", "kokoro", "krisp", "moonshine", "openai", "smart-turn", "ultralytics", "wizper", "xai"] [[package]] name = "vision-agents-plugins-aws" diff --git a/plugins/heygen/README.md b/plugins/heygen/README.md index b7360c3a..0ae26514 100644 --- a/plugins/heygen/README.md +++ b/plugins/heygen/README.md @@ -56,11 +56,6 @@ async def start_avatar_agent(): call = agent.edge.client.video.call("default", str(uuid4())) with await agent.join(call): - # Set agent reference for event subscription - avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent'): - avatar_publisher.set_agent(agent) - await agent.edge.open_demo(call) await agent.simple_response("Hello! 
I'm your AI assistant with an avatar.") await agent.finish() @@ -112,11 +107,6 @@ agent = Agent( call = agent.edge.client.video.call("default", str(uuid4())) with await agent.join(call): - # Set agent reference for event subscription - avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent'): - avatar_publisher.set_agent(agent) - await agent.finish() ``` diff --git a/plugins/heygen/example/README.md b/plugins/heygen/example/README.md index 631d3309..830dd75c 100644 --- a/plugins/heygen/example/README.md +++ b/plugins/heygen/example/README.md @@ -28,14 +28,16 @@ Required API keys: ## Running the Example +From the project root: + ```bash -uv run avatar_example.py +uv run plugins/heygen/example/avatar_example.py ``` This will: 1. Start an AI agent with a HeyGen avatar 2. Open a demo UI in your browser -3. The avatar will greet you and be ready to chat +3. The avatar will speak and be ready to chat ## What's Happening diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index b3beef83..1683491b 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -55,11 +55,6 @@ async def start_avatar_agent() -> None: # Join the call with await agent.join(call): - # Set agent reference on avatar publisher for text event subscription - avatar_publisher = agent.video_publishers[0] - if hasattr(avatar_publisher, 'set_agent'): - avatar_publisher.set_agent(agent) - # Open demo UI await agent.edge.open_demo(call) diff --git a/plugins/heygen/example/pyproject.toml b/plugins/heygen/example/pyproject.toml index 4e1fdf61..ffdd3922 100644 --- a/plugins/heygen/example/pyproject.toml +++ b/plugins/heygen/example/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "vision-agents-plugins-heygen", "vision-agents-plugins-gemini", "vision-agents-plugins-getstream", + "vision-agents-plugins-deepgram", "python-dotenv", ] @@ -16,4 +17,5 @@ vision-agents = { workspace = true } 
vision-agents-plugins-heygen = { workspace = true } vision-agents-plugins-gemini = { workspace = true } vision-agents-plugins-getstream = { workspace = true } +vision-agents-plugins-deepgram = { workspace = true } diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index f78e538b..52249183 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -122,10 +122,10 @@ def publish_audio_track(self): """ return self._audio_track - def set_agent(self, agent: Any) -> None: - """Set the agent reference for event subscription. + def _attach_agent(self, agent: Any) -> None: + """Attach the agent reference for event subscription. - This is called by the agent when the processor is attached. + This is called automatically by the Agent during initialization. Args: agent: The agent instance. From 6188ed38fb48767536b6149841879a557d7622d3 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Mon, 3 Nov 2025 19:06:22 +0100 Subject: [PATCH 08/20] fixed audio duplication and sluggishness --- .../plugins/heygen/heygen_avatar_publisher.py | 82 +++++++++---------- 1 file changed, 40 insertions(+), 42 deletions(-) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index 52249183..6d7467fe 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -104,7 +104,7 @@ def __init__( # Text buffer for accumulating LLM response chunks before sending to HeyGen self._text_buffer = "" self._current_response_id: Optional[str] = None - self._sent_texts: set = set() # Track sent texts to avoid duplicates + self._all_sent_texts: set = set() # Track all sent texts to prevent duplicates # Audio 
forwarding state (for selective muting of Realtime LLM audio) self._forwarding_audio = False @@ -183,17 +183,37 @@ async def on_text_chunk(event: LLMResponseChunkEvent): @self._agent.llm.events.subscribe async def on_text_complete(event: LLMResponseCompletedEvent): - """Handle end of LLM response - send any remaining buffered text.""" - # Send any remaining buffered text - if self._text_buffer.strip(): - text_to_send = self._text_buffer.strip() - if text_to_send not in self._sent_texts: - await self._send_text_to_heygen(text_to_send) - self._sent_texts.add(text_to_send) - self._text_buffer = "" + """Handle end of LLM response - split into sentences and send each once.""" + if not self._text_buffer.strip(): + return + + # Split the complete response into sentences + import re + text = self._text_buffer.strip() + # Split on sentence boundaries but keep the punctuation + sentences = re.split(r'([.!?]+\s*)', text) + # Recombine sentences with their punctuation + full_sentences = [] + for i in range(0, len(sentences)-1, 2): + if sentences[i].strip(): + sentence = (sentences[i] + sentences[i+1] if i+1 < len(sentences) else sentences[i]).strip() + full_sentences.append(sentence) + # Handle last part if no punctuation + if sentences and sentences[-1].strip() and not any(sentences[-1].strip().endswith(p) for p in ['.', '!', '?']): + full_sentences.append(sentences[-1].strip()) + + # Send each sentence once if not already sent + for sentence in full_sentences: + if sentence and len(sentence) > 5: + if sentence not in self._all_sent_texts: + await self._send_text_to_heygen(sentence) + self._all_sent_texts.add(sentence) + else: + logger.debug(f"Skipping duplicate: '{sentence[:30]}...'") + # Reset for next response + self._text_buffer = "" self._current_response_id = None - self._sent_texts.clear() @self._agent.llm.events.subscribe async def on_agent_speech(event: RealtimeAgentSpeechTranscriptionEvent): @@ -298,19 +318,8 @@ async def _forward_audio_frames(self, source_track: 
Any, dest_track: Any) -> Non # Convert frame to bytes and write to agent's audio track if hasattr(frame, 'to_ndarray'): audio_array = frame.to_ndarray() - - # Convert mono to stereo if needed (agent track expects stereo) - # HeyGen sends mono (shape=(1, samples)), we need interleaved stereo - if audio_array.shape[0] == 1: - # Flatten to 1D array of samples - mono_samples = audio_array.flatten() - - # Create stereo by interleaving each mono sample - stereo_samples = np.repeat(mono_samples, 2) - audio_bytes = stereo_samples.tobytes() - else: - # Already multi-channel, just flatten and convert - audio_bytes = audio_array.flatten().tobytes() + # Pass raw audio data - AudioStreamTrack handles format conversion + audio_bytes = audio_array.tobytes() # Set flag to allow HeyGen audio through the muted track self._forwarding_audio = True @@ -337,8 +346,8 @@ async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> Non async def _on_text_chunk(self, text_delta: str, item_id: Optional[str]) -> None: """Handle text chunk from the LLM. - Accumulates text chunks until a complete sentence or response is ready, - then sends to HeyGen for lip-sync. + Accumulates text chunks. Does NOT send immediately - waits for completion event + to avoid sending partial/duplicate sentences. Args: text_delta: The text chunk/delta from the LLM. @@ -348,26 +357,16 @@ async def _on_text_chunk(self, text_delta: str, item_id: Optional[str]) -> None: if item_id != self._current_response_id: if self._text_buffer: # Send any accumulated text from previous response - await self._send_text_to_heygen(self._text_buffer.strip()) + text_to_send = self._text_buffer.strip() + if text_to_send and text_to_send not in self._all_sent_texts: + await self._send_text_to_heygen(text_to_send) + self._all_sent_texts.add(text_to_send) self._text_buffer = "" self._current_response_id = item_id - self._sent_texts.clear() - # Accumulate text + # Just accumulate text - don't send yet! 
+ # Wait for completion event to avoid sending partial sentences self._text_buffer += text_delta - - # Send when we have a complete sentence (ending with period, !, or ?) - # But only if it's substantial enough (> 15 chars) to avoid sending tiny fragments - # Don't send on commas/semicolons to reduce repetition - if any(self._text_buffer.rstrip().endswith(p) for p in ['.', '!', '?']): - text_to_send = self._text_buffer.strip() - # Only send if it's substantial (>15 chars) and not already sent - if text_to_send and len(text_to_send) > 15 and text_to_send not in self._sent_texts: - await self._send_text_to_heygen(text_to_send) - self._sent_texts.add(text_to_send) - self._text_buffer = "" # Clear buffer after sending - elif text_to_send in self._sent_texts: - self._text_buffer = "" # Clear buffer to avoid re-sending async def _send_text_to_heygen(self, text: str) -> None: """Send text to HeyGen for the avatar to speak with lip-sync. @@ -385,7 +384,6 @@ async def _send_text_to_heygen(self, text: str) -> None: try: logger.info(f"Sending text to HeyGen: '{text[:50]}...'") await self.rtc_manager.send_text(text, task_type="repeat") - logger.debug("Text sent to HeyGen successfully") except Exception as e: logger.error(f"Failed to send text to HeyGen: {e}") import traceback From 74aa6ff9bfd448a1313121ab5075d72b4c7aeaf3 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Mon, 3 Nov 2025 19:15:16 +0100 Subject: [PATCH 09/20] Fix video aspect ratio stretching - add letterboxing --- .../plugins/heygen/heygen_video_track.py | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py index 38f707ca..9339e1da 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py @@ -94,18 +94,38 @@ async def _receive_frames(self) -> None: 
logger.error(f"Fatal error in frame receiving: {e}") def _resize_frame(self, frame: av.VideoFrame) -> av.VideoFrame: - """Resize a video frame to match the track dimensions. + """Resize a video frame to match the track dimensions while maintaining aspect ratio. Args: frame: Input video frame. Returns: - Resized video frame. + Resized video frame with letterboxing if needed. """ try: img = frame.to_image() - resized = img.resize((self.width, self.height), Image.LANCZOS) - return av.VideoFrame.from_image(resized) + + # Calculate scaling to maintain aspect ratio + src_width, src_height = img.size + target_width, target_height = self.width, self.height + + # Calculate scale factor (fit within target dimensions) + scale = min(target_width / src_width, target_height / src_height) + new_width = int(src_width * scale) + new_height = int(src_height * scale) + + # Resize with aspect ratio maintained + resized = img.resize((new_width, new_height), Image.LANCZOS) + + # Create black background at target resolution + result = Image.new('RGB', (target_width, target_height), (0, 0, 0)) + + # Paste resized image centered + x_offset = (target_width - new_width) // 2 + y_offset = (target_height - new_height) // 2 + result.paste(resized, (x_offset, y_offset)) + + return av.VideoFrame.from_image(result) except Exception as e: logger.error(f"Error resizing frame: {e}") From f54c372786b28aaa199a8596a5bc3991742f0404 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 4 Nov 2025 10:26:47 +0100 Subject: [PATCH 10/20] fixed and simplified both implementations --- plugins/heygen/example/README.md | 86 ++++++++++++--- .../heygen/example/avatar_realtime_example.py | 65 +++++++++++ .../plugins/heygen/heygen_avatar_publisher.py | 104 +++++------------- 3 files changed, 164 insertions(+), 91 deletions(-) create mode 100644 plugins/heygen/example/avatar_realtime_example.py diff --git a/plugins/heygen/example/README.md b/plugins/heygen/example/README.md index 830dd75c..b922e7e2 100644 --- 
a/plugins/heygen/example/README.md +++ b/plugins/heygen/example/README.md @@ -1,6 +1,16 @@ -# HeyGen Avatar Example +# HeyGen Avatar Examples -This example demonstrates how to use the HeyGen plugin to add realistic avatar video to your AI agent. +This directory contains examples of how to use the HeyGen plugin to add realistic avatar video to your AI agent. + +## Examples + +### 1. Standard Streaming LLM (`avatar_example.py`) + +Uses a standard streaming LLM (Gemini) with separate TTS/STT components. Best for traditional text-based LLMs. + +### 2. Realtime LLM (`avatar_realtime_example.py`) + +Uses Gemini Realtime with native audio input/output. The avatar lip-syncs to the transcribed text while Gemini handles voice processing. ## Setup @@ -19,27 +29,40 @@ Copy `.env.example` to `.env` and fill in your API keys: cp .env.example .env ``` -Required API keys: +**For Standard Example** (`avatar_example.py`): - `HEYGEN_API_KEY` - Get from [HeyGen](https://heygen.com) - `STREAM_API_KEY` and `STREAM_SECRET` - Get from [GetStream](https://getstream.io) - `CARTESIA_API_KEY` - Get from [Cartesia](https://cartesia.ai) - `DEEPGRAM_API_KEY` - Get from [Deepgram](https://deepgram.com) - `GOOGLE_API_KEY` - Get from [Google AI Studio](https://makersuite.google.com/app/apikey) -## Running the Example +**For Realtime Example** (`avatar_realtime_example.py`): +- `HEYGEN_API_KEY` - Get from [HeyGen](https://heygen.com) +- `STREAM_API_KEY` and `STREAM_SECRET` - Get from [GetStream](https://getstream.io) +- `GOOGLE_API_KEY` - Get from [Google AI Studio](https://makersuite.google.com/app/apikey) + +## Running the Examples From the project root: +**Standard Streaming LLM:** ```bash uv run plugins/heygen/example/avatar_example.py ``` -This will: +**Realtime LLM:** +```bash +uv run plugins/heygen/example/avatar_realtime_example.py +``` + +Both will: 1. Start an AI agent with a HeyGen avatar 2. Open a demo UI in your browser 3. 
The avatar will speak and be ready to chat -## What's Happening +## How It Works + +### Standard Streaming LLM (`avatar_example.py`) 1. **Agent Setup**: The agent is configured with: - Gemini LLM for generating responses @@ -49,10 +72,10 @@ This will: 2. **Avatar Streaming**: When the agent speaks: - Text is generated by Gemini LLM + - Text is sent to HeyGen for lip-sync - Audio is synthesized by Cartesia TTS - - Audio is sent to HeyGen via WebRTC - HeyGen generates avatar video with lip-sync - - Avatar video is streamed to the call + - Avatar video and audio are streamed to the call 3. **User Interaction**: When you speak: - Audio is captured from your microphone @@ -60,6 +83,24 @@ This will: - Sent to Gemini LLM for processing - Response is generated and spoken through the avatar +### Realtime LLM (`avatar_realtime_example.py`) + +1. **Agent Setup**: The agent is configured with: + - Gemini Realtime for native audio processing + - HeyGen AvatarPublisher for avatar video + +2. **Avatar Streaming**: When the agent speaks: + - Gemini Realtime generates audio directly (24kHz PCM) + - Text transcription is sent to HeyGen for lip-sync + - HeyGen generates avatar video with lip-sync + - Gemini's audio is used (HeyGen audio is not forwarded for Realtime LLMs) + - Avatar video and Gemini audio are streamed to the call + +3. 
**User Interaction**: When you speak: + - Audio is captured and sent directly to Gemini Realtime + - Gemini processes audio natively (no separate STT needed) + - Response is generated and spoken through the avatar + ## Customization ### Using a Different Avatar @@ -87,17 +128,36 @@ heygen.AvatarPublisher( ### Using a Different LLM -Switch to OpenAI's Realtime API: +**With Standard Streaming LLM:** +```python +from vision_agents.plugins import openai, elevenlabs + +agent = Agent( + edge=getstream.Edge(), + agent_user=User(name="Avatar AI"), + instructions="Your instructions here", + llm=openai.LLM("gpt-4"), + tts=elevenlabs.TTS(), + stt=deepgram.STT(), + processors=[ + heygen.AvatarPublisher(avatar_id="default") + ] +) +``` +**With Realtime LLM:** ```python from vision_agents.plugins import openai agent = Agent( - # ... other config ... - llm=openai.Realtime(model="gpt-realtime", voice="alloy"), - # No need for separate TTS/STT with Realtime LLM + edge=getstream.Edge(), + agent_user=User(name="Avatar AI"), + instructions="Your instructions here", + llm=openai.Realtime(model="gpt-4o-realtime-preview"), processors=[ - heygen.AvatarPublisher(avatar_id="default") + heygen.AvatarPublisher( + avatar_id="default" + ) ] ) ``` diff --git a/plugins/heygen/example/avatar_realtime_example.py b/plugins/heygen/example/avatar_realtime_example.py new file mode 100644 index 00000000..695f7b55 --- /dev/null +++ b/plugins/heygen/example/avatar_realtime_example.py @@ -0,0 +1,65 @@ +import asyncio +import logging +from uuid import uuid4 + +from dotenv import load_dotenv + +from vision_agents.core import User, Agent +from vision_agents.plugins import getstream, gemini, heygen + +load_dotenv() + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", +) + + +async def start_avatar_agent() -> None: + """Start a HeyGen avatar agent with Gemini Realtime LLM. + + This example demonstrates using a HeyGen avatar with a Realtime LLM. 
+ HeyGen provides the lip-synced avatar video based on text transcriptions, + while Gemini Realtime provides the audio directly. + """ + + # Create agent with Gemini Realtime and HeyGen avatar + agent = Agent( + edge=getstream.Edge(), + agent_user=User(name="Avatar AI Assistant"), + instructions=( + "You are a helpful AI assistant with a virtual avatar. " + "Keep responses conversational and natural. " + "Be friendly and engaging." + ), + llm=gemini.Realtime( + model="gemini-2.5-flash-native-audio-preview-09-2025" + ), + processors=[ + heygen.AvatarPublisher( + avatar_id="default", + quality="high", + ) + ], + ) + + # Create a call + call = agent.edge.client.video.call("default", str(uuid4())) + + # Join call first + with await agent.join(call): + # Open demo UI after joining + await agent.edge.open_demo(call) + + # Start the conversation + await agent.llm.simple_response( + text="Hello! I'm your AI assistant. How can I help you today?" + ) + + # Keep running until the call ends + await agent.finish() + + +if __name__ == "__main__": + asyncio.run(start_avatar_agent()) + diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index 6d7467fe..e5e12f03 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -2,7 +2,6 @@ import logging from typing import Optional, Any, Tuple -import numpy as np from getstream.video.rtc import audio_track from vision_agents.core.processors.base_processor import ( @@ -21,10 +20,9 @@ class AvatarPublisher(AudioVideoProcessor, VideoPublisherMixin, AudioPublisherMi """HeyGen avatar video and audio publisher. Publishes video of a HeyGen avatar that lip-syncs based on LLM text output. - Can be used as a processor in the Vision Agents framework to add - realistic avatar video to AI agents. 
- HeyGen handles TTS internally, so no separate TTS is needed. + For standard LLMs: HeyGen provides both video and audio (with TTS). + For Realtime LLMs: HeyGen provides video only; LLM provides audio. Example: agent = Agent( @@ -49,7 +47,6 @@ def __init__( resolution: Tuple[int, int] = (1920, 1080), api_key: Optional[str] = None, interval: int = 0, - mute_llm_audio: bool = True, **kwargs, ): """Initialize the HeyGen avatar publisher. @@ -60,8 +57,6 @@ def __init__( resolution: Output video resolution (width, height). api_key: HeyGen API key. Uses HEYGEN_API_KEY env var if not provided. interval: Processing interval (not used, kept for compatibility). - mute_llm_audio: If True, mutes the Realtime LLM's audio output so only - HeyGen's video (with audio) is heard. Default: True. **kwargs: Additional arguments passed to parent class. """ super().__init__( @@ -75,7 +70,6 @@ def __init__( self.quality = quality self.resolution = resolution self.api_key = api_key - self.mute_llm_audio = mute_llm_audio # WebRTC manager for HeyGen connection self.rtc_manager = HeyGenRTCManager( @@ -106,9 +100,6 @@ def __init__( self._current_response_id: Optional[str] = None self._all_sent_texts: set = set() # Track all sent texts to prevent duplicates - # Audio forwarding state (for selective muting of Realtime LLM audio) - self._forwarding_audio = False - logger.info( f"HeyGen AvatarPublisher initialized " f"(avatar: {avatar_id}, quality: {quality}, resolution: {resolution})" @@ -133,10 +124,6 @@ def _attach_agent(self, agent: Any) -> None: self._agent = agent logger.info("Agent reference set for HeyGen avatar publisher") - # Mute the Realtime LLM's audio if requested - if self.mute_llm_audio: - self._mute_realtime_llm_audio() - # Subscribe to text events immediately when agent is set self._subscribe_to_text_events() @@ -231,50 +218,7 @@ async def on_agent_speech(event: RealtimeAgentSpeechTranscriptionEvent): else: logger.warning("Cannot subscribe to text events - no agent or LLM 
attached yet") except Exception as e: - logger.error(f"Failed to subscribe to text events: {e}") - import traceback - logger.error(traceback.format_exc()) - - def _mute_realtime_llm_audio(self) -> None: - """Mute the Realtime LLM's audio output. - - When using HeyGen, we want HeyGen to handle all audio (with lip-sync), - so we mute the LLM's native audio output to avoid duplicated/overlapping audio. - - This works by intercepting writes to the LLM's output_track and only blocking - writes that come from the LLM itself (not from HeyGen forwarding). - """ - try: - from vision_agents.core.llm.realtime import Realtime - - if not hasattr(self, '_agent') or not self._agent: - logger.warning("Cannot mute LLM audio - no agent set") - return - - if not hasattr(self._agent, 'llm') or not isinstance(self._agent.llm, Realtime): - logger.info("LLM is not a Realtime LLM - no audio to mute") - return - - # Store the original write method - original_write = self._agent.llm.output_track.write - - # Create a selective write method - async def selective_write(audio_data: bytes) -> None: - """Only allow writes from HeyGen forwarding, block LLM writes.""" - if self._forwarding_audio: - # This is from HeyGen - allow it - await original_write(audio_data) - # else: This is from the Realtime LLM - block it - - # Replace the write method - self._agent.llm.output_track.write = selective_write - - logger.info("Muted Realtime LLM audio output (HeyGen will provide audio)") - - except Exception as e: - logger.error(f"Failed to mute LLM audio: {e}") - import traceback - logger.error(traceback.format_exc()) + logger.error(f"Failed to subscribe to text events: {e}", exc_info=True) async def _on_video_track(self, track: Any) -> None: """Callback when video track is received from HeyGen. @@ -291,12 +235,29 @@ async def _on_audio_track(self, track: Any) -> None: HeyGen provides audio with lip-synced TTS. We forward this audio to the agent's audio track so it gets published to the call. 
+ For Realtime LLMs: We DON'T forward HeyGen audio - the LLM generates its own audio. + HeyGen is only used for video lip-sync based on text transcriptions. + Args: track: Incoming audio track from HeyGen's WebRTC connection. """ - logger.info("Received audio track from HeyGen, starting audio forwarding") + logger.info("Received audio track from HeyGen") + + # Check if we're using a Realtime LLM + using_realtime_llm = False + if hasattr(self, '_agent') and self._agent: + from vision_agents.core.llm.realtime import Realtime + if hasattr(self._agent, 'llm') and isinstance(self._agent.llm, Realtime): + using_realtime_llm = True + + if using_realtime_llm: + # For Realtime LLMs, don't forward HeyGen audio - use the LLM's native audio + # HeyGen is only used for lip-synced video based on text transcriptions + logger.info("Using Realtime LLM - skipping HeyGen audio forwarding (using LLM's native audio)") + return - # Forward audio frames from HeyGen to our audio track + # For standard LLMs, forward HeyGen's audio to our audio track + logger.info("Forwarding HeyGen audio to audio track") asyncio.create_task(self._forward_audio_frames(track, self._audio_track)) async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> None: @@ -315,16 +276,10 @@ async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> Non frame = await source_track.recv() frame_count += 1 - # Convert frame to bytes and write to agent's audio track if hasattr(frame, 'to_ndarray'): audio_array = frame.to_ndarray() - # Pass raw audio data - AudioStreamTrack handles format conversion audio_bytes = audio_array.tobytes() - - # Set flag to allow HeyGen audio through the muted track - self._forwarding_audio = True await dest_track.write(audio_bytes) - self._forwarding_audio = False else: logger.warning("Received frame without to_ndarray() method") @@ -332,16 +287,11 @@ async def _forward_audio_frames(self, source_track: Any, dest_track: Any) -> Non if "ended" in str(e).lower() 
or "closed" in str(e).lower(): logger.info(f"HeyGen audio track ended (forwarded {frame_count} frames)") break - else: - logger.error(f"Error forwarding audio frame #{frame_count}: {e}") - import traceback - logger.error(traceback.format_exc()) - break + logger.error(f"Error forwarding audio frame: {e}", exc_info=True) + break except Exception as e: - logger.error(f"Error in audio forwarding loop: {e}") - import traceback - logger.error(traceback.format_exc()) + logger.error(f"Error in audio forwarding loop: {e}", exc_info=True) async def _on_text_chunk(self, text_delta: str, item_id: Optional[str]) -> None: """Handle text chunk from the LLM. @@ -385,9 +335,7 @@ async def _send_text_to_heygen(self, text: str) -> None: logger.info(f"Sending text to HeyGen: '{text[:50]}...'") await self.rtc_manager.send_text(text, task_type="repeat") except Exception as e: - logger.error(f"Failed to send text to HeyGen: {e}") - import traceback - logger.error(traceback.format_exc()) + logger.error(f"Failed to send text to HeyGen: {e}", exc_info=True) def publish_video_track(self): """Publish the HeyGen avatar video track. 
From fad9f49b4eb24bb129aeb7d6f4f1bd0920d095f8 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 4 Nov 2025 10:30:11 +0100 Subject: [PATCH 11/20] Fix ruff linting - remove unused imports --- plugins/heygen/tests/test_heygen_plugin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/heygen/tests/test_heygen_plugin.py b/plugins/heygen/tests/test_heygen_plugin.py index 3be160d6..0c73a127 100644 --- a/plugins/heygen/tests/test_heygen_plugin.py +++ b/plugins/heygen/tests/test_heygen_plugin.py @@ -1,5 +1,5 @@ import pytest -from unittest.mock import Mock, AsyncMock, patch +from unittest.mock import patch from vision_agents.plugins.heygen import ( AvatarPublisher, HeyGenVideoTrack, From f03c81d1a335a9fb13f886cd398fc21763bf8eb8 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 4 Nov 2025 10:32:46 +0100 Subject: [PATCH 12/20] Fix HeyGen plugin tests - import paths and mocking --- plugins/heygen/tests/test_heygen_plugin.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/plugins/heygen/tests/test_heygen_plugin.py b/plugins/heygen/tests/test_heygen_plugin.py index 0c73a127..cfb754a8 100644 --- a/plugins/heygen/tests/test_heygen_plugin.py +++ b/plugins/heygen/tests/test_heygen_plugin.py @@ -1,11 +1,9 @@ import pytest from unittest.mock import patch -from vision_agents.plugins.heygen import ( - AvatarPublisher, - HeyGenVideoTrack, - HeyGenRTCManager, - HeyGenSession, -) +from vision_agents.plugins.heygen import AvatarPublisher +from vision_agents.plugins.heygen.heygen_video_track import HeyGenVideoTrack +from vision_agents.plugins.heygen.heygen_rtc_manager import HeyGenRTCManager +from vision_agents.plugins.heygen.heygen_session import HeyGenSession class TestHeyGenSession: @@ -97,6 +95,9 @@ def test_publish_video_track(self): """Test publishing video track.""" with patch.object(HeyGenRTCManager, "__init__", return_value=None): publisher = AvatarPublisher(api_key="test_key") + # Set _connected to True to avoid 
creating async task + publisher._connected = True + publisher._connection_task = None track = publisher.publish_video_track() @@ -110,6 +111,8 @@ def test_state(self): quality="medium", api_key="test_key", ) + # Mock the _connected attribute on the RTC manager + publisher.rtc_manager._connected = False state = publisher.state() From a5be2065202e3d3d0e9f339e75ed12882a9a85b8 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 4 Nov 2025 10:48:42 +0100 Subject: [PATCH 13/20] Fix mypy type errors in HeyGen plugin --- .../vision_agents/plugins/heygen/heygen_rtc_manager.py | 2 ++ .../vision_agents/plugins/heygen/heygen_session.py | 10 +++++----- .../vision_agents/plugins/heygen/heygen_video_track.py | 5 +++-- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index 6e9876ac..1572512e 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -105,6 +105,8 @@ async def on_track(track: MediaStreamTrack): @self.pc.on("connectionstatechange") async def on_connection_state_change(): + if self.pc is None: + return logger.info(f"HeyGen connection state: {self.pc.connectionState}") if self.pc.connectionState == "connected": self._connected = True diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py index 3565c7a6..1d332891 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py @@ -28,7 +28,7 @@ def __init__( """ self.avatar_id = avatar_id self.quality = quality - self.api_key = api_key or getenv("HEYGEN_API_KEY") + self.api_key: str = api_key or getenv("HEYGEN_API_KEY") or "" if not self.api_key: raise ValueError( @@ -50,7 +50,7 @@ async def create_session(self) -> 
Dict[str, Any]: if not self._http_session: self._http_session = aiohttp.ClientSession() - headers = { + headers: dict[str, str] = { "X-Api-Key": self.api_key, "Content-Type": "application/json", } @@ -98,7 +98,7 @@ async def start_session(self, sdp_answer: Optional[str] = None) -> Dict[str, Any if not self._http_session: self._http_session = aiohttp.ClientSession() - headers = { + headers: dict[str, str] = { "X-Api-Key": self.api_key, "Content-Type": "application/json", } @@ -154,7 +154,7 @@ async def send_task(self, text: str, task_type: str = "repeat") -> Dict[str, Any if not self._http_session: self._http_session = aiohttp.ClientSession() - headers = { + headers: dict[str, str] = { "X-Api-Key": self.api_key, "Content-Type": "application/json", } @@ -195,7 +195,7 @@ async def stop_session(self) -> None: if not self._http_session: return - headers = { + headers: dict[str, str] = { "X-Api-Key": self.api_key, "Content-Type": "application/json", } diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py index 9339e1da..9f890b95 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py @@ -71,7 +71,8 @@ async def _receive_frames(self) -> None: # Receive frame from HeyGen frame = await self._source_track.recv() - if frame: + # Type check: ensure we have a VideoFrame + if frame and isinstance(frame, av.VideoFrame): # Resize if needed if frame.width != self.width or frame.height != self.height: frame = self._resize_frame(frame) @@ -115,7 +116,7 @@ def _resize_frame(self, frame: av.VideoFrame) -> av.VideoFrame: new_height = int(src_height * scale) # Resize with aspect ratio maintained - resized = img.resize((new_width, new_height), Image.LANCZOS) + resized = img.resize((new_width, new_height), Image.Resampling.LANCZOS) # Create black background at target resolution result = Image.new('RGB', 
(target_width, target_height), (0, 0, 0)) From d6d66bf74ed0a92b89e0be41c36f0705ab5031f3 Mon Sep 17 00:00:00 2001 From: Deven Joshi Date: Tue, 4 Nov 2025 10:50:51 +0100 Subject: [PATCH 14/20] Allow reattaching to new HeyGen video tracks on renegotiation --- .../vision_agents/plugins/heygen/heygen_video_track.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py index 9f890b95..e74a4c23 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_video_track.py @@ -52,8 +52,14 @@ async def start_receiving(self, source_track: MediaStreamTrack) -> None: source_track: The incoming video track from HeyGen's WebRTC connection. """ if self._receiving_task: - logger.warning("Already receiving frames from HeyGen") - return + logger.info("Restarting HeyGen video receiver with new source track") + self._receiving_task.cancel() + try: + await self._receiving_task + except asyncio.CancelledError: + pass + self._receiving_task = None + self._source_track = None self._source_track = source_track self._receiving_task = asyncio.create_task(self._receive_frames()) From f7a2f37184330a3ff934af75280f7b7b9c6916df Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 4 Nov 2025 14:59:42 -0700 Subject: [PATCH 15/20] Migrate quality to enum --- plugins/heygen/README.md | 9 ++++++--- plugins/heygen/example/README.md | 8 ++++++-- plugins/heygen/example/avatar_example.py | 3 ++- .../heygen/example/avatar_realtime_example.py | 3 ++- plugins/heygen/tests/test_heygen_plugin.py | 16 ++++++++-------- .../vision_agents/plugins/heygen/__init__.py | 3 ++- .../plugins/heygen/heygen_avatar_publisher.py | 16 +++++++++++++--- .../plugins/heygen/heygen_rtc_manager.py | 11 +++++++++-- .../plugins/heygen/heygen_session.py | 10 ++++++++-- 9 files changed, 56 
insertions(+), 23 deletions(-) diff --git a/plugins/heygen/README.md b/plugins/heygen/README.md index 0ae26514..f8b53b77 100644 --- a/plugins/heygen/README.md +++ b/plugins/heygen/README.md @@ -31,6 +31,7 @@ from dotenv import load_dotenv from vision_agents.core import User, Agent from vision_agents.plugins import cartesia, deepgram, getstream, gemini, heygen +from vision_agents.plugins.heygen import VideoQuality load_dotenv() @@ -48,7 +49,7 @@ async def start_avatar_agent(): processors=[ heygen.AvatarPublisher( avatar_id="default", - quality="high" + quality=VideoQuality.HIGH ) ] ) @@ -77,9 +78,11 @@ HEYGEN_API_KEY=your_heygen_api_key_here ### AvatarPublisher Options ```python +from vision_agents.plugins.heygen import VideoQuality + heygen.AvatarPublisher( avatar_id="default", # HeyGen avatar ID - quality="high", # Video quality: "low", "medium", "high" + quality=VideoQuality.HIGH, # Video quality: VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH resolution=(1920, 1080), # Output resolution (width, height) api_key=None, # Optional: override env var ) @@ -159,7 +162,7 @@ If you experience connection problems: To optimize video quality: -- Use `quality="high"` for best results +- Use `quality=VideoQuality.HIGH` for best results - Increase resolution if bandwidth allows - Ensure stable internet connection diff --git a/plugins/heygen/example/README.md b/plugins/heygen/example/README.md index b922e7e2..a9206171 100644 --- a/plugins/heygen/example/README.md +++ b/plugins/heygen/example/README.md @@ -108,9 +108,11 @@ Both will: Get your avatar ID from HeyGen dashboard and update: ```python +from vision_agents.plugins.heygen import VideoQuality + heygen.AvatarPublisher( avatar_id="your_avatar_id_here", - quality="high" + quality=VideoQuality.HIGH ) ``` @@ -119,9 +121,11 @@ heygen.AvatarPublisher( Choose quality based on your bandwidth: ```python +from vision_agents.plugins.heygen import VideoQuality + heygen.AvatarPublisher( avatar_id="default", - 
quality="low", # Options: "low", "medium", "high" + quality=VideoQuality.LOW, # Options: VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH resolution=(1280, 720) # Lower resolution for better performance ) ``` diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index 1683491b..a07e77cd 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -4,6 +4,7 @@ from vision_agents.core import User, Agent from vision_agents.plugins import getstream, gemini, heygen, deepgram +from vision_agents.plugins.heygen import VideoQuality load_dotenv() @@ -43,7 +44,7 @@ async def start_avatar_agent() -> None: processors=[ heygen.AvatarPublisher( avatar_id="default", # Use your HeyGen avatar ID - quality="high", # Video quality: "low", "medium", "high" + quality=VideoQuality.HIGH, # Video quality: VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH resolution=(1920, 1080), # Output resolution mute_llm_audio=False, # Not needed for streaming LLM ) diff --git a/plugins/heygen/example/avatar_realtime_example.py b/plugins/heygen/example/avatar_realtime_example.py index 695f7b55..a851064b 100644 --- a/plugins/heygen/example/avatar_realtime_example.py +++ b/plugins/heygen/example/avatar_realtime_example.py @@ -6,6 +6,7 @@ from vision_agents.core import User, Agent from vision_agents.plugins import getstream, gemini, heygen +from vision_agents.plugins.heygen import VideoQuality load_dotenv() @@ -38,7 +39,7 @@ async def start_avatar_agent() -> None: processors=[ heygen.AvatarPublisher( avatar_id="default", - quality="high", + quality=VideoQuality.HIGH, ) ], ) diff --git a/plugins/heygen/tests/test_heygen_plugin.py b/plugins/heygen/tests/test_heygen_plugin.py index cfb754a8..d3aa6c2c 100644 --- a/plugins/heygen/tests/test_heygen_plugin.py +++ b/plugins/heygen/tests/test_heygen_plugin.py @@ -1,6 +1,6 @@ import pytest from unittest.mock import patch -from vision_agents.plugins.heygen 
import AvatarPublisher +from vision_agents.plugins.heygen import AvatarPublisher, VideoQuality from vision_agents.plugins.heygen.heygen_video_track import HeyGenVideoTrack from vision_agents.plugins.heygen.heygen_rtc_manager import HeyGenRTCManager from vision_agents.plugins.heygen.heygen_session import HeyGenSession @@ -13,12 +13,12 @@ def test_init_with_api_key(self): """Test initialization with explicit API key.""" session = HeyGenSession( avatar_id="test_avatar", - quality="high", + quality=VideoQuality.HIGH, api_key="test_key", ) assert session.avatar_id == "test_avatar" - assert session.quality == "high" + assert session.quality == VideoQuality.HIGH assert session.api_key == "test_key" def test_init_without_api_key_raises(self): @@ -55,7 +55,7 @@ def test_init(self): with patch.object(HeyGenSession, "__init__", return_value=None): manager = HeyGenRTCManager( avatar_id="test_avatar", - quality="medium", + quality=VideoQuality.MEDIUM, api_key="test_key", ) @@ -81,13 +81,13 @@ def test_init(self): with patch.object(HeyGenRTCManager, "__init__", return_value=None): publisher = AvatarPublisher( avatar_id="test_avatar", - quality="high", + quality=VideoQuality.HIGH, resolution=(1920, 1080), api_key="test_key", ) assert publisher.avatar_id == "test_avatar" - assert publisher.quality == "high" + assert publisher.quality == VideoQuality.HIGH assert publisher.resolution == (1920, 1080) assert not publisher._connected @@ -108,7 +108,7 @@ def test_state(self): with patch.object(HeyGenRTCManager, "__init__", return_value=None): publisher = AvatarPublisher( avatar_id="test_avatar", - quality="medium", + quality=VideoQuality.MEDIUM, api_key="test_key", ) # Mock the _connected attribute on the RTC manager @@ -117,7 +117,7 @@ def test_state(self): state = publisher.state() assert state["avatar_id"] == "test_avatar" - assert state["quality"] == "medium" + assert state["quality"] == VideoQuality.MEDIUM assert "connected" in state assert "rtc_connected" in state diff --git 
a/plugins/heygen/vision_agents/plugins/heygen/__init__.py b/plugins/heygen/vision_agents/plugins/heygen/__init__.py index ef7db7ba..98d608cd 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/__init__.py +++ b/plugins/heygen/vision_agents/plugins/heygen/__init__.py @@ -4,9 +4,10 @@ allowing AI agents to have realistic avatar video output with lip-sync. """ -from .heygen_avatar_publisher import AvatarPublisher +from .heygen_avatar_publisher import AvatarPublisher, VideoQuality __all__ = [ "AvatarPublisher", + "VideoQuality", ] diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index e5e12f03..fc2ffe00 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -1,5 +1,15 @@ import asyncio import logging +from enum import Enum + +# Define VideoQuality enum FIRST before any other imports to avoid circular import issues +class VideoQuality(str, Enum): + """Video quality options for HeyGen avatar streaming.""" + + LOW = "low" + MEDIUM = "medium" + HIGH = "high" + from typing import Optional, Any, Tuple from getstream.video.rtc import audio_track @@ -34,7 +44,7 @@ class AvatarPublisher(AudioVideoProcessor, VideoPublisherMixin, AudioPublisherMi processors=[ heygen.AvatarPublisher( avatar_id="default", - quality="high" + quality=heygen.VideoQuality.HIGH ) ] ) @@ -43,7 +53,7 @@ class AvatarPublisher(AudioVideoProcessor, VideoPublisherMixin, AudioPublisherMi def __init__( self, avatar_id: str = "default", - quality: str = "high", + quality: VideoQuality = VideoQuality.HIGH, resolution: Tuple[int, int] = (1920, 1080), api_key: Optional[str] = None, interval: int = 0, @@ -53,7 +63,7 @@ def __init__( Args: avatar_id: HeyGen avatar ID to use for streaming. - quality: Video quality ("low", "medium", "high"). 
+ quality: Video quality (VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH). resolution: Output video resolution (width, height). api_key: HeyGen API key. Uses HEYGEN_API_KEY env var if not provided. interval: Processing interval (not used, kept for compatibility). diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index 1572512e..67fc9e19 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -10,7 +10,10 @@ MediaStreamTrack, ) +from typing import Optional, Callable, Any + from .heygen_session import HeyGenSession +from .heygen_avatar_publisher import VideoQuality logger = logging.getLogger(__name__) @@ -25,16 +28,20 @@ class HeyGenRTCManager: def __init__( self, avatar_id: str = "default", - quality: str = "high", + quality: "VideoQuality" = None, api_key: Optional[str] = None, ): """Initialize the RTC manager. Args: avatar_id: HeyGen avatar ID to use. - quality: Video quality setting ("low", "medium", "high"). + quality: Video quality setting (VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH). api_key: HeyGen API key (uses HEYGEN_API_KEY env var if not provided). 
""" + # Default to HIGH if not provided + if quality is None: + quality = VideoQuality.HIGH + self.session_manager = HeyGenSession( avatar_id=avatar_id, quality=quality, diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py index 1d332891..9aa5187d 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py @@ -3,6 +3,8 @@ from os import getenv import aiohttp +from .heygen_avatar_publisher import VideoQuality + logger = logging.getLogger(__name__) @@ -16,16 +18,20 @@ class HeyGenSession: def __init__( self, avatar_id: str = "default", - quality: str = "high", + quality: "VideoQuality" = None, api_key: Optional[str] = None, ): """Initialize HeyGen session manager. Args: avatar_id: HeyGen avatar ID to use for streaming. - quality: Video quality setting ("low", "medium", "high"). + quality: Video quality setting (VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH). api_key: HeyGen API key. Uses HEYGEN_API_KEY env var if not provided. 
""" + # Default to HIGH if not provided + if quality is None: + quality = VideoQuality.HIGH + self.avatar_id = avatar_id self.quality = quality self.api_key: str = api_key or getenv("HEYGEN_API_KEY") or "" From 0b4894af5bae82c9455ae82ec341d24117b67b82 Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 4 Nov 2025 15:09:02 -0700 Subject: [PATCH 16/20] Ruff and Mypy --- .../heygen/vision_agents/plugins/heygen/__init__.py | 3 ++- .../plugins/heygen/heygen_avatar_publisher.py | 11 +---------- .../plugins/heygen/heygen_rtc_manager.py | 6 ++---- .../vision_agents/plugins/heygen/heygen_session.py | 8 ++------ .../vision_agents/plugins/heygen/heygen_types.py | 12 ++++++++++++ 5 files changed, 19 insertions(+), 21 deletions(-) create mode 100644 plugins/heygen/vision_agents/plugins/heygen/heygen_types.py diff --git a/plugins/heygen/vision_agents/plugins/heygen/__init__.py b/plugins/heygen/vision_agents/plugins/heygen/__init__.py index 98d608cd..e5dd68f6 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/__init__.py +++ b/plugins/heygen/vision_agents/plugins/heygen/__init__.py @@ -4,7 +4,8 @@ allowing AI agents to have realistic avatar video output with lip-sync. 
""" -from .heygen_avatar_publisher import AvatarPublisher, VideoQuality +from .heygen_avatar_publisher import AvatarPublisher +from .heygen_types import VideoQuality __all__ = [ "AvatarPublisher", diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index fc2ffe00..fd571fbf 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -1,15 +1,5 @@ import asyncio import logging -from enum import Enum - -# Define VideoQuality enum FIRST before any other imports to avoid circular import issues -class VideoQuality(str, Enum): - """Video quality options for HeyGen avatar streaming.""" - - LOW = "low" - MEDIUM = "medium" - HIGH = "high" - from typing import Optional, Any, Tuple from getstream.video.rtc import audio_track @@ -21,6 +11,7 @@ class VideoQuality(str, Enum): ) from .heygen_rtc_manager import HeyGenRTCManager +from .heygen_types import VideoQuality from .heygen_video_track import HeyGenVideoTrack logger = logging.getLogger(__name__) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py index 67fc9e19..e91ba6c8 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_rtc_manager.py @@ -10,10 +10,8 @@ MediaStreamTrack, ) -from typing import Optional, Callable, Any - from .heygen_session import HeyGenSession -from .heygen_avatar_publisher import VideoQuality +from .heygen_types import VideoQuality logger = logging.getLogger(__name__) @@ -28,7 +26,7 @@ class HeyGenRTCManager: def __init__( self, avatar_id: str = "default", - quality: "VideoQuality" = None, + quality: Optional["VideoQuality"] = VideoQuality.HIGH, api_key: Optional[str] = None, ): """Initialize the RTC manager. 
diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py index 9aa5187d..c73c8648 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_session.py @@ -3,7 +3,7 @@ from os import getenv import aiohttp -from .heygen_avatar_publisher import VideoQuality +from .heygen_types import VideoQuality logger = logging.getLogger(__name__) @@ -18,7 +18,7 @@ class HeyGenSession: def __init__( self, avatar_id: str = "default", - quality: "VideoQuality" = None, + quality: VideoQuality = VideoQuality.HIGH, api_key: Optional[str] = None, ): """Initialize HeyGen session manager. @@ -28,10 +28,6 @@ def __init__( quality: Video quality setting (VideoQuality.LOW, VideoQuality.MEDIUM, or VideoQuality.HIGH). api_key: HeyGen API key. Uses HEYGEN_API_KEY env var if not provided. """ - # Default to HIGH if not provided - if quality is None: - quality = VideoQuality.HIGH - self.avatar_id = avatar_id self.quality = quality self.api_key: str = api_key or getenv("HEYGEN_API_KEY") or "" diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_types.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_types.py new file mode 100644 index 00000000..f7981db9 --- /dev/null +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_types.py @@ -0,0 +1,12 @@ +"""Type definitions for HeyGen plugin.""" + +from enum import Enum + + +class VideoQuality(str, Enum): + """Video quality options for HeyGen avatar streaming.""" + + LOW = "low" + MEDIUM = "medium" + HIGH = "high" + From 4bafa667685892dd7dba8822c5c9e7d017056b5d Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 4 Nov 2025 15:21:42 -0700 Subject: [PATCH 17/20] More ruff issues --- plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py | 2 +- plugins/openai/vision_agents/plugins/openai/openai_realtime.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) 
diff --git a/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py b/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py index 3106e596..14d54713 100644 --- a/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py +++ b/plugins/gemini/vision_agents/plugins/gemini/gemini_realtime.py @@ -152,7 +152,7 @@ async def simple_audio_response( audio_bytes = pcm.resample( target_sample_rate=16000, target_channels=1 ).samples.tobytes() - mime = f"audio/pcm;rate=16000" + mime = "audio/pcm;rate=16000" blob = Blob(data=audio_bytes, mime_type=mime) await self._require_session().send_realtime_input(audio=blob) diff --git a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py index dfa30f75..fa2d67d0 100644 --- a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py +++ b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py @@ -6,7 +6,7 @@ RealtimeSessionCreateRequestParam, ResponseAudioTranscriptDoneEvent, InputAudioBufferSpeechStartedEvent, - ConversationItemInputAudioTranscriptionCompletedEvent, SessionUpdatedEvent, ResponseCreatedEvent, ResponseDoneEvent, + ConversationItemInputAudioTranscriptionCompletedEvent, ResponseCreatedEvent, ResponseDoneEvent, ) from vision_agents.core.llm import realtime From f5a1aaa14644f113655e9086a3653b568166ad77 Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 4 Nov 2025 15:23:35 -0700 Subject: [PATCH 18/20] Fix broken method sigs --- .../vision_agents/plugins/heygen/heygen_avatar_publisher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py index fd571fbf..7fa18c26 100644 --- a/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py +++ b/plugins/heygen/vision_agents/plugins/heygen/heygen_avatar_publisher.py @@ -88,7 +88,7 @@ def 
__init__( # Audio track for publishing HeyGen's audio # Create it immediately so the agent can detect it during initialization self._audio_track = audio_track.AudioStreamTrack( - framerate=48000, stereo=True + sample_rate=48000, channels=2, format="s16" ) # Connection state From 3f5e2037377682d006734da067a819d14094dd08 Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 4 Nov 2025 15:52:17 -0700 Subject: [PATCH 19/20] Unused var --- plugins/openai/vision_agents/plugins/openai/openai_realtime.py | 1 - 1 file changed, 1 deletion(-) diff --git a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py index fa2d67d0..c6249bf5 100644 --- a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py +++ b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py @@ -241,7 +241,6 @@ async def _handle_openai_event(self, event: dict) -> None: # Handle tool calls from OpenAI realtime await self._handle_tool_call_event(event) elif et == "response.created": - e = ResponseCreatedEvent(**event) pass elif et == "response.done": logger.info("OpenAI response done %s", event) From 12cad153dc62f9e9d38b0dd054ed02e93790d59a Mon Sep 17 00:00:00 2001 From: "Neevash Ramdial (Nash)" Date: Tue, 4 Nov 2025 16:21:41 -0700 Subject: [PATCH 20/20] final ruff error --- plugins/openai/vision_agents/plugins/openai/openai_realtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py index c6249bf5..fb1efcb2 100644 --- a/plugins/openai/vision_agents/plugins/openai/openai_realtime.py +++ b/plugins/openai/vision_agents/plugins/openai/openai_realtime.py @@ -6,7 +6,7 @@ RealtimeSessionCreateRequestParam, ResponseAudioTranscriptDoneEvent, InputAudioBufferSpeechStartedEvent, - ConversationItemInputAudioTranscriptionCompletedEvent, ResponseCreatedEvent, ResponseDoneEvent, 
+ ConversationItemInputAudioTranscriptionCompletedEvent, ResponseDoneEvent, ) from vision_agents.core.llm import realtime