Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 2 additions & 3 deletions agents-core/vision_agents/core/agents/agents.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
import time
import uuid
from dataclasses import asdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard, Coroutine
from typing import TYPE_CHECKING, Any, Dict, List, Optional, TypeGuard
from uuid import uuid4

import getstream.models
from aiortc import VideoStreamTrack
from getstream.video.async_call import Call
from getstream.video.rtc import Call

from getstream.video.rtc.pb.stream.video.sfu.models.models_pb2 import TrackType
Expand Down Expand Up @@ -697,7 +696,7 @@ async def create_user(self) -> None:
async def create_call(self, call_type: str, call_id: str) -> Call:
"""Shortcut for getting or creating a call (room) with the given type and id, and returning it."""
call = self.edge.client.video.call(call_type, call_id)
response = await call.get_or_create(data={"created_by_id": self.agent_user.id})
await call.get_or_create(data={"created_by_id": self.agent_user.id})

return call

Expand Down
20 changes: 14 additions & 6 deletions examples/01_simple_agent_example/simple_agent_example.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,18 @@
import asyncio
from uuid import uuid4
import logging

from dotenv import load_dotenv

from vision_agents.core import User, Agent
from vision_agents.core import User, Agent, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import deepgram, getstream, gemini, vogent, elevenlabs
# from vision_agents.core.profiling import Profiler

logger = logging.getLogger(__name__)

load_dotenv()


async def start_agent() -> None:
async def create_agent(**kwargs) -> Agent:
llm = gemini.LLM("gemini-2.0-flash")
# create an agent to run with Stream's edge and a Gemini LLM
agent = Agent(
Expand All @@ -30,9 +33,14 @@ async def start_agent() -> None:
# realtime version (vad, tts and stt not needed)
# llm=openai.Realtime()
)
return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
# ensure the agent user is created
await agent.create_user()
# Create a call
call = agent.edge.client.video.call("default", str(uuid4()))
call = await agent.create_call(call_type, call_id)

# Have the agent join the call/room
with await agent.join(call):
Expand Down Expand Up @@ -89,4 +97,4 @@ def _flush_and_shutdown():

if __name__ == "__main__":
# setup_telemetry()
asyncio.run(start_agent())
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
21 changes: 14 additions & 7 deletions examples/02_golf_coach_example/golf_coach_example.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import asyncio
from uuid import uuid4
import logging

from dotenv import load_dotenv

from vision_agents.core import User, Agent
from vision_agents.core import User, Agent, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import getstream, ultralytics, gemini

logger = logging.getLogger(__name__)

load_dotenv()


async def start_agent() -> None:
async def create_agent(**kwargs) -> Agent:
agent = Agent(
edge=getstream.Edge(), # use stream for edge video transport
agent_user=User(name="AI golf coach"),
Expand All @@ -20,9 +22,14 @@ async def start_agent() -> None:
ultralytics.YOLOPoseProcessor(model_path="yolo11n-pose.pt")
], # realtime pose detection with yolo
)
return agent


# create a call, some other video networks call this a room
call = agent.edge.client.video.call("default", str(uuid4()))
async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
# ensure the agent user is created
await agent.create_user()
# Create a call
call = await agent.create_call(call_type, call_id)

# join the call and open a demo env
with await agent.join(call):
Expand All @@ -37,4 +44,4 @@ async def start_agent() -> None:


if __name__ == "__main__":
asyncio.run(start_agent())
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
using voice commands through the Gemini Live API.
"""

import asyncio
import logging
import os
from uuid import uuid4

from dotenv import load_dotenv

from vision_agents.core.agents import Agent
from vision_agents.core.agents import Agent, AgentLauncher
from vision_agents.core import cli
from vision_agents.core.mcp import MCPServerRemote
from vision_agents.plugins.gemini.gemini_realtime import Realtime
from vision_agents.plugins import getstream
Expand All @@ -26,22 +26,22 @@
logger = logging.getLogger(__name__)


async def start_agent():
async def create_agent(**kwargs) -> Agent:
"""Demonstrate Gemini Realtime with GitHub MCP server integration."""

# Get GitHub PAT from environment
github_pat = os.getenv("GITHUB_PAT")
if not github_pat:
logger.error("GITHUB_PAT environment variable not found!")
logger.error("Please set GITHUB_PAT in your .env file or environment")
return
raise ValueError("GITHUB_PAT environment variable not found")

# Get Google API key from environment
google_api_key = os.getenv("GOOGLE_API_KEY")
if not google_api_key:
logger.error("GOOGLE_API_KEY environment variable not found!")
logger.error("Please set GOOGLE_API_KEY in your .env file or environment")
return
raise ValueError("GOOGLE_API_KEY environment variable not found")

# Create GitHub MCP server
github_server = MCPServerRemote(
Expand Down Expand Up @@ -69,6 +69,10 @@ async def start_agent():
logger.info("Agent created with Gemini Realtime and GitHub MCP server")
logger.info(f"GitHub server: {github_server}")

return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
try:
# Set up event handler for when participants join
@agent.subscribe
Expand All @@ -85,8 +89,10 @@ async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
f"Hello {event.participant.user.name}! I'm your GitHub AI assistant powered by Gemini Live. I have access to {len(mcp_functions)} GitHub tools and can help you with repositories, issues, pull requests, and more through voice commands!"
)

# ensure the agent user is created
await agent.create_user()
# Create a call
call = agent.edge.client.video.call("default", str(uuid4()))
call = await agent.create_call(call_type, call_id)

# Have the agent join the call/room
logger.info("🎤 Agent joining call...")
Expand Down Expand Up @@ -123,4 +129,4 @@ async def on_participant_joined(event: CallSessionParticipantJoinedEvent):


if __name__ == "__main__":
asyncio.run(start_agent())
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
22 changes: 14 additions & 8 deletions examples/other_examples/09_github_mcp_demo/github_mcp_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
by the LLM without any manual registration required.
"""

import asyncio
import logging
import os
from uuid import uuid4

from dotenv import load_dotenv

from vision_agents.core.agents import Agent
from vision_agents.core.agents import Agent, AgentLauncher
from vision_agents.core import cli
from vision_agents.core.mcp import MCPServerRemote
from vision_agents.plugins.openai.openai_llm import OpenAILLM
from vision_agents.plugins import elevenlabs, deepgram, silero, getstream
Expand All @@ -26,15 +26,15 @@
logger = logging.getLogger(__name__)


async def start_agent():
async def create_agent(**kwargs) -> Agent:
"""Demonstrate GitHub MCP server integration."""

# Get GitHub PAT from environment
github_pat = os.getenv("GITHUB_PAT")
if not github_pat:
logger.error("GITHUB_PAT environment variable not found!")
logger.error("Please set GITHUB_PAT in your .env file or environment")
return
raise ValueError("GITHUB_PAT environment variable not found")

# Create GitHub MCP server
github_server = MCPServerRemote(
Expand All @@ -49,7 +49,7 @@ async def start_agent():
if not openai_api_key:
logger.error("OPENAI_API_KEY environment variable not found!")
logger.error("Please set OPENAI_API_KEY in your .env file or environment")
return
raise ValueError("OPENAI_API_KEY environment variable not found")

# Create OpenAI LLM
llm = OpenAILLM(model="gpt-4o", api_key=openai_api_key)
Expand All @@ -74,6 +74,10 @@ async def start_agent():
logger.info("Agent created with GitHub MCP server")
logger.info(f"GitHub server: {github_server}")

return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
try:
# Connect to GitHub MCP server with timeout
logger.info("Connecting to GitHub MCP server...")
Expand All @@ -95,8 +99,10 @@ async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
f"Hello {event.participant.user.name}! I'm your GitHub AI assistant with access to {len(mcp_functions)} GitHub tools. I can help you with repositories, issues, pull requests, and more!"
)

# ensure the agent user is created
await agent.create_user()
# Create a call
call = agent.edge.client.video.call("default", str(uuid4()))
call = await agent.create_call(call_type, call_id)

# Have the agent join the call/room
logger.info("🎤 Agent joining call...")
Expand Down Expand Up @@ -126,4 +132,4 @@ async def on_participant_joined(event: CallSessionParticipantJoinedEvent):


if __name__ == "__main__":
asyncio.run(start_agent())
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@
using voice commands through the OpenAI Realtime API.
"""

import asyncio
import logging
import os
from uuid import uuid4

from dotenv import load_dotenv

from vision_agents.core.agents import Agent
from vision_agents.core.agents import Agent, AgentLauncher
from vision_agents.core import cli
from vision_agents.core.mcp import MCPServerRemote
from vision_agents.plugins.openai.openai_realtime import Realtime
from vision_agents.plugins import getstream
Expand All @@ -26,22 +26,22 @@
logger = logging.getLogger(__name__)


async def start_agent():
async def create_agent(**kwargs) -> Agent:
"""Demonstrate OpenAI Realtime with GitHub MCP server integration."""

# Get GitHub PAT from environment
github_pat = os.getenv("GITHUB_PAT")
if not github_pat:
logger.error("GITHUB_PAT environment variable not found!")
logger.error("Please set GITHUB_PAT in your .env file or environment")
return
raise ValueError("GITHUB_PAT environment variable not found")

# Check OpenAI API key from environment
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
logger.error("OPENAI_API_KEY environment variable not found!")
logger.error("Please set OPENAI_API_KEY in your .env file or environment")
return
raise ValueError("OPENAI_API_KEY environment variable not found")

# Create GitHub MCP server
github_server = MCPServerRemote(
Expand Down Expand Up @@ -71,6 +71,10 @@ async def start_agent():
logger.info("Agent created with OpenAI Realtime and GitHub MCP server")
logger.info(f"GitHub server: {github_server}")

return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
try:
# Set up event handler for when participants join
@agent.subscribe
Expand All @@ -87,8 +91,10 @@ async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
f"Hello {event.participant.user.name}! I'm your GitHub AI assistant powered by OpenAI Realtime. I have access to {len(mcp_functions)} GitHub tools and can help you with repositories, issues, pull requests, and more through voice commands!"
)

# ensure the agent user is created
await agent.create_user()
# Create a call
call = agent.edge.client.video.call("default", str(uuid4()))
call = await agent.create_call(call_type, call_id)

# Have the agent join the call/room
logger.info("🎤 Agent joining call...")
Expand Down Expand Up @@ -125,4 +131,4 @@ async def on_participant_joined(event: CallSessionParticipantJoinedEvent):


if __name__ == "__main__":
asyncio.run(start_agent())
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
21 changes: 10 additions & 11 deletions examples/other_examples/gemini_live_realtime/gemini_live_example.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
import asyncio
import logging
from uuid import uuid4

from dotenv import load_dotenv
from getstream import AsyncStream

from vision_agents.core.edge.types import User
from vision_agents.core.agents import Agent
from vision_agents.core.agents import Agent, AgentLauncher
from vision_agents.core import cli
from vision_agents.plugins import gemini, getstream

load_dotenv()
Expand All @@ -18,9 +16,7 @@
logger = logging.getLogger(__name__)


async def start_agent() -> None:
client = AsyncStream()

async def create_agent(**kwargs) -> Agent:
agent = Agent(
edge=getstream.Edge(),
agent_user=User(
Expand All @@ -30,17 +26,20 @@ async def start_agent() -> None:
llm=gemini.Realtime(),
processors=[], # processors can fetch extra data, check images/audio data or transform video
)

call = client.video.call("default", str(uuid4()))
return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
# ensure the agent user is created
await agent.create_user()
# Create a call
call = await agent.create_call(call_type, call_id)

with await agent.join(call):
await asyncio.sleep(5)
await agent.edge.open_demo(call)
await agent.llm.simple_response(text="Describe what you see and say hi")
await agent.finish() # run till the call ends


if __name__ == "__main__":
asyncio.run(start_agent())
cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
Loading