feat(genai): Add Live API samples v2 #13523

Draft · wants to merge 8 commits into base: main

Changes from 5 commits
80 changes: 80 additions & 0 deletions genai/live/live_conversation_audio_with_audio.py
@@ -0,0 +1,80 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


import asyncio

import pyaudio
from google import genai
from google.genai.types import (
    AudioTranscriptionConfig,
    Blob,
    LiveConnectConfig,
    Modality,
)

# Audio configuration: the Live API consumes 16 kHz, 16-bit mono PCM input
# and returns audio at 24 kHz.
CHUNK = 4200
FORMAT = pyaudio.paInt16
CHANNELS = 1
MODEL = "gemini-2.0-flash-live-preview-04-09"
INPUT_RATE = 16000
OUTPUT_RATE = 24000

client = genai.Client()

# Request audio responses plus text transcripts of both the input audio
# and the model's audio output.
config = LiveConnectConfig(
    response_modalities=[Modality.AUDIO],
    input_audio_transcription=AudioTranscriptionConfig(),
    output_audio_transcription=AudioTranscriptionConfig(),
)

async def main():
    print(MODEL)
    p = pyaudio.PyAudio()
    async with client.aio.live.connect(model=MODEL, config=config) as session:

        async def send():
            # Capture microphone audio and stream it to the session as raw PCM.
            stream = p.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=INPUT_RATE,
                input=True,
                frames_per_buffer=CHUNK,
            )
            while True:
                frame = stream.read(CHUNK)
                await session.send_realtime_input(media=Blob(data=frame, mime_type="audio/pcm"))
                # Yield to the event loop so the receive task can run.
                await asyncio.sleep(0)

        async def receive():
            # Play the model's audio replies and print any transcriptions.
            output_stream = p.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=OUTPUT_RATE,
                output=True,
                frames_per_buffer=CHUNK,
            )
            async for message in session.receive():
                if message.server_content is None:
                    continue
                if message.server_content.input_transcription:
                    print(message.server_content.model_dump(mode="json", exclude_none=True))
                if message.server_content.output_transcription:
                    print(message.server_content.model_dump(mode="json", exclude_none=True))
                if message.server_content.model_turn:
                    for part in message.server_content.model_turn.parts:
                        if part.inline_data and part.inline_data.data:
                            output_stream.write(part.inline_data.data)
                await asyncio.sleep(0)

        send_task = asyncio.create_task(send())
        receive_task = asyncio.create_task(receive())
        await asyncio.gather(send_task, receive_task)


if __name__ == "__main__":
    # Run from a terminal with microphone and speaker access.
    asyncio.run(main())
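
Note (not part of the diff): genai.Client() reads its Vertex AI settings from the environment. A minimal sketch, assuming Application Default Credentials are already configured; the project and location values below are placeholders, not values from this PR:

import os

# Assumed environment for the google-genai client when targeting Vertex AI.
os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True"
os.environ["GOOGLE_CLOUD_PROJECT"] = "your-project-id"
os.environ["GOOGLE_CLOUD_LOCATION"] = "us-central1"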
129 changes: 129 additions & 0 deletions genai/live/live_conversation_websocket_audio_with_audio.py
@@ -0,0 +1,129 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import base64
import json

import numpy as np
from scipy.io import wavfile
from websockets.asyncio.client import connect


def get_bearer_token() -> str:
    import google.auth
    from google.auth.transport.requests import Request

    creds, _ = google.auth.default(
        scopes=["https://www.googleapis.com/auth/cloud-platform"]
    )
    auth_req = Request()
    creds.refresh(auth_req)
    bearer_token = creds.token
    return bearer_token


bearer_token = get_bearer_token()

# Websocket endpoint (same Live API service URL as the other websocket
# samples in this PR).
WEBSOCKET_HOST = "us-central1-aiplatform.googleapis.com"
SERVICE_URL = f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent"

# Set model generation_config
CONFIG = {"response_modalities": ["AUDIO"]}

headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {bearer_token}",
}


async def main() -> None:
    # Connect to the server
    async with connect(SERVICE_URL, additional_headers=headers) as ws:

        # Setup the session
        async def setup() -> None:
            await ws.send(
                json.dumps(
                    {
                        "setup": {
                            "model": "gemini-live-2.5-flash",
                            "generation_config": CONFIG,
                        }
                    }
                )
            )

            # Receive setup response
            raw_response = await ws.recv(decode=False)
            setup_response = json.loads(raw_response.decode("ascii"))
            print(f"Connected: {setup_response}")
            return

        # Send text message
        async def send() -> bool:
            text_input = input("Input > ")
            if text_input.lower() in ("q", "quit", "exit"):
                return False

            msg = {
                "client_content": {
                    "turns": [{"role": "user", "parts": [{"text": text_input}]}],
                    "turn_complete": True,
                }
            }

            await ws.send(json.dumps(msg))
            return True

        # Receive server response
        async def receive() -> None:
            responses = []

            # Receive chunks of the server response
            async for raw_response in ws:
                response = json.loads(raw_response.decode())
                server_content = response.pop("serverContent", None)
                if server_content is None:
                    break

                model_turn = server_content.pop("modelTurn", None)
                if model_turn is not None:
                    parts = model_turn.pop("parts", None)
                    if parts is not None:
                        for part in parts:
                            pcm_data = base64.b64decode(part["inlineData"]["data"])
                            responses.append(np.frombuffer(pcm_data, dtype=np.int16))

                # End of turn
                turn_complete = server_content.pop("turnComplete", None)
                if turn_complete:
                    break

            # Save the returned audio message (24 kHz, 16-bit PCM) to a file.
            if responses:
                wavfile.write("output.wav", 24000, np.concatenate(responses))
            return

        await setup()

        while True:
            if not await send():
                break
            await receive()


if __name__ == "__main__":
    asyncio.run(main())
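
Note (not part of the diff): a quick way to sanity-check the saved audio, reusing the sample's own scipy/numpy imports; assumes output.wav was produced by a run of the script above:

from scipy.io import wavfile

# Inspect the saved 24 kHz, 16-bit PCM audio.
rate, data = wavfile.read("output.wav")
print(f"{rate} Hz, {data.dtype}, {len(data) / rate:.2f} s of audio")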
73 changes: 73 additions & 0 deletions genai/live/live_ground_ragengine_with_txt.py
@@ -0,0 +1,73 @@
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio

_memory_corpus = "projects/cloud-ai-devrel-softserve/locations/us-central1/ragCorpora/2305843009213693952"
Comment on lines +14 to +16 (Contributor review comment, severity: medium)

For better reusability and to avoid exposing project-specific details, it's recommended to construct the _memory_corpus string from environment variables for the project ID and a placeholder for the user-specific RAG corpus ID. This requires importing the os module and aligns with the practices in other samples.

import asyncio
import os

# TODO(developer): Set this to your RAG Corpus ID.
RAG_CORPUS_ID = "your-rag-corpus-id"
_memory_corpus = f"projects/{os.getenv('GOOGLE_CLOUD_PROJECT')}/locations/us-central1/ragCorpora/{RAG_CORPUS_ID}"


async def generate_content(memory_corpus: str) -> list[str]:
    # [START googlegenaisdk_live_ground_ragengine_with_txt]
    from google import genai
    from google.genai.types import (
        Content,
        LiveConnectConfig,
        Modality,
        Part,
        Tool,
        Retrieval,
        VertexRagStore,
        VertexRagStoreRagResource,
    )

    client = genai.Client()
    model_id = "gemini-2.0-flash-live-preview-04-09"
    rag_store = VertexRagStore(
        rag_resources=[
            VertexRagStoreRagResource(
                rag_corpus=memory_corpus  # Use a memory corpus if you want to store context.
            )
        ],
        # Set `store_context` to True to allow the Live API to sink context
        # into your memory corpus.
        store_context=True,
    )
    config = LiveConnectConfig(
        response_modalities=[Modality.TEXT],
        tools=[Tool(retrieval=Retrieval(vertex_rag_store=rag_store))],
    )

    async with client.aio.live.connect(model=model_id, config=config) as session:
        text_input = "What year did Mariusz Pudzianowski win World's Strongest Man?"
        print("> ", text_input, "\n")

        await session.send_client_content(
            turns=Content(role="user", parts=[Part(text=text_input)])
        )

        response = []

        async for message in session.receive():
            if message.text:
                response.append(message.text)

        print("".join(response))
        # Example output:
        # > What year did Mariusz Pudzianowski win World's Strongest Man?
        # Mariusz Pudzianowski won World's Strongest Man in 2002, 2003, 2005, 2007, and 2008.
        # [END googlegenaisdk_live_ground_ragengine_with_txt]
        return response


if __name__ == "__main__":
    asyncio.run(generate_content(_memory_corpus))
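
Note (not part of the diff): if the review suggestion above is adopted, pointing the sample at your own corpus would look roughly like this; every identifier here is a placeholder, not a value from this PR:

import asyncio
import os

os.environ.setdefault("GOOGLE_CLOUD_PROJECT", "your-project-id")
RAG_CORPUS_ID = "your-rag-corpus-id"  # placeholder
corpus = (
    f"projects/{os.environ['GOOGLE_CLOUD_PROJECT']}"
    f"/locations/us-central1/ragCorpora/{RAG_CORPUS_ID}"
)
asyncio.run(generate_content(corpus))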
28 changes: 17 additions & 11 deletions genai/live/live_websocket_audiogen_with_txt.py
@@ -20,7 +20,9 @@ def get_bearer_token() -> str:
     import google.auth
     from google.auth.transport.requests import Request

-    creds, _ = google.auth.default(scopes=["https://www.googleapis.com/auth/cloud-platform"])
+    creds, _ = google.auth.default(
+        scopes=["https://www.googleapis.com/auth/cloud-platform"]
+    )
     auth_req = Request()
     creds.refresh(auth_req)
     bearer_token = creds.token
@@ -55,9 +57,7 @@ async def generate_content() -> str:

     # Websocket Configuration
     WEBSOCKET_HOST = "us-central1-aiplatform.googleapis.com"
-    WEBSOCKET_SERVICE_URL = (
-        f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent"
-    )
+    WEBSOCKET_SERVICE_URL = f"wss://{WEBSOCKET_HOST}/ws/google.cloud.aiplatform.v1.LlmBidiService/BidiGenerateContent"

     # Websocket Authentication
     headers = {
@@ -66,9 +66,7 @@ async def generate_content() -> str:
     }

     # Model Configuration
-    model_path = (
-        f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}"
-    )
+    model_path = f"projects/{PROJECT_ID}/locations/{LOCATION}/publishers/google/models/{GEMINI_MODEL_NAME}"
     model_generation_config = {
         "response_modalities": ["AUDIO"],
         "speech_config": {
@@ -77,7 +75,9 @@ async def generate_content() -> str:
         },
     }

-    async with connect(WEBSOCKET_SERVICE_URL, additional_headers=headers) as websocket_session:
+    async with connect(
+        WEBSOCKET_SERVICE_URL, additional_headers=headers
+    ) as websocket_session:
         # 1. Send setup configuration
         websocket_config = {
             "setup": {
@@ -120,7 +120,9 @@ async def generate_content() -> str:
             server_content = response_chunk.get("serverContent")
             if not server_content:
                 # This might indicate an error or an unexpected message format
-                print(f"Received non-serverContent message or empty content: {response_chunk}")
+                print(
+                    f"Received non-serverContent message or empty content: {response_chunk}"
+                )
                 break

             # Collect audio chunks
@@ -129,15 +131,19 @@ async def generate_content() -> str:
             for part in model_turn["parts"]:
                 if part["inlineData"]["mimeType"] == "audio/pcm":
                     audio_chunk = base64.b64decode(part["inlineData"]["data"])
-                    aggregated_response_parts.append(np.frombuffer(audio_chunk, dtype=np.int16))
+                    aggregated_response_parts.append(
+                        np.frombuffer(audio_chunk, dtype=np.int16)
+                    )

         # End of response
         if server_content.get("turnComplete"):
             break

     # Save audio to a file
     if aggregated_response_parts:
-        wavfile.write("output.wav", 24000, np.concatenate(aggregated_response_parts))
+        wavfile.write(
+            "output.wav", 24000, np.concatenate(aggregated_response_parts)
+        )
     # Example response:
     # Setup Response: {'setupComplete': {}}
     # Input: Hello? Gemini are you there?