GoogleCloudPlatform · Guiners · Jul 28, 2025 · Jul 28, 2025 · Jul 28, 2025 · Jul 28, 2025
@@ -0,0 +1,87 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
+# Install helpers for converting files: pip install librosa soundfile
+
+import asyncio
+
+
+async def generate_content() -> list[str]:
+    # [START googlegenaisdk_live_audio_with_txt]
+    import numpy as np
+    from IPython.display import Audio, Markdown, display
+    from google import genai
+    from google.genai.types import (
+        Content,
+        LiveConnectConfig,
+        Modality,
+        Part,
+        SpeechConfig,
+        VoiceConfig,
+        PrebuiltVoiceConfig,
+    )
+
+    client = genai.Client()
+    voice_name = "Aoede"
+
+    config = LiveConnectConfig(
+        response_modalities=[Modality.AUDIO],
+        speech_config=SpeechConfig(
+            voice_config=VoiceConfig(
+                prebuilt_voice_config=PrebuiltVoiceConfig(
+                    voice_name=voice_name,
+                )
+            ),
+        ),
+    )
+    model = "gemini-2.0-flash-live-preview-04-09"
+
+    async with client.aio.live.connect(
+            model=model,
+            config=config,
+    ) as session:
+        text_input = "Hello? Gemini are you there?"
+        print("> ", text_input, "\n")
+
+        await session.send_client_content(
+            turns=Content(role="user", parts=[Part(text=text_input)]))
+
+        audio_data = []
+        async for message in session.receive():
+            if (
+                    message.server_content.model_turn
+                    and message.server_content.model_turn.parts
+            ):
+                for part in message.server_content.model_turn.parts:
+                    if part.inline_data:
+                        audio_data.append(
+                            np.frombuffer(part.inline_data.data, dtype=np.int16)
+                        )
+
+        if audio_data:
+            print("Received audio answer: ")
+            display(Audio(np.concatenate(audio_data), rate=24000, autoplay=True))
+
+    # Example output:
+    # >  Hello? Gemini are you there?
+    # Received audio answer:
+    # <IPython.lib.display.Audio object>
+    # [STOP googlegenaisdk_live_audio_with_txt]
+    return []
+
+
+if __name__ == "__main__":
+    asyncio.run(generate_content())
@@ -0,0 +1,70 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#    https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Test file: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
+# Install helpers for converting files: pip install librosa soundfile
+
+import asyncio
+
+
+async def generate_content() -> list[str]:
+    # [START googlegenaisdk_live_txt_with_audio]
+    import io
+    import requests
+    from google import genai
+    from google.genai.types import Modality, LiveConnectConfig, Blob
+    import soundfile as sf
+    import librosa
+
+    client = genai.Client()
+    model = "gemini-2.0-flash-live-preview-04-09"
+    config = LiveConnectConfig(response_modalities=[Modality.TEXT])
+
+    async with client.aio.live.connect(model=model, config=config) as session:
+        audio_url = "https://storage.googleapis.com/generativeai-downloads/data/16000.wav"
+        response = requests.get(audio_url)
+        response.raise_for_status()
+        buffer = io.BytesIO(response.content)
+        y, sr = librosa.load(buffer, sr=16000)
+        sf.write(buffer, y, sr, format="RAW", subtype="PCM_16")
+        buffer.seek(0)
+        audio_bytes = buffer.read()
+
+        # If you've pre-converted to sample.pcm using ffmpeg, use this instead:
+        # audio_bytes = Path("sample.pcm").read_bytes()
+
+        print("> Answer to this audio url", audio_url, "\n")
+
+        await session.send_realtime_input(
+            media=Blob(data=audio_bytes, mime_type="audio/pcm;rate=16000")
+        )
+
+        response = []
+
+        async for message in session.receive():
+            if message.text is not None:
+                response.append(message.text)
+
+        print("".join(response))
+    # Example output:
+    # > Answer to this audio url https://storage.googleapis.com/generativeai-downloads/data/16000.wav
+    # Yes, I can hear you. How can I help you today?
+    # [STOP googlegenaisdk_live_txt_with_audio]
+    return response
+
+
+
+if __name__ == "__main__":
+    asyncio.run(generate_content())
@@ -1,3 +1,5 @@
 google-genai==1.20.0
 scipy==1.15.3
-websockets==15.0.1
+websockets==15.0.1
+librosa==0.11.0
+IPython==8.26.0
@@ -25,6 +25,8 @@
 import live_websocket_textgen_with_audio
 import live_websocket_textgen_with_txt
 import live_with_txt
+import live_txt_with_audio
+import live_audio_with_txt
 
 os.environ["GOOGLE_GENAI_USE_VERTEXAI"] = "True"
 os.environ["GOOGLE_CLOUD_LOCATION"] = "us-central1"
@@ -55,3 +57,14 @@ async def test_live_websocket_audiogen_with_txt() -> None:
 @pytest.mark.asyncio
 async def test_live_websocket_audiotranscript_with_txt() -> None:
     assert await live_websocket_audiotranscript_with_txt.generate_content()
+
+
+@pytest.mark.asyncio
+async def test_live_txt_with_audio() -> None:
+    assert await live_txt_with_audio.generate_content()
+
+
+@pytest.mark.asyncio
+async def test_live_audio_with_txt() -> None:
+    result = await live_audio_with_txt.generate_content()
+    assert result is not None