---
type: tutorial
sidebar_position: 5
title: Gemini Live Integration
---

import Tabs from "@theme/Tabs";
import TabItem from "@theme/TabItem";

# Gemini Live Integration

:::info
This tutorial requires a working Fishjam backend. If you haven't set one up yet, please check the [Backend Quick Start](../tutorials/backend-quick-start).
:::

This guide demonstrates how to build a real-time speech-to-speech agent using Fishjam and [Google's Multimodal Live API](https://ai.google.dev/gemini-api/docs/live).
By connecting these two services, you can create a low-latency voice assistant that not only listens to peers in a room and responds with a natural voice, but can also provide a real-time transcription of the conversation.

## Overview

The implementation acts as a bridge between two real-time streams:

1. **Fishjam ➡️ Gemini:** The agent receives audio from the room and forwards it to Google GenAI.
2. **Gemini ➡️ Fishjam:** The agent receives audio generated by Gemini and plays it back into the room.

To ensure these streams connect without audio glitches (garbled voice, wrong pitch), the audio sample rates must match between the two services.

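For reference, here is a minimal sketch of the two PCM formats involved, assuming the 16 kHz input / 24 kHz output rates used throughout this tutorial. These constants are purely illustrative; the SDK presets shown later configure the formats for you:

```ts
// Illustrative only - the GeminiIntegration presets used below already encode these values.
const GEMINI_INPUT_FORMAT = {
  mimeType: "audio/pcm;rate=16000", // 16-bit PCM the agent sends to Gemini
  sampleRate: 16000,
};

const GEMINI_OUTPUT_FORMAT = {
  mimeType: "audio/pcm;rate=24000", // 16-bit PCM Gemini sends back
  sampleRate: 24000,
};
```
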
## Prerequisites

You will need:

- **Fishjam Server Credentials:** `fishjamId` and `managementToken`. You can get them at [fishjam.io/app](https://fishjam.io/app).
- **Google Gemini API Key:** Obtainable from [Google AI Studio](https://aistudio.google.com/).

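The code samples in this guide read these credentials from environment variables. You can export them in your shell before running the examples; the variable names below are simply the ones the later snippets read, so adjust them to your own setup:

```bash
export FISHJAM_ID="your-fishjam-id"
export FISHJAM_TOKEN="your-management-token"
export GOOGLE_API_KEY="your-gemini-api-key"
```
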
### Installation

Since the Google integration is optional, you need to install the specific dependencies for your SDK.

<Tabs groupId="language">
  <TabItem value="ts" label="TypeScript">

  First, ensure you have the Google GenAI SDK installed alongside Fishjam.

  ```bash
  npm install @fishjam-cloud/js-server-sdk @google/genai
  ```

  </TabItem>

  <TabItem value="python" label="Python">

  Install Fishjam with the `gemini` extra to pull in the necessary libraries.

  ```bash
  pip install "fishjam[gemini]"
  ```

  </TabItem>
</Tabs>

## Implementation

### Step 1: Initialize Clients

We provide a helper factory to initialize the Google client.

<Tabs groupId="language">
  <TabItem value="ts" label="TypeScript">

  ```ts
  import { FishjamClient } from '@fishjam-cloud/js-server-sdk';
  import GeminiIntegration from '@fishjam-cloud/js-server-sdk/gemini';

  const fishjamClient = new FishjamClient({
    fishjamId: process.env.FISHJAM_ID!,
    managementToken: process.env.FISHJAM_TOKEN!,
  });

  // [!code highlight:4]
  const genAi = GeminiIntegration.createClient({
    // Pass standard Google client options here
    apiKey: process.env.GOOGLE_API_KEY!,
  });
  ```

  </TabItem>

  <TabItem value="python" label="Python">

  ```python
  import os
  from fishjam import FishjamClient, GeminiIntegration

  fishjam_client = FishjamClient(
      fishjam_id=os.environ["FISHJAM_ID"],
      management_token=os.environ["FISHJAM_TOKEN"]
  )

  # pass standard Google client kwargs here
  # [!code highlight:1]
  gen_ai = GeminiIntegration.create_client(api_key=os.environ["GOOGLE_API_KEY"])
  ```

  </TabItem>
</Tabs>

### Step 2: Configure the Agent

Create a Fishjam agent configured to match the audio format that the Google client expects (16 kHz for Gemini input and 24 kHz for Gemini output).

<Tabs groupId="language">
  <TabItem value="ts" label="TypeScript">

  ```ts
  import GeminiIntegration from '@fishjam-cloud/js-server-sdk/gemini';

  const room = await fishjamClient.createRoom();

  const { agent } = await fishjamClient.createAgent(room.id, {
    subscribeMode: 'auto',
    // Use our preset to match the required audio format (16kHz)
    // [!code highlight:1]
    output: GeminiIntegration.geminiInputAudioSettings,
  });
  ```

  </TabItem>

  <TabItem value="python" label="Python">

  ```python
  from fishjam import GeminiIntegration
  # NOTE: the import path of AgentOptions may differ between SDK versions
  from fishjam.agent import AgentOptions

  room = fishjam_client.create_room()

  # Use our preset to match the required audio format (16kHz)
  # [!code highlight:1]
  agent_options = AgentOptions(output=GeminiIntegration.GeminiInputAudioSettings)
  agent = fishjam_client.create_agent(room.id, agent_options)
  ```

  </TabItem>
</Tabs>

### Step 3: Connect the Streams

:::warning Encoding
Fishjam works with raw PCM bytes. The Google GenAI TypeScript SDK exchanges audio as Base64-encoded strings, while the Python SDK generally accepts and returns raw bytes, so convert only where it is actually needed, as shown below.
:::

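If you do need to convert manually in Node.js, the built-in `Buffer` API is enough; a minimal sketch:

```ts
// Raw PCM bytes -> Base64 string (the format the TypeScript GenAI SDK exchanges)
const toBase64 = (pcm: Buffer): string => pcm.toString("base64");

// Base64 string -> raw PCM bytes (the format Fishjam works with)
const fromBase64 = (b64: string): Buffer => Buffer.from(b64, "base64");
```
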
<Tabs groupId="language">
  <TabItem value="ts" label="TypeScript">

  Now we set up the callbacks. We need to forward incoming Fishjam audio to Google, and forward incoming Google audio to Fishjam.

  ```ts
  import GeminiIntegration from '@fishjam-cloud/js-server-sdk/gemini';

  const GEMINI_MODEL = "gemini-2.5-flash-native-audio-preview-09-2025";

  // Use our preset to match the required audio format (24kHz)
  const agentTrack = agent.createTrack(GeminiIntegration.geminiOutputAudioSettings);

  const session = await genAi.live.connect({
    model: GEMINI_MODEL,
    config: { responseModalities: ["AUDIO"] },
    callbacks: {
      // Google -> Fishjam
      onmessage: (msg) => {
        if (msg.data) {
          const pcmData = Buffer.from(msg.data, "base64");
          agent.sendData(agentTrack.id, pcmData);
        }

        if (msg.serverContent?.interrupted) {
          console.log("Agent was interrupted by user.");
          // Clears the buffer on the Fishjam media server
          agent.interruptTrack(agentTrack.id);
        }
      },
    },
  });

  // Fishjam -> Google
  agent.on('trackData', ({ data }) => {
    // sendRealtimeInput expects a Blob-shaped object under the audio field
    session.sendRealtimeInput({
      audio: {
        mimeType: GeminiIntegration.geminiInputMimeType,
        data: data.toString("base64"),
      },
    });
  });
  ```

  </TabItem>

  <TabItem value="python" label="Python">

  Now we connect the WebSocket loops. We need to forward incoming Fishjam audio to Google, and forward incoming Google audio to Fishjam.

  ```python
  import asyncio
  from fishjam import GeminiIntegration

  GEMINI_MODEL = "gemini-2.5-flash-native-audio-preview-09-2025"

  async with agent.connect() as fishjam_session:

      # Use our preset to match the required audio format (24kHz)
      outgoing_track = fishjam_session.create_track(GeminiIntegration.GeminiOutputAudioSettings)

      async with gen_ai.aio.live.connect(
          model=GEMINI_MODEL,
          config={"response_modalities": ["AUDIO"]}
      ) as gemini_session:

          # Fishjam -> Google
          async def forward_audio_to_gemini():
              async for track_data in fishjam_session.receive():
                  # The Python SDK handles Base64 under the hood, so pass raw PCM bytes
                  await gemini_session.send_realtime_input(audio={
                      "mime_type": GeminiIntegration.GeminiInputMimeType,
                      "data": track_data.data
                  })

          # Google -> Fishjam
          async def forward_audio_to_fishjam():
              async for msg in gemini_session.receive():
                  server_content = msg.server_content

                  if server_content is None:
                      continue

                  if server_content.interrupted:
                      # Clears the buffer on the Fishjam media server
                      outgoing_track.interrupt()

                  if server_content.model_turn:
                      for part in server_content.model_turn.parts:
                          if part.inline_data:
                              # inline_data.data is already raw PCM bytes
                              outgoing_track.send_chunk(part.inline_data.data)

          # Run both loops concurrently
          await asyncio.gather(
              forward_audio_to_gemini(),
              forward_audio_to_fishjam()
          )
  ```

  </TabItem>
</Tabs>
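
With both directions wired up, the agent is live in the room. To talk to it, a user joins the same room as a regular peer. The sketch below assumes the `createPeer` helper from the server SDK covered in the [Backend Quick Start](../tutorials/backend-quick-start); adapt it to however your backend already issues peer tokens:

```ts
// Hypothetical follow-up: mint a peer token so a client app can join the room
// and talk to the agent (see the Backend Quick Start for the full flow).
const { peer, peerToken } = await fishjamClient.createPeer(room.id);

// Hand peerToken to your web or mobile client. Once the user joins and speaks,
// their audio reaches the agent through the handlers wired up above.
console.log(`Peer ${peer.id} can join with token: ${peerToken}`);
```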