---
type: tutorial
sidebar_position: 5
title: Gemini Live Integration
---

import Tabs from "@theme/Tabs";
import TabItem from "@theme/TabItem";

# Gemini Live Integration

:::info
This tutorial requires a working Fishjam backend. If you haven't set one up yet, please check the [Backend Quick Start](../tutorials/backend-quick-start).
:::

This guide demonstrates how to build a real-time speech-to-speech agent using Fishjam and [Google's Multimodal Live API](https://ai.google.dev/gemini-api/docs/live).
By connecting these two services, you can create a low-latency voice assistant that not only listens to peers in a room and responds with a natural voice, but can also provide a real-time transcription of the conversation.

## Overview

The implementation acts as a bridge between two real-time streams:

1. **Fishjam ➡️ Gemini:** The agent receives audio from the room and forwards it to Google GenAI.
2. **Gemini ➡️ Fishjam:** The agent receives audio generated by Gemini and plays it back into the room.

To ensure these streams connect without audio glitches (garbled voice, wrong pitch), the audio sample rates must match between the two services.

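For reference, here is a minimal sketch of the two PCM formats involved, assuming the 16 kHz input / 24 kHz output rates used throughout this tutorial. These constants are purely illustrative; the SDK presets shown later configure the formats for you:

```ts
// Illustrative only - the GeminiIntegration presets used below already encode these values.
const GEMINI_INPUT_FORMAT = {
  mimeType: "audio/pcm;rate=16000", // 16-bit PCM the agent sends to Gemini
  sampleRate: 16000,
};

const GEMINI_OUTPUT_FORMAT = {
  mimeType: "audio/pcm;rate=24000", // 16-bit PCM Gemini sends back
  sampleRate: 24000,
};
```
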
## Prerequisites

You will need:

- **Fishjam Server Credentials:** `fishjamId` and `managementToken`. You can get them at [fishjam.io/app](https://fishjam.io/app).
- **Google Gemini API Key:** Obtainable from [Google AI Studio](https://aistudio.google.com/).

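The code samples in this guide read these credentials from environment variables. You can export them in your shell before running the examples; the variable names below are simply the ones the later snippets read, so adjust them to your own setup:

```bash
export FISHJAM_ID="your-fishjam-id"
export FISHJAM_TOKEN="your-management-token"
export GOOGLE_API_KEY="your-gemini-api-key"
```
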
### Installation

Since the Google integration is optional, you need to install the specific dependencies for your SDK.

<Tabs groupId="language">
  <TabItem value="ts" label="TypeScript">

  First, ensure you have the Google GenAI SDK installed alongside Fishjam.

  ```bash
  npm install @fishjam-cloud/js-server-sdk @google/genai
  ```

  </TabItem>

  <TabItem value="python" label="Python">

  Install Fishjam with the `gemini` extra to pull in the necessary libraries.

  ```bash
  pip install "fishjam[gemini]"
  ```

  </TabItem>
</Tabs>

## Implementation

### Step 1: Initialize Clients

We provide a helper factory to initialize the Google client.

<Tabs groupId="language">
  <TabItem value="ts" label="TypeScript">

  ```ts
  import { FishjamClient } from '@fishjam-cloud/js-server-sdk';
  import GeminiIntegration from '@fishjam-cloud/js-server-sdk/gemini';

  const fishjamClient = new FishjamClient({
    fishjamId: process.env.FISHJAM_ID!,
    managementToken: process.env.FISHJAM_TOKEN!,
  });

  // [!code highlight:4]
  const genAi = GeminiIntegration.createClient({
    // Pass standard Google client options here
    apiKey: process.env.GOOGLE_API_KEY!,
  });
  ```

  </TabItem>

  <TabItem value="python" label="Python">

  ```python
  import os
  from fishjam import FishjamClient, GeminiIntegration

  fishjam_client = FishjamClient(
      fishjam_id=os.environ["FISHJAM_ID"],
      management_token=os.environ["FISHJAM_TOKEN"]
  )

  # pass standard Google client kwargs here
  # [!code highlight:1]
  gen_ai = GeminiIntegration.create_client(api_key=os.environ["GOOGLE_API_KEY"])
  ```

  </TabItem>
</Tabs>

### Step 2: Configure the Agent

Create a Fishjam agent configured to match the audio format that the Google client expects (16 kHz for Gemini input and 24 kHz for Gemini output).

<Tabs groupId="language">
  <TabItem value="ts" label="TypeScript">

  ```ts
  import GeminiIntegration from '@fishjam-cloud/js-server-sdk/gemini';

  const room = await fishjamClient.createRoom();

  const { agent } = await fishjamClient.createAgent(room.id, {
    subscribeMode: 'auto',
    // Use our preset to match the required audio format (16kHz)
    // [!code highlight:1]
    output: GeminiIntegration.geminiInputAudioSettings,
  });
  ```

  </TabItem>

  <TabItem value="python" label="Python">

  ```python
  from fishjam import GeminiIntegration
  # NOTE: the import path of AgentOptions may differ between SDK versions
  from fishjam.agent import AgentOptions

  room = fishjam_client.create_room()

  # Use our preset to match the required audio format (16kHz)
  # [!code highlight:1]
  agent_options = AgentOptions(output=GeminiIntegration.GeminiInputAudioSettings)
  agent = fishjam_client.create_agent(room.id, agent_options)
  ```

  </TabItem>
</Tabs>

### Step 3: Connect the Streams

:::warning Encoding
Fishjam works with raw PCM bytes. The Google GenAI TypeScript SDK exchanges audio as Base64-encoded strings, while the Python SDK generally accepts and returns raw bytes, so convert only where it is actually needed, as shown below.
:::

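If you do need to convert manually in Node.js, the built-in `Buffer` API is enough; a minimal sketch:

```ts
// Raw PCM bytes -> Base64 string (the format the TypeScript GenAI SDK exchanges)
const toBase64 = (pcm: Buffer): string => pcm.toString("base64");

// Base64 string -> raw PCM bytes (the format Fishjam works with)
const fromBase64 = (b64: string): Buffer => Buffer.from(b64, "base64");
```
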
<Tabs groupId="language">
  <TabItem value="ts" label="TypeScript">

  Now we set up the callbacks. We need to forward incoming Fishjam audio to Google, and forward incoming Google audio to Fishjam.

  ```ts
  import GeminiIntegration from '@fishjam-cloud/js-server-sdk/gemini';

  const GEMINI_MODEL = "gemini-2.5-flash-native-audio-preview-09-2025";

  // Use our preset to match the required audio format (24kHz)
  const agentTrack = agent.createTrack(GeminiIntegration.geminiOutputAudioSettings);

  const session = await genAi.live.connect({
    model: GEMINI_MODEL,
    config: { responseModalities: ["AUDIO"] },
    callbacks: {
      // Google -> Fishjam
      onmessage: (msg) => {
        if (msg.data) {
          const pcmData = Buffer.from(msg.data, "base64");
          agent.sendData(agentTrack.id, pcmData);
        }

        if (msg.serverContent?.interrupted) {
          console.log("Agent was interrupted by user.");
          // Clears the buffer on the Fishjam media server
          agent.interruptTrack(agentTrack.id);
        }
      },
    },
  });

  // Fishjam -> Google
  agent.on('trackData', ({ data }) => {
    // sendRealtimeInput expects a Blob-shaped object under the audio field
    session.sendRealtimeInput({
      audio: {
        mimeType: GeminiIntegration.geminiInputMimeType,
        data: data.toString("base64"),
      },
    });
  });
  ```

  </TabItem>

  <TabItem value="python" label="Python">

  Now we connect the WebSocket loops. We need to forward incoming Fishjam audio to Google, and forward incoming Google audio to Fishjam.

  ```python
  import asyncio
  from fishjam import GeminiIntegration

  GEMINI_MODEL = "gemini-2.5-flash-native-audio-preview-09-2025"

  async with agent.connect() as fishjam_session:

      # Use our preset to match the required audio format (24kHz)
      outgoing_track = fishjam_session.create_track(GeminiIntegration.GeminiOutputAudioSettings)

      async with gen_ai.aio.live.connect(
          model=GEMINI_MODEL,
          config={"response_modalities": ["AUDIO"]}
      ) as gemini_session:

          # Fishjam -> Google
          async def forward_audio_to_gemini():
              async for track_data in fishjam_session.receive():
                  # The Python SDK handles Base64 under the hood, so pass raw PCM bytes
                  await gemini_session.send_realtime_input(audio={
                      "mime_type": GeminiIntegration.GeminiInputMimeType,
                      "data": track_data.data
                  })

          # Google -> Fishjam
          async def forward_audio_to_fishjam():
              async for msg in gemini_session.receive():
                  server_content = msg.server_content

                  if server_content is None:
                      continue

                  if server_content.interrupted:
                      # Clears the buffer on the Fishjam media server
                      outgoing_track.interrupt()

                  if server_content.model_turn:
                      for part in server_content.model_turn.parts:
                          if part.inline_data:
                              # inline_data.data is already raw PCM bytes
                              outgoing_track.send_chunk(part.inline_data.data)

          # Run both loops concurrently
          await asyncio.gather(
              forward_audio_to_gemini(),
              forward_audio_to_fishjam()
          )
  ```

  </TabItem>
</Tabs>
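
With both directions wired up, the agent is live in the room. To talk to it, a user joins the same room as a regular peer. The sketch below assumes the `createPeer` helper from the server SDK covered in the [Backend Quick Start](../tutorials/backend-quick-start); adapt it to however your backend already issues peer tokens:

```ts
// Hypothetical follow-up: mint a peer token so a client app can join the room
// and talk to the agent (see the Backend Quick Start for the full flow).
const { peer, peerToken } = await fishjamClient.createPeer(room.id);

// Hand peerToken to your web or mobile client. Once the user joins and speaks,
// their audio reaches the agent through the handlers wired up above.
console.log(`Peer ${peer.id} can join with token: ${peerToken}`);
```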