Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions genai/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ go 1.24.0

require (
github.com/GoogleCloudPlatform/golang-samples v0.0.0-20250201051611-5fb145d1e974
github.com/go-audio/audio v1.0.0
github.com/go-audio/wav v1.1.0
golang.org/x/oauth2 v0.25.0
google.golang.org/genai v1.17.0
)
Expand All @@ -25,6 +27,7 @@ require (
github.com/envoyproxy/go-control-plane/envoy v1.32.3 // indirect
github.com/envoyproxy/protoc-gen-validate v1.1.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-audio/riff v1.0.0 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/google/go-cmp v0.6.0 // indirect
Expand Down
6 changes: 6 additions & 0 deletions genai/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ github.com/envoyproxy/protoc-gen-validate v1.1.0 h1:tntQDh69XqOCOZsDz0lVJQez/2L6
github.com/envoyproxy/protoc-gen-validate v1.1.0/go.mod h1:sXRDRVmzEbkM7CVcM06s9shE/m23dg3wzjl0UWqJ2q4=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
Expand Down
190 changes: 190 additions & 0 deletions genai/live/live_conversation_audio_with_audio.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package live shows how to use the GenAI SDK to generate text with live resources.
package live

// [START googlegenaisdk_live_conversation_audio_with_audio]
import (
"bytes"
"context"
"encoding/binary"
"fmt"
"io"
"os"

"github.com/go-audio/audio"
"github.com/go-audio/wav"
"google.golang.org/genai"
)

// generateLiveAudioConversation demonstrates two-way audio interaction with a
// Gemini model using live streaming.
//
// It opens a Live API session, reads the WAV file at audioFilePath, sends the
// decoded PCM to the model as a single realtime-input blob, and then prints the
// input and output transcriptions to w as they arrive. Any audio returned by
// the model is collected and written to "model_response.wav" in the current
// working directory.
//
// An error is returned if the client or session cannot be created, the audio
// file cannot be loaded or sent, receiving a message fails, or the response
// WAV cannot be written.
func generateLiveAudioConversation(w io.Writer, audioFilePath string) error {
	// Sample rate used for the WAV file that stores the model's reply.
	// NOTE(review): the Live API documentation describes audio output as
	// 24 kHz 16-bit PCM; confirm this still holds for the selected model.
	const outputSampleRate = 24000

	ctx := context.Background()

	client, err := genai.NewClient(ctx, &genai.ClientConfig{
		HTTPOptions: genai.HTTPOptions{
			APIVersion: "v1beta1",
		},
	})
	if err != nil {
		return fmt.Errorf("failed to create genai client: %w", err)
	}

	modelName := "gemini-live-2.5-flash-preview-native-audio-09-2025"

	// Configure model to receive and respond with audio, including transcriptions.
	config := &genai.LiveConnectConfig{
		ResponseModalities:       []genai.Modality{genai.ModalityAudio},
		InputAudioTranscription:  &genai.AudioTranscriptionConfig{},
		OutputAudioTranscription: &genai.AudioTranscriptionConfig{},
	}

	session, err := client.Live.Connect(ctx, modelName, config)
	if err != nil {
		return fmt.Errorf("failed to connect live: %w", err)
	}
	defer session.Close()

	// Load the audio file
	audioBytes, mimeType, err := loadAudioAsPCMBytes(audioFilePath)
	if err != nil {
		return fmt.Errorf("failed to load audio: %w", err)
	}

	fmt.Fprintf(w, "> Streaming audio from %s to the model\n\n", audioFilePath)

	// Send audio data to the model
	err = session.SendRealtimeInput(genai.LiveRealtimeInput{
		Media: &genai.Blob{
			Data:     audioBytes,
			MIMEType: mimeType,
		},
	})
	if err != nil {
		return fmt.Errorf("failed to send realtime input: %w", err)
	}

	// Gather audio response frames
	var audioFrames [][]byte

	for {
		chunk, err := session.Receive()
		if err != nil {
			if err == io.EOF {
				break
			}
			return fmt.Errorf("error receiving response: %w", err)
		}

		content := chunk.ServerContent
		if content == nil {
			continue
		}

		if content.InputTranscription != nil {
			fmt.Fprintf(w, "Input transcription: %s\n", content.InputTranscription.Text)
		}
		if content.OutputTranscription != nil {
			fmt.Fprintf(w, "Output transcription: %s\n", content.OutputTranscription.Text)
		}
		if content.ModelTurn != nil {
			for _, part := range content.ModelTurn.Parts {
				if part.InlineData != nil && len(part.InlineData.Data) > 0 {
					audioFrames = append(audioFrames, part.InlineData.Data)
				}
			}
		}

		// Receive returns one message per call and the session stays open
		// after the model finishes speaking, so waiting for io.EOF alone
		// would block. Stop once the model reports its turn is complete.
		if content.TurnComplete {
			break
		}
	}

	// Save audio frames to WAV file if available
	if len(audioFrames) > 0 {
		outputFile := "model_response.wav"
		if err := saveAudioFramesAsWAV(outputFile, audioFrames, outputSampleRate); err != nil {
			return fmt.Errorf("failed to write WAV: %w", err)
		}
		fmt.Fprintf(w, "Model response saved to %s\n", outputFile)
	}

	// Example output (illustrative; transcription text depends on the input audio):
	// > Streaming audio from hello.wav to the model
	//
	// Input transcription: Hello.
	// Output transcription: Hi
	// Output transcription:  there. What can I do for you today?
	// Model response saved to model_response.wav
	return nil
}

// loadAudioAsPCMBytes reads the WAV file at path and returns its samples as
// little-endian 16-bit PCM bytes, together with a MIME type of the form
// "audio/pcm;rate=<sample rate>" built from the rate stored in the file.
//
// Only 16-bit WAV input is accepted. Each decoded sample is narrowed to int16
// below, so any other bit depth would be silently corrupted; such files are
// rejected with an error instead.
//
// An error is also returned if the file cannot be opened, is not a valid WAV
// file, or cannot be decoded.
func loadAudioAsPCMBytes(path string) ([]byte, string, error) {
	const (
		supportedBitDepth = 16
		bytesPerSample    = supportedBitDepth / 8
	)

	file, err := os.Open(path)
	if err != nil {
		return nil, "", fmt.Errorf("failed to open WAV file: %w", err)
	}
	defer file.Close()

	wavDecoder := wav.NewDecoder(file)
	if !wavDecoder.IsValidFile() {
		return nil, "", fmt.Errorf("invalid WAV file")
	}
	// The header has been parsed by IsValidFile, so the bit depth is known
	// before any sample data is read.
	if wavDecoder.BitDepth != supportedBitDepth {
		return nil, "", fmt.Errorf("unsupported WAV bit depth %d: only %d-bit PCM is supported",
			wavDecoder.BitDepth, supportedBitDepth)
	}

	buf, err := wavDecoder.FullPCMBuffer()
	if err != nil {
		return nil, "", fmt.Errorf("failed to decode WAV: %w", err)
	}

	sampleRate := wavDecoder.SampleRate
	rawInts := buf.Data
	data := make([]byte, len(rawInts)*bytesPerSample)

	// Re-encode every sample as two little-endian bytes.
	for i, sample := range rawInts {
		binary.LittleEndian.PutUint16(data[i*bytesPerSample:], uint16(int16(sample)))
	}

	mimeType := fmt.Sprintf("audio/pcm;rate=%d", sampleRate)
	return data, mimeType, nil
}

// saveAudioFramesAsWAV concatenates frames, interprets the result as mono
// little-endian 16-bit PCM, and writes it to filePath as a WAV file with the
// given sampleRate. An existing file at filePath is truncated.
//
// If the concatenated data has an odd number of bytes, the trailing byte does
// not form a complete sample and is dropped.
//
// An error is returned if the file cannot be created, written, finalized, or
// closed.
func saveAudioFramesAsWAV(filePath string, frames [][]byte, sampleRate int) error {
	const (
		bitDepth       = 16
		bytesPerSample = bitDepth / 8
		numChannels    = 1
		wavFormatPCM   = 1 // WAV audio format tag passed to the encoder
	)

	audioData := bytes.Join(frames, nil)

	// Integer division drops an incomplete trailing sample, so the loop
	// below never reads past the end of audioData.
	numSamples := len(audioData) / bytesPerSample

	// Create buffer
	intData := audio.IntBuffer{
		Format:         &audio.Format{NumChannels: numChannels, SampleRate: sampleRate},
		Data:           make([]int, numSamples),
		SourceBitDepth: bitDepth,
	}

	for i := range intData.Data {
		// Going through int16 keeps the sign of negative samples.
		intData.Data[i] = int(int16(binary.LittleEndian.Uint16(audioData[i*bytesPerSample:])))
	}

	file, err := os.Create(filePath)
	if err != nil {
		return fmt.Errorf("failed to create WAV file: %w", err)
	}
	// Releases the handle on the error paths. On success the file is closed
	// explicitly below and this second Close is harmless.
	defer file.Close()

	wavEncoder := wav.NewEncoder(file, sampleRate, bitDepth, numChannels, wavFormatPCM)
	if err := wavEncoder.Write(&intData); err != nil {
		return fmt.Errorf("failed to write audio data: %w", err)
	}

	if err := wavEncoder.Close(); err != nil {
		return fmt.Errorf("failed to finalize WAV file: %w", err)
	}

	// A failed Close on a file that was just written can mean lost data, so
	// report it instead of discarding it.
	if err := file.Close(); err != nil {
		return fmt.Errorf("failed to close WAV file: %w", err)
	}

	return nil
}

// [END googlegenaisdk_live_conversation_audio_with_audio]
58 changes: 58 additions & 0 deletions genai/live/live_examples_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,27 @@ func generateStructuredOutputWithTxtMock(w io.Writer) error {
_, err = fmt.Fprintln(w, string(b))
return err
}
// generateLiveRAGWithTextMock writes a fixed question-and-answer transcript,
// followed by a newline, to w. The memoryCorpus argument is accepted but not
// used. The returned error is whatever the write to w reports.
func generateLiveRAGWithTextMock(w io.Writer, memoryCorpus string) error {
	const cannedTranscript = "> What are the newest Gemini models?\n\nGemini 2.0 Flash and Gemini 2.5 Ultra are among the latest models released by Google."

	if _, writeErr := fmt.Fprintf(w, "%s\n", cannedTranscript); writeErr != nil {
		return writeErr
	}
	return nil
}

// generateLiveTextWithAudioMock stands in for generateLiveTextWithAudio in
// tests. It formats a prompt line naming a fixed sample audio URL together with
// a canned transcript and writes that text, followed by a newline, to w. It
// only writes to w; the returned error is whatever that write reports.
func generateLiveTextWithAudioMock(w io.Writer) error {
	const sampleURL = "https://storage.googleapis.com/generativeai-downloads/data/16000.wav"

	canned := fmt.Sprintf("> Answer to this audio url: %s\n\nMocked transcript response: Hello from mock!", sampleURL)
	if _, writeErr := fmt.Fprintf(w, "%s\n", canned); writeErr != nil {
		return writeErr
	}
	return nil
}

// generateLiveAudioConversationMock stands in for generateLiveAudioConversation
// in tests. It echoes the audioFile name in a "received" line, appends a canned
// response line, and writes the result, followed by a newline, to w. The file
// itself is never opened. The returned error is whatever the write to w reports.
func generateLiveAudioConversationMock(w io.Writer, audioFile string) error {
	canned := fmt.Sprintf("> Received audio file: %s\nProcessed mock response: Hello from mock audio!", audioFile)

	if _, writeErr := fmt.Fprintf(w, "%s\n", canned); writeErr != nil {
		return writeErr
	}
	return nil
}

func TestLiveGeneration(t *testing.T) {
tc := testutil.SystemTest(t)
Expand Down Expand Up @@ -96,4 +117,41 @@ func TestLiveGeneration(t *testing.T) {
}
})

t.Run("generate RAG with txt", func(t *testing.T) {
buf.Reset()
if err := generateLiveRAGWithTextMock(buf, "test"); err != nil {
t.Fatalf("generateLiveRAGWithText failed: %v", err)
}

output := buf.String()
if output == "" {
t.Error("expected non-empty output, got empty")
}
})

t.Run("generate text with audio", func(t *testing.T) {
buf.Reset()
if err := generateLiveTextWithAudioMock(buf); err != nil {
t.Fatalf("generateLiveTextWithAudio failed: %v", err)
}

output := buf.String()
if output == "" {
t.Error("expected non-empty output, got empty")
}
})

t.Run("generate live audio conversation", func(t *testing.T) {
buf.Reset()
err := generateLiveAudioConversationMock(buf, "sample_audio.wav")
if err != nil {
t.Fatalf("generateLiveAudioConversation failed: %v", err)
}

output := buf.String()
if output == "" {
t.Error("expected non-empty output, got empty")
}
})

}
Loading