Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions genai/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ go 1.24.0

require (
github.com/GoogleCloudPlatform/golang-samples v0.0.0-20250201051611-5fb145d1e974
github.com/go-audio/audio v1.0.0
github.com/go-audio/wav v1.1.0
golang.org/x/oauth2 v0.25.0
google.golang.org/genai v1.17.0
)
Expand All @@ -25,6 +27,7 @@ require (
github.com/envoyproxy/go-control-plane/envoy v1.32.3 // indirect
github.com/envoyproxy/protoc-gen-validate v1.1.0 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-audio/riff v1.0.0 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect
github.com/google/go-cmp v0.6.0 // indirect
Expand Down
6 changes: 6 additions & 0 deletions genai/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ github.com/envoyproxy/protoc-gen-validate v1.1.0 h1:tntQDh69XqOCOZsDz0lVJQez/2L6
github.com/envoyproxy/protoc-gen-validate v1.1.0/go.mod h1:sXRDRVmzEbkM7CVcM06s9shE/m23dg3wzjl0UWqJ2q4=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
Expand Down
192 changes: 192 additions & 0 deletions genai/live/live_conversation_audio_with_audio.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
// Copyright 2025 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package live shows how to use the GenAI SDK to generate text with live resources.
package live

// [START googlegenaisdk_live_conversation_audio_with_audio]
import (
"context"
"encoding/binary"
"fmt"
"io"
"os"

"github.com/go-audio/audio"
"github.com/go-audio/wav"
"google.golang.org/genai"
)

// generateLiveAudioConversation demonstrates two-way audio interaction with a
// Gemini model using live streaming.
//
// It reads the WAV file at audioFilePath, sends its PCM samples to a live
// session, prints the input and output transcriptions to w as they arrive, and
// writes the audio returned by the model to "model_response.wav" in the
// current working directory. It returns an error if the client, the session,
// the input file, or the output file cannot be set up or used.
func generateLiveAudioConversation(w io.Writer, audioFilePath string) error {
	ctx := context.Background()

	client, err := genai.NewClient(ctx, &genai.ClientConfig{
		HTTPOptions: genai.HTTPOptions{
			APIVersion: "v1beta1",
		},
	})
	if err != nil {
		return fmt.Errorf("failed to create genai client: %w", err)
	}

	modelName := "gemini-live-2.5-flash-preview-native-audio-09-2025"

	// Configure model to receive and respond with audio, including transcriptions.
	config := &genai.LiveConnectConfig{
		ResponseModalities:       []genai.Modality{genai.ModalityAudio},
		InputAudioTranscription:  &genai.AudioTranscriptionConfig{},
		OutputAudioTranscription: &genai.AudioTranscriptionConfig{},
	}

	session, err := client.Live.Connect(ctx, modelName, config)
	if err != nil {
		return fmt.Errorf("failed to connect live: %w", err)
	}
	defer session.Close()

	// Load the audio file
	audioBytes, mimeType, err := loadAudioAsPCMBytes(audioFilePath)
	if err != nil {
		return fmt.Errorf("failed to load audio: %w", err)
	}

	fmt.Fprintf(w, "> Streaming audio from %s to the model\n\n", audioFilePath)

	// Send audio data to the model
	err = session.SendRealtimeInput(genai.LiveSendRealtimeInputParameters{
		Media: &genai.Blob{
			Data:     audioBytes,
			MIMEType: mimeType,
		},
	})
	if err != nil {
		return fmt.Errorf("failed to send realtime input: %w", err)
	}

	// Gather audio response frames until the model finishes its turn.
	var audioFrames [][]byte

	for {
		chunk, err := session.Receive()
		if err != nil {
			if err == io.EOF {
				break
			}
			return fmt.Errorf("error receiving response: %w", err)
		}

		content := chunk.ServerContent
		if content == nil {
			continue
		}

		if content.InputTranscription != nil {
			fmt.Fprintf(w, "Input transcription: %s\n", content.InputTranscription.Text)
		}
		if content.OutputTranscription != nil {
			fmt.Fprintf(w, "Output transcription: %s\n", content.OutputTranscription.Text)
		}
		if content.ModelTurn != nil {
			for _, part := range content.ModelTurn.Parts {
				if part.InlineData != nil && len(part.InlineData.Data) > 0 {
					audioFrames = append(audioFrames, part.InlineData.Data)
				}
			}
		}

		// The session stays open after the model has answered, so waiting for
		// io.EOF alone would block here. Stop once the turn is complete.
		if content.TurnComplete {
			break
		}
	}

	// Save audio frames to WAV file if available
	if len(audioFrames) > 0 {
		outputFile := "model_response.wav"
		// The response frames are written as 16-bit mono PCM at 24000 Hz.
		if err := saveAudioFramesAsWAV(outputFile, audioFrames, 24000); err != nil {
			return fmt.Errorf("failed to write WAV: %w", err)
		}
		fmt.Fprintf(w, "Model response saved to %s\n", outputFile)
	}

	// Example output:
	// > Streaming audio from <audioFilePath> to the model
	//
	// Input transcription: Hello.
	// Output transcription: Hi
	// Output transcription:  there. What can I do for you today?
	// Model response saved to model_response.wav
	return nil
}

// loadAudioAsPCMBytes reads a 16-bit WAV file and returns its samples as
// little-endian 16-bit PCM bytes together with a MIME type of the form
// "audio/pcm;rate=<sample rate>".
//
// It returns an error if the file cannot be opened, is not a valid WAV file,
// does not use a 16-bit sample depth, or cannot be decoded.
func loadAudioAsPCMBytes(path string) ([]byte, string, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, "", fmt.Errorf("failed to open WAV file: %w", err)
	}
	defer file.Close()

	wavDecoder := wav.NewDecoder(file)
	if !wavDecoder.IsValidFile() {
		return nil, "", fmt.Errorf("invalid WAV file")
	}

	// Every sample is re-encoded below as a 16-bit value. Any other source
	// depth would be truncated or mis-scaled by that conversion, so reject it
	// instead of streaming corrupted audio.
	if wavDecoder.BitDepth != 16 {
		return nil, "", fmt.Errorf("unsupported WAV bit depth %d: only 16-bit PCM is supported", wavDecoder.BitDepth)
	}

	buf, err := wavDecoder.FullPCMBuffer()
	if err != nil {
		return nil, "", fmt.Errorf("failed to decode WAV: %w", err)
	}

	samples := buf.Data
	data := make([]byte, len(samples)*2) // two bytes per 16-bit sample

	for i, sample := range samples {
		binary.LittleEndian.PutUint16(data[i*2:], uint16(int16(sample)))
	}

	mimeType := fmt.Sprintf("audio/pcm;rate=%d", wavDecoder.SampleRate)
	return data, mimeType, nil
}

// saveAudioFramesAsWAV concatenates frames, interprets the result as
// little-endian 16-bit mono PCM, and writes it to filePath as a WAV file with
// the given sampleRate.
//
// If the combined frames contain an odd number of bytes, the trailing byte
// cannot form a complete sample and is ignored. It returns an error if the
// file cannot be created, written, finalized, or closed.
func saveAudioFramesAsWAV(filePath string, frames [][]byte, sampleRate int) error {
	// Pre-size the combined buffer so the frames are copied exactly once.
	total := 0
	for _, f := range frames {
		total += len(f)
	}
	audioData := make([]byte, 0, total)
	for _, f := range frames {
		audioData = append(audioData, f...)
	}

	// Each sample takes two bytes; integer division drops a dangling odd byte
	// so the decode loop below never reads past the end of audioData.
	numSamples := len(audioData) / 2

	// Create buffer
	intData := audio.IntBuffer{
		Format: &audio.Format{NumChannels: 1, SampleRate: sampleRate},
		Data:   make([]int, numSamples),
	}

	for i := range intData.Data {
		// Convert through int16 so negative samples keep their sign.
		intData.Data[i] = int(int16(binary.LittleEndian.Uint16(audioData[i*2:])))
	}

	file, err := os.Create(filePath)
	if err != nil {
		return fmt.Errorf("failed to create WAV file: %w", err)
	}
	// Covers the early-return paths; after the explicit Close below succeeds
	// this second Close only returns an ignored "already closed" error.
	defer file.Close()

	// 16-bit, 1 channel, audio format 1 (PCM).
	wavEncoder := wav.NewEncoder(file, sampleRate, 16, 1, 1)
	if err := wavEncoder.Write(&intData); err != nil {
		return fmt.Errorf("failed to write audio data: %w", err)
	}

	if err := wavEncoder.Close(); err != nil {
		return fmt.Errorf("failed to finalize WAV file: %w", err)
	}

	// Report a failed close explicitly: on a written file it can mean the
	// data never reached disk.
	if err := file.Close(); err != nil {
		return fmt.Errorf("failed to close WAV file: %w", err)
	}

	return nil
}

// [END googlegenaisdk_live_conversation_audio_with_audio]
58 changes: 58 additions & 0 deletions genai/live/live_examples_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,27 @@ func generateStructuredOutputWithTxtMock(w io.Writer) error {
_, err = fmt.Fprintln(w, string(b))
return err
}
// generateLiveRAGWithTextMock writes a fixed question-and-answer exchange,
// followed by a newline, to w. The memoryCorpus argument is accepted but not
// used. It returns any error reported by the write.
func generateLiveRAGWithTextMock(w io.Writer, memoryCorpus string) error {
	const canned = "> What are the newest Gemini models?\n\nGemini 2.0 Flash and Gemini 2.5 Ultra are among the latest models released by Google."
	if _, writeErr := fmt.Fprintln(w, canned); writeErr != nil {
		return writeErr
	}
	return nil
}

// generateLiveTextWithAudioMock stands in for generateLiveTextWithAudio in
// tests: it writes a canned prompt line naming a fixed audio URL and a canned
// transcript, followed by a newline, to w. It returns any error reported by
// the write.
func generateLiveTextWithAudioMock(w io.Writer) error {
	const sourceURL = "https://storage.googleapis.com/generativeai-downloads/data/16000.wav"

	reply := fmt.Sprintf("> Answer to this audio url: %s\n\nMocked transcript response: Hello from mock!", sourceURL)
	if _, writeErr := fmt.Fprintln(w, reply); writeErr != nil {
		return writeErr
	}
	return nil
}

// generateLiveAudioConversationMock stands in for
// generateLiveAudioConversation in tests: it writes a line naming audioFile
// and a canned response line, followed by a newline, to w. It returns any
// error reported by the write.
func generateLiveAudioConversationMock(w io.Writer, audioFile string) error {
	// Echo the file name back so callers can see which input was "processed".
	report := fmt.Sprintf("> Received audio file: %s\nProcessed mock response: Hello from mock audio!", audioFile)
	if _, writeErr := fmt.Fprintln(w, report); writeErr != nil {
		return writeErr
	}
	return nil
}

func TestLiveGeneration(t *testing.T) {
tc := testutil.SystemTest(t)
Expand Down Expand Up @@ -96,4 +117,41 @@ func TestLiveGeneration(t *testing.T) {
}
})

// Exercises the RAG mock and checks that it produced some output.
t.Run("generate RAG with txt", func(t *testing.T) {
	buf.Reset()

	err := generateLiveRAGWithTextMock(buf, "test")
	if err != nil {
		t.Fatalf("generateLiveRAGWithText failed: %v", err)
	}

	if got := buf.String(); got == "" {
		t.Error("expected non-empty output, got empty")
	}
})

// Exercises the text-with-audio mock and checks that it produced some output.
// The subtest is named after the function it covers so that it does not share
// a name with the RAG subtest in test output and -run filters.
t.Run("generate text with audio", func(t *testing.T) {
	buf.Reset()
	if err := generateLiveTextWithAudioMock(buf); err != nil {
		t.Fatalf("generateLiveTextWithAudio failed: %v", err)
	}

	output := buf.String()
	if output == "" {
		t.Error("expected non-empty output, got empty")
	}
})

// Exercises the audio-conversation mock and checks that it produced some output.
t.Run("generate live audio conversation", func(t *testing.T) {
	buf.Reset()

	if err := generateLiveAudioConversationMock(buf, "sample_audio.wav"); err != nil {
		t.Fatalf("generateLiveAudioConversation failed: %v", err)
	}

	if got := buf.String(); got == "" {
		t.Error("expected non-empty output, got empty")
	}
})

}
Loading