From 1e2c44a0ea9ceb1a68f7d1a3096872afbab1acab Mon Sep 17 00:00:00 2001 From: cfloress Date: Mon, 17 Nov 2025 12:24:48 -0300 Subject: [PATCH 1/2] genai: added live generation samples --- genai/go.mod | 3 + genai/go.sum | 6 + .../live_conversation_audio_with_audio.go | 192 ++++++++++++++++++ genai/live/live_examples_test.go | 58 ++++++ genai/live/live_ground_ragengine_with_txt.go | 121 +++++++++++ genai/live/live_text_with_audio.go | 111 ++++++++++ 6 files changed, 491 insertions(+) create mode 100644 genai/live/live_conversation_audio_with_audio.go create mode 100644 genai/live/live_ground_ragengine_with_txt.go create mode 100644 genai/live/live_text_with_audio.go diff --git a/genai/go.mod b/genai/go.mod index 033bf8cca1..649905ab01 100644 --- a/genai/go.mod +++ b/genai/go.mod @@ -4,6 +4,8 @@ go 1.24.0 require ( github.com/GoogleCloudPlatform/golang-samples v0.0.0-20250201051611-5fb145d1e974 + github.com/go-audio/audio v1.0.0 + github.com/go-audio/wav v1.1.0 golang.org/x/oauth2 v0.25.0 google.golang.org/genai v1.17.0 ) @@ -25,6 +27,7 @@ require ( github.com/envoyproxy/go-control-plane/envoy v1.32.3 // indirect github.com/envoyproxy/protoc-gen-validate v1.1.0 // indirect github.com/felixge/httpsnoop v1.0.4 // indirect + github.com/go-audio/riff v1.0.0 // indirect github.com/go-logr/logr v1.4.2 // indirect github.com/go-logr/stdr v1.2.2 // indirect github.com/google/go-cmp v0.6.0 // indirect diff --git a/genai/go.sum b/genai/go.sum index 263bb22be2..b91024f0a0 100644 --- a/genai/go.sum +++ b/genai/go.sum @@ -46,6 +46,12 @@ github.com/envoyproxy/protoc-gen-validate v1.1.0 h1:tntQDh69XqOCOZsDz0lVJQez/2L6 github.com/envoyproxy/protoc-gen-validate v1.1.0/go.mod h1:sXRDRVmzEbkM7CVcM06s9shE/m23dg3wzjl0UWqJ2q4= github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg= github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U= +github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4= 
+github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs= +github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA= +github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498= +github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g= +github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE= github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY= github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= diff --git a/genai/live/live_conversation_audio_with_audio.go b/genai/live/live_conversation_audio_with_audio.go new file mode 100644 index 0000000000..1309100c43 --- /dev/null +++ b/genai/live/live_conversation_audio_with_audio.go @@ -0,0 +1,192 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package live shows how to use the GenAI SDK to generate text with live resources. +package live + +// [START googlegenaisdk_live_conversation_audio_with_audio] +import ( + "context" + "encoding/binary" + "fmt" + "io" + "os" + + "github.com/go-audio/audio" + "github.com/go-audio/wav" + "google.golang.org/genai" +) + +// generateLiveAudioConversation demonstrates two-way audio interaction with a Gemini model using live streaming. 
+func generateLiveAudioConversation(w io.Writer, audioFilePath string) error { + ctx := context.Background() + + client, err := genai.NewClient(ctx, &genai.ClientConfig{ + HTTPOptions: genai.HTTPOptions{ + APIVersion: "v1beta1", + }, + }) + if err != nil { + return fmt.Errorf("failed to create genai client: %w", err) + } + + modelName := "gemini-live-2.5-flash-preview-native-audio-09-2025" + + // Configure model to receive and respond with audio, including transcriptions. + config := &genai.LiveConnectConfig{ + ResponseModalities: []genai.Modality{genai.ModalityAudio}, + InputAudioTranscription: &genai.AudioTranscriptionConfig{}, + OutputAudioTranscription: &genai.AudioTranscriptionConfig{}, + } + + session, err := client.Live.Connect(ctx, modelName, config) + if err != nil { + return fmt.Errorf("failed to connect live: %w", err) + } + defer session.Close() + + // Load the audio file + audioBytes, mimeType, err := loadAudioAsPCMBytes(audioFilePath) + if err != nil { + return fmt.Errorf("failed to load audio: %w", err) + } + + fmt.Fprintf(w, "> Streaming audio from %s to the model\n\n", audioFilePath) + + // Send audio data to the model + err = session.SendRealtimeInput(genai.LiveSendRealtimeInputParameters{ + Media: &genai.Blob{ + Data: audioBytes, + MIMEType: mimeType, + }, + }) + if err != nil { + return fmt.Errorf("failed to send realtime input: %w", err) + } + + // Gather audio response frames + var audioFrames [][]byte + + for { + chunk, err := session.Receive() + if err != nil { + if err == io.EOF { + break + } + return fmt.Errorf("error receiving response: %w", err) + } + + if chunk.ServerContent != nil { + if chunk.ServerContent.InputTranscription != nil { + fmt.Fprintf(w, "Input transcription: %s\n", chunk.ServerContent.InputTranscription.Text) + } + if chunk.ServerContent.OutputTranscription != nil { + fmt.Fprintf(w, "Output transcription: %s\n", chunk.ServerContent.OutputTranscription.Text) + } + if chunk.ServerContent.ModelTurn != nil { + for _, part := 
range chunk.ServerContent.ModelTurn.Parts { + if part.InlineData != nil && len(part.InlineData.Data) > 0 { + audioFrames = append(audioFrames, part.InlineData.Data) + } + } + } + } + } + + // Save audio frames to WAV file if available + if len(audioFrames) > 0 { + outputFile := "model_response.wav" + err := saveAudioFramesAsWAV(outputFile, audioFrames, 24000) + if err != nil { + return fmt.Errorf("failed to write WAV: %w", err) + } + fmt.Fprintf(w, "Model response saved to %s\n", outputFile) + } + + // Example output: + // gemini-2.0-flash-live-preview-04-09 + // {'input_transcription': {'text': 'Hello.'}} + // {'output_transcription': {}} + // {'output_transcription': {'text': 'Hi'}} + // {'output_transcription': {'text': ' there. What can I do for you today?'}} + // {'output_transcription': {'finished': True}} + // Model response saved to example_model_response.wav + return nil +} + +// loadAudioAsPCMBytes reads a WAV file and returns PCM bytes with a MIME type. +func loadAudioAsPCMBytes(path string) ([]byte, string, error) { + file, err := os.Open(path) + if err != nil { + return nil, "", fmt.Errorf("failed to open WAV file: %w", err) + } + defer file.Close() + + wavDecoder := wav.NewDecoder(file) + if !wavDecoder.IsValidFile() { + return nil, "", fmt.Errorf("invalid WAV file") + } + buf, err := wavDecoder.FullPCMBuffer() + if err != nil { + return nil, "", fmt.Errorf("failed to decode WAV: %w", err) + } + + sampleRate := wavDecoder.SampleRate + rawInts := buf.Data + data := make([]byte, len(rawInts)*2) // 16-bit PCM + + for i, sample := range rawInts { + binary.LittleEndian.PutUint16(data[i*2:], uint16(int16(sample))) + } + + mimeType := fmt.Sprintf("audio/pcm;rate=%d", sampleRate) + return data, mimeType, nil +} + +// saveAudioFramesAsWAV writes audio frames (PCM bytes) to a WAV file. +func saveAudioFramesAsWAV(filePath string, frames [][]byte, sampleRate int) error { + audioData := []byte{} + for _, f := range frames { + audioData = append(audioData, f...) 
+ } + + // Create buffer + intData := audio.IntBuffer{ + Format: &audio.Format{NumChannels: 1, SampleRate: sampleRate}, + Data: make([]int, len(audioData)/2), + } + + for i := 0; i < len(audioData); i += 2 { + intData.Data[i/2] = int(int16(audioData[i]) | int16(audioData[i+1])<<8) + } + + file, err := os.Create(filePath) + if err != nil { + return fmt.Errorf("failed to create WAV file: %w", err) + } + defer file.Close() + + wavEncoder := wav.NewEncoder(file, sampleRate, 16, 1, 1) + if err := wavEncoder.Write(&intData); err != nil { + return fmt.Errorf("failed to write audio data: %w", err) + } + + if err := wavEncoder.Close(); err != nil { + return fmt.Errorf("failed to finalize WAV file: %w", err) + } + + return nil +} + +// [END googlegenaisdk_live_conversation_audio_with_audio] diff --git a/genai/live/live_examples_test.go b/genai/live/live_examples_test.go index f142744dd4..2a48eaf387 100644 --- a/genai/live/live_examples_test.go +++ b/genai/live/live_examples_test.go @@ -49,6 +49,27 @@ func generateStructuredOutputWithTxtMock(w io.Writer) error { _, err = fmt.Fprintln(w, string(b)) return err } +func generateLiveRAGWithTextMock(w io.Writer, memoryCorpus string) error { + mockOutput := "> What are the newest Gemini models?\n\nGemini 2.0 Flash and Gemini 2.5 Ultra are among the latest models released by Google." + _, err := fmt.Fprintln(w, mockOutput) + return err +} + +// Mock function simulating generateLiveTextWithAudio without API/WebSocket. 
// generateLiveTextWithAudioMock writes a canned prompt line and transcript for
// the sample audio URL to w. It performs no network or Live API calls and
// returns whatever error the write reports.
func generateLiveTextWithAudioMock(w io.Writer) error {
	const sampleURL = "https://storage.googleapis.com/generativeai-downloads/data/16000.wav"

	canned := fmt.Sprintf("> Answer to this audio url: %s\n\nMocked transcript response: Hello from mock!", sampleURL)
	if _, writeErr := fmt.Fprintln(w, canned); writeErr != nil {
		return writeErr
	}
	return nil
}

// generateLiveAudioConversationMock writes a canned two-line reply to w that
// echoes the audioFile name. It never opens the file or contacts the model and
// returns whatever error the write reports.
func generateLiveAudioConversationMock(w io.Writer, audioFile string) error {
	canned := fmt.Sprintf("> Received audio file: %s\nProcessed mock response: Hello from mock audio!", audioFile)
	if _, writeErr := fmt.Fprintln(w, canned); writeErr != nil {
		return writeErr
	}
	return nil
}
+1,121 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package live shows how to use the GenAI SDK to generate text with live resources. +package live + +// [START googlegenaisdk_live_ground_ragengine_with_txt] +import ( + "context" + "fmt" + "io" + + "google.golang.org/genai" +) + +// generateLiveRAGWithText demonstrates how to use the Live API with a Vertex RAG Store. +// It sends a question to the model and retrieves grounded answers from the configured memory corpus. 
+func generateLiveRAGWithText(w io.Writer, memoryCorpus string) error { + ctx := context.Background() + + client, err := genai.NewClient(ctx, &genai.ClientConfig{ + HTTPOptions: genai.HTTPOptions{APIVersion: "v1"}, + }) + if err != nil { + return fmt.Errorf("failed to create genai client: %w", err) + } + + modelName := "gemini-2.0-flash-live-preview-04-09" + + // Configure Vertex RAG store + ragStore := &genai.VertexRAGStore{ + RAGResources: []*genai.VertexRAGStoreRAGResource{ + { + RAGCorpus: memoryCorpus, // Define the memory corpus where context is stored or retrieved + }, + }, + } + + config := &genai.LiveConnectConfig{ + ResponseModalities: []genai.Modality{genai.ModalityText}, + Tools: []*genai.Tool{ + { + Retrieval: &genai.Retrieval{ + VertexRAGStore: ragStore, + }, + }, + }, + } + + session, err := client.Live.Connect(ctx, modelName, config) + if err != nil { + return fmt.Errorf("failed to connect live: %w", err) + } + defer session.Close() + + inputText := "What are the newest Gemini models?" + fmt.Fprintf(w, "> %s\n\n", inputText) + + // Send the user message + err = session.SendClientContent(genai.LiveClientContentInput{ + Turns: []*genai.Content{ + { + Role: genai.RoleUser, + Parts: []*genai.Part{ + {Text: inputText}, + }, + }, + }, + }) + if err != nil { + return fmt.Errorf("failed to send content: %w", err) + } + + // Stream the response + var response string + for { + chunk, err := session.Receive() + if err != nil { + if err == io.EOF { + break + } + return fmt.Errorf("error receiving response: %w", err) + } + + if chunk.ServerContent == nil { + continue + } + + // If the server provided a model turn, iterate its parts for text. + if chunk.ServerContent.ModelTurn != nil { + for _, part := range chunk.ServerContent.ModelTurn.Parts { + if part == nil { + continue + } + if part.Text != "" { + response += part.Text + } + } + } + } + + fmt.Fprintln(w, response) + + // Example output: + // > What are the newest Gemini models? 
+ // In December 2023, Google launched Gemini, their most capable and general model... + return nil +} + +// [END googlegenaisdk_live_ground_ragengine_with_txt] diff --git a/genai/live/live_text_with_audio.go b/genai/live/live_text_with_audio.go new file mode 100644 index 0000000000..85659cb593 --- /dev/null +++ b/genai/live/live_text_with_audio.go @@ -0,0 +1,111 @@ +// Copyright 2025 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package live shows how to use the GenAI SDK to generate text with live resources. +package live + +// [START googlegenaisdk_live_txt_with_audio] +import ( + "context" + "fmt" + "io" + "net/http" + + "google.golang.org/genai" +) + +// generateLiveTextWithAudio demonstrates sending audio to a live session and +// receiving text output. It sends the audio as a Blob inside a genai.LiveRealtimeInput. 
+func generateLiveTextWithAudio(w io.Writer) error { + ctx := context.Background() + + client, err := genai.NewClient(ctx, &genai.ClientConfig{ + HTTPOptions: genai.HTTPOptions{APIVersion: "v1"}, + }) + if err != nil { + return fmt.Errorf("failed to create genai client: %w", err) + } + + modelName := "gemini-2.0-flash-live-preview-04-09" + + config := &genai.LiveConnectConfig{ + ResponseModalities: []genai.Modality{genai.ModalityText}, + } + + session, err := client.Live.Connect(ctx, modelName, config) + if err != nil { + return fmt.Errorf("failed to connect live: %w", err) + } + defer session.Close() + + audioURL := "https://storage.googleapis.com/generativeai-downloads/data/16000.wav" + // Download audio + resp, err := http.Get(audioURL) + if err != nil { + return fmt.Errorf("failed to download audio: %w", err) + } + defer resp.Body.Close() + + audioBytes, err := io.ReadAll(resp.Body) + if err != nil { + return fmt.Errorf("failed to read audio: %w", err) + } + + fmt.Fprintf(w, "> Answer to this audio url: %s\n\n", audioURL) + + // Send the audio as Blob media input + err = session.SendRealtimeInput(genai.LiveRealtimeInput{ + Media: &genai.Blob{ + Data: audioBytes, + MIMEType: "audio/pcm;rate=16000", + }, + }) + if err != nil { + return fmt.Errorf("failed to send audio input: %w", err) + } + + // Stream the response + var response string + for { + chunk, err := session.Receive() + if err != nil { + if err == io.EOF { + break + } + return fmt.Errorf("error receiving response: %w", err) + } + + if chunk.ServerContent == nil { + continue + } + + // Handle model turn responses + if chunk.ServerContent.ModelTurn != nil { + for _, part := range chunk.ServerContent.ModelTurn.Parts { + if part != nil && part.Text != "" { + response += part.Text + } + } + } + } + + fmt.Fprintln(w, response) + + // Example output: + // > Answer to this audio url: https://storage.googleapis.com/generativeai-downloads/data/16000.wav + // Yes, I can hear you. How can I help you today? 
+ return nil +} + +// [END googlegenaisdk_live_txt_with_audio] From 677dd9c889daceab2cfa79bed02da2b46a16b85c Mon Sep 17 00:00:00 2001 From: cfloress Date: Mon, 17 Nov 2025 12:42:46 -0300 Subject: [PATCH 2/2] genai: PR comments --- genai/live/live_conversation_audio_with_audio.go | 8 +++----- genai/live/live_examples_test.go | 2 +- genai/live/live_ground_ragengine_with_txt.go | 7 ++++--- genai/live/live_text_with_audio.go | 7 ++++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/genai/live/live_conversation_audio_with_audio.go b/genai/live/live_conversation_audio_with_audio.go index 1309100c43..84959670ee 100644 --- a/genai/live/live_conversation_audio_with_audio.go +++ b/genai/live/live_conversation_audio_with_audio.go @@ -17,6 +17,7 @@ package live // [START googlegenaisdk_live_conversation_audio_with_audio] import ( + "bytes" "context" "encoding/binary" "fmt" @@ -65,7 +66,7 @@ func generateLiveAudioConversation(w io.Writer, audioFilePath string) error { fmt.Fprintf(w, "> Streaming audio from %s to the model\n\n", audioFilePath) // Send audio data to the model - err = session.SendRealtimeInput(genai.LiveSendRealtimeInputParameters{ + err = session.SendRealtimeInput(genai.LiveRealtimeInput{ Media: &genai.Blob{ Data: audioBytes, MIMEType: mimeType, @@ -156,10 +157,7 @@ func loadAudioAsPCMBytes(path string) ([]byte, string, error) { // saveAudioFramesAsWAV writes audio frames (PCM bytes) to a WAV file. func saveAudioFramesAsWAV(filePath string, frames [][]byte, sampleRate int) error { - audioData := []byte{} - for _, f := range frames { - audioData = append(audioData, f...) 
- } + audioData := bytes.Join(frames, nil) // Create buffer intData := audio.IntBuffer{ diff --git a/genai/live/live_examples_test.go b/genai/live/live_examples_test.go index 2a48eaf387..7ead6453a6 100644 --- a/genai/live/live_examples_test.go +++ b/genai/live/live_examples_test.go @@ -129,7 +129,7 @@ func TestLiveGeneration(t *testing.T) { } }) - t.Run("generate RAG with txt", func(t *testing.T) { + t.Run("generate text with audio", func(t *testing.T) { buf.Reset() if err := generateLiveTextWithAudioMock(buf); err != nil { t.Fatalf("generateLiveTextWithAudio failed: %v", err) diff --git a/genai/live/live_ground_ragengine_with_txt.go b/genai/live/live_ground_ragengine_with_txt.go index 41cd2168be..c008d47a24 100644 --- a/genai/live/live_ground_ragengine_with_txt.go +++ b/genai/live/live_ground_ragengine_with_txt.go @@ -20,6 +20,7 @@ import ( "context" "fmt" "io" + "strings" "google.golang.org/genai" ) @@ -83,7 +84,7 @@ func generateLiveRAGWithText(w io.Writer, memoryCorpus string) error { } // Stream the response - var response string + var response strings.Builder for { chunk, err := session.Receive() if err != nil { @@ -104,13 +105,13 @@ func generateLiveRAGWithText(w io.Writer, memoryCorpus string) error { continue } if part.Text != "" { - response += part.Text + response.WriteString(part.Text) } } } } - fmt.Fprintln(w, response) + fmt.Fprintln(w, response.String()) // Example output: // > What are the newest Gemini models? 
diff --git a/genai/live/live_text_with_audio.go b/genai/live/live_text_with_audio.go index 85659cb593..abd152762d 100644 --- a/genai/live/live_text_with_audio.go +++ b/genai/live/live_text_with_audio.go @@ -21,6 +21,7 @@ import ( "fmt" "io" "net/http" + "strings" "google.golang.org/genai" ) @@ -76,7 +77,7 @@ func generateLiveTextWithAudio(w io.Writer) error { } // Stream the response - var response string + var response strings.Builder for { chunk, err := session.Receive() if err != nil { @@ -94,13 +95,13 @@ func generateLiveTextWithAudio(w io.Writer) error { if chunk.ServerContent.ModelTurn != nil { for _, part := range chunk.ServerContent.ModelTurn.Parts { if part != nil && part.Text != "" { - response += part.Text + response.WriteString(part.Text) } } } } - fmt.Fprintln(w, response) + fmt.Fprintln(w, response.String()) // Example output: // > Answer to this audio url: https://storage.googleapis.com/generativeai-downloads/data/16000.wav