diff --git a/genai/go.mod b/genai/go.mod
index 033bf8cca1..649905ab01 100644
--- a/genai/go.mod
+++ b/genai/go.mod
@@ -4,6 +4,8 @@ go 1.24.0
 
 require (
 	github.com/GoogleCloudPlatform/golang-samples v0.0.0-20250201051611-5fb145d1e974
+	github.com/go-audio/audio v1.0.0
+	github.com/go-audio/wav v1.1.0
 	golang.org/x/oauth2 v0.25.0
 	google.golang.org/genai v1.17.0
 )
@@ -25,6 +27,7 @@ require (
 	github.com/envoyproxy/go-control-plane/envoy v1.32.3 // indirect
 	github.com/envoyproxy/protoc-gen-validate v1.1.0 // indirect
 	github.com/felixge/httpsnoop v1.0.4 // indirect
+	github.com/go-audio/riff v1.0.0 // indirect
 	github.com/go-logr/logr v1.4.2 // indirect
 	github.com/go-logr/stdr v1.2.2 // indirect
 	github.com/google/go-cmp v0.6.0 // indirect
diff --git a/genai/go.sum b/genai/go.sum
index 263bb22be2..b91024f0a0 100644
--- a/genai/go.sum
+++ b/genai/go.sum
@@ -46,6 +46,12 @@ github.com/envoyproxy/protoc-gen-validate v1.1.0 h1:tntQDh69XqOCOZsDz0lVJQez/2L6
 github.com/envoyproxy/protoc-gen-validate v1.1.0/go.mod h1:sXRDRVmzEbkM7CVcM06s9shE/m23dg3wzjl0UWqJ2q4=
 github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
 github.com/felixge/httpsnoop v1.0.4/go.mod h1:m8KPJKqk1gH5J9DgRY2ASl2lWCfGKXixSwevea8zH2U=
+github.com/go-audio/audio v1.0.0 h1:zS9vebldgbQqktK4H0lUqWrG8P0NxCJVqcj7ZpNnwd4=
+github.com/go-audio/audio v1.0.0/go.mod h1:6uAu0+H2lHkwdGsAY+j2wHPNPpPoeg5AaEFh9FlA+Zs=
+github.com/go-audio/riff v1.0.0 h1:d8iCGbDvox9BfLagY94fBynxSPHO80LmZCaOsmKxokA=
+github.com/go-audio/riff v1.0.0/go.mod h1:l3cQwc85y79NQFCRB7TiPoNiaijp6q8Z0Uv38rVG498=
+github.com/go-audio/wav v1.1.0 h1:jQgLtbqBzY7G+BM8fXF7AHUk1uHUviWS4X39d5rsL2g=
+github.com/go-audio/wav v1.1.0/go.mod h1:mpe9qfwbScEbkd8uybLuIpTgHyrISw/OTuvjUW2iGtE=
 github.com/go-logr/logr v1.2.2/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
 github.com/go-logr/logr v1.4.2 h1:6pFjapn8bFcIbiKo3XT4j/BhANplGihG6tvd+8rYgrY=
 github.com/go-logr/logr v1.4.2/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
diff --git a/genai/live/live_conversation_audio_with_audio.go b/genai/live/live_conversation_audio_with_audio.go
new file mode 100644
index 0000000000..84959670ee
--- /dev/null
+++ b/genai/live/live_conversation_audio_with_audio.go
@@ -0,0 +1,190 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package live shows how to use the GenAI SDK to generate text with live resources.
+package live
+
+// [START googlegenaisdk_live_conversation_audio_with_audio]
+import (
+	"bytes"
+	"context"
+	"encoding/binary"
+	"fmt"
+	"io"
+	"os"
+
+	"github.com/go-audio/audio"
+	"github.com/go-audio/wav"
+	"google.golang.org/genai"
+)
+
+// generateLiveAudioConversation demonstrates two-way audio interaction with a Gemini model using live streaming.
+func generateLiveAudioConversation(w io.Writer, audioFilePath string) error {
+	ctx := context.Background()
+
+	client, err := genai.NewClient(ctx, &genai.ClientConfig{
+		HTTPOptions: genai.HTTPOptions{
+			APIVersion: "v1beta1",
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to create genai client: %w", err)
+	}
+
+	modelName := "gemini-live-2.5-flash-preview-native-audio-09-2025"
+
+	// Configure the model to receive and respond with audio, including transcriptions.
+	config := &genai.LiveConnectConfig{
+		ResponseModalities:       []genai.Modality{genai.ModalityAudio},
+		InputAudioTranscription:  &genai.AudioTranscriptionConfig{},
+		OutputAudioTranscription: &genai.AudioTranscriptionConfig{},
+	}
+
+	session, err := client.Live.Connect(ctx, modelName, config)
+	if err != nil {
+		return fmt.Errorf("failed to connect live: %w", err)
+	}
+	defer session.Close()
+
+	// Load the audio file as raw 16-bit PCM bytes.
+	audioBytes, mimeType, err := loadAudioAsPCMBytes(audioFilePath)
+	if err != nil {
+		return fmt.Errorf("failed to load audio: %w", err)
+	}
+
+	fmt.Fprintf(w, "> Streaming audio from %s to the model\n\n", audioFilePath)
+
+	// Send the audio data to the model.
+	err = session.SendRealtimeInput(genai.LiveRealtimeInput{
+		Media: &genai.Blob{
+			Data:     audioBytes,
+			MIMEType: mimeType,
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to send realtime input: %w", err)
+	}
+
+	// Gather the model's audio response frames.
+	var audioFrames [][]byte
+
+	for {
+		chunk, err := session.Receive()
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+			return fmt.Errorf("error receiving response: %w", err)
+		}
+
+		if chunk.ServerContent != nil {
+			if chunk.ServerContent.InputTranscription != nil {
+				fmt.Fprintf(w, "Input transcription: %s\n", chunk.ServerContent.InputTranscription.Text)
+			}
+			if chunk.ServerContent.OutputTranscription != nil {
+				fmt.Fprintf(w, "Output transcription: %s\n", chunk.ServerContent.OutputTranscription.Text)
+			}
+			if chunk.ServerContent.ModelTurn != nil {
+				for _, part := range chunk.ServerContent.ModelTurn.Parts {
+					if part.InlineData != nil && len(part.InlineData.Data) > 0 {
+						audioFrames = append(audioFrames, part.InlineData.Data)
+					}
+				}
+			}
+		}
+	}
+
+	// Save the audio frames to a WAV file, if any were received.
+	if len(audioFrames) > 0 {
+		outputFile := "model_response.wav"
+		err := saveAudioFramesAsWAV(outputFile, audioFrames, 24000)
+		if err != nil {
+			return fmt.Errorf("failed to write WAV: %w", err)
+		}
+		fmt.Fprintf(w, "Model response saved to %s\n", outputFile)
+	}
+
+	// Example output:
+	// > Streaming audio from sample_audio.wav to the model
+	//
+	// Input transcription: Hello.
+	// Output transcription: Hi
+	// Output transcription:  there.
+	// Output transcription:  What can I do for you today?
+	// Model response saved to model_response.wav
+	return nil
+}
+
+// loadAudioAsPCMBytes reads a WAV file and returns its samples as raw 16-bit little-endian PCM bytes, along with a matching MIME type.
+func loadAudioAsPCMBytes(path string) ([]byte, string, error) {
+	file, err := os.Open(path)
+	if err != nil {
+		return nil, "", fmt.Errorf("failed to open WAV file: %w", err)
+	}
+	defer file.Close()
+
+	wavDecoder := wav.NewDecoder(file)
+	if !wavDecoder.IsValidFile() {
+		return nil, "", fmt.Errorf("invalid WAV file")
+	}
+	buf, err := wavDecoder.FullPCMBuffer()
+	if err != nil {
+		return nil, "", fmt.Errorf("failed to decode WAV: %w", err)
+	}
+
+	sampleRate := wavDecoder.SampleRate
+	rawInts := buf.Data
+	data := make([]byte, len(rawInts)*2) // 16-bit PCM
+
+	for i, sample := range rawInts {
+		binary.LittleEndian.PutUint16(data[i*2:], uint16(int16(sample)))
+	}
+
+	mimeType := fmt.Sprintf("audio/pcm;rate=%d", sampleRate)
+	return data, mimeType, nil
+}
+
+// saveAudioFramesAsWAV writes audio frames (raw 16-bit PCM bytes) to a WAV file.
+func saveAudioFramesAsWAV(filePath string, frames [][]byte, sampleRate int) error {
+	audioData := bytes.Join(frames, nil)
+
+	// Convert the little-endian PCM bytes into a mono buffer of 16-bit samples.
+	intData := audio.IntBuffer{
+		Format: &audio.Format{NumChannels: 1, SampleRate: sampleRate},
+		Data:   make([]int, len(audioData)/2),
+	}
+
+	for i := 0; i < len(audioData); i += 2 {
+		intData.Data[i/2] = int(int16(audioData[i]) | int16(audioData[i+1])<<8)
+	}
+
+	file, err := os.Create(filePath)
+	if err != nil {
+		return fmt.Errorf("failed to create WAV file: %w", err)
+	}
+	defer file.Close()
+
+	wavEncoder := wav.NewEncoder(file, sampleRate, 16, 1, 1)
+	if err := wavEncoder.Write(&intData); err != nil {
+		return fmt.Errorf("failed to write audio data: %w", err)
+	}
+
+	if err := wavEncoder.Close(); err != nil {
+		return fmt.Errorf("failed to finalize WAV file: %w", err)
+	}
+
+	return nil
+}
+
+// [END googlegenaisdk_live_conversation_audio_with_audio]
diff --git a/genai/live/live_examples_test.go b/genai/live/live_examples_test.go
index f142744dd4..7ead6453a6 100644
--- a/genai/live/live_examples_test.go
+++ b/genai/live/live_examples_test.go
@@ -49,6 +49,27 @@ func generateStructuredOutputWithTxtMock(w io.Writer) error {
 	_, err = fmt.Fprintln(w, string(b))
 	return err
 }
+func generateLiveRAGWithTextMock(w io.Writer, memoryCorpus string) error {
+	mockOutput := "> What are the newest Gemini models?\n\nGemini 2.0 Flash and Gemini 2.5 Ultra are among the latest models released by Google."
+	_, err := fmt.Fprintln(w, mockOutput)
+	return err
+}
+
+// generateLiveTextWithAudioMock simulates generateLiveTextWithAudio without any API or WebSocket calls.
+func generateLiveTextWithAudioMock(w io.Writer) error {
+	audioURL := "https://storage.googleapis.com/generativeai-downloads/data/16000.wav"
+	mockResponse := fmt.Sprintf("> Answer to this audio url: %s\n\nMocked transcript response: Hello from mock!", audioURL)
+	_, err := fmt.Fprintln(w, mockResponse)
+	return err
+}
+
+// generateLiveAudioConversationMock is a mock version of generateLiveAudioConversation.
+func generateLiveAudioConversationMock(w io.Writer, audioFile string) error {
+	// Simulate the real behavior: echo the audio file name and a canned response.
+	mockOutput := fmt.Sprintf("> Received audio file: %s\nProcessed mock response: Hello from mock audio!", audioFile)
+	_, err := fmt.Fprintln(w, mockOutput)
+	return err
+}
 
 func TestLiveGeneration(t *testing.T) {
 	tc := testutil.SystemTest(t)
@@ -96,4 +117,41 @@ func TestLiveGeneration(t *testing.T) {
 		}
 	})
 
+	t.Run("generate RAG with txt", func(t *testing.T) {
+		buf.Reset()
+		if err := generateLiveRAGWithTextMock(buf, "test"); err != nil {
+			t.Fatalf("generateLiveRAGWithText failed: %v", err)
+		}
+
+		output := buf.String()
+		if output == "" {
+			t.Error("expected non-empty output, got empty")
+		}
+	})
+
+	t.Run("generate text with audio", func(t *testing.T) {
+		buf.Reset()
+		if err := generateLiveTextWithAudioMock(buf); err != nil {
+			t.Fatalf("generateLiveTextWithAudio failed: %v", err)
+		}
+
+		output := buf.String()
+		if output == "" {
+			t.Error("expected non-empty output, got empty")
+		}
+	})
+
+	t.Run("generate live audio conversation", func(t *testing.T) {
+		buf.Reset()
+		err := generateLiveAudioConversationMock(buf, "sample_audio.wav")
+		if err != nil {
+			t.Fatalf("generateLiveAudioConversation failed: %v", err)
+		}
+
+		output := buf.String()
+		if output == "" {
+			t.Error("expected non-empty output, got empty")
+		}
+	})
+
 }
diff --git a/genai/live/live_ground_ragengine_with_txt.go b/genai/live/live_ground_ragengine_with_txt.go
new file mode 100644
index 0000000000..c008d47a24
--- /dev/null
+++ b/genai/live/live_ground_ragengine_with_txt.go
@@ -0,0 +1,122 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package live shows how to use the GenAI SDK to generate text with live resources.
+package live
+
+// [START googlegenaisdk_live_ground_ragengine_with_txt]
+import (
+	"context"
+	"fmt"
+	"io"
+	"strings"
+
+	"google.golang.org/genai"
+)
+
+// generateLiveRAGWithText demonstrates how to use the Live API with a Vertex RAG Store.
+// It sends a question to the model and retrieves grounded answers from the configured memory corpus.
+func generateLiveRAGWithText(w io.Writer, memoryCorpus string) error {
+	ctx := context.Background()
+
+	client, err := genai.NewClient(ctx, &genai.ClientConfig{
+		HTTPOptions: genai.HTTPOptions{APIVersion: "v1"},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to create genai client: %w", err)
+	}
+
+	modelName := "gemini-2.0-flash-live-preview-04-09"
+
+	// Configure Vertex RAG store
+	ragStore := &genai.VertexRAGStore{
+		RAGResources: []*genai.VertexRAGStoreRAGResource{
+			{
+				RAGCorpus: memoryCorpus, // Define the memory corpus where context is stored or retrieved
+			},
+		},
+	}
+
+	config := &genai.LiveConnectConfig{
+		ResponseModalities: []genai.Modality{genai.ModalityText},
+		Tools: []*genai.Tool{
+			{
+				Retrieval: &genai.Retrieval{
+					VertexRAGStore: ragStore,
+				},
+			},
+		},
+	}
+
+	session, err := client.Live.Connect(ctx, modelName, config)
+	if err != nil {
+		return fmt.Errorf("failed to connect live: %w", err)
+	}
+	defer session.Close()
+
+	inputText := "What are the newest Gemini models?"
+	fmt.Fprintf(w, "> %s\n\n", inputText)
+
+	// Send the user message
+	err = session.SendClientContent(genai.LiveClientContentInput{
+		Turns: []*genai.Content{
+			{
+				Role: genai.RoleUser,
+				Parts: []*genai.Part{
+					{Text: inputText},
+				},
+			},
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to send content: %w", err)
+	}
+
+	// Stream the response
+	var response strings.Builder
+	for {
+		chunk, err := session.Receive()
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+			return fmt.Errorf("error receiving response: %w", err)
+		}
+
+		if chunk.ServerContent == nil {
+			continue
+		}
+
+		// If the server provided a model turn, iterate its parts for text.
+		if chunk.ServerContent.ModelTurn != nil {
+			for _, part := range chunk.ServerContent.ModelTurn.Parts {
+				if part == nil {
+					continue
+				}
+				if part.Text != "" {
+					response.WriteString(part.Text)
+				}
+			}
+		}
+	}
+
+	fmt.Fprintln(w, response.String())
+
+	// Example output:
+	// > What are the newest Gemini models?
+	// In December 2023, Google launched Gemini, their most capable and general model...
+	return nil
+}
+
+// [END googlegenaisdk_live_ground_ragengine_with_txt]
diff --git a/genai/live/live_text_with_audio.go b/genai/live/live_text_with_audio.go
new file mode 100644
index 0000000000..abd152762d
--- /dev/null
+++ b/genai/live/live_text_with_audio.go
@@ -0,0 +1,112 @@
+// Copyright 2025 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package live shows how to use the GenAI SDK to generate text with live resources.
+package live
+
+// [START googlegenaisdk_live_txt_with_audio]
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+
+	"google.golang.org/genai"
+)
+
+// generateLiveTextWithAudio demonstrates sending audio to a live session and
+// receiving text output. It sends the audio as a Blob inside a genai.LiveRealtimeInput.
+func generateLiveTextWithAudio(w io.Writer) error {
+	ctx := context.Background()
+
+	client, err := genai.NewClient(ctx, &genai.ClientConfig{
+		HTTPOptions: genai.HTTPOptions{APIVersion: "v1"},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to create genai client: %w", err)
+	}
+
+	modelName := "gemini-2.0-flash-live-preview-04-09"
+
+	config := &genai.LiveConnectConfig{
+		ResponseModalities: []genai.Modality{genai.ModalityText},
+	}
+
+	session, err := client.Live.Connect(ctx, modelName, config)
+	if err != nil {
+		return fmt.Errorf("failed to connect live: %w", err)
+	}
+	defer session.Close()
+
+	audioURL := "https://storage.googleapis.com/generativeai-downloads/data/16000.wav"
+	// Download audio
+	resp, err := http.Get(audioURL)
+	if err != nil {
+		return fmt.Errorf("failed to download audio: %w", err)
+	}
+	defer resp.Body.Close()
+
+	audioBytes, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return fmt.Errorf("failed to read audio: %w", err)
+	}
+
+	fmt.Fprintf(w, "> Answer to this audio url: %s\n\n", audioURL)
+
+	// Send the audio as Blob media input
+	err = session.SendRealtimeInput(genai.LiveRealtimeInput{
+		Media: &genai.Blob{
+			Data:     audioBytes,
+			MIMEType: "audio/pcm;rate=16000",
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("failed to send audio input: %w", err)
+	}
+
+	// Stream the response
+	var response strings.Builder
+	for {
+		chunk, err := session.Receive()
+		if err != nil {
+			if err == io.EOF {
+				break
+			}
+			return fmt.Errorf("error receiving response: %w", err)
+		}
+
+		if chunk.ServerContent == nil {
+			continue
+		}
+
+		// Handle model turn responses
+		if chunk.ServerContent.ModelTurn != nil {
+			for _, part := range chunk.ServerContent.ModelTurn.Parts {
+				if part != nil && part.Text != "" {
+					response.WriteString(part.Text)
+				}
+			}
+		}
+	}
+
+	fmt.Fprintln(w, response.String())
+
+	// Example output:
+	// > Answer to this audio url: https://storage.googleapis.com/generativeai-downloads/data/16000.wav
+	// Yes, I can hear you. How can I help you today?
+	return nil
+}
+
+// [END googlegenaisdk_live_txt_with_audio]