diff --git a/firebaseai/src/LiveGenerationConfig.cs b/firebaseai/src/LiveGenerationConfig.cs
index f4418f6d..115557ad 100644
--- a/firebaseai/src/LiveGenerationConfig.cs
+++ b/firebaseai/src/LiveGenerationConfig.cs
@@ -20,23 +20,54 @@
 
 namespace Firebase.AI {
 
+///
+/// A struct used to configure speech generation settings.
+///
+public readonly struct SpeechConfig {
+  internal readonly string voice;
+
+  private SpeechConfig(string voice) {
+    this.voice = voice;
+  }
+
+  ///
+  /// See https://cloud.google.com/text-to-speech/docs/chirp3-hd for the list of available voices.
+  ///
+  ///
+  ///
+  public static SpeechConfig UsePrebuiltVoice(string voice) {
+    return new SpeechConfig(voice);
+  }
+
+  ///
+  /// Intended for internal use only.
+  /// This method is used for serializing the object to JSON for the API request.
+  ///
+  internal Dictionary ToJson() {
+    Dictionary dict = new();
+
+    if (!string.IsNullOrWhiteSpace(voice)) {
+      dict["voiceConfig"] = new Dictionary() {
+        { "prebuiltVoiceConfig" , new Dictionary() {
+          { "voiceName", voice }
+        } }
+      };
+    }
+
+    return dict;
+  }
+}
+
 ///
-/// A struct used to configure speech generation settings.
+/// A struct used to configure speech transcription settings.
 ///
-public readonly struct SpeechConfig {
-  internal readonly string voice;
-
-  private SpeechConfig(string voice) {
-    this.voice = voice;
-  }
-
+public readonly struct AudioTranscriptionConfig {
   ///
-  /// See https://cloud.google.com/text-to-speech/docs/chirp3-hd for the list of available voices.
+  /// Creates a new transcription configuration.
   ///
-  ///
-  ///
-  public static SpeechConfig UsePrebuiltVoice(string voice) {
-    return new SpeechConfig(voice);
+  /// A new transcription configuration.
+  public static AudioTranscriptionConfig GetInstance() {
+    return new AudioTranscriptionConfig();
   }
 
   ///
@@ -45,15 +76,6 @@ public static SpeechConfig UsePrebuiltVoice(string voice) {
   ///
   internal Dictionary ToJson() {
     Dictionary dict = new();
-
-    if (!string.IsNullOrWhiteSpace(voice)) {
-      dict["voiceConfig"] = new Dictionary() {
-        { "prebuiltVoiceConfig" , new Dictionary() {
-          { "voiceName", voice }
-        } }
-      };
-    }
-
     return dict;
   }
 }
@@ -62,6 +84,8 @@ internal Dictionary ToJson() {
 /// A struct defining model parameters to be used when generating live session content.
 ///
 public readonly struct LiveGenerationConfig {
+  private readonly AudioTranscriptionConfig? _inputAudioTranscription;
+  private readonly AudioTranscriptionConfig? _outputAudioTranscription;
   private readonly SpeechConfig? _speechConfig;
   private readonly List _responseModalities;
   private readonly float? _temperature;
@@ -81,6 +105,10 @@ public readonly struct LiveGenerationConfig {
   /// for more details.
   ///
   ///
+  /// The transcription configuration to use if transcribing audio input.
+  ///
+  /// The transcription configuration to use if transcribing audio output.
+  ///
   /// The speech configuration to use if generating audio output.
   ///
   /// A list of response types to receive from the model.
@@ -155,6 +183,8 @@ public readonly struct LiveGenerationConfig {
   /// [Cloud documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference#generationconfig)
   /// for more details.
   public LiveGenerationConfig(
+    AudioTranscriptionConfig? inputAudioTranscription = null,
+    AudioTranscriptionConfig? outputAudioTranscription = null,
     SpeechConfig? speechConfig = null,
     IEnumerable responseModalities = null,
     float? temperature = null,
@@ -163,6 +193,8 @@ public LiveGenerationConfig(
     int? maxOutputTokens = null,
     float? presencePenalty = null,
     float? frequencyPenalty = null) {
+    _inputAudioTranscription = inputAudioTranscription;
+    _outputAudioTranscription = outputAudioTranscription;
     _speechConfig = speechConfig;
     _responseModalities = responseModalities != null ?
         new List(responseModalities) : new List();
@@ -178,8 +210,10 @@ public LiveGenerationConfig(
   /// Intended for internal use only.
   /// This method is used for serializing the object to JSON for the API request.
   ///
   internal Dictionary ToJson() {
     Dictionary jsonDict = new();
+    if (_inputAudioTranscription.HasValue) jsonDict["inputAudioTranscription"] = _inputAudioTranscription?.ToJson();
+    if (_outputAudioTranscription.HasValue) jsonDict["outputAudioTranscription"] = _outputAudioTranscription?.ToJson();
     if (_speechConfig.HasValue) jsonDict["speechConfig"] = _speechConfig?.ToJson();
     if (_responseModalities != null && _responseModalities.Any()) {
       jsonDict["responseModalities"] =
diff --git a/firebaseai/src/LiveSessionResponse.cs b/firebaseai/src/LiveSessionResponse.cs
index 5751698e..a8523efa 100644
--- a/firebaseai/src/LiveSessionResponse.cs
+++ b/firebaseai/src/LiveSessionResponse.cs
@@ -50,6 +50,18 @@ public string Text {
     }
   }
 
+  ///
+  /// The response's content that was a transcription, if it exists.
+  ///
+  public string Transcription {
+    get {
+      if (Message is Transcription transcription) {
+        return transcription.Text;
+      }
+      return null;
+    }
+  }
+
   ///
   /// The response's content that was audio, if it exists.
   ///
@@ -132,6 +144,35 @@ private LiveSessionResponse(ILiveSessionMessage liveSessionMessage) {
 ///
 public interface ILiveSessionMessage { }
 
+///
+/// A transcription of the audio sent in a live session.
+///
+public readonly struct Transcription {
+  ///
+  /// The transcribed text.
+  ///
+  public readonly string Text { get; }
+  ///
+  /// Whether this is the end of the transcription.
+  ///
+  public readonly bool Finished { get; }
+
+  private Transcription(string text, bool finished) {
+    Text = text;
+    Finished = finished;
+  }
+
+  ///
+  /// Intended for internal use only.
+  /// This method is used for deserializing JSON responses and should not be called directly.
+  ///
+  internal static Transcription FromJson(Dictionary jsonDict) {
+    return new Transcription(
+      jsonDict.ParseValue("text"), jsonDict.ParseValue("finished")
+    );
+  }
+}
+
 ///
 /// Content generated by the model in a live session.
 ///
@@ -153,10 +194,23 @@ public interface ILiveSessionMessage { }
   ///
   public readonly bool Interrupted { get; }
 
-  private LiveSessionContent(ModelContent? content, bool turnComplete, bool interrupted) {
+  /// <summary>
+  /// The transcription parsed from this message's `inputTranscription` field, if present.
+  /// </summary>
+  public readonly Transcription? InputTranscription { get; }
+
+  /// <summary>
+  /// The transcription parsed from this message's `outputTranscription` field, if present.
+  /// </summary>
+  public readonly Transcription? OutputTranscription { get; }
+
+  private LiveSessionContent(ModelContent? content, bool turnComplete, bool interrupted,
+      Transcription? inputTranscription, Transcription? outputTranscription) {
     Content = content;
     TurnComplete = turnComplete;
     Interrupted = interrupted;
+    InputTranscription = inputTranscription;
+    OutputTranscription = outputTranscription;
   }
 
   ///
@@ -167,7 +221,9 @@ internal static LiveSessionContent FromJson(Dictionary jsonDict)
     return new LiveSessionContent(
       jsonDict.ParseNullableObject("modelTurn", ModelContent.FromJson),
       jsonDict.ParseValue("turnComplete"),
-      jsonDict.ParseValue("interrupted")
+      jsonDict.ParseValue("interrupted"),
+      jsonDict.ParseNullableObject("inputTranscription", Transcription.FromJson),
+      jsonDict.ParseNullableObject("outputTranscription", Transcription.FromJson)
     );
   }
 }