diff --git a/firebaseai/src/LiveGenerationConfig.cs b/firebaseai/src/LiveGenerationConfig.cs
index f4418f6d..115557ad 100644
--- a/firebaseai/src/LiveGenerationConfig.cs
+++ b/firebaseai/src/LiveGenerationConfig.cs
@@ -20,23 +20,63 @@
namespace Firebase.AI {
+/// <summary>
+/// A struct used to configure speech generation settings.
+/// </summary>
+public readonly struct SpeechConfig {
+  // Name of the prebuilt voice; a null or blank name is not serialized.
+  internal readonly string voice;
+
+  private SpeechConfig(string voice) {
+    this.voice = voice;
+  }
+
+  /// <summary>
+  /// Creates a speech configuration that uses one of the prebuilt voices.
+  /// See https://cloud.google.com/text-to-speech/docs/chirp3-hd for the list of available voices.
+  /// </summary>
+  /// <param name="voice">The name of the prebuilt voice to use.</param>
+  /// <returns>A speech configuration that requests the given voice.</returns>
+  public static SpeechConfig UsePrebuiltVoice(string voice) {
+    return new SpeechConfig(voice);
+  }
+
+  /// <summary>
+  /// Intended for internal use only.
+  /// This method is used for serializing the object to JSON for the API request.
+  /// </summary>
+  /// <returns>The JSON dictionary for this configuration; it is empty when the
+  /// voice name is null, empty, or whitespace.</returns>
+  internal Dictionary<string, object> ToJson() {
+    Dictionary<string, object> dict = new();
+
+    // A blank voice name is omitted instead of being sent as an empty value.
+    if (!string.IsNullOrWhiteSpace(voice)) {
+      dict["voiceConfig"] = new Dictionary<string, object>() {
+        { "prebuiltVoiceConfig" , new Dictionary<string, object>() {
+          { "voiceName", voice }
+        } }
+      };
+    }
+
+    return dict;
+  }
+}
+
///
-/// A struct used to configure speech generation settings.
+/// A struct used to configure speech transcription settings.
///
-public readonly struct SpeechConfig {
- internal readonly string voice;
+public readonly struct AudioTranscriptionConfig {
- private SpeechConfig(string voice) {
- this.voice = voice;
- }
+  // No explicit constructor: a struct cannot declare a private parameterless
+  // constructor, so instances are created through the implicit default one.
///
- /// See https://cloud.google.com/text-to-speech/docs/chirp3-hd for the list of available voices.
+ /// Creates a new transcription configuration.
///
- ///
- ///
- public static SpeechConfig UsePrebuiltVoice(string voice) {
- return new SpeechConfig(voice);
+ /// A new transcription configuration.
+ public static AudioTranscriptionConfig GetInstance() {
+ return new AudioTranscriptionConfig();
}
///
@@ -45,15 +85,6 @@ public static SpeechConfig UsePrebuiltVoice(string voice) {
///
internal Dictionary ToJson() {
Dictionary dict = new();
-
- if (!string.IsNullOrWhiteSpace(voice)) {
- dict["voiceConfig"] = new Dictionary() {
- { "prebuiltVoiceConfig" , new Dictionary() {
- { "voiceName", voice }
- } }
- };
- }
-
return dict;
}
}
@@ -62,6 +93,8 @@ internal Dictionary ToJson() {
/// A struct defining model parameters to be used when generating live session content.
///
public readonly struct LiveGenerationConfig {
+ private readonly AudioTranscriptionConfig? _inputAudioTranscription;
+ private readonly AudioTranscriptionConfig? _outputAudioTranscription;
private readonly SpeechConfig? _speechConfig;
private readonly List _responseModalities;
private readonly float? _temperature;
@@ -81,6 +114,10 @@ public readonly struct LiveGenerationConfig {
/// for more details.
///
///
+ /// The transcription configuration to use if transcribing audio input.
+ ///
+ /// The transcription configuration to use if transcribing audio output.
+ ///
/// The speech configuration to use if generating audio output.
///
/// A list of response types to receive from the model.
@@ -163,6 +200,10 @@ public LiveGenerationConfig(
int? maxOutputTokens = null,
float? presencePenalty = null,
- float? frequencyPenalty = null) {
+      float? frequencyPenalty = null,  // new parameters follow so positional callers still compile
+      AudioTranscriptionConfig? inputAudioTranscription = null,
+      AudioTranscriptionConfig? outputAudioTranscription = null) {
+    _inputAudioTranscription = inputAudioTranscription;
+    _outputAudioTranscription = outputAudioTranscription;
_speechConfig = speechConfig;
_responseModalities = responseModalities != null ?
new List(responseModalities) : new List();
@@ -178,8 +219,11 @@ public LiveGenerationConfig(
/// Intended for internal use only.
/// This method is used for serializing the object to JSON for the API request.
///
- internal Dictionary ToJson() {
+ internal Dictionary ToJson()
+ {
Dictionary jsonDict = new();
+ if (_inputAudioTranscription.HasValue) jsonDict["inputAudioTranscription"] = _inputAudioTranscription?.ToJson();
+ if (_outputAudioTranscription.HasValue) jsonDict["outputAudioTranscription"] = _outputAudioTranscription?.ToJson();
if (_speechConfig.HasValue) jsonDict["speechConfig"] = _speechConfig?.ToJson();
if (_responseModalities != null && _responseModalities.Any()) {
jsonDict["responseModalities"] =
diff --git a/firebaseai/src/LiveSessionResponse.cs b/firebaseai/src/LiveSessionResponse.cs
index 5751698e..a8523efa 100644
--- a/firebaseai/src/LiveSessionResponse.cs
+++ b/firebaseai/src/LiveSessionResponse.cs
@@ -50,6 +50,18 @@ public string Text {
}
}
+  /// <summary>
+  /// The transcription carried by the response's content (output first, then input), if any.
+  /// </summary>
+  public string Transcription {
+    get {
+      // Transcription is not an ILiveSessionMessage; assumes LiveSessionContent is - TODO confirm.
+      if (!(Message is LiveSessionContent content)) return null;
+      Transcription? output = content.OutputTranscription, input = content.InputTranscription;
+      return output?.Text ?? input?.Text;
+    }
+  }
+
///
/// The response's content that was audio, if it exists.
///
@@ -132,6 +144,40 @@ private LiveSessionResponse(ILiveSessionMessage liveSessionMessage) {
///
public interface ILiveSessionMessage { }
+/// <summary>
+/// A transcription of the audio sent in a live session.
+/// </summary>
+public readonly struct Transcription {
+  /// <summary>
+  /// The transcribed text.
+  /// </summary>
+  public readonly string Text { get; }
+  /// <summary>
+  /// Whether this is the end of the transcription.
+  /// </summary>
+  public readonly bool Finished { get; }
+  /// <summary>
+  /// Creates a transcription from its already-parsed fields; only FromJson uses it.
+  /// </summary>
+  /// <param name="text">The transcribed text.</param>
+  /// <param name="finished">Whether this is the end of the transcription.</param>
+  private Transcription(string text, bool finished) {
+    Text = text;
+    Finished = finished;
+  }
+
+  /// <summary>
+  /// Intended for internal use only.
+  /// This method is used for deserializing JSON responses and should not be called directly.
+  /// </summary>
+  /// <param name="jsonDict">The JSON object to read "text" and "finished" from.</param>
+  /// <returns>The transcription described by the JSON object.</returns>
+  internal static Transcription FromJson(Dictionary<string, object> jsonDict) {
+    return new Transcription(
+      jsonDict.ParseValue<string>("text"), jsonDict.ParseValue<bool>("finished")
+    );
+  }
+}
///
/// Content generated by the model in a live session.
///
@@ -153,10 +199,17 @@ public interface ILiveSessionMessage { }
///
public readonly bool Interrupted { get; }
- private LiveSessionContent(ModelContent? content, bool turnComplete, bool interrupted) {
+  /// <summary>The transcription of the audio input, if the response carried one.</summary>
+  public readonly Transcription? InputTranscription { get; }
+  /// <summary>The transcription of the audio output, if the response carried one.</summary>
+  public readonly Transcription? OutputTranscription { get; }
+  // Both transcriptions are nullable because a response may omit either of them.
+  private LiveSessionContent(ModelContent? content, bool turnComplete, bool interrupted,
+      Transcription? inputTranscription, Transcription? outputTranscription) {
Content = content;
TurnComplete = turnComplete;
Interrupted = interrupted;
+    InputTranscription = inputTranscription;
+    OutputTranscription = outputTranscription;
}
///
@@ -167,7 +220,9 @@ internal static LiveSessionContent FromJson(Dictionary jsonDict)
return new LiveSessionContent(
jsonDict.ParseNullableObject("modelTurn", ModelContent.FromJson),
jsonDict.ParseValue("turnComplete"),
- jsonDict.ParseValue("interrupted")
+ jsonDict.ParseValue("interrupted"),
+      jsonDict.ParseNullableObject("inputTranscription", Transcription.FromJson),  // optional object,
+      jsonDict.ParseNullableObject("outputTranscription", Transcription.FromJson)  // parsed like modelTurn
);
}
}