Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions ElevenLabs-DotNet-Tests/TestFixture_04_TextToSpeechEndpoint.cs
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,38 @@ public async Task Test_03_TextToSpeech_Transcription()
}

[Test]
public async Task Test_05_LanguageEnforced_TextToSpeech()
public async Task Test_04_StreamTextToSpeech_Transcription()
{
    Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
    var voice = Voices.Voice.Adam;
    Assert.NotNull(voice);
    // Fall back to the account's default voice settings when the voice has none cached.
    voice.Settings ??= await ElevenLabsClient.VoicesEndpoint.GetDefaultVoiceSettingsAsync();
    var partialClips = new Queue<VoiceClip>();
    var characters = new Queue<TimestampedTranscriptCharacter>();
    Console.WriteLine("| Character | Start Time | End Time |");
    Console.WriteLine("| --------- | ---------- | -------- |");
    // Request PCM output with timestamps so each streamed chunk carries character-level timing info.
    var request = new TextToSpeechRequest(voice, "The quick brown fox jumps over the lazy dog.", outputFormat: OutputFormat.PCM_24000, withTimestamps: true);
    var voiceClip = await ElevenLabsClient.TextToSpeechEndpoint.TextToSpeechAsync(request, async partialClip =>
    {
        Assert.IsNotNull(partialClip);
        partialClips.Enqueue(partialClip);

        foreach (var character in partialClip.TimestampedTranscriptCharacters)
        {
            characters.Enqueue(character);
            Console.WriteLine($"| {character.Character} | {character.StartTime} | {character.EndTime} |");
        }

        // Satisfies the async callback signature; the work above is synchronous.
        await Task.CompletedTask;
    });
    Assert.NotNull(partialClips);
    Assert.IsNotEmpty(partialClips);
    Assert.NotNull(voiceClip);
    Console.WriteLine(voiceClip.Id);
    // The characters accumulated from the streamed partial clips must match the final clip's transcript.
    Assert.AreEqual(characters.ToArray(), voiceClip.TimestampedTranscriptCharacters);
}

[Test]
public async Task Test_05_01_LanguageEnforced_TextToSpeech()
{
Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
var voice = Voices.Voice.Adam;
Expand Down Expand Up @@ -93,7 +124,7 @@ public async Task Test_05_LanguageEnforced_TextToSpeech()
}

[Test]
public async Task Test_TurboV2_5_LanguageEnforced_TextToSpeech()
public async Task Test_05_02_TurboV2_5_LanguageEnforced_TextToSpeech()
{
Assert.NotNull(ElevenLabsClient.TextToSpeechEndpoint);
var voice = Voices.Voice.Adam;
Expand Down
9 changes: 8 additions & 1 deletion ElevenLabs-DotNet/ElevenLabs-DotNet.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,15 @@ All copyrights, trademarks, logos, and assets are the property of their respecti
<SignAssembly>false</SignAssembly>
<IncludeSymbols>true</IncludeSymbols>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<Version>3.4.2</Version>
<Version>3.5.0</Version>
<PackageReleaseNotes>
Version 3.5.0
- Added TextToSpeechRequest.ctor overload
- Added seed property
- Added applyTextNormalization property
- Removed deprecated optimizeStreamingLatency property
- Added VoiceSettings.ctor overload
- Added speed property
Version 3.4.2
- Added flash models
- Added stream input support to dubbing endpoint
Expand Down
2 changes: 2 additions & 0 deletions ElevenLabs-DotNet/TextToSpeech/TextToSpeechEndpoint.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,12 @@ public async Task<VoiceClip> TextToSpeechAsync(TextToSpeechRequest request, Func
{ OutputFormatParameter, request.OutputFormat.ToString().ToLower() }
};

#pragma warning disable CS0618 // Type or member is obsolete
if (request.OptimizeStreamingLatency.HasValue)
{
parameters.Add(OptimizeStreamingLatencyParameter, request.OptimizeStreamingLatency.Value.ToString());
}
#pragma warning restore CS0618 // Type or member is obsolete

var endpoint = $"/{request.Voice.Id}";

Expand Down
104 changes: 83 additions & 21 deletions ElevenLabs-DotNet/TextToSpeech/TextToSpeechRequest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,32 @@ namespace ElevenLabs.TextToSpeech
{
public sealed class TextToSpeechRequest
{
[Obsolete]
[Obsolete("use new .ctr overload")]
public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings)
: this(null, text, voiceSettings: voiceSettings, model: model)
{
}

[Obsolete("use new .ctr overload")]
public TextToSpeechRequest(
Voice voice,
string text,
Encoding encoding,
VoiceSettings voiceSettings,
OutputFormat outputFormat,
int? optimizeStreamingLatency,
Model model = null,
string previousText = null,
string nextText = null,
string[] previousRequestIds = null,
string[] nextRequestIds = null,
string languageCode = null,
bool withTimestamps = false)
: this(voice, text, encoding, voiceSettings, outputFormat, model, previousText, nextText, previousRequestIds, nextRequestIds, languageCode, withTimestamps)
{
OptimizeStreamingLatency = optimizeStreamingLatency;
}

/// <summary>
/// Constructor.
/// </summary>
Expand All @@ -36,40 +56,70 @@ public TextToSpeechRequest(string text, Model model, VoiceSettings voiceSettings
/// Output format of the generated audio.<br/>
/// Defaults to <see cref="OutputFormat.MP3_44100_128"/>
/// </param>
/// <param name="optimizeStreamingLatency">
/// Optional, You can turn on latency optimizations at some cost of quality.
/// The best possible final latency varies by model.<br/>
/// Possible values:<br/>
/// 0 - default mode (no latency optimizations)<br/>
/// 1 - normal latency optimizations (about 50% of possible latency improvement of option 3)<br/>
/// 2 - strong latency optimizations (about 75% of possible latency improvement of option 3)<br/>
/// 3 - max latency optimizations<br/>
/// 4 - max latency optimizations, but also with text normalizer turned off for even more latency savings
/// (best latency, but can mispronounce e.g. numbers and dates).
/// <param name="previousText">
/// The text that came before the text of the current request.
/// Can be used to improve the speech's continuity when concatenating together multiple generations or
/// to influence the speech's continuity in the current generation.
/// </param>
/// <param name="nextText">
/// The text that comes after the text of the current request.
/// Can be used to improve the speech's continuity when concatenating together multiple generations or
/// to influence the speech's continuity in the current generation.
/// </param>
/// <param name="previousRequestIds">
/// A list of request_id of the samples that were generated before this generation.
/// Can be used to improve the speech's continuity when splitting up a large task into multiple requests.
/// The results will be best when the same model is used across the generations. In case both previous_text and previous_request_ids are sent,
/// previous_text will be ignored. A maximum of 3 request_ids can be sent.
/// </param>
/// <param name="nextRequestIds">
/// A list of request_id of the samples that come after this generation.
/// next_request_ids is especially useful for maintaining the speech's continuity when regenerating a sample that has had some audio quality issues.
/// For example, if you have generated 3 speech clips, and you want to improve clip 2,
/// passing the request id of clip 3 as a next_request_id (and that of clip 1 as a previous_request_id)
/// will help maintain natural flow in the combined speech.
/// The results will be best when the same model is used across the generations.
/// In case both next_text and next_request_ids are sent, next_text will be ignored.
/// A maximum of 3 request_ids can be sent.
/// </param>
/// <param name="previousText"></param>
/// <param name="nextText"></param>
/// <param name="previousRequestIds"></param>
/// <param name="nextRequestIds"></param>
/// <param name="languageCode">
/// Optional, Language code (ISO 639-1) used to enforce a language for the model. Currently only <see cref="Model.TurboV2_5"/> supports language enforcement.
/// For other models, an error will be returned if language code is provided.
/// </param>
/// <param name="withTimestamps"></param>
/// <param name="cacheFormat">
/// The audio format to save the audio in.
/// Defaults to <see cref="CacheFormat.Wav"/>
/// </param>
/// <param name="withTimestamps">
/// Generate speech from text with precise character-level timing information for audio-text synchronization.
/// </param>
/// <param name="seed">
/// If specified, our system will make a best effort to sample deterministically,
/// such that repeated requests with the same seed and parameters should return the same result.
/// Determinism is not guaranteed. Must be integer between 0 and 4294967295.
/// </param>
/// <param name="applyTextNormalization">
/// This parameter controls text normalization with three modes: "auto" (null), "on" (true), and "off" (false).
/// When set to "null", the system will automatically decide whether to apply text normalization (e.g., spelling out numbers).
/// With "true", text normalization will always be applied,
/// while with "false", it will be skipped.
/// Cannot be turned on for "eleven_turbo_v2_5" model.
/// </param>
public TextToSpeechRequest(
Voice voice,
string text,
Encoding encoding = null,
VoiceSettings voiceSettings = null,
OutputFormat outputFormat = OutputFormat.MP3_44100_128,
int? optimizeStreamingLatency = null,
Model model = null,
string previousText = null,
string nextText = null,
string[] previousRequestIds = null,
string[] nextRequestIds = null,
string languageCode = null,
bool withTimestamps = false)
bool withTimestamps = false,
int? seed = null,
bool? applyTextNormalization = null)
{
if (string.IsNullOrWhiteSpace(text))
{
Expand All @@ -89,10 +139,9 @@ public TextToSpeechRequest(

Text = text;
Model = model ?? Models.Model.FlashV2;
Voice = voice;
Voice = string.IsNullOrWhiteSpace(voice) ? Voice.Adam : voice;
VoiceSettings = voiceSettings ?? voice.Settings;
OutputFormat = outputFormat;
OptimizeStreamingLatency = optimizeStreamingLatency;
PreviousText = previousText;
NextText = nextText;
if (previousRequestIds?.Length > 3)
Expand All @@ -107,6 +156,12 @@ public TextToSpeechRequest(
NextRequestIds = nextRequestIds;
LanguageCode = languageCode;
WithTimestamps = withTimestamps;
Seed = seed;

if (applyTextNormalization.HasValue)
{
ApplyTextNormalization = applyTextNormalization.Value ? "on" : "off";
}
}

[JsonPropertyName("text")]
Expand All @@ -129,6 +184,7 @@ public TextToSpeechRequest(
public OutputFormat OutputFormat { get; }

[JsonIgnore]
[Obsolete("Deprecated")]
public int? OptimizeStreamingLatency { get; }

[JsonPropertyName("next_text")]
Expand All @@ -146,11 +202,17 @@ public TextToSpeechRequest(
[JsonPropertyName("next_request_ids")]
public string[] NextRequestIds { get; }


[JsonPropertyName("language_code")]
public string LanguageCode { get; }

[JsonIgnore]
public bool WithTimestamps { get; }

[JsonPropertyName("seed")]
public int? Seed { get; }

[JsonPropertyName("apply_text_normalization")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingDefault)]
public string ApplyTextNormalization { get; }
}
}
18 changes: 17 additions & 1 deletion ElevenLabs-DotNet/Voices/VoiceSettings.cs
Original file line number Diff line number Diff line change
@@ -1,22 +1,35 @@
// Licensed under the MIT License. See LICENSE in the project root for license information.

using System;
using System.Text.Json.Serialization;

namespace ElevenLabs.Voices
{
public sealed class VoiceSettings
{
[Obsolete("use new .ctr overload")]
public VoiceSettings(
float stability,
float similarityBoost,
bool speakerBoost,
float style)
: this(stability, similarityBoost, style, speakerBoost)
{
}

[JsonConstructor]
public VoiceSettings(
float stability = .75f,
float similarityBoost = .75f,
float style = 0.45f,
bool speakerBoost = true,
float style = 0.45f)
float speed = 1f)
{
Stability = stability;
SimilarityBoost = similarityBoost;
Style = style;
SpeakerBoost = speakerBoost;
Speed = speed;
}

[JsonPropertyName("stability")]
Expand All @@ -30,5 +43,8 @@ public VoiceSettings(

[JsonPropertyName("use_speaker_boost")]
public bool SpeakerBoost { get; set; }

[JsonPropertyName("speed")]
public float Speed { get; set; }
}
}