
Commit 8fb7881

com.openai.unity 7.7.6 (#210)
- Added support for streaming text to speech
- Added AudioEndpoint.CreateSpeechStreamAsync(SpeechRequest, Action&lt;AudioClip&gt;, CancellationToken)
- Added support for Audio Transcription and Translation verbose json output
- Added support for timestamp granularities for segments and words
- Marked CreateTranscriptionAsync obsolete
- Added CreateTranscriptionTextAsync
- Added CreateTranscriptionJsonAsync
- Marked CreateTranslationAsync obsolete
- Added CreateTranslationTextAsync
- Added CreateTranslationJsonAsync
- Updated SpeechResponseFormat to include wav and pcm
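The translation half of these changes is not demonstrated in the README diff below. A minimal usage sketch, assuming `translationAudio` is an `AudioClip` and that `AudioTranslationRequest` accepts a `responseFormat` argument like its transcription counterpart:

```csharp
var api = new OpenAIClient();
// Hypothetical request setup; the constructor arguments mirror AudioTranscriptionRequest.
using var request = new AudioTranslationRequest(translationAudio, responseFormat: AudioResponseFormat.Verbose_Json);
var response = await api.AudioEndpoint.CreateTranslationJsonAsync(request);
Debug.Log($"{response.Language} ({response.Duration}s): {response.Text}");
```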
1 parent bc257af commit 8fb7881

Showing 17 changed files with 595 additions and 63 deletions.

Documentation~/README.md

Lines changed: 27 additions & 1 deletion
@@ -103,6 +103,7 @@ The recommended installation method is through the unity package manager and [Ope
   - [Json Mode](#chat-json-mode)
 - [Audio](#audio)
   - [Create Speech](#create-speech)
+  - [Stream Speech](#stream-speech) :new:
   - [Create Transcription](#create-transcription)
   - [Create Translation](#create-translation)
 - [Images](#images)
@@ -1047,6 +1048,18 @@ var api = new OpenAIClient();
 var request = new SpeechRequest("Hello world!");
 var (path, clip) = await api.AudioEndpoint.CreateSpeechAsync(request);
 audioSource.PlayOneShot(clip);
+Debug.Log(path);
+```
+
+##### [Stream Speech]
+
+Generate streamed audio from the input text.
+
+```csharp
+var api = new OpenAIClient();
+var request = new SpeechRequest("Hello world!");
+var (path, clip) = await api.AudioEndpoint.CreateSpeechStreamAsync(request, partialClip => audioSource.PlayOneShot(partialClip));
+Debug.Log(path);
 ```
 
 #### [Create Transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
@@ -1060,6 +1073,19 @@ var result = await api.AudioEndpoint.CreateTranscriptionAsync(request);
 Debug.Log(result);
 ```
 
+You can also request detailed information with `verbose_json`, including timestamp granularities:
+
+```csharp
+var api = new OpenAIClient();
+using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Word, temperature: 0.1f, language: "en");
+var response = await api.AudioEndpoint.CreateTranscriptionJsonAsync(request);
+
+foreach (var word in response.Words)
+{
+    Debug.Log($"[{word.Start}-{word.End}] \"{word.Word}\"");
+}
+```
+
 #### [Create Translation](https://platform.openai.com/docs/api-reference/audio/createTranslation)
 
 Translates audio into English.
@@ -1187,7 +1213,7 @@ Returns information about a specific file.
 
 ```csharp
 var api = new OpenAIClient();
-var file = await GetFileInfoAsync(fileId);
+var file = await api.FilesEndpoint.GetFileInfoAsync(fileId);
 Debug.Log($"{file.Id} -> {file.Object}: {file.FileName} | {file.Size} bytes");
 ```
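Note that the Stream Speech example above fires `PlayOneShot` for each partial clip as it arrives, so chunks can overlap when a new chunk lands before the previous one finishes. A hypothetical helper (not part of this package) that queues partial clips and plays them back-to-back:

```csharp
using System.Collections;
using System.Collections.Generic;
using UnityEngine;

// Hypothetical helper, not part of this commit: queues partial clips from
// CreateSpeechStreamAsync and plays them sequentially instead of overlapping.
public class StreamedSpeechPlayer : MonoBehaviour
{
    [SerializeField] private AudioSource audioSource;

    private readonly Queue<AudioClip> clipQueue = new Queue<AudioClip>();

    // Pass this method as the partialClipCallback argument.
    public void Enqueue(AudioClip partialClip) => clipQueue.Enqueue(partialClip);

    private IEnumerator Start()
    {
        while (true)
        {
            // Start the next queued chunk only once the current one has finished.
            if (!audioSource.isPlaying && clipQueue.Count > 0)
            {
                audioSource.clip = clipQueue.Dequeue();
                audioSource.Play();
            }

            yield return null;
        }
    }
}
```

With this helper, `player.Enqueue` replaces the `partialClip => audioSource.PlayOneShot(partialClip)` lambda in the README example.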

Runtime/Audio/AudioEndpoint.cs

Lines changed: 130 additions & 30 deletions
@@ -7,7 +7,6 @@
 using System.Threading.Tasks;
 using UnityEngine;
 using UnityEngine.Networking;
-using UnityEngine.Scripting;
 using Utilities.WebRequestRest;
 
 namespace OpenAI.Audio
@@ -18,21 +17,6 @@ namespace OpenAI.Audio
     /// </summary>
     public sealed class AudioEndpoint : OpenAIBaseEndpoint
     {
-        [Preserve]
-        private class AudioResponse
-        {
-            [Preserve]
-            [JsonConstructor]
-            public AudioResponse([JsonProperty("text")] string text)
-            {
-                Text = text;
-            }
-
-            [Preserve]
-            [JsonProperty("text")]
-            public string Text { get; }
-        }
-
         internal AudioEndpoint(OpenAIClient client) : base(client) { }
 
         /// <inheritdoc />
@@ -48,25 +32,78 @@ internal AudioEndpoint(OpenAIClient client) : base(client) { }
         /// <returns><see cref="AudioClip"/> and the cached path.</returns>
         [Function("Generates audio from the input text.")]
         public async Task<Tuple<string, AudioClip>> CreateSpeechAsync(SpeechRequest request, CancellationToken cancellationToken = default)
+            => await CreateSpeechStreamAsync(request, null, cancellationToken);
+
+        /// <summary>
+        /// Generates streaming audio from the input text.
+        /// </summary>
+        /// <param name="request"><see cref="SpeechRequest"/>.</param>
+        /// <param name="partialClipCallback">Optional, partial <see cref="AudioClip"/> callback used to stream audio.</param>
+        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
+        /// <returns><see cref="AudioClip"/> and the cached path.</returns>
+        [Function("Generates streaming audio from the input text.")]
+        public async Task<Tuple<string, AudioClip>> CreateSpeechStreamAsync(SpeechRequest request, Action<AudioClip> partialClipCallback, CancellationToken cancellationToken = default)
         {
-            var audioFormat = request.ResponseFormat switch
+            if (partialClipCallback != null && request.ResponseFormat != SpeechResponseFormat.PCM)
             {
-                SpeechResponseFormat.MP3 => AudioType.MPEG,
-                _ => throw new NotSupportedException(request.ResponseFormat.ToString())
-            };
+                Debug.LogWarning("Speech streaming only supported with PCM response format. Overriding to PCM...");
+                request.ResponseFormat = SpeechResponseFormat.PCM;
+            }
+
             var ext = request.ResponseFormat switch
             {
                 SpeechResponseFormat.MP3 => "mp3",
+                SpeechResponseFormat.WAV => "wav",
+                SpeechResponseFormat.PCM => "pcm",
                 _ => throw new NotSupportedException(request.ResponseFormat.ToString())
             };
             var payload = JsonConvert.SerializeObject(request, OpenAIClient.JsonSerializationOptions);
             string clipName;
 
             lock (mutex)
             {
-                clipName = $"{request.Voice}-{DateTime.UtcNow:yyyyMMddThhmmssff}.{ext}";
+                clipName = $"{request.Voice}-{DateTime.UtcNow:yyyyMMddThhmmssfffff}.{ext}";
             }
 
+            Rest.TryGetDownloadCacheItem(clipName, out var cachedPath);
+
+            if (request.ResponseFormat == SpeechResponseFormat.PCM)
+            {
+                var part = 0;
+                var response = await Rest.PostAsync(
+                    GetUrl("/speech"),
+                    payload,
+                    StreamCallback,
+                    eventChunkSize: 8192,
+                    new RestParameters(client.DefaultRequestHeaders),
+                    cancellationToken);
+                response.Validate(EnableDebug);
+                var samples = Utilities.Audio.PCMEncoder.Decode(response.Data);
+                await File.WriteAllBytesAsync(cachedPath, response.Data, cancellationToken).ConfigureAwait(true);
+                return new Tuple<string, AudioClip>(cachedPath, AudioClip.Create(clipName, samples.Length, 1, 24000, false));
+
+                void StreamCallback(Response partialResponse)
+                {
+                    var chunk = Utilities.Audio.PCMEncoder.Decode(partialResponse.Data);
+                    var partialClip = AudioClip.Create($"{clipName}_{++part}", chunk.Length, 1, 24000, false);
+
+                    if (!partialClip.SetData(chunk, 0))
+                    {
+                        Debug.LogError("Failed to set pcm data to partial clip.");
+                        return;
+                    }
+
+                    partialClipCallback?.Invoke(partialClip);
+                }
+            }
+
+            var audioFormat = request.ResponseFormat switch
+            {
+                SpeechResponseFormat.MP3 => AudioType.MPEG,
+                SpeechResponseFormat.WAV => AudioType.WAV,
+                _ => throw new NotSupportedException(request.ResponseFormat.ToString())
+            };
+
             var clip = await Rest.DownloadAudioClipAsync(
                 GetUrl("/speech"),
                 audioFormat,
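The PCM branch above decodes the response bytes into float samples for a 24 kHz mono clip. For reference, a standalone sketch of the 16-bit conversion that `Utilities.Audio.PCMEncoder.Decode` is assumed to perform here (the library's actual implementation may differ):

```csharp
// Sketch only: converts raw 16-bit little-endian PCM bytes into normalized
// float samples in [-1, 1], the layout AudioClip.SetData expects.
public static float[] DecodePcm16(byte[] pcmData)
{
    var samples = new float[pcmData.Length / 2];

    for (var i = 0; i < samples.Length; i++)
    {
        // Combine two bytes into a signed 16-bit sample, then scale.
        var sample = (short)(pcmData[i * 2] | (pcmData[i * 2 + 1] << 8));
        samples[i] = sample / 32768f;
    }

    return samples;
}
```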
@@ -75,17 +112,46 @@ public async Task<Tuple<string, AudioClip>> CreateSpeechAsync(SpeechRequest requ
                 payload,
                 parameters: new RestParameters(client.DefaultRequestHeaders, debug: EnableDebug),
                 cancellationToken: cancellationToken);
-            Rest.TryGetDownloadCacheItem(clipName, out var cachedPath);
             return new Tuple<string, AudioClip>(cachedPath, clip);
         }
 
+        [Obsolete("Use CreateTranscriptionTextAsync or CreateTranscriptionJsonAsync instead.")]
+        public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+            => await CreateTranscriptionTextAsync(request, cancellationToken);
+
         /// <summary>
         /// Transcribes audio into the input language.
         /// </summary>
         /// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
         /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
         /// <returns>The transcribed text.</returns>
-        public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        public async Task<string> CreateTranscriptionTextAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        {
+            var response = await Internal_CreateTranscriptionAsync(request, cancellationToken);
+            return request.ResponseFormat == AudioResponseFormat.Json
+                ? JsonConvert.DeserializeObject<AudioResponse>(response)?.Text
+                : response;
+        }
+
+        /// <summary>
+        /// Transcribes audio into the input language.
+        /// </summary>
+        /// <remarks>This method expects the request format to be either <see cref="AudioResponseFormat.Json"/> or <see cref="AudioResponseFormat.Verbose_Json"/>.</remarks>
+        /// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
+        /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
+        /// <returns><see cref="AudioResponse"/>.</returns>
+        public async Task<AudioResponse> CreateTranscriptionJsonAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
+        {
+            if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
+            {
+                throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat));
+            }
+
+            var response = await Internal_CreateTranscriptionAsync(request, cancellationToken);
+            return JsonConvert.DeserializeObject<AudioResponse>(response);
+        }
+
+        private async Task<string> Internal_CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
         {
             var form = new WWWForm();
             using var audioData = new MemoryStream();
@@ -111,22 +177,58 @@ public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest req
                 form.AddField("language", request.Language);
             }
 
+            switch (request.TimestampGranularities)
+            {
+                case TimestampGranularity.Segment:
+                case TimestampGranularity.Word:
+                    form.AddField("timestamp_granularities[]", request.TimestampGranularities.ToString().ToLower());
+                    break;
+            }
+
             request.Dispose();
 
             var response = await Rest.PostAsync(GetUrl("/transcriptions"), form, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
             response.Validate(EnableDebug);
-            return responseFormat == AudioResponseFormat.Json
-                ? JsonConvert.DeserializeObject<AudioResponse>(response.Body)?.Text
-                : response.Body;
+            return response.Body;
         }
 
+        [Obsolete("Use CreateTranslationTextAsync or CreateTranslationJsonAsync instead.")]
+        public async Task<string> CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+            => await CreateTranslationTextAsync(request, cancellationToken);
+
         /// <summary>
         /// Translates audio into English.
         /// </summary>
         /// <param name="request"></param>
         /// <param name="cancellationToken"></param>
         /// <returns>The translated text.</returns>
-        public async Task<string> CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        public async Task<string> CreateTranslationTextAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        {
+            var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken);
+            return request.ResponseFormat == AudioResponseFormat.Json
+                ? JsonConvert.DeserializeObject<AudioResponse>(responseAsString)?.Text
+                : responseAsString;
+        }
+
+        /// <summary>
+        /// Translates audio into English.
+        /// </summary>
+        /// <param name="request"></param>
+        /// <param name="cancellationToken"></param>
+        /// <returns></returns>
+        /// <exception cref="ArgumentException"></exception>
+        public async Task<AudioResponse> CreateTranslationJsonAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
+        {
+            if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
+            {
+                throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request.ResponseFormat));
+            }
+
+            var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken);
+            return JsonConvert.DeserializeObject<AudioResponse>(responseAsString);
+        }
+
+        private async Task<string> Internal_CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken)
         {
             var form = new WWWForm();
             using var audioData = new MemoryStream();
@@ -151,9 +253,7 @@ public async Task<string> CreateTranslationAsync(AudioTranslationRequest request
 
             var response = await Rest.PostAsync(GetUrl("/translations"), form, new RestParameters(client.DefaultRequestHeaders), cancellationToken);
             response.Validate(EnableDebug);
-            return responseFormat == AudioResponseFormat.Json
-                ? JsonConvert.DeserializeObject<AudioResponse>(response.Body)?.Text
-                : response.Body;
+            return response.Body;
         }
     }
 }
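With the `timestamp_granularities[]` field added above, segment-level detail can be requested the same way as the word-level example in the README. A sketch, assuming `TranscriptionSegment` exposes `Start`, `End`, and `Text` properties analogous to `TranscriptionWord`:

```csharp
var api = new OpenAIClient();
// Segment granularity returns timestamped chunks of the transcript instead of per-word timings.
using var request = new AudioTranscriptionRequest(transcriptionAudio, responseFormat: AudioResponseFormat.Verbose_Json, timestampGranularity: TimestampGranularity.Segment);
var response = await api.AudioEndpoint.CreateTranscriptionJsonAsync(request);

foreach (var segment in response.Segments)
{
    // Start/End/Text property names are assumed, mirroring TranscriptionWord.
    Debug.Log($"[{segment.Start}-{segment.End}] \"{segment.Text}\"");
}
```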

Runtime/Audio/AudioResponse.cs

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+// Licensed under the MIT License. See LICENSE in the project root for license information.
+
+using Newtonsoft.Json;
+using UnityEngine.Scripting;
+
+namespace OpenAI.Audio
+{
+    [Preserve]
+    public class AudioResponse
+    {
+        [Preserve]
+        [JsonConstructor]
+        public AudioResponse(
+            [JsonProperty("language")] string language,
+            [JsonProperty("duration")] double? duration,
+            [JsonProperty("text")] string text,
+            [JsonProperty("words")] TranscriptionWord[] words,
+            [JsonProperty("segments")] TranscriptionSegment[] segments)
+        {
+            Language = language;
+            Duration = duration;
+            Text = text;
+            Words = words;
+            Segments = segments;
+        }
+
+        /// <summary>
+        /// The language of the input audio.
+        /// </summary>
+        [Preserve]
+        [JsonProperty("language")]
+        public string Language { get; }
+
+        /// <summary>
+        /// The duration of the input audio.
+        /// </summary>
+        [Preserve]
+        [JsonProperty("duration")]
+        public double? Duration { get; }
+
+        /// <summary>
+        /// The transcribed text.
+        /// </summary>
+        [Preserve]
+        [JsonProperty("text")]
+        public string Text { get; }
+
+        /// <summary>
+        /// Extracted words and their corresponding timestamps.
+        /// </summary>
+        [Preserve]
+        [JsonProperty("words")]
+        public TranscriptionWord[] Words { get; }
+
+        /// <summary>
+        /// Segments of the transcribed text and their corresponding details.
+        /// </summary>
+        [Preserve]
+        [JsonProperty("segments")]
+        public TranscriptionSegment[] Segments { get; }
+    }
+}
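To illustrate how this type maps onto the API's `verbose_json` payload, a small round-trip sketch (the payload is a hand-written sample, and the `TranscriptionWord` property names are assumed to mirror the API's `word`/`start`/`end` fields):

```csharp
// Hand-written sample in the verbose_json shape; illustrative, not a captured API response.
var payload = @"{
  ""language"": ""english"",
  ""duration"": 1.1,
  ""text"": ""Hello world!"",
  ""words"": [ { ""word"": ""Hello"", ""start"": 0.0, ""end"": 0.4 } ],
  ""segments"": []
}";
var response = JsonConvert.DeserializeObject<AudioResponse>(payload);
Debug.Log($"{response.Language} ({response.Duration}s): {response.Text}");
```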

Runtime/Audio/AudioResponse.cs.meta

Lines changed: 11 additions & 0 deletions
