77using System . Threading . Tasks ;
88using UnityEngine ;
99using UnityEngine . Networking ;
10- using UnityEngine . Scripting ;
1110using Utilities . WebRequestRest ;
1211
1312namespace OpenAI . Audio
@@ -18,21 +17,6 @@ namespace OpenAI.Audio
1817 /// </summary>
1918 public sealed class AudioEndpoint : OpenAIBaseEndpoint
2019 {
21- [ Preserve ]
22- private class AudioResponse
23- {
24- [ Preserve ]
25- [ JsonConstructor ]
26- public AudioResponse ( [ JsonProperty ( "text" ) ] string text )
27- {
28- Text = text ;
29- }
30-
31- [ Preserve ]
32- [ JsonProperty ( "text" ) ]
33- public string Text { get ; }
34- }
35-
/// <summary>
/// Creates the audio endpoint for the supplied <see cref="OpenAIClient"/>.
/// </summary>
/// <param name="client">Client instance forwarded to the <see cref="OpenAIBaseEndpoint"/> constructor.</param>
internal AudioEndpoint(OpenAIClient client)
    : base(client)
{
}
3721
3822 /// <inheritdoc />
@@ -48,25 +32,78 @@ internal AudioEndpoint(OpenAIClient client) : base(client) { }
4832 /// <returns><see cref="AudioClip"/> and the cached path.</returns>
[Function("Generates audio from the input text.")]
public async Task<Tuple<string, AudioClip>> CreateSpeechAsync(SpeechRequest request, CancellationToken cancellationToken = default)
{
    // Non-streaming variant: delegate to the streaming overload without a partial clip callback.
    return await CreateSpeechStreamAsync(request, null, cancellationToken);
}
36+
37+ /// <summary>
38+ /// Generates streaming audio from the input text.
39+ /// </summary>
40+ /// <param name="request"><see cref="SpeechRequest"/>.</param>
41+ /// <param name="partialClipCallback">Optional, partial <see cref="AudioClip"/> callback used to stream audio.</param>
42+ /// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
43+ /// <returns><see cref="AudioClip"/> and the cached path.</returns>
44+ [ Function ( "Generates streaming audio from the input text." ) ]
45+ public async Task < Tuple < string , AudioClip > > CreateSpeechStreamAsync ( SpeechRequest request , Action < AudioClip > partialClipCallback , CancellationToken cancellationToken = default )
5146 {
52- var audioFormat = request . ResponseFormat switch
47+ if ( partialClipCallback != null && request . ResponseFormat != SpeechResponseFormat . PCM )
5348 {
54- SpeechResponseFormat . MP3 => AudioType . MPEG ,
55- _ => throw new NotSupportedException ( request . ResponseFormat . ToString ( ) )
56- } ;
49+ Debug . LogWarning ( "Speech streaming only supported with PCM response format. Overriding to PCM..." ) ;
50+ request . ResponseFormat = SpeechResponseFormat . PCM ;
51+ }
52+
5753 var ext = request . ResponseFormat switch
5854 {
5955 SpeechResponseFormat . MP3 => "mp3" ,
56+ SpeechResponseFormat . WAV => "wav" ,
57+ SpeechResponseFormat . PCM => "pcm" ,
6058 _ => throw new NotSupportedException ( request . ResponseFormat . ToString ( ) )
6159 } ;
6260 var payload = JsonConvert . SerializeObject ( request , OpenAIClient . JsonSerializationOptions ) ;
6361 string clipName ;
6462
6563 lock ( mutex )
6664 {
67- clipName = $ "{ request . Voice } -{ DateTime . UtcNow : yyyyMMddThhmmssff } .{ ext } ";
65+ clipName = $ "{ request . Voice } -{ DateTime . UtcNow : yyyyMMddThhmmssfffff } .{ ext } ";
6866 }
6967
68+ Rest . TryGetDownloadCacheItem ( clipName , out var cachedPath ) ;
69+
70+ if ( request . ResponseFormat == SpeechResponseFormat . PCM )
71+ {
72+ var part = 0 ;
73+ var response = await Rest . PostAsync (
74+ GetUrl ( "/speech" ) ,
75+ payload ,
76+ StreamCallback ,
77+ eventChunkSize : 8192 ,
78+ new RestParameters ( client . DefaultRequestHeaders ) ,
79+ cancellationToken ) ;
80+ response . Validate ( EnableDebug ) ;
81+ var samples = Utilities . Audio . PCMEncoder . Decode ( response . Data ) ;
82+ await File . WriteAllBytesAsync ( cachedPath , response . Data , cancellationToken ) . ConfigureAwait ( true ) ;
83+ return new Tuple < string , AudioClip > ( cachedPath , AudioClip . Create ( clipName , samples . Length , 1 , 24000 , false ) ) ;
84+
// Handles each partial response raised while the speech request is in flight:
// decodes its PCM bytes into samples, wraps them in a single channel 24000 Hz
// AudioClip and forwards that clip to partialClipCallback.
void StreamCallback(Response partialResponse)
{
    // Nobody is listening, so skip the decode and do not create clips that would never be handed out.
    if (partialClipCallback == null)
    {
        return;
    }

    var data = partialResponse.Data;

    if (data == null || data.Length == 0)
    {
        return;
    }

    var chunk = Utilities.Audio.PCMEncoder.Decode(data);

    // AudioClip.Create needs a positive sample count, so ignore chunks that decode to nothing.
    if (chunk == null || chunk.Length == 0)
    {
        return;
    }

    // The part counter only advances for chunks that actually produce a clip.
    var partialClip = AudioClip.Create($"{clipName}_{++part}", chunk.Length, 1, 24000, false);

    if (!partialClip.SetData(chunk, 0))
    {
        Debug.LogError("Failed to set pcm data to partial clip.");
        return;
    }

    partialClipCallback.Invoke(partialClip);
}
98+ }
99+
100+ var audioFormat = request . ResponseFormat switch
101+ {
102+ SpeechResponseFormat . MP3 => AudioType . MPEG ,
103+ SpeechResponseFormat . WAV => AudioType . WAV ,
104+ _ => throw new NotSupportedException ( request . ResponseFormat . ToString ( ) )
105+ } ;
106+
70107 var clip = await Rest . DownloadAudioClipAsync (
71108 GetUrl ( "/speech" ) ,
72109 audioFormat ,
@@ -75,17 +112,46 @@ public async Task<Tuple<string, AudioClip>> CreateSpeechAsync(SpeechRequest requ
75112 payload ,
76113 parameters : new RestParameters ( client . DefaultRequestHeaders , debug : EnableDebug ) ,
77114 cancellationToken : cancellationToken ) ;
78- Rest . TryGetDownloadCacheItem ( clipName , out var cachedPath ) ;
79115 return new Tuple < string , AudioClip > ( cachedPath , clip ) ;
80116 }
81117
/// <summary>
/// Obsolete entry point kept for existing callers; forwards to <see cref="CreateTranscriptionTextAsync"/>.
/// </summary>
/// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns>Whatever <see cref="CreateTranscriptionTextAsync"/> returns for the same request.</returns>
[Obsolete("Use CreateTranscriptionTextAsync or CreateTranscriptionJsonAsync instead.")]
public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
{
    return await CreateTranscriptionTextAsync(request, cancellationToken);
}
121+
/// <summary>
/// Transcribes audio into the input language.
/// </summary>
/// <remarks>
/// When the request format is <see cref="AudioResponseFormat.Json"/> the response body is deserialized
/// and only its text is returned. For every other format the raw response body is returned unchanged.
/// </remarks>
/// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns>The transcribed text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
public async Task<string> CreateTranscriptionTextAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    // Read the format up front: the internal call disposes the request before it returns,
    // so the request should not be inspected afterwards.
    var responseFormat = request.ResponseFormat;
    var response = await Internal_CreateTranscriptionAsync(request, cancellationToken);

    return responseFormat == AudioResponseFormat.Json
        ? JsonConvert.DeserializeObject<AudioResponse>(response)?.Text
        : response;
}
135+
/// <summary>
/// Transcribes audio into the input language.
/// </summary>
/// <remarks>This method expects the request format to be either <see cref="AudioResponseFormat.Json"/> or <see cref="AudioResponseFormat.Verbose_Json"/>.</remarks>
/// <param name="request"><see cref="AudioTranscriptionRequest"/>.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns><see cref="AudioResponse"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
/// <exception cref="ArgumentException">The request format is neither Json nor Verbose_Json.</exception>
public async Task<AudioResponse> CreateTranscriptionJsonAsync(AudioTranscriptionRequest request, CancellationToken cancellationToken = default)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
    {
        // paramName has to name an actual parameter of this method; the message identifies the offending property.
        throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request));
    }

    var response = await Internal_CreateTranscriptionAsync(request, cancellationToken);
    return JsonConvert.DeserializeObject<AudioResponse>(response);
}
153+
154+ private async Task < string > Internal_CreateTranscriptionAsync ( AudioTranscriptionRequest request , CancellationToken cancellationToken = default )
89155 {
90156 var form = new WWWForm ( ) ;
91157 using var audioData = new MemoryStream ( ) ;
@@ -111,22 +177,58 @@ public async Task<string> CreateTranscriptionAsync(AudioTranscriptionRequest req
111177 form . AddField ( "language" , request . Language ) ;
112178 }
113179
180+ switch ( request . TimestampGranularities )
181+ {
182+ case TimestampGranularity . Segment :
183+ case TimestampGranularity . Word :
184+ form . AddField ( "timestamp_granularities[]" , request . TimestampGranularities . ToString ( ) . ToLower ( ) ) ;
185+ break ;
186+ }
187+
114188 request . Dispose ( ) ;
115189
116190 var response = await Rest . PostAsync ( GetUrl ( "/transcriptions" ) , form , new RestParameters ( client . DefaultRequestHeaders ) , cancellationToken ) ;
117191 response . Validate ( EnableDebug ) ;
118- return responseFormat == AudioResponseFormat . Json
119- ? JsonConvert . DeserializeObject < AudioResponse > ( response . Body ) ? . Text
120- : response . Body ;
192+ return response . Body ;
121193 }
122194
/// <summary>
/// Obsolete entry point kept for existing callers; forwards to <see cref="CreateTranslationTextAsync"/>.
/// </summary>
/// <param name="request"><see cref="AudioTranslationRequest"/>.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns>Whatever <see cref="CreateTranslationTextAsync"/> returns for the same request.</returns>
[Obsolete("Use CreateTranslationTextAsync or CreateTranslationJsonAsync instead.")]
public async Task<string> CreateTranslationAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
{
    return await CreateTranslationTextAsync(request, cancellationToken);
}
198+
/// <summary>
/// Translates audio into English.
/// </summary>
/// <remarks>
/// When the request format is <see cref="AudioResponseFormat.Json"/> the response body is deserialized
/// and only its text is returned. For every other format the raw response body is returned unchanged.
/// </remarks>
/// <param name="request"><see cref="AudioTranslationRequest"/>.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns>The translated text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
public async Task<string> CreateTranslationTextAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    // Read the format before handing the request to the helper, so the result does not
    // depend on the state the helper leaves the request in.
    var responseFormat = request.ResponseFormat;
    var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken);

    return responseFormat == AudioResponseFormat.Json
        ? JsonConvert.DeserializeObject<AudioResponse>(responseAsString)?.Text
        : responseAsString;
}
212+
/// <summary>
/// Translates audio into English.
/// </summary>
/// <remarks>This method expects the request format to be either <see cref="AudioResponseFormat.Json"/> or <see cref="AudioResponseFormat.Verbose_Json"/>.</remarks>
/// <param name="request"><see cref="AudioTranslationRequest"/>.</param>
/// <param name="cancellationToken">Optional, <see cref="CancellationToken"/>.</param>
/// <returns><see cref="AudioResponse"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="request"/> is null.</exception>
/// <exception cref="ArgumentException">The request format is neither Json nor Verbose_Json.</exception>
public async Task<AudioResponse> CreateTranslationJsonAsync(AudioTranslationRequest request, CancellationToken cancellationToken = default)
{
    if (request == null)
    {
        throw new ArgumentNullException(nameof(request));
    }

    if (request.ResponseFormat is not (AudioResponseFormat.Json or AudioResponseFormat.Verbose_Json))
    {
        // paramName has to name an actual parameter of this method; the message identifies the offending property.
        throw new ArgumentException("Response format must be Json or Verbose Json.", nameof(request));
    }

    var responseAsString = await Internal_CreateTranslationAsync(request, cancellationToken);
    return JsonConvert.DeserializeObject<AudioResponse>(responseAsString);
}
230+
231+ private async Task < string > Internal_CreateTranslationAsync ( AudioTranslationRequest request , CancellationToken cancellationToken )
130232 {
131233 var form = new WWWForm ( ) ;
132234 using var audioData = new MemoryStream ( ) ;
@@ -151,9 +253,7 @@ public async Task<string> CreateTranslationAsync(AudioTranslationRequest request
151253
152254 var response = await Rest . PostAsync ( GetUrl ( "/translations" ) , form , new RestParameters ( client . DefaultRequestHeaders ) , cancellationToken ) ;
153255 response . Validate ( EnableDebug ) ;
154- return responseFormat == AudioResponseFormat . Json
155- ? JsonConvert . DeserializeObject < AudioResponse > ( response . Body ) ? . Text
156- : response . Body ;
256+ return response . Body ;
157257 }
158258 }
159259}
0 commit comments