@@ -9,7 +9,22 @@ export class Transcriptions extends APIResource {
99 /**
1010 * Transcribes audio into the input language.
 *
 * The overload signatures tie the declared result type to the
 * `response_format` type parameter of `body`:
 * - `'json'` or `undefined` resolves to `Transcription`;
 * - `'verbose_json'` resolves to `TranscriptionVerbose`;
 * - `'srt'`, `'vtt'` or `'text'` resolves to `string`.
 *
 * The final signature is the implementation: it issues a POST to
 * `/audio/transcriptions`, passing `body` and `options` through
 * `Core.multipartFormRequestOptions`.
1111 */
12- create ( body : TranscriptionCreateParams , options ?: Core . RequestOptions ) : Core . APIPromise < Transcription > {
12+ create (
13+ body : TranscriptionCreateParams < 'json' | undefined > ,
14+ options ?: Core . RequestOptions ,
15+ ) : Core . APIPromise < Transcription > ;
16+ create (
17+ body : TranscriptionCreateParams < 'verbose_json' > ,
18+ options ?: Core . RequestOptions ,
19+ ) : Core . APIPromise < TranscriptionVerbose > ;
20+ create (
21+ body : TranscriptionCreateParams < 'srt' | 'vtt' | 'text' > ,
22+ options ?: Core . RequestOptions ,
23+ ) : Core . APIPromise < string > ;
24+ create (
25+ body : TranscriptionCreateParams ,
26+ options ?: Core . RequestOptions ,
27+ ) : Core . APIPromise < TranscriptionCreateResponse | string > {
1328 return this . _client . post ( '/audio/transcriptions' , Core . multipartFormRequestOptions ( { body, ...options } ) ) ;
1429 }
1530}
@@ -25,7 +40,118 @@ export interface Transcription {
2540 text : string ;
2641}
2742
28- export interface TranscriptionCreateParams {
/**
 * A single segment of a transcription: its start/end times, its text and token
 * IDs, and the per-segment statistics documented on each field below.
 */
43+ export interface TranscriptionSegment {
44+ /**
45+ * Unique identifier of the segment.
46+ */
47+ id : number ;
48+
49+ /**
50+ * Average logprob of the segment. If the value is lower than -1, consider the
51+ * logprobs failed.
52+ */
53+ avg_logprob : number ;
54+
55+ /**
56+ * Compression ratio of the segment. If the value is greater than 2.4, consider the
57+ * compression failed.
58+ */
59+ compression_ratio : number ;
60+
61+ /**
62+ * End time of the segment in seconds.
63+ */
64+ end : number ;
65+
66+ /**
67+ * Probability of no speech in the segment. If the value is higher than 1.0 and the
68+ * `avg_logprob` is below -1, consider this segment silent.
 *
 * NOTE(review): a probability cannot exceed 1.0, so the threshold as written can
 * never be met. Confirm the intended value against the API reference.
69+ */
70+ no_speech_prob : number ;
71+
72+ /**
73+ * Seek offset of the segment.
74+ */
75+ seek : number ;
76+
77+ /**
78+ * Start time of the segment in seconds.
79+ */
80+ start : number ;
81+
82+ /**
83+ * Temperature parameter used for generating the segment.
84+ */
85+ temperature : number ;
86+
87+ /**
88+ * Text content of the segment.
89+ */
90+ text : string ;
91+
92+ /**
93+ * Array of token IDs for the text content.
94+ */
95+ tokens : Array < number > ;
96+ }
97+
98+ /**
99+ * Represents a verbose JSON transcription response returned by the model, based
100+ * on the provided input.
101+ */
102+ export interface TranscriptionVerbose {
103+ /**
104+ * The duration of the input audio.
 *
 * NOTE(review): declared as `string` and no unit or format is stated here.
 * Confirm against the API reference whether the service returns a number.
105+ */
106+ duration : string ;
107+
108+ /**
109+ * The language of the input audio.
110+ */
111+ language : string ;
112+
113+ /**
114+ * The transcribed text.
115+ */
116+ text : string ;
117+
118+ /**
119+ * Segments of the transcribed text and their corresponding details. Optional.
120+ */
121+ segments ?: Array < TranscriptionSegment > ;
122+
123+ /**
124+ * Extracted words and their corresponding timestamps. Optional.
125+ */
126+ words ?: Array < TranscriptionWord > ;
127+ }
128+
/**
 * A single word of a transcription together with its start and end times.
 */
129+ export interface TranscriptionWord {
130+ /**
131+ * End time of the word in seconds.
132+ */
133+ end : number ;
134+
135+ /**
136+ * Start time of the word in seconds.
137+ */
138+ start : number ;
139+
140+ /**
141+ * The text content of the word.
142+ */
143+ word : string ;
144+ }
145+
146+ /**
147+ * Represents a transcription response returned by the model, based on the
148+ * provided input: either a `Transcription` or a `TranscriptionVerbose`.
149+ */
150+ export type TranscriptionCreateResponse = Transcription | TranscriptionVerbose ;
151+
152+ export interface TranscriptionCreateParams <
153+ ResponseFormat extends AudioAPI . AudioResponseFormat | undefined = AudioAPI . AudioResponseFormat | undefined ,
154+ > {
29155 /**
30156 * The audio file object (not file name) to transcribe, in one of these formats:
31157 * flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
@@ -57,7 +183,7 @@ export interface TranscriptionCreateParams {
57183 * The format of the output, in one of these options: `json`, `text`, `srt`,
58184 * `verbose_json`, or `vtt`.
59185 */
60- response_format ?: AudioAPI . AudioResponseFormat ;
186+ response_format ?: ResponseFormat ;
61187
62188 /**
63189 * The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
@@ -80,5 +206,9 @@ export interface TranscriptionCreateParams {
80206
/**
 * Re-exports the transcription types from `TranscriptionsAPI` under the
 * `Transcriptions` namespace.
 */
81207export namespace Transcriptions {
82208 export import Transcription = TranscriptionsAPI . Transcription ;
209+ export import TranscriptionSegment = TranscriptionsAPI . TranscriptionSegment ;
210+ export import TranscriptionVerbose = TranscriptionsAPI . TranscriptionVerbose ;
211+ export import TranscriptionWord = TranscriptionsAPI . TranscriptionWord ;
212+ export import TranscriptionCreateResponse = TranscriptionsAPI . TranscriptionCreateResponse ;
83213 export import TranscriptionCreateParams = TranscriptionsAPI . TranscriptionCreateParams ;
84214}
0 commit comments