@@ -9,7 +9,10 @@ export class Transcriptions extends APIResource {
/**
 * Transcribes audio into the input language.
 *
 * Wraps `body` and any caller-supplied `options` with
 * `Core.multipartFormRequestOptions` and POSTs the result to
 * `/audio/transcriptions`.
 *
 * @param body - Parameters of the transcription request.
 * @param options - Optional request options, merged into the same object as `body`.
 * @returns A promise typed as `TranscriptionCreateResponse`.
 */
create(
  body: TranscriptionCreateParams,
  options?: Core.RequestOptions,
): Core.APIPromise<TranscriptionCreateResponse> {
  // `options` is spread after `body`, so a `body` key present in `options` wins.
  const requestOptions = Core.multipartFormRequestOptions({ body, ...options });
  return this._client.post('/audio/transcriptions', requestOptions);
}
1518}
@@ -25,6 +28,115 @@ export interface Transcription {
2528 text : string ;
2629}
2730
/**
 * Details of a single segment of transcribed text.
 */
export interface TranscriptionSegment {
  /** Unique identifier of the segment. */
  id: number;

  /** Seek offset of the segment. */
  seek: number;

  /** Start time of the segment, in seconds. */
  start: number;

  /** End time of the segment, in seconds. */
  end: number;

  /** Text content of the segment. */
  text: string;

  /** Token IDs that make up the text content. */
  tokens: number[];

  /** Temperature parameter used when generating the segment. */
  temperature: number;

  /**
   * Average logprob of the segment. A value lower than -1 should be treated as
   * a logprob failure.
   */
  avg_logprob: number;

  /**
   * Compression ratio of the segment. A value greater than 2.4 should be
   * treated as a compression failure.
   */
  compression_ratio: number;

  /**
   * Probability that the segment contains no speech. The stated guidance is to
   * consider the segment silent when this value is higher than 1.0 and
   * `avg_logprob` is below -1.
   *
   * NOTE(review): a probability cannot exceed 1.0, so the threshold as written
   * can never be met - confirm the intended cutoff.
   */
  no_speech_prob: number;
}
85+
/**
 * A verbose JSON transcription response returned by the model for the provided
 * input.
 */
export interface TranscriptionVerbose {
  /** The transcribed text. */
  text: string;

  /** The language of the input audio. */
  language: string;

  /**
   * The duration of the input audio.
   *
   * NOTE(review): typed as a string here with no unit or format stated -
   * confirm against the actual response payload.
   */
  duration: string;

  /** Segments of the transcribed text, with per-segment details. */
  segments?: TranscriptionSegment[];

  /** Extracted words, each with its timestamps. */
  words?: TranscriptionWord[];
}
116+
/**
 * A single extracted word together with its timing information.
 */
export interface TranscriptionWord {
  /** The text content of the word. */
  word: string;

  /** Start time of the word, in seconds. */
  start: number;

  /** End time of the word, in seconds. */
  end: number;
}
133+
/**
 * A transcription response returned by the model for the provided input:
 * either the plain `Transcription` shape or the `TranscriptionVerbose` shape.
 */
export type TranscriptionCreateResponse =
  | Transcription
  | TranscriptionVerbose;
139+
28140export interface TranscriptionCreateParams {
29141 /**
30142 * The audio file object (not file name) to transcribe, in one of these formats:
@@ -80,5 +192,9 @@ export interface TranscriptionCreateParams {
80192
/**
 * Re-exports the transcription types from `TranscriptionsAPI` so they are
 * reachable as members of the `Transcriptions` namespace.
 */
export namespace Transcriptions {
  // Request parameters.
  export import TranscriptionCreateParams = TranscriptionsAPI.TranscriptionCreateParams;

  // Response shapes.
  export import Transcription = TranscriptionsAPI.Transcription;
  export import TranscriptionVerbose = TranscriptionsAPI.TranscriptionVerbose;
  export import TranscriptionCreateResponse = TranscriptionsAPI.TranscriptionCreateResponse;

  // Building blocks of the verbose response.
  export import TranscriptionSegment = TranscriptionsAPI.TranscriptionSegment;
  export import TranscriptionWord = TranscriptionsAPI.TranscriptionWord;
}
0 commit comments