
Commit 3004694

feat(api): new models for TTS, STT, + new audio features for Realtime (#46)
1 parent 558f8ce commit 3004694


45 files changed, with 1115 additions and 65 deletions.

.stats.yml

Lines changed: 1 addition & 1 deletion
@@ -1,2 +1,2 @@
 configured_endpoints: 80
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-b26121d5df6eb5d3032a45a267473798b15fcfec76dd44a3256cf1238be05fa4.yml
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-c22f59c66aec7914b6ee653d3098d1c1c8c16c180d2a158e819c8ddbf476f74b.yml

lib/openai.rb

Lines changed: 4 additions & 0 deletions
@@ -43,7 +43,11 @@
 require_relative "openai/models/audio/transcription"
 require_relative "openai/models/audio/transcription_create_params"
 require_relative "openai/models/audio/transcription_create_response"
+require_relative "openai/models/audio/transcription_include"
 require_relative "openai/models/audio/transcription_segment"
+require_relative "openai/models/audio/transcription_stream_event"
+require_relative "openai/models/audio/transcription_text_delta_event"
+require_relative "openai/models/audio/transcription_text_done_event"
 require_relative "openai/models/audio/transcription_verbose"
 require_relative "openai/models/audio/transcription_word"
 require_relative "openai/models/audio/translation"

lib/openai/models/audio/speech_create_params.rb

Lines changed: 16 additions & 4 deletions
@@ -16,7 +16,7 @@ class SpeechCreateParams < OpenAI::BaseModel
 
 # @!attribute model
 # One of the available [TTS models](https://platform.openai.com/docs/models#tts):
-# `tts-1` or `tts-1-hd`
+# `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`.
 #
 # @return [String, Symbol, OpenAI::Models::Audio::SpeechModel]
 required :model, union: -> { OpenAI::Models::Audio::SpeechCreateParams::Model }
@@ -30,6 +30,17 @@ class SpeechCreateParams < OpenAI::BaseModel
 # @return [Symbol, OpenAI::Models::Audio::SpeechCreateParams::Voice]
 required :voice, enum: -> { OpenAI::Models::Audio::SpeechCreateParams::Voice }
 
+# @!attribute [r] instructions
+# Control the voice of your generated audio with additional instructions. Does not
+# work with `tts-1` or `tts-1-hd`.
+#
+# @return [String, nil]
+optional :instructions, String
+
+# @!parse
+# # @return [String]
+# attr_writer :instructions
+
 # @!attribute [r] response_format
 # The format to audio in. Supported formats are `mp3`, `opus`, `aac`, `flac`,
 # `wav`, and `pcm`.
@@ -56,22 +67,23 @@ class SpeechCreateParams < OpenAI::BaseModel
 # # @param input [String]
 # # @param model [String, Symbol, OpenAI::Models::Audio::SpeechModel]
 # # @param voice [Symbol, OpenAI::Models::Audio::SpeechCreateParams::Voice]
+# # @param instructions [String]
 # # @param response_format [Symbol, OpenAI::Models::Audio::SpeechCreateParams::ResponseFormat]
 # # @param speed [Float]
 # # @param request_options [OpenAI::RequestOptions, Hash{Symbol=>Object}]
 # #
-# def initialize(input:, model:, voice:, response_format: nil, speed: nil, request_options: {}, **) = super
+# def initialize(input:, model:, voice:, instructions: nil, response_format: nil, speed: nil, request_options: {}, **) = super
 
 # def initialize: (Hash | OpenAI::BaseModel) -> void
 
 # @abstract
 #
 # One of the available [TTS models](https://platform.openai.com/docs/models#tts):
-# `tts-1` or `tts-1-hd`
+# `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`.
 class Model < OpenAI::Union
 variant String
 
-# One of the available [TTS models](https://platform.openai.com/docs/models#tts): `tts-1` or `tts-1-hd`
+# One of the available [TTS models](https://platform.openai.com/docs/models#tts): `tts-1`, `tts-1-hd` or `gpt-4o-mini-tts`.
 variant enum: -> { OpenAI::Models::Audio::SpeechModel }
 
 # @!parse
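
The upshot of these `SpeechCreateParams` changes: callers can now pick `gpt-4o-mini-tts` and steer its delivery with the new `instructions` field. A minimal usage sketch in Ruby, assuming the gem's usual `OpenAI::Client` entry point and an `audio.speech.create` resource method (neither is shown in this diff), with hypothetical input text and output handling:

    require "openai"

    client = OpenAI::Client.new(api_key: ENV.fetch("OPENAI_API_KEY"))

    # `instructions` only affects `gpt-4o-mini-tts`; per the doc comment above it
    # does not work with `tts-1` or `tts-1-hd`.
    speech = client.audio.speech.create(
      model: :"gpt-4o-mini-tts",
      voice: :alloy,
      input: "Thanks for calling. How can I help you today?",
      instructions: "Speak slowly, in a warm and reassuring tone."
    )

    # Hypothetical handling of the binary audio body; adapt to how your client
    # version exposes the response.
    File.binwrite("greeting.mp3", speech.read)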

lib/openai/models/audio/speech_model.rb

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@ module Audio
 class SpeechModel < OpenAI::Enum
 TTS_1 = :"tts-1"
 TTS_1_HD = :"tts-1-hd"
+GPT_4O_MINI_TTS = :"gpt-4o-mini-tts"
 
 finalize!
 end

lib/openai/models/audio/transcription.rb

Lines changed: 55 additions & 1 deletion
@@ -10,15 +10,69 @@ class Transcription < OpenAI::BaseModel
 # @return [String]
 required :text, String
 
+# @!attribute [r] logprobs
+# The log probabilities of the tokens in the transcription. Only returned with the
+# models `gpt-4o-transcribe` and `gpt-4o-mini-transcribe` if `logprobs` is added
+# to the `include` array.
+#
+# @return [Array<OpenAI::Models::Audio::Transcription::Logprob>, nil]
+optional :logprobs, -> { OpenAI::ArrayOf[OpenAI::Models::Audio::Transcription::Logprob] }
+
+# @!parse
+# # @return [Array<OpenAI::Models::Audio::Transcription::Logprob>]
+# attr_writer :logprobs
+
 # @!parse
 # # Represents a transcription response returned by model, based on the provided
 # # input.
 # #
 # # @param text [String]
+# # @param logprobs [Array<OpenAI::Models::Audio::Transcription::Logprob>]
 # #
-# def initialize(text:, **) = super
+# def initialize(text:, logprobs: nil, **) = super
 
 # def initialize: (Hash | OpenAI::BaseModel) -> void
+
+class Logprob < OpenAI::BaseModel
+# @!attribute [r] token
+# The token in the transcription.
+#
+# @return [String, nil]
+optional :token, String
+
+# @!parse
+# # @return [String]
+# attr_writer :token
+
+# @!attribute [r] bytes
+# The bytes of the token.
+#
+# @return [Array<Float>, nil]
+optional :bytes, OpenAI::ArrayOf[Float]
+
+# @!parse
+# # @return [Array<Float>]
+# attr_writer :bytes
+
+# @!attribute [r] logprob
+# The log probability of the token.
+#
+# @return [Float, nil]
+optional :logprob, Float
+
+# @!parse
+# # @return [Float]
+# attr_writer :logprob
+
+# @!parse
+# # @param token [String]
+# # @param bytes [Array<Float>]
+# # @param logprob [Float]
+# #
+# def initialize(token: nil, bytes: nil, logprob: nil, **) = super
+
+# def initialize: (Hash | OpenAI::BaseModel) -> void
+end
 end
 end
 end
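
With `Transcription#logprobs` in place, per-token confidence can be read straight off the parsed response object. A rough sketch using only the fields defined above (how `transcription` is obtained depends on the client call, which is outside this hunk):

    # `transcription` is assumed to be an OpenAI::Models::Audio::Transcription
    # returned by a create-transcription request made with `include: [:logprobs]`.
    puts transcription.text

    Array(transcription.logprobs).each do |lp|
      # Each entry is a Transcription::Logprob; token, bytes, and logprob are optional.
      printf("%-15s p=%.4f\n", lp.token, Math.exp(lp.logprob.to_f))
    end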

lib/openai/models/audio/transcription_create_params.rb

Lines changed: 25 additions & 6 deletions
@@ -16,12 +16,27 @@ class TranscriptionCreateParams < OpenAI::BaseModel
 required :file, IO
 
 # @!attribute model
-# ID of the model to use. Only `whisper-1` (which is powered by our open source
-# Whisper V2 model) is currently available.
+# ID of the model to use. The options are `gpt-4o-transcribe`,
+# `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+# Whisper V2 model).
 #
 # @return [String, Symbol, OpenAI::Models::AudioModel]
 required :model, union: -> { OpenAI::Models::Audio::TranscriptionCreateParams::Model }
 
+# @!attribute [r] include
+# Additional information to include in the transcription response. `logprobs` will
+# return the log probabilities of the tokens in the response to understand the
+# model's confidence in the transcription. `logprobs` only works with
+# response_format set to `json` and only with the models `gpt-4o-transcribe` and
+# `gpt-4o-mini-transcribe`.
+#
+# @return [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>, nil]
+optional :include, -> { OpenAI::ArrayOf[enum: OpenAI::Models::Audio::TranscriptionInclude] }
+
+# @!parse
+# # @return [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>]
+# attr_writer :include
+
 # @!attribute [r] language
 # The language of the input audio. Supplying the input language in
 # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
@@ -49,7 +64,8 @@
 
 # @!attribute [r] response_format
 # The format of the output, in one of these options: `json`, `text`, `srt`,
-# `verbose_json`, or `vtt`.
+# `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
+# the only supported format is `json`.
 #
 # @return [Symbol, OpenAI::Models::AudioResponseFormat, nil]
 optional :response_format, enum: -> { OpenAI::Models::AudioResponseFormat }
@@ -90,6 +106,7 @@ class TranscriptionCreateParams < OpenAI::BaseModel
 # @!parse
 # # @param file [IO, StringIO]
 # # @param model [String, Symbol, OpenAI::Models::AudioModel]
+# # @param include [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>]
 # # @param language [String]
 # # @param prompt [String]
 # # @param response_format [Symbol, OpenAI::Models::AudioResponseFormat]
@@ -100,6 +117,7 @@ class TranscriptionCreateParams < OpenAI::BaseModel
 # def initialize(
 # file:,
 # model:,
+# include: nil,
 # language: nil,
 # prompt: nil,
 # response_format: nil,
@@ -115,12 +133,13 @@ class TranscriptionCreateParams < OpenAI::BaseModel
 
 # @abstract
 #
-# ID of the model to use. Only `whisper-1` (which is powered by our open source
-# Whisper V2 model) is currently available.
+# ID of the model to use. The options are `gpt-4o-transcribe`,
+# `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
+# Whisper V2 model).
 class Model < OpenAI::Union
 variant String
 
-# ID of the model to use. Only `whisper-1` (which is powered by our open source Whisper V2 model) is currently available.
+# ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source Whisper V2 model).
 variant enum: -> { OpenAI::Models::AudioModel }
 
 # @!parse
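
Putting the new request parameters together: a hedged sketch of a transcription request that selects one of the new speech-to-text models and opts into log probabilities. It assumes a `client.audio.transcriptions.create` resource method, which is not shown in this commit; per the doc comments above, `include: [:logprobs]` only applies with `json` output on `gpt-4o-transcribe` / `gpt-4o-mini-transcribe`.

    transcription = client.audio.transcriptions.create(
      file: File.open("meeting.wav", "rb"),
      model: :"gpt-4o-transcribe",
      # `include: [:logprobs]` is only valid with response_format :json on the
      # gpt-4o-transcribe / gpt-4o-mini-transcribe models.
      include: [:logprobs],
      response_format: :json,
      language: "en"
    )

    puts transcription.text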

lib/openai/models/audio/transcription_include.rb

Lines changed: 14 additions & 0 deletions

@@ -0,0 +1,14 @@
+# frozen_string_literal: true
+
+module OpenAI
+  module Models
+    module Audio
+      # @abstract
+      class TranscriptionInclude < OpenAI::Enum
+        LOGPROBS = :logprobs
+
+        finalize!
+      end
+    end
+  end
+end

lib/openai/models/audio/transcription_stream_event.rb

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+# frozen_string_literal: true
+
+module OpenAI
+  module Models
+    module Audio
+      # @abstract
+      #
+      # Emitted when there is an additional text delta. This is also the first event
+      # emitted when the transcription starts. Only emitted when you
+      # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+      # with the `Stream` parameter set to `true`.
+      class TranscriptionStreamEvent < OpenAI::Union
+        discriminator :type
+
+        # Emitted when there is an additional text delta. This is also the first event emitted when the transcription starts. Only emitted when you [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) with the `Stream` parameter set to `true`.
+        variant :"transcript.text.delta", -> { OpenAI::Models::Audio::TranscriptionTextDeltaEvent }
+
+        # Emitted when the transcription is complete. Contains the complete transcription text. Only emitted when you [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) with the `Stream` parameter set to `true`.
+        variant :"transcript.text.done", -> { OpenAI::Models::Audio::TranscriptionTextDoneEvent }
+
+        # @!parse
+        # class << self
+        # # @return [Array(OpenAI::Models::Audio::TranscriptionTextDeltaEvent, OpenAI::Models::Audio::TranscriptionTextDoneEvent)]
+        # def variants; end
+        # end
+      end
+    end
+  end
+end
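
Because `TranscriptionStreamEvent` is a discriminated union on `type`, consumers can branch on the concrete event class. A sketch of a consuming loop, assuming `stream` is an Enumerable of these event objects produced by some streaming transcription call (the streaming entry point itself is not part of this commit):

    full_text = +""

    stream.each do |event|
      case event
      when OpenAI::Models::Audio::TranscriptionTextDeltaEvent
        # Deltas arrive incrementally; the first one marks the start of transcription.
        full_text << event.delta
        print event.delta
      when OpenAI::Models::Audio::TranscriptionTextDoneEvent
        # The done event is documented above as carrying the complete transcription text.
        puts "\n-- transcription complete --"
      end
    end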

lib/openai/models/audio/transcription_text_delta_event.rb

Lines changed: 88 additions & 0 deletions

@@ -0,0 +1,88 @@
+# frozen_string_literal: true
+
+module OpenAI
+  module Models
+    module Audio
+      class TranscriptionTextDeltaEvent < OpenAI::BaseModel
+        # @!attribute delta
+        # The text delta that was additionally transcribed.
+        #
+        # @return [String]
+        required :delta, String
+
+        # @!attribute type
+        # The type of the event. Always `transcript.text.delta`.
+        #
+        # @return [Symbol, :"transcript.text.delta"]
+        required :type, const: :"transcript.text.delta"
+
+        # @!attribute [r] logprobs
+        # The log probabilities of the delta. Only included if you
+        # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+        # with the `include[]` parameter set to `logprobs`.
+        #
+        # @return [Array<OpenAI::Models::Audio::TranscriptionTextDeltaEvent::Logprob>, nil]
+        optional :logprobs, -> { OpenAI::ArrayOf[OpenAI::Models::Audio::TranscriptionTextDeltaEvent::Logprob] }
+
+        # @!parse
+        # # @return [Array<OpenAI::Models::Audio::TranscriptionTextDeltaEvent::Logprob>]
+        # attr_writer :logprobs
+
+        # @!parse
+        # # Emitted when there is an additional text delta. This is also the first event
+        # # emitted when the transcription starts. Only emitted when you
+        # # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription)
+        # # with the `Stream` parameter set to `true`.
+        # #
+        # # @param delta [String]
+        # # @param logprobs [Array<OpenAI::Models::Audio::TranscriptionTextDeltaEvent::Logprob>]
+        # # @param type [Symbol, :"transcript.text.delta"]
+        # #
+        # def initialize(delta:, logprobs: nil, type: :"transcript.text.delta", **) = super
+
+        # def initialize: (Hash | OpenAI::BaseModel) -> void
+
+        class Logprob < OpenAI::BaseModel
+          # @!attribute [r] token
+          # The token that was used to generate the log probability.
+          #
+          # @return [String, nil]
+          optional :token, String
+
+          # @!parse
+          # # @return [String]
+          # attr_writer :token
+
+          # @!attribute [r] bytes
+          # The bytes that were used to generate the log probability.
+          #
+          # @return [Array<Object>, nil]
+          optional :bytes, OpenAI::ArrayOf[OpenAI::Unknown]
+
+          # @!parse
+          # # @return [Array<Object>]
+          # attr_writer :bytes
+
+          # @!attribute [r] logprob
+          # The log probability of the token.
+          #
+          # @return [Float, nil]
+          optional :logprob, Float
+
+          # @!parse
+          # # @return [Float]
+          # attr_writer :logprob
+
+          # @!parse
+          # # @param token [String]
+          # # @param bytes [Array<Object>]
+          # # @param logprob [Float]
+          # #
+          # def initialize(token: nil, bytes: nil, logprob: nil, **) = super
+
+          # def initialize: (Hash | OpenAI::BaseModel) -> void
+        end
+      end
+    end
+  end
+end
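
When `include[]` requested `logprobs`, each delta event also carries its own `logprobs` array (note that `bytes` is typed `Array<Object>` here, unlike `Array<Float>` on the final `Transcription`). A small sketch of inspecting them inside the delta branch of the streaming loop above:

    Array(event.logprobs).each do |lp|
      # token, bytes, and logprob are all optional on TranscriptionTextDeltaEvent::Logprob.
      warn format("token=%p logprob=%s", lp.token, lp.logprob)
    end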
