Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .release-please-manifest.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
".": "0.31.0"
".": "0.32.0"
}
6 changes: 3 additions & 3 deletions .stats.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
configured_endpoints: 135
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-e66e85fb7f72477256dca1acb6b23396989d381c5c1b318de564195436bcb93f.yml
openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1
config_hash: 89bf7bb3a1f9439ffc6ea0e7dc57ba9b
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-104cced8f4c7436a76eea02e26307828166405ccfb296faffb008b72772c11a7.yml
openapi_spec_hash: fdc03ed84a65a31b80da909255e53924
config_hash: 03b48e9b8c7231a902403210dbd7dfa0
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Changelog

## 0.32.0 (2025-10-16)

Full Changelog: [v0.31.0...v0.32.0](https://github.com/openai/openai-ruby/compare/v0.31.0...v0.32.0)

### Features

* **api:** Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint ([b31bd7f](https://github.com/openai/openai-ruby/commit/b31bd7f20ca702160873fa26ab39479fd8102f85))


### Bug Fixes

* absolutely qualified uris should always override the default ([14fdff8](https://github.com/openai/openai-ruby/commit/14fdff8de533a1002c64c9086016777a1e152a97))
* should not reuse buffers for `IO.copy_stream` interop ([8f33de1](https://github.com/openai/openai-ruby/commit/8f33de18bb104d5003a4d459ad244c0813e5a07e))

## 0.31.0 (2025-10-10)

Full Changelog: [v0.30.0...v0.31.0](https://github.com/openai/openai-ruby/compare/v0.30.0...v0.31.0)
Expand Down
2 changes: 1 addition & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ GIT
PATH
remote: .
specs:
openai (0.31.0)
openai (0.32.0)
connection_pool

GEM
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ To use this gem, install via Bundler by adding the following to your application
<!-- x-release-please-start-version -->

```ruby
gem "openai", "~> 0.31.0"
gem "openai", "~> 0.32.0"
```

<!-- x-release-please-end -->
Expand Down
3 changes: 3 additions & 0 deletions lib/openai.rb
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,14 @@
require_relative "openai/models/audio/transcription"
require_relative "openai/models/audio/transcription_create_params"
require_relative "openai/models/audio/transcription_create_response"
require_relative "openai/models/audio/transcription_diarized"
require_relative "openai/models/audio/transcription_diarized_segment"
require_relative "openai/models/audio/transcription_include"
require_relative "openai/models/audio/transcription_segment"
require_relative "openai/models/audio/transcription_stream_event"
require_relative "openai/models/audio/transcription_text_delta_event"
require_relative "openai/models/audio/transcription_text_done_event"
require_relative "openai/models/audio/transcription_text_segment_event"
require_relative "openai/models/audio/transcription_verbose"
require_relative "openai/models/audio/transcription_word"
require_relative "openai/models/audio/translation"
Expand Down
10 changes: 5 additions & 5 deletions lib/openai/internal/util.rb
Original file line number Diff line number Diff line change
Expand Up @@ -346,8 +346,9 @@ def join_parsed_uri(lhs, rhs)
base_path, base_query = lhs.fetch_values(:path, :query)
slashed = base_path.end_with?("/") ? base_path : "#{base_path}/"

parsed_path, parsed_query = parse_uri(rhs.fetch(:path)).fetch_values(:path, :query)
override = URI::Generic.build(**rhs.slice(:scheme, :host, :port), path: parsed_path)
merged = {**parse_uri(rhs.fetch(:path)), **rhs.except(:path, :query)}
parsed_path, parsed_query = merged.fetch_values(:path, :query)
override = URI::Generic.build(**merged.slice(:scheme, :host, :port), path: parsed_path)

joined = URI.join(URI::Generic.build(lhs.except(:path, :query)), slashed, override)
query = deep_merge(
Expand Down Expand Up @@ -473,10 +474,9 @@ class << self
# @return [Enumerable<String>]
def writable_enum(&blk)
Enumerator.new do |y|
buf = String.new
y.define_singleton_method(:write) do
self << buf.replace(_1)
buf.bytesize
self << _1.dup
_1.bytesize
end

blk.call(y)
Expand Down
53 changes: 42 additions & 11 deletions lib/openai/models/audio/transcription_create_params.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel

# @!attribute model
# ID of the model to use. The options are `gpt-4o-transcribe`,
# `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
# Whisper V2 model).
# `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
# Whisper V2 model), and `gpt-4o-transcribe-diarize`.
#
# @return [String, Symbol, OpenAI::Models::AudioModel]
required :model, union: -> { OpenAI::Audio::TranscriptionCreateParams::Model }
Expand All @@ -30,6 +30,8 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
# first normalizes loudness and then uses voice activity detection (VAD) to choose
# boundaries. `server_vad` object can be provided to tweak VAD detection
# parameters manually. If unset, the audio is transcribed as a single block.
# Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
# seconds.
#
# @return [Symbol, :auto, OpenAI::Models::Audio::TranscriptionCreateParams::ChunkingStrategy::VadConfig, nil]
optional :chunking_strategy,
Expand All @@ -41,11 +43,30 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
# return the log probabilities of the tokens in the response to understand the
# model's confidence in the transcription. `logprobs` only works with
# response_format set to `json` and only with the models `gpt-4o-transcribe` and
# `gpt-4o-mini-transcribe`.
# `gpt-4o-mini-transcribe`. This field is not supported when using
# `gpt-4o-transcribe-diarize`.
#
# @return [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>, nil]
optional :include, -> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Audio::TranscriptionInclude] }

# @!attribute known_speaker_names
# Optional list of speaker names that correspond to the audio samples provided in
# `known_speaker_references[]`. Each entry should be a short identifier (for
# example `customer` or `agent`). Up to 4 speakers are supported.
#
# @return [Array<String>, nil]
optional :known_speaker_names, OpenAI::Internal::Type::ArrayOf[String]

# @!attribute known_speaker_references
# Optional list of audio samples (as
# [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
# that contain known speaker references matching `known_speaker_names[]`. Each
# sample must be between 2 and 10 seconds, and can use any of the same input audio
# formats supported by `file`.
#
# @return [Array<String>, nil]
optional :known_speaker_references, OpenAI::Internal::Type::ArrayOf[String]

# @!attribute language
# The language of the input audio. Supplying the input language in
# [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
Expand All @@ -58,15 +79,18 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
# An optional text to guide the model's style or continue a previous audio
# segment. The
# [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
# should match the audio language.
# should match the audio language. This field is not supported when using
# `gpt-4o-transcribe-diarize`.
#
# @return [String, nil]
optional :prompt, String

# @!attribute response_format
# The format of the output, in one of these options: `json`, `text`, `srt`,
# `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
# the only supported format is `json`.
# `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
# `gpt-4o-mini-transcribe`, the only supported format is `json`. For
# `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
# `diarized_json`, with `diarized_json` required to receive speaker annotations.
#
# @return [Symbol, OpenAI::Models::AudioResponseFormat, nil]
optional :response_format, enum: -> { OpenAI::AudioResponseFormat }
Expand All @@ -86,13 +110,14 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
# `response_format` must be set `verbose_json` to use timestamp granularities.
# Either or both of these options are supported: `word`, or `segment`. Note: There
# is no additional latency for segment timestamps, but generating word timestamps
# incurs additional latency.
# incurs additional latency. This option is not available for
# `gpt-4o-transcribe-diarize`.
#
# @return [Array<Symbol, OpenAI::Models::Audio::TranscriptionCreateParams::TimestampGranularity>, nil]
optional :timestamp_granularities,
-> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Audio::TranscriptionCreateParams::TimestampGranularity] }

# @!method initialize(file:, model:, chunking_strategy: nil, include: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {})
# @!method initialize(file:, model:, chunking_strategy: nil, include: nil, known_speaker_names: nil, known_speaker_references: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {})
# Some parameter documentations has been truncated, see
# {OpenAI::Models::Audio::TranscriptionCreateParams} for more details.
#
Expand All @@ -104,6 +129,10 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
#
# @param include [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>] Additional information to include in the transcription response.
#
# @param known_speaker_names [Array<String>] Optional list of speaker names that correspond to the audio samples provided in
#
# @param known_speaker_references [Array<String>] Optional list of audio samples (as [data URLs](https://developer.mozilla.org/en-
#
# @param language [String] The language of the input audio. Supplying the input language in [ISO-639-1](htt
#
# @param prompt [String] An optional text to guide the model's style or continue a previous audio segment
Expand All @@ -117,14 +146,14 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
# @param request_options [OpenAI::RequestOptions, Hash{Symbol=>Object}]

# ID of the model to use. The options are `gpt-4o-transcribe`,
# `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
# Whisper V2 model).
# `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
# Whisper V2 model), and `gpt-4o-transcribe-diarize`.
module Model
extend OpenAI::Internal::Type::Union

variant String

# ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source Whisper V2 model).
# ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source Whisper V2 model), and `gpt-4o-transcribe-diarize`.
variant enum: -> { OpenAI::AudioModel }

# @!method self.variants
Expand All @@ -135,6 +164,8 @@ module Model
# first normalizes loudness and then uses voice activity detection (VAD) to choose
# boundaries. `server_vad` object can be provided to tweak VAD detection
# parameters manually. If unset, the audio is transcribed as a single block.
# Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
# seconds.
module ChunkingStrategy
extend OpenAI::Internal::Type::Union

Expand Down
5 changes: 4 additions & 1 deletion lib/openai/models/audio/transcription_create_response.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@ module TranscriptionCreateResponse
# Represents a transcription response returned by model, based on the provided input.
variant -> { OpenAI::Audio::Transcription }

# Represents a diarized transcription response returned by the model, including the combined transcript and speaker-segment annotations.
variant -> { OpenAI::Audio::TranscriptionDiarized }

# Represents a verbose json transcription response returned by model, based on the provided input.
variant -> { OpenAI::Audio::TranscriptionVerbose }

# @!method self.variants
# @return [Array(OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionVerbose)]
# @return [Array(OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionDiarized, OpenAI::Models::Audio::TranscriptionVerbose)]
end
end
end
Expand Down
160 changes: 160 additions & 0 deletions lib/openai/models/audio/transcription_diarized.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# frozen_string_literal: true

module OpenAI
  module Models
    module Audio
      # Response model for diarized transcriptions (e.g. the
      # `gpt-4o-transcribe-diarize` model with `response_format: "diarized_json"`):
      # carries the combined transcript text plus per-segment speaker annotations.
      class TranscriptionDiarized < OpenAI::Internal::Type::BaseModel
        # @!attribute duration
        # Duration of the input audio in seconds.
        #
        # @return [Float]
        required :duration, Float

        # @!attribute segments
        # Segments of the transcript annotated with timestamps and speaker labels.
        #
        # @return [Array<OpenAI::Models::Audio::TranscriptionDiarizedSegment>]
        required :segments, -> { OpenAI::Internal::Type::ArrayOf[OpenAI::Audio::TranscriptionDiarizedSegment] }

        # @!attribute task
        # The type of task that was run. Always `transcribe`.
        #
        # @return [Symbol, :transcribe]
        required :task, const: :transcribe

        # @!attribute text
        # The concatenated transcript text for the entire audio input.
        #
        # @return [String]
        required :text, String

        # @!attribute usage
        # Token or duration usage statistics for the request.
        #
        # @return [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration, nil]
        optional :usage, union: -> { OpenAI::Audio::TranscriptionDiarized::Usage }

        # @!method initialize(duration:, segments:, text:, usage: nil, task: :transcribe)
        # Represents a diarized transcription response returned by the model, including
        # the combined transcript and speaker-segment annotations.
        #
        # @param duration [Float] Duration of the input audio in seconds.
        #
        # @param segments [Array<OpenAI::Models::Audio::TranscriptionDiarizedSegment>] Segments of the transcript annotated with timestamps and speaker labels.
        #
        # @param text [String] The concatenated transcript text for the entire audio input.
        #
        # @param usage [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration] Token or duration usage statistics for the request.
        #
        # @param task [Symbol, :transcribe] The type of task that was run. Always `transcribe`.

        # Token or duration usage statistics for the request.
        #
        # Billing differs by model family, so usage arrives in one of two shapes;
        # deserialization picks the variant from the wire payload's `type` field.
        #
        # @see OpenAI::Models::Audio::TranscriptionDiarized#usage
        module Usage
          extend OpenAI::Internal::Type::Union

          # Select the concrete variant by the `type` discriminator
          # (`:tokens` or `:duration`).
          discriminator :type

          # Usage statistics for models billed by token usage.
          variant :tokens, -> { OpenAI::Audio::TranscriptionDiarized::Usage::Tokens }

          # Usage statistics for models billed by audio input duration.
          variant :duration, -> { OpenAI::Audio::TranscriptionDiarized::Usage::Duration }

          # Usage shape for token-billed models: input/output/total token counts,
          # optionally broken down by input modality.
          class Tokens < OpenAI::Internal::Type::BaseModel
            # @!attribute input_tokens
            # Number of input tokens billed for this request.
            #
            # @return [Integer]
            required :input_tokens, Integer

            # @!attribute output_tokens
            # Number of output tokens generated.
            #
            # @return [Integer]
            required :output_tokens, Integer

            # @!attribute total_tokens
            # Total number of tokens used (input + output).
            #
            # @return [Integer]
            required :total_tokens, Integer

            # @!attribute type
            # The type of the usage object. Always `tokens` for this variant.
            #
            # @return [Symbol, :tokens]
            required :type, const: :tokens

            # @!attribute input_token_details
            # Details about the input tokens billed for this request.
            #
            # @return [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails, nil]
            optional :input_token_details,
                     -> { OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails }

            # @!method initialize(input_tokens:, output_tokens:, total_tokens:, input_token_details: nil, type: :tokens)
            # Usage statistics for models billed by token usage.
            #
            # @param input_tokens [Integer] Number of input tokens billed for this request.
            #
            # @param output_tokens [Integer] Number of output tokens generated.
            #
            # @param total_tokens [Integer] Total number of tokens used (input + output).
            #
            # @param input_token_details [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails] Details about the input tokens billed for this request.
            #
            # @param type [Symbol, :tokens] The type of the usage object. Always `tokens` for this variant.

            # Per-modality breakdown of the billed input tokens; both fields are
            # optional since the server may omit the breakdown.
            #
            # @see OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens#input_token_details
            class InputTokenDetails < OpenAI::Internal::Type::BaseModel
              # @!attribute audio_tokens
              # Number of audio tokens billed for this request.
              #
              # @return [Integer, nil]
              optional :audio_tokens, Integer

              # @!attribute text_tokens
              # Number of text tokens billed for this request.
              #
              # @return [Integer, nil]
              optional :text_tokens, Integer

              # @!method initialize(audio_tokens: nil, text_tokens: nil)
              # Details about the input tokens billed for this request.
              #
              # @param audio_tokens [Integer] Number of audio tokens billed for this request.
              #
              # @param text_tokens [Integer] Number of text tokens billed for this request.
            end
          end

          # Usage shape for duration-billed models: a single audio-length figure.
          class Duration < OpenAI::Internal::Type::BaseModel
            # @!attribute seconds
            # Duration of the input audio in seconds.
            #
            # @return [Float]
            required :seconds, Float

            # @!attribute type
            # The type of the usage object. Always `duration` for this variant.
            #
            # @return [Symbol, :duration]
            required :type, const: :duration

            # @!method initialize(seconds:, type: :duration)
            # Usage statistics for models billed by audio input duration.
            #
            # @param seconds [Float] Duration of the input audio in seconds.
            #
            # @param type [Symbol, :duration] The type of the usage object. Always `duration` for this variant.
          end

          # @!method self.variants
          # @return [Array(OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration)]
        end
      end
    end
  end
end
Loading