Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .release-please-manifest.json
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
{
".": "0.31.0"
".": "0.32.0"
}
6 changes: 3 additions & 3 deletions .stats.yml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
configured_endpoints: 135
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-e66e85fb7f72477256dca1acb6b23396989d381c5c1b318de564195436bcb93f.yml
openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1
config_hash: 89bf7bb3a1f9439ffc6ea0e7dc57ba9b
openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-104cced8f4c7436a76eea02e26307828166405ccfb296faffb008b72772c11a7.yml
openapi_spec_hash: fdc03ed84a65a31b80da909255e53924
config_hash: 03b48e9b8c7231a902403210dbd7dfa0
14 changes: 14 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Changelog

## 0.32.0 (2025-10-16)

Full Changelog: [v0.31.0...v0.32.0](https://github.com/openai/openai-ruby/compare/v0.31.0...v0.32.0)

### Features

* **api:** Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint ([b31bd7f](https://github.com/openai/openai-ruby/commit/b31bd7f20ca702160873fa26ab39479fd8102f85))


### Bug Fixes

* absolutely qualified uris should always override the default ([14fdff8](https://github.com/openai/openai-ruby/commit/14fdff8de533a1002c64c9086016777a1e152a97))
* should not reuse buffers for `IO.copy_stream` interop ([8f33de1](https://github.com/openai/openai-ruby/commit/8f33de18bb104d5003a4d459ad244c0813e5a07e))

## 0.31.0 (2025-10-10)

Full Changelog: [v0.30.0...v0.31.0](https://github.com/openai/openai-ruby/compare/v0.30.0...v0.31.0)
Expand Down
2 changes: 1 addition & 1 deletion Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ GIT
PATH
remote: .
specs:
openai (0.31.0)
openai (0.32.0)
connection_pool

GEM
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ To use this gem, install via Bundler by adding the following to your application
<!-- x-release-please-start-version -->

```ruby
gem "openai", "~> 0.31.0"
gem "openai", "~> 0.32.0"
```

<!-- x-release-please-end -->
Expand Down
3 changes: 3 additions & 0 deletions lib/openai.rb
Original file line number Diff line number Diff line change
Expand Up @@ -79,11 +79,14 @@
require_relative "openai/models/audio/transcription"
require_relative "openai/models/audio/transcription_create_params"
require_relative "openai/models/audio/transcription_create_response"
require_relative "openai/models/audio/transcription_diarized"
require_relative "openai/models/audio/transcription_diarized_segment"
require_relative "openai/models/audio/transcription_include"
require_relative "openai/models/audio/transcription_segment"
require_relative "openai/models/audio/transcription_stream_event"
require_relative "openai/models/audio/transcription_text_delta_event"
require_relative "openai/models/audio/transcription_text_done_event"
require_relative "openai/models/audio/transcription_text_segment_event"
require_relative "openai/models/audio/transcription_verbose"
require_relative "openai/models/audio/transcription_word"
require_relative "openai/models/audio/translation"
Expand Down
10 changes: 5 additions & 5 deletions lib/openai/internal/util.rb
Original file line number Diff line number Diff line change
Expand Up @@ -346,8 +346,9 @@ def join_parsed_uri(lhs, rhs)
base_path, base_query = lhs.fetch_values(:path, :query)
slashed = base_path.end_with?("/") ? base_path : "#{base_path}/"

parsed_path, parsed_query = parse_uri(rhs.fetch(:path)).fetch_values(:path, :query)
override = URI::Generic.build(**rhs.slice(:scheme, :host, :port), path: parsed_path)
merged = {**parse_uri(rhs.fetch(:path)), **rhs.except(:path, :query)}
parsed_path, parsed_query = merged.fetch_values(:path, :query)
override = URI::Generic.build(**merged.slice(:scheme, :host, :port), path: parsed_path)

joined = URI.join(URI::Generic.build(lhs.except(:path, :query)), slashed, override)
query = deep_merge(
Expand Down Expand Up @@ -473,10 +474,9 @@ class << self
# @return [Enumerable<String>]
def writable_enum(&blk)
Enumerator.new do |y|
buf = String.new
y.define_singleton_method(:write) do
self << buf.replace(_1)
buf.bytesize
self << _1.dup
_1.bytesize
end

blk.call(y)
Expand Down
53 changes: 42 additions & 11 deletions lib/openai/models/audio/transcription_create_params.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel

# @!attribute model
# ID of the model to use. The options are `gpt-4o-transcribe`,
# `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
# Whisper V2 model).
# `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
# Whisper V2 model), and `gpt-4o-transcribe-diarize`.
#
# @return [String, Symbol, OpenAI::Models::AudioModel]
required :model, union: -> { OpenAI::Audio::TranscriptionCreateParams::Model }
Expand All @@ -30,6 +30,8 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
# first normalizes loudness and then uses voice activity detection (VAD) to choose
# boundaries. `server_vad` object can be provided to tweak VAD detection
# parameters manually. If unset, the audio is transcribed as a single block.
# Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
# seconds.
#
# @return [Symbol, :auto, OpenAI::Models::Audio::TranscriptionCreateParams::ChunkingStrategy::VadConfig, nil]
optional :chunking_strategy,
Expand All @@ -41,11 +43,30 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
# return the log probabilities of the tokens in the response to understand the
# model's confidence in the transcription. `logprobs` only works with
# response_format set to `json` and only with the models `gpt-4o-transcribe` and
# `gpt-4o-mini-transcribe`.
# `gpt-4o-mini-transcribe`. This field is not supported when using
# `gpt-4o-transcribe-diarize`.
#
# @return [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>, nil]
optional :include, -> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Audio::TranscriptionInclude] }

# @!attribute known_speaker_names
# Optional list of speaker names that correspond to the audio samples provided in
# `known_speaker_references[]`. Each entry should be a short identifier (for
# example `customer` or `agent`). Up to 4 speakers are supported.
#
# @return [Array<String>, nil]
optional :known_speaker_names, OpenAI::Internal::Type::ArrayOf[String]

# @!attribute known_speaker_references
# Optional list of audio samples (as
# [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs))
# that contain known speaker references matching `known_speaker_names[]`. Each
# sample must be between 2 and 10 seconds, and can use any of the same input audio
# formats supported by `file`.
#
# @return [Array<String>, nil]
optional :known_speaker_references, OpenAI::Internal::Type::ArrayOf[String]

# @!attribute language
# The language of the input audio. Supplying the input language in
# [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`)
Expand All @@ -58,15 +79,18 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
# An optional text to guide the model's style or continue a previous audio
# segment. The
# [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting)
# should match the audio language.
# should match the audio language. This field is not supported when using
# `gpt-4o-transcribe-diarize`.
#
# @return [String, nil]
optional :prompt, String

# @!attribute response_format
# The format of the output, in one of these options: `json`, `text`, `srt`,
# `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`,
# the only supported format is `json`.
# `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and
# `gpt-4o-mini-transcribe`, the only supported format is `json`. For
# `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and
# `diarized_json`, with `diarized_json` required to receive speaker annotations.
#
# @return [Symbol, OpenAI::Models::AudioResponseFormat, nil]
optional :response_format, enum: -> { OpenAI::AudioResponseFormat }
Expand All @@ -86,13 +110,14 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
# `response_format` must be set `verbose_json` to use timestamp granularities.
# Either or both of these options are supported: `word`, or `segment`. Note: There
# is no additional latency for segment timestamps, but generating word timestamps
# incurs additional latency.
# incurs additional latency. This option is not available for
# `gpt-4o-transcribe-diarize`.
#
# @return [Array<Symbol, OpenAI::Models::Audio::TranscriptionCreateParams::TimestampGranularity>, nil]
optional :timestamp_granularities,
-> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Audio::TranscriptionCreateParams::TimestampGranularity] }

# @!method initialize(file:, model:, chunking_strategy: nil, include: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {})
# @!method initialize(file:, model:, chunking_strategy: nil, include: nil, known_speaker_names: nil, known_speaker_references: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {})
# Some parameter documentations has been truncated, see
# {OpenAI::Models::Audio::TranscriptionCreateParams} for more details.
#
Expand All @@ -104,6 +129,10 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
#
# @param include [Array<Symbol, OpenAI::Models::Audio::TranscriptionInclude>] Additional information to include in the transcription response.
#
# @param known_speaker_names [Array<String>] Optional list of speaker names that correspond to the audio samples provided in
#
# @param known_speaker_references [Array<String>] Optional list of audio samples (as [data URLs](https://developer.mozilla.org/en-
#
# @param language [String] The language of the input audio. Supplying the input language in [ISO-639-1](htt
#
# @param prompt [String] An optional text to guide the model's style or continue a previous audio segment
Expand All @@ -117,14 +146,14 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel
# @param request_options [OpenAI::RequestOptions, Hash{Symbol=>Object}]

# ID of the model to use. The options are `gpt-4o-transcribe`,
# `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source
# Whisper V2 model).
# `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source
# Whisper V2 model), and `gpt-4o-transcribe-diarize`.
module Model
extend OpenAI::Internal::Type::Union

variant String

# ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source Whisper V2 model).
# ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source Whisper V2 model), and `gpt-4o-transcribe-diarize`.
variant enum: -> { OpenAI::AudioModel }

# @!method self.variants
Expand All @@ -135,6 +164,8 @@ module Model
# first normalizes loudness and then uses voice activity detection (VAD) to choose
# boundaries. `server_vad` object can be provided to tweak VAD detection
# parameters manually. If unset, the audio is transcribed as a single block.
# Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30
# seconds.
module ChunkingStrategy
extend OpenAI::Internal::Type::Union

Expand Down
5 changes: 4 additions & 1 deletion lib/openai/models/audio/transcription_create_response.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,14 @@ module TranscriptionCreateResponse
# Represents a transcription response returned by model, based on the provided input.
variant -> { OpenAI::Audio::Transcription }

# Represents a diarized transcription response returned by the model, including the combined transcript and speaker-segment annotations.
variant -> { OpenAI::Audio::TranscriptionDiarized }

# Represents a verbose json transcription response returned by model, based on the provided input.
variant -> { OpenAI::Audio::TranscriptionVerbose }

# @!method self.variants
# @return [Array(OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionVerbose)]
# @return [Array(OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionDiarized, OpenAI::Models::Audio::TranscriptionVerbose)]
end
end
end
Expand Down
160 changes: 160 additions & 0 deletions lib/openai/models/audio/transcription_diarized.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# frozen_string_literal: true

module OpenAI
  module Models
    module Audio
      # Response model for diarized transcriptions (e.g. the
      # `gpt-4o-transcribe-diarize` model with `response_format: "diarized_json"`):
      # carries the combined transcript text plus per-segment speaker annotations.
      class TranscriptionDiarized < OpenAI::Internal::Type::BaseModel
        # @!attribute duration
        # Duration of the input audio in seconds.
        #
        # @return [Float]
        required :duration, Float

        # @!attribute segments
        # Segments of the transcript annotated with timestamps and speaker labels.
        #
        # @return [Array<OpenAI::Models::Audio::TranscriptionDiarizedSegment>]
        required :segments, -> { OpenAI::Internal::Type::ArrayOf[OpenAI::Audio::TranscriptionDiarizedSegment] }

        # @!attribute task
        # The type of task that was run. Always `transcribe`.
        #
        # @return [Symbol, :transcribe]
        required :task, const: :transcribe

        # @!attribute text
        # The concatenated transcript text for the entire audio input.
        #
        # @return [String]
        required :text, String

        # @!attribute usage
        # Token or duration usage statistics for the request.
        #
        # @return [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration, nil]
        optional :usage, union: -> { OpenAI::Audio::TranscriptionDiarized::Usage }

        # @!method initialize(duration:, segments:, text:, usage: nil, task: :transcribe)
        # Represents a diarized transcription response returned by the model, including
        # the combined transcript and speaker-segment annotations.
        #
        # @param duration [Float] Duration of the input audio in seconds.
        #
        # @param segments [Array<OpenAI::Models::Audio::TranscriptionDiarizedSegment>] Segments of the transcript annotated with timestamps and speaker labels.
        #
        # @param text [String] The concatenated transcript text for the entire audio input.
        #
        # @param usage [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration] Token or duration usage statistics for the request.
        #
        # @param task [Symbol, :transcribe] The type of task that was run. Always `transcribe`.

        # Token or duration usage statistics for the request.
        #
        # Billing differs by model family, so usage arrives in one of two shapes;
        # deserialization picks the variant from the wire payload's `type` field.
        #
        # @see OpenAI::Models::Audio::TranscriptionDiarized#usage
        module Usage
          extend OpenAI::Internal::Type::Union

          # Select the concrete variant by the `type` discriminator
          # (`:tokens` or `:duration`).
          discriminator :type

          # Usage statistics for models billed by token usage.
          variant :tokens, -> { OpenAI::Audio::TranscriptionDiarized::Usage::Tokens }

          # Usage statistics for models billed by audio input duration.
          variant :duration, -> { OpenAI::Audio::TranscriptionDiarized::Usage::Duration }

          # Usage shape for token-billed models: input/output/total token counts,
          # optionally broken down by input modality.
          class Tokens < OpenAI::Internal::Type::BaseModel
            # @!attribute input_tokens
            # Number of input tokens billed for this request.
            #
            # @return [Integer]
            required :input_tokens, Integer

            # @!attribute output_tokens
            # Number of output tokens generated.
            #
            # @return [Integer]
            required :output_tokens, Integer

            # @!attribute total_tokens
            # Total number of tokens used (input + output).
            #
            # @return [Integer]
            required :total_tokens, Integer

            # @!attribute type
            # The type of the usage object. Always `tokens` for this variant.
            #
            # @return [Symbol, :tokens]
            required :type, const: :tokens

            # @!attribute input_token_details
            # Details about the input tokens billed for this request.
            #
            # @return [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails, nil]
            optional :input_token_details,
                     -> { OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails }

            # @!method initialize(input_tokens:, output_tokens:, total_tokens:, input_token_details: nil, type: :tokens)
            # Usage statistics for models billed by token usage.
            #
            # @param input_tokens [Integer] Number of input tokens billed for this request.
            #
            # @param output_tokens [Integer] Number of output tokens generated.
            #
            # @param total_tokens [Integer] Total number of tokens used (input + output).
            #
            # @param input_token_details [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails] Details about the input tokens billed for this request.
            #
            # @param type [Symbol, :tokens] The type of the usage object. Always `tokens` for this variant.

            # Per-modality breakdown of the billed input tokens; both fields are
            # optional since the server may omit the breakdown.
            #
            # @see OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens#input_token_details
            class InputTokenDetails < OpenAI::Internal::Type::BaseModel
              # @!attribute audio_tokens
              # Number of audio tokens billed for this request.
              #
              # @return [Integer, nil]
              optional :audio_tokens, Integer

              # @!attribute text_tokens
              # Number of text tokens billed for this request.
              #
              # @return [Integer, nil]
              optional :text_tokens, Integer

              # @!method initialize(audio_tokens: nil, text_tokens: nil)
              # Details about the input tokens billed for this request.
              #
              # @param audio_tokens [Integer] Number of audio tokens billed for this request.
              #
              # @param text_tokens [Integer] Number of text tokens billed for this request.
            end
          end

          # Usage shape for duration-billed models: a single audio-length figure.
          class Duration < OpenAI::Internal::Type::BaseModel
            # @!attribute seconds
            # Duration of the input audio in seconds.
            #
            # @return [Float]
            required :seconds, Float

            # @!attribute type
            # The type of the usage object. Always `duration` for this variant.
            #
            # @return [Symbol, :duration]
            required :type, const: :duration

            # @!method initialize(seconds:, type: :duration)
            # Usage statistics for models billed by audio input duration.
            #
            # @param seconds [Float] Duration of the input audio in seconds.
            #
            # @param type [Symbol, :duration] The type of the usage object. Always `duration` for this variant.
          end

          # @!method self.variants
          # @return [Array(OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration)]
        end
      end
    end
  end
end
Loading