diff --git a/.release-please-manifest.json b/.release-please-manifest.json index f81bf992..f04d0896 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.31.0" + ".": "0.32.0" } \ No newline at end of file diff --git a/.stats.yml b/.stats.yml index e68631a0..09d2eb1d 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 135 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-e66e85fb7f72477256dca1acb6b23396989d381c5c1b318de564195436bcb93f.yml -openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1 -config_hash: 89bf7bb3a1f9439ffc6ea0e7dc57ba9b +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-104cced8f4c7436a76eea02e26307828166405ccfb296faffb008b72772c11a7.yml +openapi_spec_hash: fdc03ed84a65a31b80da909255e53924 +config_hash: 03b48e9b8c7231a902403210dbd7dfa0 diff --git a/CHANGELOG.md b/CHANGELOG.md index 6af26dd4..14a757ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Changelog +## 0.32.0 (2025-10-16) + +Full Changelog: [v0.31.0...v0.32.0](https://github.com/openai/openai-ruby/compare/v0.31.0...v0.32.0) + +### Features + +* **api:** Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint ([b31bd7f](https://github.com/openai/openai-ruby/commit/b31bd7f20ca702160873fa26ab39479fd8102f85)) + + +### Bug Fixes + +* absolutely qualified uris should always override the default ([14fdff8](https://github.com/openai/openai-ruby/commit/14fdff8de533a1002c64c9086016777a1e152a97)) +* should not reuse buffers for `IO.copy_stream` interop ([8f33de1](https://github.com/openai/openai-ruby/commit/8f33de18bb104d5003a4d459ad244c0813e5a07e)) + ## 0.31.0 (2025-10-10) Full Changelog: [v0.30.0...v0.31.0](https://github.com/openai/openai-ruby/compare/v0.30.0...v0.31.0) diff --git a/Gemfile.lock b/Gemfile.lock index 5ee1897c..5dc3ad50 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -11,7 +11,7 @@ GIT PATH remote: . 
specs: - openai (0.31.0) + openai (0.32.0) connection_pool GEM diff --git a/README.md b/README.md index cc9ae07b..5ec8dd33 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ To use this gem, install via Bundler by adding the following to your application ```ruby -gem "openai", "~> 0.31.0" +gem "openai", "~> 0.32.0" ``` diff --git a/lib/openai.rb b/lib/openai.rb index 657cacd8..80a7919d 100644 --- a/lib/openai.rb +++ b/lib/openai.rb @@ -79,11 +79,14 @@ require_relative "openai/models/audio/transcription" require_relative "openai/models/audio/transcription_create_params" require_relative "openai/models/audio/transcription_create_response" +require_relative "openai/models/audio/transcription_diarized" +require_relative "openai/models/audio/transcription_diarized_segment" require_relative "openai/models/audio/transcription_include" require_relative "openai/models/audio/transcription_segment" require_relative "openai/models/audio/transcription_stream_event" require_relative "openai/models/audio/transcription_text_delta_event" require_relative "openai/models/audio/transcription_text_done_event" +require_relative "openai/models/audio/transcription_text_segment_event" require_relative "openai/models/audio/transcription_verbose" require_relative "openai/models/audio/transcription_word" require_relative "openai/models/audio/translation" diff --git a/lib/openai/internal/util.rb b/lib/openai/internal/util.rb index aea3e450..42956986 100644 --- a/lib/openai/internal/util.rb +++ b/lib/openai/internal/util.rb @@ -346,8 +346,9 @@ def join_parsed_uri(lhs, rhs) base_path, base_query = lhs.fetch_values(:path, :query) slashed = base_path.end_with?("/") ? base_path : "#{base_path}/" - parsed_path, parsed_query = parse_uri(rhs.fetch(:path)).fetch_values(:path, :query) - override = URI::Generic.build(**rhs.slice(:scheme, :host, :port), path: parsed_path) + merged = {**parse_uri(rhs.fetch(:path)), **rhs.except(:path, :query)} + parsed_path, parsed_query = merged.fetch_values(:path, :query) + override = URI::Generic.build(**merged.slice(:scheme, :host, :port), path: parsed_path) joined = URI.join(URI::Generic.build(lhs.except(:path, :query)), slashed, override) query = deep_merge( @@ -473,10 +474,9 @@ class << self # @return [Enumerable] def writable_enum(&blk) Enumerator.new do |y| - buf = String.new y.define_singleton_method(:write) do - self << buf.replace(_1) - buf.bytesize + self << _1.dup + _1.bytesize end blk.call(y) diff --git a/lib/openai/models/audio/transcription_create_params.rb b/lib/openai/models/audio/transcription_create_params.rb index 2ad4984e..64067e24 100644 --- a/lib/openai/models/audio/transcription_create_params.rb +++ b/lib/openai/models/audio/transcription_create_params.rb @@ -19,8 +19,8 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel # @!attribute model # ID of the model to use. The options are `gpt-4o-transcribe`, - # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - # Whisper V2 model). + # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + # Whisper V2 model), and `gpt-4o-transcribe-diarize`. # # @return [String, Symbol, OpenAI::Models::AudioModel] required :model, union: -> { OpenAI::Audio::TranscriptionCreateParams::Model } @@ -30,6 +30,8 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel # first normalizes loudness and then uses voice activity detection (VAD) to choose # boundaries. `server_vad` object can be provided to tweak VAD detection # parameters manually. 
If unset, the audio is transcribed as a single block. + # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + # seconds. # # @return [Symbol, :auto, OpenAI::Models::Audio::TranscriptionCreateParams::ChunkingStrategy::VadConfig, nil] optional :chunking_strategy, @@ -41,11 +43,30 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel # return the log probabilities of the tokens in the response to understand the # model's confidence in the transcription. `logprobs` only works with # response_format set to `json` and only with the models `gpt-4o-transcribe` and - # `gpt-4o-mini-transcribe`. + # `gpt-4o-mini-transcribe`. This field is not supported when using + # `gpt-4o-transcribe-diarize`. # # @return [Array, nil] optional :include, -> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Audio::TranscriptionInclude] } + # @!attribute known_speaker_names + # Optional list of speaker names that correspond to the audio samples provided in + # `known_speaker_references[]`. Each entry should be a short identifier (for + # example `customer` or `agent`). Up to 4 speakers are supported. + # + # @return [Array, nil] + optional :known_speaker_names, OpenAI::Internal::Type::ArrayOf[String] + + # @!attribute known_speaker_references + # Optional list of audio samples (as + # [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + # that contain known speaker references matching `known_speaker_names[]`. Each + # sample must be between 2 and 10 seconds, and can use any of the same input audio + # formats supported by `file`. + # + # @return [Array, nil] + optional :known_speaker_references, OpenAI::Internal::Type::ArrayOf[String] + # @!attribute language # The language of the input audio. Supplying the input language in # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -58,15 +79,18 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel # An optional text to guide the model's style or continue a previous audio # segment. The # [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - # should match the audio language. + # should match the audio language. This field is not supported when using + # `gpt-4o-transcribe-diarize`. # # @return [String, nil] optional :prompt, String # @!attribute response_format # The format of the output, in one of these options: `json`, `text`, `srt`, - # `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - # the only supported format is `json`. + # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + # `gpt-4o-mini-transcribe`, the only supported format is `json`. For + # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + # `diarized_json`, with `diarized_json` required to receive speaker annotations. # # @return [Symbol, OpenAI::Models::AudioResponseFormat, nil] optional :response_format, enum: -> { OpenAI::AudioResponseFormat } @@ -86,13 +110,14 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel # `response_format` must be set `verbose_json` to use timestamp granularities. # Either or both of these options are supported: `word`, or `segment`. Note: There # is no additional latency for segment timestamps, but generating word timestamps - # incurs additional latency. + # incurs additional latency. This option is not available for + # `gpt-4o-transcribe-diarize`. 
# # @return [Array, nil] optional :timestamp_granularities, -> { OpenAI::Internal::Type::ArrayOf[enum: OpenAI::Audio::TranscriptionCreateParams::TimestampGranularity] } - # @!method initialize(file:, model:, chunking_strategy: nil, include: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {}) + # @!method initialize(file:, model:, chunking_strategy: nil, include: nil, known_speaker_names: nil, known_speaker_references: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {}) # Some parameter documentations has been truncated, see # {OpenAI::Models::Audio::TranscriptionCreateParams} for more details. # @@ -104,6 +129,10 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel # # @param include [Array] Additional information to include in the transcription response. # + # @param known_speaker_names [Array] Optional list of speaker names that correspond to the audio samples provided in + # + # @param known_speaker_references [Array] Optional list of audio samples (as [data URLs](https://developer.mozilla.org/en- + # # @param language [String] The language of the input audio. Supplying the input language in [ISO-639-1](htt # # @param prompt [String] An optional text to guide the model's style or continue a previous audio segment @@ -117,14 +146,14 @@ class TranscriptionCreateParams < OpenAI::Internal::Type::BaseModel # @param request_options [OpenAI::RequestOptions, Hash{Symbol=>Object}] # ID of the model to use. The options are `gpt-4o-transcribe`, - # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - # Whisper V2 model). + # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + # Whisper V2 model), and `gpt-4o-transcribe-diarize`. module Model extend OpenAI::Internal::Type::Union variant String - # ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source Whisper V2 model). + # ID of the model to use. The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source Whisper V2 model), and `gpt-4o-transcribe-diarize`. variant enum: -> { OpenAI::AudioModel } # @!method self.variants @@ -135,6 +164,8 @@ module Model # first normalizes loudness and then uses voice activity detection (VAD) to choose # boundaries. `server_vad` object can be provided to tweak VAD detection # parameters manually. If unset, the audio is transcribed as a single block. + # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + # seconds. module ChunkingStrategy extend OpenAI::Internal::Type::Union diff --git a/lib/openai/models/audio/transcription_create_response.rb b/lib/openai/models/audio/transcription_create_response.rb index 0bbe16b7..976de082 100644 --- a/lib/openai/models/audio/transcription_create_response.rb +++ b/lib/openai/models/audio/transcription_create_response.rb @@ -15,11 +15,14 @@ module TranscriptionCreateResponse # Represents a transcription response returned by model, based on the provided input. variant -> { OpenAI::Audio::Transcription } + # Represents a diarized transcription response returned by the model, including the combined transcript and speaker-segment annotations. + variant -> { OpenAI::Audio::TranscriptionDiarized } + # Represents a verbose json transcription response returned by model, based on the provided input. 
variant -> { OpenAI::Audio::TranscriptionVerbose } # @!method self.variants - # @return [Array(OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionVerbose)] + # @return [Array(OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionDiarized, OpenAI::Models::Audio::TranscriptionVerbose)] end end end diff --git a/lib/openai/models/audio/transcription_diarized.rb b/lib/openai/models/audio/transcription_diarized.rb new file mode 100644 index 00000000..2a310312 --- /dev/null +++ b/lib/openai/models/audio/transcription_diarized.rb @@ -0,0 +1,160 @@ +# frozen_string_literal: true + +module OpenAI + module Models + module Audio + class TranscriptionDiarized < OpenAI::Internal::Type::BaseModel + # @!attribute duration + # Duration of the input audio in seconds. + # + # @return [Float] + required :duration, Float + + # @!attribute segments + # Segments of the transcript annotated with timestamps and speaker labels. + # + # @return [Array] + required :segments, -> { OpenAI::Internal::Type::ArrayOf[OpenAI::Audio::TranscriptionDiarizedSegment] } + + # @!attribute task + # The type of task that was run. Always `transcribe`. + # + # @return [Symbol, :transcribe] + required :task, const: :transcribe + + # @!attribute text + # The concatenated transcript text for the entire audio input. + # + # @return [String] + required :text, String + + # @!attribute usage + # Token or duration usage statistics for the request. + # + # @return [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration, nil] + optional :usage, union: -> { OpenAI::Audio::TranscriptionDiarized::Usage } + + # @!method initialize(duration:, segments:, text:, usage: nil, task: :transcribe) + # Represents a diarized transcription response returned by the model, including + # the combined transcript and speaker-segment annotations. + # + # @param duration [Float] Duration of the input audio in seconds. + # + # @param segments [Array] Segments of the transcript annotated with timestamps and speaker labels. + # + # @param text [String] The concatenated transcript text for the entire audio input. + # + # @param usage [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration] Token or duration usage statistics for the request. + # + # @param task [Symbol, :transcribe] The type of task that was run. Always `transcribe`. + + # Token or duration usage statistics for the request. + # + # @see OpenAI::Models::Audio::TranscriptionDiarized#usage + module Usage + extend OpenAI::Internal::Type::Union + + discriminator :type + + # Usage statistics for models billed by token usage. + variant :tokens, -> { OpenAI::Audio::TranscriptionDiarized::Usage::Tokens } + + # Usage statistics for models billed by audio input duration. + variant :duration, -> { OpenAI::Audio::TranscriptionDiarized::Usage::Duration } + + class Tokens < OpenAI::Internal::Type::BaseModel + # @!attribute input_tokens + # Number of input tokens billed for this request. + # + # @return [Integer] + required :input_tokens, Integer + + # @!attribute output_tokens + # Number of output tokens generated. + # + # @return [Integer] + required :output_tokens, Integer + + # @!attribute total_tokens + # Total number of tokens used (input + output). + # + # @return [Integer] + required :total_tokens, Integer + + # @!attribute type + # The type of the usage object. Always `tokens` for this variant. 
+ # + # @return [Symbol, :tokens] + required :type, const: :tokens + + # @!attribute input_token_details + # Details about the input tokens billed for this request. + # + # @return [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails, nil] + optional :input_token_details, + -> { OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails } + + # @!method initialize(input_tokens:, output_tokens:, total_tokens:, input_token_details: nil, type: :tokens) + # Usage statistics for models billed by token usage. + # + # @param input_tokens [Integer] Number of input tokens billed for this request. + # + # @param output_tokens [Integer] Number of output tokens generated. + # + # @param total_tokens [Integer] Total number of tokens used (input + output). + # + # @param input_token_details [OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails] Details about the input tokens billed for this request. + # + # @param type [Symbol, :tokens] The type of the usage object. Always `tokens` for this variant. + + # @see OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens#input_token_details + class InputTokenDetails < OpenAI::Internal::Type::BaseModel + # @!attribute audio_tokens + # Number of audio tokens billed for this request. + # + # @return [Integer, nil] + optional :audio_tokens, Integer + + # @!attribute text_tokens + # Number of text tokens billed for this request. + # + # @return [Integer, nil] + optional :text_tokens, Integer + + # @!method initialize(audio_tokens: nil, text_tokens: nil) + # Details about the input tokens billed for this request. + # + # @param audio_tokens [Integer] Number of audio tokens billed for this request. + # + # @param text_tokens [Integer] Number of text tokens billed for this request. + end + end + + class Duration < OpenAI::Internal::Type::BaseModel + # @!attribute seconds + # Duration of the input audio in seconds. + # + # @return [Float] + required :seconds, Float + + # @!attribute type + # The type of the usage object. Always `duration` for this variant. + # + # @return [Symbol, :duration] + required :type, const: :duration + + # @!method initialize(seconds:, type: :duration) + # Usage statistics for models billed by audio input duration. + # + # @param seconds [Float] Duration of the input audio in seconds. + # + # @param type [Symbol, :duration] The type of the usage object. Always `duration` for this variant. + end + + # @!method self.variants + # @return [Array(OpenAI::Models::Audio::TranscriptionDiarized::Usage::Tokens, OpenAI::Models::Audio::TranscriptionDiarized::Usage::Duration)] + end + end + end + end +end diff --git a/lib/openai/models/audio/transcription_diarized_segment.rb b/lib/openai/models/audio/transcription_diarized_segment.rb new file mode 100644 index 00000000..bbdf721c --- /dev/null +++ b/lib/openai/models/audio/transcription_diarized_segment.rb @@ -0,0 +1,65 @@ +# frozen_string_literal: true + +module OpenAI + module Models + module Audio + class TranscriptionDiarizedSegment < OpenAI::Internal::Type::BaseModel + # @!attribute id + # Unique identifier for the segment. + # + # @return [String] + required :id, String + + # @!attribute end_ + # End timestamp of the segment in seconds. + # + # @return [Float] + required :end_, Float, api_name: :end + + # @!attribute speaker + # Speaker label for this segment. When known speakers are provided, the label + # matches `known_speaker_names[]`. Otherwise speakers are labeled sequentially + # using capital letters (`A`, `B`, ...). 
+ # + # @return [String] + required :speaker, String + + # @!attribute start + # Start timestamp of the segment in seconds. + # + # @return [Float] + required :start, Float + + # @!attribute text + # Transcript text for this segment. + # + # @return [String] + required :text, String + + # @!attribute type + # The type of the segment. Always `transcript.text.segment`. + # + # @return [Symbol, :"transcript.text.segment"] + required :type, const: :"transcript.text.segment" + + # @!method initialize(id:, end_:, speaker:, start:, text:, type: :"transcript.text.segment") + # Some parameter documentations has been truncated, see + # {OpenAI::Models::Audio::TranscriptionDiarizedSegment} for more details. + # + # A segment of diarized transcript text with speaker metadata. + # + # @param id [String] Unique identifier for the segment. + # + # @param end_ [Float] End timestamp of the segment in seconds. + # + # @param speaker [String] Speaker label for this segment. When known speakers are provided, the label matc + # + # @param start [Float] Start timestamp of the segment in seconds. + # + # @param text [String] Transcript text for this segment. + # + # @param type [Symbol, :"transcript.text.segment"] The type of the segment. Always `transcript.text.segment`. + end + end + end +end diff --git a/lib/openai/models/audio/transcription_stream_event.rb b/lib/openai/models/audio/transcription_stream_event.rb index 2112080e..c34ec07a 100644 --- a/lib/openai/models/audio/transcription_stream_event.rb +++ b/lib/openai/models/audio/transcription_stream_event.rb @@ -3,15 +3,18 @@ module OpenAI module Models module Audio - # Emitted when there is an additional text delta. This is also the first event - # emitted when the transcription starts. Only emitted when you + # Emitted when a diarized transcription returns a completed segment with speaker + # information. Only emitted when you # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) - # with the `Stream` parameter set to `true`. + # with `stream` set to `true` and `response_format` set to `diarized_json`. module TranscriptionStreamEvent extend OpenAI::Internal::Type::Union discriminator :type + # Emitted when a diarized transcription returns a completed segment with speaker information. Only emitted when you [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) with `stream` set to `true` and `response_format` set to `diarized_json`. + variant :"transcript.text.segment", -> { OpenAI::Audio::TranscriptionTextSegmentEvent } + # Emitted when there is an additional text delta. This is also the first event emitted when the transcription starts. Only emitted when you [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) with the `Stream` parameter set to `true`. 
variant :"transcript.text.delta", -> { OpenAI::Audio::TranscriptionTextDeltaEvent } @@ -19,7 +22,7 @@ module TranscriptionStreamEvent variant :"transcript.text.done", -> { OpenAI::Audio::TranscriptionTextDoneEvent } # @!method self.variants - # @return [Array(OpenAI::Models::Audio::TranscriptionTextDeltaEvent, OpenAI::Models::Audio::TranscriptionTextDoneEvent)] + # @return [Array(OpenAI::Models::Audio::TranscriptionTextSegmentEvent, OpenAI::Models::Audio::TranscriptionTextDeltaEvent, OpenAI::Models::Audio::TranscriptionTextDoneEvent)] end end end diff --git a/lib/openai/models/audio/transcription_text_delta_event.rb b/lib/openai/models/audio/transcription_text_delta_event.rb index 4c54ea63..eab82f66 100644 --- a/lib/openai/models/audio/transcription_text_delta_event.rb +++ b/lib/openai/models/audio/transcription_text_delta_event.rb @@ -25,7 +25,14 @@ class TranscriptionTextDeltaEvent < OpenAI::Internal::Type::BaseModel optional :logprobs, -> { OpenAI::Internal::Type::ArrayOf[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob] } - # @!method initialize(delta:, logprobs: nil, type: :"transcript.text.delta") + # @!attribute segment_id + # Identifier of the diarized segment that this delta belongs to. Only present when + # using `gpt-4o-transcribe-diarize`. + # + # @return [String, nil] + optional :segment_id, String + + # @!method initialize(delta:, logprobs: nil, segment_id: nil, type: :"transcript.text.delta") # Some parameter documentations has been truncated, see # {OpenAI::Models::Audio::TranscriptionTextDeltaEvent} for more details. # @@ -38,6 +45,8 @@ class TranscriptionTextDeltaEvent < OpenAI::Internal::Type::BaseModel # # @param logprobs [Array] The log probabilities of the delta. Only included if you [create a transcription # + # @param segment_id [String] Identifier of the diarized segment that this delta belongs to. Only present when + # # @param type [Symbol, :"transcript.text.delta"] The type of the event. Always `transcript.text.delta`. class Logprob < OpenAI::Internal::Type::BaseModel diff --git a/lib/openai/models/audio/transcription_text_segment_event.rb b/lib/openai/models/audio/transcription_text_segment_event.rb new file mode 100644 index 00000000..289d69cc --- /dev/null +++ b/lib/openai/models/audio/transcription_text_segment_event.rb @@ -0,0 +1,63 @@ +# frozen_string_literal: true + +module OpenAI + module Models + module Audio + class TranscriptionTextSegmentEvent < OpenAI::Internal::Type::BaseModel + # @!attribute id + # Unique identifier for the segment. + # + # @return [String] + required :id, String + + # @!attribute end_ + # End timestamp of the segment in seconds. + # + # @return [Float] + required :end_, Float, api_name: :end + + # @!attribute speaker + # Speaker label for this segment. + # + # @return [String] + required :speaker, String + + # @!attribute start + # Start timestamp of the segment in seconds. + # + # @return [Float] + required :start, Float + + # @!attribute text + # Transcript text for this segment. + # + # @return [String] + required :text, String + + # @!attribute type + # The type of the event. Always `transcript.text.segment`. + # + # @return [Symbol, :"transcript.text.segment"] + required :type, const: :"transcript.text.segment" + + # @!method initialize(id:, end_:, speaker:, start:, text:, type: :"transcript.text.segment") + # Emitted when a diarized transcription returns a completed segment with speaker + # information. 
Only emitted when you + # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) + # with `stream` set to `true` and `response_format` set to `diarized_json`. + # + # @param id [String] Unique identifier for the segment. + # + # @param end_ [Float] End timestamp of the segment in seconds. + # + # @param speaker [String] Speaker label for this segment. + # + # @param start [Float] Start timestamp of the segment in seconds. + # + # @param text [String] Transcript text for this segment. + # + # @param type [Symbol, :"transcript.text.segment"] The type of the event. Always `transcript.text.segment`. + end + end + end +end diff --git a/lib/openai/models/audio_model.rb b/lib/openai/models/audio_model.rb index 8e0e194e..6fb437fd 100644 --- a/lib/openai/models/audio_model.rb +++ b/lib/openai/models/audio_model.rb @@ -8,6 +8,7 @@ module AudioModel WHISPER_1 = :"whisper-1" GPT_4O_TRANSCRIBE = :"gpt-4o-transcribe" GPT_4O_MINI_TRANSCRIBE = :"gpt-4o-mini-transcribe" + GPT_4O_TRANSCRIBE_DIARIZE = :"gpt-4o-transcribe-diarize" # @!method self.values # @return [Array] diff --git a/lib/openai/models/audio_response_format.rb b/lib/openai/models/audio_response_format.rb index 5644ca89..42adae70 100644 --- a/lib/openai/models/audio_response_format.rb +++ b/lib/openai/models/audio_response_format.rb @@ -3,8 +3,10 @@ module OpenAI module Models # The format of the output, in one of these options: `json`, `text`, `srt`, - # `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - # the only supported format is `json`. + # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + # `gpt-4o-mini-transcribe`, the only supported format is `json`. For + # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + # `diarized_json`, with `diarized_json` required to receive speaker annotations. module AudioResponseFormat extend OpenAI::Internal::Type::Enum @@ -13,6 +15,7 @@ module AudioResponseFormat SRT = :srt VERBOSE_JSON = :verbose_json VTT = :vtt + DIARIZED_JSON = :diarized_json # @!method self.values # @return [Array] diff --git a/lib/openai/models/realtime/audio_transcription.rb b/lib/openai/models/realtime/audio_transcription.rb index cf3d6698..2c333c4f 100644 --- a/lib/openai/models/realtime/audio_transcription.rb +++ b/lib/openai/models/realtime/audio_transcription.rb @@ -14,7 +14,8 @@ class AudioTranscription < OpenAI::Internal::Type::BaseModel # @!attribute model # The model to use for transcription. Current options are `whisper-1`, - # `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. + # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels. # # @return [Symbol, OpenAI::Models::Realtime::AudioTranscription::Model, nil] optional :model, enum: -> { OpenAI::Realtime::AudioTranscription::Model } @@ -23,8 +24,8 @@ class AudioTranscription < OpenAI::Internal::Type::BaseModel # An optional text to guide the model's style or continue a previous audio # segment. For `whisper-1`, the # [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - # For `gpt-4o-transcribe` models, the prompt is a free text string, for example - # "expect words related to technology". + # For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the + # prompt is a free text string, for example "expect words related to technology". 
# # @return [String, nil] optional :prompt, String @@ -35,21 +36,22 @@ class AudioTranscription < OpenAI::Internal::Type::BaseModel # # @param language [String] The language of the input audio. Supplying the input language in # - # @param model [Symbol, OpenAI::Models::Realtime::AudioTranscription::Model] The model to use for transcription. Current options are `whisper-1`, `gpt-4o-tra + # @param model [Symbol, OpenAI::Models::Realtime::AudioTranscription::Model] The model to use for transcription. Current options are `whisper-1`, `gpt-4o-min # # @param prompt [String] An optional text to guide the model's style or continue a previous audio # The model to use for transcription. Current options are `whisper-1`, - # `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. + # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels. # # @see OpenAI::Models::Realtime::AudioTranscription#model module Model extend OpenAI::Internal::Type::Enum WHISPER_1 = :"whisper-1" - GPT_4O_TRANSCRIBE_LATEST = :"gpt-4o-transcribe-latest" GPT_4O_MINI_TRANSCRIBE = :"gpt-4o-mini-transcribe" GPT_4O_TRANSCRIBE = :"gpt-4o-transcribe" + GPT_4O_TRANSCRIBE_DIARIZE = :"gpt-4o-transcribe-diarize" # @!method self.values # @return [Array] diff --git a/lib/openai/models/vector_store_create_params.rb b/lib/openai/models/vector_store_create_params.rb index dfe50418..57971237 100644 --- a/lib/openai/models/vector_store_create_params.rb +++ b/lib/openai/models/vector_store_create_params.rb @@ -14,6 +14,13 @@ class VectorStoreCreateParams < OpenAI::Internal::Type::BaseModel # @return [OpenAI::Models::AutoFileChunkingStrategyParam, OpenAI::Models::StaticFileChunkingStrategyObjectParam, nil] optional :chunking_strategy, union: -> { OpenAI::FileChunkingStrategyParam } + # @!attribute description + # A description for the vector store. Can be used to describe the vector store's + # purpose. + # + # @return [String, nil] + optional :description, String + # @!attribute expires_after # The expiration policy for a vector store. # @@ -45,12 +52,14 @@ class VectorStoreCreateParams < OpenAI::Internal::Type::BaseModel # @return [String, nil] optional :name, String - # @!method initialize(chunking_strategy: nil, expires_after: nil, file_ids: nil, metadata: nil, name: nil, request_options: {}) + # @!method initialize(chunking_strategy: nil, description: nil, expires_after: nil, file_ids: nil, metadata: nil, name: nil, request_options: {}) # Some parameter documentations has been truncated, see # {OpenAI::Models::VectorStoreCreateParams} for more details. # # @param chunking_strategy [OpenAI::Models::AutoFileChunkingStrategyParam, OpenAI::Models::StaticFileChunkingStrategyObjectParam] The chunking strategy used to chunk the file(s). If not set, will use the `auto` # + # @param description [String] A description for the vector store. Can be used to describe the vector store's p + # # @param expires_after [OpenAI::Models::VectorStoreCreateParams::ExpiresAfter] The expiration policy for a vector store. 
# # @param file_ids [Array] A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that diff --git a/lib/openai/resources/audio/transcriptions.rb b/lib/openai/resources/audio/transcriptions.rb index 45570d65..211ddd9a 100644 --- a/lib/openai/resources/audio/transcriptions.rb +++ b/lib/openai/resources/audio/transcriptions.rb @@ -12,7 +12,7 @@ class Transcriptions # # Transcribes audio into the input language. # - # @overload create(file:, model:, chunking_strategy: nil, include: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {}) + # @overload create(file:, model:, chunking_strategy: nil, include: nil, known_speaker_names: nil, known_speaker_references: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {}) # # @param file [Pathname, StringIO, IO, String, OpenAI::FilePart] The audio file object (not file name) to transcribe, in one of these formats: fl # @@ -22,6 +22,10 @@ class Transcriptions # # @param include [Array] Additional information to include in the transcription response. # + # @param known_speaker_names [Array] Optional list of speaker names that correspond to the audio samples provided in + # + # @param known_speaker_references [Array] Optional list of audio samples (as [data URLs](https://developer.mozilla.org/en- + # # @param language [String] The language of the input audio. Supplying the input language in [ISO-639-1](htt # # @param prompt [String] An optional text to guide the model's style or continue a previous audio segment @@ -34,7 +38,7 @@ class Transcriptions # # @param request_options [OpenAI::RequestOptions, Hash{Symbol=>Object}, nil] # - # @return [OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionVerbose] + # @return [OpenAI::Models::Audio::Transcription, OpenAI::Models::Audio::TranscriptionDiarized, OpenAI::Models::Audio::TranscriptionVerbose] # # @see OpenAI::Models::Audio::TranscriptionCreateParams def create(params) @@ -61,7 +65,7 @@ def create(params) # # Transcribes audio into the input language. # - # @overload create_streaming(file:, model:, chunking_strategy: nil, include: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {}) + # @overload create_streaming(file:, model:, chunking_strategy: nil, include: nil, known_speaker_names: nil, known_speaker_references: nil, language: nil, prompt: nil, response_format: nil, temperature: nil, timestamp_granularities: nil, request_options: {}) # # @param file [Pathname, StringIO, IO, String, OpenAI::FilePart] The audio file object (not file name) to transcribe, in one of these formats: fl # @@ -71,6 +75,10 @@ def create(params) # # @param include [Array] Additional information to include in the transcription response. # + # @param known_speaker_names [Array] Optional list of speaker names that correspond to the audio samples provided in + # + # @param known_speaker_references [Array] Optional list of audio samples (as [data URLs](https://developer.mozilla.org/en- + # # @param language [String] The language of the input audio. 
Supplying the input language in [ISO-639-1](htt # # @param prompt [String] An optional text to guide the model's style or continue a previous audio segment @@ -83,7 +91,7 @@ def create(params) # # @param request_options [OpenAI::RequestOptions, Hash{Symbol=>Object}, nil] # - # @return [OpenAI::Internal::Stream] + # @return [OpenAI::Internal::Stream] # # @see OpenAI::Models::Audio::TranscriptionCreateParams def create_streaming(params) diff --git a/lib/openai/resources/vector_stores.rb b/lib/openai/resources/vector_stores.rb index d903b9ef..c6800db8 100644 --- a/lib/openai/resources/vector_stores.rb +++ b/lib/openai/resources/vector_stores.rb @@ -14,10 +14,12 @@ class VectorStores # # Create a vector store. # - # @overload create(chunking_strategy: nil, expires_after: nil, file_ids: nil, metadata: nil, name: nil, request_options: {}) + # @overload create(chunking_strategy: nil, description: nil, expires_after: nil, file_ids: nil, metadata: nil, name: nil, request_options: {}) # # @param chunking_strategy [OpenAI::Models::AutoFileChunkingStrategyParam, OpenAI::Models::StaticFileChunkingStrategyObjectParam] The chunking strategy used to chunk the file(s). If not set, will use the `auto` # + # @param description [String] A description for the vector store. Can be used to describe the vector store's p + # # @param expires_after [OpenAI::Models::VectorStoreCreateParams::ExpiresAfter] The expiration policy for a vector store. # # @param file_ids [Array] A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that diff --git a/lib/openai/version.rb b/lib/openai/version.rb index 4b592eec..1368c8cb 100644 --- a/lib/openai/version.rb +++ b/lib/openai/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true module OpenAI - VERSION = "0.31.0" + VERSION = "0.32.0" end diff --git a/rbi/openai/models/audio/transcription_create_params.rbi b/rbi/openai/models/audio/transcription_create_params.rbi index c3dc13df..cb3c21cc 100644 --- a/rbi/openai/models/audio/transcription_create_params.rbi +++ b/rbi/openai/models/audio/transcription_create_params.rbi @@ -21,8 +21,8 @@ module OpenAI attr_accessor :file # ID of the model to use. The options are `gpt-4o-transcribe`, - # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - # Whisper V2 model). + # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + # Whisper V2 model), and `gpt-4o-transcribe-diarize`. sig { returns(T.any(String, OpenAI::AudioModel::OrSymbol)) } attr_accessor :model @@ -30,6 +30,8 @@ module OpenAI # first normalizes loudness and then uses voice activity detection (VAD) to choose # boundaries. `server_vad` object can be provided to tweak VAD detection # parameters manually. If unset, the audio is transcribed as a single block. + # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + # seconds. sig do returns( T.nilable( @@ -46,7 +48,8 @@ module OpenAI # return the log probabilities of the tokens in the response to understand the # model's confidence in the transcription. `logprobs` only works with # response_format set to `json` and only with the models `gpt-4o-transcribe` and - # `gpt-4o-mini-transcribe`. + # `gpt-4o-mini-transcribe`. This field is not supported when using + # `gpt-4o-transcribe-diarize`. 
sig do returns( T.nilable(T::Array[OpenAI::Audio::TranscriptionInclude::OrSymbol]) @@ -61,6 +64,26 @@ module OpenAI end attr_writer :include + # Optional list of speaker names that correspond to the audio samples provided in + # `known_speaker_references[]`. Each entry should be a short identifier (for + # example `customer` or `agent`). Up to 4 speakers are supported. + sig { returns(T.nilable(T::Array[String])) } + attr_reader :known_speaker_names + + sig { params(known_speaker_names: T::Array[String]).void } + attr_writer :known_speaker_names + + # Optional list of audio samples (as + # [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + # that contain known speaker references matching `known_speaker_names[]`. Each + # sample must be between 2 and 10 seconds, and can use any of the same input audio + # formats supported by `file`. + sig { returns(T.nilable(T::Array[String])) } + attr_reader :known_speaker_references + + sig { params(known_speaker_references: T::Array[String]).void } + attr_writer :known_speaker_references + # The language of the input audio. Supplying the input language in # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) # format will improve accuracy and latency. @@ -73,7 +96,8 @@ module OpenAI # An optional text to guide the model's style or continue a previous audio # segment. The # [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - # should match the audio language. + # should match the audio language. This field is not supported when using + # `gpt-4o-transcribe-diarize`. sig { returns(T.nilable(String)) } attr_reader :prompt @@ -81,8 +105,10 @@ module OpenAI attr_writer :prompt # The format of the output, in one of these options: `json`, `text`, `srt`, - # `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - # the only supported format is `json`. + # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + # `gpt-4o-mini-transcribe`, the only supported format is `json`. For + # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + # `diarized_json`, with `diarized_json` required to receive speaker annotations. sig { returns(T.nilable(OpenAI::AudioResponseFormat::OrSymbol)) } attr_reader :response_format @@ -106,7 +132,8 @@ module OpenAI # `response_format` must be set `verbose_json` to use timestamp granularities. # Either or both of these options are supported: `word`, or `segment`. Note: There # is no additional latency for segment timestamps, but generating word timestamps - # incurs additional latency. + # incurs additional latency. This option is not available for + # `gpt-4o-transcribe-diarize`. sig do returns( T.nilable( @@ -140,6 +167,8 @@ module OpenAI ) ), include: T::Array[OpenAI::Audio::TranscriptionInclude::OrSymbol], + known_speaker_names: T::Array[String], + known_speaker_references: T::Array[String], language: String, prompt: String, response_format: OpenAI::AudioResponseFormat::OrSymbol, @@ -156,20 +185,33 @@ module OpenAI # flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. file:, # ID of the model to use. The options are `gpt-4o-transcribe`, - # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - # Whisper V2 model). + # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + # Whisper V2 model), and `gpt-4o-transcribe-diarize`. model:, # Controls how the audio is cut into chunks. 
When set to `"auto"`, the server # first normalizes loudness and then uses voice activity detection (VAD) to choose # boundaries. `server_vad` object can be provided to tweak VAD detection # parameters manually. If unset, the audio is transcribed as a single block. + # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + # seconds. chunking_strategy: nil, # Additional information to include in the transcription response. `logprobs` will # return the log probabilities of the tokens in the response to understand the # model's confidence in the transcription. `logprobs` only works with # response_format set to `json` and only with the models `gpt-4o-transcribe` and - # `gpt-4o-mini-transcribe`. + # `gpt-4o-mini-transcribe`. This field is not supported when using + # `gpt-4o-transcribe-diarize`. include: nil, + # Optional list of speaker names that correspond to the audio samples provided in + # `known_speaker_references[]`. Each entry should be a short identifier (for + # example `customer` or `agent`). Up to 4 speakers are supported. + known_speaker_names: nil, + # Optional list of audio samples (as + # [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + # that contain known speaker references matching `known_speaker_names[]`. Each + # sample must be between 2 and 10 seconds, and can use any of the same input audio + # formats supported by `file`. + known_speaker_references: nil, # The language of the input audio. Supplying the input language in # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) # format will improve accuracy and latency. @@ -177,11 +219,14 @@ module OpenAI # An optional text to guide the model's style or continue a previous audio # segment. The # [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - # should match the audio language. + # should match the audio language. This field is not supported when using + # `gpt-4o-transcribe-diarize`. prompt: nil, # The format of the output, in one of these options: `json`, `text`, `srt`, - # `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - # the only supported format is `json`. + # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + # `gpt-4o-mini-transcribe`, the only supported format is `json`. For + # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + # `diarized_json`, with `diarized_json` required to receive speaker annotations. response_format: nil, # The sampling temperature, between 0 and 1. Higher values like 0.8 will make the # output more random, while lower values like 0.2 will make it more focused and @@ -193,7 +238,8 @@ module OpenAI # `response_format` must be set `verbose_json` to use timestamp granularities. # Either or both of these options are supported: `word`, or `segment`. Note: There # is no additional latency for segment timestamps, but generating word timestamps - # incurs additional latency. + # incurs additional latency. This option is not available for + # `gpt-4o-transcribe-diarize`. timestamp_granularities: nil, request_options: {} ) @@ -212,6 +258,8 @@ module OpenAI ) ), include: T::Array[OpenAI::Audio::TranscriptionInclude::OrSymbol], + known_speaker_names: T::Array[String], + known_speaker_references: T::Array[String], language: String, prompt: String, response_format: OpenAI::AudioResponseFormat::OrSymbol, @@ -228,8 +276,8 @@ module OpenAI end # ID of the model to use. 
The options are `gpt-4o-transcribe`, - # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - # Whisper V2 model). + # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + # Whisper V2 model), and `gpt-4o-transcribe-diarize`. module Model extend OpenAI::Internal::Type::Union @@ -251,6 +299,8 @@ module OpenAI # first normalizes loudness and then uses voice activity detection (VAD) to choose # boundaries. `server_vad` object can be provided to tweak VAD detection # parameters manually. If unset, the audio is transcribed as a single block. + # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + # seconds. module ChunkingStrategy extend OpenAI::Internal::Type::Union diff --git a/rbi/openai/models/audio/transcription_create_response.rbi b/rbi/openai/models/audio/transcription_create_response.rbi index a96f3c65..c6701e0c 100644 --- a/rbi/openai/models/audio/transcription_create_response.rbi +++ b/rbi/openai/models/audio/transcription_create_response.rbi @@ -12,6 +12,7 @@ module OpenAI T.type_alias do T.any( OpenAI::Audio::Transcription, + OpenAI::Audio::TranscriptionDiarized, OpenAI::Audio::TranscriptionVerbose ) end diff --git a/rbi/openai/models/audio/transcription_diarized.rbi b/rbi/openai/models/audio/transcription_diarized.rbi new file mode 100644 index 00000000..d72b331c --- /dev/null +++ b/rbi/openai/models/audio/transcription_diarized.rbi @@ -0,0 +1,281 @@ +# typed: strong + +module OpenAI + module Models + module Audio + class TranscriptionDiarized < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Audio::TranscriptionDiarized, + OpenAI::Internal::AnyHash + ) + end + + # Duration of the input audio in seconds. + sig { returns(Float) } + attr_accessor :duration + + # Segments of the transcript annotated with timestamps and speaker labels. + sig { returns(T::Array[OpenAI::Audio::TranscriptionDiarizedSegment]) } + attr_accessor :segments + + # The type of task that was run. Always `transcribe`. + sig { returns(Symbol) } + attr_accessor :task + + # The concatenated transcript text for the entire audio input. + sig { returns(String) } + attr_accessor :text + + # Token or duration usage statistics for the request. + sig do + returns( + T.nilable(OpenAI::Audio::TranscriptionDiarized::Usage::Variants) + ) + end + attr_reader :usage + + sig do + params( + usage: + T.any( + OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::OrHash, + OpenAI::Audio::TranscriptionDiarized::Usage::Duration::OrHash + ) + ).void + end + attr_writer :usage + + # Represents a diarized transcription response returned by the model, including + # the combined transcript and speaker-segment annotations. + sig do + params( + duration: Float, + segments: + T::Array[OpenAI::Audio::TranscriptionDiarizedSegment::OrHash], + text: String, + usage: + T.any( + OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::OrHash, + OpenAI::Audio::TranscriptionDiarized::Usage::Duration::OrHash + ), + task: Symbol + ).returns(T.attached_class) + end + def self.new( + # Duration of the input audio in seconds. + duration:, + # Segments of the transcript annotated with timestamps and speaker labels. + segments:, + # The concatenated transcript text for the entire audio input. + text:, + # Token or duration usage statistics for the request. + usage: nil, + # The type of task that was run. Always `transcribe`. 
+ task: :transcribe + ) + end + + sig do + override.returns( + { + duration: Float, + segments: T::Array[OpenAI::Audio::TranscriptionDiarizedSegment], + task: Symbol, + text: String, + usage: OpenAI::Audio::TranscriptionDiarized::Usage::Variants + } + ) + end + def to_hash + end + + # Token or duration usage statistics for the request. + module Usage + extend OpenAI::Internal::Type::Union + + Variants = + T.type_alias do + T.any( + OpenAI::Audio::TranscriptionDiarized::Usage::Tokens, + OpenAI::Audio::TranscriptionDiarized::Usage::Duration + ) + end + + class Tokens < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Audio::TranscriptionDiarized::Usage::Tokens, + OpenAI::Internal::AnyHash + ) + end + + # Number of input tokens billed for this request. + sig { returns(Integer) } + attr_accessor :input_tokens + + # Number of output tokens generated. + sig { returns(Integer) } + attr_accessor :output_tokens + + # Total number of tokens used (input + output). + sig { returns(Integer) } + attr_accessor :total_tokens + + # The type of the usage object. Always `tokens` for this variant. + sig { returns(Symbol) } + attr_accessor :type + + # Details about the input tokens billed for this request. + sig do + returns( + T.nilable( + OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails + ) + ) + end + attr_reader :input_token_details + + sig do + params( + input_token_details: + OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails::OrHash + ).void + end + attr_writer :input_token_details + + # Usage statistics for models billed by token usage. + sig do + params( + input_tokens: Integer, + output_tokens: Integer, + total_tokens: Integer, + input_token_details: + OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails::OrHash, + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Number of input tokens billed for this request. + input_tokens:, + # Number of output tokens generated. + output_tokens:, + # Total number of tokens used (input + output). + total_tokens:, + # Details about the input tokens billed for this request. + input_token_details: nil, + # The type of the usage object. Always `tokens` for this variant. + type: :tokens + ) + end + + sig do + override.returns( + { + input_tokens: Integer, + output_tokens: Integer, + total_tokens: Integer, + type: Symbol, + input_token_details: + OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails + } + ) + end + def to_hash + end + + class InputTokenDetails < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails, + OpenAI::Internal::AnyHash + ) + end + + # Number of audio tokens billed for this request. + sig { returns(T.nilable(Integer)) } + attr_reader :audio_tokens + + sig { params(audio_tokens: Integer).void } + attr_writer :audio_tokens + + # Number of text tokens billed for this request. + sig { returns(T.nilable(Integer)) } + attr_reader :text_tokens + + sig { params(text_tokens: Integer).void } + attr_writer :text_tokens + + # Details about the input tokens billed for this request. + sig do + params(audio_tokens: Integer, text_tokens: Integer).returns( + T.attached_class + ) + end + def self.new( + # Number of audio tokens billed for this request. + audio_tokens: nil, + # Number of text tokens billed for this request. 
+ text_tokens: nil + ) + end + + sig do + override.returns( + { audio_tokens: Integer, text_tokens: Integer } + ) + end + def to_hash + end + end + end + + class Duration < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Audio::TranscriptionDiarized::Usage::Duration, + OpenAI::Internal::AnyHash + ) + end + + # Duration of the input audio in seconds. + sig { returns(Float) } + attr_accessor :seconds + + # The type of the usage object. Always `duration` for this variant. + sig { returns(Symbol) } + attr_accessor :type + + # Usage statistics for models billed by audio input duration. + sig do + params(seconds: Float, type: Symbol).returns(T.attached_class) + end + def self.new( + # Duration of the input audio in seconds. + seconds:, + # The type of the usage object. Always `duration` for this variant. + type: :duration + ) + end + + sig { override.returns({ seconds: Float, type: Symbol }) } + def to_hash + end + end + + sig do + override.returns( + T::Array[OpenAI::Audio::TranscriptionDiarized::Usage::Variants] + ) + end + def self.variants + end + end + end + end + end +end diff --git a/rbi/openai/models/audio/transcription_diarized_segment.rbi b/rbi/openai/models/audio/transcription_diarized_segment.rbi new file mode 100644 index 00000000..e188e21c --- /dev/null +++ b/rbi/openai/models/audio/transcription_diarized_segment.rbi @@ -0,0 +1,87 @@ +# typed: strong + +module OpenAI + module Models + module Audio + class TranscriptionDiarizedSegment < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Audio::TranscriptionDiarizedSegment, + OpenAI::Internal::AnyHash + ) + end + + # Unique identifier for the segment. + sig { returns(String) } + attr_accessor :id + + # End timestamp of the segment in seconds. + sig { returns(Float) } + attr_accessor :end_ + + # Speaker label for this segment. When known speakers are provided, the label + # matches `known_speaker_names[]`. Otherwise speakers are labeled sequentially + # using capital letters (`A`, `B`, ...). + sig { returns(String) } + attr_accessor :speaker + + # Start timestamp of the segment in seconds. + sig { returns(Float) } + attr_accessor :start + + # Transcript text for this segment. + sig { returns(String) } + attr_accessor :text + + # The type of the segment. Always `transcript.text.segment`. + sig { returns(Symbol) } + attr_accessor :type + + # A segment of diarized transcript text with speaker metadata. + sig do + params( + id: String, + end_: Float, + speaker: String, + start: Float, + text: String, + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Unique identifier for the segment. + id:, + # End timestamp of the segment in seconds. + end_:, + # Speaker label for this segment. When known speakers are provided, the label + # matches `known_speaker_names[]`. Otherwise speakers are labeled sequentially + # using capital letters (`A`, `B`, ...). + speaker:, + # Start timestamp of the segment in seconds. + start:, + # Transcript text for this segment. + text:, + # The type of the segment. Always `transcript.text.segment`. 
+ type: :"transcript.text.segment" + ) + end + + sig do + override.returns( + { + id: String, + end_: Float, + speaker: String, + start: Float, + text: String, + type: Symbol + } + ) + end + def to_hash + end + end + end + end +end diff --git a/rbi/openai/models/audio/transcription_stream_event.rbi b/rbi/openai/models/audio/transcription_stream_event.rbi index 7c5989b6..2f7d2eb7 100644 --- a/rbi/openai/models/audio/transcription_stream_event.rbi +++ b/rbi/openai/models/audio/transcription_stream_event.rbi @@ -3,16 +3,17 @@ module OpenAI module Models module Audio - # Emitted when there is an additional text delta. This is also the first event - # emitted when the transcription starts. Only emitted when you + # Emitted when a diarized transcription returns a completed segment with speaker + # information. Only emitted when you # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) - # with the `Stream` parameter set to `true`. + # with `stream` set to `true` and `response_format` set to `diarized_json`. module TranscriptionStreamEvent extend OpenAI::Internal::Type::Union Variants = T.type_alias do T.any( + OpenAI::Audio::TranscriptionTextSegmentEvent, OpenAI::Audio::TranscriptionTextDeltaEvent, OpenAI::Audio::TranscriptionTextDoneEvent ) diff --git a/rbi/openai/models/audio/transcription_text_delta_event.rbi b/rbi/openai/models/audio/transcription_text_delta_event.rbi index d8707c86..1131e67a 100644 --- a/rbi/openai/models/audio/transcription_text_delta_event.rbi +++ b/rbi/openai/models/audio/transcription_text_delta_event.rbi @@ -42,6 +42,14 @@ module OpenAI end attr_writer :logprobs + # Identifier of the diarized segment that this delta belongs to. Only present when + # using `gpt-4o-transcribe-diarize`. + sig { returns(T.nilable(String)) } + attr_reader :segment_id + + sig { params(segment_id: String).void } + attr_writer :segment_id + # Emitted when there is an additional text delta. This is also the first event # emitted when the transcription starts. Only emitted when you # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) @@ -53,6 +61,7 @@ module OpenAI T::Array[ OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob::OrHash ], + segment_id: String, type: Symbol ).returns(T.attached_class) end @@ -63,6 +72,9 @@ module OpenAI # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) # with the `include[]` parameter set to `logprobs`. logprobs: nil, + # Identifier of the diarized segment that this delta belongs to. Only present when + # using `gpt-4o-transcribe-diarize`. + segment_id: nil, # The type of the event. Always `transcript.text.delta`. 
type: :"transcript.text.delta" ) @@ -74,7 +86,8 @@ module OpenAI delta: String, type: Symbol, logprobs: - T::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob] + T::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob], + segment_id: String } ) end diff --git a/rbi/openai/models/audio/transcription_text_segment_event.rbi b/rbi/openai/models/audio/transcription_text_segment_event.rbi new file mode 100644 index 00000000..28c532a9 --- /dev/null +++ b/rbi/openai/models/audio/transcription_text_segment_event.rbi @@ -0,0 +1,86 @@ +# typed: strong + +module OpenAI + module Models + module Audio + class TranscriptionTextSegmentEvent < OpenAI::Internal::Type::BaseModel + OrHash = + T.type_alias do + T.any( + OpenAI::Audio::TranscriptionTextSegmentEvent, + OpenAI::Internal::AnyHash + ) + end + + # Unique identifier for the segment. + sig { returns(String) } + attr_accessor :id + + # End timestamp of the segment in seconds. + sig { returns(Float) } + attr_accessor :end_ + + # Speaker label for this segment. + sig { returns(String) } + attr_accessor :speaker + + # Start timestamp of the segment in seconds. + sig { returns(Float) } + attr_accessor :start + + # Transcript text for this segment. + sig { returns(String) } + attr_accessor :text + + # The type of the event. Always `transcript.text.segment`. + sig { returns(Symbol) } + attr_accessor :type + + # Emitted when a diarized transcription returns a completed segment with speaker + # information. Only emitted when you + # [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) + # with `stream` set to `true` and `response_format` set to `diarized_json`. + sig do + params( + id: String, + end_: Float, + speaker: String, + start: Float, + text: String, + type: Symbol + ).returns(T.attached_class) + end + def self.new( + # Unique identifier for the segment. + id:, + # End timestamp of the segment in seconds. + end_:, + # Speaker label for this segment. + speaker:, + # Start timestamp of the segment in seconds. + start:, + # Transcript text for this segment. + text:, + # The type of the event. Always `transcript.text.segment`. + type: :"transcript.text.segment" + ) + end + + sig do + override.returns( + { + id: String, + end_: Float, + speaker: String, + start: Float, + text: String, + type: Symbol + } + ) + end + def to_hash + end + end + end + end +end diff --git a/rbi/openai/models/audio_model.rbi b/rbi/openai/models/audio_model.rbi index 3f22719a..9feeafe9 100644 --- a/rbi/openai/models/audio_model.rbi +++ b/rbi/openai/models/audio_model.rbi @@ -13,6 +13,8 @@ module OpenAI T.let(:"gpt-4o-transcribe", OpenAI::AudioModel::TaggedSymbol) GPT_4O_MINI_TRANSCRIBE = T.let(:"gpt-4o-mini-transcribe", OpenAI::AudioModel::TaggedSymbol) + GPT_4O_TRANSCRIBE_DIARIZE = + T.let(:"gpt-4o-transcribe-diarize", OpenAI::AudioModel::TaggedSymbol) sig { override.returns(T::Array[OpenAI::AudioModel::TaggedSymbol]) } def self.values diff --git a/rbi/openai/models/audio_response_format.rbi b/rbi/openai/models/audio_response_format.rbi index 4afcf558..50a0329b 100644 --- a/rbi/openai/models/audio_response_format.rbi +++ b/rbi/openai/models/audio_response_format.rbi @@ -3,8 +3,10 @@ module OpenAI module Models # The format of the output, in one of these options: `json`, `text`, `srt`, - # `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - # the only supported format is `json`. + # `verbose_json`, `vtt`, or `diarized_json`. 
For `gpt-4o-transcribe` and + # `gpt-4o-mini-transcribe`, the only supported format is `json`. For + # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + # `diarized_json`, with `diarized_json` required to receive speaker annotations. module AudioResponseFormat extend OpenAI::Internal::Type::Enum @@ -17,6 +19,8 @@ module OpenAI VERBOSE_JSON = T.let(:verbose_json, OpenAI::AudioResponseFormat::TaggedSymbol) VTT = T.let(:vtt, OpenAI::AudioResponseFormat::TaggedSymbol) + DIARIZED_JSON = + T.let(:diarized_json, OpenAI::AudioResponseFormat::TaggedSymbol) sig do override.returns(T::Array[OpenAI::AudioResponseFormat::TaggedSymbol]) diff --git a/rbi/openai/models/realtime/audio_transcription.rbi b/rbi/openai/models/realtime/audio_transcription.rbi index d31bc86e..3a3fe551 100644 --- a/rbi/openai/models/realtime/audio_transcription.rbi +++ b/rbi/openai/models/realtime/audio_transcription.rbi @@ -22,7 +22,8 @@ module OpenAI attr_writer :language # The model to use for transcription. Current options are `whisper-1`, - # `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. + # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels. sig do returns( T.nilable(OpenAI::Realtime::AudioTranscription::Model::OrSymbol) @@ -40,8 +41,8 @@ module OpenAI # An optional text to guide the model's style or continue a previous audio # segment. For `whisper-1`, the # [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - # For `gpt-4o-transcribe` models, the prompt is a free text string, for example - # "expect words related to technology". + # For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the + # prompt is a free text string, for example "expect words related to technology". sig { returns(T.nilable(String)) } attr_reader :prompt @@ -61,13 +62,14 @@ module OpenAI # format will improve accuracy and latency. language: nil, # The model to use for transcription. Current options are `whisper-1`, - # `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. + # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels. model: nil, # An optional text to guide the model's style or continue a previous audio # segment. For `whisper-1`, the # [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - # For `gpt-4o-transcribe` models, the prompt is a free text string, for example - # "expect words related to technology". + # For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the + # prompt is a free text string, for example "expect words related to technology". prompt: nil ) end @@ -85,7 +87,8 @@ module OpenAI end # The model to use for transcription. Current options are `whisper-1`, - # `gpt-4o-transcribe-latest`, `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + # `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, and `gpt-4o-transcribe-diarize`. + # Use `gpt-4o-transcribe-diarize` when you need diarization with speaker labels. 
module Model extend OpenAI::Internal::Type::Enum @@ -100,11 +103,6 @@ module OpenAI :"whisper-1", OpenAI::Realtime::AudioTranscription::Model::TaggedSymbol ) - GPT_4O_TRANSCRIBE_LATEST = - T.let( - :"gpt-4o-transcribe-latest", - OpenAI::Realtime::AudioTranscription::Model::TaggedSymbol - ) GPT_4O_MINI_TRANSCRIBE = T.let( :"gpt-4o-mini-transcribe", @@ -115,6 +113,11 @@ module OpenAI :"gpt-4o-transcribe", OpenAI::Realtime::AudioTranscription::Model::TaggedSymbol ) + GPT_4O_TRANSCRIBE_DIARIZE = + T.let( + :"gpt-4o-transcribe-diarize", + OpenAI::Realtime::AudioTranscription::Model::TaggedSymbol + ) sig do override.returns( diff --git a/rbi/openai/models/vector_store_create_params.rbi b/rbi/openai/models/vector_store_create_params.rbi index 674fc93d..e8017f4d 100644 --- a/rbi/openai/models/vector_store_create_params.rbi +++ b/rbi/openai/models/vector_store_create_params.rbi @@ -36,6 +36,14 @@ module OpenAI end attr_writer :chunking_strategy + # A description for the vector store. Can be used to describe the vector store's + # purpose. + sig { returns(T.nilable(String)) } + attr_reader :description + + sig { params(description: String).void } + attr_writer :description + # The expiration policy for a vector store. sig { returns(T.nilable(OpenAI::VectorStoreCreateParams::ExpiresAfter)) } attr_reader :expires_after @@ -79,6 +87,7 @@ module OpenAI OpenAI::AutoFileChunkingStrategyParam::OrHash, OpenAI::StaticFileChunkingStrategyObjectParam::OrHash ), + description: String, expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter::OrHash, file_ids: T::Array[String], metadata: T.nilable(T::Hash[Symbol, String]), @@ -90,6 +99,9 @@ module OpenAI # The chunking strategy used to chunk the file(s). If not set, will use the `auto` # strategy. Only applicable if `file_ids` is non-empty. chunking_strategy: nil, + # A description for the vector store. Can be used to describe the vector store's + # purpose. + description: nil, # The expiration policy for a vector store. expires_after: nil, # A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that @@ -117,6 +129,7 @@ module OpenAI OpenAI::AutoFileChunkingStrategyParam, OpenAI::StaticFileChunkingStrategyObjectParam ), + description: String, expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter, file_ids: T::Array[String], metadata: T.nilable(T::Hash[Symbol, String]), diff --git a/rbi/openai/resources/audio/transcriptions.rbi b/rbi/openai/resources/audio/transcriptions.rbi index 187218b0..ac2598d2 100644 --- a/rbi/openai/resources/audio/transcriptions.rbi +++ b/rbi/openai/resources/audio/transcriptions.rbi @@ -20,6 +20,8 @@ module OpenAI ) ), include: T::Array[OpenAI::Audio::TranscriptionInclude::OrSymbol], + known_speaker_names: T::Array[String], + known_speaker_references: T::Array[String], language: String, prompt: String, response_format: OpenAI::AudioResponseFormat::OrSymbol, @@ -39,20 +41,33 @@ module OpenAI # flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. file:, # ID of the model to use. The options are `gpt-4o-transcribe`, - # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - # Whisper V2 model). + # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + # Whisper V2 model), and `gpt-4o-transcribe-diarize`. model:, # Controls how the audio is cut into chunks. When set to `"auto"`, the server # first normalizes loudness and then uses voice activity detection (VAD) to choose # boundaries. 
`server_vad` object can be provided to tweak VAD detection # parameters manually. If unset, the audio is transcribed as a single block. + # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + # seconds. chunking_strategy: nil, # Additional information to include in the transcription response. `logprobs` will # return the log probabilities of the tokens in the response to understand the # model's confidence in the transcription. `logprobs` only works with # response_format set to `json` and only with the models `gpt-4o-transcribe` and - # `gpt-4o-mini-transcribe`. + # `gpt-4o-mini-transcribe`. This field is not supported when using + # `gpt-4o-transcribe-diarize`. include: nil, + # Optional list of speaker names that correspond to the audio samples provided in + # `known_speaker_references[]`. Each entry should be a short identifier (for + # example `customer` or `agent`). Up to 4 speakers are supported. + known_speaker_names: nil, + # Optional list of audio samples (as + # [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + # that contain known speaker references matching `known_speaker_names[]`. Each + # sample must be between 2 and 10 seconds, and can use any of the same input audio + # formats supported by `file`. + known_speaker_references: nil, # The language of the input audio. Supplying the input language in # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) # format will improve accuracy and latency. @@ -60,11 +75,14 @@ module OpenAI # An optional text to guide the model's style or continue a previous audio # segment. The # [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - # should match the audio language. + # should match the audio language. This field is not supported when using + # `gpt-4o-transcribe-diarize`. prompt: nil, # The format of the output, in one of these options: `json`, `text`, `srt`, - # `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - # the only supported format is `json`. + # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + # `gpt-4o-mini-transcribe`, the only supported format is `json`. For + # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + # `diarized_json`, with `diarized_json` required to receive speaker annotations. response_format: nil, # The sampling temperature, between 0 and 1. Higher values like 0.8 will make the # output more random, while lower values like 0.2 will make it more focused and @@ -76,7 +94,8 @@ module OpenAI # `response_format` must be set `verbose_json` to use timestamp granularities. # Either or both of these options are supported: `word`, or `segment`. Note: There # is no additional latency for segment timestamps, but generating word timestamps - # incurs additional latency. + # incurs additional latency. This option is not available for + # `gpt-4o-transcribe-diarize`. timestamp_granularities: nil, # There is no need to provide `stream:`. Instead, use `#create_streaming` or # `#create` for streaming and non-streaming use cases, respectively. @@ -101,6 +120,8 @@ module OpenAI ) ), include: T::Array[OpenAI::Audio::TranscriptionInclude::OrSymbol], + known_speaker_names: T::Array[String], + known_speaker_references: T::Array[String], language: String, prompt: String, response_format: OpenAI::AudioResponseFormat::OrSymbol, @@ -122,20 +143,33 @@ module OpenAI # flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. 
file:, # ID of the model to use. The options are `gpt-4o-transcribe`, - # `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - # Whisper V2 model). + # `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + # Whisper V2 model), and `gpt-4o-transcribe-diarize`. model:, # Controls how the audio is cut into chunks. When set to `"auto"`, the server # first normalizes loudness and then uses voice activity detection (VAD) to choose # boundaries. `server_vad` object can be provided to tweak VAD detection # parameters manually. If unset, the audio is transcribed as a single block. + # Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + # seconds. chunking_strategy: nil, # Additional information to include in the transcription response. `logprobs` will # return the log probabilities of the tokens in the response to understand the # model's confidence in the transcription. `logprobs` only works with # response_format set to `json` and only with the models `gpt-4o-transcribe` and - # `gpt-4o-mini-transcribe`. + # `gpt-4o-mini-transcribe`. This field is not supported when using + # `gpt-4o-transcribe-diarize`. include: nil, + # Optional list of speaker names that correspond to the audio samples provided in + # `known_speaker_references[]`. Each entry should be a short identifier (for + # example `customer` or `agent`). Up to 4 speakers are supported. + known_speaker_names: nil, + # Optional list of audio samples (as + # [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + # that contain known speaker references matching `known_speaker_names[]`. Each + # sample must be between 2 and 10 seconds, and can use any of the same input audio + # formats supported by `file`. + known_speaker_references: nil, # The language of the input audio. Supplying the input language in # [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) # format will improve accuracy and latency. @@ -143,11 +177,14 @@ module OpenAI # An optional text to guide the model's style or continue a previous audio # segment. The # [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - # should match the audio language. + # should match the audio language. This field is not supported when using + # `gpt-4o-transcribe-diarize`. prompt: nil, # The format of the output, in one of these options: `json`, `text`, `srt`, - # `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - # the only supported format is `json`. + # `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + # `gpt-4o-mini-transcribe`, the only supported format is `json`. For + # `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + # `diarized_json`, with `diarized_json` required to receive speaker annotations. response_format: nil, # The sampling temperature, between 0 and 1. Higher values like 0.8 will make the # output more random, while lower values like 0.2 will make it more focused and @@ -159,7 +196,8 @@ module OpenAI # `response_format` must be set `verbose_json` to use timestamp granularities. # Either or both of these options are supported: `word`, or `segment`. Note: There # is no additional latency for segment timestamps, but generating word timestamps - # incurs additional latency. + # incurs additional latency. This option is not available for + # `gpt-4o-transcribe-diarize`. timestamp_granularities: nil, # There is no need to provide `stream:`. 
Instead, use `#create_streaming` or # `#create` for streaming and non-streaming use cases, respectively. diff --git a/rbi/openai/resources/vector_stores.rbi b/rbi/openai/resources/vector_stores.rbi index 727abd63..aeeecdf3 100644 --- a/rbi/openai/resources/vector_stores.rbi +++ b/rbi/openai/resources/vector_stores.rbi @@ -17,6 +17,7 @@ module OpenAI OpenAI::AutoFileChunkingStrategyParam::OrHash, OpenAI::StaticFileChunkingStrategyObjectParam::OrHash ), + description: String, expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter::OrHash, file_ids: T::Array[String], metadata: T.nilable(T::Hash[Symbol, String]), @@ -28,6 +29,9 @@ module OpenAI # The chunking strategy used to chunk the file(s). If not set, will use the `auto` # strategy. Only applicable if `file_ids` is non-empty. chunking_strategy: nil, + # A description for the vector store. Can be used to describe the vector store's + # purpose. + description: nil, # The expiration policy for a vector store. expires_after: nil, # A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that diff --git a/sig/openai/models/audio/transcription_create_params.rbs b/sig/openai/models/audio/transcription_create_params.rbs index 28e08060..8ffeb30a 100644 --- a/sig/openai/models/audio/transcription_create_params.rbs +++ b/sig/openai/models/audio/transcription_create_params.rbs @@ -7,6 +7,8 @@ module OpenAI model: OpenAI::Models::Audio::TranscriptionCreateParams::model, chunking_strategy: OpenAI::Models::Audio::TranscriptionCreateParams::chunking_strategy?, include: ::Array[OpenAI::Models::Audio::transcription_include], + known_speaker_names: ::Array[String], + known_speaker_references: ::Array[String], language: String, prompt: String, response_format: OpenAI::Models::audio_response_format, @@ -31,6 +33,14 @@ module OpenAI ::Array[OpenAI::Models::Audio::transcription_include] ) -> ::Array[OpenAI::Models::Audio::transcription_include] + attr_reader known_speaker_names: ::Array[String]? + + def known_speaker_names=: (::Array[String]) -> ::Array[String] + + attr_reader known_speaker_references: ::Array[String]? + + def known_speaker_references=: (::Array[String]) -> ::Array[String] + attr_reader language: String? 
def language=: (String) -> String @@ -60,6 +70,8 @@ module OpenAI model: OpenAI::Models::Audio::TranscriptionCreateParams::model, ?chunking_strategy: OpenAI::Models::Audio::TranscriptionCreateParams::chunking_strategy?, ?include: ::Array[OpenAI::Models::Audio::transcription_include], + ?known_speaker_names: ::Array[String], + ?known_speaker_references: ::Array[String], ?language: String, ?prompt: String, ?response_format: OpenAI::Models::audio_response_format, @@ -73,6 +85,8 @@ module OpenAI model: OpenAI::Models::Audio::TranscriptionCreateParams::model, chunking_strategy: OpenAI::Models::Audio::TranscriptionCreateParams::chunking_strategy?, include: ::Array[OpenAI::Models::Audio::transcription_include], + known_speaker_names: ::Array[String], + known_speaker_references: ::Array[String], language: String, prompt: String, response_format: OpenAI::Models::audio_response_format, diff --git a/sig/openai/models/audio/transcription_create_response.rbs b/sig/openai/models/audio/transcription_create_response.rbs index 5e18958f..ece5072a 100644 --- a/sig/openai/models/audio/transcription_create_response.rbs +++ b/sig/openai/models/audio/transcription_create_response.rbs @@ -2,7 +2,9 @@ module OpenAI module Models module Audio type transcription_create_response = - OpenAI::Audio::Transcription | OpenAI::Audio::TranscriptionVerbose + OpenAI::Audio::Transcription + | OpenAI::Audio::TranscriptionDiarized + | OpenAI::Audio::TranscriptionVerbose module TranscriptionCreateResponse extend OpenAI::Internal::Type::Union diff --git a/sig/openai/models/audio/transcription_diarized.rbs b/sig/openai/models/audio/transcription_diarized.rbs new file mode 100644 index 00000000..3798ff0e --- /dev/null +++ b/sig/openai/models/audio/transcription_diarized.rbs @@ -0,0 +1,129 @@ +module OpenAI + module Models + module Audio + type transcription_diarized = + { + duration: Float, + segments: ::Array[OpenAI::Audio::TranscriptionDiarizedSegment], + task: :transcribe, + text: String, + usage: OpenAI::Models::Audio::TranscriptionDiarized::usage + } + + class TranscriptionDiarized < OpenAI::Internal::Type::BaseModel + attr_accessor duration: Float + + attr_accessor segments: ::Array[OpenAI::Audio::TranscriptionDiarizedSegment] + + attr_accessor task: :transcribe + + attr_accessor text: String + + attr_reader usage: OpenAI::Models::Audio::TranscriptionDiarized::usage? 
+ + def usage=: ( + OpenAI::Models::Audio::TranscriptionDiarized::usage + ) -> OpenAI::Models::Audio::TranscriptionDiarized::usage + + def initialize: ( + duration: Float, + segments: ::Array[OpenAI::Audio::TranscriptionDiarizedSegment], + text: String, + ?usage: OpenAI::Models::Audio::TranscriptionDiarized::usage, + ?task: :transcribe + ) -> void + + def to_hash: -> { + duration: Float, + segments: ::Array[OpenAI::Audio::TranscriptionDiarizedSegment], + task: :transcribe, + text: String, + usage: OpenAI::Models::Audio::TranscriptionDiarized::usage + } + + type usage = + OpenAI::Audio::TranscriptionDiarized::Usage::Tokens + | OpenAI::Audio::TranscriptionDiarized::Usage::Duration + + module Usage + extend OpenAI::Internal::Type::Union + + type tokens = + { + input_tokens: Integer, + output_tokens: Integer, + total_tokens: Integer, + type: :tokens, + input_token_details: OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails + } + + class Tokens < OpenAI::Internal::Type::BaseModel + attr_accessor input_tokens: Integer + + attr_accessor output_tokens: Integer + + attr_accessor total_tokens: Integer + + attr_accessor type: :tokens + + attr_reader input_token_details: OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails? + + def input_token_details=: ( + OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails + ) -> OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails + + def initialize: ( + input_tokens: Integer, + output_tokens: Integer, + total_tokens: Integer, + ?input_token_details: OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails, + ?type: :tokens + ) -> void + + def to_hash: -> { + input_tokens: Integer, + output_tokens: Integer, + total_tokens: Integer, + type: :tokens, + input_token_details: OpenAI::Audio::TranscriptionDiarized::Usage::Tokens::InputTokenDetails + } + + type input_token_details = + { audio_tokens: Integer, text_tokens: Integer } + + class InputTokenDetails < OpenAI::Internal::Type::BaseModel + attr_reader audio_tokens: Integer? + + def audio_tokens=: (Integer) -> Integer + + attr_reader text_tokens: Integer? 
+ + def text_tokens=: (Integer) -> Integer + + def initialize: ( + ?audio_tokens: Integer, + ?text_tokens: Integer + ) -> void + + def to_hash: -> { audio_tokens: Integer, text_tokens: Integer } + end + end + + type duration = { seconds: Float, type: :duration } + + class Duration < OpenAI::Internal::Type::BaseModel + attr_accessor seconds: Float + + attr_accessor type: :duration + + def initialize: (seconds: Float, ?type: :duration) -> void + + def to_hash: -> { seconds: Float, type: :duration } + end + + def self?.variants: -> ::Array[OpenAI::Models::Audio::TranscriptionDiarized::usage] + end + end + end + end +end diff --git a/sig/openai/models/audio/transcription_diarized_segment.rbs b/sig/openai/models/audio/transcription_diarized_segment.rbs new file mode 100644 index 00000000..465d7e17 --- /dev/null +++ b/sig/openai/models/audio/transcription_diarized_segment.rbs @@ -0,0 +1,47 @@ +module OpenAI + module Models + module Audio + type transcription_diarized_segment = + { + id: String, + end_: Float, + speaker: String, + start: Float, + text: String, + type: :"transcript.text.segment" + } + + class TranscriptionDiarizedSegment < OpenAI::Internal::Type::BaseModel + attr_accessor id: String + + attr_accessor end_: Float + + attr_accessor speaker: String + + attr_accessor start: Float + + attr_accessor text: String + + attr_accessor type: :"transcript.text.segment" + + def initialize: ( + id: String, + end_: Float, + speaker: String, + start: Float, + text: String, + ?type: :"transcript.text.segment" + ) -> void + + def to_hash: -> { + id: String, + end_: Float, + speaker: String, + start: Float, + text: String, + type: :"transcript.text.segment" + } + end + end + end +end diff --git a/sig/openai/models/audio/transcription_stream_event.rbs b/sig/openai/models/audio/transcription_stream_event.rbs index f6c55919..609ad934 100644 --- a/sig/openai/models/audio/transcription_stream_event.rbs +++ b/sig/openai/models/audio/transcription_stream_event.rbs @@ -2,7 +2,8 @@ module OpenAI module Models module Audio type transcription_stream_event = - OpenAI::Audio::TranscriptionTextDeltaEvent + OpenAI::Audio::TranscriptionTextSegmentEvent + | OpenAI::Audio::TranscriptionTextDeltaEvent | OpenAI::Audio::TranscriptionTextDoneEvent module TranscriptionStreamEvent diff --git a/sig/openai/models/audio/transcription_text_delta_event.rbs b/sig/openai/models/audio/transcription_text_delta_event.rbs index 08280006..095f6639 100644 --- a/sig/openai/models/audio/transcription_text_delta_event.rbs +++ b/sig/openai/models/audio/transcription_text_delta_event.rbs @@ -5,7 +5,8 @@ module OpenAI { delta: String, type: :"transcript.text.delta", - logprobs: ::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob] + logprobs: ::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob], + segment_id: String } class TranscriptionTextDeltaEvent < OpenAI::Internal::Type::BaseModel @@ -19,16 +20,22 @@ module OpenAI ::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob] ) -> ::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob] + attr_reader segment_id: String? 
+ + def segment_id=: (String) -> String + def initialize: ( delta: String, ?logprobs: ::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob], + ?segment_id: String, ?type: :"transcript.text.delta" ) -> void def to_hash: -> { delta: String, type: :"transcript.text.delta", - logprobs: ::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob] + logprobs: ::Array[OpenAI::Audio::TranscriptionTextDeltaEvent::Logprob], + segment_id: String } type logprob = diff --git a/sig/openai/models/audio/transcription_text_segment_event.rbs b/sig/openai/models/audio/transcription_text_segment_event.rbs new file mode 100644 index 00000000..e805d9a2 --- /dev/null +++ b/sig/openai/models/audio/transcription_text_segment_event.rbs @@ -0,0 +1,47 @@ +module OpenAI + module Models + module Audio + type transcription_text_segment_event = + { + id: String, + end_: Float, + speaker: String, + start: Float, + text: String, + type: :"transcript.text.segment" + } + + class TranscriptionTextSegmentEvent < OpenAI::Internal::Type::BaseModel + attr_accessor id: String + + attr_accessor end_: Float + + attr_accessor speaker: String + + attr_accessor start: Float + + attr_accessor text: String + + attr_accessor type: :"transcript.text.segment" + + def initialize: ( + id: String, + end_: Float, + speaker: String, + start: Float, + text: String, + ?type: :"transcript.text.segment" + ) -> void + + def to_hash: -> { + id: String, + end_: Float, + speaker: String, + start: Float, + text: String, + type: :"transcript.text.segment" + } + end + end + end +end diff --git a/sig/openai/models/audio_model.rbs b/sig/openai/models/audio_model.rbs index 4a294e19..c94ade8b 100644 --- a/sig/openai/models/audio_model.rbs +++ b/sig/openai/models/audio_model.rbs @@ -1,7 +1,10 @@ module OpenAI module Models type audio_model = - :"whisper-1" | :"gpt-4o-transcribe" | :"gpt-4o-mini-transcribe" + :"whisper-1" + | :"gpt-4o-transcribe" + | :"gpt-4o-mini-transcribe" + | :"gpt-4o-transcribe-diarize" module AudioModel extend OpenAI::Internal::Type::Enum @@ -9,6 +12,7 @@ module OpenAI WHISPER_1: :"whisper-1" GPT_4O_TRANSCRIBE: :"gpt-4o-transcribe" GPT_4O_MINI_TRANSCRIBE: :"gpt-4o-mini-transcribe" + GPT_4O_TRANSCRIBE_DIARIZE: :"gpt-4o-transcribe-diarize" def self?.values: -> ::Array[OpenAI::Models::audio_model] end diff --git a/sig/openai/models/audio_response_format.rbs b/sig/openai/models/audio_response_format.rbs index 39091918..3cb615f5 100644 --- a/sig/openai/models/audio_response_format.rbs +++ b/sig/openai/models/audio_response_format.rbs @@ -1,6 +1,7 @@ module OpenAI module Models - type audio_response_format = :json | :text | :srt | :verbose_json | :vtt + type audio_response_format = + :json | :text | :srt | :verbose_json | :vtt | :diarized_json module AudioResponseFormat extend OpenAI::Internal::Type::Enum @@ -10,6 +11,7 @@ module OpenAI SRT: :srt VERBOSE_JSON: :verbose_json VTT: :vtt + DIARIZED_JSON: :diarized_json def self?.values: -> ::Array[OpenAI::Models::audio_response_format] end diff --git a/sig/openai/models/realtime/audio_transcription.rbs b/sig/openai/models/realtime/audio_transcription.rbs index 4b79dfb9..7503413b 100644 --- a/sig/openai/models/realtime/audio_transcription.rbs +++ b/sig/openai/models/realtime/audio_transcription.rbs @@ -37,17 +37,17 @@ module OpenAI type model = :"whisper-1" - | :"gpt-4o-transcribe-latest" | :"gpt-4o-mini-transcribe" | :"gpt-4o-transcribe" + | :"gpt-4o-transcribe-diarize" module Model extend OpenAI::Internal::Type::Enum WHISPER_1: :"whisper-1" - GPT_4O_TRANSCRIBE_LATEST: 
:"gpt-4o-transcribe-latest" GPT_4O_MINI_TRANSCRIBE: :"gpt-4o-mini-transcribe" GPT_4O_TRANSCRIBE: :"gpt-4o-transcribe" + GPT_4O_TRANSCRIBE_DIARIZE: :"gpt-4o-transcribe-diarize" def self?.values: -> ::Array[OpenAI::Models::Realtime::AudioTranscription::model] end diff --git a/sig/openai/models/vector_store_create_params.rbs b/sig/openai/models/vector_store_create_params.rbs index d5c48eb6..0effc9bd 100644 --- a/sig/openai/models/vector_store_create_params.rbs +++ b/sig/openai/models/vector_store_create_params.rbs @@ -3,6 +3,7 @@ module OpenAI type vector_store_create_params = { chunking_strategy: OpenAI::Models::file_chunking_strategy_param, + description: String, expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter, file_ids: ::Array[String], metadata: OpenAI::Models::metadata?, @@ -20,6 +21,10 @@ module OpenAI OpenAI::Models::file_chunking_strategy_param ) -> OpenAI::Models::file_chunking_strategy_param + attr_reader description: String? + + def description=: (String) -> String + attr_reader expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter? def expires_after=: ( @@ -38,6 +43,7 @@ module OpenAI def initialize: ( ?chunking_strategy: OpenAI::Models::file_chunking_strategy_param, + ?description: String, ?expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter, ?file_ids: ::Array[String], ?metadata: OpenAI::Models::metadata?, @@ -47,6 +53,7 @@ module OpenAI def to_hash: -> { chunking_strategy: OpenAI::Models::file_chunking_strategy_param, + description: String, expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter, file_ids: ::Array[String], metadata: OpenAI::Models::metadata?, diff --git a/sig/openai/resources/audio/transcriptions.rbs b/sig/openai/resources/audio/transcriptions.rbs index 0130f147..19516240 100644 --- a/sig/openai/resources/audio/transcriptions.rbs +++ b/sig/openai/resources/audio/transcriptions.rbs @@ -7,6 +7,8 @@ module OpenAI model: OpenAI::Models::Audio::TranscriptionCreateParams::model, ?chunking_strategy: OpenAI::Models::Audio::TranscriptionCreateParams::chunking_strategy?, ?include: ::Array[OpenAI::Models::Audio::transcription_include], + ?known_speaker_names: ::Array[String], + ?known_speaker_references: ::Array[String], ?language: String, ?prompt: String, ?response_format: OpenAI::Models::audio_response_format, @@ -20,6 +22,8 @@ module OpenAI model: OpenAI::Models::Audio::TranscriptionCreateParams::model, ?chunking_strategy: OpenAI::Models::Audio::TranscriptionCreateParams::chunking_strategy?, ?include: ::Array[OpenAI::Models::Audio::transcription_include], + ?known_speaker_names: ::Array[String], + ?known_speaker_references: ::Array[String], ?language: String, ?prompt: String, ?response_format: OpenAI::Models::audio_response_format, diff --git a/sig/openai/resources/vector_stores.rbs b/sig/openai/resources/vector_stores.rbs index d717bd54..60badb9d 100644 --- a/sig/openai/resources/vector_stores.rbs +++ b/sig/openai/resources/vector_stores.rbs @@ -7,6 +7,7 @@ module OpenAI def create: ( ?chunking_strategy: OpenAI::Models::file_chunking_strategy_param, + ?description: String, ?expires_after: OpenAI::VectorStoreCreateParams::ExpiresAfter, ?file_ids: ::Array[String], ?metadata: OpenAI::Models::metadata?, diff --git a/test/openai/internal/util_test.rb b/test/openai/internal/util_test.rb index 2ffe5242..60fdfe9b 100644 --- a/test/openai/internal/util_test.rb +++ b/test/openai/internal/util_test.rb @@ -124,6 +124,14 @@ def test_joining path: "/c", query: {"d" => ["e"]} } + ], + [ + "h://a.b/c?d=e", + "h://nope", + { + path: "h://a.b/c", + 
query: {"d" => ["e"]} + } ] ] diff --git a/test/openai/resources/audio/transcriptions_test.rb b/test/openai/resources/audio/transcriptions_test.rb index f4f5b94b..1af348e5 100644 --- a/test/openai/resources/audio/transcriptions_test.rb +++ b/test/openai/resources/audio/transcriptions_test.rb @@ -13,6 +13,7 @@ def test_create_required_params assert_pattern do case response in OpenAI::Audio::Transcription + in OpenAI::Audio::TranscriptionDiarized in OpenAI::Audio::TranscriptionVerbose end end