diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 2ed1eead8..b0aac41b2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -10,6 +10,7 @@ on:
jobs:
lint:
+ timeout-minutes: 10
name: lint
runs-on: ubuntu-latest
steps:
@@ -27,6 +28,7 @@ jobs:
run: ./scripts/lint
build:
+ timeout-minutes: 5
name: build
runs-on: ubuntu-latest
permissions:
@@ -61,6 +63,7 @@ jobs:
SHA: ${{ github.sha }}
run: ./scripts/utils/upload-artifact.sh
test:
+ timeout-minutes: 10
name: test
runs-on: ubuntu-latest
steps:
@@ -78,6 +81,7 @@ jobs:
run: ./scripts/test
examples:
+ timeout-minutes: 10
name: examples
runs-on: ubuntu-latest
if: github.repository == 'openai/openai-node'
diff --git a/.release-please-manifest.json b/.release-please-manifest.json
index 2f61d58b0..5b0015f5b 100644
--- a/.release-please-manifest.json
+++ b/.release-please-manifest.json
@@ -1,3 +1,3 @@
{
- ".": "4.95.1"
+ ".": "4.96.0"
}
diff --git a/.stats.yml b/.stats.yml
index 848c5b5ad..d92408173 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,4 +1,4 @@
configured_endpoints: 97
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-5633633cc38734869cf7d993f7b549bb8e4d10e0ec45381ec2cd91507cd8eb8f.yml
-openapi_spec_hash: c855121b2b2324b99499c9244c21d24d
-config_hash: d20837393b73efdb19cd08e04c1cc9a1
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-8b68ae6b807dca92e914da1dd9e835a20f69b075e79102a264367fd7fddddb33.yml
+openapi_spec_hash: b6ade5b1a6327339e6669e1134de2d03
+config_hash: b597cd9a31e9e5ec709e2eefb4c54122
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 1f864e203..47717a4eb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,23 @@
# Changelog
+## 4.96.0 (2025-04-23)
+
+Full Changelog: [v4.95.1...v4.96.0](https://github.com/openai/openai-node/compare/v4.95.1...v4.96.0)
+
+### Features
+
+* **api:** adding new image model support ([a00d331](https://github.com/openai/openai-node/commit/a00d33190edd08df7d9c088c00ab7b77673f88ba))
+
+
+### Bug Fixes
+
+* **types:** export AssistantStream ([#1472](https://github.com/openai/openai-node/issues/1472)) ([626c844](https://github.com/openai/openai-node/commit/626c844a758a68ffbff48873d4773be2e3868952))
+
+
+### Chores
+
+* **ci:** add timeout thresholds for CI jobs ([e465063](https://github.com/openai/openai-node/commit/e46506351097f1de39c866c28b6ec20fa724fc36))
+
## 4.95.1 (2025-04-18)
Full Changelog: [v4.95.0...v4.95.1](https://github.com/openai/openai-node/compare/v4.95.0...v4.95.1)
diff --git a/api.md b/api.md
index 2eb54b34a..49e6548a8 100644
--- a/api.md
+++ b/api.md
@@ -249,7 +249,7 @@ Methods:
- client.fineTuning.checkpoints.permissions.create(fineTunedModelCheckpoint, { ...params }) -> PermissionCreateResponsesPage
- client.fineTuning.checkpoints.permissions.retrieve(fineTunedModelCheckpoint, { ...params }) -> PermissionRetrieveResponse
-- client.fineTuning.checkpoints.permissions.del(fineTunedModelCheckpoint) -> PermissionDeleteResponse
+- client.fineTuning.checkpoints.permissions.del(fineTunedModelCheckpoint, permissionId) -> PermissionDeleteResponse
# VectorStores
@@ -626,6 +626,10 @@ Types:
- ResponseOutputRefusal
- ResponseOutputText
- ResponseReasoningItem
+- ResponseReasoningSummaryPartAddedEvent
+- ResponseReasoningSummaryPartDoneEvent
+- ResponseReasoningSummaryTextDeltaEvent
+- ResponseReasoningSummaryTextDoneEvent
- ResponseRefusalDeltaEvent
- ResponseRefusalDoneEvent
- ResponseStatus
diff --git a/jsr.json b/jsr.json
index 8271c8522..6b574ce15 100644
--- a/jsr.json
+++ b/jsr.json
@@ -1,6 +1,6 @@
{
"name": "@openai/openai",
- "version": "4.95.1",
+ "version": "4.96.0",
"exports": {
".": "./index.ts",
"./helpers/zod": "./helpers/zod.ts",
diff --git a/package.json b/package.json
index 76fe7d4d0..7b4e86f8e 100644
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
{
"name": "openai",
- "version": "4.95.1",
+ "version": "4.96.0",
"description": "The official TypeScript library for the OpenAI API",
"author": "OpenAI ",
"types": "dist/index.d.ts",
diff --git a/src/resources/beta/assistants.ts b/src/resources/beta/assistants.ts
index bf957db95..00a6ff2cf 100644
--- a/src/resources/beta/assistants.ts
+++ b/src/resources/beta/assistants.ts
@@ -9,6 +9,7 @@ import * as ThreadsAPI from './threads/threads';
import * as RunsAPI from './threads/runs/runs';
import * as StepsAPI from './threads/runs/steps';
import { CursorPage, type CursorPageParams } from '../../pagination';
+import { AssistantStream } from '../../lib/AssistantStream';
export class Assistants extends APIResource {
/**
@@ -1517,4 +1518,6 @@ export declare namespace Assistants {
type AssistantUpdateParams as AssistantUpdateParams,
type AssistantListParams as AssistantListParams,
};
+
+ export { AssistantStream };
}
diff --git a/src/resources/beta/realtime/realtime.ts b/src/resources/beta/realtime/realtime.ts
index 1c02fdd1a..5012b1edd 100644
--- a/src/resources/beta/realtime/realtime.ts
+++ b/src/resources/beta/realtime/realtime.ts
@@ -915,12 +915,34 @@ export type RealtimeClientEvent =
| ConversationItemTruncateEvent
| InputAudioBufferAppendEvent
| InputAudioBufferClearEvent
+ | RealtimeClientEvent.OutputAudioBufferClear
| InputAudioBufferCommitEvent
| ResponseCancelEvent
| ResponseCreateEvent
| SessionUpdateEvent
| TranscriptionSessionUpdate;
+export namespace RealtimeClientEvent {
+ /**
+ * **WebRTC Only:** Emit to cut off the current audio response. This will trigger
+ * the server to stop generating audio and emit a `output_audio_buffer.cleared`
+ * event. This event should be preceded by a `response.cancel` client event to stop
+ * the generation of the current response.
+ * [Learn more](https://platform.openai.com/docs/guides/realtime-model-capabilities#client-and-server-events-for-audio-in-webrtc).
+ */
+ export interface OutputAudioBufferClear {
+ /**
+ * The event type, must be `output_audio_buffer.clear`.
+ */
+ type: 'output_audio_buffer.clear';
+
+ /**
+ * The unique ID of the client event used for error handling.
+ */
+ event_id?: string;
+ }
+}
+
/**
* The response resource.
*/
@@ -1174,7 +1196,10 @@ export type RealtimeServerEvent =
| ResponseTextDoneEvent
| SessionCreatedEvent
| SessionUpdatedEvent
- | TranscriptionSessionUpdatedEvent;
+ | TranscriptionSessionUpdatedEvent
+ | RealtimeServerEvent.OutputAudioBufferStarted
+ | RealtimeServerEvent.OutputAudioBufferStopped
+ | RealtimeServerEvent.OutputAudioBufferCleared;
export namespace RealtimeServerEvent {
/**
@@ -1197,6 +1222,77 @@ export namespace RealtimeServerEvent {
*/
type: 'conversation.item.retrieved';
}
+
+ /**
+ * **WebRTC Only:** Emitted when the server begins streaming audio to the client.
+ * This event is emitted after an audio content part has been added
+ * (`response.content_part.added`) to the response.
+ * [Learn more](https://platform.openai.com/docs/guides/realtime-model-capabilities#client-and-server-events-for-audio-in-webrtc).
+ */
+ export interface OutputAudioBufferStarted {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The unique ID of the response that produced the audio.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `output_audio_buffer.started`.
+ */
+ type: 'output_audio_buffer.started';
+ }
+
+ /**
+ * **WebRTC Only:** Emitted when the output audio buffer has been completely
+ * drained on the server, and no more audio is forthcoming. This event is emitted
+ * after the full response data has been sent to the client (`response.done`).
+ * [Learn more](https://platform.openai.com/docs/guides/realtime-model-capabilities#client-and-server-events-for-audio-in-webrtc).
+ */
+ export interface OutputAudioBufferStopped {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The unique ID of the response that produced the audio.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `output_audio_buffer.stopped`.
+ */
+ type: 'output_audio_buffer.stopped';
+ }
+
+ /**
+ * **WebRTC Only:** Emitted when the output audio buffer is cleared. This happens
+ * either in VAD mode when the user has interrupted
+ * (`input_audio_buffer.speech_started`), or when the client has emitted the
+ * `output_audio_buffer.clear` event to manually cut off the current audio
+ * response.
+ * [Learn more](https://platform.openai.com/docs/guides/realtime-model-capabilities#client-and-server-events-for-audio-in-webrtc).
+ */
+ export interface OutputAudioBufferCleared {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The unique ID of the response that produced the audio.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `output_audio_buffer.cleared`.
+ */
+ type: 'output_audio_buffer.cleared';
+ }
}
/**
diff --git a/src/resources/beta/threads/threads.ts b/src/resources/beta/threads/threads.ts
index 8075ba0ac..2e5ab1cc8 100644
--- a/src/resources/beta/threads/threads.ts
+++ b/src/resources/beta/threads/threads.ts
@@ -686,9 +686,7 @@ export interface ThreadCreateAndRunParamsBase {
* Override the tools the assistant can use for this run. This is useful for
* modifying the behavior on a per-run basis.
*/
- tools?: Array<
- AssistantsAPI.CodeInterpreterTool | AssistantsAPI.FileSearchTool | AssistantsAPI.FunctionTool
- > | null;
+ tools?: Array<AssistantsAPI.AssistantTool> | null;
/**
* An alternative to sampling with temperature, called nucleus sampling, where the
@@ -1718,4 +1716,6 @@ export declare namespace Threads {
type MessageUpdateParams as MessageUpdateParams,
type MessageListParams as MessageListParams,
};
+
+ export { AssistantStream };
}
diff --git a/src/resources/evals/evals.ts b/src/resources/evals/evals.ts
index 84ff6d1bb..caef7acc0 100644
--- a/src/resources/evals/evals.ts
+++ b/src/resources/evals/evals.ts
@@ -4,6 +4,7 @@ import { APIResource } from '../../resource';
import { isRequestOptions } from '../../core';
import * as Core from '../../core';
import * as Shared from '../shared';
+import * as ResponsesAPI from '../responses/responses';
import * as RunsAPI from './runs/runs';
import {
CreateEvalCompletionsRunDataSource,
@@ -107,7 +108,7 @@ export interface EvalCustomDataSourceConfig {
* the evaluation.
*/
export interface EvalLabelModelGrader {
- input: Array<EvalLabelModelGrader.InputMessage | EvalLabelModelGrader.Assistant>;
+ input: Array;
/**
* The labels to assign to each item in the evaluation.
@@ -136,57 +137,43 @@ export interface EvalLabelModelGrader {
}
export namespace EvalLabelModelGrader {
- export interface InputMessage {
- content: InputMessage.Content;
-
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface Input {
/**
- * The role of the message. One of `user`, `system`, or `developer`.
+ * Text inputs to the model - can contain template strings.
*/
- role: 'user' | 'system' | 'developer';
+ content: string | ResponsesAPI.ResponseInputText | Input.OutputText;
/**
- * The type of item, which is always `message`.
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
*/
- type: 'message';
- }
-
- export namespace InputMessage {
- export interface Content {
- /**
- * The text content.
- */
- text: string;
-
- /**
- * The type of content, which is always `input_text`.
- */
- type: 'input_text';
- }
- }
-
- export interface Assistant {
- content: Assistant.Content;
+ role: 'user' | 'assistant' | 'system' | 'developer';
/**
- * The role of the message. Must be `assistant` for output.
+ * The type of the message input. Always `message`.
*/
- role: 'assistant';
+ type?: 'message';
+ }
+ export namespace Input {
/**
- * The type of item, which is always `message`.
+ * A text output from the model.
*/
- type: 'message';
- }
-
- export namespace Assistant {
- export interface Content {
+ export interface OutputText {
/**
- * The text content.
+ * The text output from the model.
*/
text: string;
/**
- * The type of content, which is always `output_text`.
+ * The type of the output text. Always `output_text`.
*/
type: 'output_text';
}
@@ -259,8 +246,8 @@ export interface EvalStringCheckGrader {
*/
export interface EvalTextSimilarityGrader {
/**
- * The evaluation metric to use. One of `cosine`, `fuzzy_match`, `bleu`, `gleu`,
- * `meteor`, `rouge_1`, `rouge_2`, `rouge_3`, `rouge_4`, `rouge_5`, or `rouge_l`.
+ * The evaluation metric to use. One of `fuzzy_match`, `bleu`, `gleu`, `meteor`,
+ * `rouge_1`, `rouge_2`, `rouge_3`, `rouge_4`, `rouge_5`, or `rouge_l`.
*/
evaluation_metric:
| 'fuzzy_match'
@@ -272,8 +259,7 @@ export interface EvalTextSimilarityGrader {
| 'rouge_3'
| 'rouge_4'
| 'rouge_5'
- | 'rouge_l'
- | 'cosine';
+ | 'rouge_l';
/**
* The text being graded.
@@ -346,14 +332,131 @@ export interface EvalCreateResponse {
object: 'eval';
/**
- * Indicates whether the evaluation is shared with OpenAI.
+ * A list of testing criteria.
*/
- share_with_openai: boolean;
+ testing_criteria: Array<
+ | EvalLabelModelGrader
+ | EvalStringCheckGrader
+ | EvalTextSimilarityGrader
+ | EvalCreateResponse.Python
+ | EvalCreateResponse.ScoreModel
+ >;
+}
+export namespace EvalCreateResponse {
/**
- * A list of testing criteria.
+ * A PythonGrader object that runs a python script on the input.
*/
- testing_criteria: Array<EvalLabelModelGrader | EvalStringCheckGrader | EvalTextSimilarityGrader>;
+ export interface Python {
+ /**
+ * The name of the grader.
+ */
+ name: string;
+
+ /**
+ * The source code of the python script.
+ */
+ source: string;
+
+ /**
+ * The object type, which is always `python`.
+ */
+ type: 'python';
+
+ /**
+ * The image tag to use for the python script.
+ */
+ image_tag?: string;
+
+ /**
+ * The threshold for the score.
+ */
+ pass_threshold?: number;
+ }
+
+ /**
+ * A ScoreModelGrader object that uses a model to assign a score to the input.
+ */
+ export interface ScoreModel {
+ /**
+ * The input text. This may include template strings.
+ */
+ input: Array<ScoreModel.Input>;
+
+ /**
+ * The model to use for the evaluation.
+ */
+ model: string;
+
+ /**
+ * The name of the grader.
+ */
+ name: string;
+
+ /**
+ * The object type, which is always `score_model`.
+ */
+ type: 'score_model';
+
+ /**
+ * The threshold for the score.
+ */
+ pass_threshold?: number;
+
+ /**
+ * The range of the score. Defaults to `[0, 1]`.
+ */
+ range?: Array<number>;
+
+ /**
+ * The sampling parameters for the model.
+ */
+ sampling_params?: unknown;
+ }
+
+ export namespace ScoreModel {
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface Input {
+ /**
+ * Text inputs to the model - can contain template strings.
+ */
+ content: string | ResponsesAPI.ResponseInputText | Input.OutputText;
+
+ /**
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
+ */
+ role: 'user' | 'assistant' | 'system' | 'developer';
+
+ /**
+ * The type of the message input. Always `message`.
+ */
+ type?: 'message';
+ }
+
+ export namespace Input {
+ /**
+ * A text output from the model.
+ */
+ export interface OutputText {
+ /**
+ * The text output from the model.
+ */
+ text: string;
+
+ /**
+ * The type of the output text. Always `output_text`.
+ */
+ type: 'output_text';
+ }
+ }
+ }
}
/**
@@ -401,14 +504,131 @@ export interface EvalRetrieveResponse {
object: 'eval';
/**
- * Indicates whether the evaluation is shared with OpenAI.
+ * A list of testing criteria.
*/
- share_with_openai: boolean;
+ testing_criteria: Array<
+ | EvalLabelModelGrader
+ | EvalStringCheckGrader
+ | EvalTextSimilarityGrader
+ | EvalRetrieveResponse.Python
+ | EvalRetrieveResponse.ScoreModel
+ >;
+}
+export namespace EvalRetrieveResponse {
/**
- * A list of testing criteria.
+ * A PythonGrader object that runs a python script on the input.
+ */
+ export interface Python {
+ /**
+ * The name of the grader.
+ */
+ name: string;
+
+ /**
+ * The source code of the python script.
+ */
+ source: string;
+
+ /**
+ * The object type, which is always `python`.
+ */
+ type: 'python';
+
+ /**
+ * The image tag to use for the python script.
+ */
+ image_tag?: string;
+
+ /**
+ * The threshold for the score.
+ */
+ pass_threshold?: number;
+ }
+
+ /**
+ * A ScoreModelGrader object that uses a model to assign a score to the input.
*/
- testing_criteria: Array<EvalLabelModelGrader | EvalStringCheckGrader | EvalTextSimilarityGrader>;
+ export interface ScoreModel {
+ /**
+ * The input text. This may include template strings.
+ */
+ input: Array<ScoreModel.Input>;
+
+ /**
+ * The model to use for the evaluation.
+ */
+ model: string;
+
+ /**
+ * The name of the grader.
+ */
+ name: string;
+
+ /**
+ * The object type, which is always `score_model`.
+ */
+ type: 'score_model';
+
+ /**
+ * The threshold for the score.
+ */
+ pass_threshold?: number;
+
+ /**
+ * The range of the score. Defaults to `[0, 1]`.
+ */
+ range?: Array<number>;
+
+ /**
+ * The sampling parameters for the model.
+ */
+ sampling_params?: unknown;
+ }
+
+ export namespace ScoreModel {
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface Input {
+ /**
+ * Text inputs to the model - can contain template strings.
+ */
+ content: string | ResponsesAPI.ResponseInputText | Input.OutputText;
+
+ /**
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
+ */
+ role: 'user' | 'assistant' | 'system' | 'developer';
+
+ /**
+ * The type of the message input. Always `message`.
+ */
+ type?: 'message';
+ }
+
+ export namespace Input {
+ /**
+ * A text output from the model.
+ */
+ export interface OutputText {
+ /**
+ * The text output from the model.
+ */
+ text: string;
+
+ /**
+ * The type of the output text. Always `output_text`.
+ */
+ type: 'output_text';
+ }
+ }
+ }
}
/**
@@ -456,14 +676,131 @@ export interface EvalUpdateResponse {
object: 'eval';
/**
- * Indicates whether the evaluation is shared with OpenAI.
+ * A list of testing criteria.
+ */
+ testing_criteria: Array<
+ | EvalLabelModelGrader
+ | EvalStringCheckGrader
+ | EvalTextSimilarityGrader
+ | EvalUpdateResponse.Python
+ | EvalUpdateResponse.ScoreModel
+ >;
+}
+
+export namespace EvalUpdateResponse {
+ /**
+ * A PythonGrader object that runs a python script on the input.
*/
- share_with_openai: boolean;
+ export interface Python {
+ /**
+ * The name of the grader.
+ */
+ name: string;
+
+ /**
+ * The source code of the python script.
+ */
+ source: string;
+
+ /**
+ * The object type, which is always `python`.
+ */
+ type: 'python';
+
+ /**
+ * The image tag to use for the python script.
+ */
+ image_tag?: string;
+
+ /**
+ * The threshold for the score.
+ */
+ pass_threshold?: number;
+ }
/**
- * A list of testing criteria.
+ * A ScoreModelGrader object that uses a model to assign a score to the input.
*/
- testing_criteria: Array<EvalLabelModelGrader | EvalStringCheckGrader | EvalTextSimilarityGrader>;
+ export interface ScoreModel {
+ /**
+ * The input text. This may include template strings.
+ */
+ input: Array<ScoreModel.Input>;
+
+ /**
+ * The model to use for the evaluation.
+ */
+ model: string;
+
+ /**
+ * The name of the grader.
+ */
+ name: string;
+
+ /**
+ * The object type, which is always `score_model`.
+ */
+ type: 'score_model';
+
+ /**
+ * The threshold for the score.
+ */
+ pass_threshold?: number;
+
+ /**
+ * The range of the score. Defaults to `[0, 1]`.
+ */
+ range?: Array<number>;
+
+ /**
+ * The sampling parameters for the model.
+ */
+ sampling_params?: unknown;
+ }
+
+ export namespace ScoreModel {
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface Input {
+ /**
+ * Text inputs to the model - can contain template strings.
+ */
+ content: string | ResponsesAPI.ResponseInputText | Input.OutputText;
+
+ /**
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
+ */
+ role: 'user' | 'assistant' | 'system' | 'developer';
+
+ /**
+ * The type of the message input. Always `message`.
+ */
+ type?: 'message';
+ }
+
+ export namespace Input {
+ /**
+ * A text output from the model.
+ */
+ export interface OutputText {
+ /**
+ * The text output from the model.
+ */
+ text: string;
+
+ /**
+ * The type of the output text. Always `output_text`.
+ */
+ type: 'output_text';
+ }
+ }
+ }
}
/**
@@ -511,14 +848,131 @@ export interface EvalListResponse {
object: 'eval';
/**
- * Indicates whether the evaluation is shared with OpenAI.
+ * A list of testing criteria.
*/
- share_with_openai: boolean;
+ testing_criteria: Array<
+ | EvalLabelModelGrader
+ | EvalStringCheckGrader
+ | EvalTextSimilarityGrader
+ | EvalListResponse.Python
+ | EvalListResponse.ScoreModel
+ >;
+}
+export namespace EvalListResponse {
/**
- * A list of testing criteria.
+ * A PythonGrader object that runs a python script on the input.
+ */
+ export interface Python {
+ /**
+ * The name of the grader.
+ */
+ name: string;
+
+ /**
+ * The source code of the python script.
+ */
+ source: string;
+
+ /**
+ * The object type, which is always `python`.
+ */
+ type: 'python';
+
+ /**
+ * The image tag to use for the python script.
+ */
+ image_tag?: string;
+
+ /**
+ * The threshold for the score.
+ */
+ pass_threshold?: number;
+ }
+
+ /**
+ * A ScoreModelGrader object that uses a model to assign a score to the input.
*/
- testing_criteria: Array<EvalLabelModelGrader | EvalStringCheckGrader | EvalTextSimilarityGrader>;
+ export interface ScoreModel {
+ /**
+ * The input text. This may include template strings.
+ */
+ input: Array<ScoreModel.Input>;
+
+ /**
+ * The model to use for the evaluation.
+ */
+ model: string;
+
+ /**
+ * The name of the grader.
+ */
+ name: string;
+
+ /**
+ * The object type, which is always `score_model`.
+ */
+ type: 'score_model';
+
+ /**
+ * The threshold for the score.
+ */
+ pass_threshold?: number;
+
+ /**
+ * The range of the score. Defaults to `[0, 1]`.
+ */
+ range?: Array<number>;
+
+ /**
+ * The sampling parameters for the model.
+ */
+ sampling_params?: unknown;
+ }
+
+ export namespace ScoreModel {
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface Input {
+ /**
+ * Text inputs to the model - can contain template strings.
+ */
+ content: string | ResponsesAPI.ResponseInputText | Input.OutputText;
+
+ /**
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
+ */
+ role: 'user' | 'assistant' | 'system' | 'developer';
+
+ /**
+ * The type of the message input. Always `message`.
+ */
+ type?: 'message';
+ }
+
+ export namespace Input {
+ /**
+ * A text output from the model.
+ */
+ export interface OutputText {
+ /**
+ * The text output from the model.
+ */
+ text: string;
+
+ /**
+ * The type of the output text. Always `output_text`.
+ */
+ type: 'output_text';
+ }
+ }
+ }
}
export interface EvalDeleteResponse {
@@ -533,12 +987,18 @@ export interface EvalCreateParams {
/**
* The configuration for the data source used for the evaluation runs.
*/
- data_source_config: EvalCreateParams.Custom | EvalCreateParams.StoredCompletions;
+ data_source_config: EvalCreateParams.Custom | EvalCreateParams.Logs;
/**
* A list of graders for all eval runs in this group.
*/
- testing_criteria: Array<EvalCreateParams.LabelModel | EvalStringCheckGrader | EvalTextSimilarityGrader>;
+ testing_criteria: Array<
+ | EvalCreateParams.LabelModel
+ | EvalStringCheckGrader
+ | EvalTextSimilarityGrader
+ | EvalCreateParams.Python
+ | EvalCreateParams.ScoreModel
+ >;
/**
* Set of 16 key-value pairs that can be attached to an object. This can be useful
@@ -554,11 +1014,6 @@ export interface EvalCreateParams {
* The name of the evaluation.
*/
name?: string;
-
- /**
- * Indicates whether the evaluation is shared with OpenAI.
- */
- share_with_openai?: boolean;
}
export namespace EvalCreateParams {
@@ -572,7 +1027,7 @@ export namespace EvalCreateParams {
*/
export interface Custom {
/**
- * The json schema for the run data source items.
+ * The json schema for each row in the data source.
*/
item_schema: Record<string, unknown>;
@@ -582,7 +1037,8 @@ export namespace EvalCreateParams {
type: 'custom';
/**
- * Whether to include the sample schema in the data source.
+ * Whether the eval should expect you to populate the sample namespace (ie, by
+ * generating responses off of your data source)
*/
include_sample_schema?: boolean;
}
@@ -592,21 +1048,16 @@ export namespace EvalCreateParams {
* completions query. This is usually metadata like `usecase=chatbot` or
* `prompt-version=v2`, etc.
*/
- export interface StoredCompletions {
+ export interface Logs {
/**
- * The type of data source. Always `stored_completions`.
+ * The type of data source. Always `logs`.
*/
- type: 'stored_completions';
+ type: 'logs';
/**
- * Set of 16 key-value pairs that can be attached to an object. This can be useful
- * for storing additional information about the object in a structured format, and
- * querying for objects via API or the dashboard.
- *
- * Keys are strings with a maximum length of 64 characters. Values are strings with
- * a maximum length of 512 characters.
+ * Metadata filters for the logs data source.
*/
- metadata?: Shared.Metadata | null;
+ metadata?: Record<string, unknown>;
}
/**
@@ -614,7 +1065,11 @@ export namespace EvalCreateParams {
* the evaluation.
*/
export interface LabelModel {
- input: Array<LabelModel.SimpleInputMessage | LabelModel.InputMessage | LabelModel.OutputMessage>;
+ /**
+ * A list of chat messages forming the prompt or context. May include variable
+ * references to the "item" namespace, ie {{item.name}}.
+ */
+ input: Array<LabelModel.SimpleInputMessage | LabelModel.EvalItem>;
/**
* The labels to classify to each item in the evaluation.
@@ -655,57 +1110,157 @@ export namespace EvalCreateParams {
role: string;
}
- export interface InputMessage {
- content: InputMessage.Content;
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface EvalItem {
+ /**
+ * Text inputs to the model - can contain template strings.
+ */
+ content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
/**
- * The role of the message. One of `user`, `system`, or `developer`.
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
*/
- role: 'user' | 'system' | 'developer';
+ role: 'user' | 'assistant' | 'system' | 'developer';
/**
- * The type of item, which is always `message`.
+ * The type of the message input. Always `message`.
*/
- type: 'message';
+ type?: 'message';
}
- export namespace InputMessage {
- export interface Content {
+ export namespace EvalItem {
+ /**
+ * A text output from the model.
+ */
+ export interface OutputText {
/**
- * The text content.
+ * The text output from the model.
*/
text: string;
/**
- * The type of content, which is always `input_text`.
+ * The type of the output text. Always `output_text`.
*/
- type: 'input_text';
+ type: 'output_text';
}
}
+ }
+
+ /**
+ * A PythonGrader object that runs a python script on the input.
+ */
+ export interface Python {
+ /**
+ * The name of the grader.
+ */
+ name: string;
- export interface OutputMessage {
- content: OutputMessage.Content;
+ /**
+ * The source code of the python script.
+ */
+ source: string;
+
+ /**
+ * The object type, which is always `python`.
+ */
+ type: 'python';
+
+ /**
+ * The image tag to use for the python script.
+ */
+ image_tag?: string;
+
+ /**
+ * The threshold for the score.
+ */
+ pass_threshold?: number;
+ }
+
+ /**
+ * A ScoreModelGrader object that uses a model to assign a score to the input.
+ */
+ export interface ScoreModel {
+ /**
+ * The input text. This may include template strings.
+ */
+ input: Array<ScoreModel.Input>;
+
+ /**
+ * The model to use for the evaluation.
+ */
+ model: string;
+
+ /**
+ * The name of the grader.
+ */
+ name: string;
+
+ /**
+ * The object type, which is always `score_model`.
+ */
+ type: 'score_model';
+
+ /**
+ * The threshold for the score.
+ */
+ pass_threshold?: number;
+
+ /**
+ * The range of the score. Defaults to `[0, 1]`.
+ */
+ range?: Array<number>;
+ /**
+ * The sampling parameters for the model.
+ */
+ sampling_params?: unknown;
+ }
+
+ export namespace ScoreModel {
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface Input {
/**
- * The role of the message. Must be `assistant` for output.
+ * Text inputs to the model - can contain template strings.
*/
- role: 'assistant';
+ content: string | ResponsesAPI.ResponseInputText | Input.OutputText;
/**
- * The type of item, which is always `message`.
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
*/
- type: 'message';
+ role: 'user' | 'assistant' | 'system' | 'developer';
+
+ /**
+ * The type of the message input. Always `message`.
+ */
+ type?: 'message';
}
- export namespace OutputMessage {
- export interface Content {
+ export namespace Input {
+ /**
+ * A text output from the model.
+ */
+ export interface OutputText {
/**
- * The text content.
+ * The text output from the model.
*/
text: string;
/**
- * The type of content, which is always `output_text`.
+ * The type of the output text. Always `output_text`.
*/
type: 'output_text';
}
diff --git a/src/resources/evals/runs/runs.ts b/src/resources/evals/runs/runs.ts
index ca2b7f424..50c07a514 100644
--- a/src/resources/evals/runs/runs.ts
+++ b/src/resources/evals/runs/runs.ts
@@ -4,6 +4,7 @@ import { APIResource } from '../../../resource';
import { isRequestOptions } from '../../../core';
import * as Core from '../../../core';
import * as Shared from '../../shared';
+import * as ResponsesAPI from '../../responses/responses';
import * as OutputItemsAPI from './output-items';
import {
OutputItemListParams,
@@ -83,15 +84,6 @@ export class RunListResponsesPage extends CursorPage<RunListResponse> {}
* A CompletionsRunDataSource object describing a model sampling configuration.
*/
export interface CreateEvalCompletionsRunDataSource {
- input_messages:
- | CreateEvalCompletionsRunDataSource.Template
- | CreateEvalCompletionsRunDataSource.ItemReference;
-
- /**
- * The name of the model to use for generating completions (e.g. "o3-mini").
- */
- model: string;
-
/**
* A StoredCompletionsRunDataSource configuration describing a set of filters
*/
@@ -105,105 +97,19 @@ export interface CreateEvalCompletionsRunDataSource {
*/
type: 'completions';
+ input_messages?:
+ | CreateEvalCompletionsRunDataSource.Template
+ | CreateEvalCompletionsRunDataSource.ItemReference;
+
+ /**
+ * The name of the model to use for generating completions (e.g. "o3-mini").
+ */
+ model?: string;
+
sampling_params?: CreateEvalCompletionsRunDataSource.SamplingParams;
}
export namespace CreateEvalCompletionsRunDataSource {
- export interface Template {
- /**
- * A list of chat messages forming the prompt or context. May include variable
- * references to the "item" namespace, ie {{item.name}}.
- */
- template: Array<Template.ChatMessage | Template.InputMessage | Template.OutputMessage>;
-
- /**
- * The type of input messages. Always `template`.
- */
- type: 'template';
- }
-
- export namespace Template {
- export interface ChatMessage {
- /**
- * The content of the message.
- */
- content: string;
-
- /**
- * The role of the message (e.g. "system", "assistant", "user").
- */
- role: string;
- }
-
- export interface InputMessage {
- content: InputMessage.Content;
-
- /**
- * The role of the message. One of `user`, `system`, or `developer`.
- */
- role: 'user' | 'system' | 'developer';
-
- /**
- * The type of item, which is always `message`.
- */
- type: 'message';
- }
-
- export namespace InputMessage {
- export interface Content {
- /**
- * The text content.
- */
- text: string;
-
- /**
- * The type of content, which is always `input_text`.
- */
- type: 'input_text';
- }
- }
-
- export interface OutputMessage {
- content: OutputMessage.Content;
-
- /**
- * The role of the message. Must be `assistant` for output.
- */
- role: 'assistant';
-
- /**
- * The type of item, which is always `message`.
- */
- type: 'message';
- }
-
- export namespace OutputMessage {
- export interface Content {
- /**
- * The text content.
- */
- text: string;
-
- /**
- * The type of content, which is always `output_text`.
- */
- type: 'output_text';
- }
- }
- }
-
- export interface ItemReference {
- /**
- * A reference to a variable in the "item" namespace. Ie, "item.name"
- */
- item_reference: string;
-
- /**
- * The type of input messages. Always `item_reference`.
- */
- type: 'item_reference';
- }
-
export interface FileContent {
/**
* The content of the jsonl file.
@@ -240,20 +146,25 @@ export namespace CreateEvalCompletionsRunDataSource {
* A StoredCompletionsRunDataSource configuration describing a set of filters
*/
export interface StoredCompletions {
+ /**
+ * The type of source. Always `stored_completions`.
+ */
+ type: 'stored_completions';
+
/**
* An optional Unix timestamp to filter items created after this time.
*/
- created_after: number | null;
+ created_after?: number | null;
/**
* An optional Unix timestamp to filter items created before this time.
*/
- created_before: number | null;
+ created_before?: number | null;
/**
* An optional maximum number of items to return.
*/
- limit: number | null;
+ limit?: number | null;
/**
* Set of 16 key-value pairs that can be attached to an object. This can be useful
@@ -263,17 +174,81 @@ export namespace CreateEvalCompletionsRunDataSource {
* Keys are strings with a maximum length of 64 characters. Values are strings with
* a maximum length of 512 characters.
*/
- metadata: Shared.Metadata | null;
+ metadata?: Shared.Metadata | null;
/**
* An optional model to filter by (e.g., 'gpt-4o').
*/
- model: string | null;
+ model?: string | null;
+ }
+ export interface Template {
/**
- * The type of source. Always `stored_completions`.
+ * A list of chat messages forming the prompt or context. May include variable
+ * references to the "item" namespace, ie {{item.name}}.
*/
- type: 'stored_completions';
+ template: Array;
+
+ /**
+ * The type of input messages. Always `template`.
+ */
+ type: 'template';
+ }
+
+ export namespace Template {
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface Message {
+ /**
+ * Text inputs to the model - can contain template strings.
+ */
+ content: string | ResponsesAPI.ResponseInputText | Message.OutputText;
+
+ /**
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
+ */
+ role: 'user' | 'assistant' | 'system' | 'developer';
+
+ /**
+ * The type of the message input. Always `message`.
+ */
+ type?: 'message';
+ }
+
+ export namespace Message {
+ /**
+ * A text output from the model.
+ */
+ export interface OutputText {
+ /**
+ * The text output from the model.
+ */
+ text: string;
+
+ /**
+ * The type of the output text. Always `output_text`.
+ */
+ type: 'output_text';
+ }
+ }
+ }
+
+ export interface ItemReference {
+ /**
+ * A reference to a variable in the "item" namespace. Ie, "item.name"
+ */
+ item_reference: string;
+
+ /**
+ * The type of input messages. Always `item_reference`.
+ */
+ type: 'item_reference';
}
export interface SamplingParams {
@@ -378,7 +353,10 @@ export interface RunCreateResponse {
/**
* Information about the run's data source.
*/
- data_source: CreateEvalJSONLRunDataSource | CreateEvalCompletionsRunDataSource;
+ data_source:
+ | CreateEvalJSONLRunDataSource
+ | CreateEvalCompletionsRunDataSource
+ | RunCreateResponse.Completions;
/**
* An object representing an error response from the Eval API.
@@ -442,162 +420,240 @@ export interface RunCreateResponse {
}
export namespace RunCreateResponse {
- export interface PerModelUsage {
- /**
- * The number of tokens retrieved from cache.
- */
- cached_tokens: number;
-
+ /**
+ * A ResponsesRunDataSource object describing a model sampling configuration.
+ */
+ export interface Completions {
/**
- * The number of completion tokens generated.
+ * A EvalResponsesSource object describing a run data source configuration.
*/
- completion_tokens: number;
+ source: Completions.FileContent | Completions.FileID | Completions.Responses;
/**
- * The number of invocations.
+ * The type of run data source. Always `completions`.
*/
- invocation_count: number;
+ type: 'completions';
- /**
- * The name of the model.
- */
- model_name: string;
+ input_messages?: Completions.Template | Completions.ItemReference;
/**
- * The number of prompt tokens used.
+ * The name of the model to use for generating completions (e.g. "o3-mini").
*/
- prompt_tokens: number;
+ model?: string;
- /**
- * The total number of tokens used.
- */
- total_tokens: number;
+ sampling_params?: Completions.SamplingParams;
}
- export interface PerTestingCriteriaResult {
- /**
- * Number of tests failed for this criteria.
- */
- failed: number;
+ export namespace Completions {
+ export interface FileContent {
+ /**
+ * The content of the jsonl file.
+ */
+ content: Array<FileContent.Content>;
- /**
- * Number of tests passed for this criteria.
- */
- passed: number;
+ /**
+ * The type of jsonl source. Always `file_content`.
+ */
+ type: 'file_content';
+ }
- /**
- * A description of the testing criteria.
- */
- testing_criteria: string;
- }
+ export namespace FileContent {
+ export interface Content {
+ item: Record<string, unknown>;
- /**
- * Counters summarizing the outcomes of the evaluation run.
- */
- export interface ResultCounts {
- /**
- * Number of output items that resulted in an error.
- */
- errored: number;
+ sample?: Record<string, unknown>;
+ }
+ }
- /**
- * Number of output items that failed to pass the evaluation.
- */
- failed: number;
+ export interface FileID {
+ /**
+ * The identifier of the file.
+ */
+ id: string;
- /**
- * Number of output items that passed the evaluation.
- */
- passed: number;
+ /**
+ * The type of jsonl source. Always `file_id`.
+ */
+ type: 'file_id';
+ }
/**
- * Total number of executed output items.
+ * A EvalResponsesSource object describing a run data source configuration.
*/
- total: number;
- }
-}
+ export interface Responses {
+ /**
+ * The type of run data source. Always `responses`.
+ */
+ type: 'responses';
-/**
- * A schema representing an evaluation run.
- */
-export interface RunRetrieveResponse {
- /**
- * Unique identifier for the evaluation run.
- */
- id: string;
+ /**
+ * Whether to allow parallel tool calls. This is a query parameter used to select
+ * responses.
+ */
+ allow_parallel_tool_calls?: boolean | null;
- /**
- * Unix timestamp (in seconds) when the evaluation run was created.
- */
- created_at: number;
+ /**
+ * Only include items created after this timestamp (inclusive). This is a query
+ * parameter used to select responses.
+ */
+ created_after?: number | null;
- /**
- * Information about the run's data source.
- */
- data_source: CreateEvalJSONLRunDataSource | CreateEvalCompletionsRunDataSource;
+ /**
+ * Only include items created before this timestamp (inclusive). This is a query
+ * parameter used to select responses.
+ */
+ created_before?: number | null;
- /**
- * An object representing an error response from the Eval API.
- */
- error: EvalAPIError;
+ /**
+ * Whether the response has tool calls. This is a query parameter used to select
+ * responses.
+ */
+ has_tool_calls?: boolean | null;
- /**
- * The identifier of the associated evaluation.
- */
- eval_id: string;
+ /**
+ * Optional search string for instructions. This is a query parameter used to
+ * select responses.
+ */
+ instructions_search?: string | null;
- /**
- * Set of 16 key-value pairs that can be attached to an object. This can be useful
- * for storing additional information about the object in a structured format, and
- * querying for objects via API or the dashboard.
- *
- * Keys are strings with a maximum length of 64 characters. Values are strings with
- * a maximum length of 512 characters.
- */
- metadata: Shared.Metadata | null;
+ /**
+ * Metadata filter for the responses. This is a query parameter used to select
+ * responses.
+ */
+ metadata?: unknown | null;
- /**
- * The model that is evaluated, if applicable.
- */
- model: string;
+ /**
+ * The name of the model to find responses for. This is a query parameter used to
+ * select responses.
+ */
+ model?: string | null;
- /**
- * The name of the evaluation run.
- */
- name: string;
+ /**
+ * Optional reasoning effort parameter. This is a query parameter used to select
+ * responses.
+ */
+ reasoning_effort?: Shared.ReasoningEffort | null;
- /**
- * The type of the object. Always "eval.run".
- */
- object: 'eval.run';
+ /**
+ * Sampling temperature. This is a query parameter used to select responses.
+ */
+ temperature?: number | null;
- /**
- * Usage statistics for each model during the evaluation run.
- */
- per_model_usage: Array<RunRetrieveResponse.PerModelUsage>;
+ /**
+ * Nucleus sampling parameter. This is a query parameter used to select responses.
+ */
+ top_p?: number | null;
- /**
- * Results per testing criteria applied during the evaluation run.
- */
- per_testing_criteria_results: Array<RunRetrieveResponse.PerTestingCriteriaResult>;
+ /**
+ * List of user identifiers. This is a query parameter used to select responses.
+ */
+ users?: Array<string> | null;
+ }
- /**
- * The URL to the rendered evaluation run report on the UI dashboard.
- */
- report_url: string;
+ export interface Template {
+ /**
+ * A list of chat messages forming the prompt or context. May include variable
+ * references to the "item" namespace, ie {{item.name}}.
+ */
+ template: Array<Template.ChatMessage | Template.EvalItem>;
- /**
- * Counters summarizing the outcomes of the evaluation run.
- */
- result_counts: RunRetrieveResponse.ResultCounts;
+ /**
+ * The type of input messages. Always `template`.
+ */
+ type: 'template';
+ }
- /**
- * The status of the evaluation run.
- */
- status: string;
-}
+ export namespace Template {
+ export interface ChatMessage {
+ /**
+ * The content of the message.
+ */
+ content: string;
+
+ /**
+ * The role of the message (e.g. "system", "assistant", "user").
+ */
+ role: string;
+ }
+
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface EvalItem {
+ /**
+ * Text inputs to the model - can contain template strings.
+ */
+ content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
+
+ /**
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
+ */
+ role: 'user' | 'assistant' | 'system' | 'developer';
+
+ /**
+ * The type of the message input. Always `message`.
+ */
+ type?: 'message';
+ }
+
+ export namespace EvalItem {
+ /**
+ * A text output from the model.
+ */
+ export interface OutputText {
+ /**
+ * The text output from the model.
+ */
+ text: string;
+
+ /**
+ * The type of the output text. Always `output_text`.
+ */
+ type: 'output_text';
+ }
+ }
+ }
+
+ export interface ItemReference {
+ /**
+ * A reference to a variable in the "item" namespace. Ie, "item.name"
+ */
+ item_reference: string;
+
+ /**
+ * The type of input messages. Always `item_reference`.
+ */
+ type: 'item_reference';
+ }
+
+ export interface SamplingParams {
+ /**
+ * The maximum number of tokens in the generated output.
+ */
+ max_completion_tokens?: number;
+
+ /**
+ * A seed value to initialize the randomness, during sampling.
+ */
+ seed?: number;
+
+ /**
+ * A higher temperature increases randomness in the outputs.
+ */
+ temperature?: number;
+
+ /**
+ * An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
+ */
+ top_p?: number;
+ }
+ }
-export namespace RunRetrieveResponse {
export interface PerModelUsage {
/**
* The number of tokens retrieved from cache.
@@ -676,7 +732,7 @@ export namespace RunRetrieveResponse {
/**
* A schema representing an evaluation run.
*/
-export interface RunListResponse {
+export interface RunRetrieveResponse {
/**
* Unique identifier for the evaluation run.
*/
@@ -690,7 +746,10 @@ export interface RunListResponse {
/**
* Information about the run's data source.
*/
- data_source: CreateEvalJSONLRunDataSource | CreateEvalCompletionsRunDataSource;
+ data_source:
+ | CreateEvalJSONLRunDataSource
+ | CreateEvalCompletionsRunDataSource
+ | RunRetrieveResponse.Completions;
/**
* An object representing an error response from the Eval API.
@@ -730,12 +789,12 @@ export interface RunListResponse {
/**
* Usage statistics for each model during the evaluation run.
*/
- per_model_usage: Array<RunListResponse.PerModelUsage>;
+ per_model_usage: Array<RunRetrieveResponse.PerModelUsage>;
/**
* Results per testing criteria applied during the evaluation run.
*/
- per_testing_criteria_results: Array<RunListResponse.PerTestingCriteriaResult>;
+ per_testing_criteria_results: Array<RunRetrieveResponse.PerTestingCriteriaResult>;
/**
* The URL to the rendered evaluation run report on the UI dashboard.
@@ -745,7 +804,7 @@ export interface RunListResponse {
/**
* Counters summarizing the outcomes of the evaluation run.
*/
- result_counts: RunListResponse.ResultCounts;
+ result_counts: RunRetrieveResponse.ResultCounts;
/**
* The status of the evaluation run.
@@ -753,7 +812,241 @@ export interface RunListResponse {
status: string;
}
-export namespace RunListResponse {
+export namespace RunRetrieveResponse {
+ /**
+ * A ResponsesRunDataSource object describing a model sampling configuration.
+ */
+ export interface Completions {
+ /**
+ * A EvalResponsesSource object describing a run data source configuration.
+ */
+ source: Completions.FileContent | Completions.FileID | Completions.Responses;
+
+ /**
+ * The type of run data source. Always `completions`.
+ */
+ type: 'completions';
+
+ input_messages?: Completions.Template | Completions.ItemReference;
+
+ /**
+ * The name of the model to use for generating completions (e.g. "o3-mini").
+ */
+ model?: string;
+
+ sampling_params?: Completions.SamplingParams;
+ }
+
+ export namespace Completions {
+ export interface FileContent {
+ /**
+ * The content of the jsonl file.
+ */
+ content: Array<FileContent.Content>;
+
+ /**
+ * The type of jsonl source. Always `file_content`.
+ */
+ type: 'file_content';
+ }
+
+ export namespace FileContent {
+ export interface Content {
+ item: Record<string, unknown>;
+
+ sample?: Record<string, unknown>;
+ }
+ }
+
+ export interface FileID {
+ /**
+ * The identifier of the file.
+ */
+ id: string;
+
+ /**
+ * The type of jsonl source. Always `file_id`.
+ */
+ type: 'file_id';
+ }
+
+ /**
+ * A EvalResponsesSource object describing a run data source configuration.
+ */
+ export interface Responses {
+ /**
+ * The type of run data source. Always `responses`.
+ */
+ type: 'responses';
+
+ /**
+ * Whether to allow parallel tool calls. This is a query parameter used to select
+ * responses.
+ */
+ allow_parallel_tool_calls?: boolean | null;
+
+ /**
+ * Only include items created after this timestamp (inclusive). This is a query
+ * parameter used to select responses.
+ */
+ created_after?: number | null;
+
+ /**
+ * Only include items created before this timestamp (inclusive). This is a query
+ * parameter used to select responses.
+ */
+ created_before?: number | null;
+
+ /**
+ * Whether the response has tool calls. This is a query parameter used to select
+ * responses.
+ */
+ has_tool_calls?: boolean | null;
+
+ /**
+ * Optional search string for instructions. This is a query parameter used to
+ * select responses.
+ */
+ instructions_search?: string | null;
+
+ /**
+ * Metadata filter for the responses. This is a query parameter used to select
+ * responses.
+ */
+ metadata?: unknown | null;
+
+ /**
+ * The name of the model to find responses for. This is a query parameter used to
+ * select responses.
+ */
+ model?: string | null;
+
+ /**
+ * Optional reasoning effort parameter. This is a query parameter used to select
+ * responses.
+ */
+ reasoning_effort?: Shared.ReasoningEffort | null;
+
+ /**
+ * Sampling temperature. This is a query parameter used to select responses.
+ */
+ temperature?: number | null;
+
+ /**
+ * Nucleus sampling parameter. This is a query parameter used to select responses.
+ */
+ top_p?: number | null;
+
+ /**
+ * List of user identifiers. This is a query parameter used to select responses.
+ */
+ users?: Array<string> | null;
+ }
+
+ export interface Template {
+ /**
+ * A list of chat messages forming the prompt or context. May include variable
+ * references to the "item" namespace, ie {{item.name}}.
+ */
+ template: Array<Template.ChatMessage | Template.EvalItem>;
+
+ /**
+ * The type of input messages. Always `template`.
+ */
+ type: 'template';
+ }
+
+ export namespace Template {
+ export interface ChatMessage {
+ /**
+ * The content of the message.
+ */
+ content: string;
+
+ /**
+ * The role of the message (e.g. "system", "assistant", "user").
+ */
+ role: string;
+ }
+
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface EvalItem {
+ /**
+ * Text inputs to the model - can contain template strings.
+ */
+ content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
+
+ /**
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
+ */
+ role: 'user' | 'assistant' | 'system' | 'developer';
+
+ /**
+ * The type of the message input. Always `message`.
+ */
+ type?: 'message';
+ }
+
+ export namespace EvalItem {
+ /**
+ * A text output from the model.
+ */
+ export interface OutputText {
+ /**
+ * The text output from the model.
+ */
+ text: string;
+
+ /**
+ * The type of the output text. Always `output_text`.
+ */
+ type: 'output_text';
+ }
+ }
+ }
+
+ export interface ItemReference {
+ /**
+ * A reference to a variable in the "item" namespace. Ie, "item.name"
+ */
+ item_reference: string;
+
+ /**
+ * The type of input messages. Always `item_reference`.
+ */
+ type: 'item_reference';
+ }
+
+ export interface SamplingParams {
+ /**
+ * The maximum number of tokens in the generated output.
+ */
+ max_completion_tokens?: number;
+
+ /**
+ * A seed value to initialize the randomness, during sampling.
+ */
+ seed?: number;
+
+ /**
+ * A higher temperature increases randomness in the outputs.
+ */
+ temperature?: number;
+
+ /**
+ * An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
+ */
+ top_p?: number;
+ }
+ }
+
export interface PerModelUsage {
/**
* The number of tokens retrieved from cache.
@@ -829,18 +1122,10 @@ export namespace RunListResponse {
}
}
-export interface RunDeleteResponse {
- deleted?: boolean;
-
- object?: string;
-
- run_id?: string;
-}
-
/**
* A schema representing an evaluation run.
*/
-export interface RunCancelResponse {
+export interface RunListResponse {
/**
* Unique identifier for the evaluation run.
*/
@@ -854,7 +1139,10 @@ export interface RunCancelResponse {
/**
* Information about the run's data source.
*/
- data_source: CreateEvalJSONLRunDataSource | CreateEvalCompletionsRunDataSource;
+ data_source:
+ | CreateEvalJSONLRunDataSource
+ | CreateEvalCompletionsRunDataSource
+ | RunListResponse.Completions;
/**
* An object representing an error response from the Eval API.
@@ -894,12 +1182,12 @@ export interface RunCancelResponse {
/**
* Usage statistics for each model during the evaluation run.
*/
- per_model_usage: Array<RunCancelResponse.PerModelUsage>;
+ per_model_usage: Array<RunListResponse.PerModelUsage>;
/**
* Results per testing criteria applied during the evaluation run.
*/
- per_testing_criteria_results: Array<RunCancelResponse.PerTestingCriteriaResult>;
+ per_testing_criteria_results: Array<RunListResponse.PerTestingCriteriaResult>;
/**
* The URL to the rendered evaluation run report on the UI dashboard.
@@ -909,7 +1197,7 @@ export interface RunCancelResponse {
/**
* Counters summarizing the outcomes of the evaluation run.
*/
- result_counts: RunCancelResponse.ResultCounts;
+ result_counts: RunListResponse.ResultCounts;
/**
* The status of the evaluation run.
@@ -917,25 +1205,660 @@ export interface RunCancelResponse {
status: string;
}
-export namespace RunCancelResponse {
- export interface PerModelUsage {
+export namespace RunListResponse {
+ /**
+ * A ResponsesRunDataSource object describing a model sampling configuration.
+ */
+ export interface Completions {
/**
- * The number of tokens retrieved from cache.
+ * A EvalResponsesSource object describing a run data source configuration.
*/
- cached_tokens: number;
+ source: Completions.FileContent | Completions.FileID | Completions.Responses;
/**
- * The number of completion tokens generated.
+ * The type of run data source. Always `completions`.
*/
- completion_tokens: number;
+ type: 'completions';
- /**
- * The number of invocations.
- */
- invocation_count: number;
+ input_messages?: Completions.Template | Completions.ItemReference;
/**
- * The name of the model.
+ * The name of the model to use for generating completions (e.g. "o3-mini").
+ */
+ model?: string;
+
+ sampling_params?: Completions.SamplingParams;
+ }
+
+ export namespace Completions {
+ export interface FileContent {
+ /**
+ * The content of the jsonl file.
+ */
+ content: Array<FileContent.Content>;
+
+ /**
+ * The type of jsonl source. Always `file_content`.
+ */
+ type: 'file_content';
+ }
+
+ export namespace FileContent {
+ export interface Content {
+ item: Record<string, unknown>;
+
+ sample?: Record<string, unknown>;
+ }
+ }
+
+ export interface FileID {
+ /**
+ * The identifier of the file.
+ */
+ id: string;
+
+ /**
+ * The type of jsonl source. Always `file_id`.
+ */
+ type: 'file_id';
+ }
+
+ /**
+ * A EvalResponsesSource object describing a run data source configuration.
+ */
+ export interface Responses {
+ /**
+ * The type of run data source. Always `responses`.
+ */
+ type: 'responses';
+
+ /**
+ * Whether to allow parallel tool calls. This is a query parameter used to select
+ * responses.
+ */
+ allow_parallel_tool_calls?: boolean | null;
+
+ /**
+ * Only include items created after this timestamp (inclusive). This is a query
+ * parameter used to select responses.
+ */
+ created_after?: number | null;
+
+ /**
+ * Only include items created before this timestamp (inclusive). This is a query
+ * parameter used to select responses.
+ */
+ created_before?: number | null;
+
+ /**
+ * Whether the response has tool calls. This is a query parameter used to select
+ * responses.
+ */
+ has_tool_calls?: boolean | null;
+
+ /**
+ * Optional search string for instructions. This is a query parameter used to
+ * select responses.
+ */
+ instructions_search?: string | null;
+
+ /**
+ * Metadata filter for the responses. This is a query parameter used to select
+ * responses.
+ */
+ metadata?: unknown | null;
+
+ /**
+ * The name of the model to find responses for. This is a query parameter used to
+ * select responses.
+ */
+ model?: string | null;
+
+ /**
+ * Optional reasoning effort parameter. This is a query parameter used to select
+ * responses.
+ */
+ reasoning_effort?: Shared.ReasoningEffort | null;
+
+ /**
+ * Sampling temperature. This is a query parameter used to select responses.
+ */
+ temperature?: number | null;
+
+ /**
+ * Nucleus sampling parameter. This is a query parameter used to select responses.
+ */
+ top_p?: number | null;
+
+ /**
+ * List of user identifiers. This is a query parameter used to select responses.
+ */
+ users?: Array<string> | null;
+ }
+
+ export interface Template {
+ /**
+ * A list of chat messages forming the prompt or context. May include variable
+ * references to the "item" namespace, ie {{item.name}}.
+ */
+ template: Array<Template.ChatMessage | Template.EvalItem>;
+
+ /**
+ * The type of input messages. Always `template`.
+ */
+ type: 'template';
+ }
+
+ export namespace Template {
+ export interface ChatMessage {
+ /**
+ * The content of the message.
+ */
+ content: string;
+
+ /**
+ * The role of the message (e.g. "system", "assistant", "user").
+ */
+ role: string;
+ }
+
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface EvalItem {
+ /**
+ * Text inputs to the model - can contain template strings.
+ */
+ content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
+
+ /**
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
+ */
+ role: 'user' | 'assistant' | 'system' | 'developer';
+
+ /**
+ * The type of the message input. Always `message`.
+ */
+ type?: 'message';
+ }
+
+ export namespace EvalItem {
+ /**
+ * A text output from the model.
+ */
+ export interface OutputText {
+ /**
+ * The text output from the model.
+ */
+ text: string;
+
+ /**
+ * The type of the output text. Always `output_text`.
+ */
+ type: 'output_text';
+ }
+ }
+ }
+
+ export interface ItemReference {
+ /**
+ * A reference to a variable in the "item" namespace. Ie, "item.name"
+ */
+ item_reference: string;
+
+ /**
+ * The type of input messages. Always `item_reference`.
+ */
+ type: 'item_reference';
+ }
+
+ export interface SamplingParams {
+ /**
+ * The maximum number of tokens in the generated output.
+ */
+ max_completion_tokens?: number;
+
+ /**
+ * A seed value to initialize the randomness, during sampling.
+ */
+ seed?: number;
+
+ /**
+ * A higher temperature increases randomness in the outputs.
+ */
+ temperature?: number;
+
+ /**
+ * An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
+ */
+ top_p?: number;
+ }
+ }
+
+ export interface PerModelUsage {
+ /**
+ * The number of tokens retrieved from cache.
+ */
+ cached_tokens: number;
+
+ /**
+ * The number of completion tokens generated.
+ */
+ completion_tokens: number;
+
+ /**
+ * The number of invocations.
+ */
+ invocation_count: number;
+
+ /**
+ * The name of the model.
+ */
+ model_name: string;
+
+ /**
+ * The number of prompt tokens used.
+ */
+ prompt_tokens: number;
+
+ /**
+ * The total number of tokens used.
+ */
+ total_tokens: number;
+ }
+
+ export interface PerTestingCriteriaResult {
+ /**
+ * Number of tests failed for this criteria.
+ */
+ failed: number;
+
+ /**
+ * Number of tests passed for this criteria.
+ */
+ passed: number;
+
+ /**
+ * A description of the testing criteria.
+ */
+ testing_criteria: string;
+ }
+
+ /**
+ * Counters summarizing the outcomes of the evaluation run.
+ */
+ export interface ResultCounts {
+ /**
+ * Number of output items that resulted in an error.
+ */
+ errored: number;
+
+ /**
+ * Number of output items that failed to pass the evaluation.
+ */
+ failed: number;
+
+ /**
+ * Number of output items that passed the evaluation.
+ */
+ passed: number;
+
+ /**
+ * Total number of executed output items.
+ */
+ total: number;
+ }
+}
+
+export interface RunDeleteResponse {
+ deleted?: boolean;
+
+ object?: string;
+
+ run_id?: string;
+}
+
+/**
+ * A schema representing an evaluation run.
+ */
+export interface RunCancelResponse {
+ /**
+ * Unique identifier for the evaluation run.
+ */
+ id: string;
+
+ /**
+ * Unix timestamp (in seconds) when the evaluation run was created.
+ */
+ created_at: number;
+
+ /**
+ * Information about the run's data source.
+ */
+ data_source:
+ | CreateEvalJSONLRunDataSource
+ | CreateEvalCompletionsRunDataSource
+ | RunCancelResponse.Completions;
+
+ /**
+ * An object representing an error response from the Eval API.
+ */
+ error: EvalAPIError;
+
+ /**
+ * The identifier of the associated evaluation.
+ */
+ eval_id: string;
+
+ /**
+ * Set of 16 key-value pairs that can be attached to an object. This can be useful
+ * for storing additional information about the object in a structured format, and
+ * querying for objects via API or the dashboard.
+ *
+ * Keys are strings with a maximum length of 64 characters. Values are strings with
+ * a maximum length of 512 characters.
+ */
+ metadata: Shared.Metadata | null;
+
+ /**
+ * The model that is evaluated, if applicable.
+ */
+ model: string;
+
+ /**
+ * The name of the evaluation run.
+ */
+ name: string;
+
+ /**
+ * The type of the object. Always "eval.run".
+ */
+ object: 'eval.run';
+
+ /**
+ * Usage statistics for each model during the evaluation run.
+ */
+ per_model_usage: Array<RunCancelResponse.PerModelUsage>;
+
+ /**
+ * Results per testing criteria applied during the evaluation run.
+ */
+ per_testing_criteria_results: Array<RunCancelResponse.PerTestingCriteriaResult>;
+
+ /**
+ * The URL to the rendered evaluation run report on the UI dashboard.
+ */
+ report_url: string;
+
+ /**
+ * Counters summarizing the outcomes of the evaluation run.
+ */
+ result_counts: RunCancelResponse.ResultCounts;
+
+ /**
+ * The status of the evaluation run.
+ */
+ status: string;
+}
+
+export namespace RunCancelResponse {
+ /**
+ * A ResponsesRunDataSource object describing a model sampling configuration.
+ */
+ export interface Completions {
+ /**
+ * A EvalResponsesSource object describing a run data source configuration.
+ */
+ source: Completions.FileContent | Completions.FileID | Completions.Responses;
+
+ /**
+ * The type of run data source. Always `completions`.
+ */
+ type: 'completions';
+
+ input_messages?: Completions.Template | Completions.ItemReference;
+
+ /**
+ * The name of the model to use for generating completions (e.g. "o3-mini").
+ */
+ model?: string;
+
+ sampling_params?: Completions.SamplingParams;
+ }
+
+ export namespace Completions {
+ export interface FileContent {
+ /**
+ * The content of the jsonl file.
+ */
+ content: Array<FileContent.Content>;
+
+ /**
+ * The type of jsonl source. Always `file_content`.
+ */
+ type: 'file_content';
+ }
+
+ export namespace FileContent {
+ export interface Content {
+ item: Record<string, unknown>;
+
+ sample?: Record<string, unknown>;
+ }
+ }
+
+ export interface FileID {
+ /**
+ * The identifier of the file.
+ */
+ id: string;
+
+ /**
+ * The type of jsonl source. Always `file_id`.
+ */
+ type: 'file_id';
+ }
+
+ /**
+ * A EvalResponsesSource object describing a run data source configuration.
+ */
+ export interface Responses {
+ /**
+ * The type of run data source. Always `responses`.
+ */
+ type: 'responses';
+
+ /**
+ * Whether to allow parallel tool calls. This is a query parameter used to select
+ * responses.
+ */
+ allow_parallel_tool_calls?: boolean | null;
+
+ /**
+ * Only include items created after this timestamp (inclusive). This is a query
+ * parameter used to select responses.
+ */
+ created_after?: number | null;
+
+ /**
+ * Only include items created before this timestamp (inclusive). This is a query
+ * parameter used to select responses.
+ */
+ created_before?: number | null;
+
+ /**
+ * Whether the response has tool calls. This is a query parameter used to select
+ * responses.
+ */
+ has_tool_calls?: boolean | null;
+
+ /**
+ * Optional search string for instructions. This is a query parameter used to
+ * select responses.
+ */
+ instructions_search?: string | null;
+
+ /**
+ * Metadata filter for the responses. This is a query parameter used to select
+ * responses.
+ */
+ metadata?: unknown | null;
+
+ /**
+ * The name of the model to find responses for. This is a query parameter used to
+ * select responses.
+ */
+ model?: string | null;
+
+ /**
+ * Optional reasoning effort parameter. This is a query parameter used to select
+ * responses.
+ */
+ reasoning_effort?: Shared.ReasoningEffort | null;
+
+ /**
+ * Sampling temperature. This is a query parameter used to select responses.
+ */
+ temperature?: number | null;
+
+ /**
+ * Nucleus sampling parameter. This is a query parameter used to select responses.
+ */
+ top_p?: number | null;
+
+ /**
+ * List of user identifiers. This is a query parameter used to select responses.
+ */
+ users?: Array<string> | null;
+ }
+
+ export interface Template {
+ /**
+ * A list of chat messages forming the prompt or context. May include variable
+ * references to the "item" namespace, ie {{item.name}}.
+ */
+ template: Array<Template.ChatMessage | Template.EvalItem>;
+
+ /**
+ * The type of input messages. Always `template`.
+ */
+ type: 'template';
+ }
+
+ export namespace Template {
+ export interface ChatMessage {
+ /**
+ * The content of the message.
+ */
+ content: string;
+
+ /**
+ * The role of the message (e.g. "system", "assistant", "user").
+ */
+ role: string;
+ }
+
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface EvalItem {
+ /**
+ * Text inputs to the model - can contain template strings.
+ */
+ content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
+
+ /**
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
+ */
+ role: 'user' | 'assistant' | 'system' | 'developer';
+
+ /**
+ * The type of the message input. Always `message`.
+ */
+ type?: 'message';
+ }
+
+ export namespace EvalItem {
+ /**
+ * A text output from the model.
+ */
+ export interface OutputText {
+ /**
+ * The text output from the model.
+ */
+ text: string;
+
+ /**
+ * The type of the output text. Always `output_text`.
+ */
+ type: 'output_text';
+ }
+ }
+ }
+
+ export interface ItemReference {
+ /**
+ * A reference to a variable in the "item" namespace. Ie, "item.name"
+ */
+ item_reference: string;
+
+ /**
+ * The type of input messages. Always `item_reference`.
+ */
+ type: 'item_reference';
+ }
+
+ export interface SamplingParams {
+ /**
+ * The maximum number of tokens in the generated output.
+ */
+ max_completion_tokens?: number;
+
+ /**
+ * A seed value to initialize the randomness, during sampling.
+ */
+ seed?: number;
+
+ /**
+ * A higher temperature increases randomness in the outputs.
+ */
+ temperature?: number;
+
+ /**
+ * An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
+ */
+ top_p?: number;
+ }
+ }
+
+ export interface PerModelUsage {
+ /**
+ * The number of tokens retrieved from cache.
+ */
+ cached_tokens: number;
+
+ /**
+ * The number of completion tokens generated.
+ */
+ completion_tokens: number;
+
+ /**
+ * The number of invocations.
+ */
+ invocation_count: number;
+
+ /**
+ * The name of the model.
*/
model_name: string;
@@ -997,7 +1920,10 @@ export interface RunCreateParams {
/**
* Details about the run's data source.
*/
- data_source: CreateEvalJSONLRunDataSource | CreateEvalCompletionsRunDataSource;
+ data_source:
+ | CreateEvalJSONLRunDataSource
+ | CreateEvalCompletionsRunDataSource
+ | RunCreateParams.CreateEvalResponsesRunDataSource;
/**
* Set of 16 key-value pairs that can be attached to an object. This can be useful
@@ -1015,6 +1941,247 @@ export interface RunCreateParams {
name?: string;
}
+export namespace RunCreateParams {
+ /**
+ * A ResponsesRunDataSource object describing a model sampling configuration.
+ */
+ export interface CreateEvalResponsesRunDataSource {
+ /**
+ * A EvalResponsesSource object describing a run data source configuration.
+ */
+ source:
+ | CreateEvalResponsesRunDataSource.FileContent
+ | CreateEvalResponsesRunDataSource.FileID
+ | CreateEvalResponsesRunDataSource.Responses;
+
+ /**
+ * The type of run data source. Always `completions`.
+ */
+ type: 'completions';
+
+ input_messages?:
+ | CreateEvalResponsesRunDataSource.Template
+ | CreateEvalResponsesRunDataSource.ItemReference;
+
+ /**
+ * The name of the model to use for generating completions (e.g. "o3-mini").
+ */
+ model?: string;
+
+ sampling_params?: CreateEvalResponsesRunDataSource.SamplingParams;
+ }
+
+ export namespace CreateEvalResponsesRunDataSource {
+ export interface FileContent {
+ /**
+ * The content of the jsonl file.
+ */
+ content: Array<FileContent.Content>;
+
+ /**
+ * The type of jsonl source. Always `file_content`.
+ */
+ type: 'file_content';
+ }
+
+ export namespace FileContent {
+ export interface Content {
+ item: Record<string, unknown>;
+
+ sample?: Record<string, unknown>;
+ }
+ }
+
+ export interface FileID {
+ /**
+ * The identifier of the file.
+ */
+ id: string;
+
+ /**
+ * The type of jsonl source. Always `file_id`.
+ */
+ type: 'file_id';
+ }
+
+ /**
+ * A EvalResponsesSource object describing a run data source configuration.
+ */
+ export interface Responses {
+ /**
+ * The type of run data source. Always `responses`.
+ */
+ type: 'responses';
+
+ /**
+ * Whether to allow parallel tool calls. This is a query parameter used to select
+ * responses.
+ */
+ allow_parallel_tool_calls?: boolean | null;
+
+ /**
+ * Only include items created after this timestamp (inclusive). This is a query
+ * parameter used to select responses.
+ */
+ created_after?: number | null;
+
+ /**
+ * Only include items created before this timestamp (inclusive). This is a query
+ * parameter used to select responses.
+ */
+ created_before?: number | null;
+
+ /**
+ * Whether the response has tool calls. This is a query parameter used to select
+ * responses.
+ */
+ has_tool_calls?: boolean | null;
+
+ /**
+ * Optional search string for instructions. This is a query parameter used to
+ * select responses.
+ */
+ instructions_search?: string | null;
+
+ /**
+ * Metadata filter for the responses. This is a query parameter used to select
+ * responses.
+ */
+ metadata?: unknown | null;
+
+ /**
+ * The name of the model to find responses for. This is a query parameter used to
+ * select responses.
+ */
+ model?: string | null;
+
+ /**
+ * Optional reasoning effort parameter. This is a query parameter used to select
+ * responses.
+ */
+ reasoning_effort?: Shared.ReasoningEffort | null;
+
+ /**
+ * Sampling temperature. This is a query parameter used to select responses.
+ */
+ temperature?: number | null;
+
+ /**
+ * Nucleus sampling parameter. This is a query parameter used to select responses.
+ */
+ top_p?: number | null;
+
+ /**
+ * List of user identifiers. This is a query parameter used to select responses.
+ */
+ users?: Array<string> | null;
+ }
+
+ export interface Template {
+ /**
+ * A list of chat messages forming the prompt or context. May include variable
+ * references to the "item" namespace, ie {{item.name}}.
+ */
+ template: Array<Template.ChatMessage | Template.EvalItem>;
+
+ /**
+ * The type of input messages. Always `template`.
+ */
+ type: 'template';
+ }
+
+ export namespace Template {
+ export interface ChatMessage {
+ /**
+ * The content of the message.
+ */
+ content: string;
+
+ /**
+ * The role of the message (e.g. "system", "assistant", "user").
+ */
+ role: string;
+ }
+
+ /**
+ * A message input to the model with a role indicating instruction following
+ * hierarchy. Instructions given with the `developer` or `system` role take
+ * precedence over instructions given with the `user` role. Messages with the
+ * `assistant` role are presumed to have been generated by the model in previous
+ * interactions.
+ */
+ export interface EvalItem {
+ /**
+ * Text inputs to the model - can contain template strings.
+ */
+ content: string | ResponsesAPI.ResponseInputText | EvalItem.OutputText;
+
+ /**
+ * The role of the message input. One of `user`, `assistant`, `system`, or
+ * `developer`.
+ */
+ role: 'user' | 'assistant' | 'system' | 'developer';
+
+ /**
+ * The type of the message input. Always `message`.
+ */
+ type?: 'message';
+ }
+
+ export namespace EvalItem {
+ /**
+ * A text output from the model.
+ */
+ export interface OutputText {
+ /**
+ * The text output from the model.
+ */
+ text: string;
+
+ /**
+ * The type of the output text. Always `output_text`.
+ */
+ type: 'output_text';
+ }
+ }
+ }
+
+ export interface ItemReference {
+ /**
+ * A reference to a variable in the "item" namespace. Ie, "item.name"
+ */
+ item_reference: string;
+
+ /**
+ * The type of input messages. Always `item_reference`.
+ */
+ type: 'item_reference';
+ }
+
+ export interface SamplingParams {
+ /**
+ * The maximum number of tokens in the generated output.
+ */
+ max_completion_tokens?: number;
+
+ /**
+ * A seed value to initialize the randomness, during sampling.
+ */
+ seed?: number;
+
+ /**
+ * A higher temperature increases randomness in the outputs.
+ */
+ temperature?: number;
+
+ /**
+ * An alternative to temperature for nucleus sampling; 1.0 includes all tokens.
+ */
+ top_p?: number;
+ }
+ }
+}
+
export interface RunListParams extends CursorPageParams {
/**
* Sort order for runs by timestamp. Use `asc` for ascending order or `desc` for
@@ -1023,8 +2190,8 @@ export interface RunListParams extends CursorPageParams {
order?: 'asc' | 'desc';
/**
- * Filter runs by status. Use "queued" | "in_progress" | "failed" | "completed" |
- * "canceled".
+ * Filter runs by status. One of `queued` | `in_progress` | `failed` | `completed`
+ * | `canceled`.
*/
status?: 'queued' | 'in_progress' | 'completed' | 'canceled' | 'failed';
}
diff --git a/src/resources/fine-tuning/checkpoints/permissions.ts b/src/resources/fine-tuning/checkpoints/permissions.ts
index 500c3de81..e808b2001 100644
--- a/src/resources/fine-tuning/checkpoints/permissions.ts
+++ b/src/resources/fine-tuning/checkpoints/permissions.ts
@@ -61,9 +61,13 @@ export class Permissions extends APIResource {
*/
del(
fineTunedModelCheckpoint: string,
+ permissionId: string,
options?: Core.RequestOptions,
): Core.APIPromise<PermissionDeleteResponse> {
- return this._client.delete(`/fine_tuning/checkpoints/${fineTunedModelCheckpoint}/permissions`, options);
+ return this._client.delete(
+ `/fine_tuning/checkpoints/${fineTunedModelCheckpoint}/permissions/${permissionId}`,
+ options,
+ );
}
}
diff --git a/src/resources/images.ts b/src/resources/images.ts
index 8e1c6d92e..de1882d30 100644
--- a/src/resources/images.ts
+++ b/src/resources/images.ts
@@ -5,7 +5,7 @@ import * as Core from '../core';
export class Images extends APIResource {
/**
- * Creates a variation of a given image.
+ * Creates a variation of a given image. This endpoint only supports `dall-e-2`.
*/
createVariation(
body: ImageCreateVariationParams,
@@ -15,7 +15,8 @@ export class Images extends APIResource {
}
/**
- * Creates an edited or extended image given an original image and a prompt.
+ * Creates an edited or extended image given one or more source images and a
+ * prompt. This endpoint only supports `gpt-image-1` and `dall-e-2`.
*/
edit(body: ImageEditParams, options?: Core.RequestOptions): Core.APIPromise<ImagesResponse> {
return this._client.post('/images/edits', Core.multipartFormRequestOptions({ body, ...options }));
@@ -23,6 +24,7 @@ export class Images extends APIResource {
/**
* Creates an image given a prompt.
+ * [Learn more](https://platform.openai.com/docs/guides/images).
*/
generate(body: ImageGenerateParams, options?: Core.RequestOptions): Core.APIPromise<ImagesResponse> {
return this._client.post('/images/generations', { body, ...options });
@@ -30,33 +32,93 @@ export class Images extends APIResource {
}
/**
- * Represents the url or the content of an image generated by the OpenAI API.
+ * Represents the content or the URL of an image generated by the OpenAI API.
*/
export interface Image {
/**
- * The base64-encoded JSON of the generated image, if `response_format` is
- * `b64_json`.
+ * The base64-encoded JSON of the generated image. Default value for `gpt-image-1`,
+ * and only present if `response_format` is set to `b64_json` for `dall-e-2` and
+ * `dall-e-3`.
*/
b64_json?: string;
/**
- * The prompt that was used to generate the image, if there was any revision to the
- * prompt.
+ * For `dall-e-3` only, the revised prompt that was used to generate the image.
*/
revised_prompt?: string;
/**
- * The URL of the generated image, if `response_format` is `url` (default).
+ * When using `dall-e-2` or `dall-e-3`, the URL of the generated image if
+ * `response_format` is set to `url` (default value). Unsupported for
+ * `gpt-image-1`.
*/
url?: string;
}
-export type ImageModel = 'dall-e-2' | 'dall-e-3';
+export type ImageModel = 'dall-e-2' | 'dall-e-3' | 'gpt-image-1';
+/**
+ * The response from the image generation endpoint.
+ */
export interface ImagesResponse {
+ /**
+ * The Unix timestamp (in seconds) of when the image was created.
+ */
created: number;
- data: Array<Image>;
+ /**
+ * The list of generated images.
+ */
+ data?: Array<Image>;
+
+ /**
+ * For `gpt-image-1` only, the token usage information for the image generation.
+ */
+ usage?: ImagesResponse.Usage;
+}
+
+export namespace ImagesResponse {
+ /**
+ * For `gpt-image-1` only, the token usage information for the image generation.
+ */
+ export interface Usage {
+ /**
+ * The number of tokens (images and text) in the input prompt.
+ */
+ input_tokens: number;
+
+ /**
+ * The input tokens detailed information for the image generation.
+ */
+ input_tokens_details: Usage.InputTokensDetails;
+
+ /**
+ * The number of image tokens in the output image.
+ */
+ output_tokens: number;
+
+ /**
+ * The total number of tokens (images and text) used for the image generation.
+ */
+ total_tokens: number;
+ }
+
+ export namespace Usage {
+ /**
+ * The input tokens detailed information for the image generation.
+ */
+ export interface InputTokensDetails {
+ /**
+ * The number of image tokens in the input prompt.
+ */
+ image_tokens: number;
+
+ /**
+ * The number of text tokens in the input prompt.
+ */
+ text_tokens: number;
+ }
+ }
}
export interface ImageCreateVariationParams {
@@ -73,8 +135,7 @@ export interface ImageCreateVariationParams {
model?: (string & {}) | ImageModel | null;
/**
- * The number of images to generate. Must be between 1 and 10. For `dall-e-3`, only
- * `n=1` is supported.
+ * The number of images to generate. Must be between 1 and 10.
*/
n?: number | null;
@@ -101,27 +162,31 @@ export interface ImageCreateVariationParams {
export interface ImageEditParams {
/**
- * The image to edit. Must be a valid PNG file, less than 4MB, and square. If mask
- * is not provided, image must have transparency, which will be used as the mask.
+ * The image(s) to edit. Must be a supported image file or an array of images. For
+ * `gpt-image-1`, each image should be a `png`, `webp`, or `jpg` file less than
+ * 25MB. For `dall-e-2`, you can only provide one image, and it should be a square
+ * `png` file less than 4MB.
*/
- image: Core.Uploadable;
+ image: Core.Uploadable | Array<Core.Uploadable>;
/**
* A text description of the desired image(s). The maximum length is 1000
- * characters.
+ * characters for `dall-e-2`, and 32000 characters for `gpt-image-1`.
*/
prompt: string;
/**
* An additional image whose fully transparent areas (e.g. where alpha is zero)
- * indicate where `image` should be edited. Must be a valid PNG file, less than
+ * indicate where `image` should be edited. If there are multiple images provided,
+ * the mask will be applied on the first image. Must be a valid PNG file, less than
* 4MB, and have the same dimensions as `image`.
*/
mask?: Core.Uploadable;
/**
- * The model to use for image generation. Only `dall-e-2` is supported at this
- * time.
+ * The model to use for image generation. Only `dall-e-2` and `gpt-image-1` are
+ * supported. Defaults to `dall-e-2` unless a parameter specific to `gpt-image-1`
+ * is used.
*/
model?: (string & {}) | ImageModel | null;
@@ -130,16 +195,25 @@ export interface ImageEditParams {
*/
n?: number | null;
+ /**
+ * The quality of the image that will be generated. `high`, `medium` and `low` are
+ * only supported for `gpt-image-1`. `dall-e-2` only supports `standard` quality.
+ * Defaults to `auto`.
+ */
+ quality?: 'standard' | 'low' | 'medium' | 'high' | 'auto' | null;
+
/**
* The format in which the generated images are returned. Must be one of `url` or
* `b64_json`. URLs are only valid for 60 minutes after the image has been
- * generated.
+ * generated. This parameter is only supported for `dall-e-2`, as `gpt-image-1`
+ * will always return base64-encoded images.
*/
response_format?: 'url' | 'b64_json' | null;
/**
- * The size of the generated images. Must be one of `256x256`, `512x512`, or
- * `1024x1024`.
+ * The size of the generated images. Must be one of `1024x1024`, `1536x1024`
+ * (landscape), `1024x1536` (portrait), or `auto` (default value) for
+ * `gpt-image-1`, and one of `256x256`, `512x512`, or `1024x1024` for `dall-e-2`.
*/
size?: '256x256' | '512x512' | '1024x1024' | null;
@@ -153,16 +227,36 @@ export interface ImageEditParams {
export interface ImageGenerateParams {
/**
- * A text description of the desired image(s). The maximum length is 1000
- * characters for `dall-e-2` and 4000 characters for `dall-e-3`.
+ * A text description of the desired image(s). The maximum length is 32000
+ * characters for `gpt-image-1`, 1000 characters for `dall-e-2` and 4000 characters
+ * for `dall-e-3`.
*/
prompt: string;
/**
- * The model to use for image generation.
+ * Allows to set transparency for the background of the generated image(s). This
+ * parameter is only supported for `gpt-image-1`. Must be one of `transparent`,
+ * `opaque` or `auto` (default value). When `auto` is used, the model will
+ * automatically determine the best background for the image.
+ *
+ * If `transparent`, the output format needs to support transparency, so it should
+ * be set to either `png` (default value) or `webp`.
+ */
+ background?: 'transparent' | 'opaque' | 'auto' | null;
+
+ /**
+ * The model to use for image generation. One of `dall-e-2`, `dall-e-3`, or
+ * `gpt-image-1`. Defaults to `dall-e-2` unless a parameter specific to
+ * `gpt-image-1` is used.
*/
model?: (string & {}) | ImageModel | null;
+ /**
+ * Control the content-moderation level for images generated by `gpt-image-1`. Must
+ * be either `low` for less restrictive filtering or `auto` (default value).
+ */
+ moderation?: 'low' | 'auto' | null;
+
/**
* The number of images to generate. Must be between 1 and 10. For `dall-e-3`, only
* `n=1` is supported.
@@ -170,31 +264,59 @@ export interface ImageGenerateParams {
n?: number | null;
/**
- * The quality of the image that will be generated. `hd` creates images with finer
- * details and greater consistency across the image. This param is only supported
- * for `dall-e-3`.
+ * The compression level (0-100%) for the generated images. This parameter is only
+ * supported for `gpt-image-1` with the `webp` or `jpeg` output formats, and
+ * defaults to 100.
*/
- quality?: 'standard' | 'hd';
+ output_compression?: number | null;
/**
- * The format in which the generated images are returned. Must be one of `url` or
- * `b64_json`. URLs are only valid for 60 minutes after the image has been
- * generated.
+ * The format in which the generated images are returned. This parameter is only
+ * supported for `gpt-image-1`. Must be one of `png`, `jpeg`, or `webp`.
+ */
+ output_format?: 'png' | 'jpeg' | 'webp' | null;
+
+ /**
+ * The quality of the image that will be generated.
+ *
+ * - `auto` (default value) will automatically select the best quality for the
+ * given model.
+ * - `high`, `medium` and `low` are supported for `gpt-image-1`.
+ * - `hd` and `standard` are supported for `dall-e-3`.
+ * - `standard` is the only option for `dall-e-2`.
+ */
+ quality?: 'standard' | 'hd' | 'low' | 'medium' | 'high' | 'auto' | null;
+
+ /**
+ * The format in which generated images with `dall-e-2` and `dall-e-3` are
+ * returned. Must be one of `url` or `b64_json`. URLs are only valid for 60 minutes
+ * after the image has been generated. This parameter isn't supported for
+ * `gpt-image-1` which will always return base64-encoded images.
*/
response_format?: 'url' | 'b64_json' | null;
/**
- * The size of the generated images. Must be one of `256x256`, `512x512`, or
- * `1024x1024` for `dall-e-2`. Must be one of `1024x1024`, `1792x1024`, or
- * `1024x1792` for `dall-e-3` models.
+ * The size of the generated images. Must be one of `1024x1024`, `1536x1024`
+ * (landscape), `1024x1536` (portrait), or `auto` (default value) for
+ * `gpt-image-1`, one of `256x256`, `512x512`, or `1024x1024` for `dall-e-2`, and
+ * one of `1024x1024`, `1792x1024`, or `1024x1792` for `dall-e-3`.
*/
- size?: '256x256' | '512x512' | '1024x1024' | '1792x1024' | '1024x1792' | null;
+ size?:
+ | 'auto'
+ | '1024x1024'
+ | '1536x1024'
+ | '1024x1536'
+ | '256x256'
+ | '512x512'
+ | '1792x1024'
+ | '1024x1792'
+ | null;
/**
- * The style of the generated images. Must be one of `vivid` or `natural`. Vivid
- * causes the model to lean towards generating hyper-real and dramatic images.
- * Natural causes the model to produce more natural, less hyper-real looking
- * images. This param is only supported for `dall-e-3`.
+ * The style of the generated images. This parameter is only supported for
+ * `dall-e-3`. Must be one of `vivid` or `natural`. Vivid causes the model to lean
+ * towards generating hyper-real and dramatic images. Natural causes the model to
+ * produce more natural, less hyper-real looking images.
*/
style?: 'vivid' | 'natural' | null;
diff --git a/src/resources/responses/responses.ts b/src/resources/responses/responses.ts
index 52dd079fc..771b8daf2 100644
--- a/src/resources/responses/responses.ts
+++ b/src/resources/responses/responses.ts
@@ -2158,6 +2158,160 @@ export namespace ResponseReasoningItem {
}
}
+/**
+ * Emitted when a new reasoning summary part is added.
+ */
+export interface ResponseReasoningSummaryPartAddedEvent {
+ /**
+ * The ID of the item this summary part is associated with.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item this summary part is associated with.
+ */
+ output_index: number;
+
+ /**
+ * The summary part that was added.
+ */
+ part: ResponseReasoningSummaryPartAddedEvent.Part;
+
+ /**
+ * The index of the summary part within the reasoning summary.
+ */
+ summary_index: number;
+
+ /**
+ * The type of the event. Always `response.reasoning_summary_part.added`.
+ */
+ type: 'response.reasoning_summary_part.added';
+}
+
+export namespace ResponseReasoningSummaryPartAddedEvent {
+ /**
+ * The summary part that was added.
+ */
+ export interface Part {
+ /**
+ * The text of the summary part.
+ */
+ text: string;
+
+ /**
+ * The type of the summary part. Always `summary_text`.
+ */
+ type: 'summary_text';
+ }
+}
+
+/**
+ * Emitted when a reasoning summary part is completed.
+ */
+export interface ResponseReasoningSummaryPartDoneEvent {
+ /**
+ * The ID of the item this summary part is associated with.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item this summary part is associated with.
+ */
+ output_index: number;
+
+ /**
+ * The completed summary part.
+ */
+ part: ResponseReasoningSummaryPartDoneEvent.Part;
+
+ /**
+ * The index of the summary part within the reasoning summary.
+ */
+ summary_index: number;
+
+ /**
+ * The type of the event. Always `response.reasoning_summary_part.done`.
+ */
+ type: 'response.reasoning_summary_part.done';
+}
+
+export namespace ResponseReasoningSummaryPartDoneEvent {
+ /**
+ * The completed summary part.
+ */
+ export interface Part {
+ /**
+ * The text of the summary part.
+ */
+ text: string;
+
+ /**
+ * The type of the summary part. Always `summary_text`.
+ */
+ type: 'summary_text';
+ }
+}
+
+/**
+ * Emitted when a delta is added to a reasoning summary text.
+ */
+export interface ResponseReasoningSummaryTextDeltaEvent {
+ /**
+ * The text delta that was added to the summary.
+ */
+ delta: string;
+
+ /**
+ * The ID of the item this summary text delta is associated with.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item this summary text delta is associated with.
+ */
+ output_index: number;
+
+ /**
+ * The index of the summary part within the reasoning summary.
+ */
+ summary_index: number;
+
+ /**
+ * The type of the event. Always `response.reasoning_summary_text.delta`.
+ */
+ type: 'response.reasoning_summary_text.delta';
+}
+
+/**
+ * Emitted when a reasoning summary text is completed.
+ */
+export interface ResponseReasoningSummaryTextDoneEvent {
+ /**
+ * The ID of the item this summary text is associated with.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item this summary text is associated with.
+ */
+ output_index: number;
+
+ /**
+ * The index of the summary part within the reasoning summary.
+ */
+ summary_index: number;
+
+ /**
+ * The full text of the completed reasoning summary.
+ */
+ text: string;
+
+ /**
+ * The type of the event. Always `response.reasoning_summary_text.done`.
+ */
+ type: 'response.reasoning_summary_text.done';
+}
+
/**
* Emitted when there is a partial refusal text.
*/
@@ -2252,6 +2406,10 @@ export type ResponseStreamEvent =
| ResponseIncompleteEvent
| ResponseOutputItemAddedEvent
| ResponseOutputItemDoneEvent
+ | ResponseReasoningSummaryPartAddedEvent
+ | ResponseReasoningSummaryPartDoneEvent
+ | ResponseReasoningSummaryTextDeltaEvent
+ | ResponseReasoningSummaryTextDoneEvent
| ResponseRefusalDeltaEvent
| ResponseRefusalDoneEvent
| ResponseTextAnnotationDeltaEvent
@@ -2967,6 +3125,10 @@ export declare namespace Responses {
type ResponseOutputRefusal as ResponseOutputRefusal,
type ResponseOutputText as ResponseOutputText,
type ResponseReasoningItem as ResponseReasoningItem,
+ type ResponseReasoningSummaryPartAddedEvent as ResponseReasoningSummaryPartAddedEvent,
+ type ResponseReasoningSummaryPartDoneEvent as ResponseReasoningSummaryPartDoneEvent,
+ type ResponseReasoningSummaryTextDeltaEvent as ResponseReasoningSummaryTextDeltaEvent,
+ type ResponseReasoningSummaryTextDoneEvent as ResponseReasoningSummaryTextDoneEvent,
type ResponseRefusalDeltaEvent as ResponseRefusalDeltaEvent,
type ResponseRefusalDoneEvent as ResponseRefusalDoneEvent,
type ResponseStatus as ResponseStatus,
diff --git a/src/version.ts b/src/version.ts
index cd1995322..1215a5e79 100644
--- a/src/version.ts
+++ b/src/version.ts
@@ -1 +1 @@
-export const VERSION = '4.95.1'; // x-release-please-version
+export const VERSION = '4.96.0'; // x-release-please-version
diff --git a/tests/api-resources/evals/evals.test.ts b/tests/api-resources/evals/evals.test.ts
index fabc2602a..45d1c4f9b 100644
--- a/tests/api-resources/evals/evals.test.ts
+++ b/tests/api-resources/evals/evals.test.ts
@@ -47,7 +47,6 @@ describe('resource evals', () => {
],
metadata: { foo: 'string' },
name: 'name',
- share_with_openai: true,
});
});
diff --git a/tests/api-resources/fine-tuning/checkpoints/permissions.test.ts b/tests/api-resources/fine-tuning/checkpoints/permissions.test.ts
index e7aceae3e..1e4b40a94 100644
--- a/tests/api-resources/fine-tuning/checkpoints/permissions.test.ts
+++ b/tests/api-resources/fine-tuning/checkpoints/permissions.test.ts
@@ -61,10 +61,10 @@ describe('resource permissions', () => {
).rejects.toThrow(OpenAI.NotFoundError);
});
- // OpenAPI spec is slightly incorrect
- test.skip('del', async () => {
+ test('del', async () => {
const responsePromise = client.fineTuning.checkpoints.permissions.del(
'ft:gpt-4o-mini-2024-07-18:org:weather:B7R9VjQd',
+ 'cp_zc4Q7MP6XxulcVzj4MZdwsAB',
);
const rawResponse = await responsePromise.asResponse();
expect(rawResponse).toBeInstanceOf(Response);
@@ -75,13 +75,14 @@ describe('resource permissions', () => {
expect(dataAndResponse.response).toBe(rawResponse);
});
- // OpenAPI spec is slightly incorrect
- test.skip('del: request options instead of params are passed correctly', async () => {
+ test('del: request options instead of params are passed correctly', async () => {
// ensure the request options are being passed correctly by passing an invalid HTTP method in order to cause an error
await expect(
- client.fineTuning.checkpoints.permissions.del('ft:gpt-4o-mini-2024-07-18:org:weather:B7R9VjQd', {
- path: '/_stainless_unknown_path',
- }),
+ client.fineTuning.checkpoints.permissions.del(
+ 'ft:gpt-4o-mini-2024-07-18:org:weather:B7R9VjQd',
+ 'cp_zc4Q7MP6XxulcVzj4MZdwsAB',
+ { path: '/_stainless_unknown_path' },
+ ),
).rejects.toThrow(OpenAI.NotFoundError);
});
});
diff --git a/tests/api-resources/images.test.ts b/tests/api-resources/images.test.ts
index 4f15e20ac..e9b460254 100644
--- a/tests/api-resources/images.test.ts
+++ b/tests/api-resources/images.test.ts
@@ -54,6 +54,7 @@ describe('resource images', () => {
mask: await toFile(Buffer.from('# my file contents'), 'README.md'),
model: 'string',
n: 1,
+ quality: 'high',
response_format: 'url',
size: '1024x1024',
user: 'user-1234',
@@ -74,9 +75,13 @@ describe('resource images', () => {
test('generate: required and optional params', async () => {
const response = await client.images.generate({
prompt: 'A cute baby sea otter',
+ background: 'transparent',
model: 'string',
+ moderation: 'low',
n: 1,
- quality: 'standard',
+ output_compression: 100,
+ output_format: 'png',
+ quality: 'medium',
response_format: 'url',
size: '1024x1024',
style: 'vivid',