Commit 31f7372

feat(openai): OpenAI Responses: model-driven prompt caching and generic reasoning options refactor (#9259)
1 parent d139eff commit 31f7372

5 files changed: +101 -17 lines changed

packages/types/src/model.ts

Lines changed: 4 additions & 0 deletions
@@ -74,6 +74,10 @@ export const modelInfoSchema = z.object({
 	contextWindow: z.number(),
 	supportsImages: z.boolean().optional(),
 	supportsPromptCache: z.boolean(),
+	// Optional default prompt cache retention policy for providers that support it.
+	// When set to "24h", extended prompt caching will be requested; when omitted
+	// or set to "in_memory", the default in-memory cache is used.
+	promptCacheRetention: z.enum(["in_memory", "24h"]).optional(),
 	// Capability flag to indicate whether the model supports an output verbosity parameter
 	supportsVerbosity: z.boolean().optional(),
 	supportsReasoningBudget: z.boolean().optional(),
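
The new field is an optional zod enum, so only "in_memory" and "24h" are accepted. A minimal sketch of its behavior in isolation (assuming zod is installed; the full modelInfoSchema has additional required properties not shown here):

import { z } from "zod"

// Mirrors the schema addition above: two allowed values, or undefined.
const promptCacheRetention = z.enum(["in_memory", "24h"]).optional()

promptCacheRetention.parse("24h") // → "24h" (extended prompt caching requested)
promptCacheRetention.parse(undefined) // → undefined (default in-memory cache)
// promptCacheRetention.parse("7d") would throw a ZodError: invalid enum value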

packages/types/src/providers/openai.ts

Lines changed: 3 additions & 0 deletions
@@ -11,6 +11,7 @@ export const openAiNativeModels = {
 		contextWindow: 400000,
 		supportsImages: true,
 		supportsPromptCache: true,
+		promptCacheRetention: "24h",
 		supportsReasoningEffort: ["none", "low", "medium", "high"],
 		reasoningEffort: "medium",
 		inputPrice: 1.25,
@@ -29,6 +30,7 @@ export const openAiNativeModels = {
 		contextWindow: 400000,
 		supportsImages: true,
 		supportsPromptCache: true,
+		promptCacheRetention: "24h",
 		supportsReasoningEffort: ["low", "medium", "high"],
 		reasoningEffort: "medium",
 		inputPrice: 1.25,
@@ -43,6 +45,7 @@ export const openAiNativeModels = {
 		contextWindow: 400000,
 		supportsImages: true,
 		supportsPromptCache: true,
+		promptCacheRetention: "24h",
 		supportsReasoningEffort: ["low", "medium", "high"],
 		reasoningEffort: "medium",
 		inputPrice: 0.25,
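
Only the gpt-5.1 family opts into extended retention here; other entries keep the default in-memory cache. A hedged lookup sketch — the import path is illustrative, and only the fields shown in this diff are assumed to exist:

import { openAiNativeModels } from "@roo-code/types"

// gpt-5.1 supports prompt caching and opts into 24h retention, so both checks pass.
const gpt51 = openAiNativeModels["gpt-5.1"]
const wantsExtendedCache = gpt51.supportsPromptCache && gpt51.promptCacheRetention === "24h"
console.log(wantsExtendedCache) // true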

src/api/providers/__tests__/openai-native-usage.spec.ts

Lines changed: 45 additions & 0 deletions
@@ -344,6 +344,51 @@ describe("OpenAiNativeHandler - normalizeUsage", () => {
 	})
 })

+describe("OpenAiNativeHandler - prompt cache retention", () => {
+	let handler: OpenAiNativeHandler
+
+	beforeEach(() => {
+		handler = new OpenAiNativeHandler({
+			openAiNativeApiKey: "test-key",
+		})
+	})
+
+	const buildRequestBodyForModel = (modelId: string) => {
+		// Force the handler to use the requested model ID
+		;(handler as any).options.apiModelId = modelId
+		const model = handler.getModel()
+		// Minimal formatted input/systemPrompt/verbosity/metadata for building the body
+		return (handler as any).buildRequestBody(model, [], "", model.verbosity, undefined, undefined)
+	}
+
+	it("should set prompt_cache_retention=24h for gpt-5.1 models that support prompt caching", () => {
+		const body = buildRequestBodyForModel("gpt-5.1")
+		expect(body.prompt_cache_retention).toBe("24h")
+
+		const codexBody = buildRequestBodyForModel("gpt-5.1-codex")
+		expect(codexBody.prompt_cache_retention).toBe("24h")
+
+		const codexMiniBody = buildRequestBodyForModel("gpt-5.1-codex-mini")
+		expect(codexMiniBody.prompt_cache_retention).toBe("24h")
+	})
+
+	it("should not set prompt_cache_retention for non-gpt-5.1 models even if they support prompt caching", () => {
+		const body = buildRequestBodyForModel("gpt-5")
+		expect(body.prompt_cache_retention).toBeUndefined()
+
+		const fourOBody = buildRequestBodyForModel("gpt-4o")
+		expect(fourOBody.prompt_cache_retention).toBeUndefined()
+	})
+
+	it("should not set prompt_cache_retention when the model does not support prompt caching", () => {
+		const modelId = "codex-mini-latest"
+		expect(openAiNativeModels[modelId as keyof typeof openAiNativeModels].supportsPromptCache).toBe(false)
+
+		const body = buildRequestBodyForModel(modelId)
+		expect(body.prompt_cache_retention).toBeUndefined()
+	})
+})
+
 describe("cost calculation", () => {
 	it("should pass total input tokens to calculateApiCostOpenAI", () => {
 		const usage = {
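
In short, the tests above pin down the request-body behavior: gpt-5.1 models get prompt_cache_retention: "24h", while gpt-5, gpt-4o, and codex-mini-latest do not. A rough sketch of the body shape the first test expects for "gpt-5.1" (fields other than prompt_cache_retention are illustrative, not asserted by the tests):

// Illustrative only — not part of the commit.
const expectedGpt51BodyShape = {
	model: "gpt-5.1",
	input: [] as unknown[],
	stream: true,
	reasoning: { effort: "medium", summary: "auto" },
	prompt_cache_retention: "24h", // the only field these tests actually assert
}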

src/api/providers/openai-native.ts

Lines changed: 45 additions & 13 deletions
@@ -52,9 +52,10 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 	constructor(options: ApiHandlerOptions) {
 		super()
 		this.options = options
-		// Default to including reasoning.summary: "auto" for GPT‑5 unless explicitly disabled
-		if (this.options.enableGpt5ReasoningSummary === undefined) {
-			this.options.enableGpt5ReasoningSummary = true
+		// Default to including reasoning.summary: "auto" for models that support Responses API
+		// reasoning summaries unless explicitly disabled.
+		if (this.options.enableResponsesReasoningSummary === undefined) {
+			this.options.enableResponsesReasoningSummary = true
 		}
 		const apiKey = this.options.openAiNativeApiKey ?? "not-provided"
 		this.client = new OpenAI({ baseURL: this.options.openAiNativeBaseUrl, apiKey })
@@ -176,10 +177,10 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		reasoningEffort: ReasoningEffortExtended | undefined,
 		metadata?: ApiHandlerCreateMessageMetadata,
 	): any {
-		// Build a request body
-		// Ensure we explicitly pass max_output_tokens for GPT‑5 based on Roo's reserved model response calculation
+		// Build a request body for the OpenAI Responses API.
+		// Ensure we explicitly pass max_output_tokens based on Roo's reserved model response calculation
 		// so requests do not default to very large limits (e.g., 120k).
-		interface Gpt5RequestBody {
+		interface ResponsesRequestBody {
 			model: string
 			input: Array<{ role: "user" | "assistant"; content: any[] } | { type: string; content: string }>
 			stream: boolean
@@ -191,13 +192,18 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			instructions?: string
 			service_tier?: ServiceTier
 			include?: string[]
+			/** Prompt cache retention policy: "in_memory" (default) or "24h" for extended caching */
+			prompt_cache_retention?: "in_memory" | "24h"
 		}

 		// Validate requested tier against model support; if not supported, omit.
 		const requestedTier = (this.options.openAiNativeServiceTier as ServiceTier | undefined) || undefined
 		const allowedTierNames = new Set(model.info.tiers?.map((t) => t.name).filter(Boolean) || [])

-		const body: Gpt5RequestBody = {
+		// Decide whether to enable extended prompt cache retention for this request
+		const promptCacheRetention = this.getPromptCacheRetention(model)
+
+		const body: ResponsesRequestBody = {
 			model: model.id,
 			input: formattedInput,
 			stream: true,
@@ -213,7 +219,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 				? {
 						reasoning: {
 							...(reasoningEffort ? { effort: reasoningEffort } : {}),
-							...(this.options.enableGpt5ReasoningSummary ? { summary: "auto" as const } : {}),
+							...(this.options.enableResponsesReasoningSummary ? { summary: "auto" as const } : {}),
 						},
 					}
 				: {}),
@@ -229,6 +235,9 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			(requestedTier === "default" || allowedTierNames.has(requestedTier)) && {
 				service_tier: requestedTier,
 			}),
+			// Enable extended prompt cache retention for models that support it.
+			// This uses the OpenAI Responses API `prompt_cache_retention` parameter.
+			...(promptCacheRetention ? { prompt_cache_retention: promptCacheRetention } : {}),
 		}

 		// Include text.verbosity only when the model explicitly supports it
@@ -263,7 +272,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			}
 		} catch (sdkErr: any) {
 			// For errors, fallback to manual SSE via fetch
-			yield* this.makeGpt5ResponsesAPIRequest(requestBody, model, metadata, systemPrompt, messages)
+			yield* this.makeResponsesApiRequest(requestBody, model, metadata, systemPrompt, messages)
 		}
 	}

@@ -322,7 +331,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		return formattedMessages
 	}

-	private async *makeGpt5ResponsesAPIRequest(
+	private async *makeResponsesApiRequest(
 		requestBody: any,
 		model: OpenAiNativeModel,
 		metadata?: ApiHandlerCreateMessageMetadata,
@@ -347,7 +356,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		if (!response.ok) {
 			const errorText = await response.text()

-			let errorMessage = `GPT-5 API request failed (${response.status})`
+			let errorMessage = `OpenAI Responses API request failed (${response.status})`
 			let errorDetails = ""

 			// Try to parse error as JSON for better error messages
@@ -803,7 +812,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 				}
 			}

-			// Usage for done/completed is already handled by processGpt5Event in SDK path.
+			// Usage for done/completed is already handled by processEvent in the SDK path.
 			// For SSE path, usage often arrives separately; avoid double-emitting here.
 		}
 		// These are structural or status events, we can just log them at a lower level or ignore.
@@ -977,6 +986,23 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		return selected && selected !== "disable" ? (selected as any) : undefined
 	}

+	/**
+	 * Returns the appropriate prompt cache retention policy for the given model, if any.
+	 *
+	 * The policy is driven by ModelInfo.promptCacheRetention so that model-specific details
+	 * live in the shared types layer rather than this provider. When set to "24h" and the
+	 * model supports prompt caching, extended prompt cache retention is requested.
+	 */
+	private getPromptCacheRetention(model: OpenAiNativeModel): "24h" | undefined {
+		if (!model.info.supportsPromptCache) return undefined
+
+		if (model.info.promptCacheRetention === "24h") {
+			return "24h"
+		}
+
+		return undefined
+	}
+
 	/**
 	 * Returns a shallow-cloned ModelInfo with pricing overridden for the given tier, if available.
 	 * If no tier or no overrides exist, the original ModelInfo is returned.
@@ -1083,7 +1109,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		if (reasoningEffort) {
 			requestBody.reasoning = {
 				effort: reasoningEffort,
-				...(this.options.enableGpt5ReasoningSummary ? { summary: "auto" as const } : {}),
+				...(this.options.enableResponsesReasoningSummary ? { summary: "auto" as const } : {}),
 			}
 		}

@@ -1102,6 +1128,12 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			requestBody.text = { verbosity: (verbosity || "medium") as VerbosityLevel }
 		}

+		// Enable extended prompt cache retention for eligible models
+		const promptCacheRetention = this.getPromptCacheRetention(model)
+		if (promptCacheRetention) {
+			requestBody.prompt_cache_retention = promptCacheRetention
+		}
+
 		// Make the non-streaming request
 		const response = await (this.client as any).responses.create(requestBody)
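
For context, prompt_cache_retention rides along as a plain top-level field on the Responses API request. A hedged fetch sketch of the kind of request the SSE fallback path would send for a gpt-5.1 call — the endpoint, headers, and input payload are inferred from standard Responses API usage, not taken from this diff:

// Minimal illustration, assuming OPENAI_API_KEY is set in the environment.
async function sendResponsesRequest(): Promise<Response> {
	return fetch("https://api.openai.com/v1/responses", {
		method: "POST",
		headers: {
			"Content-Type": "application/json",
			Authorization: `Bearer ${process.env.OPENAI_API_KEY}`,
		},
		body: JSON.stringify({
			model: "gpt-5.1",
			input: [{ role: "user", content: [{ type: "input_text", text: "Hello" }] }],
			stream: true,
			reasoning: { effort: "medium", summary: "auto" },
			prompt_cache_retention: "24h", // extended retention, as built by buildRequestBody above
		}),
	})
}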

src/shared/api.ts

Lines changed: 4 additions & 4 deletions
@@ -13,11 +13,11 @@ import {
 // Extend ProviderSettings (minus apiProvider) with handler-specific toggles.
 export type ApiHandlerOptions = Omit<ProviderSettings, "apiProvider"> & {
 	/**
-	 * When true and using GPT‑5 Responses API, include reasoning.summary: "auto"
-	 * so the API returns reasoning summaries (we already parse and surface them).
-	 * Defaults to true; set to false to disable summaries.
+	 * When true and using OpenAI Responses API models that support reasoning summaries,
+	 * include reasoning.summary: "auto" so the API returns summaries (we already parse
+	 * and surface them). Defaults to true; set to false to disable summaries.
 	 */
-	enableGpt5ReasoningSummary?: boolean
+	enableResponsesReasoningSummary?: boolean
 	/**
 	 * Optional override for Ollama's num_ctx parameter.
 	 * When set, this value will be used in Ollama chat requests.
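
Callers that want to suppress reasoning summaries can do so through the renamed option. A hedged construction sketch — the import path is illustrative; the option name and its default come from this diff, and openAiNativeApiKey/apiModelId appear in the tests above:

import { OpenAiNativeHandler } from "../api/providers/openai-native"

const handler = new OpenAiNativeHandler({
	openAiNativeApiKey: process.env.OPENAI_API_KEY ?? "not-provided",
	apiModelId: "gpt-5.1",
	enableResponsesReasoningSummary: false, // omit reasoning.summary from Responses API requests
})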
