diff --git a/packages/types/src/message.ts b/packages/types/src/message.ts index 21baf3f203..7197ab29a1 100644 --- a/packages/types/src/message.ts +++ b/packages/types/src/message.ts @@ -176,6 +176,17 @@ export const clineMessageSchema = z.object({ contextCondense: contextCondenseSchema.optional(), isProtected: z.boolean().optional(), apiProtocol: z.union([z.literal("openai"), z.literal("anthropic")]).optional(), + metadata: z + .object({ + gpt5: z + .object({ + previous_response_id: z.string().optional(), + instructions: z.string().optional(), + reasoning_summary: z.string().optional(), + }) + .optional(), + }) + .optional(), }) export type ClineMessage = z.infer diff --git a/packages/types/src/model.ts b/packages/types/src/model.ts index a09790578b..90b61ad879 100644 --- a/packages/types/src/model.ts +++ b/packages/types/src/model.ts @@ -44,6 +44,8 @@ export const modelInfoSchema = z.object({ supportsImages: z.boolean().optional(), supportsComputerUse: z.boolean().optional(), supportsPromptCache: z.boolean(), + // Capability flag to indicate whether the model supports an output verbosity parameter + supportsVerbosity: z.boolean().optional(), supportsReasoningBudget: z.boolean().optional(), requiredReasoningBudget: z.boolean().optional(), supportsReasoningEffort: z.boolean().optional(), diff --git a/packages/types/src/provider-settings.ts b/packages/types/src/provider-settings.ts index f0c90101fc..aebfd4dbe5 100644 --- a/packages/types/src/provider-settings.ts +++ b/packages/types/src/provider-settings.ts @@ -3,6 +3,11 @@ import { z } from "zod" import { reasoningEffortsSchema, verbosityLevelsSchema, modelInfoSchema } from "./model.js" import { codebaseIndexProviderSchema } from "./codebase-index.js" +// Extended schema that includes "minimal" for GPT-5 models +export const extendedReasoningEffortsSchema = z.union([reasoningEffortsSchema, z.literal("minimal")]) + +export type ReasoningEffortWithMinimal = z.infer + /** * ProviderName */ @@ -76,7 +81,7 @@ const baseProviderSettingsSchema = z.object({ // Model reasoning. 
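The `extendedReasoningEffortsSchema` introduced above simply widens the existing effort schema with a `"minimal"` literal so GPT-5 settings validate through the same provider-settings pipeline. A minimal sketch of how it behaves (the exact members of `reasoningEffortsSchema` live in `model.js`; `"low" | "medium" | "high"` is assumed here):

```typescript
import { z } from "zod"

// Stand-in for the schema exported from model.js (its members are assumed here).
const reasoningEffortsSchema = z.enum(["low", "medium", "high"])

// Widened schema: everything the base schema accepts, plus "minimal" for GPT-5.
const extendedReasoningEffortsSchema = z.union([reasoningEffortsSchema, z.literal("minimal")])
type ReasoningEffortWithMinimal = z.infer<typeof extendedReasoningEffortsSchema>

const effort: ReasoningEffortWithMinimal = "minimal"
console.log(extendedReasoningEffortsSchema.safeParse(effort).success) // true
console.log(reasoningEffortsSchema.safeParse(effort).success) // false: the base schema rejects "minimal"
```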
enableReasoningEffort: z.boolean().optional(), - reasoningEffort: reasoningEffortsSchema.optional(), + reasoningEffort: extendedReasoningEffortsSchema.optional(), modelMaxTokens: z.number().optional(), modelMaxThinkingTokens: z.number().optional(), diff --git a/packages/types/src/providers/openai.ts b/packages/types/src/providers/openai.ts index b319be2a5f..02fadb412d 100644 --- a/packages/types/src/providers/openai.ts +++ b/packages/types/src/providers/openai.ts @@ -12,10 +12,13 @@ export const openAiNativeModels = { supportsImages: true, supportsPromptCache: true, supportsReasoningEffort: true, + reasoningEffort: "medium", inputPrice: 1.25, outputPrice: 10.0, cacheReadsPrice: 0.13, description: "GPT-5: The best model for coding and agentic tasks across domains", + // supportsVerbosity is a new capability; ensure ModelInfo includes it + supportsVerbosity: true, }, "gpt-5-mini-2025-08-07": { maxTokens: 128000, @@ -23,10 +26,12 @@ export const openAiNativeModels = { supportsImages: true, supportsPromptCache: true, supportsReasoningEffort: true, + reasoningEffort: "medium", inputPrice: 0.25, outputPrice: 2.0, cacheReadsPrice: 0.03, description: "GPT-5 Mini: A faster, more cost-efficient version of GPT-5 for well-defined tasks", + supportsVerbosity: true, }, "gpt-5-nano-2025-08-07": { maxTokens: 128000, @@ -34,10 +39,12 @@ export const openAiNativeModels = { supportsImages: true, supportsPromptCache: true, supportsReasoningEffort: true, + reasoningEffort: "medium", inputPrice: 0.05, outputPrice: 0.4, cacheReadsPrice: 0.01, description: "GPT-5 Nano: Fastest, most cost-efficient version of GPT-5", + supportsVerbosity: true, }, "gpt-4.1": { maxTokens: 32_768, @@ -229,5 +236,6 @@ export const openAiModelInfoSaneDefaults: ModelInfo = { export const azureOpenAiDefaultApiVersion = "2024-08-01-preview" export const OPENAI_NATIVE_DEFAULT_TEMPERATURE = 0 +export const GPT5_DEFAULT_TEMPERATURE = 1.0 export const OPENAI_AZURE_AI_INFERENCE_PATH = "/models/chat/completions" diff --git a/src/api/index.ts b/src/api/index.ts index 57b06f7bbd..5e705a80d2 100644 --- a/src/api/index.ts +++ b/src/api/index.ts @@ -44,6 +44,13 @@ export interface SingleCompletionHandler { export interface ApiHandlerCreateMessageMetadata { mode?: string taskId: string + previousResponseId?: string + /** + * When true, the provider must NOT fall back to internal continuity state + * (e.g., lastResponseId) if previousResponseId is absent. + * Used to enforce "skip once" after a condense operation. 
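As a rough illustration of how a caller is meant to drive these two fields (a sketch only; the import path and the surrounding orchestration are assumptions, not code from this PR):

```typescript
import type { ApiHandlerCreateMessageMetadata } from "../api" // illustrative path

// Normal turn: reuse the id captured from the prior GPT-5 response so the provider
// can send previous_response_id and only the newest user message.
const continueTurn: ApiHandlerCreateMessageMetadata = {
    taskId: "task-1",
    previousResponseId: "resp_abc123", // hypothetical id from an earlier response.done event
}

// First turn after condensing: supply no id AND forbid the fallback to the provider's
// internal lastResponseId, so the freshly condensed context is sent in full exactly once.
const afterCondense: ApiHandlerCreateMessageMetadata = {
    taskId: "task-1",
    suppressPreviousResponseId: true,
}

// Either object is passed as the third argument:
// handler.createMessage(systemPrompt, messages, continueTurn)
```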
+ */ + suppressPreviousResponseId?: boolean } export interface ApiHandler { diff --git a/src/api/providers/__tests__/openai-native.spec.ts b/src/api/providers/__tests__/openai-native.spec.ts index fdd71ba3f6..23f19e3d48 100644 --- a/src/api/providers/__tests__/openai-native.spec.ts +++ b/src/api/providers/__tests__/openai-native.spec.ts @@ -160,8 +160,12 @@ describe("OpenAiNativeHandler", () => { expect(results.length).toBe(1) expect(results[0].type).toBe("usage") // Use type assertion to avoid TypeScript errors - expect((results[0] as any).inputTokens).toBe(0) - expect((results[0] as any).outputTokens).toBe(0) + const usageResult = results[0] as any + expect(usageResult.inputTokens).toBe(0) + expect(usageResult.outputTokens).toBe(0) + // When no cache tokens are present, they should be undefined + expect(usageResult.cacheWriteTokens).toBeUndefined() + expect(usageResult.cacheReadTokens).toBeUndefined() // Verify developer role is used for system prompt with o1 model expect(mockCreate).toHaveBeenCalledWith({ @@ -286,6 +290,111 @@ describe("OpenAiNativeHandler", () => { expect((results[1] as any).outputTokens).toBe(5) expect((results[1] as any).totalCost).toBeCloseTo(0.00006, 6) }) + + it("should handle cache tokens in streaming response", async () => { + const mockStream = [ + { choices: [{ delta: { content: "Hello" } }], usage: null }, + { choices: [{ delta: { content: " cached" } }], usage: null }, + { + choices: [{ delta: { content: " response" } }], + usage: { + prompt_tokens: 100, + completion_tokens: 10, + prompt_tokens_details: { + cached_tokens: 80, + audio_tokens: 0, + }, + completion_tokens_details: { + reasoning_tokens: 0, + audio_tokens: 0, + accepted_prediction_tokens: 0, + rejected_prediction_tokens: 0, + }, + }, + }, + ] + + mockCreate.mockResolvedValueOnce( + (async function* () { + for (const chunk of mockStream) { + yield chunk + } + })(), + ) + + const generator = handler.createMessage(systemPrompt, messages) + const results = [] + for await (const result of generator) { + results.push(result) + } + + // Verify text responses + expect(results.length).toBe(4) + expect(results[0]).toMatchObject({ type: "text", text: "Hello" }) + expect(results[1]).toMatchObject({ type: "text", text: " cached" }) + expect(results[2]).toMatchObject({ type: "text", text: " response" }) + + // Check usage data includes cache tokens + expect(results[3].type).toBe("usage") + const usageChunk = results[3] as any + expect(usageChunk.inputTokens).toBe(100) // Total input tokens (includes cached) + expect(usageChunk.outputTokens).toBe(10) + expect(usageChunk.cacheReadTokens).toBe(80) // Cached tokens from prompt_tokens_details + expect(usageChunk.cacheWriteTokens).toBeUndefined() // No cache write tokens in standard response + + // Verify cost calculation takes cache into account + // GPT-4.1 pricing: input $2/1M, output $8/1M, cache read $0.5/1M + // OpenAI's prompt_tokens includes cached tokens, so we need to calculate: + // - Non-cached input tokens: 100 - 80 = 20 + // - Cost for non-cached input: (20 / 1_000_000) * 2.0 + // - Cost for cached input: (80 / 1_000_000) * 0.5 + // - Cost for output: (10 / 1_000_000) * 8.0 + const nonCachedInputTokens = 100 - 80 + const expectedNonCachedInputCost = (nonCachedInputTokens / 1_000_000) * 2.0 + const expectedCacheReadCost = (80 / 1_000_000) * 0.5 + const expectedOutputCost = (10 / 1_000_000) * 8.0 + const expectedTotalCost = expectedNonCachedInputCost + expectedCacheReadCost + expectedOutputCost + 
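Since the comment block above is the crux of the cache accounting (OpenAI's `prompt_tokens` already includes the cached tokens), here is the same arithmetic as a standalone sketch, using the GPT-4.1 per-1M-token prices assumed by this test:

```typescript
// GPT-4.1 prices used by this test, per 1M tokens: input $2.00, output $8.00, cache read $0.50.
function openAiCost(promptTokens: number, cachedTokens: number, completionTokens: number): number {
    const uncachedInput = promptTokens - cachedTokens // prompt_tokens already includes cached tokens
    return (
        (uncachedInput / 1_000_000) * 2.0 +
        (cachedTokens / 1_000_000) * 0.5 +
        (completionTokens / 1_000_000) * 8.0
    )
}

// The streamed usage above: 100 prompt tokens, 80 of them cached, 10 completion tokens.
console.log(openAiCost(100, 80, 10)) // ≈ 0.00016 (0.00004 + 0.00004 + 0.00008)
```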
expect(usageChunk.totalCost).toBeCloseTo(expectedTotalCost, 10) + }) + + it("should handle cache write tokens if present", async () => { + const mockStream = [ + { choices: [{ delta: { content: "Test" } }], usage: null }, + { + choices: [{ delta: {} }], + usage: { + prompt_tokens: 150, + completion_tokens: 5, + prompt_tokens_details: { + cached_tokens: 50, + }, + cache_creation_input_tokens: 30, // Cache write tokens + }, + }, + ] + + mockCreate.mockResolvedValueOnce( + (async function* () { + for (const chunk of mockStream) { + yield chunk + } + })(), + ) + + const generator = handler.createMessage(systemPrompt, messages) + const results = [] + for await (const result of generator) { + results.push(result) + } + + // Check usage data includes both cache read and write tokens + const usageChunk = results.find((r) => r.type === "usage") as any + expect(usageChunk).toBeDefined() + expect(usageChunk.inputTokens).toBe(150) + expect(usageChunk.outputTokens).toBe(5) + expect(usageChunk.cacheReadTokens).toBe(50) + expect(usageChunk.cacheWriteTokens).toBe(30) + }) }) describe("completePrompt", () => { @@ -461,7 +570,40 @@ describe("OpenAiNativeHandler", () => { }) describe("GPT-5 models", () => { - it("should handle GPT-5 model with developer role", async () => { + it("should handle GPT-5 model with Responses API", async () => { + // Mock fetch for Responses API + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + // Simulate actual GPT-5 Responses API SSE stream format + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.created","response":{"id":"test","status":"in_progress"}}\n\n', + ), + ) + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"Hello"}}\n\n', + ), + ) + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":" world"}}\n\n', + ), + ) + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.done","response":{"usage":{"prompt_tokens":10,"completion_tokens":2}}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + global.fetch = mockFetch as any + handler = new OpenAiNativeHandler({ ...mockOptions, apiModelId: "gpt-5-2025-08-07", @@ -473,20 +615,56 @@ describe("OpenAiNativeHandler", () => { chunks.push(chunk) } - // Verify developer role is used for GPT-5 with default parameters - expect(mockCreate).toHaveBeenCalledWith( + // Verify Responses API is called with correct parameters + expect(mockFetch).toHaveBeenCalledWith( + "https://api.openai.com/v1/responses", expect.objectContaining({ - model: "gpt-5-2025-08-07", - messages: [{ role: "developer", content: expect.stringContaining(systemPrompt) }], - stream: true, - stream_options: { include_usage: true }, - reasoning_effort: "minimal", // Default for GPT-5 - verbosity: "medium", // Default verbosity + method: "POST", + headers: expect.objectContaining({ + "Content-Type": "application/json", + Authorization: "Bearer test-api-key", + Accept: "text/event-stream", + }), + body: expect.any(String), }), ) + const body1 = (mockFetch.mock.calls[0][1] as any).body as string + expect(body1).toContain('"model":"gpt-5-2025-08-07"') + expect(body1).toContain('"input":"Developer: You are a helpful assistant.\\n\\nUser: Hello!"') + expect(body1).toContain('"effort":"medium"') + expect(body1).toContain('"summary":"auto"') + 
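Each of these tests hand-rolls the same `ReadableStream` boilerplate for the mocked SSE body. A small helper along these lines (a sketch, not part of the PR) would shrink the fixtures to just the event payloads:

```typescript
// Build a mock fetch Response whose body is an SSE stream of the given events,
// terminated by the [DONE] sentinel, matching the Responses API framing used above.
function mockSseResponse(events: object[]): { ok: true; body: ReadableStream<Uint8Array> } {
    const encoder = new TextEncoder()
    return {
        ok: true,
        body: new ReadableStream<Uint8Array>({
            start(controller) {
                for (const event of events) {
                    controller.enqueue(encoder.encode(`data: ${JSON.stringify(event)}\n\n`))
                }
                controller.enqueue(encoder.encode("data: [DONE]\n\n"))
                controller.close()
            },
        }),
    }
}

// Usage in a test:
// global.fetch = vitest.fn().mockResolvedValue(mockSseResponse([
//     { type: "response.output_item.added", item: { type: "text", text: "Hello" } },
// ])) as any
```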
expect(body1).toContain('"verbosity":"medium"') + expect(body1).toContain('"temperature":1') + expect(body1).toContain('"max_output_tokens"') + + // Verify the streamed content + const textChunks = chunks.filter((c) => c.type === "text") + expect(textChunks).toHaveLength(2) + expect(textChunks[0].text).toBe("Hello") + expect(textChunks[1].text).toBe(" world") + + // Clean up + delete (global as any).fetch }) - it("should handle GPT-5-mini model", async () => { + it("should handle GPT-5-mini model with Responses API", async () => { + // Mock fetch for Responses API + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"Response"}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + global.fetch = mockFetch as any + handler = new OpenAiNativeHandler({ ...mockOptions, apiModelId: "gpt-5-mini-2025-08-07", @@ -498,19 +676,36 @@ describe("OpenAiNativeHandler", () => { chunks.push(chunk) } - expect(mockCreate).toHaveBeenCalledWith( + // Verify correct model and default parameters + expect(mockFetch).toHaveBeenCalledWith( + "https://api.openai.com/v1/responses", expect.objectContaining({ - model: "gpt-5-mini-2025-08-07", - messages: [{ role: "developer", content: expect.stringContaining(systemPrompt) }], - stream: true, - stream_options: { include_usage: true }, - reasoning_effort: "minimal", // Default for GPT-5 - verbosity: "medium", // Default verbosity + body: expect.stringContaining('"model":"gpt-5-mini-2025-08-07"'), }), ) + + // Clean up + delete (global as any).fetch }) - it("should handle GPT-5-nano model", async () => { + it("should handle GPT-5-nano model with Responses API", async () => { + // Mock fetch for Responses API + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"Nano response"}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + global.fetch = mockFetch as any + handler = new OpenAiNativeHandler({ ...mockOptions, apiModelId: "gpt-5-nano-2025-08-07", @@ -522,19 +717,36 @@ describe("OpenAiNativeHandler", () => { chunks.push(chunk) } - expect(mockCreate).toHaveBeenCalledWith( + // Verify correct model + expect(mockFetch).toHaveBeenCalledWith( + "https://api.openai.com/v1/responses", expect.objectContaining({ - model: "gpt-5-nano-2025-08-07", - messages: [{ role: "developer", content: expect.stringContaining(systemPrompt) }], - stream: true, - stream_options: { include_usage: true }, - reasoning_effort: "minimal", // Default for GPT-5 - verbosity: "medium", // Default verbosity + body: expect.stringContaining('"model":"gpt-5-nano-2025-08-07"'), }), ) + + // Clean up + delete (global as any).fetch }) it("should support verbosity control for GPT-5", async () => { + // Mock fetch for Responses API + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"Low verbosity"}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + 
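The substring assertions on `body1` above are sensitive to key ordering; later tests in this file parse the captured body instead, which a tiny helper makes reusable (a sketch, assuming the surrounding vitest mock):

```typescript
// Pull the parsed JSON request body out of the n-th call captured by the fetch mock.
function capturedBody(fetchMock: { mock: { calls: any[][] } }, callIndex = 0): any {
    const [, init] = fetchMock.mock.calls[callIndex]
    return JSON.parse(init.body)
}

// e.g. with the nested Responses API shape built by the handler:
// expect(capturedBody(mockFetch).text?.verbosity).toBe("low")
// expect(capturedBody(mockFetch).reasoning?.effort).toBe("medium")
```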
global.fetch = mockFetch as any + handler = new OpenAiNativeHandler({ ...mockOptions, apiModelId: "gpt-5-2025-08-07", @@ -549,18 +761,77 @@ describe("OpenAiNativeHandler", () => { } // Verify that verbosity is passed in the request - expect(mockCreate).toHaveBeenCalledWith( + expect(mockFetch).toHaveBeenCalledWith( + "https://api.openai.com/v1/responses", expect.objectContaining({ - model: "gpt-5-2025-08-07", - messages: expect.any(Array), - stream: true, - stream_options: { include_usage: true }, - verbosity: "low", + body: expect.stringContaining('"verbosity":"low"'), }), ) + + // Clean up + delete (global as any).fetch }) it("should support minimal reasoning effort for GPT-5", async () => { + // Mock fetch for Responses API + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"Minimal effort"}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + global.fetch = mockFetch as any + + handler = new OpenAiNativeHandler({ + ...mockOptions, + apiModelId: "gpt-5-2025-08-07", + reasoningEffort: "minimal" as any, // GPT-5 supports minimal + }) + + const stream = handler.createMessage(systemPrompt, messages) + const chunks: any[] = [] + for await (const chunk of stream) { + chunks.push(chunk) + } + + // With minimal reasoning effort, the model should pass it through + expect(mockFetch).toHaveBeenCalledWith( + "https://api.openai.com/v1/responses", + expect.objectContaining({ + body: expect.stringContaining('"effort":"minimal"'), + }), + ) + + // Clean up + delete (global as any).fetch + }) + + it("should support low reasoning effort for GPT-5", async () => { + // Mock fetch for Responses API + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"Low effort response"}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + global.fetch = mockFetch as any + handler = new OpenAiNativeHandler({ ...mockOptions, apiModelId: "gpt-5-2025-08-07", @@ -573,25 +844,48 @@ describe("OpenAiNativeHandler", () => { chunks.push(chunk) } - // With low reasoning effort, the model should pass it through - expect(mockCreate).toHaveBeenCalledWith( + // Should use Responses API with low reasoning effort + expect(mockFetch).toHaveBeenCalledWith( + "https://api.openai.com/v1/responses", expect.objectContaining({ - model: "gpt-5-2025-08-07", - messages: expect.any(Array), - stream: true, - stream_options: { include_usage: true }, - reasoning_effort: "low", - verbosity: "medium", // Default verbosity + body: expect.any(String), }), ) + const body2 = (mockFetch.mock.calls[0][1] as any).body as string + expect(body2).toContain('"model":"gpt-5-2025-08-07"') + expect(body2).toContain('"effort":"low"') + expect(body2).toContain('"summary":"auto"') + expect(body2).toContain('"verbosity":"medium"') + expect(body2).toContain('"temperature":1') + expect(body2).toContain('"max_output_tokens"') + + // Clean up + delete (global as any).fetch }) it("should support both verbosity and reasoning effort together for GPT-5", async () => { + // Mock fetch for Responses API + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: 
new ReadableStream({ + start(controller) { + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"High verbosity minimal effort"}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + global.fetch = mockFetch as any + handler = new OpenAiNativeHandler({ ...mockOptions, apiModelId: "gpt-5-2025-08-07", - verbosity: "high", // Set verbosity through options - reasoningEffort: "low", // Set reasoning effort + verbosity: "high", + reasoningEffort: "minimal" as any, }) const stream = handler.createMessage(systemPrompt, messages) @@ -600,17 +894,624 @@ describe("OpenAiNativeHandler", () => { chunks.push(chunk) } - // Verify both parameters are passed - expect(mockCreate).toHaveBeenCalledWith( + // Should use Responses API with both parameters + expect(mockFetch).toHaveBeenCalledWith( + "https://api.openai.com/v1/responses", expect.objectContaining({ - model: "gpt-5-2025-08-07", - messages: expect.any(Array), - stream: true, - stream_options: { include_usage: true }, - reasoning_effort: "low", - verbosity: "high", + body: expect.any(String), }), ) + const body3 = (mockFetch.mock.calls[0][1] as any).body as string + expect(body3).toContain('"model":"gpt-5-2025-08-07"') + expect(body3).toContain('"effort":"minimal"') + expect(body3).toContain('"summary":"auto"') + expect(body3).toContain('"verbosity":"high"') + expect(body3).toContain('"temperature":1') + expect(body3).toContain('"max_output_tokens"') + + // Clean up + delete (global as any).fetch + }) + + it("should handle actual GPT-5 Responses API format", async () => { + // Mock fetch with actual response format from GPT-5 + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + // Test actual GPT-5 response format + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.created","response":{"id":"test","status":"in_progress"}}\n\n', + ), + ) + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.in_progress","response":{"status":"in_progress"}}\n\n', + ), + ) + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"First text"}}\n\n', + ), + ) + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":" Second text"}}\n\n', + ), + ) + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"reasoning","text":"Some reasoning"}}\n\n', + ), + ) + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.done","response":{"usage":{"prompt_tokens":100,"completion_tokens":20}}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + global.fetch = mockFetch as any + + handler = new OpenAiNativeHandler({ + ...mockOptions, + apiModelId: "gpt-5-2025-08-07", + }) + + const stream = handler.createMessage(systemPrompt, messages) + const chunks: any[] = [] + for await (const chunk of stream) { + chunks.push(chunk) + } + + // Should handle the actual format correctly + const textChunks = chunks.filter((c) => c.type === "text") + const reasoningChunks = chunks.filter((c) => c.type === "reasoning") + + expect(textChunks).toHaveLength(2) + expect(textChunks[0].text).toBe("First text") + expect(textChunks[1].text).toBe(" Second text") + + 
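For orientation, the SSE payload shapes these fixtures exercise look roughly like the union below. This is a hand-written sketch inferred from the fixtures and the handler's event switch, not an official OpenAI SDK type:

```typescript
// Rough shape of the Responses API SSE events covered by these tests.
type Gpt5SseEvent =
    | { type: "response.created" | "response.in_progress"; response: { id?: string; status?: string } }
    | { type: "response.output_item.added"; item: { type: "text" | "reasoning"; text: string } }
    | { type: "response.text.delta" | "response.output_text.delta"; delta: string }
    | { type: "response.reasoning.delta" | "response.reasoning_text.delta"; delta: string }
    | { type: "response.refusal.delta"; delta: string }
    | { type: "response.done" | "response.completed"; response?: { id?: string; usage?: { prompt_tokens?: number; completion_tokens?: number } } }
```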
expect(reasoningChunks).toHaveLength(1) + expect(reasoningChunks[0].text).toBe("Some reasoning") + + // Should also have usage information with cost + const usageChunks = chunks.filter((c) => c.type === "usage") + expect(usageChunks).toHaveLength(1) + expect(usageChunks[0]).toMatchObject({ + type: "usage", + inputTokens: 100, + outputTokens: 20, + totalCost: expect.any(Number), + }) + + // Verify cost calculation (GPT-5 pricing: input $1.25/M, output $10/M) + const expectedInputCost = (100 / 1_000_000) * 1.25 + const expectedOutputCost = (20 / 1_000_000) * 10.0 + const expectedTotalCost = expectedInputCost + expectedOutputCost + expect(usageChunks[0].totalCost).toBeCloseTo(expectedTotalCost, 10) + + // Clean up + delete (global as any).fetch + }) + + it("should handle Responses API with no content gracefully", async () => { + // Mock fetch with empty response + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + controller.enqueue(new TextEncoder().encode('data: {"someField":"value"}\n\n')) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + global.fetch = mockFetch as any + + handler = new OpenAiNativeHandler({ + ...mockOptions, + apiModelId: "gpt-5-2025-08-07", + }) + + const stream = handler.createMessage(systemPrompt, messages) + const chunks: any[] = [] + + // Should not throw, just warn + for await (const chunk of stream) { + chunks.push(chunk) + } + + // Should have no content chunks when stream is empty + const contentChunks = chunks.filter((c) => c.type === "text" || c.type === "reasoning") + + expect(contentChunks).toHaveLength(0) + + // Clean up + delete (global as any).fetch }) + + it("should support previous_response_id for conversation continuity", async () => { + // Mock fetch for Responses API + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + // Include response ID in the response + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.created","response":{"id":"resp_123","status":"in_progress"}}\n\n', + ), + ) + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"Response with ID"}}\n\n', + ), + ) + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.done","response":{"id":"resp_123","usage":{"prompt_tokens":10,"completion_tokens":3}}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + global.fetch = mockFetch as any + + handler = new OpenAiNativeHandler({ + ...mockOptions, + apiModelId: "gpt-5-2025-08-07", + }) + + // First request - should not have previous_response_id + const stream1 = handler.createMessage(systemPrompt, messages) + const chunks1: any[] = [] + for await (const chunk of stream1) { + chunks1.push(chunk) + } + + // Verify first request doesn't include previous_response_id + let firstCallBody = JSON.parse(mockFetch.mock.calls[0][1].body) + expect(firstCallBody.previous_response_id).toBeUndefined() + + // Second request with metadata - should include previous_response_id + const stream2 = handler.createMessage(systemPrompt, messages, { + taskId: "test-task", + previousResponseId: "resp_456", + }) + const chunks2: any[] = [] + for await (const chunk of stream2) { + chunks2.push(chunk) + } + + // Verify second request includes the provided previous_response_id + let secondCallBody = 
JSON.parse(mockFetch.mock.calls[1][1].body) + expect(secondCallBody.previous_response_id).toBe("resp_456") + + // Clean up + delete (global as any).fetch + }) + + it("should handle unhandled stream events gracefully", async () => { + // Mock fetch for the fallback SSE path (which is what gets used when SDK fails) + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"Hello"}}\n\n', + ), + ) + // This event is not handled, so it should be ignored + controller.enqueue( + new TextEncoder().encode('data: {"type":"response.audio.delta","delta":"..."}\n\n'), + ) + controller.enqueue(new TextEncoder().encode('data: {"type":"response.done","response":{}}\n\n')) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + global.fetch = mockFetch as any + + // Also mock the SDK to throw an error so it falls back to fetch + const mockClient = { + responses: { + create: vitest.fn().mockRejectedValue(new Error("SDK not available")), + }, + } + + handler = new OpenAiNativeHandler({ + ...mockOptions, + apiModelId: "gpt-5-2025-08-07", + }) + + // Replace the client with our mock + ;(handler as any).client = mockClient + + const stream = handler.createMessage(systemPrompt, messages) + const chunks: any[] = [] + const errors: any[] = [] + + try { + for await (const chunk of stream) { + chunks.push(chunk) + } + } catch (error) { + errors.push(error) + } + + // Log for debugging + if (chunks.length === 0 && errors.length === 0) { + console.log("No chunks and no errors received") + } + if (errors.length > 0) { + console.log("Errors:", errors) + } + + expect(errors.length).toBe(0) + const textChunks = chunks.filter((c) => c.type === "text") + expect(textChunks.length).toBeGreaterThan(0) + expect(textChunks[0].text).toBe("Hello") + + delete (global as any).fetch + }) + + it("should use stored response ID when metadata doesn't provide one", async () => { + // Mock fetch for Responses API + const mockFetch = vitest + .fn() + .mockResolvedValueOnce({ + ok: true, + body: new ReadableStream({ + start(controller) { + // First response with ID + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.done","response":{"id":"resp_789","output":[{"type":"text","content":[{"type":"text","text":"First"}]}],"usage":{"prompt_tokens":10,"completion_tokens":1}}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + .mockResolvedValueOnce({ + ok: true, + body: new ReadableStream({ + start(controller) { + // Second response + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"Second"}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + global.fetch = mockFetch as any + + handler = new OpenAiNativeHandler({ + ...mockOptions, + apiModelId: "gpt-5-2025-08-07", + }) + + // First request - establishes response ID + const stream1 = handler.createMessage(systemPrompt, messages) + for await (const chunk of stream1) { + // consume stream + } + + // Second request without metadata - should use stored response ID + const stream2 = handler.createMessage(systemPrompt, messages, { taskId: "test-task" }) + for await (const chunk of stream2) { + // consume stream + } + + // Verify second 
request uses the stored response ID from first request + let secondCallBody = JSON.parse(mockFetch.mock.calls[1][1].body) + expect(secondCallBody.previous_response_id).toBe("resp_789") + + // Clean up + delete (global as any).fetch + }) + + it("should only send latest message when using previous_response_id", async () => { + // Mock fetch for Responses API + const mockFetch = vitest + .fn() + .mockResolvedValueOnce({ + ok: true, + body: new ReadableStream({ + start(controller) { + // First response with ID + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.done","response":{"id":"resp_001","output":[{"type":"text","content":[{"type":"text","text":"First"}]}],"usage":{"prompt_tokens":50,"completion_tokens":1}}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + .mockResolvedValueOnce({ + ok: true, + body: new ReadableStream({ + start(controller) { + // Second response + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"Second"}}\n\n', + ), + ) + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.done","response":{"id":"resp_002","usage":{"prompt_tokens":10,"completion_tokens":1}}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + global.fetch = mockFetch as any + + handler = new OpenAiNativeHandler({ + ...mockOptions, + apiModelId: "gpt-5-2025-08-07", + }) + + // First request with full conversation + const firstMessages: Anthropic.Messages.MessageParam[] = [ + { role: "user", content: "Hello" }, + { role: "assistant", content: "Hi there!" }, + { role: "user", content: "How are you?" }, + ] + + const stream1 = handler.createMessage(systemPrompt, firstMessages) + for await (const chunk of stream1) { + // consume stream + } + + // Verify first request sends full conversation + let firstCallBody = JSON.parse(mockFetch.mock.calls[0][1].body) + expect(firstCallBody.input).toContain("Hello") + expect(firstCallBody.input).toContain("Hi there!") + expect(firstCallBody.input).toContain("How are you?") + expect(firstCallBody.previous_response_id).toBeUndefined() + + // Second request with previous_response_id - should only send latest message + const secondMessages: Anthropic.Messages.MessageParam[] = [ + { role: "user", content: "Hello" }, + { role: "assistant", content: "Hi there!" }, + { role: "user", content: "How are you?" }, + { role: "assistant", content: "I'm doing well!" }, + { role: "user", content: "What's the weather?" 
}, // Latest message + ] + + const stream2 = handler.createMessage(systemPrompt, secondMessages, { + taskId: "test-task", + previousResponseId: "resp_001", + }) + for await (const chunk of stream2) { + // consume stream + } + + // Verify second request only sends the latest user message + let secondCallBody = JSON.parse(mockFetch.mock.calls[1][1].body) + expect(secondCallBody.input).toBe("User: What's the weather?") + expect(secondCallBody.input).not.toContain("Hello") + expect(secondCallBody.input).not.toContain("Hi there!") + expect(secondCallBody.input).not.toContain("How are you?") + expect(secondCallBody.previous_response_id).toBe("resp_001") + + // Clean up + delete (global as any).fetch + }) + + it("should correctly prepare GPT-5 input with conversation continuity", () => { + const gpt5Handler = new OpenAiNativeHandler({ + ...mockOptions, + apiModelId: "gpt-5-2025-08-07", + }) + + // @ts-expect-error - private method + const { formattedInput, previousResponseId } = gpt5Handler.prepareGpt5Input(systemPrompt, messages, { + taskId: "task1", + previousResponseId: "resp_123", + }) + + expect(previousResponseId).toBe("resp_123") + expect(formattedInput).toBe("User: Hello!") + }) + + it("should provide helpful error messages for different error codes", async () => { + const testCases = [ + { status: 400, expectedMessage: "Invalid request to GPT-5 API" }, + { status: 401, expectedMessage: "Authentication failed" }, + { status: 403, expectedMessage: "Access denied" }, + { status: 404, expectedMessage: "GPT-5 API endpoint not found" }, + { status: 429, expectedMessage: "Rate limit exceeded" }, + { status: 500, expectedMessage: "OpenAI service error" }, + ] + + for (const { status, expectedMessage } of testCases) { + // Mock fetch with error response + const mockFetch = vitest.fn().mockResolvedValue({ + ok: false, + status, + statusText: "Error", + text: async () => JSON.stringify({ error: { message: "Test error" } }), + }) + global.fetch = mockFetch as any + + handler = new OpenAiNativeHandler({ + ...mockOptions, + apiModelId: "gpt-5-2025-08-07", + }) + + const stream = handler.createMessage(systemPrompt, messages) + + await expect(async () => { + for await (const chunk of stream) { + // Should throw before yielding anything + } + }).rejects.toThrow(expectedMessage) + } + + // Clean up + delete (global as any).fetch + }) + }) +}) + +// Added tests for GPT-5 streaming event coverage per PR_review_gpt5_final.md + +describe("GPT-5 streaming event coverage (additional)", () => { + it("should handle reasoning delta events for GPT-5", async () => { + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.reasoning.delta","delta":"Thinking about the problem..."}\n\n', + ), + ) + controller.enqueue( + new TextEncoder().encode('data: {"type":"response.text.delta","delta":"The answer is..."}\n\n'), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + // @ts-ignore + global.fetch = mockFetch + + const handler = new OpenAiNativeHandler({ + apiModelId: "gpt-5-2025-08-07", + openAiNativeApiKey: "test-api-key", + }) + + const systemPrompt = "You are a helpful assistant." + const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello!" 
}] + const stream = handler.createMessage(systemPrompt, messages) + + const chunks: any[] = [] + for await (const chunk of stream) { + chunks.push(chunk) + } + + const reasoningChunks = chunks.filter((c) => c.type === "reasoning") + const textChunks = chunks.filter((c) => c.type === "text") + + expect(reasoningChunks).toHaveLength(1) + expect(reasoningChunks[0].text).toBe("Thinking about the problem...") + expect(textChunks).toHaveLength(1) + expect(textChunks[0].text).toBe("The answer is...") + + // @ts-ignore + delete global.fetch + }) + + it("should handle refusal delta events for GPT-5 and prefix output", async () => { + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.refusal.delta","delta":"I cannot comply with this request."}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + // @ts-ignore + global.fetch = mockFetch + + const handler = new OpenAiNativeHandler({ + apiModelId: "gpt-5-2025-08-07", + openAiNativeApiKey: "test-api-key", + }) + + const systemPrompt = "You are a helpful assistant." + const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Do something disallowed" }] + const stream = handler.createMessage(systemPrompt, messages) + + const chunks: any[] = [] + for await (const chunk of stream) { + chunks.push(chunk) + } + + const textChunks = chunks.filter((c) => c.type === "text") + expect(textChunks).toHaveLength(1) + expect(textChunks[0].text).toBe("[Refusal] I cannot comply with this request.") + + // @ts-ignore + delete global.fetch + }) + + it("should ignore malformed JSON lines in SSE stream", async () => { + const mockFetch = vitest.fn().mockResolvedValue({ + ok: true, + body: new ReadableStream({ + start(controller) { + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"Before"}}\n\n', + ), + ) + // Malformed JSON line + controller.enqueue( + new TextEncoder().encode('data: {"type":"response.text.delta","delta":"Bad"\n\n'), + ) + // Valid line after malformed + controller.enqueue( + new TextEncoder().encode( + 'data: {"type":"response.output_item.added","item":{"type":"text","text":"After"}}\n\n', + ), + ) + controller.enqueue(new TextEncoder().encode("data: [DONE]\n\n")) + controller.close() + }, + }), + }) + // @ts-ignore + global.fetch = mockFetch + + const handler = new OpenAiNativeHandler({ + apiModelId: "gpt-5-2025-08-07", + openAiNativeApiKey: "test-api-key", + }) + + const systemPrompt = "You are a helpful assistant." + const messages: Anthropic.Messages.MessageParam[] = [{ role: "user", content: "Hello!" 
}] + const stream = handler.createMessage(systemPrompt, messages) + + const chunks: any[] = [] + for await (const chunk of stream) { + chunks.push(chunk) + } + + // It should not throw and still capture the valid texts around the malformed line + const textChunks = chunks.filter((c) => c.type === "text") + expect(textChunks.map((c: any) => c.text)).toEqual(["Before", "After"]) + + // @ts-ignore + delete global.fetch }) }) diff --git a/src/api/providers/openai-native.ts b/src/api/providers/openai-native.ts index 5e498bee45..8df70d31f1 100644 --- a/src/api/providers/openai-native.ts +++ b/src/api/providers/openai-native.ts @@ -7,8 +7,10 @@ import { OpenAiNativeModelId, openAiNativeModels, OPENAI_NATIVE_DEFAULT_TEMPERATURE, + GPT5_DEFAULT_TEMPERATURE, type ReasoningEffort, type VerbosityLevel, + type ReasoningEffortWithMinimal, } from "@roo-code/types" import type { ApiHandlerOptions } from "../../shared/api" @@ -16,7 +18,7 @@ import type { ApiHandlerOptions } from "../../shared/api" import { calculateApiCostOpenAI } from "../../shared/cost" import { convertToOpenAiMessages } from "../transform/openai-format" -import { ApiStream } from "../transform/stream" +import { ApiStream, ApiStreamUsageChunk } from "../transform/stream" import { getModelParams } from "../transform/model-params" import { BaseProvider } from "./base-provider" @@ -24,43 +26,77 @@ import type { SingleCompletionHandler, ApiHandlerCreateMessageMetadata } from ". export type OpenAiNativeModel = ReturnType -// GPT-5 specific types for Responses API -type ReasoningEffortWithMinimal = ReasoningEffort | "minimal" - -interface GPT5ResponsesAPIParams { - model: string - input: string - reasoning?: { - effort: ReasoningEffortWithMinimal - } - text?: { - verbosity: VerbosityLevel - } -} - -interface GPT5ResponseChunk { - type: "text" | "reasoning" | "usage" - text?: string - reasoning?: string - usage?: { - input_tokens: number - output_tokens: number - reasoning_tokens?: number - total_tokens: number - } -} +// GPT-5 specific types export class OpenAiNativeHandler extends BaseProvider implements SingleCompletionHandler { protected options: ApiHandlerOptions private client: OpenAI + private lastResponseId: string | undefined + private responseIdPromise: Promise | undefined + private responseIdResolver: ((value: string | undefined) => void) | undefined + + // Event types handled by the shared GPT-5 event processor to avoid duplication + private readonly gpt5CoreHandledTypes = new Set([ + "response.text.delta", + "response.output_text.delta", + "response.reasoning.delta", + "response.reasoning_text.delta", + "response.reasoning_summary.delta", + "response.reasoning_summary_text.delta", + "response.refusal.delta", + "response.output_item.added", + "response.done", + "response.completed", + ]) constructor(options: ApiHandlerOptions) { super() this.options = options + // Default to including reasoning.summary: "auto" for GPT‑5 unless explicitly disabled + if (this.options.enableGpt5ReasoningSummary === undefined) { + this.options.enableGpt5ReasoningSummary = true + } const apiKey = this.options.openAiNativeApiKey ?? "not-provided" this.client = new OpenAI({ baseURL: this.options.openAiNativeBaseUrl, apiKey }) } + private normalizeGpt5Usage(usage: any, model: OpenAiNativeModel): ApiStreamUsageChunk | undefined { + if (!usage) return undefined + + const totalInputTokens = usage.input_tokens ?? usage.prompt_tokens ?? 0 + const totalOutputTokens = usage.output_tokens ?? usage.completion_tokens ?? 
0 + const cacheWriteTokens = usage.cache_creation_input_tokens ?? usage.cache_write_tokens ?? 0 + const cacheReadTokens = usage.cache_read_input_tokens ?? usage.cache_read_tokens ?? usage.cached_tokens ?? 0 + + const totalCost = calculateApiCostOpenAI( + model.info, + totalInputTokens, + totalOutputTokens, + cacheWriteTokens || 0, + cacheReadTokens || 0, + ) + + return { + type: "usage", + inputTokens: totalInputTokens, + outputTokens: totalOutputTokens, + cacheWriteTokens, + cacheReadTokens, + totalCost, + } + } + + private resolveResponseId(responseId: string | undefined): void { + if (responseId) { + this.lastResponseId = responseId + } + // Resolve the promise so the next request can use this ID + if (this.responseIdResolver) { + this.responseIdResolver(responseId) + this.responseIdResolver = undefined + } + } + override async *createMessage( systemPrompt: string, messages: Anthropic.Messages.MessageParam[], @@ -82,7 +118,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio } else if (model.id.startsWith("o1")) { yield* this.handleO1FamilyMessage(model, systemPrompt, messages) } else if (this.isGpt5Model(model.id)) { - yield* this.handleGpt5Message(model, systemPrompt, messages) + yield* this.handleGpt5Message(model, systemPrompt, messages, metadata) } else { yield* this.handleDefaultModelMessage(model, systemPrompt, messages) } @@ -157,8 +193,8 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio ...(reasoning && reasoning), } - // Add verbosity if supported (for future GPT-5 models) - if (verbosity && model.id.startsWith("gpt-5")) { + // Add verbosity if supported + if (verbosity) { params.verbosity = verbosity } @@ -180,175 +216,915 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio model: OpenAiNativeModel, systemPrompt: string, messages: Anthropic.Messages.MessageParam[], + metadata?: ApiHandlerCreateMessageMetadata, ): ApiStream { - // GPT-5 uses the Responses API, not Chat Completions - // We need to format the input as a single string combining system prompt and messages - const formattedInput = this.formatInputForResponsesAPI(systemPrompt, messages) + // Prefer the official SDK Responses API with streaming; fall back to fetch-based SSE if needed. 
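The continuity logic that follows is spread across a promise race and two fallbacks; condensed, the resolution order is the sketch below (a summary of the code underneath, not a separate implementation):

```typescript
// Precedence for the previous_response_id attached to a GPT-5 request.
async function resolveContinuityId(
    metadata: { previousResponseId?: string; suppressPreviousResponseId?: boolean } | undefined,
    pending: Promise<string | undefined> | undefined,
    lastResponseId: string | undefined,
): Promise<string | undefined> {
    if (metadata?.previousResponseId) return metadata.previousResponseId // explicit id always wins
    if (metadata?.suppressPreviousResponseId) return undefined // "skip once" after condense
    // Give an in-flight request ~100ms to publish its response id, then fall back.
    const timeout = new Promise<undefined>((resolve) => setTimeout(() => resolve(undefined), 100))
    const fromPending = pending ? await Promise.race([pending, timeout]) : undefined
    return fromPending ?? lastResponseId
}
```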
+ const { verbosity } = this.getModel() - // Get reasoning effort, supporting the new "minimal" option for GPT-5 + // Resolve reasoning effort (supports "minimal" for GPT‑5) const reasoningEffort = this.getGpt5ReasoningEffort(model) - // Get verbosity from model settings, default to "medium" if not specified - const verbosity = model.verbosity || "medium" + // Wait for any pending response ID from a previous request to be available + // This handles the race condition with fast nano model responses + let effectivePreviousResponseId = metadata?.previousResponseId + + // Only allow fallback to pending/last response id when not explicitly suppressed + if (!metadata?.suppressPreviousResponseId) { + // If we have a pending response ID promise, wait for it to resolve + if (!effectivePreviousResponseId && this.responseIdPromise) { + try { + const resolvedId = await Promise.race([ + this.responseIdPromise, + // Timeout after 100ms to avoid blocking too long + new Promise((resolve) => setTimeout(() => resolve(undefined), 100)), + ]) + if (resolvedId) { + effectivePreviousResponseId = resolvedId + } + } catch { + // Non-fatal if promise fails + } + } + + // Fall back to the last known response ID if still not available + if (!effectivePreviousResponseId) { + effectivePreviousResponseId = this.lastResponseId + } + } + + // Format input and capture continuity id + const { formattedInput, previousResponseId } = this.prepareGpt5Input(systemPrompt, messages, metadata) + const requestPreviousResponseId = effectivePreviousResponseId ?? previousResponseId + + // Create a new promise for this request's response ID + this.responseIdPromise = new Promise((resolve) => { + this.responseIdResolver = resolve + }) + + // Build a request body (also used for fallback) + // Ensure we explicitly pass max_output_tokens for GPT‑5 based on Roo's reserved model response calculation + // so requests do not default to very large limits (e.g., 120k). + interface Gpt5RequestBody { + model: string + input: string + stream: boolean + reasoning?: { effort: ReasoningEffortWithMinimal; summary?: "auto" } + text?: { verbosity: VerbosityLevel } + temperature?: number + max_output_tokens?: number + previous_response_id?: string + } - // Prepare the request parameters for Responses API - const params: GPT5ResponsesAPIParams = { + const requestBody: Gpt5RequestBody = { model: model.id, input: formattedInput, + stream: true, ...(reasoningEffort && { reasoning: { effort: reasoningEffort, + ...(this.options.enableGpt5ReasoningSummary ? { summary: "auto" as const } : {}), }, }), - text: { - verbosity: verbosity, - }, + text: { verbosity: (verbosity || "medium") as VerbosityLevel }, + temperature: this.options.modelTemperature ?? GPT5_DEFAULT_TEMPERATURE, + // Explicitly include the calculated max output tokens for GPT‑5. + // Use the per-request reserved output computed by Roo (params.maxTokens from getModelParams). + ...(model.maxTokens ? 
{ max_output_tokens: model.maxTokens } : {}), + ...(requestPreviousResponseId && { previous_response_id: requestPreviousResponseId }), } - // Since the OpenAI SDK doesn't yet support the Responses API, - // we'll make a direct HTTP request - const response = await this.makeGpt5ResponsesAPIRequest(params, model) + try { + // Use the official SDK + const stream = (await (this.client as any).responses.create(requestBody)) as AsyncIterable - yield* this.handleGpt5StreamResponse(response, model) + if (typeof (stream as any)[Symbol.asyncIterator] !== "function") { + throw new Error( + "OpenAI SDK did not return an AsyncIterable for Responses API streaming. Falling back to SSE.", + ) + } + + for await (const event of stream) { + for await (const outChunk of this.processGpt5Event(event, model)) { + yield outChunk + } + } + } catch (sdkErr: any) { + // Check if this is a 400 error about previous_response_id not found + const errorMessage = sdkErr?.message || sdkErr?.error?.message || "" + const is400Error = sdkErr?.status === 400 || sdkErr?.response?.status === 400 + const isPreviousResponseError = + errorMessage.includes("Previous response") || errorMessage.includes("not found") + + if (is400Error && requestBody.previous_response_id && isPreviousResponseError) { + // Log the error and retry without the previous_response_id + console.warn( + `[GPT-5] Previous response ID not found (${requestBody.previous_response_id}), retrying without it`, + ) + + // Remove the problematic previous_response_id and retry + const retryRequestBody = { ...requestBody } + delete retryRequestBody.previous_response_id + + // Clear the stored lastResponseId to prevent using it again + this.lastResponseId = undefined + + try { + // Retry with the SDK + const retryStream = (await (this.client as any).responses.create( + retryRequestBody, + )) as AsyncIterable + + if (typeof (retryStream as any)[Symbol.asyncIterator] !== "function") { + // If SDK fails, fall back to SSE + yield* this.makeGpt5ResponsesAPIRequest(retryRequestBody, model, metadata) + return + } + + for await (const event of retryStream) { + for await (const outChunk of this.processGpt5Event(event, model)) { + yield outChunk + } + } + return + } catch (retryErr) { + // If retry also fails, fall back to SSE + yield* this.makeGpt5ResponsesAPIRequest(retryRequestBody, model, metadata) + return + } + } + + // For other errors, fallback to manual SSE via fetch + yield* this.makeGpt5ResponsesAPIRequest(requestBody, model, metadata) + } } private formatInputForResponsesAPI(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): string { - // Format the conversation for the Responses API's single input field - let formattedInput = `System: ${systemPrompt}\n\n` + // Format the conversation for the Responses API input field + // Use Developer role format for GPT-5 (aligning with o1/o3 Developer role usage per GPT-5 Responses guidance) + // This ensures consistent instruction handling across reasoning models + let formattedInput = `Developer: ${systemPrompt}\n\n` for (const message of messages) { const role = message.role === "user" ? "User" : "Assistant" - const content = - typeof message.content === "string" - ? message.content - : message.content.map((c) => (c.type === "text" ? 
c.text : "[image]")).join(" ") - formattedInput += `${role}: ${content}\n\n` + + // Handle text content + if (typeof message.content === "string") { + formattedInput += `${role}: ${message.content}\n\n` + } else if (Array.isArray(message.content)) { + // Handle content blocks + const textContent = message.content + .filter((block) => block.type === "text") + .map((block) => (block as any).text) + .join("\n") + if (textContent) { + formattedInput += `${role}: ${textContent}\n\n` + } + } } return formattedInput.trim() } - private getGpt5ReasoningEffort(model: OpenAiNativeModel): ReasoningEffortWithMinimal | undefined { - const { reasoning } = model - - // Check if reasoning effort is configured - if (reasoning && "reasoning_effort" in reasoning) { - const effort = reasoning.reasoning_effort - // Support the new "minimal" effort level for GPT-5 - if (effort === "low" || effort === "medium" || effort === "high") { - return effort + private formatSingleMessageForResponsesAPI(message: Anthropic.Messages.MessageParam): string { + // Format a single message for the Responses API when using previous_response_id + const role = message.role === "user" ? "User" : "Assistant" + + // Handle text content + if (typeof message.content === "string") { + return `${role}: ${message.content}` + } else if (Array.isArray(message.content)) { + // Handle content blocks + const textContent = message.content + .filter((block) => block.type === "text") + .map((block) => (block as any).text) + .join("\n") + if (textContent) { + return `${role}: ${textContent}` } } - // Default to "minimal" for GPT-5 models when not specified - // This provides fastest time-to-first-token as per documentation - return "minimal" + return "" } - private async makeGpt5ResponsesAPIRequest( - params: GPT5ResponsesAPIParams, + private async *makeGpt5ResponsesAPIRequest( + requestBody: any, model: OpenAiNativeModel, - ): Promise> { - // The OpenAI SDK doesn't have direct support for the Responses API yet, - // but we can access it through the underlying client request method if available. - // For now, we'll use the Chat Completions API with GPT-5 specific formatting - // to maintain compatibility while the Responses API SDK support is being added. - - // Convert Responses API params to Chat Completions format - // GPT-5 models use "developer" role for system messages - const messages: OpenAI.Chat.ChatCompletionMessageParam[] = [{ role: "developer", content: params.input }] - - // Build the request parameters - const requestParams: any = { - model: params.model, - messages, - stream: true, - stream_options: { include_usage: true }, - } + metadata?: ApiHandlerCreateMessageMetadata, + ): ApiStream { + const apiKey = this.options.openAiNativeApiKey ?? 
"not-provided" + const baseUrl = this.options.openAiNativeBaseUrl || "https://api.openai.com" + const url = `${baseUrl}/v1/responses` - // Add reasoning effort if specified (supporting "minimal" for GPT-5) - if (params.reasoning?.effort) { - if (params.reasoning.effort === "minimal") { - // For minimal effort, we pass "minimal" as the reasoning_effort - requestParams.reasoning_effort = "minimal" - } else { - requestParams.reasoning_effort = params.reasoning.effort + try { + const response = await fetch(url, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${apiKey}`, + Accept: "text/event-stream", + }, + body: JSON.stringify(requestBody), + }) + + if (!response.ok) { + const errorText = await response.text() + + let errorMessage = `GPT-5 API request failed (${response.status})` + let errorDetails = "" + + // Try to parse error as JSON for better error messages + try { + const errorJson = JSON.parse(errorText) + if (errorJson.error?.message) { + errorDetails = errorJson.error.message + } else if (errorJson.message) { + errorDetails = errorJson.message + } else { + errorDetails = errorText + } + } catch { + // If not JSON, use the raw text + errorDetails = errorText + } + + // Check if this is a 400 error about previous_response_id not found + const isPreviousResponseError = + errorDetails.includes("Previous response") || errorDetails.includes("not found") + + if (response.status === 400 && requestBody.previous_response_id && isPreviousResponseError) { + // Log the error and retry without the previous_response_id + console.warn( + `[GPT-5 SSE] Previous response ID not found (${requestBody.previous_response_id}), retrying without it`, + ) + + // Remove the problematic previous_response_id and retry + const retryRequestBody = { ...requestBody } + delete retryRequestBody.previous_response_id + + // Clear the stored lastResponseId to prevent using it again + this.lastResponseId = undefined + // Resolve the promise once to unblock any waiting requests + this.resolveResponseId(undefined) + + // Retry the request without the previous_response_id + const retryResponse = await fetch(url, { + method: "POST", + headers: { + "Content-Type": "application/json", + Authorization: `Bearer ${apiKey}`, + Accept: "text/event-stream", + }, + body: JSON.stringify(retryRequestBody), + }) + + if (!retryResponse.ok) { + // If retry also fails, throw the original error + throw new Error(`GPT-5 API retry failed (${retryResponse.status})`) + } + + if (!retryResponse.body) { + throw new Error("GPT-5 Responses API error: No response body from retry request") + } + + // Handle the successful retry response + yield* this.handleGpt5StreamResponse(retryResponse.body, model) + return + } + + // Provide user-friendly error messages based on status code + switch (response.status) { + case 400: + errorMessage = "Invalid request to GPT-5 API. Please check your input parameters." + break + case 401: + errorMessage = "Authentication failed. Please check your OpenAI API key." + break + case 403: + errorMessage = "Access denied. Your API key may not have access to GPT-5 models." + break + case 404: + errorMessage = + "GPT-5 API endpoint not found. The model may not be available yet or requires a different configuration." + break + case 429: + errorMessage = "Rate limit exceeded. Please try again later." + break + case 500: + case 502: + case 503: + errorMessage = "OpenAI service error. Please try again later." 
+ break + default: + errorMessage = `GPT-5 API error (${response.status})` + } + + // Append details if available + if (errorDetails) { + errorMessage += ` - ${errorDetails}` + } + + throw new Error(errorMessage) + } + + if (!response.body) { + throw new Error("GPT-5 Responses API error: No response body") + } + + // Handle streaming response + yield* this.handleGpt5StreamResponse(response.body, model) + } catch (error) { + if (error instanceof Error) { + // Re-throw with the original error message if it's already formatted + if (error.message.includes("GPT-5")) { + throw error + } + // Otherwise, wrap it with context + throw new Error(`Failed to connect to GPT-5 API: ${error.message}`) } + // Handle non-Error objects + throw new Error(`Unexpected error connecting to GPT-5 API`) } + } - // Add verbosity control for GPT-5 models - // According to the docs, Chat Completions API also supports verbosity parameter - if (params.text?.verbosity) { - requestParams.verbosity = params.text.verbosity + /** + * Prepares the input and conversation continuity parameters for a GPT-5 API call. + * + * - If a `previousResponseId` is available (either from metadata or the handler's state), + * it formats only the most recent user message for the input and returns the response ID + * to maintain conversation context. + * - Otherwise, it formats the entire conversation history (system prompt + messages) for the input. + * + * @returns An object containing the formatted input string and the previous response ID (if used). + */ + private prepareGpt5Input( + systemPrompt: string, + messages: Anthropic.Messages.MessageParam[], + metadata?: ApiHandlerCreateMessageMetadata, + ): { formattedInput: string; previousResponseId?: string } { + // Respect explicit suppression signal for continuity (e.g. immediately after condense) + const isFirstMessage = messages.length === 1 && messages[0].role === "user" + const allowFallback = !metadata?.suppressPreviousResponseId + + const previousResponseId = + metadata?.previousResponseId ?? (allowFallback && !isFirstMessage ? this.lastResponseId : undefined) + + if (previousResponseId) { + const lastUserMessage = [...messages].reverse().find((msg) => msg.role === "user") + const formattedInput = lastUserMessage ? this.formatSingleMessageForResponsesAPI(lastUserMessage) : "" + return { formattedInput, previousResponseId } + } else { + const formattedInput = this.formatInputForResponsesAPI(systemPrompt, messages) + return { formattedInput } } + } - const stream = (await this.client.chat.completions.create( - requestParams, - )) as unknown as AsyncIterable + /** + * Handles the streaming response from the GPT-5 Responses API. + * + * This function iterates through the Server-Sent Events (SSE) stream, parses each event, + * and yields structured data chunks (`ApiStream`). It handles a wide variety of event types, + * including text deltas, reasoning, usage data, and various status/tool events. + * + * The following event types are intentionally ignored as they are not currently consumed + * by the client application: + * - Audio events (`response.audio.*`) + * - Most tool call events (e.g., `response.function_call_arguments.*`, `response.mcp_call.*`, etc.) + * as the client does not yet support rendering these tool interactions. + * - Status events (`response.created`, `response.in_progress`, etc.) as they are informational + * and do not affect the final output. 
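Stripped of the per-event handling, the SSE framing this method deals with reduces to the sketch below: lines prefixed with `data: `, blank-line separated, a `[DONE]` sentinel, and a buffer carried across reads because chunk boundaries can split lines.

```typescript
// Minimal SSE reader: yields one parsed JSON event per "data: ..." line, stops at [DONE].
async function* readSseEvents(body: ReadableStream<Uint8Array>): AsyncGenerator<any> {
    const reader = body.getReader()
    const decoder = new TextDecoder()
    let buffer = ""
    while (true) {
        const { done, value } = await reader.read()
        if (done) break
        buffer += decoder.decode(value, { stream: true })
        const lines = buffer.split("\n")
        buffer = lines.pop() || "" // keep any partial line for the next chunk
        for (const line of lines) {
            if (!line.startsWith("data: ")) continue
            const data = line.slice(6).trim()
            if (data === "[DONE]") return
            try {
                yield JSON.parse(data)
            } catch {
                // Malformed JSON lines are skipped, mirroring the handler below.
            }
        }
    }
}
```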
+ */ + private async *handleGpt5StreamResponse(body: ReadableStream, model: OpenAiNativeModel): ApiStream { + const reader = body.getReader() + const decoder = new TextDecoder() + let buffer = "" + let hasContent = false + let totalInputTokens = 0 + let totalOutputTokens = 0 - // Convert the stream to GPT-5 response format - return this.convertChatStreamToGpt5Format(stream) + try { + while (true) { + const { done, value } = await reader.read() + if (done) break + + buffer += decoder.decode(value, { stream: true }) + const lines = buffer.split("\n") + buffer = lines.pop() || "" + + for (const line of lines) { + if (line.startsWith("data: ")) { + const data = line.slice(6).trim() + if (data === "[DONE]") { + continue + } + + try { + const parsed = JSON.parse(data) + + // Store response ID for conversation continuity + if (parsed.response?.id) { + this.resolveResponseId(parsed.response.id) + } + + // Delegate standard event types to the shared processor to avoid duplication + if (parsed?.type && this.gpt5CoreHandledTypes.has(parsed.type)) { + for await (const outChunk of this.processGpt5Event(parsed, model)) { + // Track whether we've emitted any content so fallback handling can decide appropriately + if (outChunk.type === "text" || outChunk.type === "reasoning") { + hasContent = true + } + yield outChunk + } + continue + } + + // Check if this is a complete response (non-streaming format) + if (parsed.response && parsed.response.output && Array.isArray(parsed.response.output)) { + // Handle complete response in the initial event + for (const outputItem of parsed.response.output) { + if (outputItem.type === "text" && outputItem.content) { + for (const content of outputItem.content) { + if (content.type === "text" && content.text) { + hasContent = true + yield { + type: "text", + text: content.text, + } + } + } + } + // Additionally handle reasoning summaries if present (non-streaming summary output) + if (outputItem.type === "reasoning" && Array.isArray(outputItem.summary)) { + for (const summary of outputItem.summary) { + if (summary?.type === "summary_text" && typeof summary.text === "string") { + hasContent = true + yield { + type: "reasoning", + text: summary.text, + } + } + } + } + } + // Check for usage in the complete response + if (parsed.response.usage) { + const usageData = this.normalizeGpt5Usage(parsed.response.usage, model) + if (usageData) { + yield usageData + } + } + } + // Handle streaming delta events for text content + else if ( + parsed.type === "response.text.delta" || + parsed.type === "response.output_text.delta" + ) { + // Primary streaming event for text deltas + if (parsed.delta) { + hasContent = true + yield { + type: "text", + text: parsed.delta, + } + } + } else if ( + parsed.type === "response.text.done" || + parsed.type === "response.output_text.done" + ) { + // Text streaming completed - final text already streamed via deltas + } + // Handle reasoning delta events + else if ( + parsed.type === "response.reasoning.delta" || + parsed.type === "response.reasoning_text.delta" + ) { + // Streaming reasoning content + if (parsed.delta) { + hasContent = true + yield { + type: "reasoning", + text: parsed.delta, + } + } + } else if ( + parsed.type === "response.reasoning.done" || + parsed.type === "response.reasoning_text.done" + ) { + // Reasoning streaming completed + } + // Handle reasoning summary events + else if ( + parsed.type === "response.reasoning_summary.delta" || + parsed.type === "response.reasoning_summary_text.delta" + ) { + // Streaming reasoning summary 
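// A minimal, self-contained sketch of the SSE buffering technique used above: accumulate
// decoded chunks, split on newlines, keep the trailing partial line for the next read, and
// JSON-parse "data: " payloads. parseSseEvents and onEvent are illustrative names only.
async function parseSseEvents(body: ReadableStream<Uint8Array>, onEvent: (event: unknown) => void): Promise<void> {
	const reader = body.getReader()
	const decoder = new TextDecoder()
	let buffer = ""
	try {
		while (true) {
			const { done, value } = await reader.read()
			if (done) break
			buffer += decoder.decode(value, { stream: true })
			const lines = buffer.split("\n")
			buffer = lines.pop() || "" // keep the incomplete tail for the next chunk
			for (const line of lines) {
				if (!line.startsWith("data: ")) continue
				const data = line.slice(6).trim()
				if (data === "[DONE]") continue
				try {
					onEvent(JSON.parse(data))
				} catch {
					// Ignore malformed payloads, mirroring the lenient parsing in the handler above.
				}
			}
		}
	} finally {
		reader.releaseLock()
	}
}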
+ if (parsed.delta) { + hasContent = true + yield { + type: "reasoning", + text: parsed.delta, + } + } + } else if ( + parsed.type === "response.reasoning_summary.done" || + parsed.type === "response.reasoning_summary_text.done" + ) { + // Reasoning summary completed + } + // Handle refusal delta events + else if (parsed.type === "response.refusal.delta") { + // Model is refusing to answer + if (parsed.delta) { + hasContent = true + yield { + type: "text", + text: `[Refusal] ${parsed.delta}`, + } + } + } else if (parsed.type === "response.refusal.done") { + // Refusal completed + } + // Handle audio delta events (for multimodal responses) + else if (parsed.type === "response.audio.delta") { + // Audio streaming - we'll skip for now as we focus on text + // Could be handled in future for voice responses + } else if (parsed.type === "response.audio.done") { + // Audio completed + } + // Handle audio transcript delta events + else if (parsed.type === "response.audio_transcript.delta") { + // Audio transcript streaming + if (parsed.delta) { + hasContent = true + yield { + type: "text", + text: parsed.delta, + } + } + } else if (parsed.type === "response.audio_transcript.done") { + // Audio transcript completed + } + // Handle content part events (for structured content) + else if (parsed.type === "response.content_part.added") { + // New content part added - could be text, image, etc. + if (parsed.part?.type === "text" && parsed.part.text) { + hasContent = true + yield { + type: "text", + text: parsed.part.text, + } + } + } else if (parsed.type === "response.content_part.done") { + // Content part completed + } + // Handle output item events (alternative format) + else if (parsed.type === "response.output_item.added") { + // This is where the actual content comes through in some test cases + if (parsed.item) { + if (parsed.item.type === "text" && parsed.item.text) { + hasContent = true + yield { type: "text", text: parsed.item.text } + } else if (parsed.item.type === "reasoning" && parsed.item.text) { + hasContent = true + yield { type: "reasoning", text: parsed.item.text } + } else if (parsed.item.type === "message" && parsed.item.content) { + // Handle message type items + for (const content of parsed.item.content) { + if (content.type === "text" && content.text) { + hasContent = true + yield { type: "text", text: content.text } + } + } + } + } + } else if (parsed.type === "response.output_item.done") { + // Output item completed + } + // Handle function/tool call events + else if (parsed.type === "response.function_call_arguments.delta") { + // Function call arguments streaming + // We could yield this as a special type if needed for tool usage + } else if (parsed.type === "response.function_call_arguments.done") { + // Function call completed + } + // Handle MCP (Model Context Protocol) tool events + else if (parsed.type === "response.mcp_call_arguments.delta") { + // MCP tool call arguments streaming + } else if (parsed.type === "response.mcp_call_arguments.done") { + // MCP tool call completed + } else if (parsed.type === "response.mcp_call.in_progress") { + // MCP tool call in progress + } else if ( + parsed.type === "response.mcp_call.completed" || + parsed.type === "response.mcp_call.failed" + ) { + // MCP tool call status events + } else if (parsed.type === "response.mcp_list_tools.in_progress") { + // MCP list tools in progress + } else if ( + parsed.type === "response.mcp_list_tools.completed" || + parsed.type === "response.mcp_list_tools.failed" + ) { + // MCP list tools status 
events + } + // Handle web search events + else if (parsed.type === "response.web_search_call.searching") { + // Web search in progress + } else if (parsed.type === "response.web_search_call.in_progress") { + // Processing web search results + } else if (parsed.type === "response.web_search_call.completed") { + // Web search completed + } + // Handle code interpreter events + else if (parsed.type === "response.code_interpreter_call_code.delta") { + // Code interpreter code streaming + if (parsed.delta) { + // Could yield as a special code type if needed + } + } else if (parsed.type === "response.code_interpreter_call_code.done") { + // Code interpreter code completed + } else if (parsed.type === "response.code_interpreter_call.interpreting") { + // Code interpreter running + } else if (parsed.type === "response.code_interpreter_call.in_progress") { + // Code execution in progress + } else if (parsed.type === "response.code_interpreter_call.completed") { + // Code interpreter completed + } + // Handle file search events + else if (parsed.type === "response.file_search_call.searching") { + // File search in progress + } else if (parsed.type === "response.file_search_call.in_progress") { + // Processing file search results + } else if (parsed.type === "response.file_search_call.completed") { + // File search completed + } + // Handle image generation events + else if (parsed.type === "response.image_gen_call.generating") { + // Image generation in progress + } else if (parsed.type === "response.image_gen_call.in_progress") { + // Processing image generation + } else if (parsed.type === "response.image_gen_call.partial_image") { + // Image partially generated + } else if (parsed.type === "response.image_gen_call.completed") { + // Image generation completed + } + // Handle computer use events + else if ( + parsed.type === "response.computer_tool_call.output_item" || + parsed.type === "response.computer_tool_call.output_screenshot" + ) { + // Computer use tool events + } + // Handle annotation events + else if ( + parsed.type === "response.output_text_annotation.added" || + parsed.type === "response.text_annotation.added" + ) { + // Text annotation events - could be citations, references, etc. 
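// The branches above are deliberate no-ops for tool/status events the client does not render
// yet. One possible tightening (a sketch, not part of this patch): collapse them behind a
// prefix check so new status-only event families do not each need another branch.
const IGNORED_GPT5_EVENT_PREFIXES = [
	"response.mcp_call",
	"response.mcp_list_tools",
	"response.web_search_call",
	"response.code_interpreter_call",
	"response.file_search_call",
	"response.image_gen_call",
	"response.computer_tool_call",
] as const

function isIgnoredStatusEvent(eventType: string): boolean {
	return IGNORED_GPT5_EVENT_PREFIXES.some((prefix) => eventType.startsWith(prefix))
}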
+ } + // Handle error events + else if (parsed.type === "response.error" || parsed.type === "error") { + // Error event from the API + if (parsed.error || parsed.message) { + throw new Error( + `GPT-5 API error: ${parsed.error?.message || parsed.message || "Unknown error"}`, + ) + } + } + // Handle incomplete event + else if (parsed.type === "response.incomplete") { + // Response was incomplete - might need to handle specially + } + // Handle queued event + else if (parsed.type === "response.queued") { + // Response is queued + } + // Handle in_progress event + else if (parsed.type === "response.in_progress") { + // Response is being processed + } + // Handle failed event + else if (parsed.type === "response.failed") { + // Response failed + if (parsed.error || parsed.message) { + throw new Error( + `GPT-5 response failed: ${parsed.error?.message || parsed.message || "Unknown failure"}`, + ) + } + } else if (parsed.type === "response.completed" || parsed.type === "response.done") { + // Store response ID for conversation continuity + if (parsed.response?.id) { + this.resolveResponseId(parsed.response.id) + } + + // Check if the done event contains the complete output (as a fallback) + if ( + !hasContent && + parsed.response && + parsed.response.output && + Array.isArray(parsed.response.output) + ) { + for (const outputItem of parsed.response.output) { + if (outputItem.type === "message" && outputItem.content) { + for (const content of outputItem.content) { + if (content.type === "output_text" && content.text) { + hasContent = true + yield { + type: "text", + text: content.text, + } + } + } + } + // Also surface reasoning summaries if present in the final output + if (outputItem.type === "reasoning" && Array.isArray(outputItem.summary)) { + for (const summary of outputItem.summary) { + if ( + summary?.type === "summary_text" && + typeof summary.text === "string" + ) { + hasContent = true + yield { + type: "reasoning", + text: summary.text, + } + } + } + } + } + } + + // Usage for done/completed is already handled by processGpt5Event in SDK path. + // For SSE path, usage often arrives separately; avoid double-emitting here. + } + // These are structural or status events, we can just log them at a lower level or ignore. 
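// A hedged sketch of the fallback extraction above: when nothing was streamed as deltas,
// pull output text and reasoning summaries out of the completed response object. The
// Gpt5CompletedResponse shape is an assumption based only on the fields read here.
interface Gpt5CompletedResponse {
	output?: Array<{
		type: string
		content?: Array<{ type: string; text?: string }>
		summary?: Array<{ type: string; text?: string }>
	}>
}

function extractFinalChunks(response: Gpt5CompletedResponse): Array<{ type: "text" | "reasoning"; text: string }> {
	const chunks: Array<{ type: "text" | "reasoning"; text: string }> = []
	for (const item of response.output ?? []) {
		if (item.type === "message") {
			for (const content of item.content ?? []) {
				if (content.type === "output_text" && content.text) {
					chunks.push({ type: "text", text: content.text })
				}
			}
		} else if (item.type === "reasoning") {
			for (const summary of item.summary ?? []) {
				if (summary.type === "summary_text" && typeof summary.text === "string") {
					chunks.push({ type: "reasoning", text: summary.text })
				}
			}
		}
	}
	return chunks
}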
+ else if ( + parsed.type === "response.created" || + parsed.type === "response.in_progress" || + parsed.type === "response.output_item.done" || + parsed.type === "response.content_part.added" || + parsed.type === "response.content_part.done" + ) { + // Status events - no action needed + } + // Fallback for older formats or unexpected responses + else if (parsed.choices?.[0]?.delta?.content) { + hasContent = true + yield { + type: "text", + text: parsed.choices[0].delta.content, + } + } + // Additional fallback: some events place text under 'item.text' even if type isn't matched above + else if ( + parsed.item && + typeof parsed.item.text === "string" && + parsed.item.text.length > 0 + ) { + hasContent = true + yield { + type: "text", + text: parsed.item.text, + } + } else if (parsed.usage) { + // Handle usage if it arrives in a separate, non-completed event + const usageData = this.normalizeGpt5Usage(parsed.usage, model) + if (usageData) { + yield usageData + } + } + } catch (e) { + // Silently ignore parsing errors for non-critical SSE data + } + } + // Also try to parse non-SSE formatted lines + else if (line.trim() && !line.startsWith(":")) { + try { + const parsed = JSON.parse(line) + + // Try to extract content from various possible locations + if (parsed.content || parsed.text || parsed.message) { + hasContent = true + yield { + type: "text", + text: parsed.content || parsed.text || parsed.message, + } + } + } catch { + // Not JSON, might be plain text - ignore + } + } + } + } + + // If we didn't get any content, don't throw - the API might have returned an empty response + // This can happen in certain edge cases and shouldn't break the flow + } catch (error) { + if (error instanceof Error) { + throw new Error(`Error processing GPT-5 response stream: ${error.message}`) + } + throw new Error("Unexpected error processing GPT-5 response stream") + } finally { + reader.releaseLock() + } } - private async *convertChatStreamToGpt5Format( - stream: AsyncIterable, - ): AsyncIterable { - for await (const chunk of stream) { - const delta = chunk.choices[0]?.delta + /** + * Shared processor for GPT‑5 Responses API events. + * Used by both the official SDK streaming path and (optionally) by the SSE fallback. 
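 *
 * Illustrative delegation from the SSE path (this mirrors the gpt5CoreHandledTypes check in
 * handleGpt5StreamResponse above):
 *
 *     if (parsed?.type && this.gpt5CoreHandledTypes.has(parsed.type)) {
 *         for await (const chunk of this.processGpt5Event(parsed, model)) {
 *             yield chunk
 *         }
 *     }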
+ */ + private async *processGpt5Event(event: any, model: OpenAiNativeModel): ApiStream { + // Persist response id for conversation continuity when available + if (event?.response?.id) { + this.resolveResponseId(event.response.id) + } - if (delta?.content) { - yield { - type: "text", - text: delta.content, - } + // Handle known streaming text deltas + if (event?.type === "response.text.delta" || event?.type === "response.output_text.delta") { + if (event?.delta) { + yield { type: "text", text: event.delta } } + return + } - if (chunk.usage) { - yield { - type: "usage", - usage: { - input_tokens: chunk.usage.prompt_tokens || 0, - output_tokens: chunk.usage.completion_tokens || 0, - total_tokens: chunk.usage.total_tokens || 0, - }, + // Handle reasoning deltas (including summary variants) + if ( + event?.type === "response.reasoning.delta" || + event?.type === "response.reasoning_text.delta" || + event?.type === "response.reasoning_summary.delta" || + event?.type === "response.reasoning_summary_text.delta" + ) { + if (event?.delta) { + yield { type: "reasoning", text: event.delta } + } + return + } + + // Handle refusal deltas + if (event?.type === "response.refusal.delta") { + if (event?.delta) { + yield { type: "text", text: `[Refusal] ${event.delta}` } + } + return + } + + // Handle output item additions (SDK or Responses API alternative format) + if (event?.type === "response.output_item.added") { + const item = event?.item + if (item) { + if (item.type === "text" && item.text) { + yield { type: "text", text: item.text } + } else if (item.type === "reasoning" && item.text) { + yield { type: "reasoning", text: item.text } + } else if (item.type === "message" && Array.isArray(item.content)) { + for (const content of item.content) { + // Some implementations send 'text'; others send 'output_text' + if ((content?.type === "text" || content?.type === "output_text") && content?.text) { + yield { type: "text", text: content.text } + } + } } } + return + } + + // Completion events that may carry usage + if (event?.type === "response.done" || event?.type === "response.completed") { + const usage = event?.response?.usage || event?.usage || undefined + const usageData = this.normalizeGpt5Usage(usage, model) + if (usageData) { + yield usageData + } + return + } + + // Fallbacks for older formats or unexpected objects + if (event?.choices?.[0]?.delta?.content) { + yield { type: "text", text: event.choices[0].delta.content } + return + } + + if (event?.usage) { + const usageData = this.normalizeGpt5Usage(event.usage, model) + if (usageData) { + yield usageData + } } } - private async *handleGpt5StreamResponse( - stream: AsyncIterable, - model: OpenAiNativeModel, - ): ApiStream { - for await (const chunk of stream) { - if (chunk.type === "text" && chunk.text) { - yield { - type: "text", - text: chunk.text, - } - } else if (chunk.type === "usage" && chunk.usage) { - const inputTokens = chunk.usage.input_tokens - const outputTokens = chunk.usage.output_tokens - const cacheReadTokens = 0 - const cacheWriteTokens = 0 - const totalCost = calculateApiCostOpenAI( - model.info, - inputTokens, - outputTokens, - cacheWriteTokens, - cacheReadTokens, - ) + private getGpt5ReasoningEffort(model: OpenAiNativeModel): ReasoningEffortWithMinimal | undefined { + const { reasoning, info } = model - yield { - type: "usage", - inputTokens, - outputTokens, - cacheWriteTokens, - cacheReadTokens, - totalCost, - } + // Check if reasoning effort is configured + if (reasoning && "reasoning_effort" in reasoning) { + const effort = 
reasoning.reasoning_effort as string + // Support all effort levels including "minimal" for GPT-5 + if (effort === "minimal" || effort === "low" || effort === "medium" || effort === "high") { + return effort as ReasoningEffortWithMinimal } } + + // Centralize default: use the model's default from types if available; otherwise undefined + return info.reasoningEffort as ReasoningEffortWithMinimal | undefined } private isGpt5Model(modelId: string): boolean { @@ -376,16 +1152,28 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio } private async *yieldUsage(info: ModelInfo, usage: OpenAI.Completions.CompletionUsage | undefined): ApiStream { - const inputTokens = usage?.prompt_tokens || 0 // sum of cache hits and misses + const inputTokens = usage?.prompt_tokens || 0 const outputTokens = usage?.completion_tokens || 0 - const cacheReadTokens = usage?.prompt_tokens_details?.cached_tokens || 0 - const cacheWriteTokens = 0 - const totalCost = calculateApiCostOpenAI(info, inputTokens, outputTokens, cacheWriteTokens, cacheReadTokens) - const nonCachedInputTokens = Math.max(0, inputTokens - cacheReadTokens - cacheWriteTokens) + + // Extract cache tokens from prompt_tokens_details + // According to OpenAI API, cached_tokens represents tokens read from cache + const cacheReadTokens = usage?.prompt_tokens_details?.cached_tokens || undefined + + // Cache write tokens are not typically reported in the standard streaming response + // They would be in cache_creation_input_tokens if available + const cacheWriteTokens = (usage as any)?.cache_creation_input_tokens || undefined + + const totalCost = calculateApiCostOpenAI( + info, + inputTokens, + outputTokens, + cacheWriteTokens || 0, + cacheReadTokens || 0, + ) yield { type: "usage", - inputTokens: nonCachedInputTokens, + inputTokens: inputTokens, outputTokens: outputTokens, cacheWriteTokens: cacheWriteTokens, cacheReadTokens: cacheReadTokens, @@ -406,15 +1194,17 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio modelId: id, model: info, settings: this.options, - defaultTemperature: OPENAI_NATIVE_DEFAULT_TEMPERATURE, + defaultTemperature: this.isGpt5Model(id) ? GPT5_DEFAULT_TEMPERATURE : OPENAI_NATIVE_DEFAULT_TEMPERATURE, }) // For GPT-5 models, ensure we support minimal reasoning effort - if (this.isGpt5Model(id) && params.reasoning) { - // Allow "minimal" effort for GPT-5 models - const effort = this.options.reasoningEffort - if (effort === "low" || effort === "medium" || effort === "high") { - params.reasoning.reasoning_effort = effort + if (this.isGpt5Model(id)) { + const effort = + (this.options.reasoningEffort as ReasoningEffortWithMinimal | undefined) ?? + (info.reasoningEffort as ReasoningEffortWithMinimal | undefined) + + if (effort) { + ;(params.reasoning as any) = { reasoning_effort: effort } } } @@ -423,25 +1213,62 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio return { id: id.startsWith("o3-mini") ? "o3-mini" : id, info, ...params, verbosity: params.verbosity } } + /** + * Gets the last GPT-5 response ID captured from the Responses API stream. + * Used for maintaining conversation continuity across requests. + * @returns The response ID, or undefined if not available yet + */ + getLastResponseId(): string | undefined { + return this.lastResponseId + } + + /** + * Sets the last GPT-5 response ID for conversation continuity. + * Typically only used in tests or special flows. 
+ * @param responseId The GPT-5 response ID to store + */ + setResponseId(responseId: string): void { + this.lastResponseId = responseId + } + async completePrompt(prompt: string): Promise { try { const { id, temperature, reasoning, verbosity } = this.getModel() + const isGpt5 = this.isGpt5Model(id) - const params: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming & { - verbosity?: VerbosityLevel - } = { + if (isGpt5) { + // GPT-5 uses the Responses API, not Chat Completions. Avoid undefined behavior here. + throw new Error( + "completePrompt is not supported for GPT-5 models. Use createMessage (Responses API) instead.", + ) + } + + const params: any = { model: id, messages: [{ role: "user", content: prompt }], - temperature, - ...(reasoning && reasoning), } - // Add verbosity for GPT-5 models - if (this.isGpt5Model(id) && verbosity) { - params.verbosity = verbosity + // Add temperature if supported + if (temperature !== undefined) { + params.temperature = temperature + } + + // For GPT-5 models, add reasoning_effort and verbosity as top-level parameters + if (isGpt5) { + if (reasoning && "reasoning_effort" in reasoning) { + params.reasoning_effort = reasoning.reasoning_effort + } + if (verbosity) { + params.verbosity = verbosity + } + } else { + // For non-GPT-5 models, add reasoning as is + if (reasoning) { + Object.assign(params, reasoning) + } } - const response = await this.client.chat.completions.create(params as any) + const response = await this.client.chat.completions.create(params) return response.choices[0]?.message.content || "" } catch (error) { if (error instanceof Error) { diff --git a/src/api/providers/openai.ts b/src/api/providers/openai.ts index 85abcf1a69..eed719cf0f 100644 --- a/src/api/providers/openai.ts +++ b/src/api/providers/openai.ts @@ -305,7 +305,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl ], stream: true, ...(isGrokXAI ? 
{} : { stream_options: { include_usage: true } }), - reasoning_effort: modelInfo.reasoningEffort, + reasoning_effort: modelInfo.reasoningEffort as "low" | "medium" | "high" | undefined, temperature: undefined, } @@ -330,7 +330,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl }, ...convertToOpenAiMessages(messages), ], - reasoning_effort: modelInfo.reasoningEffort, + reasoning_effort: modelInfo.reasoningEffort as "low" | "medium" | "high" | undefined, temperature: undefined, } diff --git a/src/api/providers/requesty.ts b/src/api/providers/requesty.ts index 8af0b9aa42..d2e55fc8f0 100644 --- a/src/api/providers/requesty.ts +++ b/src/api/providers/requesty.ts @@ -116,7 +116,7 @@ export class RequestyHandler extends BaseProvider implements SingleCompletionHan model, max_tokens, temperature, - ...(reasoning_effort && { reasoning_effort }), + ...(reasoning_effort && reasoning_effort !== "minimal" && { reasoning_effort }), ...(thinking && { thinking }), stream: true, stream_options: { include_usage: true }, diff --git a/src/api/transform/model-params.ts b/src/api/transform/model-params.ts index cc30aa5605..933697c0a5 100644 --- a/src/api/transform/model-params.ts +++ b/src/api/transform/model-params.ts @@ -2,6 +2,7 @@ import { type ModelInfo, type ProviderSettings, type VerbosityLevel, + type ReasoningEffortWithMinimal, ANTHROPIC_DEFAULT_MAX_TOKENS, } from "@roo-code/types" @@ -38,7 +39,7 @@ type GetModelParamsOptions = { type BaseModelParams = { maxTokens: number | undefined temperature: number | undefined - reasoningEffort: "low" | "medium" | "high" | undefined + reasoningEffort: ReasoningEffortWithMinimal | undefined reasoningBudget: number | undefined verbosity: VerbosityLevel | undefined } @@ -128,7 +129,8 @@ export function getModelParams({ temperature = 1.0 } else if (shouldUseReasoningEffort({ model, settings })) { // "Traditional" reasoning models use the `reasoningEffort` parameter. - reasoningEffort = customReasoningEffort ?? model.reasoningEffort + const effort = customReasoningEffort ?? model.reasoningEffort + reasoningEffort = effort as ReasoningEffortWithMinimal } const params: BaseModelParams = { maxTokens, temperature, reasoningEffort, reasoningBudget, verbosity } diff --git a/src/api/transform/reasoning.ts b/src/api/transform/reasoning.ts index a173c59b19..46ef029ea3 100644 --- a/src/api/transform/reasoning.ts +++ b/src/api/transform/reasoning.ts @@ -2,7 +2,7 @@ import { BetaThinkingConfigParam } from "@anthropic-ai/sdk/resources/beta" import OpenAI from "openai" import type { GenerateContentConfig } from "@google/genai" -import type { ModelInfo, ProviderSettings } from "@roo-code/types" +import type { ModelInfo, ProviderSettings, ReasoningEffortWithMinimal } from "@roo-code/types" import { shouldUseReasoningBudget, shouldUseReasoningEffort } from "../../shared/api" @@ -23,7 +23,7 @@ export type GeminiReasoningParams = GenerateContentConfig["thinkingConfig"] export type GetModelReasoningOptions = { model: ModelInfo reasoningBudget: number | undefined - reasoningEffort: ReasoningEffort | undefined + reasoningEffort: ReasoningEffortWithMinimal | undefined settings: ProviderSettings } @@ -36,7 +36,9 @@ export const getOpenRouterReasoning = ({ shouldUseReasoningBudget({ model, settings }) ? { max_tokens: reasoningBudget } : shouldUseReasoningEffort({ model, settings }) - ? { effort: reasoningEffort } + ? reasoningEffort !== "minimal" + ? 
{ effort: reasoningEffort } + : undefined : undefined export const getAnthropicReasoning = ({ @@ -50,8 +52,19 @@ export const getOpenAiReasoning = ({ model, reasoningEffort, settings, -}: GetModelReasoningOptions): OpenAiReasoningParams | undefined => - shouldUseReasoningEffort({ model, settings }) ? { reasoning_effort: reasoningEffort } : undefined +}: GetModelReasoningOptions): OpenAiReasoningParams | undefined => { + if (!shouldUseReasoningEffort({ model, settings })) { + return undefined + } + + // If model has reasoning effort capability, return object even if effort is undefined + // This preserves the reasoning_effort field in the API call + if (reasoningEffort === "minimal") { + return undefined + } + + return { reasoning_effort: reasoningEffort } +} export const getGeminiReasoning = ({ model, diff --git a/src/core/task/Task.ts b/src/core/task/Task.ts index 3cb6abe7f7..1dd615f0eb 100644 --- a/src/core/task/Task.ts +++ b/src/core/task/Task.ts @@ -252,6 +252,8 @@ export class Task extends EventEmitter implements TaskLike { didCompleteReadingStream = false assistantMessageParser?: AssistantMessageParser isAssistantMessageParserEnabled = false + private lastUsedInstructions?: string + private skipPrevResponseIdOnce: boolean = false constructor({ provider, @@ -824,6 +826,7 @@ export class Task extends EventEmitter implements TaskLike { progressStatus?: ToolProgressStatus, options: { isNonInteractive?: boolean + metadata?: Record } = {}, contextCondense?: ContextCondense, ): Promise { @@ -861,6 +864,7 @@ export class Task extends EventEmitter implements TaskLike { images, partial, contextCondense, + metadata: options.metadata, }) } } else { @@ -876,6 +880,9 @@ export class Task extends EventEmitter implements TaskLike { lastMessage.images = images lastMessage.partial = false lastMessage.progressStatus = progressStatus + if (options.metadata) { + ;(lastMessage as any).metadata = options.metadata + } // Instead of streaming partialMessage events, we do a save // and post like normal to persist to disk. @@ -891,7 +898,15 @@ export class Task extends EventEmitter implements TaskLike { this.lastMessageTs = sayTs } - await this.addToClineMessages({ ts: sayTs, type: "say", say: type, text, images, contextCondense }) + await this.addToClineMessages({ + ts: sayTs, + type: "say", + say: type, + text, + images, + contextCondense, + metadata: options.metadata, + }) } } } else { @@ -1736,6 +1751,8 @@ export class Task extends EventEmitter implements TaskLike { presentAssistantMessage(this) } + await this.persistGpt5Metadata(reasoningMessage) + updateApiReqMsg() await this.saveClineMessages() await this.providerRef.deref()?.postStateToWebview() @@ -1954,6 +1971,7 @@ export class Task extends EventEmitter implements TaskLike { Task.lastGlobalApiRequestTime = Date.now() const systemPrompt = await this.getSystemPrompt() + this.lastUsedInstructions = systemPrompt const { contextTokens } = this.getTokenUsage() if (contextTokens) { @@ -1992,6 +2010,10 @@ export class Task extends EventEmitter implements TaskLike { if (truncateResult.error) { await this.say("condense_context_error", truncateResult.error) } else if (truncateResult.summary) { + // A condense operation occurred; for the next GPT‑5 API call we should NOT + // send previous_response_id so the request reflects the fresh condensed context. 
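// A minimal sketch of the continuity rule this flag implements: reuse the last persisted
// GPT-5 response id for the next request, but skip it exactly once after a condense so the
// condensed context is resent in full. PersistedMessage and resolvePreviousResponseId are
// simplified, illustrative stand-ins for the Task fields used below.
interface PersistedMessage {
	type: "say" | "ask"
	say?: string
	metadata?: { gpt5?: { previous_response_id?: string } }
}

function resolvePreviousResponseId(
	messages: PersistedMessage[],
	skipOnce: boolean,
): { previousResponseId?: string; suppressPreviousResponseId?: boolean } {
	if (skipOnce) {
		// Force the next request to carry the full (condensed) history instead of a response id.
		return { suppressPreviousResponseId: true }
	}
	for (let i = messages.length - 1; i >= 0; i--) {
		const id = messages[i].metadata?.gpt5?.previous_response_id
		if (messages[i].type === "say" && messages[i].say === "text" && id) {
			return { previousResponseId: id }
		}
	}
	return {}
}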
+ this.skipPrevResponseIdOnce = true + const { summary, cost, prevContextTokens, newContextTokens = 0 } = truncateResult const contextCondense: ContextCondense = { summary, cost, newContextTokens, prevContextTokens } await this.say( @@ -2008,7 +2030,7 @@ export class Task extends EventEmitter implements TaskLike { } const messagesSinceLastSummary = getMessagesSinceLastSummary(this.apiConversationHistory) - const cleanConversationHistory = maybeRemoveImageBlocks(messagesSinceLastSummary, this.api).map( + let cleanConversationHistory = maybeRemoveImageBlocks(messagesSinceLastSummary, this.api).map( ({ role, content }) => ({ role, content }), ) @@ -2024,9 +2046,41 @@ export class Task extends EventEmitter implements TaskLike { throw new Error("Auto-approval limit reached and user did not approve continuation") } + // Determine GPT‑5 previous_response_id from last persisted assistant turn (if available), + // unless a condense just occurred (skip once after condense). + let previousResponseId: string | undefined = undefined + try { + const modelId = this.api.getModel().id + if (modelId && modelId.startsWith("gpt-5") && !this.skipPrevResponseIdOnce) { + // Find the last assistant message that has a previous_response_id stored + const idx = findLastIndex( + this.clineMessages, + (m) => + m.type === "say" && + (m as any).say === "text" && + (m as any).metadata?.gpt5?.previous_response_id, + ) + if (idx !== -1) { + // Use the previous_response_id from the last assistant message for this request + previousResponseId = ((this.clineMessages[idx] as any).metadata.gpt5.previous_response_id || + undefined) as string | undefined + } + } + } catch { + // non-fatal + } + const metadata: ApiHandlerCreateMessageMetadata = { mode: mode, taskId: this.taskId, + ...(previousResponseId ? { previousResponseId } : {}), + // If a condense just occurred, explicitly suppress continuity fallback for the next call + ...(this.skipPrevResponseIdOnce ? { suppressPreviousResponseId: true } : {}), + } + + // Reset skip flag after applying (it only affects the immediate next call) + if (this.skipPrevResponseIdOnce) { + this.skipPrevResponseIdOnce = false } const stream = this.api.createMessage(systemPrompt, cleanConversationHistory, metadata) @@ -2172,6 +2226,35 @@ export class Task extends EventEmitter implements TaskLike { } } + /** + * Persist GPT-5 per-turn metadata (previous_response_id, instructions, reasoning_summary) + * onto the last complete assistant say("text") message. + */ + private async persistGpt5Metadata(reasoningMessage?: string): Promise { + try { + const modelId = this.api.getModel().id + if (!modelId || !modelId.startsWith("gpt-5")) return + + const lastResponseId: string | undefined = (this.api as any)?.getLastResponseId?.() + const idx = findLastIndex( + this.clineMessages, + (m) => m.type === "say" && (m as any).say === "text" && m.partial !== true, + ) + if (idx !== -1) { + const msg = this.clineMessages[idx] as any + msg.metadata = msg.metadata ?? {} + msg.metadata.gpt5 = { + ...(msg.metadata.gpt5 ?? {}), + previous_response_id: lastResponseId, + instructions: this.lastUsedInstructions, + reasoning_summary: (reasoningMessage ?? 
"").trim() || undefined, + } + } + } catch { + // Non-fatal error in metadata persistence + } + } + // Getters public get cwd() { diff --git a/src/shared/api.ts b/src/shared/api.ts index 014b903453..e9b57af3c1 100644 --- a/src/shared/api.ts +++ b/src/shared/api.ts @@ -6,8 +6,15 @@ import { } from "@roo-code/types" // ApiHandlerOptions - -export type ApiHandlerOptions = Omit +// Extend ProviderSettings (minus apiProvider) with handler-specific toggles. +export type ApiHandlerOptions = Omit & { + /** + * When true and using GPT‑5 Responses API, include reasoning.summary: "auto" + * so the API returns reasoning summaries (we already parse and surface them). + * Defaults to true; set to false to disable summaries. + */ + enableGpt5ReasoningSummary?: boolean +} // RouterName diff --git a/webview-ui/src/components/settings/ApiOptions.tsx b/webview-ui/src/components/settings/ApiOptions.tsx index 74ba885d25..70a58f03bf 100644 --- a/webview-ui/src/components/settings/ApiOptions.tsx +++ b/webview-ui/src/components/settings/ApiOptions.tsx @@ -576,6 +576,12 @@ const ApiOptions = ({ if (value !== "custom-arn" && selectedProvider === "bedrock") { setApiConfigurationField("awsCustomArn", "") } + + // Clear reasoning effort when switching models to allow the new model's default to take effect + // This is especially important for GPT-5 models which default to "medium" + if (selectedProvider === "openai-native") { + setApiConfigurationField("reasoningEffort", undefined) + } }}> @@ -617,11 +623,14 @@ const ApiOptions = ({ modelInfo={selectedModelInfo} /> - + {/* Gate Verbosity UI by capability flag */} + {selectedModelInfo?.supportsVerbosity && ( + + )} {!fromWelcomeView && ( diff --git a/webview-ui/src/components/settings/ThinkingBudget.tsx b/webview-ui/src/components/settings/ThinkingBudget.tsx index a49ec79efc..a3e2d428b4 100644 --- a/webview-ui/src/components/settings/ThinkingBudget.tsx +++ b/webview-ui/src/components/settings/ThinkingBudget.tsx @@ -1,7 +1,12 @@ import { useEffect } from "react" import { Checkbox } from "vscrui" -import { type ProviderSettings, type ModelInfo, type ReasoningEffort, reasoningEfforts } from "@roo-code/types" +import { + type ProviderSettings, + type ModelInfo, + type ReasoningEffortWithMinimal, + reasoningEfforts, +} from "@roo-code/types" import { DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS, @@ -27,10 +32,35 @@ export const ThinkingBudget = ({ apiConfiguration, setApiConfigurationField, mod const isGemini25Pro = selectedModelId && selectedModelId.includes("gemini-2.5-pro") const minThinkingTokens = isGemini25Pro ? GEMINI_25_PRO_MIN_THINKING_TOKENS : 1024 + // Check if this is a GPT-5 model to show "minimal" option + // Only show minimal for OpenAI Native provider GPT-5 models + const isOpenAiNativeProvider = apiConfiguration.apiProvider === "openai-native" + const isGpt5Model = isOpenAiNativeProvider && selectedModelId && selectedModelId.startsWith("gpt-5") + // Add "minimal" option for GPT-5 models + // Spread to convert readonly tuple into a mutable array, then expose as readonly for safety + const baseEfforts = [...reasoningEfforts] as ReasoningEffortWithMinimal[] + const availableReasoningEfforts: ReadonlyArray = isGpt5Model + ? 
(["minimal", ...baseEfforts] as ReasoningEffortWithMinimal[]) + : baseEfforts + + // Default reasoning effort - use model's default if available + // GPT-5 models have "medium" as their default in the model configuration + const modelDefaultReasoningEffort = modelInfo?.reasoningEffort as ReasoningEffortWithMinimal | undefined + const defaultReasoningEffort: ReasoningEffortWithMinimal = modelDefaultReasoningEffort || "medium" + const currentReasoningEffort: ReasoningEffortWithMinimal = + (apiConfiguration.reasoningEffort as ReasoningEffortWithMinimal | undefined) || defaultReasoningEffort + const isReasoningBudgetSupported = !!modelInfo && modelInfo.supportsReasoningBudget const isReasoningBudgetRequired = !!modelInfo && modelInfo.requiredReasoningBudget const isReasoningEffortSupported = !!modelInfo && modelInfo.supportsReasoningEffort + // Set default reasoning effort when model supports it and no value is set + useEffect(() => { + if (isReasoningEffortSupported && !apiConfiguration.reasoningEffort && defaultReasoningEffort) { + setApiConfigurationField("reasoningEffort", defaultReasoningEffort) + } + }, [isReasoningEffortSupported, apiConfiguration.reasoningEffort, defaultReasoningEffort, setApiConfigurationField]) + const enableReasoningEffort = apiConfiguration.enableReasoningEffort const customMaxOutputTokens = apiConfiguration.modelMaxTokens || DEFAULT_HYBRID_REASONING_MODEL_MAX_TOKENS const customMaxThinkingTokens = @@ -109,13 +139,21 @@ export const ThinkingBudget = ({ apiConfiguration, setApiConfigurationField, mod