@@ -18,7 +18,7 @@ import type { ApiHandlerOptions } from "../../shared/api"
 import { calculateApiCostOpenAI } from "../../shared/cost"
 
 import { convertToOpenAiMessages } from "../transform/openai-format"
-import { ApiStream } from "../transform/stream"
+import { ApiStream, ApiStreamUsageChunk } from "../transform/stream"
 import { getModelParams } from "../transform/model-params"
 
 import { BaseProvider } from "./base-provider"
@@ -42,6 +42,41 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		this.client = new OpenAI({ baseURL: this.options.openAiNativeBaseUrl, apiKey })
 	}
 
+	// Review comment 1: Extract usage normalization helper
+	private normalizeGpt5Usage(usage: any, model: OpenAiNativeModel): ApiStreamUsageChunk | undefined {
+		if (!usage) return undefined
+
+		const totalInputTokens = usage.input_tokens ?? usage.prompt_tokens ?? 0
+		const totalOutputTokens = usage.output_tokens ?? usage.completion_tokens ?? 0
+		const cacheWriteTokens = usage.cache_creation_input_tokens ?? usage.cache_write_tokens ?? 0
+		const cacheReadTokens = usage.cache_read_input_tokens ?? usage.cache_read_tokens ?? usage.cached_tokens ?? 0
+
+		const inputCost = (totalInputTokens / 1_000_000) * (model.info.inputPrice || 0)
+		const outputCost = (totalOutputTokens / 1_000_000) * (model.info.outputPrice || 0)
+		const totalCost = inputCost + outputCost
+
+		return {
+			type: "usage",
+			inputTokens: totalInputTokens,
+			outputTokens: totalOutputTokens,
+			cacheWriteTokens,
+			cacheReadTokens,
+			totalCost,
+		}
+	}
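+
+	// Illustrative call for the helper above (hypothetical payload, shown for
+	// documentation only): normalizeGpt5Usage({ input_tokens: 1200, output_tokens: 300, cached_tokens: 800 }, model)
+	// returns { type: "usage", inputTokens: 1200, outputTokens: 300, cacheWriteTokens: 0,
+	// cacheReadTokens: 800, totalCost: (1200 * inputPrice + 300 * outputPrice) / 1_000_000 }.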
+
+	// Review comment 3: Deduplicate response ID resolver logic
+	private resolveResponseId(responseId: string | undefined): void {
+		if (responseId) {
+			this.lastResponseId = responseId
+		}
+		// Resolve the promise so the next request can use this ID
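+		// (the resolver fires even when responseId is undefined, so a request awaiting
+		// the previous response ID is unblocked rather than left hanging)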
+		if (this.responseIdResolver) {
+			this.responseIdResolver(responseId)
+			this.responseIdResolver = undefined
+		}
+	}
+
 	override async *createMessage(
 		systemPrompt: string,
 		messages: Anthropic.Messages.MessageParam[],
@@ -211,7 +246,8 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			input: formattedInput,
 			stream: true,
 			...(reasoningEffort && {
-				// Always request reasoning summaries for GPT‑5 responses
+				// Review comment 4: Remove dead enableGpt5ReasoningSummary option
+				// Always request reasoning summaries for GPT‑5 responses (no longer configurable)
 				reasoning: {
 					effort: reasoningEffort,
 					summary: "auto",
@@ -292,7 +328,9 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 
 	private formatInputForResponsesAPI(systemPrompt: string, messages: Anthropic.Messages.MessageParam[]): string {
 		// Format the conversation for the Responses API input field
-		// Use Developer role format for GPT-5 (similar to o1/o3 models)
+		// Review comment 8: Add clarifying comment for Developer prefix
+		// Use Developer role format for GPT-5 (aligning with o1/o3 Developer role usage per GPT-5 Responses guidance)
+		// This ensures consistent instruction handling across reasoning models
 		let formattedInput = `Developer: ${systemPrompt}\n\n`
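+		// Illustrative shape of the result (assuming the loop below renders each turn
+		// with a role prefix): "Developer: <system>\n\nUser: ...\n\nAssistant: ..."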
 
 		for (const message of messages) {
@@ -502,7 +540,11 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		messages: Anthropic.Messages.MessageParam[],
 		metadata?: ApiHandlerCreateMessageMetadata,
 	): { formattedInput: string; previousResponseId?: string } {
-		const previousResponseId = metadata?.previousResponseId || this.lastResponseId
+		// Review comment 2: Suppress conversation continuity for first message
+		// Don't use the stored lastResponseId if this is the first user message in the conversation
+		// But always respect an explicitly provided previousResponseId from metadata
+		const isFirstMessage = messages.length === 1 && messages[0].role === "user"
+		const previousResponseId = metadata?.previousResponseId || (!isFirstMessage ? this.lastResponseId : undefined)
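+		// Example (hypothetical flow): a brand-new conversation arrives as a single user
+		// message, so a lastResponseId left over from a prior task is ignored, while a
+		// caller that explicitly passes metadata.previousResponseId still takes precedence.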
 
 		if (previousResponseId) {
 			const lastUserMessage = [...messages].reverse().find((msg) => msg.role === "user")
@@ -558,12 +600,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 
 					// Store response ID for conversation continuity
 					if (parsed.response?.id) {
-						this.lastResponseId = parsed.response.id
-						// Resolve the promise so the next request can use this ID
-						if (this.responseIdResolver) {
-							this.responseIdResolver(parsed.response.id)
-							this.responseIdResolver = undefined
-						}
+						this.resolveResponseId(parsed.response.id)
 					}
 
 					// Check if this is a complete response (non-streaming format)
@@ -596,30 +633,9 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 						}
 						// Check for usage in the complete response
 						if (parsed.response.usage) {
-							const usage = parsed.response.usage
-							totalInputTokens = usage.input_tokens || usage.prompt_tokens || 0
-							totalOutputTokens = usage.output_tokens || usage.completion_tokens || 0
-
-							// Extract cache tokens from the usage object
-							const cacheWriteTokens =
-								usage.cache_creation_input_tokens || usage.cache_write_tokens || 0
-							const cacheReadTokens =
-								usage.cache_read_input_tokens ||
-								usage.cache_read_tokens ||
-								usage.cached_tokens ||
-								0
-
-							const inputCost = (totalInputTokens / 1_000_000) * (model.info.inputPrice || 0)
-							const outputCost = (totalOutputTokens / 1_000_000) * (model.info.outputPrice || 0)
-							const totalCost = inputCost + outputCost
-
-							yield {
-								type: "usage",
-								inputTokens: totalInputTokens,
-								outputTokens: totalOutputTokens,
-								cacheWriteTokens: cacheWriteTokens,
-								cacheReadTokens: cacheReadTokens,
-								totalCost: totalCost,
+							const usageData = this.normalizeGpt5Usage(parsed.response.usage, model)
+							if (usageData) {
+								yield usageData
 							}
 						}
 					}
@@ -863,12 +879,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 					} else if (parsed.type === "response.completed" || parsed.type === "response.done") {
 						// Store response ID for conversation continuity
 						if (parsed.response?.id) {
-							this.lastResponseId = parsed.response.id
-							// Resolve the promise so the next request can use this ID
-							if (this.responseIdResolver) {
-								this.responseIdResolver(parsed.response.id)
-								this.responseIdResolver = undefined
-							}
+							this.resolveResponseId(parsed.response.id)
 						}
 
 						// Check if the done event contains the complete output (as a fallback)
@@ -910,30 +921,9 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 
 						// Extract final usage information from the 'completed' event
 						if (parsed.response?.usage) {
-							const usage = parsed.response.usage
-							totalInputTokens = usage.input_tokens || usage.prompt_tokens || 0
-							totalOutputTokens = usage.output_tokens || usage.completion_tokens || 0
-
-							// Extract cache tokens from the usage object
-							const cacheWriteTokens =
-								usage.cache_creation_input_tokens || usage.cache_write_tokens || 0
-							const cacheReadTokens =
-								usage.cache_read_input_tokens ||
-								usage.cache_read_tokens ||
-								usage.cached_tokens ||
-								0
-
-							const inputCost = (totalInputTokens / 1_000_000) * (model.info.inputPrice || 0)
-							const outputCost = (totalOutputTokens / 1_000_000) * (model.info.outputPrice || 0)
-							const totalCost = inputCost + outputCost
-
-							yield {
-								type: "usage",
-								inputTokens: totalInputTokens,
-								outputTokens: totalOutputTokens,
-								cacheWriteTokens: cacheWriteTokens,
-								cacheReadTokens: cacheReadTokens,
-								totalCost: totalCost,
+							const usageData = this.normalizeGpt5Usage(parsed.response.usage, model)
+							if (usageData) {
+								yield usageData
 							}
 						}
 					}
@@ -968,27 +958,9 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 						}
 					} else if (parsed.usage) {
 						// Handle usage if it arrives in a separate, non-completed event
-						const usage = parsed.usage
-						totalInputTokens = usage.input_tokens || usage.prompt_tokens || 0
-						totalOutputTokens = usage.output_tokens || usage.completion_tokens || 0
-
-						// Extract cache tokens from the usage object
-						const cacheWriteTokens =
-							usage.cache_creation_input_tokens || usage.cache_write_tokens || 0
-						const cacheReadTokens =
-							usage.cache_read_input_tokens || usage.cache_read_tokens || usage.cached_tokens || 0
-
-						const inputCost = (totalInputTokens / 1_000_000) * (model.info.inputPrice || 0)
-						const outputCost = (totalOutputTokens / 1_000_000) * (model.info.outputPrice || 0)
-						const totalCost = inputCost + outputCost
-
-						yield {
-							type: "usage",
-							inputTokens: totalInputTokens,
-							outputTokens: totalOutputTokens,
-							cacheWriteTokens: cacheWriteTokens,
-							cacheReadTokens: cacheReadTokens,
-							totalCost: totalCost,
+						const usageData = this.normalizeGpt5Usage(parsed.usage, model)
+						if (usageData) {
+							yield usageData
 						}
 					}
 				} catch (e) {
@@ -1034,12 +1006,7 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 	private async *processGpt5Event(event: any, model: OpenAiNativeModel): ApiStream {
 		// Persist response id for conversation continuity when available
 		if (event?.response?.id) {
-			this.lastResponseId = event.response.id
-			// Resolve the promise so the next request can use this ID
-			if (this.responseIdResolver) {
-				this.responseIdResolver(event.response.id)
-				this.responseIdResolver = undefined
-			}
+			this.resolveResponseId(event.response.id)
 		}
 
 		// Handle known streaming text deltas
@@ -1094,26 +1061,9 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		// Completion events that may carry usage
 		if (event?.type === "response.done" || event?.type === "response.completed") {
 			const usage = event?.response?.usage || event?.usage || undefined
-
-			if (usage) {
-				const totalInputTokens = usage.input_tokens ?? usage.prompt_tokens ?? 0
-				const totalOutputTokens = usage.output_tokens ?? usage.completion_tokens ?? 0
-				const cacheWriteTokens = usage.cache_creation_input_tokens ?? usage.cache_write_tokens ?? 0
-				const cacheReadTokens =
-					usage.cache_read_input_tokens ?? usage.cache_read_tokens ?? usage.cached_tokens ?? 0
-
-				const inputCost = (totalInputTokens / 1_000_000) * (model.info.inputPrice || 0)
-				const outputCost = (totalOutputTokens / 1_000_000) * (model.info.outputPrice || 0)
-				const totalCost = inputCost + outputCost
-
-				yield {
-					type: "usage",
-					inputTokens: totalInputTokens,
-					outputTokens: totalOutputTokens,
-					cacheWriteTokens,
-					cacheReadTokens,
-					totalCost,
-				}
+			const usageData = this.normalizeGpt5Usage(usage, model)
+			if (usageData) {
+				yield usageData
 			}
 			return
 		}
@@ -1125,29 +1075,15 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 		}
 
 		if (event?.usage) {
-			const usage = event.usage
-			const totalInputTokens = usage.input_tokens ?? usage.prompt_tokens ?? 0
-			const totalOutputTokens = usage.output_tokens ?? usage.completion_tokens ?? 0
-			const cacheWriteTokens = usage.cache_creation_input_tokens ?? usage.cache_write_tokens ?? 0
-			const cacheReadTokens = usage.cache_read_input_tokens ?? usage.cache_read_tokens ?? usage.cached_tokens ?? 0
-
-			const inputCost = (totalInputTokens / 1_000_000) * (model.info.inputPrice || 0)
-			const outputCost = (totalOutputTokens / 1_000_000) * (model.info.outputPrice || 0)
-			const totalCost = inputCost + outputCost
-
-			yield {
-				type: "usage",
-				inputTokens: totalInputTokens,
-				outputTokens: totalOutputTokens,
-				cacheWriteTokens,
-				cacheReadTokens,
-				totalCost,
+			const usageData = this.normalizeGpt5Usage(event.usage, model)
+			if (usageData) {
+				yield usageData
 			}
 		}
 	}
 
 	private getGpt5ReasoningEffort(model: OpenAiNativeModel): ReasoningEffortWithMinimal | undefined {
-		const { reasoning } = model
+		const { reasoning, info } = model
 
 		// Check if reasoning effort is configured
 		if (reasoning && "reasoning_effort" in reasoning) {
@@ -1158,8 +1094,9 @@ export class OpenAiNativeHandler extends BaseProvider implements SingleCompletio
 			}
 		}
 
-		// Default to "medium" for GPT-5 models when not specified
-		return "medium"
+		// Review comment 6: Centralize default GPT-5 reasoning effort
+		// Use the model's default from types if available, otherwise fall back to "medium"
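+		// (e.g. a hypothetical model entry whose ModelInfo sets reasoningEffort: "minimal"
+		// now defaults to "minimal" instead of the previously hard-coded "medium")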
+		return (info.reasoningEffort as ReasoningEffortWithMinimal) || "medium"
 	}
 
 	private isGpt5Model(modelId: string): boolean {