diff --git a/src/api/index.ts b/src/api/index.ts index 0880f422182..c6d2b07cd22 100644 --- a/src/api/index.ts +++ b/src/api/index.ts @@ -88,21 +88,25 @@ export function getModelParams({ model, defaultMaxTokens, defaultTemperature = 0, + defaultReasoningEffort, }: { options: ApiHandlerOptions model: ModelInfo defaultMaxTokens?: number defaultTemperature?: number + defaultReasoningEffort?: "low" | "medium" | "high" }) { const { modelMaxTokens: customMaxTokens, modelMaxThinkingTokens: customMaxThinkingTokens, modelTemperature: customTemperature, + reasoningEffort: customReasoningEffort, } = options let maxTokens = model.maxTokens ?? defaultMaxTokens let thinking: BetaThinkingConfigParam | undefined = undefined let temperature = customTemperature ?? defaultTemperature + const reasoningEffort = customReasoningEffort ?? defaultReasoningEffort if (model.thinking) { // Only honor `customMaxTokens` for thinking models. @@ -118,5 +122,5 @@ export function getModelParams({ temperature = 1.0 } - return { maxTokens, thinking, temperature } + return { maxTokens, thinking, temperature, reasoningEffort } } diff --git a/src/api/providers/openai.ts b/src/api/providers/openai.ts index fc739b31105..96984d90c12 100644 --- a/src/api/providers/openai.ts +++ b/src/api/providers/openai.ts @@ -82,6 +82,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl const urlHost = this._getUrlHost(modelUrl) const deepseekReasoner = modelId.includes("deepseek-reasoner") || enabledR1Format const ark = modelUrl.includes(".volces.com") + if (modelId.startsWith("o3-mini")) { yield* this.handleO3FamilyMessage(modelId, systemPrompt, messages) return @@ -94,6 +95,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl } let convertedMessages + if (deepseekReasoner) { convertedMessages = convertToR1Format([{ role: "user", content: systemPrompt }, ...messages]) } else if (ark || enabledLegacyFormat) { @@ -112,16 +114,20 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl ], } } + convertedMessages = [systemMessage, ...convertToOpenAiMessages(messages)] + if (modelInfo.supportsPromptCache) { // Note: the following logic is copied from openrouter: // Add cache_control to the last two user messages // (note: this works because we only ever add one user message at a time, but if we added multiple we'd need to mark the user message before the last assistant message) const lastTwoUserMessages = convertedMessages.filter((msg) => msg.role === "user").slice(-2) + lastTwoUserMessages.forEach((msg) => { if (typeof msg.content === "string") { msg.content = [{ type: "text", text: msg.content }] } + if (Array.isArray(msg.content)) { // NOTE: this is fine since env details will always be added at the end. but if it weren't there, and the user added a image_url type message, it would pop a text part before it and then move it after to the end. let lastTextPart = msg.content.filter((part) => part.type === "text").pop() @@ -130,6 +136,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl lastTextPart = { type: "text", text: "..." } msg.content.push(lastTextPart) } + // @ts-ignore-next-line lastTextPart["cache_control"] = { type: "ephemeral" } } @@ -145,7 +152,9 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl messages: convertedMessages, stream: true as const, ...(isGrokXAI ? {} : { stream_options: { include_usage: true } }), + reasoning_effort: this.getModel().info.reasoningEffort, } + if (this.options.includeMaxTokens) { requestOptions.max_tokens = modelInfo.maxTokens } @@ -185,6 +194,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl lastUsage = chunk.usage } } + for (const chunk of matcher.final()) { yield chunk } @@ -217,6 +227,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl type: "text", text: response.choices[0]?.message.content || "", } + yield this.processUsageMetrics(response.usage, modelInfo) } } @@ -241,6 +252,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl async completePrompt(prompt: string): Promise { try { const isAzureAiInference = this._isAzureAiInference(this.options.openAiBaseUrl) + const requestOptions: OpenAI.Chat.Completions.ChatCompletionCreateParamsNonStreaming = { model: this.getModel().id, messages: [{ role: "user", content: prompt }], @@ -250,11 +262,13 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl requestOptions, isAzureAiInference ? { path: AZURE_AI_INFERENCE_PATH } : {}, ) + return response.choices[0]?.message.content || "" } catch (error) { if (error instanceof Error) { throw new Error(`OpenAI completion error: ${error.message}`) } + throw error } } @@ -333,6 +347,7 @@ export class OpenAiHandler extends BaseProvider implements SingleCompletionHandl } } } + private _getUrlHost(baseUrl?: string): string { try { return new URL(baseUrl ?? "").host diff --git a/src/api/providers/openrouter.ts b/src/api/providers/openrouter.ts index 72e4fe576a9..2a279d09a13 100644 --- a/src/api/providers/openrouter.ts +++ b/src/api/providers/openrouter.ts @@ -1,8 +1,7 @@ import { Anthropic } from "@anthropic-ai/sdk" import { BetaThinkingConfigParam } from "@anthropic-ai/sdk/resources/beta" -import axios, { AxiosRequestConfig } from "axios" +import axios from "axios" import OpenAI from "openai" -import delay from "delay" import { ApiHandlerOptions, ModelInfo, openRouterDefaultModelId, openRouterDefaultModelInfo } from "../../shared/api" import { parseApiPrice } from "../../utils/cost" @@ -22,6 +21,12 @@ type OpenRouterChatCompletionParams = OpenAI.Chat.ChatCompletionCreateParams & { transforms?: string[] include_reasoning?: boolean thinking?: BetaThinkingConfigParam + // https://openrouter.ai/docs/use-cases/reasoning-tokens + reasoning?: { + effort?: "high" | "medium" | "low" + max_tokens?: number + exclude?: boolean + } } export class OpenRouterHandler extends BaseProvider implements SingleCompletionHandler { @@ -42,7 +47,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH systemPrompt: string, messages: Anthropic.Messages.MessageParam[], ): AsyncGenerator { - let { id: modelId, maxTokens, thinking, temperature, topP } = this.getModel() + let { id: modelId, maxTokens, thinking, temperature, topP, reasoningEffort } = this.getModel() // Convert Anthropic messages to OpenAI format. let openAiMessages: OpenAI.Chat.ChatCompletionMessageParam[] = [ @@ -70,13 +75,16 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH }, ], } + // Add cache_control to the last two user messages // (note: this works because we only ever add one user message at a time, but if we added multiple we'd need to mark the user message before the last assistant message) const lastTwoUserMessages = openAiMessages.filter((msg) => msg.role === "user").slice(-2) + lastTwoUserMessages.forEach((msg) => { if (typeof msg.content === "string") { msg.content = [{ type: "text", text: msg.content }] } + if (Array.isArray(msg.content)) { // NOTE: this is fine since env details will always be added at the end. but if it weren't there, and the user added a image_url type message, it would pop a text part before it and then move it after to the end. let lastTextPart = msg.content.filter((part) => part.type === "text").pop() @@ -113,6 +121,7 @@ export class OpenRouterHandler extends BaseProvider implements SingleCompletionH }), // This way, the transforms field will only be included in the parameters when openRouterUseMiddleOutTransform is true. ...((this.options.openRouterUseMiddleOutTransform ?? true) && { transforms: ["middle-out"] }), + ...(reasoningEffort && { reasoning: { effort: reasoningEffort } }), } const stream = await this.client.chat.completions.create(completionParams) diff --git a/webview-ui/src/components/settings/ApiOptions.tsx b/webview-ui/src/components/settings/ApiOptions.tsx index 21f40c92af6..813da049317 100644 --- a/webview-ui/src/components/settings/ApiOptions.tsx +++ b/webview-ui/src/components/settings/ApiOptions.tsx @@ -46,7 +46,7 @@ import { OPENROUTER_DEFAULT_PROVIDER_NAME, } from "@/components/ui/hooks/useOpenRouterModelProviders" import { Select, SelectContent, SelectItem, SelectTrigger, SelectValue, SelectSeparator, Button } from "@/components/ui" -import { MODELS_BY_PROVIDER, PROVIDERS, VERTEX_REGIONS } from "./constants" +import { MODELS_BY_PROVIDER, PROVIDERS, VERTEX_REGIONS, REASONING_MODELS } from "./constants" import { AWS_REGIONS } from "../../../../src/shared/aws_regions" import { VSCodeButtonLink } from "../common/VSCodeButtonLink" import { ModelInfoView } from "./ModelInfoView" @@ -58,6 +58,7 @@ import { ThinkingBudget } from "./ThinkingBudget" import { R1FormatSetting } from "./R1FormatSetting" import { OpenRouterBalanceDisplay } from "./OpenRouterBalanceDisplay" import { RequestyBalanceDisplay } from "./RequestyBalanceDisplay" +import { ReasoningEffort } from "./ReasoningEffort" interface ApiOptionsProps { uriScheme: string | undefined @@ -1537,6 +1538,13 @@ const ApiOptions = ({ )} + {selectedProvider === "openrouter" && REASONING_MODELS.has(selectedModelId) && ( + + )} + {selectedProvider === "glama" && ( )} + + (field: K, value: ApiConfiguration[K]) => void +} + +export const ReasoningEffort = ({ apiConfiguration, setApiConfigurationField }: ReasoningEffortProps) => { + const { t } = useAppTranslation() + + return ( +
+
+ +
+ +
+ ) +} diff --git a/webview-ui/src/components/settings/constants.ts b/webview-ui/src/components/settings/constants.ts index 7013a59cfdb..6432a8faf6d 100644 --- a/webview-ui/src/components/settings/constants.ts +++ b/webview-ui/src/components/settings/constants.ts @@ -46,3 +46,5 @@ export const VERTEX_REGIONS = [ { value: "europe-west4", label: "europe-west4" }, { value: "asia-southeast1", label: "asia-southeast1" }, ] + +export const REASONING_MODELS = new Set(["x-ai/grok-3-mini-beta"]) diff --git a/webview-ui/src/i18n/locales/ca/settings.json b/webview-ui/src/i18n/locales/ca/settings.json index af009b4337a..00fb251eab8 100644 --- a/webview-ui/src/i18n/locales/ca/settings.json +++ b/webview-ui/src/i18n/locales/ca/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "Límit de freqüència", "description": "Temps mínim entre sol·licituds d'API." + }, + "reasoningEffort": { + "label": "Esforç de raonament del model", + "high": "Alt", + "medium": "Mitjà", + "low": "Baix" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/de/settings.json b/webview-ui/src/i18n/locales/de/settings.json index 11ff0d5a067..59d986be181 100644 --- a/webview-ui/src/i18n/locales/de/settings.json +++ b/webview-ui/src/i18n/locales/de/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "Ratenbegrenzung", "description": "Minimale Zeit zwischen API-Anfragen." + }, + "reasoningEffort": { + "label": "Modell-Denkaufwand", + "high": "Hoch", + "medium": "Mittel", + "low": "Niedrig" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/en/settings.json b/webview-ui/src/i18n/locales/en/settings.json index b494ea01e51..e2770854244 100644 --- a/webview-ui/src/i18n/locales/en/settings.json +++ b/webview-ui/src/i18n/locales/en/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "Rate limit", "description": "Minimum time between API requests." + }, + "reasoningEffort": { + "label": "Model Reasoning Effort", + "high": "High", + "medium": "Medium", + "low": "Low" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/es/settings.json b/webview-ui/src/i18n/locales/es/settings.json index 0b1f40d5a21..af6e2b218eb 100644 --- a/webview-ui/src/i18n/locales/es/settings.json +++ b/webview-ui/src/i18n/locales/es/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "Límite de tasa", "description": "Tiempo mínimo entre solicitudes de API." + }, + "reasoningEffort": { + "label": "Esfuerzo de razonamiento del modelo", + "high": "Alto", + "medium": "Medio", + "low": "Bajo" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/fr/settings.json b/webview-ui/src/i18n/locales/fr/settings.json index d2499a90ad9..948dfb127bb 100644 --- a/webview-ui/src/i18n/locales/fr/settings.json +++ b/webview-ui/src/i18n/locales/fr/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "Limite de débit", "description": "Temps minimum entre les requêtes API." + }, + "reasoningEffort": { + "label": "Effort de raisonnement du modèle", + "high": "Élevé", + "medium": "Moyen", + "low": "Faible" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/hi/settings.json b/webview-ui/src/i18n/locales/hi/settings.json index 8572ad1008d..1aaf89e9465 100644 --- a/webview-ui/src/i18n/locales/hi/settings.json +++ b/webview-ui/src/i18n/locales/hi/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "दर सीमा", "description": "API अनुरोधों के बीच न्यूनतम समय।" + }, + "reasoningEffort": { + "label": "मॉडल तर्क प्रयास", + "high": "उच्च", + "medium": "मध्यम", + "low": "निम्न" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/it/settings.json b/webview-ui/src/i18n/locales/it/settings.json index 50282e98f91..570bca7d2ef 100644 --- a/webview-ui/src/i18n/locales/it/settings.json +++ b/webview-ui/src/i18n/locales/it/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "Limite di frequenza", "description": "Tempo minimo tra le richieste API." + }, + "reasoningEffort": { + "label": "Sforzo di ragionamento del modello", + "high": "Alto", + "medium": "Medio", + "low": "Basso" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/ja/settings.json b/webview-ui/src/i18n/locales/ja/settings.json index e41d8e361c3..101f56cd8a3 100644 --- a/webview-ui/src/i18n/locales/ja/settings.json +++ b/webview-ui/src/i18n/locales/ja/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "レート制限", "description": "APIリクエスト間の最小時間。" + }, + "reasoningEffort": { + "label": "モデル推論の労力", + "high": "高", + "medium": "中", + "low": "低" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/ko/settings.json b/webview-ui/src/i18n/locales/ko/settings.json index 05e7aa29444..c13e7e8f732 100644 --- a/webview-ui/src/i18n/locales/ko/settings.json +++ b/webview-ui/src/i18n/locales/ko/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "속도 제한", "description": "API 요청 간 최소 시간." + }, + "reasoningEffort": { + "label": "모델 추론 노력", + "high": "높음", + "medium": "중간", + "low": "낮음" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/pl/settings.json b/webview-ui/src/i18n/locales/pl/settings.json index 2d27e9a85cd..534ee152346 100644 --- a/webview-ui/src/i18n/locales/pl/settings.json +++ b/webview-ui/src/i18n/locales/pl/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "Limit szybkości", "description": "Minimalny czas między żądaniami API." + }, + "reasoningEffort": { + "label": "Wysiłek rozumowania modelu", + "high": "Wysoki", + "medium": "Średni", + "low": "Niski" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/pt-BR/settings.json b/webview-ui/src/i18n/locales/pt-BR/settings.json index 9181d8fdb32..5df5798a6de 100644 --- a/webview-ui/src/i18n/locales/pt-BR/settings.json +++ b/webview-ui/src/i18n/locales/pt-BR/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "Limite de taxa", "description": "Tempo mínimo entre requisições de API." + }, + "reasoningEffort": { + "label": "Esforço de raciocínio do modelo", + "high": "Alto", + "medium": "Médio", + "low": "Baixo" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/tr/settings.json b/webview-ui/src/i18n/locales/tr/settings.json index 5f26eea0b9b..9723383005b 100644 --- a/webview-ui/src/i18n/locales/tr/settings.json +++ b/webview-ui/src/i18n/locales/tr/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "Hız sınırı", "description": "API istekleri arasındaki minimum süre." + }, + "reasoningEffort": { + "label": "Model Akıl Yürütme Çabası", + "high": "Yüksek", + "medium": "Orta", + "low": "Düşük" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/vi/settings.json b/webview-ui/src/i18n/locales/vi/settings.json index 824635fbf67..5ab7fe9b28a 100644 --- a/webview-ui/src/i18n/locales/vi/settings.json +++ b/webview-ui/src/i18n/locales/vi/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "Giới hạn tốc độ", "description": "Thời gian tối thiểu giữa các yêu cầu API." + }, + "reasoningEffort": { + "label": "Nỗ lực suy luận của mô hình", + "high": "Cao", + "medium": "Trung bình", + "low": "Thấp" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/zh-CN/settings.json b/webview-ui/src/i18n/locales/zh-CN/settings.json index 97067ffa62f..da85a296bbb 100644 --- a/webview-ui/src/i18n/locales/zh-CN/settings.json +++ b/webview-ui/src/i18n/locales/zh-CN/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "请求频率限制", "description": "设置API请求的最小间隔时间" + }, + "reasoningEffort": { + "label": "模型推理强度", + "high": "高", + "medium": "中", + "low": "低" } }, "browser": { diff --git a/webview-ui/src/i18n/locales/zh-TW/settings.json b/webview-ui/src/i18n/locales/zh-TW/settings.json index cf99713793e..f98c40e6078 100644 --- a/webview-ui/src/i18n/locales/zh-TW/settings.json +++ b/webview-ui/src/i18n/locales/zh-TW/settings.json @@ -225,6 +225,12 @@ "rateLimitSeconds": { "label": "速率限制", "description": "API 請求間的最短時間" + }, + "reasoningEffort": { + "label": "模型推理強度", + "high": "高", + "medium": "中", + "low": "低" } }, "browser": {