diff --git a/app/lib/.server/llm/constants.ts b/app/lib/.server/llm/constants.ts
index e21fe827ec..68408166f3 100644
--- a/app/lib/.server/llm/constants.ts
+++ b/app/lib/.server/llm/constants.ts
@@ -4,6 +4,44 @@
  */
 export const MAX_TOKENS = 32000;
 
+/*
+ * Provider-specific default completion token limits
+ * Used as fallbacks when model doesn't specify maxCompletionTokens
+ */
+export const PROVIDER_COMPLETION_LIMITS: Record<string, number> = {
+  OpenAI: 16384,
+  Github: 16384, // GitHub Models use OpenAI-compatible limits
+  Anthropic: 128000,
+  Google: 32768,
+  Cohere: 4000,
+  DeepSeek: 8192,
+  Groq: 8192,
+  HuggingFace: 4096,
+  Mistral: 8192,
+  Ollama: 8192,
+  OpenRouter: 8192,
+  Perplexity: 8192,
+  Together: 8192,
+  xAI: 8192,
+  LMStudio: 8192,
+  OpenAILike: 8192,
+  AmazonBedrock: 8192,
+  Hyperbolic: 8192,
+};
+
+/*
+ * Reasoning models that require maxCompletionTokens instead of maxTokens
+ * These models use internal reasoning tokens and have different API parameter requirements
+ */
+export function isReasoningModel(modelName: string): boolean {
+  const result = /^(o1|o3|gpt-5)/i.test(modelName);
+
+  // DEBUG: Test regex matching
+  console.log(`REGEX TEST: "${modelName}" matches reasoning pattern: ${result}`);
+
+  return result;
+}
+
 // limits the number of model responses that can be returned in a single request
 export const MAX_RESPONSE_SEGMENTS = 2;
 
diff --git a/app/lib/.server/llm/stream-text.ts b/app/lib/.server/llm/stream-text.ts
index 9dcf62ba5c..c458c89520 100644
--- a/app/lib/.server/llm/stream-text.ts
+++ b/app/lib/.server/llm/stream-text.ts
@@ -1,5 +1,5 @@
 import { convertToCoreMessages, streamText as _streamText, type Message } from 'ai';
-import { MAX_TOKENS, type FileMap } from './constants';
+import { MAX_TOKENS, PROVIDER_COMPLETION_LIMITS, isReasoningModel, type FileMap } from './constants';
 import { getSystemPrompt } from '~/lib/common/prompts/prompts';
 import { DEFAULT_MODEL, DEFAULT_PROVIDER, MODIFICATIONS_TAG_NAME, PROVIDER_LIST, WORK_DIR } from '~/utils/constants';
 import type { IProviderSetting } from '~/types/model';
@@ -26,6 +26,23 @@ export interface StreamingOptions extends Omit<Parameters<typeof _streamText>[0]
 
 const logger = createScopedLogger('stream-text');
 
+function getCompletionTokenLimit(modelDetails: any): number {
+  // 1. If model specifies completion tokens, use that
+  if (modelDetails.maxCompletionTokens && modelDetails.maxCompletionTokens > 0) {
+    return modelDetails.maxCompletionTokens;
+  }
+
+  // 2. Use provider-specific default
+  const providerDefault = PROVIDER_COMPLETION_LIMITS[modelDetails.provider];
+
+  if (providerDefault) {
+    return providerDefault;
+  }
+
+  // 3. Final fallback to MAX_TOKENS, but cap at reasonable limit for safety
+  return Math.min(MAX_TOKENS, 16384);
+}
+
 function sanitizeText(text: string): string {
   let sanitized = text.replace(/<div class=\"__boltThought__\">.*?<\/div>/s, '');
   sanitized = sanitized.replace(/<think>.*?<\/think>/s, '');
@@ -123,10 +140,10 @@ export async function streamText(props: {
     }
   }
 
-  const dynamicMaxTokens = modelDetails && modelDetails.maxTokenAllowed ? modelDetails.maxTokenAllowed : MAX_TOKENS;
+  const dynamicMaxTokens = modelDetails ? getCompletionTokenLimit(modelDetails) : Math.min(MAX_TOKENS, 16384);
 
-  // Ensure we never exceed reasonable token limits to prevent API errors
-  const safeMaxTokens = Math.min(dynamicMaxTokens, 100000); // Cap at 100k for safety
+  // Additional safety cap - should not be needed with proper completion limits, but kept for safety
+  const safeMaxTokens = Math.min(dynamicMaxTokens, 128000);
 
   logger.info(
     `Max tokens for model ${modelDetails.name} is ${safeMaxTokens} (capped from ${dynamicMaxTokens}) based on model limits`,
@@ -204,9 +221,52 @@
 
   logger.info(`Sending llm call to ${provider.name} with model ${modelDetails.name}`);
 
+  // DEBUG: Log reasoning model detection
+  const isReasoning = isReasoningModel(modelDetails.name);
+  logger.info(`DEBUG STREAM: Model "${modelDetails.name}" detected as reasoning model: ${isReasoning}`);
+
   // console.log(systemPrompt, processedMessages);
 
-  return await _streamText({
+  // Use maxCompletionTokens for reasoning models (o1, GPT-5), maxTokens for traditional models
+  const tokenParams = isReasoning ? { maxCompletionTokens: safeMaxTokens } : { maxTokens: safeMaxTokens };
+
+  // Filter out unsupported parameters for reasoning models
+  const filteredOptions =
+    isReasoning && options
+      ? Object.fromEntries(
+          Object.entries(options).filter(
+            ([key]) =>
+              ![
+                'temperature',
+                'topP',
+                'presencePenalty',
+                'frequencyPenalty',
+                'logprobs',
+                'topLogprobs',
+                'logitBias',
+              ].includes(key),
+          ),
+        )
+      : options || {};
+
+  // DEBUG: Log filtered options
+  logger.info(
+    `DEBUG STREAM: Options filtering for model "${modelDetails.name}":`,
+    JSON.stringify(
+      {
+        isReasoning,
+        originalOptions: options || {},
+        filteredOptions,
+        originalOptionsKeys: options ? Object.keys(options) : [],
+        filteredOptionsKeys: Object.keys(filteredOptions),
+        removedParams: options ? Object.keys(options).filter((key) => !(key in filteredOptions)) : [],
+      },
+      null,
+      2,
+    ),
+  );
+
+  const streamParams = {
     model: provider.getModelInstance({
       model: modelDetails.name,
       serverEnv,
@@ -214,8 +274,31 @@
       providerSettings,
     }),
     system: chatMode === 'build' ? systemPrompt : discussPrompt(),
-    maxTokens: safeMaxTokens,
+    ...tokenParams,
     messages: convertToCoreMessages(processedMessages as any),
-    ...options,
-  });
+    ...filteredOptions,
+
+    // Set temperature to 1 for reasoning models (required by OpenAI API)
+    ...(isReasoning ? { temperature: 1 } : {}),
+  };
+
+  // DEBUG: Log final streaming parameters
+  logger.info(
+    `DEBUG STREAM: Final streaming params for model "${modelDetails.name}":`,
+    JSON.stringify(
+      {
+        hasTemperature: 'temperature' in streamParams,
+        hasMaxTokens: 'maxTokens' in streamParams,
+        hasMaxCompletionTokens: 'maxCompletionTokens' in streamParams,
+        paramKeys: Object.keys(streamParams).filter((key) => !['model', 'messages', 'system'].includes(key)),
+        streamParams: Object.fromEntries(
+          Object.entries(streamParams).filter(([key]) => !['model', 'messages', 'system'].includes(key)),
+        ),
+      },
+      null,
+      2,
+    ),
+  );
+
+  return await _streamText(streamParams);
 }
diff --git a/app/lib/modules/llm/providers/anthropic.ts b/app/lib/modules/llm/providers/anthropic.ts
index af370900d1..4529d40bc5 100644
--- a/app/lib/modules/llm/providers/anthropic.ts
+++ b/app/lib/modules/llm/providers/anthropic.ts
@@ -22,6 +22,7 @@ export default class AnthropicProvider extends BaseProvider {
       label: 'Claude 3.5 Sonnet',
       provider: 'Anthropic',
       maxTokenAllowed: 200000,
+      maxCompletionTokens: 128000,
     },
 
     // Claude 3 Haiku: 200k context, fastest and most cost-effective
@@ -30,6 +31,7 @@ export default class AnthropicProvider extends BaseProvider {
       label: 'Claude 3 Haiku',
       provider: 'Anthropic',
       maxTokenAllowed: 200000,
+      maxCompletionTokens: 128000,
     },
   ];
 
@@ -84,6 +86,7 @@ export default class AnthropicProvider extends BaseProvider {
         label: `${m.display_name} (${Math.floor(contextWindow / 1000)}k context)`,
         provider: this.name,
         maxTokenAllowed: contextWindow,
+        maxCompletionTokens: 128000, // Claude models support up to 128k completion tokens
       };
     });
   }
diff --git a/app/lib/modules/llm/providers/github.ts b/app/lib/modules/llm/providers/github.ts
index 56745fde54..322be6b73a 100644
--- a/app/lib/modules/llm/providers/github.ts
+++ b/app/lib/modules/llm/providers/github.ts
@@ -14,13 +14,31 @@ export default class GithubProvider extends BaseProvider {
 
   // find more in https://github.com/marketplace?type=models
   staticModels: ModelInfo[] = [
-    { name: 'gpt-4o', label: 'GPT-4o', provider: 'Github', maxTokenAllowed: 8000 },
-    { name: 'o1', label: 'o1-preview', provider: 'Github', maxTokenAllowed: 100000 },
-    { name: 'o1-mini', label: 'o1-mini', provider: 'Github', maxTokenAllowed: 8000 },
-    { name: 'gpt-4o-mini', label: 'GPT-4o Mini', provider: 'Github', maxTokenAllowed: 8000 },
-    { name: 'gpt-4-turbo', label: 'GPT-4 Turbo', provider: 'Github', maxTokenAllowed: 8000 },
-    { name: 'gpt-4', label: 'GPT-4', provider: 'Github', maxTokenAllowed: 8000 },
-    { name: 'gpt-3.5-turbo', label: 'GPT-3.5 Turbo', provider: 'Github', maxTokenAllowed: 8000 },
+    { name: 'gpt-4o', label: 'GPT-4o', provider: 'Github', maxTokenAllowed: 128000, maxCompletionTokens: 16384 },
+    { name: 'o1', label: 'o1-preview', provider: 'Github', maxTokenAllowed: 100000, maxCompletionTokens: 16384 },
+    { name: 'o1-mini', label: 'o1-mini', provider: 'Github', maxTokenAllowed: 65536, maxCompletionTokens: 8192 },
+    {
+      name: 'gpt-4o-mini',
+      label: 'GPT-4o Mini',
+      provider: 'Github',
+      maxTokenAllowed: 128000,
+      maxCompletionTokens: 16384,
+    },
+    {
+      name: 'gpt-4-turbo',
+      label: 'GPT-4 Turbo',
+      provider: 'Github',
+      maxTokenAllowed: 128000,
+      maxCompletionTokens: 8192,
+    },
+    { name: 'gpt-4', label: 'GPT-4', provider: 'Github', maxTokenAllowed: 8192, maxCompletionTokens: 8192 },
+    {
+      name: 'gpt-3.5-turbo',
+      label: 'GPT-3.5 Turbo',
+      provider: 'Github',
+      maxTokenAllowed: 16385,
+      maxCompletionTokens: 4096,
+    },
   ];
 
   getModelInstance(options: {
diff --git a/app/lib/modules/llm/providers/google.ts b/app/lib/modules/llm/providers/google.ts
index d5f7ff1b22..c24c0873fa 100644
--- a/app/lib/modules/llm/providers/google.ts
+++ b/app/lib/modules/llm/providers/google.ts
@@ -17,10 +17,22 @@ export default class GoogleProvider extends BaseProvider {
      * Essential fallback models - only the most reliable/stable ones
      * Gemini 1.5 Pro: 2M context, excellent for complex reasoning and large codebases
      */
-    { name: 'gemini-1.5-pro', label: 'Gemini 1.5 Pro', provider: 'Google', maxTokenAllowed: 2000000 },
+    {
+      name: 'gemini-1.5-pro',
+      label: 'Gemini 1.5 Pro',
+      provider: 'Google',
+      maxTokenAllowed: 2000000,
+      maxCompletionTokens: 32768,
+    },
 
     // Gemini 1.5 Flash: 1M context, fast and cost-effective
-    { name: 'gemini-1.5-flash', label: 'Gemini 1.5 Flash', provider: 'Google', maxTokenAllowed: 1000000 },
+    {
+      name: 'gemini-1.5-flash',
+      label: 'Gemini 1.5 Flash',
+      provider: 'Google',
+      maxTokenAllowed: 1000000,
+      maxCompletionTokens: 32768,
+    },
   ];
 
   async getDynamicModels(
@@ -89,11 +101,19 @@ export default class GoogleProvider extends BaseProvider {
       const maxAllowed = 2000000; // 2M tokens max
       const finalContext = Math.min(contextWindow, maxAllowed);
 
+      // Get completion token limit from Google API
+      let completionTokens = 32768; // default fallback
+
+      if (m.outputTokenLimit && m.outputTokenLimit > 0) {
+        completionTokens = Math.min(m.outputTokenLimit, 128000); // Cap at reasonable limit
+      }
+
       return {
         name: modelName,
         label: `${m.displayName} (${finalContext >= 1000000 ? Math.floor(finalContext / 1000000) + 'M' : Math.floor(finalContext / 1000) + 'k'} context)`,
         provider: this.name,
         maxTokenAllowed: finalContext,
+        maxCompletionTokens: completionTokens,
       };
     });
   }
diff --git a/app/lib/modules/llm/providers/openai.ts b/app/lib/modules/llm/providers/openai.ts
index 7d9f71db8f..66e714248b 100644
--- a/app/lib/modules/llm/providers/openai.ts
+++ b/app/lib/modules/llm/providers/openai.ts
@@ -17,10 +17,16 @@ export default class OpenAIProvider extends BaseProvider {
      * Essential fallback models - only the most stable/reliable ones
      * GPT-4o: 128k context, high performance, recommended for most tasks
      */
-    { name: 'gpt-4o', label: 'GPT-4o', provider: 'OpenAI', maxTokenAllowed: 128000 },
+    { name: 'gpt-4o', label: 'GPT-4o', provider: 'OpenAI', maxTokenAllowed: 128000, maxCompletionTokens: 16384 },
 
     // GPT-3.5-turbo: 16k context, fast and cost-effective
-    { name: 'gpt-3.5-turbo', label: 'GPT-3.5 Turbo', provider: 'OpenAI', maxTokenAllowed: 16000 },
+    {
+      name: 'gpt-3.5-turbo',
+      label: 'GPT-3.5 Turbo',
+      provider: 'OpenAI',
+      maxTokenAllowed: 16000,
+      maxCompletionTokens: 4096,
+    },
   ];
 
   async getDynamicModels(
diff --git a/app/lib/modules/llm/types.ts b/app/lib/modules/llm/types.ts
index 421d6dfc0d..330886deec 100644
--- a/app/lib/modules/llm/types.ts
+++ b/app/lib/modules/llm/types.ts
@@ -5,7 +5,12 @@ export interface ModelInfo {
   name: string;
   label: string;
   provider: string;
+
+  /** Maximum context window size (input tokens) - how many tokens the model can process */
   maxTokenAllowed: number;
+
+  /** Maximum completion/output tokens - how many tokens the model can generate. If not specified, falls back to provider defaults */
+  maxCompletionTokens?: number;
 }
 
 export interface ProviderInfo {
diff --git a/app/routes/api.llmcall.ts b/app/routes/api.llmcall.ts
index 167f9ef13f..64f4d6c605 100644
--- a/app/routes/api.llmcall.ts
+++ b/app/routes/api.llmcall.ts
@@ -3,7 +3,7 @@ import { streamText } from '~/lib/.server/llm/stream-text';
 import type { IProviderSetting, ProviderInfo } from '~/types/model';
 import { generateText } from 'ai';
 import { PROVIDER_LIST } from '~/utils/constants';
-import { MAX_TOKENS } from '~/lib/.server/llm/constants';
+import { MAX_TOKENS, PROVIDER_COMPLETION_LIMITS, isReasoningModel } from '~/lib/.server/llm/constants';
 import { LLMManager } from '~/lib/modules/llm/manager';
 import type { ModelInfo } from '~/lib/modules/llm/types';
 import { getApiKeysFromCookie, getProviderSettingsFromCookie } from '~/lib/api/cookies';
@@ -24,6 +24,23 @@ async function getModelList(options: {
 
 const logger = createScopedLogger('api.llmcall');
 
+function getCompletionTokenLimit(modelDetails: ModelInfo): number {
+  // 1. If model specifies completion tokens, use that
+  if (modelDetails.maxCompletionTokens && modelDetails.maxCompletionTokens > 0) {
+    return modelDetails.maxCompletionTokens;
+  }
+
+  // 2. Use provider-specific default
+  const providerDefault = PROVIDER_COMPLETION_LIMITS[modelDetails.provider];
+
+  if (providerDefault) {
+    return providerDefault;
+  }
+
+  // 3. Final fallback to MAX_TOKENS, but cap at reasonable limit for safety
+  return Math.min(MAX_TOKENS, 16384);
+}
+
 async function llmCallAction({ context, request }: ActionFunctionArgs) {
   const { system, message, model, provider, streamOutput } = await request.json<{
     system: string;
@@ -101,7 +118,7 @@
       throw new Error('Model not found');
     }
 
    const dynamicMaxTokens = modelDetails && modelDetails.maxTokenAllowed ? modelDetails.maxTokenAllowed : MAX_TOKENS;
+    const dynamicMaxTokens = modelDetails ? getCompletionTokenLimit(modelDetails) : Math.min(MAX_TOKENS, 16384);
 
     const providerInfo = PROVIDER_LIST.find((p) => p.name === provider.name);
 
@@ -111,11 +128,19 @@
 
     logger.info(`Generating response Provider: ${provider.name}, Model: ${modelDetails.name}`);
 
-    const result = await generateText({
+    // DEBUG: Log reasoning model detection
+    const isReasoning = isReasoningModel(modelDetails.name);
+    logger.info(`DEBUG: Model "${modelDetails.name}" detected as reasoning model: ${isReasoning}`);
+
+    // Use maxCompletionTokens for reasoning models (o1, GPT-5), maxTokens for traditional models
+    const tokenParams = isReasoning ? { maxCompletionTokens: dynamicMaxTokens } : { maxTokens: dynamicMaxTokens };
+
+    // Filter out unsupported parameters for reasoning models
+    const baseParams = {
       system,
       messages: [
         {
-          role: 'user',
+          role: 'user' as const,
          content: `${message}`,
        },
      ],
@@ -125,9 +150,36 @@
        apiKeys,
        providerSettings,
      }),
-      maxTokens: dynamicMaxTokens,
-      toolChoice: 'none',
-    });
+      ...tokenParams,
+      toolChoice: 'none' as const,
+    };
+
+    // For reasoning models, set temperature to 1 (required by OpenAI API)
+    const finalParams = isReasoning
+      ? { ...baseParams, temperature: 1 } // Set to 1 for reasoning models (only supported value)
+      : { ...baseParams, temperature: 0 };
+
+    // DEBUG: Log final parameters
+    logger.info(
+      `DEBUG: Final params for model "${modelDetails.name}":`,
+      JSON.stringify(
+        {
+          isReasoning,
+          hasTemperature: 'temperature' in finalParams,
+          hasMaxTokens: 'maxTokens' in finalParams,
+          hasMaxCompletionTokens: 'maxCompletionTokens' in finalParams,
+          paramKeys: Object.keys(finalParams).filter((key) => !['model', 'messages', 'system'].includes(key)),
+          tokenParams,
+          finalParams: Object.fromEntries(
+            Object.entries(finalParams).filter(([key]) => !['model', 'messages', 'system'].includes(key)),
+          ),
+        },
+        null,
+        2,
+      ),
+    );
+
+    const result = await generateText(finalParams);
 
     logger.info(`Generated response`);
     return new Response(JSON.stringify(result), {