38 changes: 38 additions & 0 deletions app/lib/.server/llm/constants.ts
@@ -4,6 +4,44 @@
*/
export const MAX_TOKENS = 32000;

/*
* Provider-specific default completion token limits
* Used as fallbacks when model doesn't specify maxCompletionTokens
*/
export const PROVIDER_COMPLETION_LIMITS: Record<string, number> = {
OpenAI: 16384,
Github: 16384, // GitHub Models use OpenAI-compatible limits
Anthropic: 128000,
Google: 32768,
Cohere: 4000,
DeepSeek: 8192,
Groq: 8192,
HuggingFace: 4096,
Mistral: 8192,
Ollama: 8192,
OpenRouter: 8192,
Perplexity: 8192,
Together: 8192,
xAI: 8192,
LMStudio: 8192,
OpenAILike: 8192,
AmazonBedrock: 8192,
Hyperbolic: 8192,
};

/*
* Reasoning models that require maxCompletionTokens instead of maxTokens
* These models use internal reasoning tokens and have different API parameter requirements
*/
export function isReasoningModel(modelName: string): boolean {
const result = /^(o1|o3|gpt-5)/i.test(modelName);

// DEBUG: Test regex matching
console.log(`REGEX TEST: "${modelName}" matches reasoning pattern: ${result}`);

return result;
}

// limits the number of model responses that can be returned in a single request
export const MAX_RESPONSE_SEGMENTS = 2;
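
The two additions above are meant to be consumed together: PROVIDER_COMPLETION_LIMITS supplies a per-provider fallback for the output-token budget, and isReasoningModel decides which API parameter name that budget is sent under. A minimal sketch of how a caller could combine them (the pickTokenParams helper and its example inputs are illustrative, not part of this PR):

// Sketch only: mirrors the fallback chain used downstream in stream-text.ts.
import { MAX_TOKENS, PROVIDER_COMPLETION_LIMITS, isReasoningModel } from './constants';

function pickTokenParams(provider: string, modelName: string, maxCompletionTokens?: number) {
  // explicit model limit -> provider default -> MAX_TOKENS capped at 16384
  const limit = maxCompletionTokens ?? PROVIDER_COMPLETION_LIMITS[provider] ?? Math.min(MAX_TOKENS, 16384);

  // o1/o3/gpt-5 prefixes take maxCompletionTokens; everything else takes maxTokens
  return isReasoningModel(modelName) ? { maxCompletionTokens: limit } : { maxTokens: limit };
}

// pickTokenParams('OpenAI', 'o1-mini')                     -> { maxCompletionTokens: 16384 }
// pickTokenParams('Anthropic', 'claude-3-5-sonnet-latest') -> { maxTokens: 128000 }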

99 changes: 91 additions & 8 deletions app/lib/.server/llm/stream-text.ts
@@ -1,5 +1,5 @@
import { convertToCoreMessages, streamText as _streamText, type Message } from 'ai';
import { MAX_TOKENS, type FileMap } from './constants';
import { MAX_TOKENS, PROVIDER_COMPLETION_LIMITS, isReasoningModel, type FileMap } from './constants';
import { getSystemPrompt } from '~/lib/common/prompts/prompts';
import { DEFAULT_MODEL, DEFAULT_PROVIDER, MODIFICATIONS_TAG_NAME, PROVIDER_LIST, WORK_DIR } from '~/utils/constants';
import type { IProviderSetting } from '~/types/model';
@@ -26,6 +26,23 @@ export interface StreamingOptions extends Omit<Parameters<typeof _streamText>[0]

const logger = createScopedLogger('stream-text');

function getCompletionTokenLimit(modelDetails: any): number {
// 1. If model specifies completion tokens, use that
if (modelDetails.maxCompletionTokens && modelDetails.maxCompletionTokens > 0) {
return modelDetails.maxCompletionTokens;
}

// 2. Use provider-specific default
const providerDefault = PROVIDER_COMPLETION_LIMITS[modelDetails.provider];

if (providerDefault) {
return providerDefault;
}

// 3. Final fallback to MAX_TOKENS, but cap at reasonable limit for safety
return Math.min(MAX_TOKENS, 16384);
}

function sanitizeText(text: string): string {
let sanitized = text.replace(/<div class=\\"__boltThought__\\">.*?<\/div>/s, '');
sanitized = sanitized.replace(/<think>.*?<\/think>/s, '');
@@ -123,10 +140,10 @@ export async function streamText(props: {
}
}

const dynamicMaxTokens = modelDetails && modelDetails.maxTokenAllowed ? modelDetails.maxTokenAllowed : MAX_TOKENS;
const dynamicMaxTokens = modelDetails ? getCompletionTokenLimit(modelDetails) : Math.min(MAX_TOKENS, 16384);

// Ensure we never exceed reasonable token limits to prevent API errors
const safeMaxTokens = Math.min(dynamicMaxTokens, 100000); // Cap at 100k for safety
// Additional safety cap - should not be needed with proper completion limits, but kept for safety
const safeMaxTokens = Math.min(dynamicMaxTokens, 128000);

logger.info(
`Max tokens for model ${modelDetails.name} is ${safeMaxTokens} (capped from ${dynamicMaxTokens}) based on model limits`,
@@ -204,18 +221,84 @@ export async function streamText(props: {

logger.info(`Sending llm call to ${provider.name} with model ${modelDetails.name}`);

// DEBUG: Log reasoning model detection
const isReasoning = isReasoningModel(modelDetails.name);
logger.info(`DEBUG STREAM: Model "${modelDetails.name}" detected as reasoning model: ${isReasoning}`);

// console.log(systemPrompt, processedMessages);

return await _streamText({
// Use maxCompletionTokens for reasoning models (o1, GPT-5), maxTokens for traditional models
const tokenParams = isReasoning ? { maxCompletionTokens: safeMaxTokens } : { maxTokens: safeMaxTokens };

// Filter out unsupported parameters for reasoning models
const filteredOptions =
isReasoning && options
? Object.fromEntries(
Object.entries(options).filter(
([key]) =>
![
'temperature',
'topP',
'presencePenalty',
'frequencyPenalty',
'logprobs',
'topLogprobs',
'logitBias',
].includes(key),
),
)
: options || {};

// DEBUG: Log filtered options
logger.info(
`DEBUG STREAM: Options filtering for model "${modelDetails.name}":`,
JSON.stringify(
{
isReasoning,
originalOptions: options || {},
filteredOptions,
originalOptionsKeys: options ? Object.keys(options) : [],
filteredOptionsKeys: Object.keys(filteredOptions),
removedParams: options ? Object.keys(options).filter((key) => !(key in filteredOptions)) : [],
},
null,
2,
),
);

const streamParams = {
model: provider.getModelInstance({
model: modelDetails.name,
serverEnv,
apiKeys,
providerSettings,
}),
system: chatMode === 'build' ? systemPrompt : discussPrompt(),
maxTokens: safeMaxTokens,
...tokenParams,
messages: convertToCoreMessages(processedMessages as any),
...options,
});
...filteredOptions,

// Set temperature to 1 for reasoning models (required by OpenAI API)
...(isReasoning ? { temperature: 1 } : {}),
};

// DEBUG: Log final streaming parameters
logger.info(
`DEBUG STREAM: Final streaming params for model "${modelDetails.name}":`,
JSON.stringify(
{
hasTemperature: 'temperature' in streamParams,
hasMaxTokens: 'maxTokens' in streamParams,
hasMaxCompletionTokens: 'maxCompletionTokens' in streamParams,
paramKeys: Object.keys(streamParams).filter((key) => !['model', 'messages', 'system'].includes(key)),
streamParams: Object.fromEntries(
Object.entries(streamParams).filter(([key]) => !['model', 'messages', 'system'].includes(key)),
),
},
null,
2,
),
);

return await _streamText(streamParams);
}
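
Condensed, the reasoning-model handling above reduces to the sketch below. It is not the literal implementation, just the observable request shape for the two model classes, reusing isReasoningModel from ./constants and assuming safeMaxTokens has already been computed as in this file:

// Sketch of the effective streamText parameters for reasoning vs. traditional models.
type CallerOptions = Record<string, unknown>;

const UNSUPPORTED_FOR_REASONING = [
  'temperature',
  'topP',
  'presencePenalty',
  'frequencyPenalty',
  'logprobs',
  'topLogprobs',
  'logitBias',
];

function effectiveParams(modelName: string, safeMaxTokens: number, options: CallerOptions = {}) {
  if (!isReasoningModel(modelName)) {
    // Traditional models: caller options pass through untouched and the budget goes out as maxTokens.
    return { maxTokens: safeMaxTokens, ...options };
  }

  // Reasoning models: strip the sampling parameters they do not accept, send the budget as
  // maxCompletionTokens, and pin temperature to 1 as the API requires.
  const filtered = Object.fromEntries(
    Object.entries(options).filter(([key]) => !UNSUPPORTED_FOR_REASONING.includes(key)),
  );

  return { maxCompletionTokens: safeMaxTokens, ...filtered, temperature: 1 };
}

// effectiveParams('o1-mini', 8192, { temperature: 0.2, topP: 0.9 })
//   -> { maxCompletionTokens: 8192, temperature: 1 }
// effectiveParams('gpt-4o', 16384, { temperature: 0.2 })
//   -> { maxTokens: 16384, temperature: 0.2 }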
3 changes: 3 additions & 0 deletions app/lib/modules/llm/providers/anthropic.ts
@@ -22,6 +22,7 @@ export default class AnthropicProvider extends BaseProvider {
label: 'Claude 3.5 Sonnet',
provider: 'Anthropic',
maxTokenAllowed: 200000,
maxCompletionTokens: 128000,
},

// Claude 3 Haiku: 200k context, fastest and most cost-effective
@@ -30,6 +31,7 @@ export default class AnthropicProvider extends BaseProvider {
label: 'Claude 3 Haiku',
provider: 'Anthropic',
maxTokenAllowed: 200000,
maxCompletionTokens: 128000,
},
];

@@ -84,6 +86,7 @@
label: `${m.display_name} (${Math.floor(contextWindow / 1000)}k context)`,
provider: this.name,
maxTokenAllowed: contextWindow,
maxCompletionTokens: 128000, // Claude models support up to 128k completion tokens
};
});
}
32 changes: 25 additions & 7 deletions app/lib/modules/llm/providers/github.ts
@@ -14,13 +14,31 @@ export default class GithubProvider extends BaseProvider {

// find more in https://github.com/marketplace?type=models
staticModels: ModelInfo[] = [
{ name: 'gpt-4o', label: 'GPT-4o', provider: 'Github', maxTokenAllowed: 8000 },
{ name: 'o1', label: 'o1-preview', provider: 'Github', maxTokenAllowed: 100000 },
{ name: 'o1-mini', label: 'o1-mini', provider: 'Github', maxTokenAllowed: 8000 },
{ name: 'gpt-4o-mini', label: 'GPT-4o Mini', provider: 'Github', maxTokenAllowed: 8000 },
{ name: 'gpt-4-turbo', label: 'GPT-4 Turbo', provider: 'Github', maxTokenAllowed: 8000 },
{ name: 'gpt-4', label: 'GPT-4', provider: 'Github', maxTokenAllowed: 8000 },
{ name: 'gpt-3.5-turbo', label: 'GPT-3.5 Turbo', provider: 'Github', maxTokenAllowed: 8000 },
{ name: 'gpt-4o', label: 'GPT-4o', provider: 'Github', maxTokenAllowed: 128000, maxCompletionTokens: 16384 },
{ name: 'o1', label: 'o1-preview', provider: 'Github', maxTokenAllowed: 100000, maxCompletionTokens: 16384 },
{ name: 'o1-mini', label: 'o1-mini', provider: 'Github', maxTokenAllowed: 65536, maxCompletionTokens: 8192 },
{
name: 'gpt-4o-mini',
label: 'GPT-4o Mini',
provider: 'Github',
maxTokenAllowed: 128000,
maxCompletionTokens: 16384,
},
{
name: 'gpt-4-turbo',
label: 'GPT-4 Turbo',
provider: 'Github',
maxTokenAllowed: 128000,
maxCompletionTokens: 8192,
},
{ name: 'gpt-4', label: 'GPT-4', provider: 'Github', maxTokenAllowed: 8192, maxCompletionTokens: 8192 },
{
name: 'gpt-3.5-turbo',
label: 'GPT-3.5 Turbo',
provider: 'Github',
maxTokenAllowed: 16385,
maxCompletionTokens: 4096,
},
];

getModelInstance(options: {
24 changes: 22 additions & 2 deletions app/lib/modules/llm/providers/google.ts
@@ -17,10 +17,22 @@ export default class GoogleProvider extends BaseProvider {
* Essential fallback models - only the most reliable/stable ones
* Gemini 1.5 Pro: 2M context, excellent for complex reasoning and large codebases
*/
{ name: 'gemini-1.5-pro', label: 'Gemini 1.5 Pro', provider: 'Google', maxTokenAllowed: 2000000 },
{
name: 'gemini-1.5-pro',
label: 'Gemini 1.5 Pro',
provider: 'Google',
maxTokenAllowed: 2000000,
maxCompletionTokens: 32768,
},

// Gemini 1.5 Flash: 1M context, fast and cost-effective
{ name: 'gemini-1.5-flash', label: 'Gemini 1.5 Flash', provider: 'Google', maxTokenAllowed: 1000000 },
{
name: 'gemini-1.5-flash',
label: 'Gemini 1.5 Flash',
provider: 'Google',
maxTokenAllowed: 1000000,
maxCompletionTokens: 32768,
},
];

async getDynamicModels(
@@ -89,11 +101,19 @@
const maxAllowed = 2000000; // 2M tokens max
const finalContext = Math.min(contextWindow, maxAllowed);

// Get completion token limit from Google API
let completionTokens = 32768; // default fallback

if (m.outputTokenLimit && m.outputTokenLimit > 0) {
completionTokens = Math.min(m.outputTokenLimit, 128000); // Cap at reasonable limit
}

return {
name: modelName,
label: `${m.displayName} (${finalContext >= 1000000 ? Math.floor(finalContext / 1000000) + 'M' : Math.floor(finalContext / 1000) + 'k'} context)`,
provider: this.name,
maxTokenAllowed: finalContext,
maxCompletionTokens: completionTokens,
};
});
}
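
For the dynamically listed Google models, the completion budget is taken from the outputTokenLimit field the models API reports, clamped for safety; condensed, the logic above behaves like this (the numbers are illustrative, not real API responses):

// Sketch of the outputTokenLimit handling above.
function resolveGoogleCompletionTokens(outputTokenLimit?: number): number {
  const DEFAULT_COMPLETION = 32768; // fallback when the API omits the field
  const COMPLETION_CAP = 128000; // upper bound kept as a safety limit

  return outputTokenLimit && outputTokenLimit > 0
    ? Math.min(outputTokenLimit, COMPLETION_CAP)
    : DEFAULT_COMPLETION;
}

// resolveGoogleCompletionTokens(8192)      -> 8192
// resolveGoogleCompletionTokens(undefined) -> 32768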
10 changes: 8 additions & 2 deletions app/lib/modules/llm/providers/openai.ts
@@ -17,10 +17,16 @@ export default class OpenAIProvider extends BaseProvider {
* Essential fallback models - only the most stable/reliable ones
* GPT-4o: 128k context, high performance, recommended for most tasks
*/
{ name: 'gpt-4o', label: 'GPT-4o', provider: 'OpenAI', maxTokenAllowed: 128000 },
{ name: 'gpt-4o', label: 'GPT-4o', provider: 'OpenAI', maxTokenAllowed: 128000, maxCompletionTokens: 16384 },

// GPT-3.5-turbo: 16k context, fast and cost-effective
{ name: 'gpt-3.5-turbo', label: 'GPT-3.5 Turbo', provider: 'OpenAI', maxTokenAllowed: 16000 },
{
name: 'gpt-3.5-turbo',
label: 'GPT-3.5 Turbo',
provider: 'OpenAI',
maxTokenAllowed: 16000,
maxCompletionTokens: 4096,
},
];

async getDynamicModels(
5 changes: 5 additions & 0 deletions app/lib/modules/llm/types.ts
@@ -5,7 +5,12 @@ export interface ModelInfo {
name: string;
label: string;
provider: string;

/** Maximum context window size (input tokens) - how many tokens the model can process */
maxTokenAllowed: number;

/** Maximum completion/output tokens - how many tokens the model can generate. If not specified, falls back to provider defaults */
maxCompletionTokens?: number;
}
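
With both fields present, a model entry now distinguishes the context window it can read from the output budget it may generate. For example, the GPT-4o entry from the OpenAI static list in this PR:

// Example ModelInfo using the new optional field.
const gpt4o: ModelInfo = {
  name: 'gpt-4o',
  label: 'GPT-4o',
  provider: 'OpenAI',
  maxTokenAllowed: 128000, // context window (input side)
  maxCompletionTokens: 16384, // output budget; omit it to fall back to PROVIDER_COMPLETION_LIMITS
};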

export interface ProviderInfo {