4 changes: 2 additions & 2 deletions src/api/providers/__tests__/openrouter.spec.ts
@@ -99,8 +99,8 @@ describe("OpenRouterHandler", () => {

 		const result = await handler.fetchModel()
 		// With the new clamping logic, 128000 tokens (64% of 200000 context window)
-		// gets clamped to 20% of context window: 200000 * 0.2 = 40000
-		expect(result.maxTokens).toBe(40000)
+		// is below the 80% threshold, so it should not be clamped
+		expect(result.maxTokens).toBe(128000)
 		expect(result.reasoningBudget).toBeUndefined()
 		expect(result.temperature).toBe(0)
 	})
8 changes: 4 additions & 4 deletions src/api/transform/__tests__/model-params.spec.ts
@@ -293,12 +293,12 @@ describe("getModelParams", () => {
it("should not honor customMaxThinkingTokens for non-reasoning budget models", () => {
const model: ModelInfo = {
...baseModel,
maxTokens: 3000, // 3000 is 18.75% of 16000 context window, within 20% threshold
maxTokens: 3000, // 3000 is 18.75% of 16000 context window, within 80% threshold
}

expect(getModelParams({ ...anthropicParams, settings: { modelMaxThinkingTokens: 1500 }, model })).toEqual({
format: anthropicParams.format,
maxTokens: 3000, // Uses model.maxTokens since it's within 20% threshold
maxTokens: 3000, // Uses model.maxTokens since it's within 80% threshold
temperature: 0, // Using default temperature.
reasoningEffort: undefined,
reasoningBudget: undefined, // Should remain undefined despite customMaxThinkingTokens being set.
@@ -565,7 +565,7 @@ describe("getModelParams", () => {
it("should use reasoningEffort if supportsReasoningEffort is false but reasoningEffort is set", () => {
const model: ModelInfo = {
...baseModel,
maxTokens: 3000, // Changed to 3000 (18.75% of 16000), which is within 20% threshold
maxTokens: 3000, // Changed to 3000 (18.75% of 16000), which is within 80% threshold
supportsReasoningEffort: false,
reasoningEffort: "medium",
}
@@ -576,7 +576,7 @@
 			model,
 		})

-		expect(result.maxTokens).toBe(3000) // Now uses model.maxTokens since it's within 20% threshold
+		expect(result.maxTokens).toBe(3000) // Now uses model.maxTokens since it's within 80% threshold
 		expect(result.reasoningEffort).toBe("medium")
 	})
 })
30 changes: 15 additions & 15 deletions src/shared/__tests__/api.spec.ts
@@ -25,13 +25,13 @@ describe("getModelMaxOutputTokens", () => {
 		expect(result).toBe(16384)
 	})

-	test("should return model maxTokens when not using claude-code provider and maxTokens is within 20% of context window", () => {
+	test("should return model maxTokens when not using claude-code provider and maxTokens is within 80% of context window", () => {
 		const settings: ProviderSettings = {
 			apiProvider: "anthropic",
 		}

 		// mockModel has maxTokens: 8192 and contextWindow: 200000
-		// 8192 is 4.096% of 200000, which is <= 20%, so it should use model.maxTokens
+		// 8192 is 4.096% of 200000, which is <= 80%, so it should use model.maxTokens
 		const result = getModelMaxOutputTokens({
 			modelId: "claude-3-5-sonnet-20241022",
 			model: mockModel,
@@ -117,7 +117,7 @@ describe("getModelMaxOutputTokens", () => {
 			contextWindow: 1_048_576,
 			supportsPromptCache: false,
 			supportsReasoningBudget: true,
-			maxTokens: 65_535, // 65_535 is ~6.25% of 1_048_576, which is <= 20%
+			maxTokens: 65_535, // 65_535 is ~6.25% of 1_048_576, which is <= 80%
 		}

 		const settings: ProviderSettings = {
@@ -126,14 +126,14 @@
 		}

 		const result = getModelMaxOutputTokens({ modelId: geminiModelId, model, settings })
-		expect(result).toBe(65_535) // Should use model.maxTokens since it's within 20% threshold
+		expect(result).toBe(65_535) // Should use model.maxTokens since it's within 80% threshold
 	})

-	test("should clamp maxTokens to 20% of context window when maxTokens exceeds threshold", () => {
+	test("should clamp maxTokens to 80% of context window when maxTokens exceeds threshold", () => {
 		const model: ModelInfo = {
 			contextWindow: 100_000,
 			supportsPromptCache: false,
-			maxTokens: 50_000, // 50% of context window, exceeds 20% threshold
+			maxTokens: 90_000, // 90% of context window, exceeds 80% threshold
 		}

 		const settings: ProviderSettings = {
@@ -146,15 +146,15 @@
 			settings,
 			format: "openai",
 		})
-		// Should clamp to 20% of context window: 100_000 * 0.2 = 20_000
-		expect(result).toBe(20_000)
+		// Should clamp to 80% of context window: 100_000 * 0.8 = 80_000
+		expect(result).toBe(80_000)
 	})

-	test("should clamp maxTokens to 20% of context window for Anthropic models when maxTokens exceeds threshold", () => {
+	test("should clamp maxTokens to 80% of context window for Anthropic models when maxTokens exceeds threshold", () => {
 		const model: ModelInfo = {
 			contextWindow: 100_000,
 			supportsPromptCache: true,
-			maxTokens: 50_000, // 50% of context window, exceeds 20% threshold
+			maxTokens: 90_000, // 90% of context window, exceeds 80% threshold
 		}

 		const settings: ProviderSettings = {
@@ -166,15 +166,15 @@
 			model,
 			settings,
 		})
-		// Should clamp to 20% of context window: 100_000 * 0.2 = 20_000
-		expect(result).toBe(20_000)
+		// Should clamp to 80% of context window: 100_000 * 0.8 = 80_000
+		expect(result).toBe(80_000)
 	})

-	test("should use model.maxTokens when exactly at 20% threshold", () => {
+	test("should use model.maxTokens when at or below 80% threshold", () => {
 		const model: ModelInfo = {
 			contextWindow: 100_000,
 			supportsPromptCache: false,
-			maxTokens: 20_000, // Exactly 20% of context window
+			maxTokens: 80_000, // Exactly 80% of context window
 		}

 		const settings: ProviderSettings = {
@@ -187,7 +187,7 @@
 			settings,
 			format: "openai",
 		})
-		expect(result).toBe(20_000) // Should use model.maxTokens since it's exactly at 20%
+		expect(result).toBe(80_000) // Should use model.maxTokens since it's at 80%
 	})

Contributor Author:

Would it be helpful to add a specific test case for the GLM-4.5 scenario that triggered this issue? Something like:

Suggested change:

 	})
+	test("should handle GLM-4.5 model with 98,304 tokens out of 131,072 context window", () => {
+		const model: ModelInfo = {
+			contextWindow: 131_072,
+			supportsPromptCache: false,
+			maxTokens: 98_304, // 75% of context window
+		}
+		const settings: ProviderSettings = {
+			apiProvider: "openrouter",
+		}
+		const result = getModelMaxOutputTokens({
+			modelId: "z-ai/glm-4.5",
+			model,
+			settings,
+			format: "openrouter",
+		})
+		expect(result).toBe(98_304) // Should use model.maxTokens since 75% < 80%
+	})


test("should return modelMaxTokens from settings when reasoning budget is required", () => {
Expand Down
11 changes: 9 additions & 2 deletions src/shared/api.ts
@@ -90,9 +90,16 @@ export const getModelMaxOutputTokens = ({
 		return ANTHROPIC_DEFAULT_MAX_TOKENS
 	}

-	// If model has explicit maxTokens, clamp it to 20% of the context window
+	// If model has explicit maxTokens, only clamp it if it exceeds 80% of the context window
+	// This prevents models from using the entire context for output while still allowing

Contributor Author:

The comment is good, but could we be more explicit about why 80% was chosen? Perhaps mention that this leaves approximately 20% for input tokens and system prompts, which is typically sufficient for most use cases?
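
One possible expanded wording for that comment (illustrative only, not text from this PR):

	// Cap output at 80% of the context window so that roughly 20% of the window
	// (about 26k tokens for a 131_072-token window) stays available for the
	// system prompt and the conversation input.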

+	// models with legitimately high output requirements (like GLM-4.5) to function
 	if (model.maxTokens) {
-		return Math.min(model.maxTokens, model.contextWindow * 0.2)
+		// Only apply clamping if maxTokens is more than 80% of context window
+		if (model.maxTokens > model.contextWindow * 0.8) {

Contributor Author:

Consider extracting this magic number to a named constant like MAX_OUTPUT_TOKEN_RATIO = 0.8 for better maintainability. This would make it easier to adjust in the future and clearer about the intent.
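
A minimal sketch of that extraction (the constant name MAX_OUTPUT_TOKEN_RATIO and its placement are assumptions, not code from this PR):

	// Hypothetical named constant; the value mirrors the 0.8 ratio introduced here.
	const MAX_OUTPUT_TOKEN_RATIO = 0.8

	if (model.maxTokens) {
		// Clamp only when maxTokens exceeds the allowed share of the context window.
		if (model.maxTokens > model.contextWindow * MAX_OUTPUT_TOKEN_RATIO) {
			return Math.floor(model.contextWindow * MAX_OUTPUT_TOKEN_RATIO)
		}
		return model.maxTokens
	}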

+			// Clamp to 80% to leave room for input
+			return Math.floor(model.contextWindow * 0.8)

Contributor Author:

Is using Math.floor() here intentional? With a context window of 131,072, this gives 104,857 tokens instead of 104,858. While minor, would Math.ceil() or Math.round() better maximize available tokens for edge cases?
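
For reference, the rounding difference described above (plain arithmetic, not code from this PR):

	Math.floor(131_072 * 0.8) // 104_857
	Math.ceil(131_072 * 0.8) // 104_858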

+		}
+		return model.maxTokens
 	}

 	// For non-Anthropic formats without explicit maxTokens, return undefined
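
For the GLM-4.5 numbers that prompted this change (98_304 maxTokens, 131_072 context window, per the review comment above), the new branch works out as follows; this is a worked sketch, not code from the diff:

	// 131_072 * 0.8 = 104_857.6, and 98_304 <= 104_857.6, so maxTokens is returned unchanged.
	// Under the previous rule, Math.min(98_304, 131_072 * 0.2) would have capped it at ~26_214.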