8 changes: 5 additions & 3 deletions src/api/providers/__tests__/openrouter.spec.ts
@@ -98,9 +98,11 @@ describe("OpenRouterHandler", () => {
 		})

 		const result = await handler.fetchModel()
-		expect(result.maxTokens).toBe(128000) // Use actual implementation value
-		expect(result.reasoningBudget).toBeUndefined() // Use actual implementation value
-		expect(result.temperature).toBe(0) // Use actual implementation value
+		// With the new clamping logic, 128000 tokens (64% of 200000 context window)
+		// gets clamped to 20% of context window: 200000 * 0.2 = 40000
+		expect(result.maxTokens).toBe(40000)
+		expect(result.reasoningBudget).toBeUndefined()
+		expect(result.temperature).toBe(0)
 	})

 	it("does not honor custom maxTokens for non-thinking models", async () => {
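The updated expectation is just the clamp formula from src/shared/api.ts (shown at the bottom of this diff) applied to the mocked model; as a quick check:

	// The OpenRouter case above, as bare arithmetic:
	// 128_000 is 64% of the 200_000 context window, so the clamp applies.
	Math.min(128_000, 200_000 * 0.2) // => 40_000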
11 changes: 6 additions & 5 deletions src/api/transform/__tests__/model-params.spec.ts
@@ -293,12 +293,12 @@ describe("getModelParams", () => {
 	it("should not honor customMaxThinkingTokens for non-reasoning budget models", () => {
 		const model: ModelInfo = {
 			...baseModel,
-			maxTokens: 4000,
+			maxTokens: 3000, // 3000 is 18.75% of 16000 context window, within 20% threshold
 		}

 		expect(getModelParams({ ...anthropicParams, settings: { modelMaxThinkingTokens: 1500 }, model })).toEqual({
 			format: anthropicParams.format,
-			maxTokens: 4000,
+			maxTokens: 3000, // Uses model.maxTokens since it's within 20% threshold
 			temperature: 0, // Using default temperature.
 			reasoningEffort: undefined,
 			reasoningBudget: undefined, // Should remain undefined despite customMaxThinkingTokens being set.
@@ -565,7 +565,7 @@ describe("getModelParams", () => {
 	it("should use reasoningEffort if supportsReasoningEffort is false but reasoningEffort is set", () => {
 		const model: ModelInfo = {
 			...baseModel,
-			maxTokens: 8000,
+			maxTokens: 3000, // Changed to 3000 (18.75% of 16000), which is within 20% threshold
 			supportsReasoningEffort: false,
 			reasoningEffort: "medium",
 		}
@@ -576,7 +576,7 @@
 			model,
 		})

-		expect(result.maxTokens).toBe(8000)
+		expect(result.maxTokens).toBe(3000) // Now uses model.maxTokens since it's within 20% threshold
 		expect(result.reasoningEffort).toBe("medium")
 	})
 })
@@ -595,7 +595,8 @@ describe("getModelParams", () => {
 			model,
 		})

-		// Should discard model's maxTokens and use default
+		// For hybrid models (supportsReasoningBudget) in Anthropic contexts,
+		// should discard model's maxTokens and use ANTHROPIC_DEFAULT_MAX_TOKENS
 		expect(result.maxTokens).toBe(ANTHROPIC_DEFAULT_MAX_TOKENS)
 		expect(result.reasoningBudget).toBeUndefined()
 	})
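Both maxTokens edits in this file pick 3000 because it sits under the clamp threshold, while the hybrid-model test exercises a different branch entirely. A rough sketch of the two outcomes, following the behavior described in the test comments above:

	// 3_000 / 16_000 = 18.75%, which is <= 20%, so the clamp is a no-op:
	Math.min(3_000, 16_000 * 0.2) // => 3_000 (model.maxTokens survives)

	// Hybrid models (supportsReasoningBudget) in Anthropic contexts never reach
	// the clamp: per the test comment, they discard model.maxTokens and use
	// ANTHROPIC_DEFAULT_MAX_TOKENS instead.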
69 changes: 66 additions & 3 deletions src/shared/__tests__/api.spec.ts
@@ -25,11 +25,13 @@ describe("getModelMaxOutputTokens", () => {
 		expect(result).toBe(16384)
 	})

-	test("should return model maxTokens when not using claude-code provider", () => {
+	test("should return model maxTokens when not using claude-code provider and maxTokens is within 20% of context window", () => {
 		const settings: ProviderSettings = {
 			apiProvider: "anthropic",
 		}

+		// mockModel has maxTokens: 8192 and contextWindow: 200000
+		// 8192 is 4.096% of 200000, which is <= 20%, so it should use model.maxTokens
 		const result = getModelMaxOutputTokens({
 			modelId: "claude-3-5-sonnet-20241022",
 			model: mockModel,
@@ -115,7 +117,7 @@
 			contextWindow: 1_048_576,
 			supportsPromptCache: false,
 			supportsReasoningBudget: true,
-			maxTokens: 65_535,
+			maxTokens: 65_535, // 65_535 is ~6.25% of 1_048_576, which is <= 20%
 		}

 		const settings: ProviderSettings = {
@@ -124,7 +126,68 @@
 		}

 		const result = getModelMaxOutputTokens({ modelId: geminiModelId, model, settings })
-		expect(result).toBe(65_535) // Should use model.maxTokens, not ANTHROPIC_DEFAULT_MAX_TOKENS
+		expect(result).toBe(65_535) // Should use model.maxTokens since it's within 20% threshold
 	})

+	test("should clamp maxTokens to 20% of context window when maxTokens exceeds threshold", () => {
+		const model: ModelInfo = {
+			contextWindow: 100_000,
+			supportsPromptCache: false,
+			maxTokens: 50_000, // 50% of context window, exceeds 20% threshold
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "openai",
+		}
+
+		const result = getModelMaxOutputTokens({
+			modelId: "gpt-4",
+			model,
+			settings,
+			format: "openai",
+		})
+		// Should clamp to 20% of context window: 100_000 * 0.2 = 20_000
+		expect(result).toBe(20_000)
+	})
+
+	test("should clamp maxTokens to 20% of context window for Anthropic models when maxTokens exceeds threshold", () => {
+		const model: ModelInfo = {
+			contextWindow: 100_000,
+			supportsPromptCache: true,
+			maxTokens: 50_000, // 50% of context window, exceeds 20% threshold
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "anthropic",
+		}
+
+		const result = getModelMaxOutputTokens({
+			modelId: "claude-3-5-sonnet-20241022",
+			model,
+			settings,
+		})
+		// Should clamp to 20% of context window: 100_000 * 0.2 = 20_000
+		expect(result).toBe(20_000)
+	})
+
+	test("should use model.maxTokens when exactly at 20% threshold", () => {
+		const model: ModelInfo = {
+			contextWindow: 100_000,
+			supportsPromptCache: false,
+			maxTokens: 20_000, // Exactly 20% of context window
+		}
+
+		const settings: ProviderSettings = {
+			apiProvider: "openai",
+		}
+
+		const result = getModelMaxOutputTokens({
+			modelId: "gpt-4",
+			model,
+			settings,
+			format: "openai",
+		})
+		expect(result).toBe(20_000) // Should use model.maxTokens since it's exactly at 20%
+	})

 	test("should return modelMaxTokens from settings when reasoning budget is required", () => {
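Together, the three new tests pin the clamp's boundary behavior; condensed into the underlying arithmetic (a sketch, not test code):

	// Exceeds the threshold: 50% of the window, clamped down to 20%.
	Math.min(50_000, 100_000 * 0.2) // => 20_000

	// Exactly at the threshold: Math.min ties in favor of model.maxTokens,
	// so the result is indistinguishable from "no clamp applied".
	Math.min(20_000, 100_000 * 0.2) // => 20_000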
6 changes: 3 additions & 3 deletions src/shared/api.ts
@@ -90,9 +90,9 @@ export const getModelMaxOutputTokens = ({
 		return ANTHROPIC_DEFAULT_MAX_TOKENS
 	}

-	// If model has explicit maxTokens and it's not the full context window, use it
-	if (model.maxTokens && model.maxTokens !== model.contextWindow) {
-		return model.maxTokens
+	// If model has explicit maxTokens, clamp it to 20% of the context window
+	if (model.maxTokens) {
+		return Math.min(model.maxTokens, model.contextWindow * 0.2)
 	}

 	// For non-Anthropic formats without explicit maxTokens, return undefined

Contributor comment on `if (model.maxTokens) {`:
I noticed the previous `maxTokens !== contextWindow` check was removed. Is the current behavior intentional for models where maxTokens equals contextWindow? They would now be clamped to 20% as well. Could we add a test case to verify this edge-case behavior is as expected?

Contributor comment on `return Math.min(model.maxTokens, model.contextWindow * 0.2)`:
Is this magic number intentional? Would it be beneficial to make it configurable, or at least define it as a named constant, for better maintainability? Also, consider adding a comment explaining why 20% was chosen as the threshold to help future developers understand the reasoning.
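Both review comments point in the same direction. A hedged sketch of what the suggested refactor could look like — the constant name, comment wording, and rationale are assumptions, since the thread does not spell them out:

	// Hypothetical refactor of the new branch in getModelMaxOutputTokens.
	// MAX_TOKENS_CONTEXT_RATIO is a placeholder; the reviewer's suggested name
	// was elided in the thread. The rationale comment is likewise an assumption.
	const MAX_TOKENS_CONTEXT_RATIO = 0.2 // cap output so most of the window stays available for input

	function clampToContextRatio(maxTokens: number, contextWindow: number): number {
		// Edge case the first comment raises: a model advertising
		// maxTokens === contextWindow is clamped too, e.g.
		// clampToContextRatio(200_000, 200_000) => 40_000.
		return Math.min(maxTokens, contextWindow * MAX_TOKENS_CONTEXT_RATIO)
	}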