
Commit 7f1c6d7

Merge branch 'cte/move-model-fetchers' into cte/openrouter-claude-thinking
2 parents: 392a237 + 50ce955

7 files changed: +183 additions, −133 deletions

src/api/providers/glama.ts

Lines changed: 13 additions & 2 deletions

@@ -71,7 +71,7 @@ export class GlamaHandler implements ApiHandler, SingleCompletionHandler {
     let maxTokens: number | undefined

     if (this.getModel().id.startsWith("anthropic/")) {
-      maxTokens = 8_192
+      maxTokens = this.getModel().info.maxTokens
     }

     const requestOptions: OpenAI.Chat.ChatCompletionCreateParams = {
@@ -179,7 +179,7 @@ export class GlamaHandler implements ApiHandler, SingleCompletionHandler {
     }

     if (this.getModel().id.startsWith("anthropic/")) {
-      requestOptions.max_tokens = 8192
+      requestOptions.max_tokens = this.getModel().info.maxTokens
     }

     const response = await this.client.chat.completions.create(requestOptions)
@@ -214,6 +214,17 @@ export async function getGlamaModels() {
         cacheReadsPrice: parseApiPrice(rawModel.pricePerToken?.cacheRead),
       }

+      switch (rawModel.id) {
+        case rawModel.id.startsWith("anthropic/claude-3-7-sonnet"):
+          modelInfo.maxTokens = 16384
+          break
+        case rawModel.id.startsWith("anthropic/"):
+          modelInfo.maxTokens = 8192
+          break
+        default:
+          break
+      }
+
       models[rawModel.id] = modelInfo
     }
   } catch (error) {
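
Note on the change above: the per-model output ceiling now lives in the fetched model metadata, and the handler reads it back through this.getModel().info.maxTokens instead of pinning 8_192. One caveat: getGlamaModels (and getRequestyModels below) switch on rawModel.id while each case expression is a boolean from startsWith, so no case can strictly equal the string id and the default branch always runs; the switch (true) form used in openrouter.ts and unbound.ts is the variant that dispatches on the prefix as written. A minimal sketch of that idiom as a standalone hypothetical helper (applyAnthropicMaxTokens and the trimmed ModelInfo shape are illustrations, not code from this commit):

// Hypothetical helper illustrating the `switch (true)` prefix dispatch used by the
// openrouter.ts and unbound.ts fetchers in this commit. ModelInfo is trimmed for the sketch.
interface ModelInfo {
  contextWindow: number
  supportsPromptCache: boolean
  maxTokens?: number
}

function applyAnthropicMaxTokens(modelId: string, info: ModelInfo): ModelInfo {
  switch (true) {
    case modelId.startsWith("anthropic/claude-3-7-sonnet"):
      info.maxTokens = 16_384 // larger output budget for 3.7 Sonnet
      break
    case modelId.startsWith("anthropic/"):
      info.maxTokens = 8_192 // ceiling for the remaining Anthropic models
      break
    default:
      break // non-Anthropic models keep whatever the provider reported
  }
  return info
}

// applyAnthropicMaxTokens("anthropic/claude-3-7-sonnet", { contextWindow: 200_000, supportsPromptCache: true })
// returns the same object with maxTokens === 16_384.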

src/api/providers/openrouter.ts

Lines changed: 20 additions & 57 deletions

@@ -56,22 +56,8 @@ export class OpenRouterHandler implements ApiHandler, SingleCompletionHandler {

     // prompt caching: https://openrouter.ai/docs/prompt-caching
     // this is specifically for claude models (some models may 'support prompt caching' automatically without this)
-    switch (modelId) {
-      case "anthropic/claude-3.7-sonnet:thinking":
-      case "anthropic/claude-3.7-sonnet":
-      case "anthropic/claude-3.7-sonnet:beta":
-      case "anthropic/claude-3.5-sonnet":
-      case "anthropic/claude-3.5-sonnet:beta":
-      case "anthropic/claude-3.5-sonnet-20240620":
-      case "anthropic/claude-3.5-sonnet-20240620:beta":
-      case "anthropic/claude-3-5-haiku":
-      case "anthropic/claude-3-5-haiku:beta":
-      case "anthropic/claude-3-5-haiku-20241022":
-      case "anthropic/claude-3-5-haiku-20241022:beta":
-      case "anthropic/claude-3-haiku":
-      case "anthropic/claude-3-haiku:beta":
-      case "anthropic/claude-3-opus":
-      case "anthropic/claude-3-opus:beta":
+    switch (true) {
+      case this.getModel().id.startsWith("anthropic/"):
        openAiMessages[0] = {
          role: "system",
          content: [
@@ -107,20 +93,6 @@ export class OpenRouterHandler implements ApiHandler, SingleCompletionHandler {
        break
    }

-    // Not sure how openrouter defaults max tokens when no value is
-    // provided, but the Anthropic API requires this value and since they
-    // offer both 4096 and 8192 variants, we should ensure 8192.
-    // (Models usually default to max tokens allowed.)
-    let maxTokens: number | undefined = undefined
-
-    if (modelId.startsWith("anthropic/claude-3.5")) {
-      maxTokens = modelInfo.maxTokens ?? 8_192
-    }
-
-    if (modelId.startsWith("anthropic/claude-3.7")) {
-      maxTokens = modelInfo.maxTokens ?? 16_384
-    }
-
    let defaultTemperature = OPENROUTER_DEFAULT_TEMPERATURE
    let topP: number | undefined = undefined

@@ -136,6 +108,7 @@ export class OpenRouterHandler implements ApiHandler, SingleCompletionHandler {

    let temperature = this.options.modelTemperature ?? defaultTemperature

+    // Anthropic "Thinking" models require a temperature of 1.0.
    if (modelInfo.thinking) {
      temperature = 1.0
    }
@@ -145,7 +118,7 @@ export class OpenRouterHandler implements ApiHandler, SingleCompletionHandler {

    const completionParams: OpenRouterChatCompletionParams = {
      model: modelId,
-      max_tokens: maxTokens,
+      max_tokens: modelInfo.maxTokens,
      temperature,
      top_p: topP,
      messages: openAiMessages,
@@ -290,56 +263,46 @@ export async function getOpenRouterModels() {
      thinking: rawModel.id === "anthropic/claude-3.7-sonnet:thinking",
    }

-    switch (rawModel.id) {
-      case "anthropic/claude-3.7-sonnet:thinking":
-      case "anthropic/claude-3.7-sonnet":
-      case "anthropic/claude-3.7-sonnet:beta":
-        modelInfo.maxTokens = 16_384
+    // NOTE: this needs to be synced with api.ts/openrouter default model info.
+    switch (true) {
+      case rawModel.id.startsWith("anthropic/claude-3.7-sonnet"):
        modelInfo.supportsComputerUse = true
        modelInfo.supportsPromptCache = true
        modelInfo.cacheWritesPrice = 3.75
        modelInfo.cacheReadsPrice = 0.3
+        modelInfo.maxTokens = 16384
        break
-      case "anthropic/claude-3.5-sonnet":
-      case "anthropic/claude-3.5-sonnet:beta":
-        // NOTE: This needs to be synced with api.ts/openrouter default model info.
-        modelInfo.maxTokens = 8_192
-        modelInfo.supportsComputerUse = true
+      case rawModel.id.startsWith("anthropic/claude-3.5-sonnet-20240620"):
        modelInfo.supportsPromptCache = true
        modelInfo.cacheWritesPrice = 3.75
        modelInfo.cacheReadsPrice = 0.3
+        modelInfo.maxTokens = 8192
        break
-      case "anthropic/claude-3.5-sonnet-20240620":
-      case "anthropic/claude-3.5-sonnet-20240620:beta":
-        modelInfo.maxTokens = 8_192
+      case rawModel.id.startsWith("anthropic/claude-3.5-sonnet"):
+        modelInfo.supportsComputerUse = true
        modelInfo.supportsPromptCache = true
        modelInfo.cacheWritesPrice = 3.75
        modelInfo.cacheReadsPrice = 0.3
+        modelInfo.maxTokens = 8192
        break
-      case "anthropic/claude-3-5-haiku":
-      case "anthropic/claude-3-5-haiku:beta":
-      case "anthropic/claude-3-5-haiku-20241022":
-      case "anthropic/claude-3-5-haiku-20241022:beta":
-      case "anthropic/claude-3.5-haiku":
-      case "anthropic/claude-3.5-haiku:beta":
-      case "anthropic/claude-3.5-haiku-20241022":
-      case "anthropic/claude-3.5-haiku-20241022:beta":
-        modelInfo.maxTokens = 8_192
+      case rawModel.id.startsWith("anthropic/claude-3-5-haiku"):
        modelInfo.supportsPromptCache = true
        modelInfo.cacheWritesPrice = 1.25
        modelInfo.cacheReadsPrice = 0.1
+        modelInfo.maxTokens = 8192
        break
-      case "anthropic/claude-3-opus":
-      case "anthropic/claude-3-opus:beta":
+      case rawModel.id.startsWith("anthropic/claude-3-opus"):
        modelInfo.supportsPromptCache = true
        modelInfo.cacheWritesPrice = 18.75
        modelInfo.cacheReadsPrice = 1.5
+        modelInfo.maxTokens = 8192
        break
-      case "anthropic/claude-3-haiku":
-      case "anthropic/claude-3-haiku:beta":
+      case rawModel.id.startsWith("anthropic/claude-3-haiku"):
+      default:
        modelInfo.supportsPromptCache = true
        modelInfo.cacheWritesPrice = 0.3
        modelInfo.cacheReadsPrice = 0.03
+        modelInfo.maxTokens = 8192
        break
    }
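
The net effect in OpenRouterHandler.createMessage is that the request no longer computes its own max_tokens fallbacks: it forwards modelInfo.maxTokens as populated by getOpenRouterModels, and thinking-variant models get their temperature forced to 1.0. A hedged sketch of that flow, with the parameter object trimmed to the fields visible in this diff (buildCompletionParams and FetchedModelInfo are names invented for the illustration):

interface FetchedModelInfo {
  maxTokens?: number // 16_384 for claude-3.7-sonnet*, 8_192 for other Anthropic ids
  thinking?: boolean // true only for anthropic/claude-3.7-sonnet:thinking
}

function buildCompletionParams(
  modelId: string,
  modelInfo: FetchedModelInfo,
  requestedTemperature: number | undefined,
  defaultTemperature: number,
) {
  let temperature = requestedTemperature ?? defaultTemperature

  // Anthropic "Thinking" models require a temperature of 1.0.
  if (modelInfo.thinking) {
    temperature = 1.0
  }

  return {
    model: modelId,
    max_tokens: modelInfo.maxTokens,
    temperature,
  }
}

// buildCompletionParams("anthropic/claude-3.7-sonnet:thinking", { maxTokens: 16_384, thinking: true }, 0.2, 0)
// => { model: "anthropic/claude-3.7-sonnet:thinking", max_tokens: 16384, temperature: 1 }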

src/api/providers/requesty.ts

Lines changed: 11 additions & 0 deletions

@@ -70,6 +70,17 @@ export async function getRequestyModels({ apiKey }: { apiKey?: string }) {
       cacheReadsPrice: parseApiPrice(rawModel.cached_price),
     }

+     switch (rawModel.id) {
+       case rawModel.id.startsWith("anthropic/claude-3-7-sonnet"):
+         modelInfo.maxTokens = 16384
+         break
+       case rawModel.id.startsWith("anthropic/"):
+         modelInfo.maxTokens = 8192
+         break
+       default:
+         break
+     }
+
     models[rawModel.id] = modelInfo
   }
 } catch (error) {

src/api/providers/unbound.ts

Lines changed: 16 additions & 3 deletions

@@ -73,7 +73,7 @@ export class UnboundHandler implements ApiHandler, SingleCompletionHandler {
     let maxTokens: number | undefined

     if (this.getModel().id.startsWith("anthropic/")) {
-      maxTokens = 8_192
+      maxTokens = this.getModel().info.maxTokens
     }

     const { data: completion, response } = await this.client.chat.completions
@@ -152,7 +152,7 @@ export class UnboundHandler implements ApiHandler, SingleCompletionHandler {
     }

     if (this.getModel().id.startsWith("anthropic/")) {
-      requestOptions.max_tokens = 8192
+      requestOptions.max_tokens = this.getModel().info.maxTokens
     }

     const response = await this.client.chat.completions.create(requestOptions)
@@ -176,7 +176,7 @@ export async function getUnboundModels() {
     const rawModels: Record<string, any> = response.data

     for (const [modelId, model] of Object.entries(rawModels)) {
-      models[modelId] = {
+      const modelInfo: ModelInfo = {
        maxTokens: model?.maxTokens ? parseInt(model.maxTokens) : undefined,
        contextWindow: model?.contextWindow ? parseInt(model.contextWindow) : 0,
        supportsImages: model?.supportsImages ?? false,
@@ -187,6 +187,19 @@ export async function getUnboundModels() {
        cacheWritesPrice: model?.cacheWritePrice ? parseFloat(model.cacheWritePrice) : undefined,
        cacheReadsPrice: model?.cacheReadPrice ? parseFloat(model.cacheReadPrice) : undefined,
      }
+
+      switch (true) {
+        case modelId.startsWith("anthropic/claude-3-7-sonnet"):
+          modelInfo.maxTokens = 16384
+          break
+        case modelId.startsWith("anthropic/"):
+          modelInfo.maxTokens = 8192
+          break
+        default:
+          break
+      }
+
+      models[modelId] = modelInfo
     }
   }
 } catch (error) {
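
On the handler side, UnboundHandler (like GlamaHandler above) now reads the ceiling back from the fetched record rather than hard-coding 8_192. A small sketch of that lookup, assuming a getModel()-style record of { id, info } as in the diff; the fallback argument is an illustration and not part of the commit:

interface ModelRecord {
  id: string
  info: { maxTokens?: number }
}

// Prefer the per-model ceiling attached by the fetcher; only Anthropic ids receive one here.
function resolveMaxTokens(model: ModelRecord, fallback = 8_192): number | undefined {
  if (!model.id.startsWith("anthropic/")) {
    return undefined // non-Anthropic models let the provider apply its own default
  }
  return model.info.maxTokens ?? fallback
}

// resolveMaxTokens({ id: "anthropic/claude-3-7-sonnet", info: { maxTokens: 16_384 } }) === 16_384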

src/core/sliding-window/__tests__/sliding-window.test.ts

Lines changed: 115 additions & 14 deletions

@@ -5,6 +5,9 @@ import { Anthropic } from "@anthropic-ai/sdk"
 import { ModelInfo } from "../../../shared/api"
 import { truncateConversation, truncateConversationIfNeeded } from "../index"

+/**
+ * Tests for the truncateConversation function
+ */
 describe("truncateConversation", () => {
   it("should retain the first message", () => {
     const messages: Anthropic.Messages.MessageParam[] = [
@@ -91,6 +94,86 @@ describe("truncateConversation", () => {
   })
 })

+/**
+ * Tests for the getMaxTokens function (private but tested through truncateConversationIfNeeded)
+ */
+describe("getMaxTokens", () => {
+  // We'll test this indirectly through truncateConversationIfNeeded
+  const createModelInfo = (contextWindow: number, maxTokens?: number): ModelInfo => ({
+    contextWindow,
+    supportsPromptCache: true, // Not relevant for getMaxTokens
+    maxTokens,
+  })
+
+  // Reuse across tests for consistency
+  const messages: Anthropic.Messages.MessageParam[] = [
+    { role: "user", content: "First message" },
+    { role: "assistant", content: "Second message" },
+    { role: "user", content: "Third message" },
+    { role: "assistant", content: "Fourth message" },
+    { role: "user", content: "Fifth message" },
+  ]
+
+  it("should use maxTokens as buffer when specified", () => {
+    const modelInfo = createModelInfo(100000, 50000)
+    // Max tokens = 100000 - 50000 = 50000
+
+    // Below max tokens - no truncation
+    const result1 = truncateConversationIfNeeded(messages, 49999, modelInfo)
+    expect(result1).toEqual(messages)
+
+    // Above max tokens - truncate
+    const result2 = truncateConversationIfNeeded(messages, 50001, modelInfo)
+    expect(result2).not.toEqual(messages)
+    expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+  })
+
+  it("should use 20% of context window as buffer when maxTokens is undefined", () => {
+    const modelInfo = createModelInfo(100000, undefined)
+    // Max tokens = 100000 - (100000 * 0.2) = 80000
+
+    // Below max tokens - no truncation
+    const result1 = truncateConversationIfNeeded(messages, 79999, modelInfo)
+    expect(result1).toEqual(messages)
+
+    // Above max tokens - truncate
+    const result2 = truncateConversationIfNeeded(messages, 80001, modelInfo)
+    expect(result2).not.toEqual(messages)
+    expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+  })
+
+  it("should handle small context windows appropriately", () => {
+    const modelInfo = createModelInfo(50000, 10000)
+    // Max tokens = 50000 - 10000 = 40000
+
+    // Below max tokens - no truncation
+    const result1 = truncateConversationIfNeeded(messages, 39999, modelInfo)
+    expect(result1).toEqual(messages)
+
+    // Above max tokens - truncate
+    const result2 = truncateConversationIfNeeded(messages, 40001, modelInfo)
+    expect(result2).not.toEqual(messages)
+    expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+  })
+
+  it("should handle large context windows appropriately", () => {
+    const modelInfo = createModelInfo(200000, 30000)
+    // Max tokens = 200000 - 30000 = 170000
+
+    // Below max tokens - no truncation
+    const result1 = truncateConversationIfNeeded(messages, 169999, modelInfo)
+    expect(result1).toEqual(messages)
+
+    // Above max tokens - truncate
+    const result2 = truncateConversationIfNeeded(messages, 170001, modelInfo)
+    expect(result2).not.toEqual(messages)
+    expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+  })
+})
+
+/**
+ * Tests for the truncateConversationIfNeeded function
+ */
 describe("truncateConversationIfNeeded", () => {
   const createModelInfo = (contextWindow: number, supportsPromptCache: boolean, maxTokens?: number): ModelInfo => ({
     contextWindow,
@@ -106,25 +189,43 @@ describe("truncateConversationIfNeeded", () => {
     { role: "user", content: "Fifth message" },
   ]

-  it("should not truncate if tokens are below threshold for prompt caching models", () => {
-    const modelInfo = createModelInfo(200000, true, 50000)
-    const totalTokens = 100000 // Below threshold
+  it("should not truncate if tokens are below max tokens threshold", () => {
+    const modelInfo = createModelInfo(100000, true, 30000)
+    const maxTokens = 100000 - 30000 // 70000
+    const totalTokens = 69999 // Below threshold
+
     const result = truncateConversationIfNeeded(messages, totalTokens, modelInfo)
-    expect(result).toEqual(messages)
+    expect(result).toEqual(messages) // No truncation occurs
   })

-  it("should not truncate if tokens are below threshold for non-prompt caching models", () => {
-    const modelInfo = createModelInfo(200000, false)
-    const totalTokens = 100000 // Below threshold
+  it("should truncate if tokens are above max tokens threshold", () => {
+    const modelInfo = createModelInfo(100000, true, 30000)
+    const maxTokens = 100000 - 30000 // 70000
+    const totalTokens = 70001 // Above threshold
+
+    // When truncating, always uses 0.5 fraction
+    // With 4 messages after the first, 0.5 fraction means remove 2 messages
+    const expectedResult = [messages[0], messages[3], messages[4]]
+
     const result = truncateConversationIfNeeded(messages, totalTokens, modelInfo)
-    expect(result).toEqual(messages)
+    expect(result).toEqual(expectedResult)
   })

-  it("should use 80% of context window as threshold if it's greater than (contextWindow - buffer)", () => {
-    const modelInfo = createModelInfo(50000, true) // Small context window
-    const totalTokens = 40001 // Above 80% threshold (40000)
-    const mockResult = [messages[0], messages[3], messages[4]]
-    const result = truncateConversationIfNeeded(messages, totalTokens, modelInfo)
-    expect(result).toEqual(mockResult)
+  it("should work with non-prompt caching models the same as prompt caching models", () => {
+    // The implementation no longer differentiates between prompt caching and non-prompt caching models
+    const modelInfo1 = createModelInfo(100000, true, 30000)
+    const modelInfo2 = createModelInfo(100000, false, 30000)
+
+    // Test below threshold
+    const belowThreshold = 69999
+    expect(truncateConversationIfNeeded(messages, belowThreshold, modelInfo1)).toEqual(
+      truncateConversationIfNeeded(messages, belowThreshold, modelInfo2),
+    )
+
+    // Test above threshold
+    const aboveThreshold = 70001
+    expect(truncateConversationIfNeeded(messages, aboveThreshold, modelInfo1)).toEqual(
+      truncateConversationIfNeeded(messages, aboveThreshold, modelInfo2),
+    )
   })
 })
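
Taken together, the new tests pin down the truncation contract: the allowed prompt size is contextWindow minus maxTokens when the model reports one, otherwise contextWindow minus 20% of the window, and once the running total exceeds that limit the conversation keeps its first message and drops half of the rest. A sketch consistent with these tests, assuming this is roughly what src/core/sliding-window/index.ts implements (details such as the even-count rounding are inferred from the expected [first, fourth, fifth] result, not taken from the source file):

import { Anthropic } from "@anthropic-ai/sdk"

interface ModelInfo {
  contextWindow: number
  supportsPromptCache: boolean
  maxTokens?: number
}

// Token budget: reserve maxTokens when known, otherwise 20% of the context window.
function getMaxTokens(modelInfo: ModelInfo): number {
  return modelInfo.contextWindow - (modelInfo.maxTokens ?? modelInfo.contextWindow * 0.2)
}

// Keep the first message and remove a fraction of the rest, rounded down to an
// even count so user/assistant pairs stay aligned.
export function truncateConversation(
  messages: Anthropic.Messages.MessageParam[],
  fracToRemove: number,
): Anthropic.Messages.MessageParam[] {
  const rawCount = Math.floor((messages.length - 1) * fracToRemove)
  const removeCount = rawCount - (rawCount % 2)
  return [messages[0], ...messages.slice(removeCount + 1)]
}

export function truncateConversationIfNeeded(
  messages: Anthropic.Messages.MessageParam[],
  totalTokens: number,
  modelInfo: ModelInfo,
): Anthropic.Messages.MessageParam[] {
  return totalTokens <= getMaxTokens(modelInfo) ? messages : truncateConversation(messages, 0.5)
}

// With five messages, contextWindow 100_000 and maxTokens 30_000:
//   truncateConversationIfNeeded(messages, 69_999, info) returns all five messages;
//   truncateConversationIfNeeded(messages, 70_001, info) returns [first, fourth, fifth].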
