
Commit 50ce955
Merge branch 'main' into cte/move-model-fetchers
2 parents: 159621c + 46576e0

7 files changed: +186 -128 lines

src/api/providers/glama.ts

Lines changed: 13 additions & 2 deletions

```diff
@@ -71,7 +71,7 @@ export class GlamaHandler implements ApiHandler, SingleCompletionHandler {
 		let maxTokens: number | undefined
 
 		if (this.getModel().id.startsWith("anthropic/")) {
-			maxTokens = 8_192
+			maxTokens = this.getModel().info.maxTokens
 		}
 
 		const requestOptions: OpenAI.Chat.ChatCompletionCreateParams = {
@@ -179,7 +179,7 @@ export class GlamaHandler implements ApiHandler, SingleCompletionHandler {
 		}
 
 		if (this.getModel().id.startsWith("anthropic/")) {
-			requestOptions.max_tokens = 8192
+			requestOptions.max_tokens = this.getModel().info.maxTokens
 		}
 
 		const response = await this.client.chat.completions.create(requestOptions)
@@ -214,6 +214,17 @@ export async function getGlamaModels() {
 				cacheReadsPrice: parseApiPrice(rawModel.pricePerToken?.cacheRead),
 			}
 
+			switch (rawModel.id) {
+				case rawModel.id.startsWith("anthropic/claude-3-7-sonnet"):
+					modelInfo.maxTokens = 16384
+					break
+				case rawModel.id.startsWith("anthropic/"):
+					modelInfo.maxTokens = 8192
+					break
+				default:
+					break
+			}
+
			models[rawModel.id] = modelInfo
		}
	} catch (error) {
```
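A note on the new `switch` in `getGlamaModels`: `switch (rawModel.id)` compares the string id against the boolean returned by `startsWith(...)` using strict equality, so neither `case` can ever match and only the empty `default` branch runs, leaving `maxTokens` untouched. Below is a minimal sketch of the switch-on-`true` form that the unbound.ts and openrouter.ts hunks in this same commit use; the helper name is hypothetical and the `modelInfo` parameter is trimmed to the single field this logic touches.

```typescript
// Sketch only: `applyAnthropicMaxTokens` is a hypothetical helper, and the
// `modelInfo` parameter is trimmed to the one field this logic touches.
function applyAnthropicMaxTokens(rawModelId: string, modelInfo: { maxTokens?: number }): void {
	switch (true) {
		// Each case is a boolean expression; the first one that evaluates to true wins.
		case rawModelId.startsWith("anthropic/claude-3-7-sonnet"):
			modelInfo.maxTokens = 16384
			break
		case rawModelId.startsWith("anthropic/"):
			modelInfo.maxTokens = 8192
			break
		default:
			break
	}
}
```

With this form, `applyAnthropicMaxTokens("anthropic/claude-3-7-sonnet", modelInfo)` sets `maxTokens` to 16384, any other `anthropic/` id gets 8192, and non-Anthropic ids are left alone.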

src/api/providers/openrouter.ts

Lines changed: 23 additions & 52 deletions

```diff
@@ -54,20 +54,8 @@ export class OpenRouterHandler implements ApiHandler, SingleCompletionHandler {
 
 		// prompt caching: https://openrouter.ai/docs/prompt-caching
 		// this is specifically for claude models (some models may 'support prompt caching' automatically without this)
-		switch (this.getModel().id) {
-			case "anthropic/claude-3.7-sonnet":
-			case "anthropic/claude-3.5-sonnet":
-			case "anthropic/claude-3.5-sonnet:beta":
-			case "anthropic/claude-3.5-sonnet-20240620":
-			case "anthropic/claude-3.5-sonnet-20240620:beta":
-			case "anthropic/claude-3-5-haiku":
-			case "anthropic/claude-3-5-haiku:beta":
-			case "anthropic/claude-3-5-haiku-20241022":
-			case "anthropic/claude-3-5-haiku-20241022:beta":
-			case "anthropic/claude-3-haiku":
-			case "anthropic/claude-3-haiku:beta":
-			case "anthropic/claude-3-opus":
-			case "anthropic/claude-3-opus:beta":
+		switch (true) {
+			case this.getModel().id.startsWith("anthropic/"):
 				openAiMessages[0] = {
 					role: "system",
 					content: [
@@ -103,23 +91,6 @@ export class OpenRouterHandler implements ApiHandler, SingleCompletionHandler {
 				break
 		}
 
-		// Not sure how openrouter defaults max tokens when no value is provided, but the anthropic api requires this value and since they offer both 4096 and 8192 variants, we should ensure 8192.
-		// (models usually default to max tokens allowed)
-		let maxTokens: number | undefined
-		switch (this.getModel().id) {
-			case "anthropic/claude-3.7-sonnet":
-			case "anthropic/claude-3.5-sonnet":
-			case "anthropic/claude-3.5-sonnet:beta":
-			case "anthropic/claude-3.5-sonnet-20240620":
-			case "anthropic/claude-3.5-sonnet-20240620:beta":
-			case "anthropic/claude-3-5-haiku":
-			case "anthropic/claude-3-5-haiku:beta":
-			case "anthropic/claude-3-5-haiku-20241022":
-			case "anthropic/claude-3-5-haiku-20241022:beta":
-				maxTokens = 8_192
-				break
-		}
-
 		let defaultTemperature = OPENROUTER_DEFAULT_TEMPERATURE
 		let topP: number | undefined = undefined
 
@@ -140,7 +111,7 @@ export class OpenRouterHandler implements ApiHandler, SingleCompletionHandler {
 		let fullResponseText = ""
 		const stream = await this.client.chat.completions.create({
 			model: this.getModel().id,
-			max_tokens: maxTokens,
+			max_tokens: this.getModel().info.maxTokens,
 			temperature: this.options.modelTemperature ?? defaultTemperature,
 			top_p: topP,
 			messages: openAiMessages,
@@ -270,46 +241,46 @@ export async function getOpenRouterModels() {
 			description: rawModel.description,
 		}
 
-		switch (rawModel.id) {
-			case "anthropic/claude-3.7-sonnet":
-			case "anthropic/claude-3.7-sonnet:beta":
-			case "anthropic/claude-3.5-sonnet":
-			case "anthropic/claude-3.5-sonnet:beta":
-				// NOTE: This needs to be synced with api.ts/openrouter default model info.
+		// NOTE: this needs to be synced with api.ts/openrouter default model info.
+		switch (true) {
+			case rawModel.id.startsWith("anthropic/claude-3.7-sonnet"):
 				modelInfo.supportsComputerUse = true
 				modelInfo.supportsPromptCache = true
 				modelInfo.cacheWritesPrice = 3.75
 				modelInfo.cacheReadsPrice = 0.3
+				modelInfo.maxTokens = 16384
+				break
+			case rawModel.id.startsWith("anthropic/claude-3.5-sonnet-20240620"):
+				modelInfo.supportsPromptCache = true
+				modelInfo.cacheWritesPrice = 3.75
+				modelInfo.cacheReadsPrice = 0.3
+				modelInfo.maxTokens = 8192
 				break
-			case "anthropic/claude-3.5-sonnet-20240620":
-			case "anthropic/claude-3.5-sonnet-20240620:beta":
+			case rawModel.id.startsWith("anthropic/claude-3.5-sonnet"):
+				modelInfo.supportsComputerUse = true
 				modelInfo.supportsPromptCache = true
 				modelInfo.cacheWritesPrice = 3.75
 				modelInfo.cacheReadsPrice = 0.3
+				modelInfo.maxTokens = 8192
 				break
-			case "anthropic/claude-3-5-haiku":
-			case "anthropic/claude-3-5-haiku:beta":
-			case "anthropic/claude-3-5-haiku-20241022":
-			case "anthropic/claude-3-5-haiku-20241022:beta":
-			case "anthropic/claude-3.5-haiku":
-			case "anthropic/claude-3.5-haiku:beta":
-			case "anthropic/claude-3.5-haiku-20241022":
-			case "anthropic/claude-3.5-haiku-20241022:beta":
+			case rawModel.id.startsWith("anthropic/claude-3-5-haiku"):
 				modelInfo.supportsPromptCache = true
 				modelInfo.cacheWritesPrice = 1.25
 				modelInfo.cacheReadsPrice = 0.1
+				modelInfo.maxTokens = 8192
 				break
-			case "anthropic/claude-3-opus":
-			case "anthropic/claude-3-opus:beta":
+			case rawModel.id.startsWith("anthropic/claude-3-opus"):
 				modelInfo.supportsPromptCache = true
 				modelInfo.cacheWritesPrice = 18.75
 				modelInfo.cacheReadsPrice = 1.5
+				modelInfo.maxTokens = 8192
 				break
-			case "anthropic/claude-3-haiku":
-			case "anthropic/claude-3-haiku:beta":
+			case rawModel.id.startsWith("anthropic/claude-3-haiku"):
+			default:
 				modelInfo.supportsPromptCache = true
 				modelInfo.cacheWritesPrice = 0.3
 				modelInfo.cacheReadsPrice = 0.03
+				modelInfo.maxTokens = 8192
 				break
 		}
 
```
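The rewritten `getOpenRouterModels` switch depends on case ordering: in a `switch (true)` the first case whose expression is truthy wins, so the more specific `"anthropic/claude-3.5-sonnet-20240620"` prefix must appear before the general `"anthropic/claude-3.5-sonnet"` prefix or it would never be reached. A small illustration with a hypothetical id:

```typescript
// Hypothetical id chosen for illustration; the first truthy case wins in switch (true).
const id = "anthropic/claude-3.5-sonnet-20240620:beta"

switch (true) {
	case id.startsWith("anthropic/claude-3.5-sonnet-20240620"):
		console.log("dated 3.5 Sonnet variant") // this branch runs
		break
	case id.startsWith("anthropic/claude-3.5-sonnet"):
		console.log("generic 3.5 Sonnet") // also true, but shadowed by the case above
		break
}
```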

src/api/providers/requesty.ts

Lines changed: 11 additions & 0 deletions

```diff
@@ -70,6 +70,17 @@ export async function getRequestyModels({ apiKey }: { apiKey?: string }) {
 			cacheReadsPrice: parseApiPrice(rawModel.cached_price),
 		}
 
+		switch (rawModel.id) {
+			case rawModel.id.startsWith("anthropic/claude-3-7-sonnet"):
+				modelInfo.maxTokens = 16384
+				break
+			case rawModel.id.startsWith("anthropic/"):
+				modelInfo.maxTokens = 8192
+				break
+			default:
+				break
+		}
+
 		models[rawModel.id] = modelInfo
 	}
 } catch (error) {
```
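The same `switch (rawModel.id)` pattern as in the glama.ts hunk appears here, with the same caveat: the string id is compared against the boolean result of `startsWith(...)`, so neither `case` matches and the `maxTokens` override never applies. The `switch (true)` form sketched after the glama.ts diff above would be the working equivalent.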

src/api/providers/unbound.ts

Lines changed: 16 additions & 3 deletions

```diff
@@ -73,7 +73,7 @@ export class UnboundHandler implements ApiHandler, SingleCompletionHandler {
 		let maxTokens: number | undefined
 
 		if (this.getModel().id.startsWith("anthropic/")) {
-			maxTokens = 8_192
+			maxTokens = this.getModel().info.maxTokens
 		}
 
 		const { data: completion, response } = await this.client.chat.completions
@@ -152,7 +152,7 @@
 		}
 
 		if (this.getModel().id.startsWith("anthropic/")) {
-			requestOptions.max_tokens = 8192
+			requestOptions.max_tokens = this.getModel().info.maxTokens
 		}
 
 		const response = await this.client.chat.completions.create(requestOptions)
@@ -176,7 +176,7 @@ export async function getUnboundModels() {
 		const rawModels: Record<string, any> = response.data
 
 		for (const [modelId, model] of Object.entries(rawModels)) {
-			models[modelId] = {
+			const modelInfo: ModelInfo = {
 				maxTokens: model?.maxTokens ? parseInt(model.maxTokens) : undefined,
 				contextWindow: model?.contextWindow ? parseInt(model.contextWindow) : 0,
 				supportsImages: model?.supportsImages ?? false,
@@ -187,6 +187,19 @@
 				cacheWritesPrice: model?.cacheWritePrice ? parseFloat(model.cacheWritePrice) : undefined,
 				cacheReadsPrice: model?.cacheReadPrice ? parseFloat(model.cacheReadPrice) : undefined,
 			}
+
+			switch (true) {
+				case modelId.startsWith("anthropic/claude-3-7-sonnet"):
+					modelInfo.maxTokens = 16384
+					break
+				case modelId.startsWith("anthropic/"):
+					modelInfo.maxTokens = 8192
+					break
+				default:
+					break
+			}
+
+			models[modelId] = modelInfo
 		}
 	}
 } catch (error) {
```
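Unlike the glama.ts and requesty.ts hunks, this one uses the working `switch (true)` form, and it also stops assigning an object literal directly into `models[modelId]` in favor of building a named `modelInfo` first, so the prefix-based `maxTokens` overrides can be applied before the entry is stored.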

src/core/sliding-window/__tests__/sliding-window.test.ts

Lines changed: 115 additions & 14 deletions

```diff
@@ -5,6 +5,9 @@ import { Anthropic } from "@anthropic-ai/sdk"
 import { ModelInfo } from "../../../shared/api"
 import { truncateConversation, truncateConversationIfNeeded } from "../index"
 
+/**
+ * Tests for the truncateConversation function
+ */
 describe("truncateConversation", () => {
 	it("should retain the first message", () => {
 		const messages: Anthropic.Messages.MessageParam[] = [
@@ -91,6 +94,86 @@ describe("truncateConversation", () => {
 	})
 })
 
+/**
+ * Tests for the getMaxTokens function (private but tested through truncateConversationIfNeeded)
+ */
+describe("getMaxTokens", () => {
+	// We'll test this indirectly through truncateConversationIfNeeded
+	const createModelInfo = (contextWindow: number, maxTokens?: number): ModelInfo => ({
+		contextWindow,
+		supportsPromptCache: true, // Not relevant for getMaxTokens
+		maxTokens,
+	})
+
+	// Reuse across tests for consistency
+	const messages: Anthropic.Messages.MessageParam[] = [
+		{ role: "user", content: "First message" },
+		{ role: "assistant", content: "Second message" },
+		{ role: "user", content: "Third message" },
+		{ role: "assistant", content: "Fourth message" },
+		{ role: "user", content: "Fifth message" },
+	]
+
+	it("should use maxTokens as buffer when specified", () => {
+		const modelInfo = createModelInfo(100000, 50000)
+		// Max tokens = 100000 - 50000 = 50000
+
+		// Below max tokens - no truncation
+		const result1 = truncateConversationIfNeeded(messages, 49999, modelInfo)
+		expect(result1).toEqual(messages)
+
+		// Above max tokens - truncate
+		const result2 = truncateConversationIfNeeded(messages, 50001, modelInfo)
+		expect(result2).not.toEqual(messages)
+		expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+	})
+
+	it("should use 20% of context window as buffer when maxTokens is undefined", () => {
+		const modelInfo = createModelInfo(100000, undefined)
+		// Max tokens = 100000 - (100000 * 0.2) = 80000
+
+		// Below max tokens - no truncation
+		const result1 = truncateConversationIfNeeded(messages, 79999, modelInfo)
+		expect(result1).toEqual(messages)
+
+		// Above max tokens - truncate
+		const result2 = truncateConversationIfNeeded(messages, 80001, modelInfo)
+		expect(result2).not.toEqual(messages)
+		expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+	})
+
+	it("should handle small context windows appropriately", () => {
+		const modelInfo = createModelInfo(50000, 10000)
+		// Max tokens = 50000 - 10000 = 40000
+
+		// Below max tokens - no truncation
+		const result1 = truncateConversationIfNeeded(messages, 39999, modelInfo)
+		expect(result1).toEqual(messages)
+
+		// Above max tokens - truncate
+		const result2 = truncateConversationIfNeeded(messages, 40001, modelInfo)
+		expect(result2).not.toEqual(messages)
+		expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+	})
+
+	it("should handle large context windows appropriately", () => {
+		const modelInfo = createModelInfo(200000, 30000)
+		// Max tokens = 200000 - 30000 = 170000
+
+		// Below max tokens - no truncation
+		const result1 = truncateConversationIfNeeded(messages, 169999, modelInfo)
+		expect(result1).toEqual(messages)
+
+		// Above max tokens - truncate
+		const result2 = truncateConversationIfNeeded(messages, 170001, modelInfo)
+		expect(result2).not.toEqual(messages)
+		expect(result2.length).toBe(3) // Truncated with 0.5 fraction
+	})
+})
+
+/**
+ * Tests for the truncateConversationIfNeeded function
+ */
 describe("truncateConversationIfNeeded", () => {
 	const createModelInfo = (contextWindow: number, supportsPromptCache: boolean, maxTokens?: number): ModelInfo => ({
 		contextWindow,
@@ -106,25 +189,43 @@ describe("truncateConversationIfNeeded", () => {
 		{ role: "user", content: "Fifth message" },
 	]
 
-	it("should not truncate if tokens are below threshold for prompt caching models", () => {
-		const modelInfo = createModelInfo(200000, true, 50000)
-		const totalTokens = 100000 // Below threshold
+	it("should not truncate if tokens are below max tokens threshold", () => {
+		const modelInfo = createModelInfo(100000, true, 30000)
+		const maxTokens = 100000 - 30000 // 70000
+		const totalTokens = 69999 // Below threshold
+
 		const result = truncateConversationIfNeeded(messages, totalTokens, modelInfo)
-		expect(result).toEqual(messages)
+		expect(result).toEqual(messages) // No truncation occurs
 	})
 
-	it("should not truncate if tokens are below threshold for non-prompt caching models", () => {
-		const modelInfo = createModelInfo(200000, false)
-		const totalTokens = 100000 // Below threshold
+	it("should truncate if tokens are above max tokens threshold", () => {
+		const modelInfo = createModelInfo(100000, true, 30000)
+		const maxTokens = 100000 - 30000 // 70000
+		const totalTokens = 70001 // Above threshold
+
+		// When truncating, always uses 0.5 fraction
+		// With 4 messages after the first, 0.5 fraction means remove 2 messages
+		const expectedResult = [messages[0], messages[3], messages[4]]
+
 		const result = truncateConversationIfNeeded(messages, totalTokens, modelInfo)
-		expect(result).toEqual(messages)
+		expect(result).toEqual(expectedResult)
 	})
 
-	it("should use 80% of context window as threshold if it's greater than (contextWindow - buffer)", () => {
-		const modelInfo = createModelInfo(50000, true) // Small context window
-		const totalTokens = 40001 // Above 80% threshold (40000)
-		const mockResult = [messages[0], messages[3], messages[4]]
-		const result = truncateConversationIfNeeded(messages, totalTokens, modelInfo)
-		expect(result).toEqual(mockResult)
+	it("should work with non-prompt caching models the same as prompt caching models", () => {
+		// The implementation no longer differentiates between prompt caching and non-prompt caching models
+		const modelInfo1 = createModelInfo(100000, true, 30000)
+		const modelInfo2 = createModelInfo(100000, false, 30000)
+
+		// Test below threshold
+		const belowThreshold = 69999
+		expect(truncateConversationIfNeeded(messages, belowThreshold, modelInfo1)).toEqual(
+			truncateConversationIfNeeded(messages, belowThreshold, modelInfo2),
+		)
+
+		// Test above threshold
+		const aboveThreshold = 70001
+		expect(truncateConversationIfNeeded(messages, aboveThreshold, modelInfo1)).toEqual(
+			truncateConversationIfNeeded(messages, aboveThreshold, modelInfo2),
+		)
 	})
 })
```
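Read together, these tests pin down the behavior the commit expects from the sliding-window module: the token budget is the context window minus either the model's `maxTokens` or, when that is undefined, 20% of the context window, and any overflow triggers a truncation that keeps the first message and drops half of the rest. The sketch below is inferred only from these assertions, since the implementation file itself is not part of this diff; the trimmed types, the constant name, and the rounding in `truncateConversation` are assumptions.

```typescript
// Inferred from the test expectations above, not copied from
// src/core/sliding-window/index.ts (which this diff does not show).
// Types are trimmed to what the sketch needs.
type MessageParam = { role: "user" | "assistant"; content: string }
type ModelInfo = { contextWindow: number; supportsPromptCache: boolean; maxTokens?: number }

const TOKEN_BUFFER_FRACTION = 0.2 // assumed constant for the "20% of context window" case

function getMaxTokens(modelInfo: ModelInfo): number {
	// Reserve either the model's own maxTokens or 20% of the context window.
	return modelInfo.contextWindow - (modelInfo.maxTokens ?? modelInfo.contextWindow * TOKEN_BUFFER_FRACTION)
}

function truncateConversation(messages: MessageParam[], fracToRemove: number): MessageParam[] {
	// Keep the first message and drop an even number of the messages after it,
	// so user/assistant alternation is preserved (the rounding here is an assumption).
	const rawToRemove = Math.floor((messages.length - 1) * fracToRemove)
	const toRemove = rawToRemove - (rawToRemove % 2)
	return [messages[0], ...messages.slice(1 + toRemove)]
}

function truncateConversationIfNeeded(
	messages: MessageParam[],
	totalTokens: number,
	modelInfo: ModelInfo,
): MessageParam[] {
	// Over budget: always truncate with the 0.5 fraction the tests rely on.
	return totalTokens > getMaxTokens(modelInfo) ? truncateConversation(messages, 0.5) : messages
}
```

With the five-message fixture above, a 0.5 fraction removes two messages and leaves `[messages[0], messages[3], messages[4]]`, which matches the `expectedResult` and the `length === 3` assertions in the tests.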
