diff --git a/src/api/providers/__tests__/cost.test.ts b/src/api/providers/__tests__/cost.test.ts
new file mode 100644
index 00000000000..8bba2f76ac2
--- /dev/null
+++ b/src/api/providers/__tests__/cost.test.ts
@@ -0,0 +1,148 @@
+import { getCost } from "../cost"
+
+describe("getCost", () => {
+	it("should return the correct cost for Bedrock provider with invokedModelId", () => {
+		// For 1000 tokens with 25% input (250) and 75% output (750)
+		// Claude-3-5-sonnet: (0.003/1000 * 250) + (0.015/1000 * 750) = 0.00075 + 0.01125 = 0.012
+		const cost = getCost("bedrock", "test prompt", "gpt-3.5-turbo", 1000, "claude-3-5-sonnet")
+		expect(cost).toBeCloseTo(0.012, 5)
+	})
+
+	it("should return 0 for Bedrock provider without invokedModelId", () => {
+		// Since GPT models are not supported on Bedrock and we've removed the fallback,
+		// this should return 0
+		const cost = getCost("bedrock", "test prompt", "any-model", 1000)
+		expect(cost).toBe(0)
+	})
+
+	it("should return 0 for unknown provider", () => {
+		const cost = getCost("unknown" as any, "test prompt", "gpt-3.5-turbo", 1000)
+		expect(cost).toBe(0)
+	})
+
+	it("should use provided input and output tokens when available", () => {
+		// For specific input (300) and output (700) tokens
+		// Claude-3-5-sonnet: (0.003/1000 * 300) + (0.015/1000 * 700) = 0.0009 + 0.0105 = 0.0114
+		const cost = getCost("bedrock", "test prompt", "gpt-3.5-turbo", 1000, "claude-3-5-sonnet", 300, 700)
+		expect(cost).toBeCloseTo(0.0114, 5)
+	})
+
+	it("should handle cache write and cache read tokens", () => {
+		// For specific input (300), output (700), cache write (200), and cache read (100) tokens
+		// Claude-3-5-sonnet:
+		// Input: (0.003/1000 * 300) = 0.0009
+		// Output: (0.015/1000 * 700) = 0.0105
+		// Cache Write: (0.00375/1000 * 200) = 0.00075
+		// Cache Read: (0.0003/1000 * 100) = 0.00003
+		// Total: 0.0009 + 0.0105 + 0.00075 + 0.00003 = 0.01218
+		const cost = getCost("bedrock", "test prompt", "gpt-3.5-turbo", 1000, "claude-3-5-sonnet", 300, 700, 200, 100)
+		expect(cost).toBeCloseTo(0.01218, 5)
+	})
+
+	it("should handle models without cache pricing", () => {
+		// For specific input (300), output (700), cache write (200), and cache read (100) tokens
+		// Claude-3-opus:
+		// Input: (0.015/1000 * 300) = 0.0045
+		// Output: (0.075/1000 * 700) = 0.0525
+		// Cache Write: (0/1000 * 200) = 0
+		// Cache Read: (0/1000 * 100) = 0
+		// Total: 0.0045 + 0.0525 + 0 + 0 = 0.057
+		const cost = getCost("bedrock", "test prompt", "gpt-3.5-turbo", 1000, "claude-3-opus", 300, 700, 200, 100)
+		expect(cost).toBeCloseTo(0.057, 5)
+	})
+})
+
+describe("getBedrockCost", () => {
+	it("should return the correct cost for claude-3-5-sonnet", () => {
+		// For 1000 tokens with 25% input (250) and 75% output (750)
+		// Claude-3-5-sonnet: (0.003/1000 * 250) + (0.015/1000 * 750) = 0.00075 + 0.01125 = 0.012
+		const cost = getCost("bedrock", "test prompt", "any-model", 1000, "claude-3-5-sonnet")
+		expect(cost).toBeCloseTo(0.012, 5)
+	})
+
+	// GPT model tests removed as they are not supported on Bedrock
+
+	it("should return 0 for unknown invokedModelId", () => {
+		const cost = getCost("bedrock", "test prompt", "any-model", 1000, "unknown-model")
+		expect(cost).toBe(0)
+	})
+
+	it("should return 0 when invokedModelId is not provided", () => {
+		// Since we've removed the fallback to model-based cost calculation,
+		// this should return 0
+		const cost = getCost("bedrock", "test prompt", "any-model", 1000)
+		expect(cost).toBe(0)
+	})
+
+	it("should handle intelligent prompt router ARN format", () => {
+		// Test with a full ARN from an intelligent prompt router
+		// For 1000 tokens with 25% input (250) and 75% output (750)
+		// Claude-3-5-sonnet: (0.003/1000 * 250) + (0.015/1000 * 750) = 0.00075 + 0.01125 = 0.012
+		const cost = getCost(
+			"bedrock",
+			"test prompt",
+			"custom-arn",
+			1000,
+			"arn:aws:bedrock:us-west-2:699475926481:inference-profile/us.anthropic.claude-3-5-sonnet-20240620-v1:0",
+		)
+		expect(cost).toBeCloseTo(0.012, 5)
+	})
+
+	it("should return the correct cost for Amazon Nova Pro", () => {
+		// For 1000 tokens with 25% input (250) and 75% output (750)
+		// Amazon Nova Pro: (0.0008/1000 * 250) + (0.0032/1000 * 750) = 0.0002 + 0.0024 = 0.0026
+		const cost = getCost("bedrock", "test prompt", "any-model", 1000, "amazon.nova-pro")
+		expect(cost).toBeCloseTo(0.0026, 5)
+	})
+
+	it("should return the correct cost for Amazon Nova Micro", () => {
+		// For 1000 tokens with 25% input (250) and 75% output (750)
+		// Amazon Nova Micro: (0.000035/1000 * 250) + (0.00014/1000 * 750) = 0.00000875 + 0.000105 = 0.00011375
+		const cost = getCost("bedrock", "test prompt", "any-model", 1000, "amazon.nova-micro")
+		expect(cost).toBeCloseTo(0.00011375, 8)
+	})
+
+	it("should return the correct cost for Amazon Titan Text Express", () => {
+		// For 1000 tokens with 25% input (250) and 75% output (750)
+		// Amazon Titan Text Express: (0.0002/1000 * 250) + (0.0006/1000 * 750) = 0.00005 + 0.00045 = 0.0005
+		const cost = getCost("bedrock", "test prompt", "any-model", 1000, "amazon.titan-text-express")
+		expect(cost).toBeCloseTo(0.0005, 5)
+	})
+
+	it("should return the correct cost for Amazon Titan Text Lite", () => {
+		// For 1000 tokens with 25% input (250) and 75% output (750)
+		// Amazon Titan Text Lite: (0.00015/1000 * 250) + (0.0002/1000 * 750) = 0.0000375 + 0.00015 = 0.0001875
+		const cost = getCost("bedrock", "test prompt", "any-model", 1000, "amazon.titan-text-lite")
+		expect(cost).toBeCloseTo(0.0001875, 7)
+	})
+
+	it("should return the correct cost for Amazon Titan Text Embeddings", () => {
+		// For embeddings, with the default 1:3 input/output split (250 input, 750 output)
+		// Amazon Titan Text Embeddings: (0.0001/1000 * 250) = 0.000025
+		// Note: Even though embeddings don't have output tokens, the getCost function
+		// still splits tokens using a 1:3 ratio by default
+		const cost = getCost("bedrock", "test prompt", "any-model", 1000, "amazon.titan-text-embeddings")
+		expect(cost).toBeCloseTo(0.000025, 6)
+	})
+
+	it("should return the correct cost for Llama 3.2 (11B)", () => {
+		// For 1000 tokens with 25% input (250) and 75% output (750)
+		// Llama 3.2 (11B): (0.00016/1000 * 250) + (0.00016/1000 * 750) = 0.00004 + 0.00012 = 0.00016
+		const cost = getCost("bedrock", "test prompt", "any-model", 1000, "llama-3.2-11b")
+		expect(cost).toBeCloseTo(0.00016, 6)
+	})
+
+	it("should return the correct cost for Llama 3.2 (90B)", () => {
+		// For 1000 tokens with 25% input (250) and 75% output (750)
+		// Llama 3.2 (90B): (0.00072/1000 * 250) + (0.00072/1000 * 750) = 0.00018 + 0.00054 = 0.00072
+		const cost = getCost("bedrock", "test prompt", "any-model", 1000, "llama-3.2-90b")
+		expect(cost).toBeCloseTo(0.00072, 6)
+	})
+
+	it("should return the correct cost for Llama 3.3 (70B)", () => {
+		// For 1000 tokens with 25% input (250) and 75% output (750)
+		// Llama 3.3 (70B): (0.00072/1000 * 250) + (0.00072/1000 * 750) = 0.00018 + 0.00054 = 0.00072
+		const cost = getCost("bedrock", "test prompt", "any-model", 1000, "llama-3.3-70b")
+		expect(cost).toBeCloseTo(0.00072, 6)
+	})
+})
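For reference, the default token split that most of these expectations rely on is easy to verify by hand. A minimal standalone sketch (plain TypeScript, not part of the diff) of the 25%/75% estimate and the claude-3-5-sonnet arithmetic:

```ts
// Reproduces getCost's default 1:3 input/output split and the
// claude-3-5-sonnet pricing used in the test expectations above.
const totalTokens = 1000
const estimatedInputTokens = Math.round(totalTokens * 0.25) // 250
const estimatedOutputTokens = totalTokens - estimatedInputTokens // 750

// $0.003 per 1000 input tokens, $0.015 per 1000 output tokens
const cost = (0.003 / 1000) * estimatedInputTokens + (0.015 / 1000) * estimatedOutputTokens
console.log(cost.toFixed(3)) // "0.012"
```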
diff --git a/src/api/providers/bedrock.ts b/src/api/providers/bedrock.ts
index 76d93649604..7e024179a10 100644
--- a/src/api/providers/bedrock.ts
+++ b/src/api/providers/bedrock.ts
@@ -86,6 +86,11 @@ export interface StreamEvent {
 			latencyMs: number
 		}
 	}
+	trace?: {
+		promptRouter?: {
+			invokedModelId?: string
+		}
+	}
 }
 
 export class AwsBedrockHandler extends BaseProvider implements SingleCompletionHandler {
@@ -252,10 +257,49 @@ export class AwsBedrockHandler extends BaseProvider implements SingleCompletionH
 			// Handle metadata events first
 			if (streamEvent.metadata?.usage) {
+				// Check if this is a response from an intelligent prompt router
+				const invokedModelId = streamEvent.trace?.promptRouter?.invokedModelId
+
+				// If invokedModelId is present, extract it from the ARN format
+				let modelIdForCost: string | undefined
+				if (invokedModelId) {
+					// Extract the model name from the ARN
+					// Example ARN: arn:aws:bedrock:us-west-2:699475926481:inference-profile/us.anthropic.claude-3-5-sonnet-20240620-v1:0
+					const modelMatch = invokedModelId.match(/\/([^\/]+)(?::|$)/)
+					if (modelMatch && modelMatch[1]) {
+						const modelName = modelMatch[1]
+
+						// Map the model name to the format expected by the cost calculation function
+						if (modelName.includes("claude-3-5-sonnet")) {
+							modelIdForCost = "claude-3-5-sonnet"
+						} else if (modelName.includes("claude-3-sonnet")) {
+							modelIdForCost = "claude-3-sonnet"
+						} else if (modelName.includes("claude-3-opus")) {
+							modelIdForCost = "claude-3-opus"
+						} else if (modelName.includes("claude-3-haiku")) {
+							modelIdForCost = "claude-3-haiku"
+						} else if (modelName.includes("claude-3-5-haiku")) {
+							modelIdForCost = "claude-3-5-haiku"
+						} else if (modelName.includes("claude-3-7-sonnet")) {
+							modelIdForCost = "claude-3-7-sonnet"
+						}
+
+						logger.debug("Extracted model ID from intelligent prompt router", {
+							ctx: "bedrock",
+							originalArn: invokedModelId,
+							extractedModelId: modelIdForCost,
+						})
+					}
+				}
+
+				const inputTokens = streamEvent.metadata.usage.inputTokens || 0
+				const outputTokens = streamEvent.metadata.usage.outputTokens || 0
+
 				yield {
 					type: "usage",
-					inputTokens: streamEvent.metadata.usage.inputTokens || 0,
-					outputTokens: streamEvent.metadata.usage.outputTokens || 0,
+					inputTokens: inputTokens,
+					outputTokens: outputTokens,
+					invokedModelId: modelIdForCost,
 				}
 				continue
 			}
 
@@ -491,6 +535,22 @@ Please check:
 				supportsPromptCache: false,
 				supportsImages: true,
 			}
+		} else if (arnLower.includes("llama3.3") || arnLower.includes("llama-3.3")) {
+			// Llama 3.3 models
+			modelInfo = {
+				maxTokens: 8192,
+				contextWindow: 128_000,
+				supportsPromptCache: true,
+				supportsImages: true,
+			}
+		} else if (arnLower.includes("llama3.2") || arnLower.includes("llama-3.2")) {
+			// Llama 3.2 models
+			modelInfo = {
+				maxTokens: 8192,
+				contextWindow: 128_000,
+				supportsPromptCache: true,
+				supportsImages: arnLower.includes("90b") || arnLower.includes("11b"),
+			}
 		} else if (arnLower.includes("llama3") || arnLower.includes("llama-3")) {
 			// Llama 3 models typically have 8192 tokens in Bedrock
 			modelInfo = {
@@ -499,6 +559,46 @@ Please check:
 				supportsPromptCache: false,
 				supportsImages: arnLower.includes("90b") || arnLower.includes("11b"),
 			}
+		} else if (arnLower.includes("titan-text-lite")) {
+			// Amazon Titan Text Lite
+			modelInfo = {
+				maxTokens: 4096,
+				contextWindow: 8_000,
+				supportsPromptCache: false,
+				supportsImages: false,
+			}
+		} else if (arnLower.includes("titan-text-express")) {
+			// Amazon Titan Text Express
+			modelInfo = {
+				maxTokens: 4096,
+				contextWindow: 8_000,
+				supportsPromptCache: false,
+				supportsImages: false,
+			}
+		} else if (arnLower.includes("titan-text-embeddings")) {
+			// Amazon Titan Text Embeddings
+			modelInfo = {
+				maxTokens: 8192,
+				contextWindow: 8_000,
+				supportsPromptCache: false,
+				supportsImages: false,
+			}
+		} else if (arnLower.includes("nova-micro")) {
+			// Amazon Nova Micro
+			modelInfo = {
+				maxTokens: 4096,
+				contextWindow: 128_000,
+				supportsPromptCache: false,
+				supportsImages: false,
+			}
+		} else if (arnLower.includes("nova-lite")) {
+			// Amazon Nova Lite
+			modelInfo = {
+				maxTokens: 4096,
+				contextWindow: 128_000,
+				supportsPromptCache: false,
+				supportsImages: false,
+			}
 		} else if (arnLower.includes("nova-pro")) {
 			// Amazon Nova Pro
 			modelInfo = {
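A quick self-contained check of the ARN-parsing regex shared by both new code paths (illustrative only). Note that the greedy `[^\/]+` also consumes the trailing `:0` version suffix, so the capture keeps it; the substring checks that follow, such as `includes("claude-3-5-sonnet")`, still match, so cost lookup is unaffected:

```ts
// The regex from the diff, applied to the example ARN from its comments.
const arn =
	"arn:aws:bedrock:us-west-2:699475926481:inference-profile/us.anthropic.claude-3-5-sonnet-20240620-v1:0"
const modelMatch = arn.match(/\/([^\/]+)(?::|$)/)
// The greedy character class runs to the end of the string, so the
// version suffix survives in the capture:
console.log(modelMatch?.[1]) // "us.anthropic.claude-3-5-sonnet-20240620-v1:0"
```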
diff --git a/src/api/providers/cost.ts b/src/api/providers/cost.ts
new file mode 100644
index 00000000000..041ad5660a4
--- /dev/null
+++ b/src/api/providers/cost.ts
@@ -0,0 +1,298 @@
+export function getCost(
+	provider:
+		| "anthropic"
+		| "bedrock"
+		| "openai"
+		| "openai-native"
+		| "vertex"
+		| "unbound"
+		| "glama"
+		| "deepseek"
+		| "ollama"
+		| "mistral"
+		| "requesty"
+		| "gemini"
+		| "human-relay"
+		| "lmstudio"
+		| "openrouter",
+	prompt: string,
+	model: string,
+	tokens: number,
+	invokedModelId?: string,
+	inputTokens?: number,
+	outputTokens?: number,
+	cacheWriteTokens?: number,
+	cacheReadTokens?: number,
+): number {
+	// For Bedrock, handle the case where we have actual input and output token counts
+	if (provider === "bedrock") {
+		// If we have specific input and output token counts, use them
+		if (inputTokens !== undefined && outputTokens !== undefined) {
+			return getBedrockCost(
+				model,
+				inputTokens,
+				outputTokens,
+				invokedModelId,
+				cacheWriteTokens || 0,
+				cacheReadTokens || 0,
+			)
+		}
+
+		// Otherwise, split the tokens between input and output using a 1:3 ratio
+		// This is a reasonable approximation for most conversational use cases
+		const estimatedInputTokens = Math.round(tokens * 0.25)
+		const estimatedOutputTokens = tokens - estimatedInputTokens
+		return getBedrockCost(
+			model,
+			estimatedInputTokens,
+			estimatedOutputTokens,
+			invokedModelId,
+			cacheWriteTokens || 0,
+			cacheReadTokens || 0,
+		)
+	}
+
+	switch (provider) {
+		case "anthropic":
+			return getAnthropicCost(model, tokens)
+		// ... other provider cases
+		default:
+			return 0
+	}
+}
+
+function getBedrockCost(
+	model: string,
+	inputTokens: number,
+	outputTokens: number,
+	invokedModelId?: string,
+	cacheWriteTokens: number = 0,
+	cacheReadTokens: number = 0,
+): number {
+	if (invokedModelId) {
+		// Extract model name from ARN if applicable
+		let modelIdentifier = invokedModelId
+
+		// Check if invokedModelId is an ARN from an intelligent prompt router
+		if (invokedModelId.startsWith("arn:aws:bedrock:")) {
+			// Example ARN: arn:aws:bedrock:us-west-2:699475926481:inference-profile/us.anthropic.claude-3-5-sonnet-20240620-v1:0
+			const modelMatch = invokedModelId.match(/\/([^\/]+)(?::|$)/)
+			if (modelMatch && modelMatch[1]) {
+				modelIdentifier = modelMatch[1]
+			}
+		}
+
+		// Model costs as of March 11, 2025 for US regions (us-west-2 where a specific
+		// region is required), per https://aws.amazon.com/bedrock/pricing/
+		//
+		// Define model costs with separate input, output, cache write, and cache read prices per 1000 tokens
+		const modelCosts = [
+			// Claude models - prices from AWS Bedrock pricing documentation
+			{
+				ids: ["claude-3-7-sonnet", "claude-3.7-sonnet"],
+				inputPrice: 0.003,
+				outputPrice: 0.015,
+				cacheWritePrice: 0.00375,
+				cacheReadPrice: 0.0003,
+			},
+			{
+				ids: ["claude-3-5-sonnet", "claude-3.5-sonnet"],
+				inputPrice: 0.003,
+				outputPrice: 0.015,
+				cacheWritePrice: 0.00375,
+				cacheReadPrice: 0.0003,
+			},
+			{
+				ids: ["claude-3-5-haiku", "claude-3.5-haiku"],
+				inputPrice: 0.0008,
+				outputPrice: 0.004,
+				cacheWritePrice: 0.001,
+				cacheReadPrice: 0.00008,
+			},
+			{
+				ids: ["claude-3-opus", "claude-3.0-opus"],
+				inputPrice: 0.015,
+				outputPrice: 0.075,
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+			{
+				ids: ["claude-3-haiku", "claude-3.0-haiku"],
+				inputPrice: 0.00025,
+				outputPrice: 0.00125,
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+			{
+				ids: ["claude-3-sonnet", "claude-3.0-sonnet"],
+				inputPrice: 0.003,
+				outputPrice: 0.015,
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+			{
+				ids: ["claude-2-1", "claude-2.1"],
+				inputPrice: 0.008,
+				outputPrice: 0.024,
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+			{
+				ids: ["claude-2"],
+				inputPrice: 0.008,
+				outputPrice: 0.024,
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+			{
+				ids: ["claude-instant"],
+				inputPrice: 0.0008,
+				outputPrice: 0.0024,
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+
+			// Note: GPT models removed as they are not supported on Bedrock
+
+			// Llama models
+			{
+				ids: ["llama-3-8b", "llama-3.0-8b"],
+				inputPrice: 0.0001,
+				outputPrice: 0.0002,
+				cacheWritePrice: 0.00005,
+				cacheReadPrice: 0.00001,
+			},
+			{
+				ids: ["llama-3-70b", "llama-3.0-70b"],
+				inputPrice: 0.0003,
+				outputPrice: 0.0006,
+				cacheWritePrice: 0.00015,
+				cacheReadPrice: 0.00003,
+			},
+			// Llama 3.2 models
+			{
+				ids: ["llama-3.2-1b", "llama-3.2-1b-instruct"],
+				inputPrice: 0.0001,
+				outputPrice: 0.0001,
+				cacheWritePrice: 0.00005,
+				cacheReadPrice: 0.00005,
+			},
+			{
+				ids: ["llama-3.2-3b", "llama-3.2-3b-instruct"],
+				inputPrice: 0.00015,
+				outputPrice: 0.00015,
+				cacheWritePrice: 0.000075,
+				cacheReadPrice: 0.000075,
+			},
+			{
+				ids: ["llama-3.2-11b", "llama-3.2-11b-instruct"],
+				inputPrice: 0.00016,
+				outputPrice: 0.00016,
+				cacheWritePrice: 0.00008,
+				cacheReadPrice: 0.00008,
+			},
+			{
+				ids: ["llama-3.2-90b", "llama-3.2-90b-instruct"],
+				inputPrice: 0.00072,
+				outputPrice: 0.00072,
+				cacheWritePrice: 0.00036,
+				cacheReadPrice: 0.00036,
+			},
+			// Llama 3.3 models
+			{
+				ids: ["llama-3.3-70b", "llama-3.3-70b-instruct"],
+				inputPrice: 0.00072,
+				outputPrice: 0.00072,
+				cacheWritePrice: 0.00036,
+				cacheReadPrice: 0.00036,
+			},
+
+			// Amazon Titan models
+			{
+				ids: ["amazon.titan-text-lite", "titan-text-lite"],
+				inputPrice: 0.00015,
+				outputPrice: 0.0002,
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+			{
+				ids: ["amazon.titan-text-express", "titan-text-express"],
+				inputPrice: 0.0002,
+				outputPrice: 0.0006,
+				cacheWritePrice: 0, // Updated pricing, cache may not be available
+				cacheReadPrice: 0, // Updated pricing, cache may not be available
+			},
+			{
+				ids: ["amazon.titan-text-embeddings", "titan-text-embeddings"],
+				inputPrice: 0.0001,
+				outputPrice: 0, // Embeddings don't have output tokens
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+			{
+				ids: ["amazon.titan-text-embeddings-v2", "titan-text-embeddings-v2"],
+				inputPrice: 0.00002,
+				outputPrice: 0, // Embeddings don't have output tokens
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+			// Amazon Nova models
+			{
+				ids: ["amazon.nova-micro", "nova-micro"],
+				inputPrice: 0.000035,
+				outputPrice: 0.00014,
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+			{
+				ids: ["amazon.nova-lite", "nova-lite"],
+				inputPrice: 0.00006,
+				outputPrice: 0.00024,
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+			{
+				ids: ["amazon.nova-pro", "nova-pro"],
+				inputPrice: 0.0008,
+				outputPrice: 0.0032,
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+			{
+				ids: ["amazon.nova-pro-loi", "nova-pro-loi"],
+				inputPrice: 0.001,
+				outputPrice: 0.004,
+				cacheWritePrice: 0, // Not available for this model
+				cacheReadPrice: 0, // Not available for this model
+			},
+		]
+
+		// Find matching model - check if modelIdentifier matches any of the ids in the ids array
+		const matchedModel = modelCosts.find((m) =>
+			m.ids.some((id) => modelIdentifier === id || modelIdentifier.includes(id)),
+		)
+
+		if (matchedModel) {
+			const inputCost = (matchedModel.inputPrice / 1000) * inputTokens
+			const outputCost = (matchedModel.outputPrice / 1000) * outputTokens
+			const cacheWriteCost = (matchedModel.cacheWritePrice / 1000) * cacheWriteTokens
+			const cacheReadCost = (matchedModel.cacheReadPrice / 1000) * cacheReadTokens
+
+			return inputCost + outputCost + cacheWriteCost + cacheReadCost
+		}
+
+		// If we don't have a specific price for this model, log it and return 0
+		console.warn(`Unknown invokedModelId for cost calculation: ${invokedModelId}`)
+		return 0
+	} else {
+		// No fallback model-based cost calculation for Bedrock
+		// GPT models are not supported on Bedrock
+		return 0
+	}
+}
+
+function getAnthropicCost(model: string, tokens: number): number {
+	// Anthropic cost calculation logic
+	return 0
+}
+
+export const parseApiPrice = (price: any) => (price ? parseFloat(price) * 1_000_000 : undefined)
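As a sanity check on the per-1000-token arithmetic in `getBedrockCost`, the cache-aware claude-3-5-sonnet case from the tests works out by hand as follows (a sketch mirroring the pricing table above, not an excerpt of the implementation):

```ts
// claude-3-5-sonnet per the table above (USD per 1000 tokens):
// input 0.003, output 0.015, cache write 0.00375, cache read 0.0003
const cost =
	(0.003 / 1000) * 300 + // input: 0.0009
	(0.015 / 1000) * 700 + // output: 0.0105
	(0.00375 / 1000) * 200 + // cache write: 0.00075
	(0.0003 / 1000) * 100 // cache read: 0.00003
console.log(cost.toFixed(5)) // "0.01218", matching the test expectation
```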
diff --git a/src/api/transform/stream.ts b/src/api/transform/stream.ts
index 97751edd90d..4fb16b425ff 100644
--- a/src/api/transform/stream.ts
+++ b/src/api/transform/stream.ts
@@ -18,4 +18,5 @@ export interface ApiStreamUsageChunk {
 	cacheWriteTokens?: number
 	cacheReadTokens?: number
 	totalCost?: number // openrouter
+	invokedModelId?: string // For intelligent prompt router responses
 }
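Downstream, a consumer of the usage chunk can forward the new optional field straight into `getCost`. A minimal sketch of such a call site, assuming `ApiStreamUsageChunk` also carries the required `inputTokens`/`outputTokens` fields seen in the Bedrock handler's yield (the `costForUsage` helper is hypothetical, not part of this PR; import paths assume the file layout above):

```ts
import { getCost } from "../providers/cost"
import type { ApiStreamUsageChunk } from "./stream"

// Hypothetical helper: prices a Bedrock usage chunk, letting the
// prompt-router model ID (when present) select the pricing row.
function costForUsage(chunk: ApiStreamUsageChunk): number {
	return getCost(
		"bedrock",
		"", // prompt is not used by the Bedrock cost path
		"any-model",
		chunk.inputTokens + chunk.outputTokens,
		chunk.invokedModelId,
		chunk.inputTokens,
		chunk.outputTokens,
		chunk.cacheWriteTokens ?? 0,
		chunk.cacheReadTokens ?? 0,
	)
}
```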