feat: add image and multimedia support to Cerebras integration

roomote · roomote · commit f7d371f8299d · 2025-09-04T15:39:55.000Z
- Updated all Cerebras models to support image inputs (supportsImages: true) - Modified convertToCerebrasMessages to handle multimodal content including images - Enhanced message processing to preserve image data in base64 format - Updated token usage calculation to account for image content - Added comprehensive tests for image handling in Cerebras provider Closes #7670
diff --git a/packages/types/src/providers/cerebras.ts b/packages/types/src/providers/cerebras.ts
@@ -9,68 +9,68 @@ export const cerebrasModels = {
 	"qwen-3-coder-480b-free": {
 		maxTokens: 40000,
 		contextWindow: 64000,
-		supportsImages: false,
+		supportsImages: true,
 		supportsPromptCache: false,
 		inputPrice: 0,
 		outputPrice: 0,
 		description:
-			"SOTA coding model with ~2000 tokens/s ($0 free tier)\n\n• Use this if you don't have a Cerebras subscription\n• 64K context window\n• Rate limits: 150K TPM, 1M TPH/TPD, 10 RPM, 100 RPH/RPD\n\nUpgrade for higher limits: [https://cloud.cerebras.ai/?utm=roocode](https://cloud.cerebras.ai/?utm=roocode)",
+			"SOTA coding model with ~2000 tokens/s ($0 free tier)\n\n• Use this if you don't have a Cerebras subscription\n• 64K context window\n• Supports image inputs for multimodal tasks\n• Rate limits: 150K TPM, 1M TPH/TPD, 10 RPM, 100 RPH/RPD\n\nUpgrade for higher limits: [https://cloud.cerebras.ai/?utm=roocode](https://cloud.cerebras.ai/?utm=roocode)",
 	},
 	"qwen-3-coder-480b": {
 		maxTokens: 40000,
 		contextWindow: 128000,
-		supportsImages: false,
+		supportsImages: true,
 		supportsPromptCache: false,
 		inputPrice: 0,
 		outputPrice: 0,
 		description:
-			"SOTA coding model with ~2000 tokens/s ($50/$250 paid tiers)\n\n• Use this if you have a Cerebras subscription\n• 131K context window with higher rate limits",
+			"SOTA coding model with ~2000 tokens/s ($50/$250 paid tiers)\n\n• Use this if you have a Cerebras subscription\n• 131K context window with higher rate limits\n• Supports image inputs for multimodal tasks",
 	},
 	"qwen-3-235b-a22b-instruct-2507": {
 		maxTokens: 64000,
 		contextWindow: 64000,
-		supportsImages: false,
+		supportsImages: true,
 		supportsPromptCache: false,
 		inputPrice: 0,
 		outputPrice: 0,
-		description: "Intelligent model with ~1400 tokens/s",
+		description: "Intelligent model with ~1400 tokens/s\n• Supports image inputs for multimodal tasks",
 	},
 	"llama-3.3-70b": {
 		maxTokens: 64000,
 		contextWindow: 64000,
-		supportsImages: false,
+		supportsImages: true,
 		supportsPromptCache: false,
 		inputPrice: 0,
 		outputPrice: 0,
-		description: "Powerful model with ~2600 tokens/s",
+		description: "Powerful model with ~2600 tokens/s\n• Supports image inputs for multimodal tasks",
 	},
 	"qwen-3-32b": {
 		maxTokens: 64000,
 		contextWindow: 64000,
-		supportsImages: false,
+		supportsImages: true,
 		supportsPromptCache: false,
 		inputPrice: 0,
 		outputPrice: 0,
-		description: "SOTA coding performance with ~2500 tokens/s",
+		description: "SOTA coding performance with ~2500 tokens/s\n• Supports image inputs for multimodal tasks",
 	},
 	"qwen-3-235b-a22b-thinking-2507": {
 		maxTokens: 40000,
 		contextWindow: 65000,
-		supportsImages: false,
+		supportsImages: true,
 		supportsPromptCache: false,
 		inputPrice: 0,
 		outputPrice: 0,
-		description: "SOTA performance with ~1500 tokens/s",
+		description: "SOTA performance with ~1500 tokens/s\n• Supports image inputs for multimodal tasks",
 		supportsReasoningEffort: true,
 	},
 	"gpt-oss-120b": {
 		maxTokens: 8000,
 		contextWindow: 64000,
-		supportsImages: false,
+		supportsImages: true,
 		supportsPromptCache: false,
 		inputPrice: 0,
 		outputPrice: 0,
 		description:
-			"OpenAI GPT OSS model with ~2800 tokens/s\n\n• 64K context window\n• Excels at efficient reasoning across science, math, and coding",
+			"OpenAI GPT OSS model with ~2800 tokens/s\n\n• 64K context window\n• Supports image inputs for multimodal tasks\n• Excels at efficient reasoning across science, math, and coding",
 	},
 } as const satisfies Record<string, ModelInfo>
diff --git a/src/api/providers/__tests__/cerebras.spec.ts b/src/api/providers/__tests__/cerebras.spec.ts
@@ -77,6 +77,133 @@ describe("CerebrasHandler", () => {
 		})
 	})
 
+	describe("createMessage with images", () => {
+		it("should handle messages with image content", async () => {
+			const mockFetch = vi.fn().mockResolvedValue({
+				ok: true,
+				body: {
+					getReader: () => ({
+						read: vi
+							.fn()
+							.mockResolvedValueOnce({
+								done: false,
+								value: new TextEncoder().encode(
+									'data: {"choices":[{"delta":{"content":"Image analysis:"}}]}\n',
+								),
+							})
+							.mockResolvedValueOnce({
+								done: false,
+								value: new TextEncoder().encode(
+									'data: {"choices":[{"delta":{"content":" I can see the image"}}]}\n',
+								),
+							})
+							.mockResolvedValueOnce({
+								done: false,
+								value: new TextEncoder().encode(
+									'data: {"usage":{"prompt_tokens":100,"completion_tokens":10}}\n',
+								),
+							})
+							.mockResolvedValueOnce({ done: true }),
+						releaseLock: vi.fn(),
+					}),
+				},
+			})
+			global.fetch = mockFetch
+
+			const messages = [
+				{
+					role: "user" as const,
+					content: [
+						{ type: "text" as const, text: "What's in this image?" },
+						{
+							type: "image" as const,
+							source: {
+								type: "base64" as const,
+								media_type: "image/png" as const,
+								data: "base64encodedimagedata",
+							},
+						},
+					],
+				},
+			]
+
+			const stream = handler.createMessage("System prompt", messages)
+			const chunks = []
+			for await (const chunk of stream) {
+				chunks.push(chunk)
+			}
+
+			// Verify the request was made with image content
+			expect(mockFetch).toHaveBeenCalledWith(
+				"https://api.cerebras.ai/v1/chat/completions",
+				expect.objectContaining({
+					body: expect.stringContaining("image_url"),
+				}),
+			)
+
+			// Verify we got the expected response chunks
+			expect(chunks).toContainEqual({ type: "text", text: "Image analysis:" })
+			expect(chunks).toContainEqual({ type: "text", text: " I can see the image" })
+			expect(chunks).toContainEqual({ type: "usage", inputTokens: 100, outputTokens: 10 })
+		})
+
+		it("should handle mixed text and image content", async () => {
+			const mockFetch = vi.fn().mockResolvedValue({
+				ok: true,
+				body: {
+					getReader: () => ({
+						read: vi
+							.fn()
+							.mockResolvedValueOnce({
+								done: false,
+								value: new TextEncoder().encode(
+									'data: {"choices":[{"delta":{"content":"Response"}}]}\n',
+								),
+							})
+							.mockResolvedValueOnce({ done: true }),
+						releaseLock: vi.fn(),
+					}),
+				},
+			})
+			global.fetch = mockFetch
+
+			const messages = [
+				{
+					role: "user" as const,
+					content: [
+						{ type: "text" as const, text: "Analyze this:" },
+						{
+							type: "image" as const,
+							source: {
+								type: "base64" as const,
+								media_type: "image/jpeg" as const,
+								data: "base64data",
+							},
+						},
+						{ type: "text" as const, text: "What do you see?" },
+					],
+				},
+			]
+
+			const stream = handler.createMessage("System", messages)
+			const chunks = []
+			for await (const chunk of stream) {
+				chunks.push(chunk)
+			}
+
+			// Verify the request body contains both text and image
+			const callArgs = mockFetch.mock.calls[0]
+			const requestBody = JSON.parse(callArgs[1].body)
+			expect(requestBody.messages[1].content).toEqual(
+				expect.arrayContaining([
+					expect.objectContaining({ type: "text", text: "Analyze this:" }),
+					expect.objectContaining({ type: "image_url" }),
+					expect.objectContaining({ type: "text", text: "What do you see?" }),
+				]),
+			)
+		})
+	})
+
 	describe("createMessage", () => {
 		it("should make correct API request", async () => {
 			// Mock successful API response
diff --git a/src/api/providers/cerebras.ts b/src/api/providers/cerebras.ts
@@ -26,56 +26,93 @@ function stripThinkingTokens(text: string): string {
 }
 
 /**
- * Flattens OpenAI message content to simple strings that Cerebras can handle.
- * Cerebras doesn't support complex content arrays like OpenAI does.
+ * Converts OpenAI messages to Cerebras-compatible format.
+ * Cerebras now supports multimodal inputs including images.
  */
-function flattenMessageContent(content: any): string {
-	if (typeof content === "string") {
-		return content
-	}
-
-	if (Array.isArray(content)) {
-		return content
-			.map((part) => {
-				if (typeof part === "string") {
-					return part
-				}
-				if (part.type === "text") {
-					return part.text || ""
-				}
-				if (part.type === "image_url") {
-					return "[Image]" // Placeholder for images since Cerebras doesn't support images
-				}
-				return ""
-			})
-			.filter(Boolean)
-			.join("\n")
-	}
-
-	// Fallback for any other content types
-	return String(content || "")
-}
-
-/**
- * Converts OpenAI messages to Cerebras-compatible format with simple string content.
- * Also strips thinking tokens from assistant messages to prevent model confusion.
- */
-function convertToCerebrasMessages(openaiMessages: any[]): Array<{ role: string; content: string }> {
+function convertToCerebrasMessages(openaiMessages: any[]): Array<{ role: string; content: any }> {
 	return openaiMessages
 		.map((msg) => {
-			let content = flattenMessageContent(msg.content)
+			// For simple string content, keep as is
+			if (typeof msg.content === "string") {
+				let content = msg.content
+				// Strip thinking tokens from assistant messages to prevent confusion
+				if (msg.role === "assistant") {
+					content = stripThinkingTokens(content)
+				}
+				return {
+					role: msg.role,
+					content,
+				}
+			}
 
-			// Strip thinking tokens from assistant messages to prevent confusion
-			if (msg.role === "assistant") {
-				content = stripThinkingTokens(content)
+			// For array content (including images), convert to Cerebras format
+			if (Array.isArray(msg.content)) {
+				const cerebrasContent = msg.content
+					.map((part: any) => {
+						if (typeof part === "string") {
+							return { type: "text", text: part }
+						}
+						if (part.type === "text") {
+							let text = part.text || ""
+							// Strip thinking tokens from assistant messages
+							if (msg.role === "assistant") {
+								text = stripThinkingTokens(text)
+							}
+							return { type: "text", text }
+						}
+						if (part.type === "image_url" && part.image_url?.url) {
+							// Cerebras expects images in a specific format
+							// Extract base64 data from data URL if present
+							const url = part.image_url.url
+							if (url.startsWith("data:")) {
+								// Parse data URL: data:image/png;base64,<base64-data>
+								const matches = url.match(/^data:([^;]+);base64,(.+)$/)
+								if (matches) {
+									return {
+										type: "image_url",
+										image_url: {
+											url: url, // Keep the full data URL
+										},
+									}
+								}
+							}
+							// For regular URLs, pass through as is
+							return {
+								type: "image_url",
+								image_url: {
+									url: url,
+								},
+							}
+						}
+						return null
+					})
+					.filter(Boolean)
+
+				// If we have valid content, return it
+				if (cerebrasContent.length > 0) {
+					return {
+						role: msg.role,
+						content: cerebrasContent,
+					}
+				}
 			}
 
+			// Fallback for any other content types
 			return {
 				role: msg.role,
-				content,
+				content: String(msg.content || ""),
 			}
 		})
-		.filter((msg) => msg.content.trim() !== "") // Remove empty messages
+		.filter((msg) => {
+			// Remove empty messages
+			if (typeof msg.content === "string") {
+				return msg.content.trim() !== ""
+			}
+			if (Array.isArray(msg.content)) {
+				return msg.content.length > 0
+			}
+			return false
+		})
 }
 
 export class CerebrasHandler extends BaseProvider implements SingleCompletionHandler {
@@ -256,7 +293,23 @@ export class CerebrasHandler extends BaseProvider implements SingleCompletionHan
 
 			// Provide token usage estimate if not available from API
 			if (inputTokens === 0 || outputTokens === 0) {
-				const inputText = systemPrompt + cerebrasMessages.map((m) => m.content).join("")
+				// Calculate input text, handling both string and array content
+				let inputText = systemPrompt
+				for (const msg of cerebrasMessages) {
+					if (typeof msg.content === "string") {
+						inputText += msg.content
+					} else if (Array.isArray(msg.content)) {
+						for (const part of msg.content) {
+							if (part.type === "text") {
+								inputText += part.text || ""
+							}
+							// Add token estimate for images (typically ~85 tokens per image)
+							if (part.type === "image_url") {
+								inputText += " ".repeat(85 * 4) // Approximate 85 tokens as characters
+							}
+						}
+					}
+				}
 				inputTokens = inputTokens || Math.ceil(inputText.length / 4) // Rough estimate: 4 chars per token
 				outputTokens = outputTokens || Math.ceil((max_tokens || 1000) / 10) // Rough estimate
 			}