Skip to content

Commit 07c231f

Browse files
authored
feat: add multimodal message support with image data URI handling (#497)
* feat: add multimodal message support with image data URI handling
* feat: enhance ensureDataURIPrefix to detect MIME types and trim whitespace
1 parent 41dd3f1 commit 07c231f

File tree

2 files changed

+305
-7
lines changed

2 files changed

+305
-7
lines changed

pkg/ollama/http_handler.go

Lines changed: 71 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -720,13 +720,82 @@ func (h *HTTPHandler) mapOllamaOptionsToOpenAI(ollamaOpts map[string]interface{}
720720
// as it requires a special ConfigureRunner call
721721
}
722722

723+
// ensureDataURIPrefix normalizes an image reference so that it can be used as
// the "url" of an OpenAI-style image_url content part.
//
// OpenWebUI may send raw base64 data without a prefix, but llama.cpp requires
// one. The input is handled as follows:
//   - Surrounding whitespace is trimmed.
//   - Values that already start with "data:", "http://" or "https://" are
//     returned as-is (after trimming). The scheme is matched
//     case-insensitively because URI schemes are case-insensitive.
//   - Anything else is assumed to be raw base64. The MIME type is detected
//     from the leading base64 characters (JPEG, PNG, GIF, WebP), defaulting
//     to image/jpeg, and "data:<mime>;base64," is prepended.
//
// An empty input therefore yields "data:image/jpeg;base64,".
func ensureDataURIPrefix(imageData string) string {
	// Trim whitespace that might come from UIs.
	imageData = strings.TrimSpace(imageData)

	// Pass through anything that already carries a supported URI scheme.
	for _, scheme := range []string{"data:", "http://", "https://"} {
		if len(imageData) >= len(scheme) && strings.EqualFold(imageData[:len(scheme)], scheme) {
			return imageData
		}
	}

	// Detect the MIME type from the base64 encoding of each format's magic
	// bytes; unknown formats keep the jpeg default.
	mimeType := "image/jpeg"
	switch {
	case strings.HasPrefix(imageData, "/9j/"): // bytes FF D8 FF
		mimeType = "image/jpeg"
	case strings.HasPrefix(imageData, "iVBOR"): // bytes 89 'P' 'N' 'G'
		mimeType = "image/png"
	case strings.HasPrefix(imageData, "R0lG"): // bytes "GIF"
		mimeType = "image/gif"
	case strings.HasPrefix(imageData, "UklGR"): // bytes "RIFF", the WebP container header
		mimeType = "image/webp"
	}

	// Assume raw base64 data - add data URI prefix with detected MIME type.
	return "data:" + mimeType + ";base64," + imageData
}
755+
723756
// convertMessages converts Ollama messages to OpenAI format
724757
func convertMessages(messages []Message) []map[string]interface{} {
725758
result := make([]map[string]interface{}, len(messages))
726759
for i, msg := range messages {
727760
openAIMsg := map[string]interface{}{
728-
"role": msg.Role,
729-
"content": msg.Content,
761+
"role": msg.Role,
762+
}
763+
764+
// Handle multimodal content (text + images)
765+
if len(msg.Images) > 0 {
766+
// Convert to OpenAI multimodal format: content is an array of content objects
767+
contentArraySize := len(msg.Images)
768+
if msg.Content != "" {
769+
contentArraySize++
770+
}
771+
contentArray := make([]map[string]interface{}, 0, contentArraySize)
772+
773+
// Add text content if present
774+
if msg.Content != "" {
775+
contentArray = append(contentArray, map[string]interface{}{
776+
"type": "text",
777+
"text": msg.Content,
778+
})
779+
}
780+
781+
// Add images in OpenAI format
782+
for _, imageData := range msg.Images {
783+
// Ensure image data has proper data URI prefix
784+
// OpenWebUI may send raw base64 without the prefix, but llama.cpp requires it
785+
imageURL := ensureDataURIPrefix(imageData)
786+
787+
contentArray = append(contentArray, map[string]interface{}{
788+
"type": "image_url",
789+
"image_url": map[string]interface{}{
790+
"url": imageURL,
791+
},
792+
})
793+
}
794+
795+
openAIMsg["content"] = contentArray
796+
} else {
797+
// Regular text-only message
798+
openAIMsg["content"] = msg.Content
730799
}
731800

732801
// Add tool calls if present (for assistant messages)
@@ -753,11 +822,6 @@ func convertMessages(messages []Message) []map[string]interface{} {
753822
openAIMsg["tool_call_id"] = msg.ToolCallID
754823
}
755824

756-
// Add images if present (for multimodal support)
757-
if len(msg.Images) > 0 {
758-
openAIMsg["images"] = msg.Images
759-
}
760-
761825
result[i] = openAIMsg
762826
}
763827
return result

pkg/ollama/http_handler_test.go

Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
package ollama
2+
3+
import (
4+
"encoding/json"
5+
"testing"
6+
)
7+
8+
func TestConvertMessages_Multimodal(t *testing.T) {
9+
tests := []struct {
10+
name string
11+
messages []Message
12+
expected string
13+
}{
14+
{
15+
name: "text only message",
16+
messages: []Message{
17+
{
18+
Role: "user",
19+
Content: "Hello, world!",
20+
},
21+
},
22+
expected: `[{"content":"Hello, world!","role":"user"}]`,
23+
},
24+
{
25+
name: "multimodal message with text and image",
26+
messages: []Message{
27+
{
28+
Role: "user",
29+
Content: "is there a person in the image? Answer yes or no",
30+
Images: []string{"...."},
31+
},
32+
},
33+
expected: `[{"content":[{"text":"is there a person in the image? Answer yes or no","type":"text"},{"image_url":{"url":"...."},"type":"image_url"}],"role":"user"}]`,
34+
},
35+
{
36+
name: "multimodal message with only image (no text)",
37+
messages: []Message{
38+
{
39+
Role: "user",
40+
Content: "",
41+
Images: []string{"...."},
42+
},
43+
},
44+
expected: `[{"content":[{"image_url":{"url":"...."},"type":"image_url"}],"role":"user"}]`,
45+
},
46+
{
47+
name: "multimodal message with multiple images",
48+
messages: []Message{
49+
{
50+
Role: "user",
51+
Content: "Compare these images",
52+
Images: []string{
53+
"...",
54+
"...",
55+
},
56+
},
57+
},
58+
expected: `[{"content":[{"text":"Compare these images","type":"text"},{"image_url":{"url":"..."},"type":"image_url"},{"image_url":{"url":"..."},"type":"image_url"}],"role":"user"}]`,
59+
},
60+
{
61+
name: "multimodal message with raw base64 from OpenWebUI (no prefix)",
62+
messages: []Message{
63+
{
64+
Role: "user",
65+
Content: "is there a person in the image? Answer yes or no",
66+
Images: []string{"/9j/4AAQSkZJRgABAQEBLA...."},
67+
},
68+
},
69+
// Should auto-add the data URI prefix
70+
expected: `[{"content":[{"text":"is there a person in the image? Answer yes or no","type":"text"},{"image_url":{"url":"...."},"type":"image_url"}],"role":"user"}]`,
71+
},
72+
{
73+
name: "assistant message with tool calls",
74+
messages: []Message{
75+
{
76+
Role: "assistant",
77+
Content: "Let me call a function",
78+
ToolCalls: []ToolCall{
79+
{
80+
ID: "call_123",
81+
Type: "function",
82+
Function: FunctionCall{
83+
Name: "get_weather",
84+
Arguments: map[string]interface{}{"location": "San Francisco"},
85+
},
86+
},
87+
},
88+
},
89+
},
90+
// The tool_calls will have arguments converted to JSON string
91+
// Note: JSON field order follows struct definition
92+
expected: `[{"content":"Let me call a function","role":"assistant","tool_calls":[{"id":"call_123","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"San Francisco\"}"}}]}]`,
93+
},
94+
{
95+
name: "tool result message with tool_call_id",
96+
messages: []Message{
97+
{
98+
Role: "tool",
99+
Content: "The weather in San Francisco is sunny, 72°F",
100+
ToolCallID: "call_123",
101+
},
102+
},
103+
expected: `[{"content":"The weather in San Francisco is sunny, 72°F","role":"tool","tool_call_id":"call_123"}]`,
104+
},
105+
{
106+
name: "multiple raw base64 images without prefix",
107+
messages: []Message{
108+
{
109+
Role: "user",
110+
Content: "Compare these two images",
111+
Images: []string{
112+
"/9j/4AAQSkZJRgABAQEBLA...",
113+
"iVBORw0KGgoAAAANSUhEUgAAA...",
114+
},
115+
},
116+
},
117+
// Should auto-detect MIME types and add appropriate prefixes
118+
expected: `[{"content":[{"text":"Compare these two images","type":"text"},{"image_url":{"url":"..."},"type":"image_url"},{"image_url":{"url":"..."},"type":"image_url"}],"role":"user"}]`,
119+
},
120+
}
121+
122+
for _, tt := range tests {
123+
t.Run(tt.name, func(t *testing.T) {
124+
result := convertMessages(tt.messages)
125+
126+
// Marshal to JSON for comparison
127+
resultJSON, err := json.Marshal(result)
128+
if err != nil {
129+
t.Fatalf("Failed to marshal result: %v", err)
130+
}
131+
132+
// Compare JSON strings
133+
if string(resultJSON) != tt.expected {
134+
t.Errorf("convertMessages() mismatch\nGot: %s\nExpected: %s", string(resultJSON), tt.expected)
135+
}
136+
})
137+
}
138+
}
139+
140+
func TestEnsureDataURIPrefix(t *testing.T) {
141+
tests := []struct {
142+
name string
143+
input string
144+
expected string
145+
}{
146+
{
147+
name: "raw JPEG base64 without prefix",
148+
input: "/9j/4AAQSkZJRgABAQEBLA...",
149+
expected: "...",
150+
},
151+
{
152+
name: "raw PNG base64 without prefix",
153+
input: "iVBORw0KGgoAAAANSUhEUgAAA...",
154+
expected: "...",
155+
},
156+
{
157+
name: "raw GIF base64 without prefix",
158+
input: "R0lGODlhAQABAIAAAAAAAP...",
159+
expected: "...",
160+
},
161+
{
162+
name: "already has data URI prefix",
163+
input: "...",
164+
expected: "...",
165+
},
166+
{
167+
name: "already has data URI with png",
168+
input: "...",
169+
expected: "...",
170+
},
171+
{
172+
name: "http URL",
173+
input: "http://example.com/image.jpg",
174+
expected: "http://example.com/image.jpg",
175+
},
176+
{
177+
name: "https URL",
178+
input: "https://example.com/image.jpg",
179+
expected: "https://example.com/image.jpg",
180+
},
181+
{
182+
name: "empty string",
183+
input: "",
184+
expected: "data:image/jpeg;base64,",
185+
},
186+
{
187+
name: "whitespace with base64",
188+
input: " /9j/4AAQSkZJRgABAQEBLA... ",
189+
expected: "...",
190+
},
191+
}
192+
193+
for _, tt := range tests {
194+
t.Run(tt.name, func(t *testing.T) {
195+
result := ensureDataURIPrefix(tt.input)
196+
if result != tt.expected {
197+
t.Errorf("ensureDataURIPrefix() = %v, want %v", result, tt.expected)
198+
}
199+
})
200+
}
201+
}
202+
203+
func TestConvertMessages_PreservesOrder(t *testing.T) {
204+
messages := []Message{
205+
{Role: "system", Content: "You are a helpful assistant"},
206+
{Role: "user", Content: "Hello"},
207+
{Role: "assistant", Content: "Hi there!"},
208+
{Role: "user", Content: "What's in this image?", Images: []string{""}},
209+
}
210+
211+
result := convertMessages(messages)
212+
213+
if len(result) != 4 {
214+
t.Errorf("Expected 4 messages, got %d", len(result))
215+
}
216+
217+
// Check roles are preserved in order
218+
expectedRoles := []string{"system", "user", "assistant", "user"}
219+
for i, msg := range result {
220+
if msg["role"] != expectedRoles[i] {
221+
t.Errorf("Message %d: expected role %s, got %s", i, expectedRoles[i], msg["role"])
222+
}
223+
}
224+
225+
// Check last message has multimodal content
226+
lastMsg := result[3]
227+
content, ok := lastMsg["content"].([]map[string]interface{})
228+
if !ok {
229+
t.Errorf("Last message content should be an array, got %T", lastMsg["content"])
230+
}
231+
if len(content) != 2 {
232+
t.Errorf("Last message should have 2 content parts (text + image), got %d", len(content))
233+
}
234+
}

0 commit comments

Comments
 (0)