Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pkg/ollama/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ type Message struct {
Images []string `json:"images,omitempty"` // For multimodal support
ToolCalls []ToolCall `json:"tool_calls,omitempty"` // For function calling
ToolCallID string `json:"tool_call_id,omitempty"` // For tool results
Thinking string `json:"thinking,omitempty"` // Internal field for model's thinking output
}

// ToolCall represents a function call made by the model
Expand Down
89 changes: 81 additions & 8 deletions pkg/ollama/http_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -720,13 +720,82 @@ func (h *HTTPHandler) mapOllamaOptionsToOpenAI(ollamaOpts map[string]interface{}
// as it requires a special ConfigureRunner call
}

// ensureDataURIPrefix ensures that image data has a proper data URI prefix.
// OpenWebUI may send raw base64 data without prefix, but llama.cpp requires it.
//
// The input is first trimmed of surrounding whitespace. After that:
//   - values that already start with "data:", "http://", or "https://" are
//     returned as-is (trimmed);
//   - anything else is treated as raw base64 and is returned as
//     "data:<mime>;base64,<input>", where <mime> is detected from the leading
//     base64 characters (JPEG, PNG, GIF, WebP) and falls back to image/jpeg
//     when the format is not recognized.
//
// An empty input therefore yields "data:image/jpeg;base64,".
func ensureDataURIPrefix(imageData string) string {
	// Trim whitespace that might come from UIs.
	imageData = strings.TrimSpace(imageData)

	// Already a URI: nothing to add.
	if strings.HasPrefix(imageData, "data:") ||
		strings.HasPrefix(imageData, "http://") ||
		strings.HasPrefix(imageData, "https://") {
		return imageData
	}

	// Detect the MIME type from the base64 encoding of the file's magic bytes.
	// Unknown formats keep the jpeg default.
	mimeType := "image/jpeg"
	switch {
	case strings.HasPrefix(imageData, "/9j/"): // FF D8 FF
		mimeType = "image/jpeg"
	case strings.HasPrefix(imageData, "iVBOR"): // 89 'P' 'N' 'G'
		mimeType = "image/png"
	case strings.HasPrefix(imageData, "R0lG"): // 'G' 'I' 'F'
		mimeType = "image/gif"
	case len(imageData) >= 16 &&
		strings.HasPrefix(imageData, "UklGR") &&
		imageData[11:16] == "XRUJQ":
		// "RIFF" <4-byte size> "WEBP": the fourcc at bytes 8-11 always encodes
		// to "X" + "RUJQ" at base64 offsets 11-15, which separates WebP from
		// other RIFF containers such as WAV or AVI.
		mimeType = "image/webp"
	}

	// Assume raw base64 data - add data URI prefix with detected MIME type.
	return "data:" + mimeType + ";base64," + imageData
}

// convertMessages converts Ollama messages to OpenAI format
func convertMessages(messages []Message) []map[string]interface{} {
result := make([]map[string]interface{}, len(messages))
for i, msg := range messages {
openAIMsg := map[string]interface{}{
"role": msg.Role,
"content": msg.Content,
"role": msg.Role,
}

// Handle multimodal content (text + images)
if len(msg.Images) > 0 {
// Convert to OpenAI multimodal format: content is an array of content objects
contentArraySize := len(msg.Images)
if msg.Content != "" {
contentArraySize++
}
contentArray := make([]map[string]interface{}, 0, contentArraySize)

// Add text content if present
if msg.Content != "" {
contentArray = append(contentArray, map[string]interface{}{
"type": "text",
"text": msg.Content,
})
}

// Add images in OpenAI format
for _, imageData := range msg.Images {
// Ensure image data has proper data URI prefix
// OpenWebUI may send raw base64 without the prefix, but llama.cpp requires it
imageURL := ensureDataURIPrefix(imageData)

contentArray = append(contentArray, map[string]interface{}{
"type": "image_url",
"image_url": map[string]interface{}{
"url": imageURL,
},
})
}

openAIMsg["content"] = contentArray
} else {
// Regular text-only message
openAIMsg["content"] = msg.Content
}

// Add tool calls if present (for assistant messages)
Expand All @@ -753,11 +822,6 @@ func convertMessages(messages []Message) []map[string]interface{} {
openAIMsg["tool_call_id"] = msg.ToolCallID
}

// Add images if present (for multimodal support)
if len(msg.Images) > 0 {
openAIMsg["images"] = msg.Images
}

result[i] = openAIMsg
}
return result
Expand Down Expand Up @@ -992,11 +1056,13 @@ func (s *streamingChatResponseWriter) Write(data []byte) (int, error) {
continue
}

// Extract content and tool calls from structured response
// Extract content, tool calls, and thinking from structured response
var content string
var thinking string
var toolCalls []ToolCall
if len(chunk.Choices) > 0 {
content = chunk.Choices[0].Delta.Content
thinking = chunk.Choices[0].Delta.ReasoningContent
if len(chunk.Choices[0].Delta.ToolCalls) > 0 {
// Convert tool calls to Ollama format
toolCalls = convertToolCallsToOllamaFormat(chunk.Choices[0].Delta.ToolCalls)
Expand All @@ -1011,6 +1077,9 @@ func (s *streamingChatResponseWriter) Write(data []byte) (int, error) {
if len(toolCalls) > 0 {
message.ToolCalls = toolCalls
}
if thinking != "" {
message.Thinking = thinking
}

ollamaChunk := ChatResponse{
Model: s.modelName,
Expand Down Expand Up @@ -1178,6 +1247,10 @@ func (h *HTTPHandler) convertChatResponse(w http.ResponseWriter, respRecorder *r
if len(openAIResp.Choices[0].Message.ToolCalls) > 0 {
message.ToolCalls = convertToolCallsToOllamaFormat(openAIResp.Choices[0].Message.ToolCalls)
}
// Include thinking content if present
if openAIResp.Choices[0].Message.ReasoningContent != "" {
message.Thinking = openAIResp.Choices[0].Message.ReasoningContent
}
}

// Build Ollama response
Expand Down
234 changes: 234 additions & 0 deletions pkg/ollama/http_handler_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
package ollama

import (
"encoding/json"
"testing"
)

// TestConvertMessages_Multimodal checks the OpenAI-format JSON produced by
// convertMessages for text-only, multimodal, tool-call and tool-result
// messages.
//
// Results are compared as marshalled JSON strings. encoding/json emits map
// keys in sorted order, so the map-based parts of each expectation are
// deterministic; struct-based parts follow field declaration order.
func TestConvertMessages_Multimodal(t *testing.T) {
	tests := []struct {
		name     string
		messages []Message
		expected string
	}{
		{
			name: "text only message",
			messages: []Message{
				{
					Role:    "user",
					Content: "Hello, world!",
				},
			},
			expected: `[{"content":"Hello, world!","role":"user"}]`,
		},
		{
			name: "multimodal message with text and image",
			messages: []Message{
				{
					Role:    "user",
					Content: "is there a person in the image? Answer yes or no",
					Images:  []string{"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...."},
				},
			},
			expected: `[{"content":[{"text":"is there a person in the image? Answer yes or no","type":"text"},{"image_url":{"url":"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...."},"type":"image_url"}],"role":"user"}]`,
		},
		{
			name: "multimodal message with only image (no text)",
			messages: []Message{
				{
					Role:    "user",
					Content: "",
					Images:  []string{"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...."},
				},
			},
			expected: `[{"content":[{"image_url":{"url":"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...."},"type":"image_url"}],"role":"user"}]`,
		},
		{
			name: "multimodal message with multiple images",
			messages: []Message{
				{
					Role:    "user",
					Content: "Compare these images",
					Images: []string{
						"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...",
						"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA...",
					},
				},
			},
			expected: `[{"content":[{"text":"Compare these images","type":"text"},{"image_url":{"url":"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA..."},"type":"image_url"},{"image_url":{"url":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA..."},"type":"image_url"}],"role":"user"}]`,
		},
		{
			name: "multimodal message with raw base64 from OpenWebUI (no prefix)",
			messages: []Message{
				{
					Role:    "user",
					Content: "is there a person in the image? Answer yes or no",
					Images:  []string{"/9j/4AAQSkZJRgABAQEBLA...."},
				},
			},
			// Should auto-add the data URI prefix
			expected: `[{"content":[{"text":"is there a person in the image? Answer yes or no","type":"text"},{"image_url":{"url":"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...."},"type":"image_url"}],"role":"user"}]`,
		},
		{
			name: "assistant message with tool calls",
			messages: []Message{
				{
					Role:    "assistant",
					Content: "Let me call a function",
					ToolCalls: []ToolCall{
						{
							ID:   "call_123",
							Type: "function",
							Function: FunctionCall{
								Name:      "get_weather",
								Arguments: map[string]interface{}{"location": "San Francisco"},
							},
						},
					},
				},
			},
			// The tool_calls will have arguments converted to JSON string
			// Note: JSON field order follows struct definition
			expected: `[{"content":"Let me call a function","role":"assistant","tool_calls":[{"id":"call_123","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"San Francisco\"}"}}]}]`,
		},
		{
			name: "tool result message with tool_call_id",
			messages: []Message{
				{
					Role:       "tool",
					Content:    "The weather in San Francisco is sunny, 72°F",
					ToolCallID: "call_123",
				},
			},
			expected: `[{"content":"The weather in San Francisco is sunny, 72°F","role":"tool","tool_call_id":"call_123"}]`,
		},
		{
			name: "multiple raw base64 images without prefix",
			messages: []Message{
				{
					Role:    "user",
					Content: "Compare these two images",
					Images: []string{
						"/9j/4AAQSkZJRgABAQEBLA...",
						"iVBORw0KGgoAAAANSUhEUgAAA...",
					},
				},
			},
			// Should auto-detect MIME types and add appropriate prefixes
			expected: `[{"content":[{"text":"Compare these two images","type":"text"},{"image_url":{"url":"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA..."},"type":"image_url"},{"image_url":{"url":"data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA..."},"type":"image_url"}],"role":"user"}]`,
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := convertMessages(tt.messages)

			// Marshal to JSON for comparison
			resultJSON, err := json.Marshal(result)
			if err != nil {
				t.Fatalf("Failed to marshal result: %v", err)
			}

			// Compare JSON strings
			if got := string(resultJSON); got != tt.expected {
				t.Errorf("convertMessages() mismatch\nGot: %s\nExpected: %s", got, tt.expected)
			}
		})
	}
}

// TestEnsureDataURIPrefix verifies that ensureDataURIPrefix leaves existing
// data/http/https URIs untouched (apart from trimming whitespace) and wraps
// raw base64 payloads in a data URI whose MIME type matches the detected
// image format, defaulting to image/jpeg.
func TestEnsureDataURIPrefix(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected string
	}{
		{
			name:     "raw JPEG base64 without prefix",
			input:    "/9j/4AAQSkZJRgABAQEBLA...",
			expected: "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...",
		},
		{
			name:     "raw PNG base64 without prefix",
			input:    "iVBORw0KGgoAAAANSUhEUgAAA...",
			expected: "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA...",
		},
		{
			name:     "raw GIF base64 without prefix",
			input:    "R0lGODlhAQABAIAAAAAAAP...",
			expected: "data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP...",
		},
		{
			name:     "unknown format defaults to jpeg",
			input:    "AAAA",
			expected: "data:image/jpeg;base64,AAAA",
		},
		{
			name:     "already has data URI prefix",
			input:    "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...",
			expected: "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...",
		},
		{
			name:     "already has data URI with png",
			input:    "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA...",
			expected: "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAA...",
		},
		{
			name:     "http URL",
			input:    "http://example.com/image.jpg",
			expected: "http://example.com/image.jpg",
		},
		{
			name:     "https URL",
			input:    "https://example.com/image.jpg",
			expected: "https://example.com/image.jpg",
		},
		{
			name:     "empty string",
			input:    "",
			expected: "data:image/jpeg;base64,",
		},
		{
			name:     "whitespace with base64",
			input:    " /9j/4AAQSkZJRgABAQEBLA... ",
			expected: "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := ensureDataURIPrefix(tt.input)
			if result != tt.expected {
				// %q makes empty strings and stray whitespace visible.
				t.Errorf("ensureDataURIPrefix(%q) = %q, want %q", tt.input, result, tt.expected)
			}
		})
	}
}

// TestConvertMessages_PreservesOrder verifies that convertMessages returns one
// entry per input message, in input order, and that a message carrying an
// image is converted to array-style (multimodal) content.
func TestConvertMessages_PreservesOrder(t *testing.T) {
	messages := []Message{
		{Role: "system", Content: "You are a helpful assistant"},
		{Role: "user", Content: "Hello"},
		{Role: "assistant", Content: "Hi there!"},
		{Role: "user", Content: "What's in this image?", Images: []string{"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA..."}},
	}

	// Check roles are preserved in order
	expectedRoles := []string{"system", "user", "assistant", "user"}

	result := convertMessages(messages)

	// Fatal rather than Error: the index expressions below would panic on a
	// result of unexpected length.
	if len(result) != len(expectedRoles) {
		t.Fatalf("Expected %d messages, got %d", len(expectedRoles), len(result))
	}

	for i, msg := range result {
		if msg["role"] != expectedRoles[i] {
			t.Errorf("Message %d: expected role %s, got %v", i, expectedRoles[i], msg["role"])
		}
	}

	// Check last message has multimodal content
	lastMsg := result[len(result)-1]
	content, ok := lastMsg["content"].([]map[string]interface{})
	if !ok {
		// Without the array there is nothing further to inspect.
		t.Fatalf("Last message content should be an array, got %T", lastMsg["content"])
	}
	if len(content) != 2 {
		t.Errorf("Last message should have 2 content parts (text + image), got %d", len(content))
	}
}