// ensureDataURIPrefix ensures that image data is expressed as a URI.
// OpenWebUI may send raw base64 data without a prefix, but llama.cpp requires one.
//
// The input is trimmed of surrounding whitespace, then:
//   - if it already starts with "data:", "http://" or "https://" (matched
//     case-insensitively, since URI schemes are case-insensitive) it is
//     returned unchanged;
//   - otherwise it is treated as raw base64 and returned as
//     "data:<mime>;base64,<input>", where <mime> is sniffed from the leading
//     base64 characters (JPEG, PNG, GIF, WebP) and defaults to image/jpeg.
//
// An empty input therefore yields "data:image/jpeg;base64,".
func ensureDataURIPrefix(imageData string) string {
	// Trim whitespace that might come from UIs.
	imageData = strings.TrimSpace(imageData)

	// Pass through anything that already carries a supported URI scheme.
	for _, scheme := range []string{"data:", "http://", "https://"} {
		if len(imageData) >= len(scheme) && strings.EqualFold(imageData[:len(scheme)], scheme) {
			return imageData
		}
	}

	// Sniff the MIME type from the base64 encoding of each format's magic bytes.
	// Unknown formats keep the jpeg default.
	mimeType := "image/jpeg"
	switch {
	case strings.HasPrefix(imageData, "/9j/"): // FF D8 FF
		mimeType = "image/jpeg"
	case strings.HasPrefix(imageData, "iVBOR"): // 89 'P' 'N' 'G'
		mimeType = "image/png"
	case strings.HasPrefix(imageData, "R0lG"): // "GIF"
		mimeType = "image/gif"
	case strings.HasPrefix(imageData, "UklGR"): // "RIFF" container; assumed WebP in an image field
		mimeType = "image/webp"
	}

	// Assume raw base64 data and add the data URI prefix with the detected MIME type.
	return "data:" + mimeType + ";base64," + imageData
}
contentArraySize := len(msg.Images) + if msg.Content != "" { + contentArraySize++ + } + contentArray := make([]map[string]interface{}, 0, contentArraySize) + + // Add text content if present + if msg.Content != "" { + contentArray = append(contentArray, map[string]interface{}{ + "type": "text", + "text": msg.Content, + }) + } + + // Add images in OpenAI format + for _, imageData := range msg.Images { + // Ensure image data has proper data URI prefix + // OpenWebUI may send raw base64 without the prefix, but llama.cpp requires it + imageURL := ensureDataURIPrefix(imageData) + + contentArray = append(contentArray, map[string]interface{}{ + "type": "image_url", + "image_url": map[string]interface{}{ + "url": imageURL, + }, + }) + } + + openAIMsg["content"] = contentArray + } else { + // Regular text-only message + openAIMsg["content"] = msg.Content } // Add tool calls if present (for assistant messages) @@ -753,11 +822,6 @@ func convertMessages(messages []Message) []map[string]interface{} { openAIMsg["tool_call_id"] = msg.ToolCallID } - // Add images if present (for multimodal support) - if len(msg.Images) > 0 { - openAIMsg["images"] = msg.Images - } - result[i] = openAIMsg } return result diff --git a/pkg/ollama/http_handler_test.go b/pkg/ollama/http_handler_test.go new file mode 100644 index 00000000..7b47790f --- /dev/null +++ b/pkg/ollama/http_handler_test.go @@ -0,0 +1,234 @@ +package ollama + +import ( + "encoding/json" + "testing" +) + +func TestConvertMessages_Multimodal(t *testing.T) { + tests := []struct { + name string + messages []Message + expected string + }{ + { + name: "text only message", + messages: []Message{ + { + Role: "user", + Content: "Hello, world!", + }, + }, + expected: `[{"content":"Hello, world!","role":"user"}]`, + }, + { + name: "multimodal message with text and image", + messages: []Message{ + { + Role: "user", + Content: "is there a person in the image? 
Answer yes or no", + Images: []string{"...."}, + }, + }, + expected: `[{"content":[{"text":"is there a person in the image? Answer yes or no","type":"text"},{"image_url":{"url":"...."},"type":"image_url"}],"role":"user"}]`, + }, + { + name: "multimodal message with only image (no text)", + messages: []Message{ + { + Role: "user", + Content: "", + Images: []string{"...."}, + }, + }, + expected: `[{"content":[{"image_url":{"url":"...."},"type":"image_url"}],"role":"user"}]`, + }, + { + name: "multimodal message with multiple images", + messages: []Message{ + { + Role: "user", + Content: "Compare these images", + Images: []string{ + "...", + "...", + }, + }, + }, + expected: `[{"content":[{"text":"Compare these images","type":"text"},{"image_url":{"url":"..."},"type":"image_url"},{"image_url":{"url":"..."},"type":"image_url"}],"role":"user"}]`, + }, + { + name: "multimodal message with raw base64 from OpenWebUI (no prefix)", + messages: []Message{ + { + Role: "user", + Content: "is there a person in the image? Answer yes or no", + Images: []string{"/9j/4AAQSkZJRgABAQEBLA...."}, + }, + }, + // Should auto-add the data URI prefix + expected: `[{"content":[{"text":"is there a person in the image? 
Answer yes or no","type":"text"},{"image_url":{"url":"...."},"type":"image_url"}],"role":"user"}]`, + }, + { + name: "assistant message with tool calls", + messages: []Message{ + { + Role: "assistant", + Content: "Let me call a function", + ToolCalls: []ToolCall{ + { + ID: "call_123", + Type: "function", + Function: FunctionCall{ + Name: "get_weather", + Arguments: map[string]interface{}{"location": "San Francisco"}, + }, + }, + }, + }, + }, + // The tool_calls will have arguments converted to JSON string + // Note: JSON field order follows struct definition + expected: `[{"content":"Let me call a function","role":"assistant","tool_calls":[{"id":"call_123","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"San Francisco\"}"}}]}]`, + }, + { + name: "tool result message with tool_call_id", + messages: []Message{ + { + Role: "tool", + Content: "The weather in San Francisco is sunny, 72°F", + ToolCallID: "call_123", + }, + }, + expected: `[{"content":"The weather in San Francisco is sunny, 72°F","role":"tool","tool_call_id":"call_123"}]`, + }, + { + name: "multiple raw base64 images without prefix", + messages: []Message{ + { + Role: "user", + Content: "Compare these two images", + Images: []string{ + "/9j/4AAQSkZJRgABAQEBLA...", + "iVBORw0KGgoAAAANSUhEUgAAA...", + }, + }, + }, + // Should auto-detect MIME types and add appropriate prefixes + expected: `[{"content":[{"text":"Compare these two images","type":"text"},{"image_url":{"url":"..."},"type":"image_url"},{"image_url":{"url":"..."},"type":"image_url"}],"role":"user"}]`, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := convertMessages(tt.messages) + + // Marshal to JSON for comparison + resultJSON, err := json.Marshal(result) + if err != nil { + t.Fatalf("Failed to marshal result: %v", err) + } + + // Compare JSON strings + if string(resultJSON) != tt.expected { + t.Errorf("convertMessages() mismatch\nGot: %s\nExpected: %s", string(resultJSON), 
tt.expected) + } + }) + } +} + +func TestEnsureDataURIPrefix(t *testing.T) { + tests := []struct { + name string + input string + expected string + }{ + { + name: "raw JPEG base64 without prefix", + input: "/9j/4AAQSkZJRgABAQEBLA...", + expected: "...", + }, + { + name: "raw PNG base64 without prefix", + input: "iVBORw0KGgoAAAANSUhEUgAAA...", + expected: "...", + }, + { + name: "raw GIF base64 without prefix", + input: "R0lGODlhAQABAIAAAAAAAP...", + expected: "...", + }, + { + name: "already has data URI prefix", + input: "...", + expected: "...", + }, + { + name: "already has data URI with png", + input: "...", + expected: "...", + }, + { + name: "http URL", + input: "http://example.com/image.jpg", + expected: "http://example.com/image.jpg", + }, + { + name: "https URL", + input: "https://example.com/image.jpg", + expected: "https://example.com/image.jpg", + }, + { + name: "empty string", + input: "", + expected: "data:image/jpeg;base64,", + }, + { + name: "whitespace with base64", + input: " /9j/4AAQSkZJRgABAQEBLA... 
", + expected: "...", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := ensureDataURIPrefix(tt.input) + if result != tt.expected { + t.Errorf("ensureDataURIPrefix() = %v, want %v", result, tt.expected) + } + }) + } +} + +func TestConvertMessages_PreservesOrder(t *testing.T) { + messages := []Message{ + {Role: "system", Content: "You are a helpful assistant"}, + {Role: "user", Content: "Hello"}, + {Role: "assistant", Content: "Hi there!"}, + {Role: "user", Content: "What's in this image?", Images: []string{""}}, + } + + result := convertMessages(messages) + + if len(result) != 4 { + t.Errorf("Expected 4 messages, got %d", len(result)) + } + + // Check roles are preserved in order + expectedRoles := []string{"system", "user", "assistant", "user"} + for i, msg := range result { + if msg["role"] != expectedRoles[i] { + t.Errorf("Message %d: expected role %s, got %s", i, expectedRoles[i], msg["role"]) + } + } + + // Check last message has multimodal content + lastMsg := result[3] + content, ok := lastMsg["content"].([]map[string]interface{}) + if !ok { + t.Errorf("Last message content should be an array, got %T", lastMsg["content"]) + } + if len(content) != 2 { + t.Errorf("Last message should have 2 content parts (text + image), got %d", len(content)) + } +}