Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 51 additions & 7 deletions pkg/ollama/http_handler.go
Original file line number Diff line number Diff line change
Expand Up @@ -720,13 +720,62 @@ func (h *HTTPHandler) mapOllamaOptionsToOpenAI(ollamaOpts map[string]interface{}
// as it requires a special ConfigureRunner call
}

// ensureDataURIPrefix ensures that image data has a proper data URI prefix.
// OpenWebUI may send raw base64 data without prefix, but llama.cpp requires it.
// This function:
// - Returns data as-is if it already starts with "data:", "http://", or "https://"
// - Prepends "data:image/jpeg;base64," to raw base64 strings
func ensureDataURIPrefix(imageData string) string {
// Check if already has a URI scheme
if strings.HasPrefix(imageData, "data:") ||
strings.HasPrefix(imageData, "http://") ||
strings.HasPrefix(imageData, "https://") {
return imageData
}

// Assume raw base64 data - add data URI prefix
return "data:image/jpeg;base64," + imageData
}

// convertMessages converts Ollama messages to OpenAI format
func convertMessages(messages []Message) []map[string]interface{} {
result := make([]map[string]interface{}, len(messages))
for i, msg := range messages {
openAIMsg := map[string]interface{}{
"role": msg.Role,
"content": msg.Content,
"role": msg.Role,
}

// Handle multimodal content (text + images)
if len(msg.Images) > 0 {
// Convert to OpenAI multimodal format: content is an array of content objects
var contentArray []map[string]interface{}

// Add text content if present
if msg.Content != "" {
contentArray = append(contentArray, map[string]interface{}{
"type": "text",
"text": msg.Content,
})
}

// Add images in OpenAI format
for _, imageData := range msg.Images {
// Ensure image data has proper data URI prefix
// OpenWebUI may send raw base64 without the prefix, but llama.cpp requires it
imageURL := ensureDataURIPrefix(imageData)

contentArray = append(contentArray, map[string]interface{}{
"type": "image_url",
"image_url": map[string]interface{}{
"url": imageURL,
},
})
}

openAIMsg["content"] = contentArray
} else {
// Regular text-only message
openAIMsg["content"] = msg.Content
}

// Add tool calls if present (for assistant messages)
Expand All @@ -753,11 +802,6 @@ func convertMessages(messages []Message) []map[string]interface{} {
openAIMsg["tool_call_id"] = msg.ToolCallID
}

// Add images if present (for multimodal support)
if len(msg.Images) > 0 {
openAIMsg["images"] = msg.Images
}

result[i] = openAIMsg
}
return result
Expand Down
188 changes: 188 additions & 0 deletions pkg/ollama/http_handler_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
package ollama

import (
"encoding/json"
"testing"
)

func TestConvertMessages_Multimodal(t *testing.T) {
tests := []struct {
name string
messages []Message
expected string
}{
{
name: "text only message",
messages: []Message{
{
Role: "user",
Content: "Hello, world!",
},
},
expected: `[{"content":"Hello, world!","role":"user"}]`,
},
{
name: "multimodal message with text and image",
messages: []Message{
{
Role: "user",
Content: "is there a person in the image? Answer yes or no",
Images: []string{"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...."},
},
},
expected: `[{"content":[{"text":"is there a person in the image? Answer yes or no","type":"text"},{"image_url":{"url":"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...."},"type":"image_url"}],"role":"user"}]`,
},
{
name: "multimodal message with only image (no text)",
messages: []Message{
{
Role: "user",
Content: "",
Images: []string{"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...."},
},
},
expected: `[{"content":[{"image_url":{"url":"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...."},"type":"image_url"}],"role":"user"}]`,
},
{
name: "multimodal message with multiple images",
messages: []Message{
{
Role: "user",
Content: "Compare these images",
Images: []string{
"data:image/jpeg;base64,image1...",
"data:image/jpeg;base64,image2...",
},
},
},
expected: `[{"content":[{"text":"Compare these images","type":"text"},{"image_url":{"url":"data:image/jpeg;base64,image1..."},"type":"image_url"},{"image_url":{"url":"data:image/jpeg;base64,image2..."},"type":"image_url"}],"role":"user"}]`,
},
{
name: "multimodal message with raw base64 from OpenWebUI (no prefix)",
messages: []Message{
{
Role: "user",
Content: "is there a person in the image? Answer yes or no",
Images: []string{"/9j/4AAQSkZJRgABAQEBLA...."},
},
},
// Should auto-add the data URI prefix
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (testing): Consider adding a case for multiple raw base64 images without prefix

You already cover a single raw base64 image (no prefix) being normalized via ensureDataURIPrefix in convertMessages. Since this also runs for multiple images, please add a test where Images contains at least two raw base64 strings without prefixes, and assert that each resulting image_url.url is correctly prefixed. This helps catch any off‑by‑one or loop issues in the multimodal path.

expected: `[{"content":[{"text":"is there a person in the image? Answer yes or no","type":"text"},{"image_url":{"url":"data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...."},"type":"image_url"}],"role":"user"}]`,
},
{
name: "assistant message with tool calls",
messages: []Message{
{
Role: "assistant",
Content: "Let me call a function",
ToolCalls: []ToolCall{
{
ID: "call_123",
Type: "function",
Function: FunctionCall{
Name: "get_weather",
Arguments: map[string]interface{}{"location": "San Francisco"},
},
},
},
},
},
// The tool_calls will have arguments converted to JSON string
// Note: JSON field order follows struct definition
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (testing): Add coverage for messages with tool_call_id (tool result messages)

Since convertMessages handles ToolCalls and ToolCallID separately, please add a test case for a tool result message (e.g., Role = "tool" with ToolCallID set) and assert that the output map includes the correct "tool_call_id". This will exercise the tool-result path and protect against regressions.

expected: `[{"content":"Let me call a function","role":"assistant","tool_calls":[{"id":"call_123","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"San Francisco\"}"}}]}]`,
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := convertMessages(tt.messages)

// Marshal to JSON for comparison
resultJSON, err := json.Marshal(result)
if err != nil {
t.Fatalf("Failed to marshal result: %v", err)
}

// Compare JSON strings
if string(resultJSON) != tt.expected {
Comment on lines +132 to +133
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (testing): Avoid brittle JSON string comparison in table test

Comparing raw JSON strings here is brittle for map[string]interface{} because key order isn’t guaranteed and may vary by Go version/implementation. Instead, unmarshal both resultJSON and tt.expected into a structured type (e.g. []map[string]interface{} or []any) and compare with reflect.DeepEqual or a cmp helper so the test only fails on real semantic differences, not ordering or formatting.

Suggested implementation:

			// Marshal to JSON for comparison
			resultJSON, err := json.Marshal(result)
			if err != nil {
				t.Fatalf("Failed to marshal result: %v", err)
			}

			// Unmarshal both actual and expected into structured values to avoid brittle
			// string comparison that depends on JSON key ordering or formatting.
			var resultData []map[string]interface{}
			if err := json.Unmarshal(resultJSON, &resultData); err != nil {
				t.Fatalf("Failed to unmarshal result JSON: %v", err)
			}

			var expectedData []map[string]interface{}
			if err := json.Unmarshal([]byte(tt.expected), &expectedData); err != nil {
				t.Fatalf("Failed to unmarshal expected JSON: %v", err)
			}

			// Compare structured values so the test only fails on semantic differences.
			if !reflect.DeepEqual(resultData, expectedData) {
				t.Errorf("convertMessages() mismatch\nGot:      %s\nExpected: %s", string(resultJSON), tt.expected)
			}
  1. Ensure the file imports the reflect package at the top:
    • Add reflect to the existing import block, e.g.:
      import ( ... "encoding/json" "reflect" ... )
  2. If the expected field in the table tests is not JSON representing a []map[string]interface{}, adjust the resultData/expectedData types (e.g. to []any or a concrete struct) to match the actual shape of convertMessages output and the expected JSON.

t.Errorf("convertMessages() mismatch\nGot: %s\nExpected: %s", string(resultJSON), tt.expected)
}
})
}
}

func TestEnsureDataURIPrefix(t *testing.T) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (testing): Consider adding an empty-string case for ensureDataURIPrefix

You already cover prefixed data URIs and http/https URLs. Please also add a case for an empty-string input, which currently becomes just the "data:image/jpeg;base64," prefix, so this behavior is explicitly tested and guarded against future changes.

Suggested implementation:

func TestEnsureDataURIPrefix(t *testing.T) {
	tests := []struct {
		name     string
		input    string
		expected string
	}{
		{
			name:     "jpeg data uri",
			input:    "data:image/jpeg;base64,abc123",
			expected: "data:image/jpeg;base64,abc123",
		},
		{
			name:     "http url",
			input:    "http://example.com/image.jpg",
			expected: "http://example.com/image.jpg",
		},
		{
			name:     "https url",
			input:    "https://example.com/image.jpg",
			expected: "https://example.com/image.jpg",
		},
		{
			name:     "raw base64 without prefix",
			input:    "abc123",
			expected: "data:image/jpeg;base64,abc123",
		},
		{
			name:     "empty string",
			input:    "",
			// Current behavior: empty input becomes just the data URI prefix.
			expected: "data:image/jpeg;base64,",
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := ensureDataURIPrefix(tt.input)
			if result != tt.expected {
				t.Errorf("ensureDataURIPrefix(%q) = %q, want %q", tt.input, result, tt.expected)
			}
		})
	}
}

If the concrete test cases or field names in TestEnsureDataURIPrefix differ from what I inferred, adjust the SEARCH block to match the existing tests slice and insert the empty string case in the same style:

  • Keep name, input, and expected field names consistent.
  • Ensure the expected value for the empty-string input is "data:image/jpeg;base64,", reflecting the current behavior you want to guard.

tests := []struct {
name string
input string
expected string
}{
{
name: "raw base64 without prefix",
input: "/9j/4AAQSkZJRgABAQEBLA...",
expected: "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...",
},
{
name: "already has data URI prefix",
input: "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...",
expected: "data:image/jpeg;base64,/9j/4AAQSkZJRgABAQEBLA...",
},
{
name: "already has data URI with png",
input: "data:image/png;base64,iVBORw0KGgo...",
expected: "data:image/png;base64,iVBORw0KGgo...",
},
{
name: "http URL",
input: "http://example.com/image.jpg",
expected: "http://example.com/image.jpg",
},
{
name: "https URL",
input: "https://example.com/image.jpg",
expected: "https://example.com/image.jpg",
},
}

for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := ensureDataURIPrefix(tt.input)
if result != tt.expected {
t.Errorf("ensureDataURIPrefix() = %v, want %v", result, tt.expected)
}
})
}
}

func TestConvertMessages_PreservesOrder(t *testing.T) {
messages := []Message{
{Role: "system", Content: "You are a helpful assistant"},
{Role: "user", Content: "Hello"},
{Role: "assistant", Content: "Hi there!"},
{Role: "user", Content: "What's in this image?", Images: []string{"data:image/jpeg;base64,abc123"}},
}

result := convertMessages(messages)

Comment on lines +203 to +212
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (testing): Strengthen multimodal assertions in TestConvertMessages_PreservesOrder

To improve robustness, also assert the structure of the two content parts (e.g., first has type == "text" with the expected text, second has type == "image_url" with the expected URL) so the test validates the multimodal content array contents, not just its length.

Suggested implementation:

func TestConvertMessages_PreservesOrder(t *testing.T) {
	messages := []Message{
		{Role: "system", Content: "You are a helpful assistant"},
		{Role: "user", Content: "Hello"},
		{Role: "assistant", Content: "Hi there!"},
		{Role: "user", Content: "What's in this image?", Images: []string{"data:image/jpeg;base64,abc123"}},
	}

	result := convertMessages(messages)

	if len(result) != 4 {
		t.Fatalf("expected 4 converted messages, got %d", len(result))
	}

	// Verify roles are preserved in order
	expectedRoles := []string{"system", "user", "assistant", "user"}
	for i, role := range expectedRoles {
		if result[i].Role != role {
			t.Fatalf("message %d role mismatch: got %q, want %q", i, result[i].Role, role)
		}
	}

	// Verify multimodal content structure for the last user message
	last := result[3]

	// We expect a multimodal content array with two parts: text then image_url
	if len(last.Content) != 2 {
		t.Fatalf("expected last message to have 2 content parts, got %d", len(last.Content))
	}

	// First part: text
	textPart, ok := last.Content[0].(openai.ChatCompletionMessageContentPartText)
	if !ok {
		t.Fatalf("expected first content part to be text, got %T", last.Content[0])
	}
	if textPart.Type != "text" {
		t.Fatalf("expected first content part type %q, got %q", "text", textPart.Type)
	}
	if textPart.Text != "What's in this image?" {
		t.Fatalf("unexpected text content: got %q, want %q", textPart.Text, "What's in this image?")
	}

	// Second part: image_url
	imagePart, ok := last.Content[1].(openai.ChatCompletionMessageContentPartImage)
	if !ok {
		t.Fatalf("expected second content part to be image_url, got %T", last.Content[1])
	}
	if imagePart.Type != "image_url" {
		t.Fatalf("expected second content part type %q, got %q", "image_url", imagePart.Type)
	}
	if imagePart.ImageURL.URL != "data:image/jpeg;base64,abc123" {
		t.Fatalf("unexpected image_url content URL: got %q, want %q", imagePart.ImageURL.URL, "data:image/jpeg;base64,abc123")
	}
  1. This change assumes:
    • result is a slice of a struct with a Role string field and a Content []interface{} (or similar) field.
    • Content elements are concrete types openai.ChatCompletionMessageContentPartText and openai.ChatCompletionMessageContentPartImage from the github.com/sashabaranov/go-openai package, with fields:
      • Type string
      • Text string on the text part
      • ImageURL.URL string on the image part.
  2. If your actual types differ (e.g., []openai.ChatCompletionMessagePart with a Type discriminator instead of Go interfaces), adjust the type assertions and field accesses accordingly:
    • Replace the type assertions with direct indexing (e.g. last.Content[0].Type == openai.ChatMessagePartTypeText).
    • Replace imagePart.ImageURL.URL with the correct field path used in your code.
  3. Ensure the test file imports the openai package if it is not already imported:
    • Add openai "github.com/sashabaranov/go-openai" (or the correct alias/path) to the imports of pkg/ollama/http_handler_test.go.

if len(result) != 4 {
t.Errorf("Expected 4 messages, got %d", len(result))
}

// Check roles are preserved in order
expectedRoles := []string{"system", "user", "assistant", "user"}
for i, msg := range result {
if msg["role"] != expectedRoles[i] {
t.Errorf("Message %d: expected role %s, got %s", i, expectedRoles[i], msg["role"])
}
}

// Check last message has multimodal content
lastMsg := result[3]
content, ok := lastMsg["content"].([]map[string]interface{})
if !ok {
t.Errorf("Last message content should be an array, got %T", lastMsg["content"])
}
if len(content) != 2 {
t.Errorf("Last message should have 2 content parts (text + image), got %d", len(content))
}
}