-
Notifications
You must be signed in to change notification settings - Fork 72
feat: add multimodal message support with image data URI handling #497
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,234 @@ | ||
| package ollama | ||
|
|
||
| import ( | ||
| "encoding/json" | ||
| "testing" | ||
| ) | ||
|
|
||
| func TestConvertMessages_Multimodal(t *testing.T) { | ||
| tests := []struct { | ||
| name string | ||
| messages []Message | ||
| expected string | ||
| }{ | ||
| { | ||
| name: "text only message", | ||
| messages: []Message{ | ||
| { | ||
| Role: "user", | ||
| Content: "Hello, world!", | ||
| }, | ||
| }, | ||
| expected: `[{"content":"Hello, world!","role":"user"}]`, | ||
| }, | ||
| { | ||
| name: "multimodal message with text and image", | ||
| messages: []Message{ | ||
| { | ||
| Role: "user", | ||
| Content: "is there a person in the image? Answer yes or no", | ||
| Images: []string{"...."}, | ||
| }, | ||
| }, | ||
| expected: `[{"content":[{"text":"is there a person in the image? Answer yes or no","type":"text"},{"image_url":{"url":"...."},"type":"image_url"}],"role":"user"}]`, | ||
| }, | ||
| { | ||
| name: "multimodal message with only image (no text)", | ||
| messages: []Message{ | ||
| { | ||
| Role: "user", | ||
| Content: "", | ||
| Images: []string{"...."}, | ||
| }, | ||
| }, | ||
| expected: `[{"content":[{"image_url":{"url":"...."},"type":"image_url"}],"role":"user"}]`, | ||
| }, | ||
| { | ||
| name: "multimodal message with multiple images", | ||
| messages: []Message{ | ||
| { | ||
| Role: "user", | ||
| Content: "Compare these images", | ||
| Images: []string{ | ||
| "...", | ||
| "...", | ||
| }, | ||
| }, | ||
| }, | ||
| expected: `[{"content":[{"text":"Compare these images","type":"text"},{"image_url":{"url":"..."},"type":"image_url"},{"image_url":{"url":"..."},"type":"image_url"}],"role":"user"}]`, | ||
| }, | ||
| { | ||
| name: "multimodal message with raw base64 from OpenWebUI (no prefix)", | ||
| messages: []Message{ | ||
| { | ||
| Role: "user", | ||
| Content: "is there a person in the image? Answer yes or no", | ||
| Images: []string{"/9j/4AAQSkZJRgABAQEBLA...."}, | ||
| }, | ||
| }, | ||
| // Should auto-add the data URI prefix | ||
| expected: `[{"content":[{"text":"is there a person in the image? Answer yes or no","type":"text"},{"image_url":{"url":"...."},"type":"image_url"}],"role":"user"}]`, | ||
| }, | ||
| { | ||
| name: "assistant message with tool calls", | ||
| messages: []Message{ | ||
| { | ||
| Role: "assistant", | ||
| Content: "Let me call a function", | ||
| ToolCalls: []ToolCall{ | ||
| { | ||
| ID: "call_123", | ||
| Type: "function", | ||
| Function: FunctionCall{ | ||
| Name: "get_weather", | ||
| Arguments: map[string]interface{}{"location": "San Francisco"}, | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| // The tool_calls will have arguments converted to JSON string | ||
| // Note: JSON field order follows struct definition | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion (testing): Add coverage for messages with Since |
||
| expected: `[{"content":"Let me call a function","role":"assistant","tool_calls":[{"id":"call_123","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"San Francisco\"}"}}]}]`, | ||
| }, | ||
| { | ||
| name: "tool result message with tool_call_id", | ||
| messages: []Message{ | ||
| { | ||
| Role: "tool", | ||
| Content: "The weather in San Francisco is sunny, 72°F", | ||
| ToolCallID: "call_123", | ||
| }, | ||
| }, | ||
| expected: `[{"content":"The weather in San Francisco is sunny, 72°F","role":"tool","tool_call_id":"call_123"}]`, | ||
| }, | ||
| { | ||
| name: "multiple raw base64 images without prefix", | ||
| messages: []Message{ | ||
| { | ||
| Role: "user", | ||
| Content: "Compare these two images", | ||
| Images: []string{ | ||
| "/9j/4AAQSkZJRgABAQEBLA...", | ||
| "iVBORw0KGgoAAAANSUhEUgAAA...", | ||
| }, | ||
| }, | ||
| }, | ||
| // Should auto-detect MIME types and add appropriate prefixes | ||
| expected: `[{"content":[{"text":"Compare these two images","type":"text"},{"image_url":{"url":"..."},"type":"image_url"},{"image_url":{"url":"..."},"type":"image_url"}],"role":"user"}]`, | ||
| }, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| t.Run(tt.name, func(t *testing.T) { | ||
| result := convertMessages(tt.messages) | ||
|
|
||
| // Marshal to JSON for comparison | ||
| resultJSON, err := json.Marshal(result) | ||
| if err != nil { | ||
| t.Fatalf("Failed to marshal result: %v", err) | ||
| } | ||
|
|
||
| // Compare JSON strings | ||
| if string(resultJSON) != tt.expected { | ||
|
Comment on lines
+132
to
+133
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion (testing): Avoid brittle JSON string comparison in table test Comparing raw JSON strings here is brittle for Suggested implementation: // Marshal to JSON for comparison
resultJSON, err := json.Marshal(result)
if err != nil {
t.Fatalf("Failed to marshal result: %v", err)
}
// Unmarshal both actual and expected into structured values to avoid brittle
// string comparison that depends on JSON key ordering or formatting.
var resultData []map[string]interface{}
if err := json.Unmarshal(resultJSON, &resultData); err != nil {
t.Fatalf("Failed to unmarshal result JSON: %v", err)
}
var expectedData []map[string]interface{}
if err := json.Unmarshal([]byte(tt.expected), &expectedData); err != nil {
t.Fatalf("Failed to unmarshal expected JSON: %v", err)
}
// Compare structured values so the test only fails on semantic differences.
if !reflect.DeepEqual(resultData, expectedData) {
t.Errorf("convertMessages() mismatch\nGot: %s\nExpected: %s", string(resultJSON), tt.expected)
}
|
||
| t.Errorf("convertMessages() mismatch\nGot: %s\nExpected: %s", string(resultJSON), tt.expected) | ||
| } | ||
ilopezluna marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| }) | ||
| } | ||
| } | ||
|
|
||
| func TestEnsureDataURIPrefix(t *testing.T) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion (testing): Consider adding an empty-string case for You already cover prefixed data URIs and http/https URLs. Please also add a case for an empty-string input, which currently becomes just the Suggested implementation: func TestEnsureDataURIPrefix(t *testing.T) {
tests := []struct {
name string
input string
expected string
}{
{
name: "jpeg data uri",
input: "",
expected: "",
},
{
name: "http url",
input: "http://example.com/image.jpg",
expected: "http://example.com/image.jpg",
},
{
name: "https url",
input: "https://example.com/image.jpg",
expected: "https://example.com/image.jpg",
},
{
name: "raw base64 without prefix",
input: "abc123",
expected: "",
},
{
name: "empty string",
input: "",
// Current behavior: empty input becomes just the data URI prefix.
expected: "data:image/jpeg;base64,",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := ensureDataURIPrefix(tt.input)
if result != tt.expected {
t.Errorf("ensureDataURIPrefix(%q) = %q, want %q", tt.input, result, tt.expected)
}
})
}
}If the concrete test cases or field names in
|
||
| tests := []struct { | ||
| name string | ||
| input string | ||
| expected string | ||
| }{ | ||
| { | ||
| name: "raw JPEG base64 without prefix", | ||
| input: "/9j/4AAQSkZJRgABAQEBLA...", | ||
| expected: "...", | ||
| }, | ||
| { | ||
| name: "raw PNG base64 without prefix", | ||
| input: "iVBORw0KGgoAAAANSUhEUgAAA...", | ||
| expected: "...", | ||
| }, | ||
| { | ||
| name: "raw GIF base64 without prefix", | ||
| input: "R0lGODlhAQABAIAAAAAAAP...", | ||
| expected: "...", | ||
| }, | ||
| { | ||
| name: "already has data URI prefix", | ||
| input: "...", | ||
| expected: "...", | ||
| }, | ||
| { | ||
| name: "already has data URI with png", | ||
| input: "...", | ||
| expected: "...", | ||
| }, | ||
| { | ||
| name: "http URL", | ||
| input: "http://example.com/image.jpg", | ||
| expected: "http://example.com/image.jpg", | ||
| }, | ||
| { | ||
| name: "https URL", | ||
| input: "https://example.com/image.jpg", | ||
| expected: "https://example.com/image.jpg", | ||
| }, | ||
| { | ||
| name: "empty string", | ||
| input: "", | ||
| expected: "data:image/jpeg;base64,", | ||
| }, | ||
| { | ||
| name: "whitespace with base64", | ||
| input: " /9j/4AAQSkZJRgABAQEBLA... ", | ||
| expected: "...", | ||
| }, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| t.Run(tt.name, func(t *testing.T) { | ||
| result := ensureDataURIPrefix(tt.input) | ||
| if result != tt.expected { | ||
| t.Errorf("ensureDataURIPrefix() = %v, want %v", result, tt.expected) | ||
| } | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| func TestConvertMessages_PreservesOrder(t *testing.T) { | ||
| messages := []Message{ | ||
| {Role: "system", Content: "You are a helpful assistant"}, | ||
| {Role: "user", Content: "Hello"}, | ||
| {Role: "assistant", Content: "Hi there!"}, | ||
| {Role: "user", Content: "What's in this image?", Images: []string{""}}, | ||
| } | ||
|
|
||
| result := convertMessages(messages) | ||
|
|
||
|
Comment on lines
+203
to
+212
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion (testing): Strengthen multimodal assertions in To improve robustness, also assert the structure of the two content parts (e.g., first has Suggested implementation: func TestConvertMessages_PreservesOrder(t *testing.T) {
messages := []Message{
{Role: "system", Content: "You are a helpful assistant"},
{Role: "user", Content: "Hello"},
{Role: "assistant", Content: "Hi there!"},
{Role: "user", Content: "What's in this image?", Images: []string{""}},
}
result := convertMessages(messages)
if len(result) != 4 {
t.Fatalf("expected 4 converted messages, got %d", len(result))
}
// Verify roles are preserved in order
expectedRoles := []string{"system", "user", "assistant", "user"}
for i, role := range expectedRoles {
if result[i].Role != role {
t.Fatalf("message %d role mismatch: got %q, want %q", i, result[i].Role, role)
}
}
// Verify multimodal content structure for the last user message
last := result[3]
// We expect a multimodal content array with two parts: text then image_url
if len(last.Content) != 2 {
t.Fatalf("expected last message to have 2 content parts, got %d", len(last.Content))
}
// First part: text
textPart, ok := last.Content[0].(openai.ChatCompletionMessageContentPartText)
if !ok {
t.Fatalf("expected first content part to be text, got %T", last.Content[0])
}
if textPart.Type != "text" {
t.Fatalf("expected first content part type %q, got %q", "text", textPart.Type)
}
if textPart.Text != "What's in this image?" {
t.Fatalf("unexpected text content: got %q, want %q", textPart.Text, "What's in this image?")
}
// Second part: image_url
imagePart, ok := last.Content[1].(openai.ChatCompletionMessageContentPartImage)
if !ok {
t.Fatalf("expected second content part to be image_url, got %T", last.Content[1])
}
if imagePart.Type != "image_url" {
t.Fatalf("expected second content part type %q, got %q", "image_url", imagePart.Type)
}
if imagePart.ImageURL.URL != "" {
t.Fatalf("unexpected image_url content URL: got %q, want %q", imagePart.ImageURL.URL, "")
}
|
||
| if len(result) != 4 { | ||
| t.Errorf("Expected 4 messages, got %d", len(result)) | ||
| } | ||
|
|
||
| // Check roles are preserved in order | ||
| expectedRoles := []string{"system", "user", "assistant", "user"} | ||
| for i, msg := range result { | ||
| if msg["role"] != expectedRoles[i] { | ||
| t.Errorf("Message %d: expected role %s, got %s", i, expectedRoles[i], msg["role"]) | ||
| } | ||
| } | ||
|
|
||
| // Check last message has multimodal content | ||
| lastMsg := result[3] | ||
| content, ok := lastMsg["content"].([]map[string]interface{}) | ||
| if !ok { | ||
| t.Errorf("Last message content should be an array, got %T", lastMsg["content"]) | ||
| } | ||
| if len(content) != 2 { | ||
| t.Errorf("Last message should have 2 content parts (text + image), got %d", len(content)) | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
suggestion (testing): Consider adding a case for multiple raw base64 images without prefix
You already cover a single raw base64 image (no prefix) being normalized via
ensureDataURIPrefixinconvertMessages. Since this also runs for multiple images, please add a test whereImagescontains at least two raw base64 strings without prefixes, and assert that each resultingimage_url.urlis correctly prefixed. This helps catch any off‑by‑one or loop issues in the multimodal path.