-
Notifications
You must be signed in to change notification settings - Fork 72
feat: add multimodal message support with image data URI handling #497
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,188 @@ | ||
| package ollama | ||
|
|
||
| import ( | ||
| "encoding/json" | ||
| "testing" | ||
| ) | ||
|
|
||
| func TestConvertMessages_Multimodal(t *testing.T) { | ||
| tests := []struct { | ||
| name string | ||
| messages []Message | ||
| expected string | ||
| }{ | ||
| { | ||
| name: "text only message", | ||
| messages: []Message{ | ||
| { | ||
| Role: "user", | ||
| Content: "Hello, world!", | ||
| }, | ||
| }, | ||
| expected: `[{"content":"Hello, world!","role":"user"}]`, | ||
| }, | ||
| { | ||
| name: "multimodal message with text and image", | ||
| messages: []Message{ | ||
| { | ||
| Role: "user", | ||
| Content: "is there a person in the image? Answer yes or no", | ||
| Images: []string{"...."}, | ||
| }, | ||
| }, | ||
| expected: `[{"content":[{"text":"is there a person in the image? Answer yes or no","type":"text"},{"image_url":{"url":"...."},"type":"image_url"}],"role":"user"}]`, | ||
| }, | ||
| { | ||
| name: "multimodal message with only image (no text)", | ||
| messages: []Message{ | ||
| { | ||
| Role: "user", | ||
| Content: "", | ||
| Images: []string{"...."}, | ||
| }, | ||
| }, | ||
| expected: `[{"content":[{"image_url":{"url":"...."},"type":"image_url"}],"role":"user"}]`, | ||
| }, | ||
| { | ||
| name: "multimodal message with multiple images", | ||
| messages: []Message{ | ||
| { | ||
| Role: "user", | ||
| Content: "Compare these images", | ||
| Images: []string{ | ||
| "...", | ||
| "...", | ||
| }, | ||
| }, | ||
| }, | ||
| expected: `[{"content":[{"text":"Compare these images","type":"text"},{"image_url":{"url":"..."},"type":"image_url"},{"image_url":{"url":"..."},"type":"image_url"}],"role":"user"}]`, | ||
| }, | ||
| { | ||
| name: "multimodal message with raw base64 from OpenWebUI (no prefix)", | ||
| messages: []Message{ | ||
| { | ||
| Role: "user", | ||
| Content: "is there a person in the image? Answer yes or no", | ||
| Images: []string{"/9j/4AAQSkZJRgABAQEBLA...."}, | ||
| }, | ||
| }, | ||
| // Should auto-add the data URI prefix | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion (testing): Consider adding a case for multiple raw base64 images without prefix You already cover a single raw base64 image (no prefix) being normalized via |
||
| expected: `[{"content":[{"text":"is there a person in the image? Answer yes or no","type":"text"},{"image_url":{"url":"...."},"type":"image_url"}],"role":"user"}]`, | ||
| }, | ||
| { | ||
| name: "assistant message with tool calls", | ||
| messages: []Message{ | ||
| { | ||
| Role: "assistant", | ||
| Content: "Let me call a function", | ||
| ToolCalls: []ToolCall{ | ||
| { | ||
| ID: "call_123", | ||
| Type: "function", | ||
| Function: FunctionCall{ | ||
| Name: "get_weather", | ||
| Arguments: map[string]interface{}{"location": "San Francisco"}, | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| }, | ||
| // The tool_calls will have arguments converted to JSON string | ||
| // Note: JSON field order follows struct definition | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion (testing): Add coverage for messages with Since |
||
| expected: `[{"content":"Let me call a function","role":"assistant","tool_calls":[{"id":"call_123","type":"function","function":{"name":"get_weather","arguments":"{\"location\":\"San Francisco\"}"}}]}]`, | ||
| }, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| t.Run(tt.name, func(t *testing.T) { | ||
| result := convertMessages(tt.messages) | ||
|
|
||
| // Marshal to JSON for comparison | ||
| resultJSON, err := json.Marshal(result) | ||
| if err != nil { | ||
| t.Fatalf("Failed to marshal result: %v", err) | ||
| } | ||
|
|
||
| // Compare JSON strings | ||
| if string(resultJSON) != tt.expected { | ||
|
Comment on lines
+132
to
+133
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion (testing): Avoid brittle JSON string comparison in table test Comparing raw JSON strings here is brittle for Suggested implementation: // Marshal to JSON for comparison
resultJSON, err := json.Marshal(result)
if err != nil {
t.Fatalf("Failed to marshal result: %v", err)
}
// Unmarshal both actual and expected into structured values to avoid brittle
// string comparison that depends on JSON key ordering or formatting.
var resultData []map[string]interface{}
if err := json.Unmarshal(resultJSON, &resultData); err != nil {
t.Fatalf("Failed to unmarshal result JSON: %v", err)
}
var expectedData []map[string]interface{}
if err := json.Unmarshal([]byte(tt.expected), &expectedData); err != nil {
t.Fatalf("Failed to unmarshal expected JSON: %v", err)
}
// Compare structured values so the test only fails on semantic differences.
if !reflect.DeepEqual(resultData, expectedData) {
t.Errorf("convertMessages() mismatch\nGot: %s\nExpected: %s", string(resultJSON), tt.expected)
}
|
||
| t.Errorf("convertMessages() mismatch\nGot: %s\nExpected: %s", string(resultJSON), tt.expected) | ||
| } | ||
ilopezluna marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| }) | ||
| } | ||
| } | ||
|
|
||
| func TestEnsureDataURIPrefix(t *testing.T) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion (testing): Consider adding an empty-string case for You already cover prefixed data URIs and http/https URLs. Please also add a case for an empty-string input, which currently becomes just the Suggested implementation: func TestEnsureDataURIPrefix(t *testing.T) {
tests := []struct {
name string
input string
expected string
}{
{
name: "jpeg data uri",
input: "",
expected: "",
},
{
name: "http url",
input: "http://example.com/image.jpg",
expected: "http://example.com/image.jpg",
},
{
name: "https url",
input: "https://example.com/image.jpg",
expected: "https://example.com/image.jpg",
},
{
name: "raw base64 without prefix",
input: "abc123",
expected: "",
},
{
name: "empty string",
input: "",
// Current behavior: empty input becomes just the data URI prefix.
expected: "data:image/jpeg;base64,",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := ensureDataURIPrefix(tt.input)
if result != tt.expected {
t.Errorf("ensureDataURIPrefix(%q) = %q, want %q", tt.input, result, tt.expected)
}
})
}
}If the concrete test cases or field names in
|
||
| tests := []struct { | ||
| name string | ||
| input string | ||
| expected string | ||
| }{ | ||
| { | ||
| name: "raw base64 without prefix", | ||
| input: "/9j/4AAQSkZJRgABAQEBLA...", | ||
| expected: "...", | ||
| }, | ||
| { | ||
| name: "already has data URI prefix", | ||
| input: "...", | ||
| expected: "...", | ||
| }, | ||
| { | ||
| name: "already has data URI with png", | ||
| input: "...", | ||
| expected: "...", | ||
| }, | ||
| { | ||
| name: "http URL", | ||
| input: "http://example.com/image.jpg", | ||
| expected: "http://example.com/image.jpg", | ||
| }, | ||
| { | ||
| name: "https URL", | ||
| input: "https://example.com/image.jpg", | ||
| expected: "https://example.com/image.jpg", | ||
| }, | ||
| } | ||
|
|
||
| for _, tt := range tests { | ||
| t.Run(tt.name, func(t *testing.T) { | ||
| result := ensureDataURIPrefix(tt.input) | ||
| if result != tt.expected { | ||
| t.Errorf("ensureDataURIPrefix() = %v, want %v", result, tt.expected) | ||
| } | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| func TestConvertMessages_PreservesOrder(t *testing.T) { | ||
| messages := []Message{ | ||
| {Role: "system", Content: "You are a helpful assistant"}, | ||
| {Role: "user", Content: "Hello"}, | ||
| {Role: "assistant", Content: "Hi there!"}, | ||
| {Role: "user", Content: "What's in this image?", Images: []string{""}}, | ||
| } | ||
|
|
||
| result := convertMessages(messages) | ||
|
|
||
|
Comment on lines
+203
to
+212
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. suggestion (testing): Strengthen multimodal assertions in To improve robustness, also assert the structure of the two content parts (e.g., first has Suggested implementation: func TestConvertMessages_PreservesOrder(t *testing.T) {
messages := []Message{
{Role: "system", Content: "You are a helpful assistant"},
{Role: "user", Content: "Hello"},
{Role: "assistant", Content: "Hi there!"},
{Role: "user", Content: "What's in this image?", Images: []string{""}},
}
result := convertMessages(messages)
if len(result) != 4 {
t.Fatalf("expected 4 converted messages, got %d", len(result))
}
// Verify roles are preserved in order
expectedRoles := []string{"system", "user", "assistant", "user"}
for i, role := range expectedRoles {
if result[i].Role != role {
t.Fatalf("message %d role mismatch: got %q, want %q", i, result[i].Role, role)
}
}
// Verify multimodal content structure for the last user message
last := result[3]
// We expect a multimodal content array with two parts: text then image_url
if len(last.Content) != 2 {
t.Fatalf("expected last message to have 2 content parts, got %d", len(last.Content))
}
// First part: text
textPart, ok := last.Content[0].(openai.ChatCompletionMessageContentPartText)
if !ok {
t.Fatalf("expected first content part to be text, got %T", last.Content[0])
}
if textPart.Type != "text" {
t.Fatalf("expected first content part type %q, got %q", "text", textPart.Type)
}
if textPart.Text != "What's in this image?" {
t.Fatalf("unexpected text content: got %q, want %q", textPart.Text, "What's in this image?")
}
// Second part: image_url
imagePart, ok := last.Content[1].(openai.ChatCompletionMessageContentPartImage)
if !ok {
t.Fatalf("expected second content part to be image_url, got %T", last.Content[1])
}
if imagePart.Type != "image_url" {
t.Fatalf("expected second content part type %q, got %q", "image_url", imagePart.Type)
}
if imagePart.ImageURL.URL != "" {
t.Fatalf("unexpected image_url content URL: got %q, want %q", imagePart.ImageURL.URL, "")
}
|
||
| if len(result) != 4 { | ||
| t.Errorf("Expected 4 messages, got %d", len(result)) | ||
| } | ||
|
|
||
| // Check roles are preserved in order | ||
| expectedRoles := []string{"system", "user", "assistant", "user"} | ||
| for i, msg := range result { | ||
| if msg["role"] != expectedRoles[i] { | ||
| t.Errorf("Message %d: expected role %s, got %s", i, expectedRoles[i], msg["role"]) | ||
| } | ||
| } | ||
|
|
||
| // Check last message has multimodal content | ||
| lastMsg := result[3] | ||
| content, ok := lastMsg["content"].([]map[string]interface{}) | ||
| if !ok { | ||
| t.Errorf("Last message content should be an array, got %T", lastMsg["content"]) | ||
| } | ||
| if len(content) != 2 { | ||
| t.Errorf("Last message should have 2 content parts (text + image), got %d", len(content)) | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.