
Commit 9500ec7

feat(multimodal): parse all images shared in the conversation (#221)
Signed-off-by: Ettore Di Giacinto <[email protected]>
1 parent 1fb7f8b commit 9500ec7

File tree

2 files changed: +222 -190 lines changed


core/agent/agent.go

Lines changed: 71 additions & 40 deletions
@@ -444,12 +444,13 @@ func (a *Agent) describeImage(ctx context.Context, model, imageURL string) (stri
     return resp.Choices[0].Message.Content, nil
 }
 
-func extractImageContent(message openai.ChatCompletionMessage) (imageURL, text string, e error) {
+// extractAllImageContent extracts all images from a message
+func extractAllImageContent(message openai.ChatCompletionMessage) (images []string, text string, e error) {
     e = fmt.Errorf("no image found")
     if message.MultiContent != nil {
         for _, content := range message.MultiContent {
             if content.Type == openai.ChatMessagePartTypeImageURL {
-                imageURL = content.ImageURL.URL
+                images = append(images, content.ImageURL.URL)
                 e = nil
             }
             if content.Type == openai.ChatMessagePartTypeText {
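
For readers who want to exercise the new helper in isolation, here is a minimal, self-contained sketch. It is hypothetical: it assumes the github.com/sashabaranov/go-openai types used above, and because the hunk is truncated at the text branch, the text accumulation shown is an assumption rather than the committed code.

package main

import (
    "fmt"

    openai "github.com/sashabaranov/go-openai"
)

// extractAllImageContent mirrors the patched helper: it collects every
// image URL in a multi-part message instead of keeping only the last one.
func extractAllImageContent(message openai.ChatCompletionMessage) (images []string, text string, e error) {
    e = fmt.Errorf("no image found")
    if message.MultiContent != nil {
        for _, content := range message.MultiContent {
            if content.Type == openai.ChatMessagePartTypeImageURL {
                images = append(images, content.ImageURL.URL)
                e = nil
            }
            if content.Type == openai.ChatMessagePartTypeText {
                // Assumption: the hunk above is truncated here; the text
                // part is presumably accumulated along these lines.
                text += content.Text
            }
        }
    }
    return images, text, e
}

func main() {
    msg := openai.ChatCompletionMessage{
        Role: openai.ChatMessageRoleUser,
        MultiContent: []openai.ChatMessagePart{
            {Type: openai.ChatMessagePartTypeText, Text: "What changed between these?"},
            {Type: openai.ChatMessagePartTypeImageURL, ImageURL: &openai.ChatMessageImageURL{URL: "https://example.com/a.png"}},
            {Type: openai.ChatMessagePartTypeImageURL, ImageURL: &openai.ChatMessageImageURL{URL: "https://example.com/b.png"}},
        },
    }
    images, text, err := extractAllImageContent(msg)
    // Prints both URLs, the text part, and <nil>; the pre-patch helper
    // would have surfaced only the last image.
    fmt.Println(images, text, err)
}
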
@@ -463,45 +464,75 @@ func extractImageContent(message openai.ChatCompletionMessage) (imageURL, text s
 
 func (a *Agent) processUserInputs(job *types.Job, role string, conv Messages) Messages {
 
-    // walk conversation history, and check if last message from user contains image.
-    // If it does, we need to describe the image first with a model that supports image understanding (if the current model doesn't support it)
-    // and add it to the conversation context
+    // walk conversation history, and check if any message contains images.
+    // If they do, we need to describe the images first with a model that supports image understanding (if the current model doesn't support it)
+    // and add them to the conversation context
     if !a.options.SeparatedMultimodalModel() {
         return conv
     }
 
     xlog.Debug("Processing user inputs", "agent", a.Character.Name, "conversation", conv)
 
-    lastUserMessage := conv.GetLatestUserMessage()
-    xlog.Debug("Last user message", "lastUserMessage", lastUserMessage)
-    if lastUserMessage != nil && conv.IsLastMessageFromRole(UserRole) {
-        imageURL, text, err := extractImageContent(*lastUserMessage)
-        if err == nil {
-            xlog.Debug("Found image in user input", "image", imageURL)
-            // We have an image, we need to describe it first
-            // and add it to the conversation context
-            imageDescription, err := a.describeImage(a.context.Context, a.options.LLMAPI.MultimodalModel, imageURL)
-            if err != nil {
-                xlog.Error("Error describing image", "error", err)
-            } else {
-                // We replace the user message with the image description
-                // and add the user text to the conversation
-                explainerMessage := openai.ChatCompletionMessage{
-                    Role: "system",
-                    Content: fmt.Sprintf("The user shared an image which can be described as: %s", imageDescription),
+    // Process all messages in the conversation to extract and describe images
+    var processedMessages Messages
+    var messagesToRemove []int
+
+    for i, message := range conv {
+        images, text, err := extractAllImageContent(message)
+        if err == nil && len(images) > 0 {
+            xlog.Debug("Found images in message", "messageIndex", i, "imageCount", len(images), "role", message.Role)
+
+            // Mark this message for removal
+            messagesToRemove = append(messagesToRemove, i)
+
+            // Process each image in the message
+            var imageDescriptions []string
+            for j, image := range images {
+                imageDescription, err := a.describeImage(a.context.Context, a.options.LLMAPI.MultimodalModel, image)
+                if err != nil {
+                    xlog.Error("Error describing image", "error", err, "messageIndex", i, "imageIndex", j)
+                    imageDescriptions = append(imageDescriptions, fmt.Sprintf("Image %d: [Error describing image: %v]", j+1, err))
+                } else {
+                    imageDescriptions = append(imageDescriptions, fmt.Sprintf("Image %d: %s", j+1, imageDescription))
                 }
+            }
 
-                // remove lastUserMessage from the conversation
-                conv = conv.RemoveLastUserMessage()
-                conv = append(conv, explainerMessage)
-                conv = append(conv, openai.ChatCompletionMessage{
-                    Role: role,
+            // Add the text content as a new message with the same role first
+            if text != "" {
+                textMessage := openai.ChatCompletionMessage{
+                    Role: message.Role,
                     Content: text,
-                })
+                }
+                processedMessages = append(processedMessages, textMessage)
+
+                // Add the image descriptions as a system message after the text
+                explainerMessage := openai.ChatCompletionMessage{
+                    Role: "system",
+                    Content: fmt.Sprintf("The above message also contains %d image(s) which can be described as: %s",
+                        len(images), strings.Join(imageDescriptions, "; ")),
+                }
+                processedMessages = append(processedMessages, explainerMessage)
+            } else {
+                // If there's no text, just add the image descriptions as a system message
+                explainerMessage := openai.ChatCompletionMessage{
+                    Role: "system",
+                    Content: fmt.Sprintf("Message contains %d image(s) which can be described as: %s",
+                        len(images), strings.Join(imageDescriptions, "; ")),
+                }
+                processedMessages = append(processedMessages, explainerMessage)
             }
+        } else {
+            // No image found, keep the original message
+            processedMessages = append(processedMessages, message)
         }
     }
 
+    // If we found and processed any images, replace the conversation
+    if len(messagesToRemove) > 0 {
+        xlog.Info("Processed images in conversation", "messagesWithImages", len(messagesToRemove), "agent", a.Character.Name)
+        return processedMessages
+    }
+
     return conv
 }
 
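
To make the conversation rewriting concrete, here is a small hypothetical sketch (illustrative role, text, and descriptions only) of the shape processUserInputs now produces for a user message that carried text plus two images: the multi-part message is replaced by the plain text under its original role, followed by a system note with the per-image descriptions.

package main

import (
    "fmt"
    "strings"

    openai "github.com/sashabaranov/go-openai"
)

func main() {
    // Illustrative stand-ins for what the multimodal model might return.
    text := "Compare these two screenshots"
    imageDescriptions := []string{
        "Image 1: a settings dialog with the option disabled",
        "Image 2: the same dialog with the option enabled",
    }

    // The original multi-part message is dropped and replaced by two plain
    // messages: the user's text, then a system note with the descriptions.
    processed := []openai.ChatCompletionMessage{
        {Role: openai.ChatMessageRoleUser, Content: text},
        {
            Role: "system",
            Content: fmt.Sprintf("The above message also contains %d image(s) which can be described as: %s",
                len(imageDescriptions), strings.Join(imageDescriptions, "; ")),
        },
    }
    for _, m := range processed {
        fmt.Printf("%s: %s\n", m.Role, m.Content)
    }
}
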
@@ -578,30 +609,30 @@ func (a *Agent) validateBuiltinTools(job *types.Job) {
     if len(builtinTools) == 0 {
         return
     }
-
+
     // Get available actions
     availableActions := a.mcpActions
-
+
     for _, tool := range builtinTools {
         functionName := tool.Name
-
+
         // Check if this is a web search builtin tool
         if strings.HasPrefix(string(functionName), "web_search_") {
             // Look for a search action
             searchAction := availableActions.Find("search")
             if searchAction == nil {
-                xlog.Warn("Web search builtin tool specified but no 'search' action available",
-                    "function_name", functionName,
+                xlog.Warn("Web search builtin tool specified but no 'search' action available",
+                    "function_name", functionName,
                     "agent", a.Character.Name)
             } else {
-                xlog.Debug("Web search builtin tool matched to search action",
-                    "function_name", functionName,
+                xlog.Debug("Web search builtin tool matched to search action",
+                    "function_name", functionName,
                     "agent", a.Character.Name)
             }
         } else {
             // For future builtin tools, add more matching logic here
-            xlog.Warn("Unknown builtin tool specified",
-                "function_name", functionName,
+            xlog.Warn("Unknown builtin tool specified",
+                "function_name", functionName,
                 "agent", a.Character.Name)
         }
     }
@@ -621,10 +652,10 @@ func (a *Agent) replyWithToolCall(job *types.Job, conv []openai.ChatCompletionMe
             Result: reasoning, // The reasoning/message to show to user
         },
     }
-
+
     // Add the action state to the job result
     job.Result.SetResult(stateResult)
-
+
     // Set conversation but leave Response empty
     // The webui will detect the user-defined action and generate the proper tool call response
     job.Result.Conversation = conv
@@ -912,7 +943,7 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
         a.replyWithToolCall(job, conv, actionParams, chosenAction, reasoning)
         return
     }
-
+
     result, err := a.runAction(job, chosenAction, actionParams)
     if err != nil {
         result.Result = fmt.Sprintf("Error running tool: %v", err)
