@@ -444,12 +444,13 @@ func (a *Agent) describeImage(ctx context.Context, model, imageURL string) (stri
444444 return resp .Choices [0 ].Message .Content , nil
445445}
446446
447- func extractImageContent (message openai.ChatCompletionMessage ) (imageURL , text string , e error ) {
447+ // extractAllImageContent extracts all images from a message
448+ func extractAllImageContent (message openai.ChatCompletionMessage ) (images []string , text string , e error ) {
448449 e = fmt .Errorf ("no image found" )
449450 if message .MultiContent != nil {
450451 for _ , content := range message .MultiContent {
451452 if content .Type == openai .ChatMessagePartTypeImageURL {
452- imageURL = content .ImageURL .URL
453+ images = append ( images , content .ImageURL .URL )
453454 e = nil
454455 }
455456 if content .Type == openai .ChatMessagePartTypeText {
@@ -463,45 +464,75 @@ func extractImageContent(message openai.ChatCompletionMessage) (imageURL, text s
463464
464465func (a * Agent ) processUserInputs (job * types.Job , role string , conv Messages ) Messages {
465466
466- // walk conversation history, and check if last message from user contains image .
467- // If it does , we need to describe the image first with a model that supports image understanding (if the current model doesn't support it)
468- // and add it to the conversation context
467+ // walk conversation history, and check if any message contains images .
468+ // If they do , we need to describe the images first with a model that supports image understanding (if the current model doesn't support it)
469+ // and add them to the conversation context
469470 if ! a .options .SeparatedMultimodalModel () {
470471 return conv
471472 }
472473
473474 xlog .Debug ("Processing user inputs" , "agent" , a .Character .Name , "conversation" , conv )
474475
475- lastUserMessage := conv .GetLatestUserMessage ()
476- xlog .Debug ("Last user message" , "lastUserMessage" , lastUserMessage )
477- if lastUserMessage != nil && conv .IsLastMessageFromRole (UserRole ) {
478- imageURL , text , err := extractImageContent (* lastUserMessage )
479- if err == nil {
480- xlog .Debug ("Found image in user input" , "image" , imageURL )
481- // We have an image, we need to describe it first
482- // and add it to the conversation context
483- imageDescription , err := a .describeImage (a .context .Context , a .options .LLMAPI .MultimodalModel , imageURL )
484- if err != nil {
485- xlog .Error ("Error describing image" , "error" , err )
486- } else {
487- // We replace the user message with the image description
488- // and add the user text to the conversation
489- explainerMessage := openai.ChatCompletionMessage {
490- Role : "system" ,
491- Content : fmt .Sprintf ("The user shared an image which can be described as: %s" , imageDescription ),
476+ // Process all messages in the conversation to extract and describe images
477+ var processedMessages Messages
478+ var messagesToRemove []int
479+
480+ for i , message := range conv {
481+ images , text , err := extractAllImageContent (message )
482+ if err == nil && len (images ) > 0 {
483+ xlog .Debug ("Found images in message" , "messageIndex" , i , "imageCount" , len (images ), "role" , message .Role )
484+
485+ // Mark this message for removal
486+ messagesToRemove = append (messagesToRemove , i )
487+
488+ // Process each image in the message
489+ var imageDescriptions []string
490+ for j , image := range images {
491+ imageDescription , err := a .describeImage (a .context .Context , a .options .LLMAPI .MultimodalModel , image )
492+ if err != nil {
493+ xlog .Error ("Error describing image" , "error" , err , "messageIndex" , i , "imageIndex" , j )
494+ imageDescriptions = append (imageDescriptions , fmt .Sprintf ("Image %d: [Error describing image: %v]" , j + 1 , err ))
495+ } else {
496+ imageDescriptions = append (imageDescriptions , fmt .Sprintf ("Image %d: %s" , j + 1 , imageDescription ))
492497 }
498+ }
493499
494- // remove lastUserMessage from the conversation
495- conv = conv .RemoveLastUserMessage ()
496- conv = append (conv , explainerMessage )
497- conv = append (conv , openai.ChatCompletionMessage {
498- Role : role ,
500+ // Add the text content as a new message with the same role first
501+ if text != "" {
502+ textMessage := openai.ChatCompletionMessage {
503+ Role : message .Role ,
499504 Content : text ,
500- })
505+ }
506+ processedMessages = append (processedMessages , textMessage )
507+
508+ // Add the image descriptions as a system message after the text
509+ explainerMessage := openai.ChatCompletionMessage {
510+ Role : "system" ,
511+ Content : fmt .Sprintf ("The above message also contains %d image(s) which can be described as: %s" ,
512+ len (images ), strings .Join (imageDescriptions , "; " )),
513+ }
514+ processedMessages = append (processedMessages , explainerMessage )
515+ } else {
516+ // If there's no text, just add the image descriptions as a system message
517+ explainerMessage := openai.ChatCompletionMessage {
518+ Role : "system" ,
519+ Content : fmt .Sprintf ("Message contains %d image(s) which can be described as: %s" ,
520+ len (images ), strings .Join (imageDescriptions , "; " )),
521+ }
522+ processedMessages = append (processedMessages , explainerMessage )
501523 }
524+ } else {
525+ // No image found, keep the original message
526+ processedMessages = append (processedMessages , message )
502527 }
503528 }
504529
530+ // If we found and processed any images, replace the conversation
531+ if len (messagesToRemove ) > 0 {
532+ xlog .Info ("Processed images in conversation" , "messagesWithImages" , len (messagesToRemove ), "agent" , a .Character .Name )
533+ return processedMessages
534+ }
535+
505536 return conv
506537}
507538
@@ -578,30 +609,30 @@ func (a *Agent) validateBuiltinTools(job *types.Job) {
578609 if len (builtinTools ) == 0 {
579610 return
580611 }
581-
612+
582613 // Get available actions
583614 availableActions := a .mcpActions
584-
615+
585616 for _ , tool := range builtinTools {
586617 functionName := tool .Name
587-
618+
588619 // Check if this is a web search builtin tool
589620 if strings .HasPrefix (string (functionName ), "web_search_" ) {
590621 // Look for a search action
591622 searchAction := availableActions .Find ("search" )
592623 if searchAction == nil {
593- xlog .Warn ("Web search builtin tool specified but no 'search' action available" ,
594- "function_name" , functionName ,
624+ xlog .Warn ("Web search builtin tool specified but no 'search' action available" ,
625+ "function_name" , functionName ,
595626 "agent" , a .Character .Name )
596627 } else {
597- xlog .Debug ("Web search builtin tool matched to search action" ,
598- "function_name" , functionName ,
628+ xlog .Debug ("Web search builtin tool matched to search action" ,
629+ "function_name" , functionName ,
599630 "agent" , a .Character .Name )
600631 }
601632 } else {
602633 // For future builtin tools, add more matching logic here
603- xlog .Warn ("Unknown builtin tool specified" ,
604- "function_name" , functionName ,
634+ xlog .Warn ("Unknown builtin tool specified" ,
635+ "function_name" , functionName ,
605636 "agent" , a .Character .Name )
606637 }
607638 }
@@ -621,10 +652,10 @@ func (a *Agent) replyWithToolCall(job *types.Job, conv []openai.ChatCompletionMe
621652 Result : reasoning , // The reasoning/message to show to user
622653 },
623654 }
624-
655+
625656 // Add the action state to the job result
626657 job .Result .SetResult (stateResult )
627-
658+
628659 // Set conversation but leave Response empty
629660 // The webui will detect the user-defined action and generate the proper tool call response
630661 job .Result .Conversation = conv
@@ -912,7 +943,7 @@ func (a *Agent) consumeJob(job *types.Job, role string, retries int) {
912943 a .replyWithToolCall (job , conv , actionParams , chosenAction , reasoning )
913944 return
914945 }
915-
946+
916947 result , err := a .runAction (job , chosenAction , actionParams )
917948 if err != nil {
918949 result .Result = fmt .Sprintf ("Error running tool: %v" , err )
0 commit comments