29 changes: 27 additions & 2 deletions benchmarks/benchmark_dataset.py
@@ -94,7 +94,14 @@
"""
content = [{"text": prompt, "type": "text"}]
if mm_content is not None:
content.append(mm_content)
# Handle multiple images in mm_content
if "image" in mm_content and isinstance(mm_content["image"], list):
# Multiple images - append each one separately
for img in mm_content["image"]:
content.append(img)
else:
# Single image or other multimodal content
content.append(mm_content)
return [{"role": "user", "content": content}]

def load_data(self) -> None:
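
A quick check of the new branch: when mm_content carries a list under the "image" key, each processed image becomes its own entry in the chat content list instead of one nested entry. A minimal standalone sketch of the same logic (the image dicts are hypothetical placeholders, not real process_image() output; extend() replaces the loop):

# Standalone restatement of the transformation above; the image payloads
# are hypothetical placeholders, not output of process_image().
def to_chat_message(prompt, mm_content=None):
    content = [{"text": prompt, "type": "text"}]
    if mm_content is not None:
        if "image" in mm_content and isinstance(mm_content["image"], list):
            # Multiple images: one content entry per image.
            content.extend(mm_content["image"])
        else:
            content.append(mm_content)
    return [{"role": "user", "content": content}]

imgs = [{"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,..."}}] * 2
msg = to_chat_message("Describe both images.", {"image": imgs})
assert len(msg[0]["content"]) == 3  # one text entry plus two image entries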
@@ -747,6 +754,9 @@
         sampled_requests = []
         dynamic_output = output_len is None

+        # Extract images_per_request from kwargs
+        images_per_request = kwargs.get('images_per_request', 1)
+
         for item in filtered_data:
             if len(sampled_requests) >= num_requests:
                 break
@@ -761,7 +771,22 @@
             assert isinstance(output_len, int) and output_len > 0
             if dynamic_output and not is_valid_sequence(prompt_len, completion_len):
                 continue
-            mm_content = process_image(item["image"]) if "image" in item else None
+
+            # Simple approach: if multiple images requested, duplicate the current image
+            if images_per_request > 1 and "image" in item:
+                single_image = process_image(item["image"])
+                if single_image:
+                    # For chat format, we need to create separate image entries
+                    # Instead of {"image": [img1, img2]}, create multiple separate processed images
+                    images = [single_image] * images_per_request
+                    # For multimodal chat, we'll pass the list to be handled properly
+                    mm_content = {"image": images}
+                else:
+                    mm_content = None
+            else:
+                # Original single image logic
+                mm_content = process_image(item["image"]) if "image" in item else None
+
             if enable_multimodal_chat:
                 # Note: when chat is enabled the request prompt_len is no longer
                 # accurate and we will be using request output to count the

[CI] Ruff (E501): benchmarks/benchmark_dataset.py:780:89 Line too long (99 > 88)
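
Design note: rather than sourcing genuinely distinct images, this branch duplicates one processed image images_per_request times, which inflates per-request multimodal load without requiring a multi-image dataset. A condensed, runnable restatement of the branch under that assumption (fake_process is a hypothetical stand-in for the real process_image):

def build_mm_content(item, images_per_request, process_image):
    # Mirrors the sampling branch above: duplicate one processed image
    # when more than one image per request is requested.
    if images_per_request > 1 and "image" in item:
        single_image = process_image(item["image"])
        if not single_image:
            return None
        return {"image": [single_image] * images_per_request}
    return process_image(item["image"]) if "image" in item else None

# fake_process is a hypothetical stand-in for the real process_image().
fake_process = lambda img: {"type": "image_url", "image_url": {"url": img}}
mm = build_mm_content({"image": "http://example.com/cat.jpg"}, 3, fake_process)
assert len(mm["image"]) == 3  # the same image, repeated three times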
4 changes: 4 additions & 0 deletions benchmarks/benchmark_throughput.py
@@ -465,6 +465,10 @@
"output_len": args.output_len,
}

# Add images_per_request if limit_mm_per_prompt is set
if hasattr(args, 'limit_mm_per_prompt') and args.limit_mm_per_prompt and 'image' in args.limit_mm_per_prompt:

Check failure on line 469 in benchmarks/benchmark_throughput.py

View workflow job for this annotation

GitHub Actions / pre-commit

Ruff (E501)

benchmarks/benchmark_throughput.py:469:89: E501 Line too long (113 > 88)
sample_kwargs["images_per_request"] = args.limit_mm_per_prompt['image']

if args.dataset_path is None or args.dataset_name == "random":
sample_kwargs["range_ratio"] = args.random_range_ratio
sample_kwargs["prefix_len"] = args.prefix_len
Expand Down
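
The hasattr-plus-truthiness-plus-membership chain can be exercised in isolation; a sketch, assuming limit_mm_per_prompt is parsed into a dict such as {"image": 4} (getattr is an equivalent, shorter guard):

from argparse import Namespace

# Namespace stands in for the parsed args; {"image": 4} is an assumed shape.
args = Namespace(limit_mm_per_prompt={"image": 4}, output_len=128)
sample_kwargs = {"output_len": args.output_len}
if getattr(args, "limit_mm_per_prompt", None) and "image" in args.limit_mm_per_prompt:
    sample_kwargs["images_per_request"] = args.limit_mm_per_prompt["image"]
assert sample_kwargs["images_per_request"] == 4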
10 changes: 8 additions & 2 deletions vllm/entrypoints/chat_utils.py
@@ -405,8 +405,14 @@
     jinja_text = (hf_chat_template if isinstance(hf_chat_template, str)
                   else load_chat_template(chat_template, is_literal=True))

-    detected_format = ("string" if jinja_text is None else
-                       _detect_content_format(jinja_text, default="string"))
+    # The InternVL template has mixed content access patterns that fail with automatic detection.
+    # Set string format for proper operation if InternVL is used.
+    model_type = getattr(model_config.hf_config, 'model_type', '')
+    if model_type == 'internvl_chat' or 'internvl' in model_config.model.lower():
+        detected_format = "string"
+    else:
+        detected_format = ("string" if jinja_text is None else
+                           _detect_content_format(jinja_text, default="string"))

     return detected_format

[CI] Ruff (E501): vllm/entrypoints/chat_utils.py:408:81 Line too long (97 > 80)
[CI] Ruff (E501): vllm/entrypoints/chat_utils.py:411:81 Line too long (81 > 80)
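
For context, the two formats the detector chooses between differ in the shape of message["content"]; forcing "string" means multimodal parts are flattened into a single text string before the template renders them, sidestepping the InternVL template's mixed attribute/index access. An illustrative sketch of the two shapes (hypothetical messages and placeholder token, not vLLM API calls):

# "string" content format: content is one plain string; the "<image>"
# placeholder shown here is an assumed convention, not a confirmed token.
string_style = {"role": "user", "content": "<image>\nDescribe the image."}

# "openai" content format: content is a list of typed parts.
openai_style = {
    "role": "user",
    "content": [
        {"type": "text", "text": "Describe the image."},
        {"type": "image_url", "image_url": {"url": "data:image/png;base64,..."}},
    ],
}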