Skip to content

Commit 5e9edae

Browse files
authored
[VLM] Add caption to flickr dataset (#1138)
## Purpose ## * Introduce more variability in the flickr dataset for better calibration Sample text ``` <|begin_of_text|><|start_header_id|>user<|end_header_id|> <|image|>What does this image show?<|eot_id|><|start_header_id|>assistant<|end_header_id|> Two young guys with shaggy hair look at their hands while hanging out in the yard . Two young White males are outside near many bushes . Two men in green shirts are standing in a yard . A man in a blue shirt standing in a garden . Two friends enjoy time spent together .<|eot_id|> ``` ## Changes ## * Add caption to flickr dataset for better/more varied calibration * Modify preprocessing to add caption for vlm examples ## Testing ## * Ran all vision examples --------- Signed-off-by: Kyle Sayers <[email protected]>
1 parent cdf686f commit 5e9edae

File tree

5 files changed

+33
-11
lines changed

5 files changed

+33
-11
lines changed

examples/multimodal_vision/idefics3_example.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,15 +49,19 @@ def preprocess(example):
4949
{
5050
"role": "user",
5151
"content": [
52-
{"type": "text", "text": "What does the image show?"},
52+
{"type": "text", "text": "What does this image show?"},
5353
{"type": "image"},
5454
],
55-
}
55+
},
56+
{
57+
"role": "assistant",
58+
"content": " ".join(example["caption"]),
59+
},
5660
]
5761
return {
5862
"text": processor.apply_chat_template(
5963
messages,
60-
add_generation_prompt=True,
64+
add_generation_prompt=False,
6165
),
6266
"images": example["image"],
6367
}

examples/multimodal_vision/phi3_vision_example.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# NOTE: this model requires modification in order to work with transformers>4.48
2+
# https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/discussions/69
3+
14
import torch
25
from datasets import load_dataset
36
from transformers import AutoModelForCausalLM, AutoProcessor
@@ -30,11 +33,14 @@
3033

3134
# Apply chat template
3235
def preprocess(example):
33-
messages = [{"role": "user", "content": "<|image_1|>\nWhat does the image show?"}]
36+
messages = [
37+
{"role": "user", "content": "<|image_1|>\nWhat does this image show?"},
38+
{"role": "assistant", "content": " ".join(example["caption"])},
39+
]
3440
return {
3541
"text": processor.apply_chat_template(
3642
messages,
37-
add_generation_prompt=True,
43+
add_generation_prompt=False,
3844
),
3945
"images": example["image"],
4046
}

examples/multimodal_vision/qwen2_vl_example.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,18 @@ def preprocess_and_tokenize(example):
4343
"role": "user",
4444
"content": [
4545
{"type": "image", "image": base64_qwen},
46-
{"type": "text", "text": "What does the image show?"},
46+
{"type": "text", "text": "What does this image show?"},
4747
],
48-
}
48+
},
49+
{
50+
"role": "assistant",
51+
"content": [
52+
{"type": "text", "text": " ".join(example["caption"])},
53+
],
54+
},
4955
]
5056
text = processor.apply_chat_template(
51-
messages, tokenize=False, add_generation_prompt=True
57+
messages, tokenize=False, add_generation_prompt=False
5258
)
5359
image_inputs, video_inputs = process_vision_info(messages)
5460

src/llmcompressor/transformers/finetune/data/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ def __call__(self, add_labels: bool = True) -> DatasetType:
9797
self.preprocess,
9898
batched=False,
9999
num_proc=self.data_args.preprocessing_num_workers,
100+
load_from_cache_file=not self.data_args.overwrite_cache,
100101
desc="Preprocessing",
101102
)
102103
logger.debug(f"Dataset after preprocessing: {get_columns(dataset)}")

src/llmcompressor/transformers/finetune/data/flickr_30k.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,19 @@ def dataset_template(self, sample):
5555
"role": "user",
5656
"content": [
5757
{"type": "image"},
58-
{"type": "text", "text": "What does the image show?"},
58+
{"type": "text", "text": "What does this image show?"},
5959
],
60-
}
60+
},
61+
{
62+
"role": "assistant",
63+
"content": " ".join(sample["caption"]),
64+
},
6165
]
66+
6267
return {
6368
"text": self.processor.apply_chat_template(
6469
messages,
65-
add_generation_prompt=True,
70+
add_generation_prompt=False,
6671
),
6772
"images": sample["image"],
6873
}

0 commit comments

Comments (0)