Skip to content

Commit 5e9edae

Browse files
authored
[VLM] Add caption to flickr dataset (#1138)
## Purpose ## * Introduce more variability in the flickr dataset for better calibration Sample text ``` <|begin_of_text|><|start_header_id|>user<|end_header_id|> <|image|>What does this image show?<|eot_id|><|start_header_id|>assistant<|end_header_id|> Two young guys with shaggy hair look at their hands while hanging out in the yard . Two young White males are outside near many bushes . Two men in green shirts are standing in a yard . A man in a blue shirt standing in a garden . Two friends enjoy time spent together .<|eot_id|> ``` ## Changes ## * Add caption to flickr dataset for better/more varied calibration * Modify preprocessing to add caption for vlm examples ## Testing ## * Ran all vision examples --------- Signed-off-by: Kyle Sayers <[email protected]>
1 parent cdf686f commit 5e9edae

File tree

5 files changed

+33
-11
lines changed

5 files changed

+33
-11
lines changed

examples/multimodal_vision/idefics3_example.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,15 +49,19 @@ def preprocess(example):
4949
{
5050
"role": "user",
5151
"content": [
52-
{"type": "text", "text": "What does the image show?"},
52+
{"type": "text", "text": "What does this image show?"},
5353
{"type": "image"},
5454
],
55-
}
55+
},
56+
{
57+
"role": "assistant",
58+
"content": " ".join(example["caption"]),
59+
},
5660
]
5761
return {
5862
"text": processor.apply_chat_template(
5963
messages,
60-
add_generation_prompt=True,
64+
add_generation_prompt=False,
6165
),
6266
"images": example["image"],
6367
}

examples/multimodal_vision/phi3_vision_example.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
# NOTE: this model requires modification in order to work with transformers>4.48
2+
# https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/discussions/69
3+
14
import torch
25
from datasets import load_dataset
36
from transformers import AutoModelForCausalLM, AutoProcessor
@@ -30,11 +33,14 @@
3033

3134
# Apply chat template
3235
def preprocess(example):
33-
messages = [{"role": "user", "content": "<|image_1|>\nWhat does the image show?"}]
36+
messages = [
37+
{"role": "user", "content": "<|image_1|>\nWhat does this image show?"},
38+
{"role": "assistant", "content": " ".join(example["caption"])},
39+
]
3440
return {
3541
"text": processor.apply_chat_template(
3642
messages,
37-
add_generation_prompt=True,
43+
add_generation_prompt=False,
3844
),
3945
"images": example["image"],
4046
}

examples/multimodal_vision/qwen2_vl_example.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,12 +43,18 @@ def preprocess_and_tokenize(example):
4343
"role": "user",
4444
"content": [
4545
{"type": "image", "image": base64_qwen},
46-
{"type": "text", "text": "What does the image show?"},
46+
{"type": "text", "text": "What does this image show?"},
4747
],
48-
}
48+
},
49+
{
50+
"role": "assistant",
51+
"content": [
52+
{"type": "text", "text": " ".join(example["caption"])},
53+
],
54+
},
4955
]
5056
text = processor.apply_chat_template(
51-
messages, tokenize=False, add_generation_prompt=True
57+
messages, tokenize=False, add_generation_prompt=False
5258
)
5359
image_inputs, video_inputs = process_vision_info(messages)
5460

src/llmcompressor/transformers/finetune/data/base.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ def __call__(self, add_labels: bool = True) -> DatasetType:
9797
self.preprocess,
9898
batched=False,
9999
num_proc=self.data_args.preprocessing_num_workers,
100+
load_from_cache_file=not self.data_args.overwrite_cache,
100101
desc="Preprocessing",
101102
)
102103
logger.debug(f"Dataset after preprocessing: {get_columns(dataset)}")

src/llmcompressor/transformers/finetune/data/flickr_30k.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,19 @@ def dataset_template(self, sample):
5555
"role": "user",
5656
"content": [
5757
{"type": "image"},
58-
{"type": "text", "text": "What does the image show?"},
58+
{"type": "text", "text": "What does this image show?"},
5959
],
60-
}
60+
},
61+
{
62+
"role": "assistant",
63+
"content": " ".join(sample["caption"]),
64+
},
6165
]
66+
6267
return {
6368
"text": self.processor.apply_chat_template(
6469
messages,
65-
add_generation_prompt=True,
70+
add_generation_prompt=False,
6671
),
6772
"images": sample["image"],
6873
}

0 commit comments

Comments (0)