
Commit 183364e

[examples] fix vision_tower/multi_modal_projector regexes (#1871)
SUMMARY: Resolves #1652. Our multimodal examples all ignore `"re:vision_tower.*"`, but this misses cases where the name is prefixed with something else (e.g. `model.vision_tower`). This PR loosens the regexes so that anything may precede `vision_tower` or `multi_modal_projector` and still be caught by the ignore; layers beginning with `vision_tower`, without a prefix, will still be caught. Also includes some formatting fixes, which must not be enforced on `examples/` as part of CI/CD checks.

TEST PLAN: Running `llm-compressor/examples/multimodal_vision/mistral3_example.py` on latest main shows we are quantizing layers we don't want to be:

```
2025-09-26T20:02:43.571160+0000 | compress_modules | INFO - Quantizing model.vision_tower.transformer.layers.4.feed_forward.gate_proj using 512 samples
```

After these changes, those layers no longer appear in the logs.

---------

Signed-off-by: Brian Dellabetta <[email protected]>
Co-authored-by: Fynn Schmitt-Ulms <[email protected]>
1 parent fae9429 commit 183364e
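
To make the regex fix concrete, here is a minimal sketch, assuming the `re:` ignore patterns are applied to fully qualified module names with `re.match`-style semantics (anchored at the start of the string, an assumption about the matching internals rather than something this diff confirms). That anchoring is why the old patterns miss prefixed names:

```python
import re

# One unprefixed and one `model.`-prefixed module name, mirroring the
# layer from the test-plan log above.
names = [
    "vision_tower.transformer.layers.4.feed_forward.gate_proj",
    "model.vision_tower.transformer.layers.4.feed_forward.gate_proj",
]

old_pattern = r"vision_tower.*"    # anchored at the start by re.match
new_pattern = r".*vision_tower.*"  # loosened: any prefix is allowed

for name in names:
    print(
        name,
        "old:", bool(re.match(old_pattern, name)),
        "new:", bool(re.match(new_pattern, name)),
    )

# vision_tower.transformer.layers.4...        old: True   new: True
# model.vision_tower.transformer.layers.4...  old: False  new: True
```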

File tree

13 files changed: +31 −25 lines changed

examples/multimodal_vision/README.md
Lines changed: 1 addition & 1 deletion

````diff
@@ -37,7 +37,7 @@ recipe = [
         targets="Linear",
         scheme="W4A16",
         sequential_targets=["MistralDecoderLayer"],
-        ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
+        ignore=["re:.*lm_head", "re:.*vision_tower.*", "re:.*multi_modal_projector.*"],
     ),
 ]
 ```
````

examples/multimodal_vision/llama4_example.py
Lines changed: 7 additions & 5 deletions

```diff
@@ -52,9 +52,11 @@ def preprocess_function(example):
 def data_collator(batch):
     assert len(batch) == 1
     return {
-        key: torch.tensor(value)
-        if key != "pixel_values"
-        else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+        key: (
+            torch.tensor(value)
+            if key != "pixel_values"
+            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+        )
         for key, value in batch[0].items()
     }
 
@@ -67,8 +69,8 @@ def data_collator(batch):
         "re:.*lm_head",
         "re:.*self_attn",
         "re:.*router",
-        "re:vision_model.*",
-        "re:multi_modal_projector.*",
+        "re:.*vision_model.*",
+        "re:.*multi_modal_projector.*",
         "Llama4TextAttention",
     ],
 )
```
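
The collator rewrite above only adds parentheses around the conditional expression so it nests cleanly inside the dict comprehension; behavior is unchanged. A self-contained sketch of the resulting collator with a hypothetical one-sample batch (key names and shapes assumed for illustration; real keys come from the HF processor):

```python
import torch


def data_collator(batch):
    # Same logic as the example after this change.
    assert len(batch) == 1
    return {
        key: (
            torch.tensor(value)
            if key != "pixel_values"
            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        )
        for key, value in batch[0].items()
    }


# Hypothetical one-sample batch.
sample = {
    "input_ids": [[1, 2, 3]],
    "pixel_values": [[[[0.0, 0.5], [1.0, 0.25]]]],
}
out = data_collator([sample])
print(out["input_ids"].shape)     # torch.Size([1, 3])
print(out["pixel_values"].dtype)  # torch.bfloat16
```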

examples/multimodal_vision/llava_example.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -30,7 +30,7 @@ def data_collator(batch):
     GPTQModifier(
         targets="Linear",
         scheme="W4A16",
-        ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
+        ignore=["re:.*lm_head", "re:.*vision_tower.*", "re:.*multi_modal_projector.*"],
     ),
 ]
```

examples/multimodal_vision/mistral3_example.py
Lines changed: 6 additions & 4 deletions

```diff
@@ -31,9 +31,11 @@
 def data_collator(batch):
     assert len(batch) == 1
     return {
-        key: torch.tensor(value)
-        if key != "pixel_values"
-        else torch.tensor(value, dtype=model.dtype)
+        key: (
+            torch.tensor(value)
+            if key != "pixel_values"
+            else torch.tensor(value, dtype=model.dtype)
+        )
         for key, value in batch[0].items()
     }
 
@@ -43,7 +45,7 @@ def data_collator(batch):
     GPTQModifier(
         targets="Linear",
         scheme="W4A16",
-        ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
+        ignore=["re:.*lm_head", "re:.*vision_tower.*", "re:.*multi_modal_projector.*"],
     ),
 ]
```

examples/multimodal_vision/mllama_example.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -30,7 +30,7 @@ def data_collator(batch):
     GPTQModifier(
         targets="Linear",
         scheme="W4A16",
-        ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_model.*"],
+        ignore=["re:.*lm_head", "re:.*multi_modal_projector.*", "re:.*vision_model.*"],
     ),
 ]
```

examples/multimodal_vision/pixtral_example.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -36,7 +36,7 @@ def data_collator(batch):
     GPTQModifier(
         targets="Linear",
         scheme="W4A16",
-        ignore=["re:.*lm_head", "re:vision_tower.*", "re:multi_modal_projector.*"],
+        ignore=["re:.*lm_head", "re:.*vision_tower.*", "re:.*multi_modal_projector.*"],
     ),
 ]
```

examples/quantization_w4a4_fp4/llama4_example.py
Lines changed: 7 additions & 5 deletions

```diff
@@ -52,9 +52,11 @@ def preprocess_function(example):
 def data_collator(batch):
     assert len(batch) == 1
     return {
-        key: torch.tensor(value)
-        if key != "pixel_values"
-        else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+        key: (
+            torch.tensor(value)
+            if key != "pixel_values"
+            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
+        )
         for key, value in batch[0].items()
     }
 
@@ -67,8 +69,8 @@ def data_collator(batch):
         "re:.*lm_head",
         "re:.*self_attn",
         "re:.*router",
-        "re:vision_model.*",
-        "re:multi_modal_projector.*",
+        "re:.*vision_model.*",
+        "re:.*multi_modal_projector.*",
         "Llama4TextAttention",
     ],
 )
```

examples/quantization_w8a8_fp8/llama3.2_vision_example.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -17,7 +17,7 @@
 recipe = QuantizationModifier(
     targets="Linear",
     scheme="FP8_DYNAMIC",
-    ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_model.*"],
+    ignore=["re:.*lm_head", "re:.*multi_modal_projector.*", "re:.*vision_model.*"],
 )
 
 # Apply quantization and save to disk in compressed-tensors format.
```

examples/quantization_w8a8_fp8/llama4_fp8_block_example.py
Lines changed: 2 additions & 2 deletions

```diff
@@ -22,8 +22,8 @@
         "re:.*lm_head",
         "re:.*self_attn",
         "re:.*router",
-        "re:vision_model.*",
-        "re:multi_modal_projector.*",
+        "re:.*vision_model.*",
+        "re:.*multi_modal_projector.*",
         "Llama4TextAttention",
     ],
 )
```

examples/quantization_w8a8_fp8/llava1.5_example.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -17,7 +17,7 @@
 recipe = QuantizationModifier(
     targets="Linear",
     scheme="FP8_DYNAMIC",
-    ignore=["re:.*lm_head", "re:multi_modal_projector.*", "re:vision_tower.*"],
+    ignore=["re:.*lm_head", "re:.*multi_modal_projector.*", "re:.*vision_tower.*"],
 )
 
 # Apply quantization and save to disk in compressed-tensors format.
```
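
As a complement to scanning the logs as described in the test plan, one could assert programmatically after `oneshot` that none of the ignored modules were quantized. A hedged sketch: it assumes quantized modules are marked with a `quantization_scheme` attribute (a compressed-tensors convention, not confirmed by this diff) and that `model` is the in-memory model from one of the example scripts:

```python
# Run after an example script has applied its recipe to `model`.
ignored_markers = ("vision_tower", "vision_model", "multi_modal_projector")

offenders = [
    name
    for name, module in model.named_modules()
    if any(marker in name for marker in ignored_markers)
    and hasattr(module, "quantization_scheme")  # assumed quantization marker
]
assert not offenders, f"unexpectedly quantized: {offenders}"
```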
