
Commit 9417d5c

[Tracing] Code AutoWrapper (#1411)
## Purpose ##
* Reduce model support burden by automatically wrapping untraceable code
* This is a programmatic implementation of all of the rules specified by the [tracing guide](https://github.com/vllm-project/llm-compressor/blob/0.5.1/src/llmcompressor/transformers/tracing/GUIDE.md)
* Remove traceable definitions for `Idefics3ForConditionalGeneration`, `LlavaForConditionalGeneration`, `MllamaForConditionalGeneration`, `Qwen2_5_VLForConditionalGeneration`, `Qwen2VLForConditionalGeneration`, and `Gemma3ForConditionalGeneration` (all of them)

## Fixes ##
* #1457

## Autowrap Patterns ##
These patterns match syntax which is untraceable and unlikely to call sequential targets (either directly or indirectly).

<details><summary>If statements whose conditions cannot be statically evaluated</summary>

This if statement can be statically evaluated, since its value can be evaluated in the context of `{"self": LlamaModel(...)}`

```python3
if self.config._attn_implementation != "eager":
    ...
```

If the statement cannot be statically evaluated, then it is wrapped

```python3
@torch.fx.wrap
def wrapped(input_ids, inputs_embeds):
    if (input_ids is None) ^ (inputs_embeds is not None):
        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

def forward(...):
    wrapped(input_ids, inputs_embeds)  # no names are returned by this wrapper
```
</details>

<details><summary>Ignored functions (`_update_causal_mask`)</summary>

Any function or method names listed in the ignore list will be automatically wrapped

```python3
@torch.fx.wrap
def wrapped(attention_mask, inputs_embeds, cache_position, ...):
    return self._update_causal_mask(attention_mask, inputs_embeds, cache_position, ...)

def forward(...):
    causal_mask = wrapped(attention_mask, inputs_embeds, cache_position, ...)
```
</details>

<details><summary>Starred tuple unpacking</summary>

Any use of iterated unpacking will be automatically wrapped

```python3
@torch.fx.wrap
def wrapped(input_shape):
    return (*input_shape, -1, self.head_dim)

def forward(...):
    hidden_shape = wrapped(input_shape)
```
</details>

<details><summary>Starred argument unpacking</summary>

Any use of iterated unpacking into variadic args is automatically wrapped

```python3
@torch.fx.wrap
def wrapped(attn_output, input_shape):
    return attn_output.reshape(*input_shape, -1)

def forward(...):
    attn_output = wrapped(attn_output, input_shape)
```
</details>

## Autowrap Implementation Details ##
<details><summary>Wrapper arguments</summary>

Autowrapping a piece of code requires determining which variable names are used by that code and which variable names are produced by that code. This is done using the `NameAnalyzer`, which determines the unbound, assigned, and conditionally assigned names for a given piece of code.

```python3
# unbound := names which are read by node before being assigned
# assigned := names which are assigned by operations in node
# cond_assigned := names which may be assigned depending on execution
analyzer = NameAnalyzer(omit=self.namespace.keys())
unbound, assigned, conditionally_assigned = analyzer.analyze(node)
```
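As a rough, self-contained illustration of this kind of analysis (a simplified sketch, not the actual `NameAnalyzer` implementation), unbound and assigned names can be approximated with Python's `ast` module:

```python3
# Hedged sketch: approximate the "unbound" and "assigned" name sets for a code
# snippet. The real NameAnalyzer also handles conditional assignment and an
# `omit` namespace, which this toy version ignores.
import ast


def analyze_names(source: str):
    loaded, assigned = set(), set()
    for node in ast.walk(ast.parse(source)):
        if isinstance(node, ast.Name):
            if isinstance(node.ctx, ast.Store):
                assigned.add(node.id)
            elif isinstance(node.ctx, ast.Load):
                loaded.add(node.id)
    # names which are read but never assigned anywhere in the snippet
    unbound = loaded - assigned
    return unbound, assigned


unbound, assigned = analyze_names("hidden_shape = (*input_shape, -1, head_dim)")
print(sorted(unbound))   # ['head_dim', 'input_shape']
print(sorted(assigned))  # ['hidden_shape']
```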
This information is then used to determine what the args, kwargs, and return names should be for the wrapping function.

```python3
# args := names which already existed and are needed for ops or wrapped return
# kwargs := names which are needed for return but did not already exist
# returns := names which are assigned or could be assigned
args = (unbound | conditionally_assigned) & self._local_names
kwargs = conditionally_assigned - self._local_names
returns = assigned | conditionally_assigned
```
</details>

<details><summary>Wrapping methods</summary>

Some untraceable code references `self` during execution. While `self` would normally be an argument to the wrapped function, `self` is usually a `torch.nn.Module`, which is not a handled type that can be passed around the graph. Instead, we treat `self` as a variable in the compiled python module namespace, and this namespace is automatically captured and executed by `torch.fx._symbolic_trace`.
</details>

<details><summary>Unwrappable code</summary>

Some code cannot be wrapped because it contains control flow statements which must exist in a certain context. For example, we cannot wrap code that contains a `continue` without also wrapping the for loop that surrounds it.

```python3
for index, layer in enumerate(self.layers):
    # ---- cannot autowrap ----
    if index <= 10:
        continue
    # ---- cannot autowrap ----

    hidden_states = layer(hidden_states)
```
</details>

## Future Extensions / Improvements ##
<details><summary>Sequentially executing vision towers</summary>

Sequentially tracing vision towers is a lower priority, as the vision towers typically have fewer parameters and aren't quantization targets. However, in the event that they do become quantization targets, or memory(vision_tower + one target) > memory(one gpu), then the vision tower layers will need to be split up. Some changes may be required to support this.

Conditionally executing the vision tower is a very common pattern:

```python3
def forward(pixel_values, image_embeds, ...):
    if image_embeds is None:
        image_embeds = self.vision_tower(pixel_values)
    ...
```

Some approaches might be
1. Allowing names like `image_embeds` to be evaluated based on the sample input being passed
2. Pattern matching against `self.{module_name}()`, where module_name is determined to be a module through evaluation (a rough sketch of this idea follows this block)
3. Using type hinting analysis tools like `jedi` to track the types of all names, and to check if any names whose type is a module are called
</details>
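As a rough illustration of approach 2 above (a simplified sketch, not code from this PR), a chunk of forward source can be checked for calls to the model's direct submodules via `self.<name>(...)`:

```python3
# Hedged sketch of approach 2: detect `self.<submodule>(...)` calls in source
# code by matching AST call nodes against the module's named children.
import ast

import torch


def called_submodules(source: str, module: torch.nn.Module) -> set:
    submodule_names = {name for name, _ in module.named_children()}
    called = set()
    for node in ast.walk(ast.parse(source)):
        if (
            isinstance(node, ast.Call)
            and isinstance(node.func, ast.Attribute)
            and isinstance(node.func.value, ast.Name)
            and node.func.value.id == "self"
            and node.func.attr in submodule_names
        ):
            called.add(node.func.attr)
    return called


class ToyVLM(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.vision_tower = torch.nn.Linear(4, 4)
        self.language_model = torch.nn.Linear(4, 4)


source = "image_embeds = self.vision_tower(pixel_values)"
print(called_submodules(source, ToyVLM()))  # {'vision_tower'}
```

Under this heuristic, a code chunk that calls no submodules (and hence no sequential targets) would be a candidate for autowrapping.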
<details><summary>Towards perfect autowrapping</summary>

As mentioned in "Sequentially executing vision towers", it may be possible to use type hinting analysis tools like `jedi` or `pytype` to infer whether any given code chunk calls a sequential target or target ancestor. If this can be done reliably (which will require extensions such as analysis of called functions), then all code that does not call sequential targets can be wrapped.
</details>

<details><summary>Towards removing tracing</summary>

If we can reliably determine whether a given code chunk calls any sequential targets, it may be possible to define each autowrapped function as its own subgraph directly, without the need for tracing. However, this will require inference of model execution from the static code, which means unrolling loops, expanding any function/method calls, and resolving dynamic model structure (in the case of llava and idefics), which may be tricky.

For example, if you want to determine the execution of the LlamaDecoder layers of a llava model like pixtral, you'd need to evaluate `self.language_model`, then analyze the source of the called module's forward function, which is stored in a separate file.

```python3
def forward(...):
    self.language_model(...)  # the type of self.language_model is determined from the config
```

Another point of evaluation would be evaluating any iteration over ModuleLists

```python3
def forward(...):
    for decoder_layer in self.layers:  # ModuleList isn't well typed and may contain different types
        decoder_layer(...)
```

The tracing system could be replaced with static code inference; both are different approaches to the same problem of determining model execution.
</details>

## Testing ##
* Able to trace all models in `tests/llmcompressor/transformers/tracing/test_models.py` without requiring traceable definitions
* Verified sequentially executed outputs are correct for `LlamaForCausalLM`
* Ran `examples/quantization_w4a16/llama3_example.py` to completion

---------

Signed-off-by: Kyle Sayers <[email protected]>
Signed-off-by: Domenic Barbuzzi <[email protected]>
Signed-off-by: Rahul Tuli <[email protected]>
Signed-off-by: Kelvin Cheng <[email protected]>
Signed-off-by: Brian Dellabetta <[email protected]>
Co-authored-by: Domenic Barbuzzi <[email protected]>
Co-authored-by: Rahul Tuli <[email protected]>
Co-authored-by: Kel <[email protected]>
Co-authored-by: Dipika Sikka <[email protected]>
Co-authored-by: Vedant <[email protected]>
Co-authored-by: Brian Dellabetta <[email protected]>
1 parent 9439f18 commit 9417d5c

35 files changed, +846 -8373 lines changed

examples/multimodal_audio/README.md

Lines changed: 0 additions & 6 deletions
```diff
@@ -47,12 +47,6 @@ Sequential targets are the modules which determine the granularity of error prop
 
 Choosing sequential targets with higher granularity (for example "Linear" instead of "LlamaDecoderLayer") will result in fewer hessians being allocated at the same time, decreasing the memory requirements for compression. This may also increase the recovered accuracy of the model, as compression error is propagated at a higher granularity. However, using higher granularity sequential targets may also increase compression time, as more time is spent offloading and onloading activations.
 
-### Ignore ###
-If your model is not traceable for your desired dataset, first consider adding any problematic modules to the ignore list. Doing this prevents the model tracer from tracing the internals of those modules, thereby avoid the untraceable operations.
-
-## Tracing Errors ##
-Because the architectures of audio-language models is often times more complex than those of typical decoder-only text models, you may encounter `torch.fx.TraceError`s when attempting to quantize your model. For more information on `torch.fx.TraceError`s, why they occur, and how to resolve them, please see the [Model Tracing Guide](/src/llmcompressor/transformers/tracing/GUIDE.md).
-
 ## Adding Your Own Smoothquant Mappings ##
 For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md).
 
```

examples/multimodal_vision/README.md

Lines changed: 0 additions & 6 deletions
```diff
@@ -51,12 +51,6 @@ Sequential targets are the modules which determine the granularity of error prop
 
 Choosing sequential targets with higher granularity (for example "Linear" instead of "LlamaDecoderLayer") will result in fewer hessians being allocated at the same time, decreasing the memory requirements for compression. This may also increase the recovered accuracy of the model, as compression error is propagated at a higher granularity. However, using higher granularity sequential targets may also increase compression time, as more time is spent offloading and onloading activations.
 
-### Ignore ###
-If your model is not traceable for your desired dataset, first consider adding any problematic modules to the ignore list. Doing this prevents the model tracer from tracing the internals of those modules, thereby avoid the untraceable operations.
-
-## Tracing Errors ##
-Because the architectures of vision-language models is often times more complex than those of typical decoder-only text models, you may encounter `torch.fx.TraceError`s when attempting to quantize your model. For more information on `torch.fx.TraceError`s, why they occur, and how to resolve them, please see the [Model Tracing Guide](/src/llmcompressor/transformers/tracing/GUIDE.md).
-
 ## Adding Your Own Smoothquant Mappings ##
 For a guide on adding smoothquant mappings for your dataset, see the [SmoothQuant Guide](/src/llmcompressor/modifiers/smoothquant/README.md).
 
```

examples/multimodal_vision/gemma3_example.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -1,15 +1,14 @@
 import requests
 import torch
 from PIL import Image
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Gemma3ForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.tracing import TraceableGemma3ForConditionalGeneration
 
 # Load model.
 model_id = "google/gemma-3-4b-it"
-model = TraceableGemma3ForConditionalGeneration.from_pretrained(
+model = Gemma3ForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype="auto"
 )
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
```

examples/multimodal_vision/idefics3_example.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -2,15 +2,14 @@
 import torch
 from datasets import load_dataset
 from PIL import Image
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Idefics3ForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.tracing import TraceableIdefics3ForConditionalGeneration
 
 # Load model.
 model_id = "HuggingFaceM4/Idefics3-8B-Llama3"  # or "HuggingFaceTB/SmolVLM-Instruct"
-model = TraceableIdefics3ForConditionalGeneration.from_pretrained(
+model = Idefics3ForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype="auto"
 )
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
```

examples/multimodal_vision/llava_example.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -1,15 +1,14 @@
 import requests
 import torch
 from PIL import Image
-from transformers import AutoProcessor
+from transformers import AutoProcessor, LlavaForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration
 
 # Load model.
 model_id = "llava-hf/llava-1.5-7b-hf"
-model = TraceableLlavaForConditionalGeneration.from_pretrained(
+model = LlavaForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype="auto"
 )
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
```

examples/multimodal_vision/mllama_example.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -1,15 +1,14 @@
 import requests
 import torch
 from PIL import Image
-from transformers import AutoProcessor
+from transformers import AutoProcessor, MllamaForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.tracing import TraceableMllamaForConditionalGeneration
 
 # Load model.
 model_id = "meta-llama/Llama-3.2-11B-Vision-Instruct"
-model = TraceableMllamaForConditionalGeneration.from_pretrained(
+model = MllamaForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype="auto"
 )
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
```

examples/multimodal_vision/pixtral_example.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -1,15 +1,14 @@
 import requests
 import torch
 from PIL import Image
-from transformers import AutoProcessor
+from transformers import AutoProcessor, LlavaForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.tracing import TraceableLlavaForConditionalGeneration
 
 # Load model.
 model_id = "mgoin/pixtral-12b"
-model = TraceableLlavaForConditionalGeneration.from_pretrained(
+model = LlavaForConditionalGeneration.from_pretrained(
     model_id, device_map="auto", torch_dtype="auto"
 )
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
```

examples/multimodal_vision/qwen2_vl_example.py

Lines changed: 2 additions & 3 deletions
```diff
@@ -4,15 +4,14 @@
 import torch
 from datasets import load_dataset
 from qwen_vl_utils import process_vision_info
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
 
 from llmcompressor import oneshot
 from llmcompressor.modifiers.quantization import GPTQModifier
-from llmcompressor.transformers.tracing import TraceableQwen2VLForConditionalGeneration
 
 # Load model.
 model_id = "Qwen/Qwen2-VL-2B-Instruct"
-model = TraceableQwen2VLForConditionalGeneration.from_pretrained(
+model = Qwen2VLForConditionalGeneration.from_pretrained(
     model_id,
     device_map="auto",
     torch_dtype="auto",
```

examples/multimodal_vision/qwen_2_5_vl_example.py

Lines changed: 2 additions & 5 deletions
```diff
@@ -4,17 +4,14 @@
 import torch
 from datasets import load_dataset
 from qwen_vl_utils import process_vision_info
-from transformers import AutoProcessor
+from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
 
 from llmcompressor.modifiers.quantization import GPTQModifier
 from llmcompressor.transformers import oneshot
-from llmcompressor.transformers.tracing import (
-    TraceableQwen2_5_VLForConditionalGeneration,
-)
 
 # Load model.
 model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
-model = TraceableQwen2_5_VLForConditionalGeneration.from_pretrained(
+model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
     model_id,
     device_map="auto",
     torch_dtype="auto",
```

src/llmcompressor/args/dataset_arguments.py

Lines changed: 7 additions & 0 deletions
```diff
@@ -179,3 +179,10 @@ class DatasetArguments(CustomDatasetArguments):
             "independent]"
         },
     )
+    tracing_ignore: List[str] = field(
+        default_factory=lambda: ["_update_causal_mask"],
+        metadata={
+            "help": "List of functions to ignore during tracing, either "
+            "{module}.{method_name} or {function_name}"
+        },
+    )
```
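For reference, a hedged usage sketch of the new argument (the field name and default come from the diff above; the import path is assumed from the file location, and the second list entry is purely hypothetical):

```python3
# Hedged sketch: extending the default tracing ignore list.
# Import path assumed to mirror src/llmcompressor/args/dataset_arguments.py;
# "MyVisionModel._merge_image_features" is a hypothetical {module}.{method_name} entry.
from llmcompressor.args.dataset_arguments import DatasetArguments

dataset_args = DatasetArguments(
    tracing_ignore=["_update_causal_mask", "MyVisionModel._merge_image_features"],
)
print(dataset_args.tracing_ignore)
```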
