vllm-project
diff --git a/‎examples/big_models_with_sequential_onloading/README.md
Lines changed: 2 additions & 2 deletions b/‎examples/big_models_with_sequential_onloading/README.md
Lines changed: 2 additions & 2 deletions
diff --git a/‎examples/big_models_with_sequential_onloading/llama3.3_70b.py
Lines changed: 5 additions & 1 deletion b/‎examples/big_models_with_sequential_onloading/llama3.3_70b.py
Lines changed: 5 additions & 1 deletion
diff --git a/‎examples/finetuning/example_alternating_recipe.yaml
Lines changed: 2 additions & 2 deletions b/‎examples/finetuning/example_alternating_recipe.yaml
Lines changed: 2 additions & 2 deletions
diff --git a/‎setup.py
Lines changed: 2 additions & 1 deletion b/‎setup.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎src/llmcompressor/args/dataset_arguments.py
Lines changed: 0 additions & 9 deletions b/‎src/llmcompressor/args/dataset_arguments.py
Lines changed: 0 additions & 9 deletions
diff --git a/‎src/llmcompressor/entrypoints/oneshot.py
Lines changed: 3 additions & 5 deletions b/‎src/llmcompressor/entrypoints/oneshot.py
Lines changed: 3 additions & 5 deletions
diff --git a/‎src/llmcompressor/entrypoints/utils.py
Lines changed: 1 addition & 1 deletion b/‎src/llmcompressor/entrypoints/utils.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/llmcompressor/modeling/fuse.py
Lines changed: 28 additions & 0 deletions b/‎src/llmcompressor/modeling/fuse.py
Lines changed: 28 additions & 0 deletions
diff --git a/‎src/llmcompressor/modifiers/modifier.py
Lines changed: 4 additions & 0 deletions b/‎src/llmcompressor/modifiers/modifier.py
Lines changed: 4 additions & 0 deletions
diff --git a/‎src/llmcompressor/modifiers/smoothquant/base.py
Lines changed: 5 additions & 0 deletions b/‎src/llmcompressor/modifiers/smoothquant/base.py
Lines changed: 5 additions & 0 deletions
@@ -18,7 +18,7 @@ The Llama 3.3 70b is larger than 80 GB, surpassing the size of 1 A100. However,
 
 ```python
 model_id = "meta-llama/Llama-3.3-70B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto", device_map=None)
 ```
 
 The model is first loaded onto the `cpu`, as indicated through the use of `None` for the `device_map` argument in the `from_pretrained` method when loading the model.
@@ -42,4 +42,4 @@ output = model.generate(**sample, max_new_tokens=100)
 print(tokenizer.decode(output[0]))
 ```
 
-Finally, we call `dispatch_for_generation` to evenly load the model across available devices (potentially offloading the model if required) and run sample generations on the newly quantized model.
+Finally, we call `dispatch_for_generation` to evenly load the model across available devices (potentially offloading the model if required) and run sample generations on the newly quantized model.
@@ -8,7 +8,11 @@
 
 # Select model and load it.
 model_id = "meta-llama/Llama-3.3-70B-Instruct"
-model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    torch_dtype="auto",
+    device_map=None,
+)
 tokenizer = AutoTokenizer.from_pretrained(model_id)
 
 # Select calibration dataset.
 
@@ -4,7 +4,7 @@ initial_sparsity_stage:
     SparseGPTModifier:
       sparsity: 0.5
       block_size: 128
-      percdamp: 0.01
+      dampening_frac: 0.01
       mask_structure: "0:0"
       targets: ["Linear"]
       ignore: ["re:.*lm_head"]
@@ -20,7 +20,7 @@ next_sparsity_stage:
     SparseGPTModifier:
       sparsity: 0.7
       block_size: 128
-      percdamp: 0.01
+      dampening_frac: 0.01
       mask_structure: "0:0"
       targets: ["Linear"]
       ignore: ["re:.*lm_head"]
 
@@ -119,7 +119,7 @@ def localversion_func(version: ScmVersion) -> str:
         "tqdm>=4.0.0",
         # torch 1.10 and 1.11 do not support quantized onnx export
         "torch>=1.7.0,!=1.10,!=1.11",
-        "transformers>4.0,<4.53.0",
+        "transformers>4.0",
         "datasets",
         "accelerate>=0.20.3,!=1.1.0",
         "pynvml",
@@ -146,6 +146,7 @@ def localversion_func(version: ScmVersion) -> str:
             "torchvision",
             "librosa",
             "soundfile",
+            "torchcodec",
             # linting, formatting, and type checking
             "black~=24.4.2",
             "isort~=5.13.2",
 
@@ -162,15 +162,6 @@ class DatasetArguments(CustomDatasetArguments):
             ),
         },
     )
-    trust_remote_code_data: bool = field(
-        default=False,
-        metadata={
-            "help": "Whether or not to allow for datasets defined on the Hub using "
-            "a dataset script. This option should only be set to True for "
-            "repositories you trust and in which you have read the code, as it "
-            "will execute code present on the Hub on your local machine."
-        },
-    )
     # --- pipeline arguments --- #
     pipeline: Optional[str] = field(
         default="independent",
 
@@ -228,7 +228,6 @@ def oneshot(
     overwrite_cache: bool = False,
     preprocessing_num_workers: Optional[int] = None,
     min_tokens_per_module: Optional[float] = None,
-    trust_remote_code_data: bool = False,
     # Miscellaneous arguments
     output_dir: Optional[str] = None,
     log_dir: Optional[str] = "sparse_logs",
@@ -290,8 +289,6 @@ def oneshot(
         preprocessing.
     :param min_tokens_per_module: Minimum percentage of tokens per
         module, relevant for MoE models.
-    :param trust_remote_code_data: Whether to allow for datasets defined on the Hub
-        using a dataset script.
 
     # Miscellaneous arguments
     :param output_dir: Path to save the output model after calibration.
@@ -303,8 +300,9 @@ def oneshot(
     """
 
     # pass all args directly into Oneshot
-    local_args = locals()
-    local_args.pop("kwargs")
+    local_args = {
+        k: v for k, v in locals().items() if k not in ("local_args", "kwargs")
+    }
     one_shot = Oneshot(**local_args, **kwargs)
     one_shot()
 
 
@@ -241,7 +241,7 @@ def initialize_processor_from_path(
         )
 
     except ValueError as exception:
-        if "trust_remote_code=True" in exception.value:
+        if any("trust_remote_code=True" in arg for arg in exception.args):
             raise ValueError(
                 f"The repository for {processor_src} contains custom code which must "
                 "be executed to correctly load the tokenizer/processor. You can "
 
@@ -0,0 +1,28 @@
+from typing import Iterable
+
+import torch
+from compressed_tensors import update_offload_parameter
+
+__all__ = ["fuse_norm_linears"]
+
+
+def fuse_norm_linears(norm: torch.nn.Module, linears: Iterable[torch.nn.Linear]):
+    """
+    Fuse a norm layer into subsequent linear layers. This is useful for ensuring
+    transform invariance between norm and linear layers.
+
+    Note that a model cannot be properly trained after its norms have been fused.
+
+    :param norm: norm layer whose weight will be fused into subsequent linears
+    :param linears: linear layers which directly follow the norm layer
+    """
+    if isinstance(norm, torch.nn.RMSNorm):
+        for linear in linears:
+            # NOTE: spinquant does this op in float64; no upcast is done here
+            new_weight = linear.weight * norm.weight
+            update_offload_parameter(linear, "weight", new_weight)
+
+        update_offload_parameter(norm, "weight", torch.ones_like(norm.weight))
+
+    else:
+        raise ValueError(f"Cannot fuse norm of type {type(norm)}")
@@ -1,6 +1,8 @@
 from abc import abstractmethod
 from typing import Optional
 
+from pydantic import ConfigDict
+
 from llmcompressor.core.events import Event, EventType
 from llmcompressor.core.state import State
 from llmcompressor.modifiers.interface import ModifierInterface
@@ -30,6 +32,8 @@ class Modifier(ModifierInterface, HooksMixin):
     :param update: The update step for the modifier
     """
 
+    model_config = ConfigDict(extra="forbid")
+
     index: Optional[int] = None
     group: Optional[str] = None
     start: Optional[float] = None
 
@@ -127,6 +127,11 @@ def on_initialize(self, state: State, **kwargs) -> bool:
                 f"Expected start to be None or -1, got {self.end}"
             )
 
+        if not hasattr(state, "data") or state.data.calib is None:
+            raise ValueError(
+                f"{self.__class__.__name__} requires a calibration dataset to be "
+                "provided"
+            )
         self.ignore = [] if not self.ignore else self.ignore
         self.mappings = self._infer_mappings_from_model(state.model)
         self.resolved_mappings_ = self._resolve_mappings(state.model)
Original file line number	Diff line number	Diff line change
`@@ -241,7 +241,7 @@ def initialize_processor_from_path(`
`241`	`241`	`)`
`242`	`242`
`243`	`243`	`except ValueError as exception:`
`244`		`- if "trust_remote_code=True" in exception.value:`
	`244`	`+ if any("trust_remote_code=True" in arg for arg in exception.args):`
`245`	`245`	`raise ValueError(`
`246`	`246`	`f"The repository for {processor_src} contains custom code which must "`
`247`	`247`	`"be executed to correctly load the tokenizer/processor. You can "`