R1 working

kylesayrs · kylesayrs · commit f5c2150eefb3 · 2025-07-11T11:07:36.000-04:00
Signed-off-by: Kyle Sayers <kylesayrs@gmail.com>
diff --git a/src/llmcompressor/modeling/__init__.py b/src/llmcompressor/modeling/__init__.py
@@ -1,3 +1,4 @@
 # flake8: noqa
 
 from .prepare import *
+from .fuse import *
diff --git a/src/llmcompressor/modeling/fuse.py b/src/llmcompressor/modeling/fuse.py
@@ -1,7 +1,9 @@
 from typing import Iterable
 
 import torch
-from compressed_tensors import update_offload_parameter
+from compressed_tensors import get_execution_device, align_module_device, update_offload_parameter
+
+from transformers.models.llama.modeling_llama import LlamaRMSNorm
 
 __all__ = ["fuse_norm_linears"]
 
@@ -16,10 +18,13 @@ def fuse_norm_linears(norm: torch.nn.Module, linears: Iterable[torch.nn.Linear])
     :param norm: norm layer whose weight will be fused into subsequent linears
     :param linears: linear layers which directly follow the norm layer
     """
-    if isinstance(norm, torch.nn.RMSNorm):
+    if isinstance(norm, (torch.nn.RMSNorm, LlamaRMSNorm)):
         for linear in linears:
-            # spinquant does this op in float64
-            new_weight = linear.weight * norm.weight
+            # NOTE: spinquant does this op in float64
+            exec_device = get_execution_device(norm)
+            with align_module_device(norm, exec_device), align_module_device(linear, exec_device):
+                new_weight = linear.weight * norm.weight
+            
             update_offload_parameter(linear, "weight", new_weight)
 
         update_offload_parameter(norm, "weight", torch.ones_like(norm.weight))
diff --git a/src/llmcompressor/modifiers/transform/transform.py b/src/llmcompressor/modifiers/transform/transform.py
@@ -4,6 +4,7 @@
 from pydantic import ValidationError, model_validator
 
 from llmcompressor.core import Event, EventType, State
+from llmcompressor.modeling import fuse_norm_linears
 from llmcompressor.modifiers import Modifier
 from llmcompressor.modifiers.transform.presets import TRANSFORM_PRESETS
 
@@ -29,13 +30,19 @@ def validate_model_after(model: "TransformModifier") -> "TransformModifier":
         return model
 
     def on_initialize(self, state: State, **kwargs) -> bool:
-        apply_transform_config(state.model, self.config)
-
         return True
 
     def on_start(self, state: State, event: Event, **kwargs):
         self.started_ = True
 
+        for layer in state.model.model.layers:
+            fuse_norm_linears(layer.input_layernorm, (layer.self_attn.q_proj, layer.self_attn.k_proj, layer.self_attn.v_proj))
+            fuse_norm_linears(layer.post_attention_layernorm, (layer.mlp.gate_proj, layer.mlp.up_proj))
+
+        # needs to happen after the model has been hooked to execute on the GPU
+        # otherwise we're applying weight transforms on CPU
+        apply_transform_config(state.model, self.config)
+
     def on_event(self, state: State, event: Event, **kwargs):
         if event.type_ == EventType.CALIBRATION_EPOCH_START:
             if not self.started_:
diff --git a/src/llmcompressor/pipelines/data_free/pipeline.py b/src/llmcompressor/pipelines/data_free/pipeline.py
@@ -5,6 +5,7 @@
 
 from llmcompressor.core.session_functions import LifecycleCallbacks
 from llmcompressor.pipelines.registry import CalibrationPipeline
+from llmcompressor.utils.dev import dispatch_for_generation
 
 if TYPE_CHECKING:
     from llmcompressor.args.dataset_arguments import DatasetArguments
@@ -27,5 +28,9 @@ def __call__(
         :param dataloader: loads data for calibration
         :param dataset_args: dataset arguments relevant to pipelines
         """
+        # some ops are still performed on the model by modifiers
+        # we want those ops to occur on the GPU
+        dispatch_for_generation(model)
+
         LifecycleCallbacks.calibration_epoch_start()
         LifecycleCallbacks.calibration_epoch_end()

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
`1`	`1`	`# flake8: noqa`
`2`	`2`
`3`	`3`	`from .prepare import *`
	`4`	`+from .fuse import *`