Feat (brevitas_examples/llm): better RMSNorm replacement (#1436)

Giuseppe5 · web-flow · commit 26798a4ba604 · 2026-01-12T13:23:23.000+01:00
diff --git a/src/brevitas/graph/base.py b/src/brevitas/graph/base.py
@@ -435,18 +435,22 @@ class ModuleToModuleByClass(ModuleToModule):
     def __init__(self, old_module_class, new_module_class, **kwargs):
         super().__init__(new_module_class, **kwargs)
         self.old_module_class = old_module_class
+        # Pairs replaced by the most recent apply() call, kept so callers can read them afterwards
+        self.old_new_module_dict = {}
 
     def apply(self, model: GraphModule) -> GraphModule:
-        old_new_module_dict = {}
+        # Start from a fresh mapping so that pairs recorded by a previous apply() call are
+        # neither replaced a second time nor kept alive by this rewriter
+        self.old_new_module_dict = {}
         for old_module in model.modules():
             # check for equality, not inheritance
             if type(old_module) == self.old_module_class:
                 # init the new module based on the old one
                 new_module = self.init_new_module(old_module)
                 # register modules pair to be replaced
-                old_new_module_dict[old_module] = new_module
+                self.old_new_module_dict[old_module] = new_module
         # replace all pairs registered
-        for old_module, new_module in old_new_module_dict.items():
+        for old_module, new_module in self.old_new_module_dict.items():
             replace_module(model, old_module, new_module)
         return model
 
diff --git a/src/brevitas/graph/equalize.py b/src/brevitas/graph/equalize.py
@@ -293,7 +293,7 @@ class WalkRegionState:
 
     supported_srcs: set = _supported_layers
     supported_sinks: set = _supported_layers
-    scale_invariant_function: set = _scale_invariant_op
+    scale_invariant_functions: set = _scale_invariant_op
     scale_invariant_layers: set = _scale_invariant_layers
     residual_fns: set = _residual_fns
     residual_methods: set = _residual_methods
@@ -1027,7 +1027,7 @@ def find_srcs_channel_dim(state, model, inp_node):
         return total_channels
     elif _is_scale_invariant_module(model, inp_node,
                                     state.scale_invariant_layers) or _is_scale_invariant_function(
-                                        inp_node, state.scale_invariant_function):
+                                        inp_node, state.scale_invariant_functions):
         return find_srcs_channel_dim(state, model, inp_node.all_input_nodes[0])
     else:
         return _UNSUPPORTED_OP
@@ -1078,7 +1078,7 @@ def find_srcs(graph_model: GraphModule, starting_node: Node,
                 0]
         elif _is_scale_invariant_module(
                 graph_model, node, state.scale_invariant_layers) or _is_scale_invariant_function(
-                    node, state.scale_invariant_function):
+                    node, state.scale_invariant_functions):
             find_sinks(graph_model, node, state)
             find_srcs(graph_model, node, state)
         elif _is_add(node, state.residual_fns, state.residual_methods):
@@ -1126,7 +1126,7 @@ def find_sinks(graph_model: GraphModule, starting_node: Node,
 
         elif _is_scale_invariant_module(
                 graph_model, node, state.scale_invariant_layers) or _is_scale_invariant_function(
-                    node, state.scale_invariant_function):
+                    node, state.scale_invariant_functions):
             find_sinks(graph_model, node, state)
         elif _is_add(node, state.residual_fns, state.residual_methods):
             state.update_offset = False
@@ -1785,10 +1785,43 @@ def _merge_ln(layer_norm, next_module, scale_bias_by_weight):
         _replace_bias(next_module, new_bias)
 
 
+class RegionWalkMixin:
+    # Holds the six region-walk settings and exposes them as one dict via full_state_kwargs.
+    def __init__(
+            self,
+            supported_srcs: Tuple[Type[nn.Module]] = _supported_layers,
+            supported_sinks: Tuple[Type[nn.Module]] = _supported_layers,
+            scale_invariant_layers: Tuple[Type[nn.Module]] = _scale_invariant_layers,
+            scale_invariant_functions: Tuple[Callable] = _scale_invariant_op,
+            residual_fns: Tuple[Callable] = _residual_fns,
+            residual_methods: Tuple[str] = _residual_methods,
+            extra_state_kwargs: Optional[Dict[str, Tuple[Type[nn.Module]]]] = None):
+        self.supported_srcs = supported_srcs
+        self.supported_sinks = supported_sinks
+        self.scale_invariant_layers = scale_invariant_layers
+        self.scale_invariant_functions = scale_invariant_functions
+        self.residual_fns = residual_fns
+        self.residual_methods = residual_methods
+        # Extra values are concatenated in front of the setting of the same name assigned above.
+        if extra_state_kwargs is not None:
+            for attr_name, value in extra_state_kwargs.items():
+                combined_value = value + getattr(self, attr_name)
+                setattr(self, attr_name, combined_value)
+
+    @property
+    def full_state_kwargs(self) -> Dict[str, Tuple[Type[nn.Module]]]:
+        return {
+            'supported_srcs': self.supported_srcs,
+            'supported_sinks': self.supported_sinks,
+            'scale_invariant_layers': self.scale_invariant_layers,
+            'scale_invariant_functions': self.scale_invariant_functions,
+            'residual_fns': self.residual_fns,
+            'residual_methods': self.residual_methods}
+
+
 class RotationEqualization(GraphTransform):
 
     def __init__(self, blacklist_layers, layers_to_expand) -> None:
-        super(RotationEqualization, self).__init__()
         if blacklist_layers is not None:
             self.blacklist_layers = blacklist_layers
         else:
@@ -1797,19 +1830,19 @@ def __init__(self, blacklist_layers, layers_to_expand) -> None:
             self.layers_to_expand = layers_to_expand
         else:
             self.layers_to_expand = []
-        self.supported_sinks = ()
 
     def find_module(
             self,
             model: nn.Module,
             regions: List[Region],
+            supported_sinks: Tuple[nn.Module],
             prefix: str = '',
             blacklist_layers: Optional[List[str]] = None):
         """
         Iterate through the model looking at immediate children of every module to look for supported modules.
         This allows us to stop the search when we meet a top-level module that is supported.
         """
-        if isinstance(model, self.supported_sinks):
+        if isinstance(model, supported_sinks):
             if prefix in blacklist_layers:
                 return
             weight = get_weight_sink(model)
@@ -1820,7 +1853,7 @@ def find_module(
         else:
             for name, module in model.named_children():
                 full_name = prefix + '.' + name if prefix != '' else name
-                self.find_module(module, regions, full_name, blacklist_layers)
+                self.find_module(module, regions, supported_sinks, full_name, blacklist_layers)
 
     def find_module_by_name(self, model: nn.Module, regions: List[Region], prefix: str = ''):
         """
@@ -1852,7 +1885,7 @@ def transform_model(
             return apply_rewriters(model, rewriters)
 
 
-class GraphRotationEqualization(RotationEqualization):
+class GraphRotationEqualization(RotationEqualization, RegionWalkMixin):
 
     def __init__(
             self,
@@ -1866,16 +1899,20 @@ def __init__(
             layers_to_expand: Optional[List[str]] = None,
             expansion_step: int = None,
             delay_rewriters: bool = False,
-            return_rewriters: bool = False) -> None:
-        super(GraphRotationEqualization, self).__init__(blacklist_layers, layers_to_expand)
+            return_rewriters: bool = False,
+            extra_state_kwargs: Optional[Dict[str, Tuple]] = None) -> None:
+        RotationEqualization.__init__(self, blacklist_layers, layers_to_expand)
 
-        self.supported_srcs = (nn.Linear, nn.Embedding)
-        self.supported_sinks = (nn.Linear)
         common_scale_invariant = list(_scale_invariant_layers)
         common_scale_invariant.remove(torch.nn.ReLU)
         common_scale_invariant.remove(torch.nn.LeakyReLU)
-        self.scale_invariant_layers = tuple(common_scale_invariant) + (RMSNorm,)
-        self.scale_invariant_function = ()
+        base_state_kwargs = {
+            'supported_srcs': (nn.Linear, nn.Embedding),
+            'supported_sinks': (nn.Linear,),
+            'scale_invariant_layers': tuple(common_scale_invariant) + (RMSNorm,),
+            'scale_invariant_functions': ()}
+        RegionWalkMixin.__init__(self, **base_state_kwargs, extra_state_kwargs=extra_state_kwargs)
+
         self.orphan_sink = orphan_sink
         self.rotate_matmul = rotate_matmul
         self.full_rotation_method = full_rotation_method
@@ -1992,13 +2029,7 @@ def find_sink(node):
     def apply(self,
               graph_model: GraphModule) -> Union[Tuple[GraphModule, List[Transform]], GraphModule]:
         rewriters = []
-        regions = _extract_regions(
-            graph_model,
-            state_impl_kwargs={
-                'supported_srcs': self.supported_srcs,
-                'supported_sinks': self.supported_sinks,
-                'scale_invariant_layers': self.scale_invariant_layers,
-                'scale_invariant_function': self.scale_invariant_function})
+        regions = _extract_regions(graph_model, state_impl_kwargs=self.full_state_kwargs)
 
         expanded_regions = []
         self.find_module_by_name(graph_model, expanded_regions)
@@ -2007,7 +2038,11 @@ def apply(self,
 
         if self.orphan_sink:
             blacklist_orphan_layers = self.blacklist_layers + self.layers_to_expand
-            self.find_module(graph_model, orphan_regions, blacklist_layers=blacklist_orphan_layers)
+            self.find_module(
+                graph_model,
+                orphan_regions,
+                self.full_state_kwargs['supported_sinks'],
+                blacklist_layers=blacklist_orphan_layers)
 
         if len(expanded_regions) > 0:
             parameter_number_pre = 0
@@ -2095,20 +2130,23 @@ def apply_rewriters(
     return model
 
 
-class LayerNormToRMS(GraphTransform):
+class LayerNormToRMS(GraphTransform, RegionWalkMixin):
+
+    def __init__(
+            self,
+            return_rewriters: bool = False,
+            extra_state_kwargs: Optional[Dict[str, Tuple]] = None) -> None:
+        GraphTransform.__init__(self)
+        # Walk regions that start at Linear/Embedding sources and end in LayerNorm sinks.
+        base_state_kwargs = {
+            'supported_srcs': (nn.Linear, nn.Embedding), 'supported_sinks': (nn.LayerNorm,)}
+        RegionWalkMixin.__init__(self, **base_state_kwargs, extra_state_kwargs=extra_state_kwargs)
 
-    def __init__(self, return_rewriters=False) -> None:
-        super(LayerNormToRMS, self).__init__()
-        self.supported_srcs = (nn.Linear, nn.Embedding)
-        self.supported_sinks = (nn.LayerNorm)
         self.return_rewriters = return_rewriters
         assert RMSNorm is not object, 'Update your Pytorch version to 2.4+'
 
     def apply(self, graph_model: GraphModule) -> GraphModule:
-        regions = _extract_regions(
-            graph_model,
-            state_impl_kwargs={
-                'supported_srcs': self.supported_srcs, 'supported_sinks': self.supported_sinks})
+        regions = _extract_regions(graph_model, state_impl_kwargs=self.full_state_kwargs)
 
         rewriters = []
         if len(regions) > 0:
@@ -2141,18 +2179,17 @@ def apply(self, graph_model: GraphModule) -> GraphModule:
             return graph_model
 
 
-class MergeLnAffine(GraphTransform):
+class MergeLnAffine(GraphTransform, RegionWalkMixin):
 
-    def __init__(self) -> None:
-        super(MergeLnAffine, self).__init__()
+    def __init__(self, extra_state_kwargs: Optional[Dict[str, Tuple]] = None) -> None:
+        GraphTransform.__init__(self)
         self.supported_srcs = (RMSNorm, nn.LayerNorm)
-        self.supported_sinks = (nn.Linear)
+        base_state_kwargs = {  # NOTE(review): supported_srcs is also assigned directly above
+            'supported_srcs': (RMSNorm, nn.LayerNorm), 'supported_sinks': (nn.Linear,)}
+        RegionWalkMixin.__init__(self, **base_state_kwargs, extra_state_kwargs=extra_state_kwargs)
 
     def apply(self, graph_model: GraphModule) -> GraphModule:
-        regions = _extract_regions(
-            graph_model,
-            state_impl_kwargs={
-                'supported_srcs': self.supported_srcs, 'supported_sinks': self.supported_sinks})
+        regions = _extract_regions(graph_model, state_impl_kwargs=self.full_state_kwargs)
 
         if len(regions) > 0:
             scaled_biases = set()
@@ -2180,18 +2217,21 @@ def __init__(
             blacklist_layer: Optional[List] = None,
             layers_to_expand: Optional[List] = None,
             expansion_step: int = 0,
-            block_rotation_dim: Optional[int] = None):
-        super().__init__(blacklist_layer, layers_to_expand)
+            block_rotation_dim: Optional[int] = None,
+            extra_state_kwargs: Optional[Dict[str, Tuple]] = None):
+
+        RotationEqualization.__init__(self, blacklist_layer, layers_to_expand)
         self.expansion_step = expansion_step
-        self.supported_sinks = (nn.Linear)
         self.block_rotation_dim = block_rotation_dim
+        self.supported_sinks = (nn.Linear,)
 
     def apply(self, model: nn.Module) -> nn.Module:
         regions: List[Region] = []
         rewriters: List[Transform] = []
 
         blacklist_orphan_layers = self.blacklist_layers + self.layers_to_expand
-        self.find_module(model, regions, blacklist_layers=blacklist_orphan_layers)
+        self.find_module(
+            model, regions, self.supported_sinks, blacklist_layers=blacklist_orphan_layers)
         expanded_regions = []
         self.find_module_by_name(model, expanded_regions)
 
diff --git a/src/brevitas_examples/llm/llm_quant/ln_affine_merge.py b/src/brevitas_examples/llm/llm_quant/ln_affine_merge.py
@@ -3,36 +3,63 @@
 SPDX-License-Identifier: MIT
 """
 
+from inspect import signature
+
 from packaging import version
 import torch
 from torch import nn
 
 from brevitas import torch_version
-from brevitas.graph.base import ModuleToModuleByClass
+from brevitas.graph import ModuleInstanceToModuleInstance
+from brevitas.graph import ModuleToModuleByClass
 from brevitas.graph.equalize import _is_scale_invariant_module
 from brevitas.graph.equalize import LayerNormToRMS
 from brevitas.graph.equalize import MergeLnAffine
 from brevitas.graph.utils import get_module
 
 
-def replace_rmsnorm_with_torch(model, config):
-    assert torch_version >= version.parse('2.4'), "torch.nn.RMSNorm requires torch 2.4 or greater"
-    set_of_layers = set(type(x) for x in model.modules() if 'RMS' in type(x).__name__)
-    dtype = next(model.parameters()).dtype
-    device = next(model.parameters()).device
-    rewriters = [
-        ModuleToModuleByClass(
-            rms_cls,
-            torch.nn.RMSNorm,
-            normalized_shape=lambda module: module.weight.shape[0],
-            eps=config.rms_norm_eps,
-            dtype=dtype,
-            device=device) for rms_cls in set_of_layers]
-    dtype = next(iter(model.parameters())).dtype
-    for r in rewriters:
-        model = r.apply(model)
-    model = model.to(dtype)
-    return model
+class rmsnorm_patch:
+    """Temporarily replace modules whose class name contains 'RMS' with torch.nn.RMSNorm."""
+    def __init__(self, model, config, enabled=True):
+        self.model = model
+        self.config = config
+        if enabled:
+            self.rmsnorm_classes = tuple(
+                set(type(x) for x in model.modules() if 'RMS' in type(x).__name__))
+        else:
+            self.rmsnorm_classes = tuple()
+        self.mapping = dict()  # original module -> torch.nn.RMSNorm stand-in, filled on enter
+
+    def __enter__(self):
+        # A disabled patch swaps nothing, so it must not require torch.nn.RMSNorm to exist
+        if self.rmsnorm_classes:
+            assert torch_version >= version.parse(
+                '2.4'), "torch.nn.RMSNorm requires torch 2.4 or greater"
+        dtype = next(self.model.parameters()).dtype
+        device = next(self.model.parameters()).device
+        rewriters = [
+            ModuleToModuleByClass(
+                rms_cls,
+                torch.nn.RMSNorm,
+                normalized_shape=lambda module: module.weight.shape[0],
+                eps=self.config.rms_norm_eps,
+                dtype=dtype,
+                device=device) for rms_cls in self.rmsnorm_classes]
+        for r in rewriters:
+            self.model = r.apply(self.model)
+            self.mapping.update(r.old_new_module_dict)
+
+        self.model = self.model.to(dtype)
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        dtype = next(self.model.parameters()).dtype
+        # The model now holds the stand-ins, so those are the instances to look up and swap out.
+        for original_module, torch_rmsnorm in self.mapping.items():
+            rewriter = ModuleInstanceToModuleInstance(torch_rmsnorm, original_module)
+            self.model = rewriter.apply(self.model)
+        # NOTE(review): parameters changed on the stand-ins are not copied back; confirm intended
+        self.model = self.model.to(dtype)
 
 
 def replace_bias(next_module, new_bias):
@@ -106,8 +133,13 @@ def merge_layernorm_affine_params(graph_model):
 
 
 @torch.no_grad()
-def apply_layernorm_affine_merge(graph_model):
-    eq = MergeLnAffine()
+def apply_layernorm_affine_merge(graph_model, rmsnorm_classes=()):
+    """Run MergeLnAffine on graph_model and return the transformed model.
+
+    rmsnorm_classes is a tuple of extra module classes to register as scale-invariant layers;
+    the empty default keeps the previous single-argument call working.
+    """
+    eq = MergeLnAffine(extra_state_kwargs={'scale_invariant_layers': rmsnorm_classes})
     graph_model = eq.apply(graph_model)
     return graph_model
 
diff --git a/src/brevitas_examples/llm/main.py b/src/brevitas_examples/llm/main.py