Commit 0ea822f

Bug fixes in patching module (#834)
## Summary

1. Fix `_patch_layer_norm_module` by replacing `LigerRMSNorm` with `LigerLayerNorm`.
2. Rename the instance rather than the class by replacing patches such as `module.__class__.__name__ = LigerLayerNorm.__name__` with `_bind_method_to_module(module, "_get_name", lambda self: LigerLayerNorm.__name__)`.

## Testing Done

```
from transformers import AutoModelForCausalLM
from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen2
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct").to(device)
apply_liger_kernel_to_qwen2(model=model)
print(model)
```

prints:

```
Applied Liger kernels to Qwen2
Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=896, out_features=896, bias=True)
          (k_proj): Linear(in_features=896, out_features=128, bias=True)
          (v_proj): Linear(in_features=896, out_features=128, bias=True)
          (o_proj): Linear(in_features=896, out_features=896, bias=False)
        )
        (mlp): LigerSwiGLUMLP(
          (gate_proj): Linear(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LigerRMSNorm((896,), eps=1e-06, offset=0.0, in_place=True, row_mode=None)
        (post_attention_layernorm): LigerRMSNorm((896,), eps=1e-06, offset=0.0, in_place=True, row_mode=None)
      )
    )
    (norm): LigerRMSNorm((896,), eps=1e-06, offset=0.0, in_place=True, row_mode=None)
    (rotary_emb): Qwen2RotaryEmbedding()
  )
  (lm_head): Linear(in_features=896, out_features=151936, bias=False)
)
```

- Hardware Type: <BLANK>
- [ ] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence
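For context on item 2 of the Summary: `nn.Module.__repr__` labels each module via `self._get_name()`, which defaults to `type(self).__name__`, so mutating `module.__class__.__name__` renames the class itself and with it every instance, patched or not, whereas binding `_get_name` on the instance relabels only that module. Below is a minimal sketch of the difference, using a local stand-in for the repo's `_bind_method_to_module` helper; the names and printed reprs are illustrative, not the exact code in `monkey_patch.py`.

```
import types

import torch.nn as nn


def _bind_method_to_module(module, method_name, new_method):
    # Attach `new_method` as a bound method on this instance only;
    # other instances of the same class keep their original behavior.
    setattr(module, method_name, types.MethodType(new_method, module))


norm_a = nn.LayerNorm(8)
norm_b = nn.LayerNorm(8)

# Old (buggy) approach: renaming the class would relabel *every* LayerNorm,
# including modules that were never patched:
#   norm_a.__class__.__name__ = "LigerLayerNorm"

# Fixed approach: override _get_name (used by nn.Module.__repr__) per instance.
_bind_method_to_module(norm_a, "_get_name", lambda self: "LigerLayerNorm")

print(norm_a)  # LigerLayerNorm((8,), eps=1e-05, ...)
print(norm_b)  # LayerNorm((8,), eps=1e-05, ...)  -- unaffected
```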
1 parent d2431a9 commit 0ea822f

File tree

1 file changed: +12 −12 lines


src/liger_kernel/transformers/monkey_patch.py

Lines changed: 12 additions & 12 deletions
```
@@ -78,8 +78,8 @@ def _patch_rms_norm_module(module, offset=0.0, eps=1e-6, casting_mode="llama", i
         _bind_method_to_module(module.modules_to_save.default, "extra_repr", LigerRMSNorm.extra_repr)
         _bind_method_to_module(module.original_module, "forward", LigerRMSNorm.forward)
         _bind_method_to_module(module.original_module, "extra_repr", LigerRMSNorm.extra_repr)
-        module.modules_to_save.default.__class__.__name__ = LigerRMSNorm.__name__
-        module.original_module.__class__.__name__ = LigerRMSNorm.__name__
+        _bind_method_to_module(module.modules_to_save.default, "_get_name", lambda self: LigerRMSNorm.__name__)
+        _bind_method_to_module(module.original_module, "_get_name", lambda self: LigerRMSNorm.__name__)
     else:
         module.offset = offset
         module.casting_mode = casting_mode
@@ -88,7 +88,7 @@ def _patch_rms_norm_module(module, offset=0.0, eps=1e-6, casting_mode="llama", i
         module.row_mode = row_mode
         _bind_method_to_module(module, "forward", LigerRMSNorm.forward)
         _bind_method_to_module(module, "extra_repr", LigerRMSNorm.extra_repr)
-        module.__class__.__name__ = LigerRMSNorm.__name__
+        _bind_method_to_module(module, "_get_name", lambda self: LigerRMSNorm.__name__)


 def _patch_layer_norm_module(module, eps=1e-6):
@@ -110,28 +110,28 @@ def _patch_layer_norm_module(module, eps=1e-6):
         module.original_module.hidden_size = getattr(module, "hidden_size", None) or getattr(
             module, "normalized_shape", None
         )
-        _bind_method_to_module(module.modules_to_save.default, "forward", LigerRMSNorm.forward)
-        _bind_method_to_module(module.modules_to_save.default, "extra_repr", LigerRMSNorm.extra_repr)
-        _bind_method_to_module(module.original_module, "forward", LigerRMSNorm.forward)
-        _bind_method_to_module(module.original_module, "extra_repr", LigerRMSNorm.extra_repr)
-        module.modules_to_save.default.__class__.__name__ = LigerLayerNorm.__name__
-        module.original_module.__class__.__name__ = LigerLayerNorm.__name__
+        _bind_method_to_module(module.modules_to_save.default, "forward", LigerLayerNorm.forward)
+        _bind_method_to_module(module.modules_to_save.default, "extra_repr", LigerLayerNorm.extra_repr)
+        _bind_method_to_module(module.original_module, "forward", LigerLayerNorm.forward)
+        _bind_method_to_module(module.original_module, "extra_repr", LigerLayerNorm.extra_repr)
+        _bind_method_to_module(module.modules_to_save.default, "_get_name", lambda self: LigerLayerNorm.__name__)
+        _bind_method_to_module(module.original_module, "_get_name", lambda self: LigerLayerNorm.__name__)
     else:
         module.variance_epsilon = getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
         module.hidden_size = getattr(module, "hidden_size", None) or getattr(module, "normalized_shape", None)
         _bind_method_to_module(module, "forward", LigerLayerNorm.forward)
         _bind_method_to_module(module, "extra_repr", LigerLayerNorm.extra_repr)
-        module.__class__.__name__ = LigerLayerNorm.__name__
+        _bind_method_to_module(module, "_get_name", lambda self: LigerLayerNorm.__name__)


 def _patch_swiglu_module(module, liger_module):
     _bind_method_to_module(module, "forward", liger_module.forward)
-    module.__class__.__name__ = liger_module.__name__
+    _bind_method_to_module(module, "_get_name", lambda self: liger_module.__name__)


 def _patch_geglu_module(module):
     _bind_method_to_module(module, "forward", LigerGEGLUMLP.forward)
-    module.__class__.__name__ = LigerGEGLUMLP.__name__
+    _bind_method_to_module(module, "_get_name", lambda self: LigerGEGLUMLP.__name__)


 def apply_liger_kernel_to_granite(
```
