
Commit a80f30b

Fix CE patch and add layernorm support for InternVL (#921)
## Summary

Fix the CE patch and add LayerNorm support for InternVL. Related issue: #920

## Testing Done

- Hardware Type: A100
- [ ] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence

```
(9 durations < 0.005s hidden. Use -vv to show these durations.)
================================================================================================================= short test summary info =================================================================================================================
FAILED test/convergence/fp32/test_mini_models_multimodal.py::test_mini_model_multimodal[mini_llava-32-0.0001-dtype1-1e-08-1e-05-0.005-1e-05-0.005-1e-05] - TypeError: 'NoneType' object is not subscriptable
================================================================================================== 1 failed, 7 passed, 11 warnings in 147.93s (0:02:27) ===================================================================================================
sys:1: DeprecationWarning: builtin type swigvarlink has no __module__ attribute
make: *** [Makefile:32: test-convergence] Error 1
```

The llava convergence test crashes; I can't tell why (it may be my transformers version).

---------

Co-authored-by: Steven Shimizu <[email protected]>
1 parent 606ca4e commit a80f30b
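
The cross-entropy fix hinges on where recent transformers releases compute the loss: they route it through `transformers.loss.loss_utils`, which calls `nn.functional.cross_entropy`, so patching the `modeling_internvl.nn.CrossEntropyLoss` attribute (the old approach) does not take effect there. A minimal sketch of the patching idea follows; the `liger_cross_entropy` import path is an assumption based on the existing imports in `monkey_patch.py`, not shown in this diff.

```python
# Sketch only: recent transformers compute loss via transformers.loss.loss_utils,
# which calls nn.functional.cross_entropy, so the patch swaps that function instead
# of the nn.CrossEntropyLoss class.
from transformers.loss import loss_utils

# Assumed import path; monkey_patch.py imports liger_cross_entropy elsewhere.
from liger_kernel.transformers.functional import liger_cross_entropy

loss_utils.nn.functional.cross_entropy = liger_cross_entropy
```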

File tree: 3 files changed (+77, -34 lines)


src/liger_kernel/transformers/monkey_patch.py

Lines changed: 52 additions & 20 deletions
```diff
@@ -2038,6 +2038,7 @@ def apply_liger_kernel_to_internvl(
     cross_entropy: bool = False,
     fused_linear_cross_entropy: bool = True,
     rms_norm: bool = True,
+    layer_norm: bool = True,
     model: Optional[PreTrainedModel] = None,
     **kwargs,
 ) -> None:
@@ -2048,37 +2049,60 @@ def apply_liger_kernel_to_internvl(
     NOTE: InternVL is not available in transformers<4.52.1
 
     Args:
-        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
         cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
         fused_linear_cross_entropy (bool):
             Whether to apply Liger's fused linear cross entropy loss. Default is True.
             `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
             If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
         rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
-        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
+        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is True.
         model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
             loaded. Default is None.
     """
     assert not (cross_entropy and fused_linear_cross_entropy), (
         "cross_entropy and fused_linear_cross_entropy cannot both be True."
     )
+    import torch.nn as torch_nn
 
     from transformers.models.internvl import modeling_internvl
+    from transformers.models.internvl.modeling_internvl import InternVLForConditionalGeneration
+    from transformers.models.internvl.modeling_internvl import InternVLModel
+    from transformers.models.internvl.modeling_internvl import InternVLVisionLayer
+    from transformers.models.internvl.modeling_internvl import InternVLVisionModel
+    from transformers.models.internvl.modeling_internvl import InternVLVisionRMSNorm
 
+    from liger_kernel.transformers.layer_norm import LigerLayerNorm
     from liger_kernel.transformers.model.internvl import lce_forward as internvl_lce_forward
+    from liger_kernel.transformers.rms_norm import LigerRMSNorm
+
+    if layer_norm and model is None:
+        modeling_internvl.nn.LayerNorm = LigerLayerNorm
 
     if cross_entropy:
-        logger.warning(TRANSFORMER_DEPRECATION_WARNING)
-        modeling_internvl.nn.CrossEntropyLoss = LigerCrossEntropyLoss
+        logger.info("Apply liger cross entropy")
+
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
     if fused_linear_cross_entropy:
         modeling_internvl.InternVLForConditionalGeneration.forward = internvl_lce_forward
     if rms_norm:
         modeling_internvl.InternVLVisionRMSNorm = LigerRMSNorm
 
     if model is not None:
-        text_model_name, vision_model_name = model.config.text_config.model_type, model.config.vision_config.model_type
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+        if isinstance(model, (InternVLForConditionalGeneration, InternVLModel)):
+            # NOTE: language_model and visual properties can be accessed throught conditional class.
+            text_model = model.language_model
+            vision_model: InternVLVisionModel = model.vision_tower
+        else:
+            raise TypeError(
+                f"Unsupported internvl model type. `model` must be `InternVLForConditionalGeneration`, `InternVLModel`. Got: {type(model)}"
+            )
+
+        text_model_name = model.config.text_config.model_type
         text_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(text_model_name, None)
-        vision_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(vision_model_name, None)
 
         kwargs = {"cross_entropy": False, "fused_linear_cross_entropy": False, **kwargs} | {"rms_norm": rms_norm}
         if text_liger_fn:
@@ -2091,25 +2115,33 @@ def apply_liger_kernel_to_internvl(
                     f"These parameters are not supported by {text_model_name}. Enter the remaining {list(text_kwargs.keys())} except for {list(remain_params)}\n"
                     f"Parameters accepted by {text_model_name}: {list(accept_params.keys())}"
                 )
-            text_kwargs["model"] = model.language_model
+            text_kwargs["model"] = text_model
             text_liger_fn(**text_kwargs)
         elif text_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
             logger.warning(f"{text_model_name} is not supported by Liger kernel.")
 
-        if vision_liger_fn:
-            accept_params = inspect.signature(vision_liger_fn).parameters
-            remain_params = set(kwargs) - (set(accept_params) & set(kwargs))
-            vision_kwargs = {k: v for k, v in kwargs.items() if k not in remain_params}
+        # Patch vision model RMSNorm layers
+        if rms_norm:
+            for encoder_layer in vision_model.encoder.layer:
+                encoder_layer: InternVLVisionLayer
+                if isinstance(encoder_layer.attention.q_norm, InternVLVisionRMSNorm):
+                    _patch_rms_norm_module(encoder_layer.attention.q_norm)
+                if isinstance(encoder_layer.attention.k_norm, InternVLVisionRMSNorm):
+                    _patch_rms_norm_module(encoder_layer.attention.k_norm)
 
-            if remain_params:
-                logger.warning(
-                    f"These parameters are not supported by {vision_model_name}. Enter the remaining {list(vision_kwargs.keys())} except for {list(remain_params)}\n"
-                    f"Parameters accepted by {vision_model_name}: {list(accept_params.keys())}"
-                )
-            vision_kwargs["model"] = model.vision_tower
-            vision_liger_fn(**vision_kwargs)
-        elif vision_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
-            logger.warning(f"{vision_model_name} is not supported by Liger kernel.")
+        # Patch vision model LayerNorm layers
+        if layer_norm:
+            # Patch layernorm
+            if isinstance(vision_model.layernorm, torch_nn.LayerNorm):
+                _patch_layer_norm_module(vision_model.layernorm)
+
+            # Patch encoder layers
+            for encoder_layer in vision_model.encoder.layer:
+                encoder_layer: InternVLVisionLayer
+                if isinstance(encoder_layer.layernorm_before, torch_nn.LayerNorm):
+                    _patch_layer_norm_module(encoder_layer.layernorm_before)
+                if isinstance(encoder_layer.layernorm_after, torch_nn.LayerNorm):
+                    _patch_layer_norm_module(encoder_layer.layernorm_after)
 
 
 def apply_liger_kernel_to_smolvlm(
```
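
For reference, a minimal usage sketch of the patched entry point, assuming transformers>=4.52.1 and an InternVL checkpoint in the transformers-native format; the checkpoint name below is illustrative, not taken from this commit.

```python
import torch
from transformers import AutoModelForImageTextToText

from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_internvl

# Class-level patching: with model=None and layer_norm=True, the nn.LayerNorm seen by
# modeling_internvl is swapped for LigerLayerNorm before any model is constructed.
apply_liger_kernel_to_internvl(layer_norm=True, rms_norm=True, fused_linear_cross_entropy=True)

# Instance-level patching: for an already-loaded model, the q_norm/k_norm RMSNorm modules
# and the LayerNorm modules inside the vision tower are patched in place.
model = AutoModelForImageTextToText.from_pretrained(
    "OpenGVLab/InternVL3-1B-hf",  # illustrative checkpoint name
    torch_dtype=torch.bfloat16,
)
apply_liger_kernel_to_internvl(layer_norm=True, rms_norm=True, model=model)
```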

test/transformers/test_monkey_patch.py

Lines changed: 23 additions & 14 deletions
```diff
@@ -1637,21 +1637,30 @@ def test_apply_liger_kernel_to_instance_for_internvl():
 
     # Instantiate a dummy model
     config = transformers.models.internvl.configuration_internvl.InternVLConfig(
-        torch_dtype=torch.bfloat16,
-        rms_norm_eps=1e-5,
-        hidden_size=32,
-        intermediate_size=48,
-        hidden_act="silu",
-        num_hidden_layers=2,
-        num_attention_heads=2,
-        max_position_embeddings=128,
-        vocab_size=1000,
+        dtype=torch.bfloat16,
+        text_config={
+            "rms_norm_eps": 1e-5,
+            "hidden_size": 256,  # 1024
+            "intermediate_size": 1024,  # 4096
+            "hidden_act": "silu",
+            "num_hidden_layers": 4,  # 24
+            "num_attention_heads": 4,  # 16
+            "num_key_value_heads": 2,  # 16
+            "max_position_embeddings": 4096,  # 8192
+            "vocab_size": 32000,  # 151936
+            "bos_token_id": 1,
+            "eos_token_id": 2,
+            "pad_token_id": 2,
+            "tie_word_embeddings": False,
+        },
         vision_config={
-            "depth": 4,
-            "embed_dim": 128,
-            "num_heads": 8,
-            "hidden_size": 1024,
+            "hidden_size": 256,  # 1024
+            "intermediate_size": 1024,  # 4096
+            "num_hidden_layers": 4,  # 24
+            "num_attention_heads": 4,  # 16
         },
+        image_token_id=10,
+        attn_implementation="sdpa",  # default value, pytorch native attention
     )
     dummy_model_instance = InternVLForConditionalGeneration._from_config(config)
 
@@ -1692,7 +1701,7 @@ def test_apply_liger_kernel_to_instance_for_smolvlm2():
 
     # Instantiate a dummy model
    config = transformers.models.smolvlm.configuration_smolvlm.SmolVLMConfig(
-        torch_dtype=torch.bfloat16,
+        dtype=torch.bfloat16,
         text_config={
             "rms_norm_eps": 1e-5,
             "hidden_size": 576,
```

test/utils.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -585,10 +585,12 @@ def revert_liger_kernel_to_internvl(model_config: MiniModelConfig):
     """
     Revert all Liger kernel patches applied to InternVL.
     """
+    import torch.nn as nn
 
     from transformers.models.internvl import modeling_internvl
     from transformers.models.qwen2 import modeling_qwen2
 
+    importlib.reload(nn)
     importlib.reload(modeling_internvl)
     importlib.reload(modeling_qwen2)
 
```
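
The extra `importlib.reload(nn)` is needed because the class-level `layer_norm` patch assigns `LigerLayerNorm` onto `torch.nn` itself (via `modeling_internvl.nn`), so reloading only the modeling modules would leave the replacement in place. A small sketch of the effect, using the names from the diff above:

```python
import importlib

import torch.nn as nn

from liger_kernel.transformers.layer_norm import LigerLayerNorm

# What the layer_norm=True, model=None path effectively does: modeling_internvl.nn is torch.nn,
# so the assignment lands on torch.nn.LayerNorm globally.
nn.LayerNorm = LigerLayerNorm
assert nn.LayerNorm is LigerLayerNorm

# Reloading torch.nn re-executes its __init__ and restores the stock LayerNorm class.
importlib.reload(nn)
assert nn.LayerNorm is not LigerLayerNorm
```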
