Commit e808281

vvvdwbvvv, lancerts, and Tcc0403 authored
Fix missing test_apply_liger_kernel_to_instance_for_paligemma test for PaliGemma in test_monkey_patch.py (#785)
## Summary

What this PR adds:

- A new unit test, `test_apply_liger_kernel_to_instance_for_paligemma()`, in `test/transformers/test_monkey_patch.py`. Fixes #776.
- The test instantiates a dummy `PaliGemmaForConditionalGeneration` model, confirms it is un-patched, runs `_apply_liger_kernel_to_instance()`, then verifies that:
  - `model.forward` is replaced by `paligemma_lce_forward()`.
  - `vision_tower.vision_model.post_layernorm.forward` is replaced by `LigerLayerNorm.forward`.
  - Every encoder layer's `layer_norm1.forward` and `layer_norm2.forward` are also replaced.
  - Source equality is checked with `inspect.getsource` before and after patching.

## Testing Done

`transformers==4.49.0`

<details>
<summary>Test result</summary>

❯ python3 -m pytest test/transformers/test_monkey_patch.py -k paligemma -v -rP
============================================== test session starts ==============================================
platform linux -- Python 3.11.11, pytest-8.4.1, pluggy-1.6.0 -- /home/vvvdwbvvv/.local/bin/python3
cachedir: .pytest_cache
rootdir: /home/vvvdwbvvv/develop/Liger-Kernel
configfile: pyproject.toml
plugins: asyncio-1.0.0
asyncio: mode=Mode.AUTO, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function
collected 33 items / 32 deselected / 1 selected

test/transformers/test_monkey_patch.py::test_apply_liger_kernel_to_instance_for_paligemma
------------------------------------------------- live log call -------------------------------------------------
INFO liger_kernel.transformers.monkey_patch:monkey_patch.py:1864 Applying Liger kernels to model instance with model type: paligemma with kwargs: {}
PASSED [100%]
==================================================== PASSES =====================================================
_______________________________ test_apply_liger_kernel_to_instance_for_paligemma _______________________________
--------------------------------------------- Captured stdout call ----------------------------------------------
PaliGemmaForConditionalGeneration(
  (vision_tower): SiglipVisionModel(
    (vision_model): SiglipVisionTransformer(
      (embeddings): SiglipVisionEmbeddings(
        (patch_embedding): Conv2d(3, 48, kernel_size=(16, 16), stride=(16, 16), padding=valid)
        (position_embedding): Embedding(196, 48)
      )
      (encoder): SiglipEncoder(
        (layers): ModuleList(
          (0-1): 2 x SiglipEncoderLayer(
            (self_attn): SiglipSdpaAttention(
              (k_proj): Linear(in_features=48, out_features=48, bias=True)
              (v_proj): Linear(in_features=48, out_features=48, bias=True)
              (q_proj): Linear(in_features=48, out_features=48, bias=True)
              (out_proj): Linear(in_features=48, out_features=48, bias=True)
            )
            (layer_norm1): LigerLayerNorm((48,), eps=1e-05)
            (mlp): SiglipMLP(
              (activation_fn): PytorchGELUTanh()
              (fc1): Linear(in_features=48, out_features=64, bias=True)
              (fc2): Linear(in_features=64, out_features=48, bias=True)
            )
            (layer_norm2): LigerLayerNorm((48,), eps=1e-05)
          )
        )
      )
      (post_layernorm): LigerLayerNorm((48,), eps=1e-05)
      (head): SiglipMultiheadAttentionPoolingHead(
        (attention): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=48, out_features=48, bias=True)
        )
        (layernorm): LigerLayerNorm((48,), eps=1e-05, elementwise_affine=True)
        (mlp): SiglipMLP(
          (activation_fn): PytorchGELUTanh()
          (fc1): Linear(in_features=48, out_features=64, bias=True)
          (fc2): Linear(in_features=64, out_features=48, bias=True)
        )
      )
    )
  )
  (multi_modal_projector): PaliGemmaMultiModalProjector(
    (linear): Linear(in_features=48, out_features=2048, bias=True)
  )
  (language_model): GemmaForCausalLM(
    (model): GemmaModel(
      (embed_tokens): Embedding(256000, 32, padding_idx=0)
      (layers): ModuleList(
        (0-1): 2 x GemmaDecoderLayer(
          (self_attn): GemmaAttention(
            (q_proj): Linear(in_features=32, out_features=4096, bias=False)
            (k_proj): Linear(in_features=32, out_features=4096, bias=False)
            (v_proj): Linear(in_features=32, out_features=4096, bias=False)
            (o_proj): Linear(in_features=4096, out_features=32, bias=False)
          )
          (mlp): LigerGEGLUMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LigerRMSNorm((32,), eps=1e-05, offset=1.0, in_place=True, row_mode=None)
          (post_attention_layernorm): LigerRMSNorm((32,), eps=1e-05, offset=1.0, in_place=True, row_mode=None)
        )
      )
      (norm): LigerRMSNorm((32,), eps=1e-05, offset=1.0, in_place=True, row_mode=None)
      (rotary_emb): GemmaRotaryEmbedding()
    )
    (lm_head): Linear(in_features=32, out_features=256000, bias=False)
  )
)
----------------------------------------------- Captured log call -----------------------------------------------
INFO liger_kernel.transformers.monkey_patch:monkey_patch.py:1864 Applying Liger kernels to model instance with model type: paligemma with kwargs: {}
======================================= 1 passed, 32 deselected in 1.78s ========================================

</details>

`transformers==4.53.1`

<details>
<summary>Test result</summary>

❯ python3 -m pytest test/transformers/test_monkey_patch.py -k paligemma -v -rP
============================================== test session starts ==============================================
platform linux -- Python 3.11.11, pytest-8.4.1, pluggy-1.6.0 -- /home/vvvdwbvvv/.local/bin/python3
cachedir: .pytest_cache
rootdir: /home/vvvdwbvvv/develop/Liger-Kernel
configfile: pyproject.toml
plugins: asyncio-1.0.0
asyncio: mode=Mode.AUTO, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function
collected 33 items / 32 deselected / 1 selected

test/transformers/test_monkey_patch.py::test_apply_liger_kernel_to_instance_for_paligemma
------------------------------------------------- live log call -------------------------------------------------
INFO liger_kernel.transformers.monkey_patch:monkey_patch.py:1864 Applying Liger kernels to model instance with model type: paligemma with kwargs: {}
PASSED [100%]
==================================================== PASSES =====================================================
_______________________________ test_apply_liger_kernel_to_instance_for_paligemma _______________________________
--------------------------------------------- Captured stdout call ----------------------------------------------
PaliGemmaForConditionalGeneration(
  (model): PaliGemmaModel(
    (vision_tower): SiglipVisionModel(
      (vision_model): SiglipVisionTransformer(
        (embeddings): SiglipVisionEmbeddings(
          (patch_embedding): Conv2d(3, 48, kernel_size=(16, 16), stride=(16, 16), padding=valid)
          (position_embedding): Embedding(196, 48)
        )
        (encoder): SiglipEncoder(
          (layers): ModuleList(
            (0-1): 2 x SiglipEncoderLayer(
              (layer_norm1): LigerLayerNorm((48,), eps=1e-05)
              (self_attn): SiglipAttention(
                (k_proj): Linear(in_features=48, out_features=48, bias=True)
                (v_proj): Linear(in_features=48, out_features=48, bias=True)
                (q_proj): Linear(in_features=48, out_features=48, bias=True)
                (out_proj): Linear(in_features=48, out_features=48, bias=True)
              )
              (layer_norm2): LigerLayerNorm((48,), eps=1e-05)
              (mlp): SiglipMLP(
                (activation_fn): PytorchGELUTanh()
                (fc1): Linear(in_features=48, out_features=64, bias=True)
                (fc2): Linear(in_features=64, out_features=48, bias=True)
              )
            )
          )
        )
        (post_layernorm): LigerLayerNorm((48,), eps=1e-05)
        (head): SiglipMultiheadAttentionPoolingHead(
          (attention): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=48, out_features=48, bias=True)
          )
          (layernorm): LigerLayerNorm((48,), eps=1e-05, elementwise_affine=True)
          (mlp): SiglipMLP(
            (activation_fn): PytorchGELUTanh()
            (fc1): Linear(in_features=48, out_features=64, bias=True)
            (fc2): Linear(in_features=64, out_features=48, bias=True)
          )
        )
      )
    )
    (multi_modal_projector): PaliGemmaMultiModalProjector(
      (linear): Linear(in_features=48, out_features=2048, bias=True)
    )
    (language_model): GemmaModel(
      (embed_tokens): Embedding(256000, 32, padding_idx=0)
      (layers): ModuleList(
        (0-1): 2 x GemmaDecoderLayer(
          (self_attn): GemmaAttention(
            (q_proj): Linear(in_features=32, out_features=4096, bias=False)
            (k_proj): Linear(in_features=32, out_features=4096, bias=False)
            (v_proj): Linear(in_features=32, out_features=4096, bias=False)
            (o_proj): Linear(in_features=4096, out_features=32, bias=False)
          )
          (mlp): LigerGEGLUMLP(
            (gate_proj): Linear(in_features=32, out_features=64, bias=False)
            (up_proj): Linear(in_features=32, out_features=64, bias=False)
            (down_proj): Linear(in_features=64, out_features=32, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LigerRMSNorm((32,), eps=1e-05, offset=1.0, in_place=True, row_mode=None)
          (post_attention_layernorm): LigerRMSNorm((32,), eps=1e-05, offset=1.0, in_place=True, row_mode=None)
        )
      )
      (norm): LigerRMSNorm((32,), eps=1e-05, offset=1.0, in_place=True, row_mode=None)
      (rotary_emb): GemmaRotaryEmbedding()
    )
  )
  (lm_head): Linear(in_features=32, out_features=256000, bias=False)
)
----------------------------------------------- Captured log call -----------------------------------------------
INFO liger_kernel.transformers.monkey_patch:monkey_patch.py:1864 Applying Liger kernels to model instance with model type: paligemma with kwargs: {}
======================================= 1 passed, 32 deselected in 2.42s ========================================

</details>

- Hardware Type: <BLANK>
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence

---------

Co-authored-by: Shao Tang <[email protected]>
Co-authored-by: Tcc0403 <[email protected]>
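For context, here is a minimal sketch (not part of this commit) of the instance-level patching flow the new test exercises. The config values mirror the dummy model built in the test; the `_apply_liger_kernel_to_instance` import path is an assumption based on the module named in the log output above.

```python
# Hedged sketch: build a tiny PaliGemma model and patch it in place with Liger kernels.
from transformers import PaliGemmaConfig, PaliGemmaForConditionalGeneration

# Assumed import path; the logs above show the function lives in
# liger_kernel.transformers.monkey_patch.
from liger_kernel.transformers.monkey_patch import _apply_liger_kernel_to_instance

config = PaliGemmaConfig(
    # Tiny sizes copied from the dummy config used in the new test.
    text_config={"num_hidden_layers": 2, "rms_norm_eps": 1e-5, "hidden_size": 32,
                 "intermediate_size": 64, "hidden_act": "silu"},
    vision_config={"num_hidden_layers": 2, "layer_norm_eps": 1e-5, "hidden_size": 48,
                   "intermediate_size": 64},
)
model = PaliGemmaForConditionalGeneration(config)

# Swaps forward methods and norm modules in place, dispatching on the model type
# ("paligemma" in the log output above).
_apply_liger_kernel_to_instance(model=model)

print(type(model.vision_tower.vision_model.post_layernorm))  # expected: LigerLayerNorm
```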
1 parent 5700de2 commit e808281

File tree: 2 files changed (+71, -2 lines)


src/liger_kernel/transformers/monkey_patch.py

Lines changed: 4 additions & 2 deletions
@@ -1096,7 +1096,9 @@ def apply_liger_kernel_to_paligemma(
     # PaliGemma submodules are ['vision_tower', 'multi_modal_projector', 'language_model']
 
     from transformers.models.gemma.modeling_gemma import GemmaForCausalLM
+    from transformers.models.gemma.modeling_gemma import GemmaModel
     from transformers.models.gemma2.modeling_gemma2 import Gemma2ForCausalLM
+    from transformers.models.gemma2.modeling_gemma2 import Gemma2Model
     from transformers.models.paligemma import modeling_paligemma
     from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
     from transformers.models.siglip import modeling_siglip
@@ -1155,7 +1157,7 @@ def apply_liger_kernel_to_paligemma(
 
     language_model = model.language_model
 
-    if isinstance(language_model, GemmaForCausalLM):
+    if isinstance(language_model, (GemmaForCausalLM, GemmaModel)):
         apply_liger_kernel_to_gemma(
             rope=rope,
             cross_entropy=False,
@@ -1165,7 +1167,7 @@ def apply_liger_kernel_to_paligemma(
             model=language_model,
         )
 
-    elif isinstance(language_model, Gemma2ForCausalLM):
+    elif isinstance(language_model, (Gemma2ForCausalLM, Gemma2Model)):
         apply_liger_kernel_to_gemma2(
             rope=rope,
             cross_entropy=False,
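The two `isinstance` changes above widen the language-model dispatch: with `transformers==4.49.0` the dummy model's `language_model` is a `GemmaForCausalLM`, while with `transformers==4.53.1` it is a bare `GemmaModel` (see the captured stdout in Testing Done). Below is an illustrative sketch of that dispatch with a hypothetical helper name, using only the arguments visible in the hunks; it is not the actual layout of `apply_liger_kernel_to_paligemma`.

```python
# Hypothetical helper for illustration only; mirrors the widened isinstance checks above.
from transformers.models.gemma.modeling_gemma import GemmaForCausalLM, GemmaModel
from transformers.models.gemma2.modeling_gemma2 import Gemma2ForCausalLM, Gemma2Model

from liger_kernel.transformers import apply_liger_kernel_to_gemma, apply_liger_kernel_to_gemma2


def patch_paligemma_language_model(language_model, rope: bool = True) -> None:
    # cross_entropy=False mirrors the call sites shown in the diff above.
    if isinstance(language_model, (GemmaForCausalLM, GemmaModel)):
        apply_liger_kernel_to_gemma(rope=rope, cross_entropy=False, model=language_model)
    elif isinstance(language_model, (Gemma2ForCausalLM, Gemma2Model)):
        apply_liger_kernel_to_gemma2(rope=rope, cross_entropy=False, model=language_model)
    else:
        # Error handling here is illustrative, not what monkey_patch.py does.
        raise TypeError(f"Unsupported PaliGemma language model: {type(language_model)}")
```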

test/transformers/test_monkey_patch.py

Lines changed: 67 additions & 0 deletions
@@ -38,6 +38,7 @@
     from liger_kernel.transformers.model.mistral import lce_forward as mistral_lce_forward
     from liger_kernel.transformers.model.mixtral import lce_forward as mixtral_lce_forward
     from liger_kernel.transformers.model.mllama import lce_forward as mllama_lce_forward
+    from liger_kernel.transformers.model.paligemma import lce_forward as paligemma_lce_forward
     from liger_kernel.transformers.model.phi3 import lce_forward as phi3_lce_forward
     from liger_kernel.transformers.model.qwen2 import lce_forward as qwen2_lce_forward
 else:
@@ -49,6 +50,7 @@
     )
     from liger_kernel.transformers.model.mixtral import lce_forward_deprecated as mixtral_lce_forward
     from liger_kernel.transformers.model.mllama import lce_forward_deprecated as mllama_lce_forward
+    from liger_kernel.transformers.model.paligemma import lce_forward_deprecated as paligemma_lce_forward
     from liger_kernel.transformers.model.phi3 import lce_forward_deprecated as phi3_lce_forward
     from liger_kernel.transformers.model.qwen2 import lce_forward_deprecated as qwen2_lce_forward
 
@@ -126,6 +128,15 @@ def is_gemma3_available():
         return False
 
 
+def is_paligemma_available():
+    try:
+        import transformers.models.paligemma  # noqa: F401
+
+        return True
+    except ImportError:
+        return False
+
+
 def test_import_from_root():
     try:
         from liger_kernel.transformers import AutoLigerKernelForCausalLM  # noqa: F401
@@ -793,6 +804,62 @@ def test_apply_liger_kernel_to_instance_for_gemma2():
             pytest.fail(f"An exception occured in extra_expr: {type(e).__name__} - {e}")
 
 
+@pytest.mark.skipif(not is_paligemma_available(), reason="paligemma module not available")
+def test_apply_liger_kernel_to_instance_for_paligemma():
+    # Ensure any monkey patching is cleaned up for subsequent tests
+    with patch("transformers.models.paligemma.modeling_paligemma"):
+        from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
+
+        # Instantiate a dummy model
+        config = transformers.models.paligemma.configuration_paligemma.PaliGemmaConfig(
+            torch_dtype=torch.bfloat16,
+            text_config={
+                "num_hidden_layers": 2,
+                "rms_norm_eps": 1e-5,
+                "hidden_size": 32,
+                "intermediate_size": 64,
+                "hidden_act": "silu",
+            },
+            vision_config={
+                "num_hidden_layers": 2,
+                "layer_norm_eps": 1e-5,
+                "hidden_size": 48,
+                "intermediate_size": 64,
+            },
+        )
+
+        dummy_model_instance = PaliGemmaForConditionalGeneration(config)
+        assert isinstance(dummy_model_instance, PaliGemmaForConditionalGeneration)
+
+        # Check that model instance variables are not yet patched with Liger modules
+        assert inspect.getsource(dummy_model_instance.forward) != inspect.getsource(paligemma_lce_forward)
+        assert inspect.getsource(
+            dummy_model_instance.vision_tower.vision_model.post_layernorm.forward
+        ) != inspect.getsource(LigerLayerNorm.forward)
+
+        for layer in dummy_model_instance.vision_tower.vision_model.encoder.layers:
+            assert inspect.getsource(layer.layer_norm1.forward) != inspect.getsource(LigerLayerNorm.forward)
+            assert inspect.getsource(layer.layer_norm2.forward) != inspect.getsource(LigerLayerNorm.forward)
+
+        # Test applying kernels to the model instance
+        _apply_liger_kernel_to_instance(model=dummy_model_instance)
+
+        # Check that the model's instance variables were correctly patched with Liger modules
+        assert inspect.getsource(dummy_model_instance.forward) == inspect.getsource(paligemma_lce_forward)
+        assert inspect.getsource(
+            dummy_model_instance.vision_tower.vision_model.post_layernorm.forward
+        ) == inspect.getsource(LigerLayerNorm.forward)
+
+        for layer in dummy_model_instance.vision_tower.vision_model.encoder.layers:
+            assert inspect.getsource(layer.layer_norm1.forward) == inspect.getsource(LigerLayerNorm.forward)
+            assert inspect.getsource(layer.layer_norm2.forward) == inspect.getsource(LigerLayerNorm.forward)
+
+        try:
+            print(dummy_model_instance)
+        except Exception as e:
+            pytest.fail(f"An exception occured in extra_expr: {type(e).__name__} - {e}")
+
+
 @pytest.mark.skipif(not is_gemma3_available(), reason="gemma3 module not available")
 def test_apply_liger_kernel_to_instance_for_gemma3_text():
     # Ensure any monkey patching is cleaned up for subsequent tests