Skip to content

Commit 3b79375

Browse files
jp1924Tcc0403
and authored
gemma3 consider loss_kwargs (#1007)
## Summary When applying the liger-kernel in SFTTrainer of the latest version of TRL (0.26.2), `return_token_accuracy` is also passed to input_data to compute `token_accuracy` alongside compute_loss. However, in Gemma3, `return_token_accuracy` is applied correctly during the loss step in causal_forward but not in multimodal_forward. Therefore, using inspect, I wrote code to separate only the kwagrs that can enter LCE from lm_kwagrs and pass them to loss_kwagrs. Using this, it functions correctly even in the latest version of trl. <!--- ## Details This is an optional section; is there anything specific that reviewers should be aware of? ---> ## Testing Done <!--- This is a required section; please describe how this change was tested. ---> <!-- Replace BLANK with your device type. For example, A100-80G-PCIe Complete the following tasks before sending your PR, and replace `[ ]` with `[x]` to indicate you have done them. --> - Hardware Type: <BLANK> - [ ] run `make test` to ensure correctness - [x] run `make checkstyle` to ensure code style - [ ] run `make test-convergence` to ensure convergence --------- Co-authored-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com>
1 parent 71ed8ac commit 3b79375

File tree

2 files changed

+15
-18
lines changed

2 files changed

+15
-18
lines changed

src/liger_kernel/transformers/model/gemma3.py

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
from transformers.cache_utils import Cache
99
from transformers.utils import logging
1010

11-
from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
1211
from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
1312
from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
1413
from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
@@ -268,23 +267,15 @@ def multimodal_forward(
268267
shift_hidden_states = shift_hidden_states.view(-1, self.config.text_config.hidden_size)
269268
shift_labels = shift_labels.view(-1).to(hidden_device)
270269

271-
# Extract loss-related kwargs for LigerFusedLinearCrossEntropyLoss
272-
lce_param_keys = {
273-
"ce_weight",
274-
"ignore_index",
275-
"lse_square_scale",
276-
"label_smoothing",
277-
"reduction",
278-
"softcap",
279-
"return_z_loss",
280-
"accum_dtype",
281-
"use_token_scaling",
282-
"return_token_accuracy",
283-
}
284-
lce_kwargs = {k: lm_kwargs.pop(k) for k in lce_param_keys if k in lm_kwargs}
285-
286-
lce = LigerFusedLinearCrossEntropyLoss(**lce_kwargs)
287-
result = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
270+
result = LigerForCausalLMLoss(
271+
hidden_states=shift_hidden_states,
272+
lm_head_weight=self.lm_head.weight,
273+
labels=shift_labels,
274+
hidden_size=self.config.text_config.hidden_size,
275+
shift_labels=shift_labels,
276+
final_logit_softcapping=getattr(self.config.text_config, "final_logit_softcapping", None),
277+
**lm_kwargs,
278+
)
288279
loss, _, token_accuracy = unpack_cross_entropy_result(result)
289280

290281
else:

src/liger_kernel/transformers/model/loss_utils.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
import inspect
2+
13
from typing import Optional
24
from typing import Tuple
35

@@ -71,6 +73,10 @@ def LigerForCausalLMLoss(
7173
return_token_accuracy: bool = False,
7274
**kwargs,
7375
):
76+
# Filter out inapplicable kwargs to liger_fused_linear_cross_entropy
77+
applicable_params = inspect.signature(F.liger_fused_linear_cross_entropy).parameters
78+
kwargs = {k: v for k, v in kwargs.items() if k in applicable_params}
79+
7480
# Skip upcast since intermediate values for the loss are all fp32 in kernel
7581
if shift_labels is None:
7682
# Shift so that token < n predict n

0 commit comments

Comments
 (0)