Pass loss params down to LigerFusedLinearCrossEntropyLoss in Gemma3 (#1010)

kaixuanliu · web-flow · commit 41d4bcfd5717 · 2026-01-14T09:53:42.000-08:00
## Summary
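<!--- This is a required section; please describe the main purpose of
this proposed code change. --->
Forward loss-related keyword arguments (e.g. `ignore_index`, `reduction`,
`softcap`) found in `lm_kwargs` to `LigerFusedLinearCrossEntropyLoss` in
Gemma3's `multimodal_forward`, instead of always constructing the loss with
its default arguments.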

<!---
## Details
This is an optional section; is there anything specific that reviewers
should be aware of?
--->

## Testing Done
<!--- This is a required section; please describe how this change was
tested. --->

<!--
Replace <BLANK> with your device type. For example, A100-80G-PCIe

Complete the following tasks before sending your PR, and replace `[ ]`
with `[x]` to indicate you have done them.
-->

- Hardware Type: <BLANK>
- [x] run `make test` to ensure correctness
- [x] run `make checkstyle` to ensure code style
- [x] run `make test-convergence` to ensure convergence

---------

Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>
diff --git a/src/liger_kernel/transformers/model/gemma3.py b/src/liger_kernel/transformers/model/gemma3.py
@@ -268,7 +268,22 @@ def multimodal_forward(
         shift_hidden_states = shift_hidden_states.view(-1, self.config.text_config.hidden_size)
         shift_labels = shift_labels.view(-1).to(hidden_device)
 
-        lce = LigerFusedLinearCrossEntropyLoss()
+        # Extract loss-related kwargs for LigerFusedLinearCrossEntropyLoss
+        lce_param_keys = {
+            "ce_weight",
+            "ignore_index",
+            "lse_square_scale",
+            "label_smoothing",
+            "reduction",
+            "softcap",
+            "return_z_loss",
+            "accum_dtype",
+            "use_token_scaling",
+            "return_token_accuracy",
+        }
+        lce_kwargs = {k: lm_kwargs.pop(k) for k in lce_param_keys if k in lm_kwargs}
+
+        lce = LigerFusedLinearCrossEntropyLoss(**lce_kwargs)
         result = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
         loss, _, token_accuracy = unpack_cross_entropy_result(result)