
Commit ea3ac1b

Tcc0403, lancerts, yundai424, and vaibhavjindal authored

Fix llava eval mode (#714)
## Summary

Llava is missing logits in eval mode: the fused linear cross entropy path computes the loss directly from the hidden states and the `lm_head` weight without ever materializing logits, so the non-training path now computes them via `lm_head` and falls back to a standard shifted `CrossEntropyLoss` when labels are provided.

## Testing Done

- Hardware Type: <BLANK>
- [ ] run `make test` to ensure correctness
- [ ] run `make checkstyle` to ensure code style
- [ ] run `make test-convergence` to ensure convergence

---------

Signed-off-by: Tcc0403 <76503978+Tcc0403@users.noreply.github.com>
Co-authored-by: Shao Tang <tangshao28@gmail.com>
Co-authored-by: Yun Dai <yundai424@gmail.com>
Co-authored-by: Vaibhav Jindal <vaibhav.jndl@gmail.com>
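To make the gap concrete, here is a minimal, hypothetical sketch of the train/eval split this commit introduces (simplified names, not the actual patched forward; `fused_loss_fn` stands in for the Liger fused linear cross entropy used in the real code): the fused training path never materializes logits, so the eval path has to compute them explicitly and fall back to a plain shifted cross entropy.

```python
# A minimal, hypothetical sketch of the control flow this commit introduces
# (simplified names; `fused_loss_fn` stands in for the fused linear cross
# entropy used in the real forward).
import torch
from torch.nn import CrossEntropyLoss


def loss_and_logits(lm_head, hidden_states, labels, training, fused_loss_fn=None):
    loss, logits = None, None
    if training and labels is not None and fused_loss_fn is not None:
        # Training path: the fused loss consumes the lm_head weight and the
        # hidden states directly, so full logits are never materialized.
        shift_hidden = hidden_states[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        loss = fused_loss_fn(
            lm_head.weight,
            shift_hidden.view(-1, shift_hidden.size(-1)),
            shift_labels.view(-1),
        )
    else:
        # Eval path (the case this commit fixes): materialize logits so the
        # caller gets them back, and compute a standard shifted cross entropy
        # when labels are provided.
        logits = lm_head(hidden_states)
        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = CrossEntropyLoss()(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1).to(shift_logits.device),
            )
    return loss, logits


# Eval-mode usage: logits are now populated instead of being left unset.
lm_head = torch.nn.Linear(16, 32, bias=False)
hidden = torch.randn(2, 6, 16)
labels = torch.randint(0, 32, (2, 6))
loss, logits = loss_and_logits(lm_head, hidden, labels, training=False)
print(logits.shape, loss.item())  # torch.Size([2, 6, 32]) and a scalar loss
```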
1 parent b53d954 commit ea3ac1b

File tree

1 file changed (+37 -1 lines changed)

  • src/liger_kernel/transformers/model/llava.py

src/liger_kernel/transformers/model/llava.py

Lines changed: 37 additions & 1 deletion
@@ -5,6 +5,7 @@
 
 import torch
 
+from torch.nn import CrossEntropyLoss
 from transformers.models.llava.modeling_llava import LlavaCausalLMOutputWithPast
 from transformers.utils import is_torchdynamo_compiling
 from transformers.utils.deprecation import deprecate_kwarg
@@ -189,7 +190,20 @@ def lce_forward_deprecated(
 
             lce = LigerFusedLinearCrossEntropyLoss()
             loss = lce(self.language_model.lm_head.weight, shift_hidden_states, shift_labels)
-
+        else:
+            logits = self.language_model.lm_head(hidden_states)
+            if labels is not None:
+                # Shift so that tokens < n predict n
+                if attention_mask is not None:
+                    shift_attention_mask = attention_mask[..., 1:]
+                    shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
+                    shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
+                else:
+                    shift_logits = logits[..., :-1, :].contiguous()
+                    shift_labels = labels[..., 1:].contiguous()
+                # Flatten the tokens
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device))
         if not return_dict:
             # NOTE: This part has not been tested.
             output = outputs[1:]
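The eval-path loss in the hunk above follows the stock shift-and-mask cross entropy pattern from Hugging Face's LLaVA modeling code: drop the last position's logits, drop the first position's labels, and keep only positions the attention mask marks as real. A small standalone illustration with toy, hypothetical shapes:

```python
# Toy illustration (hypothetical values, not part of the patch) of the
# "tokens < n predict n" shift plus attention-mask filtering used above.
import torch
from torch.nn import CrossEntropyLoss

batch, seq_len, vocab = 2, 5, 11
logits = torch.randn(batch, seq_len, vocab)
labels = torch.randint(0, vocab, (batch, seq_len))
attention_mask = torch.tensor([[1, 1, 1, 1, 1],
                               [1, 1, 1, 0, 0]])  # second sequence is padded

# Drop the last position's logits and the first position's labels, then keep
# only positions whose shifted attention mask is non-zero.
shift_attention_mask = attention_mask[..., 1:]
shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous()
shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()

loss = CrossEntropyLoss()(shift_logits.view(-1, vocab), shift_labels.view(-1))
print(shift_logits.shape, shift_labels.shape, loss.item())
# shift_logits is (6, 11): 4 kept positions from the first sequence + 2 from the second.
```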
@@ -349,6 +363,28 @@ def lce_forward(
                 shift_hidden_states.view(-1, shift_hidden_states.size(-1)),
                 shift_labels.view(-1).to(shift_hidden_states.device),
             )
+        else:
+            logits = self.language_model.lm_head(hidden_states)
+            if labels is not None:
+                # Upcast to float if we need to compute the loss to avoid potential precision issues
+                logits = logits.float()
+                shift_logits = logits[..., :-1, :]
+                shift_labels = labels[..., 1:]
+                if attention_mask is not None:
+                    # we use the input attention mask to shift the logits and labels, because it is 2D.
+                    # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
+                    shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(logits.device)
+                    shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
+                    shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+                else:
+                    shift_logits = shift_logits.contiguous()
+                    shift_labels = shift_labels.contiguous()
+                # Flatten the tokens
+                loss_fct = CrossEntropyLoss()
+
+                flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
+                flat_labels = shift_labels.view(-1).to(shift_logits.device)
+                loss = loss_fct(flat_logits, flat_labels)
 
         if not return_dict:
             # NOTE: This part has not been tested.
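Compared with the deprecated path, this hunk adds two details: logits are upcast to `float` before the loss, and the attention mask is cropped from the right so it still lines up when it is longer than the logits sequence (the PrefixTuning-with-peft case the comment mentions). A toy, hypothetical illustration of the crop:

```python
# Toy illustration (hypothetical shapes, not part of the patch) of cropping a
# longer attention mask from the right before the shifted cross entropy.
import torch
from torch.nn import CrossEntropyLoss

seq_len, num_virtual_tokens, vocab = 5, 3, 7
logits = torch.randn(1, seq_len, vocab).float()   # upcast mirrors the diff
labels = torch.randint(0, vocab, (1, seq_len))
# With prefix tuning the mask also covers virtual prefix tokens, so it is
# longer than the logits sequence.
attention_mask = torch.ones(1, num_virtual_tokens + seq_len)

shift_logits = logits[..., :-1, :]                # (1, 4, vocab)
shift_labels = labels[..., 1:]                    # (1, 4)
# Crop the mask to the last shift_logits.shape[1] positions so shapes match.
shift_attention_mask = attention_mask[:, -shift_logits.shape[1]:]
shift_logits = shift_logits[shift_attention_mask != 0].contiguous()
shift_labels = shift_labels[shift_attention_mask != 0].contiguous()

loss = CrossEntropyLoss()(shift_logits.view(-1, vocab), shift_labels.view(-1))
print(shift_logits.shape, shift_labels.shape, loss.item())  # (4, 7), (4,)
```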

0 commit comments
