Commit 075dbbc

hutaiHang, taihang, and qgallouedec authored
fix(trainer): Correct loss scaling for incomplete gradient accumulation steps (#39659)
* Fix issue #38837: wrong loss scaling in the last step of an epoch

* chore: trigger CI

* Update src/transformers/trainer.py

Co-authored-by: Quentin Gallouédec <[email protected]>

* Update src/transformers/modeling_flash_attention_utils.py

Co-authored-by: Quentin Gallouédec <[email protected]>

---------

Co-authored-by: taihang <[email protected]>
Co-authored-by: Quentin Gallouédec <[email protected]>
1 parent 1d06153 commit 075dbbc
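For context on the fix: when the number of batches in the training loop is not a multiple of gradient_accumulation_steps, the final accumulation cycle contains only the remainder of the batches, yet each loss was still divided by the full gradient_accumulation_steps, under-scaling the loss for that cycle. A minimal standalone sketch of the arithmetic (hypothetical loss values, not Trainer code):

# Illustrative only: hypothetical micro-batch losses for the last, shorter cycle.
gradient_accumulation_steps = 4
last_cycle_losses = [2.0, 2.0, 2.0]  # the remainder cycle has only 3 micro-batches

# Previous behaviour: always divide by the configured number of accumulation steps.
old_scaled = sum(loss / gradient_accumulation_steps for loss in last_cycle_losses)
print(old_scaled)  # 1.5 -- underestimates the true mean loss of 2.0

# Behaviour after this commit: divide by the number of batches actually in the cycle.
current_gradient_accumulation_steps = len(last_cycle_losses)
new_scaled = sum(loss / current_gradient_accumulation_steps for loss in last_cycle_losses)
print(new_scaled)  # 2.0 -- matches the true mean loss for the cycle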

File tree

1 file changed: +5 / -1 lines changed

src/transformers/trainer.py

Lines changed: 5 additions & 1 deletion
@@ -2530,6 +2530,9 @@ def _inner_training_loop(
                 update_step += 1
                 num_batches = args.gradient_accumulation_steps if update_step != (total_updates - 1) else remainder
                 batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches, args.device)
+                # Store the number of batches for current gradient accumulation
+                # This is used to correctly scale the loss when the last accumulation step has fewer batches
+                self.current_gradient_accumulation_steps = len(batch_samples)
                 for i, inputs in enumerate(batch_samples):
                     step += 1
                     do_sync_step = (step + 1) % args.gradient_accumulation_steps == 0 or (step + 1) == steps_in_epoch
@@ -3830,7 +3833,8 @@ def training_step(
         else:
             # Finally we need to normalize the loss for reporting if GA loss bug is not fixed during compute loss
             if (not self.model_accepts_loss_kwargs or num_items_in_batch is None) and self.compute_loss_func is None:
-                loss = loss / self.args.gradient_accumulation_steps
+                # If the model does not accept loss kwargs, we need to normalize the loss by the number of gradient accumulation steps
+                loss = loss / self.current_gradient_accumulation_steps
 
         # Turning off loss scaling w.r.t. gradient accumulation when DeepSpeed is enabled
         # https://github.com/huggingface/transformers/pull/35808
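For readers who want the same pattern outside of Trainer internals, here is a minimal, self-contained gradient-accumulation loop in plain PyTorch (the model, data, and hyperparameters are made up for illustration) that applies the idea of the patch: scale each micro-batch loss by the size of the current accumulation cycle rather than by the configured gradient_accumulation_steps.

import torch
from torch import nn

# Hypothetical toy setup; only the loss-scaling logic mirrors the patch above.
model = nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss_fn = nn.MSELoss()

gradient_accumulation_steps = 4
batches = [(torch.randn(2, 8), torch.randn(2, 1)) for _ in range(10)]  # 10 is not a multiple of 4

for start in range(0, len(batches), gradient_accumulation_steps):
    cycle = batches[start : start + gradient_accumulation_steps]
    # Counterpart of self.current_gradient_accumulation_steps: the real cycle size,
    # which is smaller than gradient_accumulation_steps on the last cycle (2 here).
    current_gradient_accumulation_steps = len(cycle)

    optimizer.zero_grad()
    for inputs, targets in cycle:
        loss = loss_fn(model(inputs), targets)
        # Scale by the actual number of micro-batches in this cycle before accumulating gradients.
        (loss / current_gradient_accumulation_steps).backward()
    optimizer.step()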
