40 changes: 38 additions & 2 deletions megatron/core/models/gpt/gpt_model.py
@@ -253,6 +253,15 @@ def __init__(
tp_group=self.pg_collection.tp,
)

# The linear cross-entropy fusion bypasses the output layer during the forward pass.
# The output layer's weight is therefore also shared directly with the GPT model to
# prevent issues arising from pre-forward hooks attached to the output layer.
if (
self.config.cross_entropy_loss_fusion
and self.config.cross_entropy_fusion_impl == 'linear'
):
self.shared_output_layer_weight = self.output_layer.weight

if self.pre_process or self.post_process or self.mtp_process:
self.setup_embeddings_and_output_layer()

@@ -611,7 +620,24 @@ def _postprocess(
cp_group=self.cp_group,
packed_seq_params=packed_seq_params,
)
mtp_loss = self.compute_language_model_loss(mtp_labels, mtp_logits)

# Compute the MTP loss without storing the full logits tensor, to save memory.
mtp_loss = self.compute_output_layer_and_language_model_loss(
hidden_states_list[mtp_layer_number + 1],
labels=mtp_labels,
weight=(
output_weight
if output_weight is not None
else self.shared_output_layer_weight
),
sequence_parallel_enabled=self.output_layer.sequence_parallel,
column_parallel_linear=self.output_layer,
col_linear_kwargs={
'weight': output_weight,
'runtime_gather_output': runtime_gather_output,
},
)

mtp_loss = loss_mask * mtp_loss
if self.training:
# TODO(shifangx): remove the use of parallel_state here
@@ -685,7 +711,17 @@ def _postprocess(
# [s b h] => [b s h]
return logits.transpose(0, 1).contiguous()

loss = self.compute_language_model_loss(labels, logits)
loss = self.compute_output_layer_and_language_model_loss(
hidden_states,
labels=labels,
weight=output_weight if output_weight is not None else self.shared_output_layer_weight,
sequence_parallel_enabled=self.output_layer.sequence_parallel,
column_parallel_linear=self.output_layer,
col_linear_kwargs={
'weight': output_weight,
'runtime_gather_output': runtime_gather_output,
},
)

return loss

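For context on the new call: compute_output_layer_and_language_model_loss computes the token loss directly from the hidden states and the output-layer weight, so the full [sequence, batch, vocab] logits tensor never has to be materialized. A rough sketch of that idea is below, assuming chunking over the sequence dimension; the function name, shapes, and chunking strategy are illustrative only, and unlike a real fused kernel this plain-PyTorch version still saves per-chunk logits for backward.

# Hypothetical sketch, not the Megatron-LM implementation: fuse the output
# projection and cross-entropy by chunking over the sequence so the whole
# [seq, batch, vocab] logits tensor is never alive at once in the forward pass.
# (A real fused kernel would also avoid caching logits for backward.)
import torch
import torch.nn.functional as F


def chunked_linear_cross_entropy(hidden_states, weight, labels, chunk_size=1024):
    """hidden_states: [seq, batch, hidden]; weight: [vocab, hidden];
    labels: [batch, seq]. Returns the per-token loss with shape [batch, seq]."""
    seq_len, batch, _ = hidden_states.shape
    losses = []
    for start in range(0, seq_len, chunk_size):
        end = min(start + chunk_size, seq_len)
        # Project only this slice of the sequence: [chunk, batch, vocab].
        logits = F.linear(hidden_states[start:end], weight)
        # Flatten to [chunk * batch, vocab] and [chunk * batch] for cross_entropy.
        chunk_labels = labels[:, start:end].transpose(0, 1).reshape(-1)
        loss = F.cross_entropy(
            logits.reshape(-1, logits.size(-1)), chunk_labels, reduction='none'
        ).view(end - start, batch)
        losses.append(loss)
    # Reassemble as [seq, batch] and return [batch, seq] to match the loss_mask.
    return torch.cat(losses, dim=0).transpose(0, 1)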
21 changes: 20 additions & 1 deletion megatron/core/models/mamba/mamba_model.py
@@ -149,6 +149,15 @@ def __init__(
tp_group=self.pg_collection.tp,
)

# The linear cross-entropy fusion bypasses the output layer during the forward pass.
# The output layer's weight is therefore also shared directly with the Mamba model to
# prevent issues arising from pre-forward hooks attached to the output layer.
if (
self.config.cross_entropy_loss_fusion
and self.config.cross_entropy_fusion_impl == 'linear'
):
self.shared_output_layer_weight = self.output_layer.weight

if self.pre_process or self.post_process:
self.setup_embeddings_and_output_layer()

@@ -303,6 +312,16 @@ def forward(
# [s b h] => [b s h]
return logits.transpose(0, 1).contiguous()

loss = self.compute_language_model_loss(labels, logits)
loss = self.compute_output_layer_and_language_model_loss(
hidden_states,
labels,
weight=output_weight if output_weight is not None else self.shared_output_layer_weight,
sequence_parallel_enabled=self.output_layer.sequence_parallel,
column_parallel_linear=self.output_layer,
col_linear_kwargs={
"weight": output_weight,
"runtime_gather_output": runtime_gather_output,
},
)

return loss
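
Both files gate the new path on the same two config fields used in the guards above. Assuming a typical TransformerConfig-style object (only the two field names below come from this diff; everything else is a placeholder), enabling the fused path would look roughly like:

# Illustrative only: turn on the fused linear + cross-entropy loss path.
config.cross_entropy_loss_fusion = True        # fuse the cross-entropy loss computation
config.cross_entropy_fusion_impl = 'linear'    # select the linear (logit-free) implementation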