Skip to content
Open
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions deepmd/pd/train/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@
from deepmd.utils.data import (
DataRequirementItem,
)
from deepmd.utils.nan_detector import (
check_total_loss_nan,
)
from deepmd.utils.path import (
DPH5Path,
)
Expand Down Expand Up @@ -859,6 +862,9 @@ def log_loss_valid(_task_key="Default"):

if not self.multi_task:
train_results = log_loss_train(loss, more_loss)
# Check for NaN in total loss using CPU values from lcurve computation
if self.rank == 0 and "rmse" in train_results:
check_total_loss_nan(display_step_id, train_results["rmse"])
Copy link

Copilot AI Sep 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function is checking 'rmse' which represents root mean square error, not total loss. This could miss NaN in the actual total loss while falsely triggering on RMSE calculations. Consider using the actual total loss value instead of RMSE.

Suggested change
check_total_loss_nan(display_step_id, train_results["rmse"])
check_total_loss_nan(display_step_id, loss)

Copilot uses AI. Check for mistakes.
valid_results = log_loss_valid()
if self.rank == 0:
log.info(
Expand Down Expand Up @@ -900,6 +906,11 @@ def log_loss_valid(_task_key="Default"):
loss, more_loss, _task_key=_key
)
valid_results[_key] = log_loss_valid(_task_key=_key)
# Check for NaN in total loss using CPU values from lcurve computation
if self.rank == 0 and "rmse" in train_results[_key]:
check_total_loss_nan(
display_step_id, train_results[_key]["rmse"]
Copy link

Copilot AI Sep 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function is checking 'rmse' which represents root mean square error, not total loss. This could miss NaN in the actual total loss while falsely triggering on RMSE calculations. Consider using the actual total loss value instead of RMSE.

Copilot uses AI. Check for mistakes.
)
if self.rank == 0:
log.info(
format_training_message_per_task(
Expand Down
11 changes: 11 additions & 0 deletions deepmd/pt/train/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@
from deepmd.utils.data import (
DataRequirementItem,
)
from deepmd.utils.nan_detector import (
check_total_loss_nan,
)

if torch.__version__.startswith("2"):
import torch._dynamo
Expand Down Expand Up @@ -949,6 +952,9 @@ def log_loss_valid(_task_key: str = "Default") -> dict:

if not self.multi_task:
train_results = log_loss_train(loss, more_loss)
# Check for NaN in total loss using CPU values from lcurve computation
if self.rank == 0 and "rmse" in train_results:
check_total_loss_nan(display_step_id, train_results["rmse"])
Comment on lines +956 to +957
Copy link

Copilot AI Sep 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function is checking 'rmse' which represents root mean square error, not total loss. This could miss NaN in the actual total loss while falsely triggering on RMSE calculations. Consider using the actual total loss value instead of RMSE.

Suggested change
if self.rank == 0 and "rmse" in train_results:
check_total_loss_nan(display_step_id, train_results["rmse"])
if self.rank == 0:
check_total_loss_nan(display_step_id, loss)

Copilot uses AI. Check for mistakes.
valid_results = log_loss_valid()
if self.rank == 0:
log.info(
Expand Down Expand Up @@ -997,6 +1003,11 @@ def log_loss_valid(_task_key: str = "Default") -> dict:
loss, more_loss, _task_key=_key
)
valid_results[_key] = log_loss_valid(_task_key=_key)
# Check for NaN in total loss using CPU values from lcurve computation
if self.rank == 0 and "rmse" in train_results[_key]:
check_total_loss_nan(
display_step_id, train_results[_key]["rmse"]
Comment on lines +1007 to +1009
Copy link

Copilot AI Sep 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function is checking 'rmse' which represents root mean square error, not total loss. This could miss NaN in the actual total loss while falsely triggering on RMSE calculations. Consider using the actual total loss value instead of RMSE.

Suggested change
if self.rank == 0 and "rmse" in train_results[_key]:
check_total_loss_nan(
display_step_id, train_results[_key]["rmse"]
if self.rank == 0:
check_total_loss_nan(
display_step_id, loss

Copilot uses AI. Check for mistakes.
)
if self.rank == 0:
log.info(
format_training_message_per_task(
Expand Down
8 changes: 8 additions & 0 deletions deepmd/tf/train/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,9 @@
from deepmd.utils.data import (
DataRequirementItem,
)
from deepmd.utils.nan_detector import (
check_total_loss_nan,
)

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -684,6 +687,11 @@ def valid_on_the_fly(

cur_batch = self.cur_batch
current_lr = run_sess(self.sess, self.learning_rate)

# Check for NaN in total loss before writing to file and saving checkpoint
# We check the main total loss component that represents training loss
check_total_loss_nan(cur_batch, train_results["rmse"])
Copy link

Copilot AI Sep 22, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function is checking 'rmse' which represents root mean square error, not total loss. This could miss NaN in the actual total loss while falsely triggering on RMSE calculations. Consider using the actual total loss value instead of RMSE.

Suggested change
check_total_loss_nan(cur_batch, train_results["rmse"])
check_total_loss_nan(cur_batch, train_results["loss"])

Copilot uses AI. Check for mistakes.
Comment on lines +691 to +693

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[P1] Guard against missing 'rmse' metric in TensorFlow NaN check

NaN detection in valid_on_the_fly calls check_total_loss_nan(cur_batch, train_results["rmse"]) unconditionally. However get_evaluation_results often produces metrics keyed as rmse_e, rmse_f, etc., and does not guarantee a "rmse" entry (the comment below mentions rmse_*). In those configurations training now raises KeyError: 'rmse' before any logging or checkpointing, whereas the Paddle and PyTorch trainers already guard with "rmse" in train_results. TensorFlow should perform the same presence check or compute the appropriate scalar before invoking the NaN detector.

Useful? React with 👍 / 👎.


if print_header:
self.print_header(fp, train_results, valid_results)
self.print_on_training(
Expand Down
54 changes: 54 additions & 0 deletions deepmd/utils/nan_detector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
"""Utilities for detecting NaN values in loss during training."""

import logging
import math

log = logging.getLogger(__name__)


class LossNaNError(RuntimeError):
    """Raised when the total training loss becomes NaN.

    Carries the offending step and loss value so callers can log or
    report exactly where training diverged.
    """

    def __init__(self, step: int, total_loss: float) -> None:
        """Build the exception with context about where NaN appeared.

        Parameters
        ----------
        step : int
            The training step where NaN was detected
        total_loss : float
            The total loss value that contains NaN
        """
        # Keep the diagnostic context available as attributes for callers.
        self.step = step
        self.total_loss = total_loss
        # Assemble the user-facing message (identical wording to the
        # established error text for this condition).
        parts = [
            f"NaN detected in total loss at training step {step}: {total_loss}. ",
            "Training stopped to prevent wasting time with corrupted parameters. ",
            "This typically indicates unstable training conditions such as ",
            "learning rate too high, poor data quality, or numerical instability.",
        ]
        super().__init__("".join(parts))


def check_total_loss_nan(step: int, total_loss: float) -> None:
    """Check if the total loss contains NaN and raise an exception if found.

    This function is designed to be called during training after the total loss
    is computed and converted to a CPU float value.

    Parameters
    ----------
    step : int
        Current training step
    total_loss : float
        Total loss value to check for NaN

    Raises
    ------
    LossNaNError
        If the total loss contains NaN
    """
    if math.isnan(total_loss):
        # Use lazy %-style logging args so formatting is skipped when the
        # logger is disabled (and to follow the logging module convention).
        log.error("NaN detected in total loss at step %s: %s", step, total_loss)
        raise LossNaNError(step, total_loss)
Loading
Loading