diff --git a/deepmd/pd/entrypoints/main.py b/deepmd/pd/entrypoints/main.py
index 8600d73bc9..f66a2225ee 100644
--- a/deepmd/pd/entrypoints/main.py
+++ b/deepmd/pd/entrypoints/main.py
@@ -224,6 +224,20 @@ def get_backend_info(self) -> dict:
             **op_info,
         }
 
+    def get_device_name(self) -> str | None:
+        """Get the underlying GPU name.
+
+        Returns
+        -------
+        str or None
+            The device name if available, otherwise None.
+        """
+        if paddle.device.is_compiled_with_cuda():
+            cuda = paddle.device.cuda
+            if cuda.device_count() > 0:
+                return cuda.get_device_name()
+        return None
+
 
 def train(
     input_file: str,
diff --git a/deepmd/pt/entrypoints/main.py b/deepmd/pt/entrypoints/main.py
index 75efdd8c9f..71a358173e 100644
--- a/deepmd/pt/entrypoints/main.py
+++ b/deepmd/pt/entrypoints/main.py
@@ -252,6 +252,18 @@ def get_backend_info(self) -> dict:
             **op_info,
         }
 
+    def get_device_name(self) -> str | None:
+        """Get the name of PyTorch's current CUDA device.
+
+        Returns
+        -------
+        str or None
+            The device name if available, otherwise None.
+        """
+        if torch.cuda.is_available():
+            return torch.cuda.get_device_name(torch.cuda.current_device())
+        return None
+
 
 def train(
     input_file: str,
diff --git a/deepmd/tf/train/run_options.py b/deepmd/tf/train/run_options.py
index 0b5c3b1b43..e34244abbf 100644
--- a/deepmd/tf/train/run_options.py
+++ b/deepmd/tf/train/run_options.py
@@ -73,6 +73,24 @@ def get_backend_info(self) -> dict:
             "build with TF lib": GLOBAL_CONFIG["tf_libs"].replace(";", "\n"),
         }
 
+    def get_device_name(self) -> str | None:
+        """Get the hardware device name if available.
+
+        Returns
+        -------
+        str or None
+            The device name (e.g., NVIDIA A100) if available, otherwise None.
+        """
+        try:
+            gpus = tf.config.get_visible_devices("GPU")
+            if gpus:
+                details = tf.config.experimental.get_device_details(gpus[0])
+                return details.get("device_name")
+        except (AttributeError, RuntimeError):
+            # The experimental API may be unavailable or may fail in some TF versions
+            pass
+        return None
+
 
 class RunOptions:
     """Class with info on how to run training (cluster, MPI and GPU config).
diff --git a/deepmd/utils/summary.py b/deepmd/utils/summary.py
index c00e6deb9e..f052f67699 100644
--- a/deepmd/utils/summary.py
+++ b/deepmd/utils/summary.py
@@ -74,6 +74,9 @@ def __call__(self) -> None:
                 "computing device": self.get_compute_device(),
             }
         )
+        device_name = self.get_device_name()
+        if device_name:
+            build_info["Device Name"] = device_name
         if self.is_built_with_cuda():
             env_value = os.environ.get("CUDA_VISIBLE_DEVICES", "unset")
             build_info["CUDA_VISIBLE_DEVICES"] = env_value
@@ -123,6 +126,10 @@ def get_compute_device(self) -> str:
     def get_ngpus(self) -> int:
         """Get the number of GPUs."""
 
+    @abstractmethod
+    def get_device_name(self) -> str | None:
+        """Get the device name (e.g., NVIDIA A800-SXM4-80GB) if available."""
+
     def get_backend_info(self) -> dict:
         """Get backend information."""
         return {}
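
For orientation (not part of the patch itself), here is a minimal, self-contained sketch of the pattern the diff introduces: an abstract get_device_name() hook on the summary printer, a backend-specific override, and a "Device Name" summary entry that is only emitted when a name is found. The _ToySummaryPrinter and _ToyTorchPrinter names are hypothetical stand-ins for illustration, not the actual deepmd classes.

# Illustration only: a stripped-down stand-in for the summary-printer pattern
# touched by this patch; the real deepmd classes carry many more fields.
from abc import ABC, abstractmethod


class _ToySummaryPrinter(ABC):
    """Sketch of the base-class contract added in deepmd/utils/summary.py."""

    @abstractmethod
    def get_device_name(self) -> "str | None":
        """Get the device name (e.g., NVIDIA A800-SXM4-80GB) if available."""

    def __call__(self) -> None:
        build_info = {}
        # Mirrors the new summary.py logic: report the name only when one exists.
        device_name = self.get_device_name()
        if device_name:
            build_info["Device Name"] = device_name
        print(build_info)


class _ToyTorchPrinter(_ToySummaryPrinter):
    """Backend-specific override, following the PyTorch variant in the patch."""

    def get_device_name(self) -> "str | None":
        try:
            import torch
        except ImportError:
            # Keep the sketch runnable even without PyTorch installed.
            return None
        if torch.cuda.is_available():
            return torch.cuda.get_device_name(torch.cuda.current_device())
        return None


if __name__ == "__main__":
    # Prints {"Device Name": "NVIDIA ..."} on a GPU machine, {} otherwise.
    _ToyTorchPrinter()()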