Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions deepmd/pd/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,20 @@ def get_backend_info(self) -> dict:
**op_info,
}

def get_device_name(self) -> str | None:
    """Report the name of the CUDA device as seen by Paddle.

    Returns
    -------
    str or None
        The result of ``paddle.device.cuda.get_device_name()`` when Paddle
        is compiled with CUDA and reports at least one CUDA device,
        otherwise None.
    """
    # Without a CUDA build there is no GPU name to query.
    if not paddle.device.is_compiled_with_cuda():
        return None
    cuda_api = paddle.device.cuda
    has_gpu = cuda_api.device_count() > 0
    return cuda_api.get_device_name() if has_gpu else None


def train(
input_file: str,
Expand Down
12 changes: 12 additions & 0 deletions deepmd/pt/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,18 @@ def get_backend_info(self) -> dict:
**op_info,
}

def get_device_name(self) -> str | None:
    """Report the name of PyTorch's current CUDA device.

    Returns
    -------
    str or None
        The name of the current CUDA device when ``torch.cuda`` is
        available, otherwise None.
    """
    # No usable CUDA runtime means there is no device name to report.
    if not torch.cuda.is_available():
        return None
    active_index = torch.cuda.current_device()
    return torch.cuda.get_device_name(active_index)


def train(
input_file: str,
Expand Down
18 changes: 18 additions & 0 deletions deepmd/tf/train/run_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,24 @@ def get_backend_info(self) -> dict:
"build with TF lib": GLOBAL_CONFIG["tf_libs"].replace(";", "\n"),
}

def get_device_name(self) -> str | None:
    """Report the hardware name of the first visible GPU, if any.

    Returns
    -------
    str or None
        The ``"device_name"`` entry of the device details of the first
        visible GPU (e.g., NVIDIA A100). None is returned when no GPU is
        visible, when the entry is missing, or when the query raises
        ``AttributeError`` or ``RuntimeError``.
    """
    try:
        visible_gpus = tf.config.get_visible_devices("GPU")
        if not visible_gpus:
            return None
        first_gpu_info = tf.config.experimental.get_device_details(visible_gpus[0])
        return first_gpu_info.get("device_name")
    except (AttributeError, RuntimeError):
        # The experimental API may be absent or fail in some TF versions;
        # the device name is optional information, so fall back to None.
        return None


class RunOptions:
"""Class with info on how to run training (cluster, MPI and GPU config).
Expand Down
7 changes: 7 additions & 0 deletions deepmd/utils/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ def __call__(self) -> None:
"computing device": self.get_compute_device(),
}
)
device_name = self.get_device_name()
if device_name:
build_info["Device Name"] = device_name
if self.is_built_with_cuda():
env_value = os.environ.get("CUDA_VISIBLE_DEVICES", "unset")
build_info["CUDA_VISIBLE_DEVICES"] = env_value
Expand Down Expand Up @@ -123,6 +126,10 @@ def get_compute_device(self) -> str:
def get_ngpus(self) -> int:
    """Get the number of GPUs.

    No implementation is visible in this body; presumably the concrete
    value is supplied by backend-specific overrides (TODO confirm).

    Returns
    -------
    int
        The number of GPUs.
    """

@abstractmethod
def get_device_name(self) -> str | None:
    """Get the name of the compute device, if one can be determined.

    Returns
    -------
    str or None
        The device name (e.g., NVIDIA A800-SXM4-80GB) if available,
        otherwise None.
    """
    return None

def get_backend_info(self) -> dict:
    """Return backend-specific information entries.

    Returns
    -------
    dict
        An empty dictionary; this implementation contributes no entries.
    """
    no_extra_info: dict = {}
    return no_extra_info