Skip to content
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions deepmd/pd/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,23 @@ def get_backend_info(self) -> dict:
**op_info,
}

def get_device_name(self) -> str | None:
    """Look up the GPU name through Paddle's CUDA device properties.

    Every lookup step is optional: if Paddle is not compiled with CUDA,
    ``paddle.device`` has no ``cuda`` attribute, no device is counted, or
    ``get_device_properties`` is not a callable attribute, no name is
    reported.

    Returns
    -------
    str or None
        The ``name`` attribute of the properties of device 0 if it can be
        obtained, otherwise None.
    """
    if not paddle.device.is_compiled_with_cuda():
        return None

    # Resolved via getattr so a missing ``cuda`` submodule yields None
    # instead of raising AttributeError.
    cuda = getattr(paddle.device, "cuda", None)
    if cuda is None or cuda.device_count() < 1:
        return None

    query_properties = getattr(cuda, "get_device_properties", None)
    if not callable(query_properties):
        return None

    # Only device index 0 is queried.
    return getattr(query_properties(0), "name", None)


def train(
input_file: str,
Expand Down
12 changes: 12 additions & 0 deletions deepmd/pt/entrypoints/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,18 @@ def get_backend_info(self) -> dict:
**op_info,
}

def get_device_name(self) -> str | None:
"""Use PyTorch's current device name as the device identifier.

Returns
-------
str or None
The device name if available, otherwise None.
"""
if torch.cuda.is_available():
return torch.cuda.get_device_name(torch.cuda.current_device())
return None


def train(
input_file: str,
Expand Down
15 changes: 15 additions & 0 deletions deepmd/tf/train/run_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,21 @@ def get_backend_info(self) -> dict:
"build with TF lib": GLOBAL_CONFIG["tf_libs"].replace(";", "\n"),
}

def get_device_name(self) -> str | None:
    """Report a name for the first physical GPU known to TensorFlow.

    The ``"device_name"`` entry of the device details is preferred; when it
    is missing or empty, the ``name`` attribute of the physical device is
    used instead.

    Returns
    -------
    str or None
        The device name if at least one physical GPU is listed,
        otherwise None.
    """
    physical_gpus = tf.config.list_physical_devices("GPU")
    if not physical_gpus:
        return None

    # Only the first listed GPU is inspected.
    first_gpu = physical_gpus[0]
    details = tf.config.experimental.get_device_details(first_gpu)
    hardware_name = details.get("device_name")
    if hardware_name:
        return hardware_name
    # Fall back to the physical device's own name attribute.
    return first_gpu.name


class RunOptions:
"""Class with info on how to run training (cluster, MPI and GPU config).
Expand Down
13 changes: 13 additions & 0 deletions deepmd/utils/summary.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ def __call__(self) -> None:
"computing device": self.get_compute_device(),
}
)
device_name = self.get_device_name()
if device_name:
build_info["Device Name"] = device_name
if self.is_built_with_cuda():
env_value = os.environ.get("CUDA_VISIBLE_DEVICES", "unset")
build_info["CUDA_VISIBLE_DEVICES"] = env_value
Expand Down Expand Up @@ -126,3 +129,13 @@ def get_ngpus(self) -> int:
def get_backend_info(self) -> dict:
    """Get backend information.

    Returns
    -------
    dict
        Backend information; this implementation provides none and
        returns an empty dict.
    """
    backend_info: dict = {}
    return backend_info

def get_device_name(self) -> str | None:
"""Get the device name (e.g., NVIDIA A800-SXM4-80GB).

Returns
-------
str or None
The device name if available, otherwise None.
"""
return None