4 changes: 2 additions & 2 deletions deepmd/loggers/training.py
@@ -8,7 +8,7 @@ def format_training_message(
eta: int | None = None,
) -> str:
"""Format a training message."""
-    msg = f"batch {batch:7d}: total wall time = {wall_time:.2f} s"
+    msg = f"Batch {batch:7d}: total wall time = {wall_time:.2f} s"
if isinstance(eta, int):
msg += f", eta = {datetime.timedelta(seconds=int(eta))!s}"
return msg
@@ -29,7 +29,7 @@ def format_training_message_per_task(
# sort rmse
rmse = dict(sorted(rmse.items()))
return (
-        f"batch {batch:7d}: {task_name}"
+        f"Batch {batch:7d}: {task_name}"
f"{', '.join([f'{kk} = {vv:8.2e}' for kk, vv in rmse.items()])}"
f"{lr}"
)
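For reference, a quick sketch of the capitalized output (sample values are made up; the per-task message is capitalized the same way):

format_training_message(batch=1000, wall_time=12.34, eta=3600)
# 'Batch    1000: total wall time = 12.34 s, eta = 1:00:00'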
4 changes: 2 additions & 2 deletions deepmd/pd/entrypoints/main.py
@@ -219,8 +219,8 @@ def get_backend_info(self) -> dict:
op_info = {}
        return {
            "Backend": "Paddle",
-            "PD ver": f"v{paddle.__version__}-g{paddle.version.commit[:11]}",
-            "Enable custom OP": False,
+            "PD Ver": f"v{paddle.__version__}-g{paddle.version.commit[:11]}",
+            "Custom OP Enabled": False,
**op_info,
}
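Illustratively, the renamed Paddle keys yield an info dict along these lines (version and commit values are hypothetical):

{
    "Backend": "Paddle",
    "PD Ver": "v2.6.1-g0a1b2c3d4e5",
    "Custom OP Enabled": False,
}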

10 changes: 5 additions & 5 deletions deepmd/pt/entrypoints/main.py
@@ -239,16 +239,16 @@ def get_backend_info(self) -> dict:
"""Get backend information."""
if ENABLE_CUSTOMIZED_OP:
            op_info = {
-                "build with PT ver": GLOBAL_CONFIG["pt_version"],
-                "build with PT inc": GLOBAL_CONFIG["pt_include_dir"].replace(";", "\n"),
-                "build with PT lib": GLOBAL_CONFIG["pt_libs"].replace(";", "\n"),
+                "Built With PT Ver": GLOBAL_CONFIG["pt_version"],
+                "Built With PT Inc": GLOBAL_CONFIG["pt_include_dir"].replace(";", "\n"),
+                "Built With PT Lib": GLOBAL_CONFIG["pt_libs"].replace(";", "\n"),
}
else:
op_info = {}
        return {
            "Backend": "PyTorch",
-            "PT ver": f"v{torch.__version__}-g{torch.version.git_version[:11]}",
-            "Enable custom OP": ENABLE_CUSTOMIZED_OP,
+            "PT Ver": f"v{torch.__version__}-g{torch.version.git_version[:11]}",
+            "Custom OP Enabled": ENABLE_CUSTOMIZED_OP,
**op_info,
}
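Likewise for PyTorch with custom ops compiled in (all values hypothetical; the "Built With" entries come from op_info):

{
    "Backend": "PyTorch",
    "PT Ver": "v2.3.0-g1f2e3d4c5b6",
    "Custom OP Enabled": True,
    "Built With PT Ver": "2.3.0",
    "Built With PT Inc": "/usr/local/lib/python3.11/site-packages/torch/include",
    "Built With PT Lib": "/usr/local/lib/python3.11/site-packages/torch/lib",
}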

45 changes: 45 additions & 0 deletions deepmd/pt/train/training.py
@@ -721,6 +721,51 @@ def warm_up_linear(step: int, warmup_steps: int) -> float:
self.profiling = training_params.get("profiling", False)
self.profiling_file = training_params.get("profiling_file", "timeline.json")

# Log model summary info (descriptor type and parameter count)
if self.rank == 0:
self._log_model_summary()

def _log_model_summary(self) -> None:
"""Log model summary information including descriptor type and parameter count."""

def get_descriptor_type(model: Any) -> str:
Member: Why annotate `Any`?

Collaborator (author): Since

def get_model_for_wrapper(
    _model_params: dict[str, Any],
    resuming: bool = False,
    _loss_params: dict[str, Any] | None = None,
) -> Any

returns Any.

"""Get the descriptor type name from model."""
# Standard models have get_descriptor method
if hasattr(model, "get_descriptor"):
descriptor = model.get_descriptor()
serialized = descriptor.serialize()
if isinstance(serialized, dict) and "type" in serialized:
return serialized["type"].upper()
# ZBL models: descriptor is in atomic_model.models[0]
if hasattr(model, "atomic_model") and hasattr(model.atomic_model, "models"):
models = model.atomic_model.models
Member: It's not good practice to access an inner attribute like this.

if models: # Check non-empty
dp_model = models[0]
if hasattr(dp_model, "descriptor"):
serialized = dp_model.descriptor.serialize()
if isinstance(serialized, dict) and "type" in serialized:
return serialized["type"].upper() + " (with ZBL)"
return "UNKNOWN"

def count_parameters(model: Any) -> int:
"""Count the total number of trainable parameters."""
return sum(p.numel() for p in model.parameters() if p.requires_grad)

if not self.multi_task:
desc_type = get_descriptor_type(self.model)
num_params = count_parameters(self.model)
log.info("")
log.info(f"Descriptor: {desc_type}")
log.info(f"Model Params: {num_params / 1e6:.3f} M")
else:
# For multi-task, log each model's info
for model_key in self.model_keys:
desc_type = get_descriptor_type(self.model[model_key])
num_params = count_parameters(self.model[model_key])
log.info("")
log.info(f"Descriptor [{model_key}]: {desc_type}")
log.info(f"Model Params [{model_key}]: {num_params / 1e6:.3f} M")
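For illustration, with a single-task DPA1 model of roughly 1.234 M trainable parameters, the new summary lines would appear in the log as (values hypothetical):

Descriptor: DPA1
Model Params: 1.234 M

In multi-task mode the same pair is printed once per model key, e.g. "Descriptor [water_1]: ..." (key name hypothetical).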

def run(self) -> None:
fout = (
open(
8 changes: 4 additions & 4 deletions deepmd/tf/train/run_options.py
@@ -67,10 +67,10 @@ def get_backend_info(self) -> dict:
"""Get backend information."""
        return {
            "Backend": "TensorFlow",
-            "TF ver": tf.version.GIT_VERSION,
-            "build with TF ver": TF_VERSION,
-            "build with TF inc": GLOBAL_CONFIG["tf_include_dir"].replace(";", "\n"),
-            "build with TF lib": GLOBAL_CONFIG["tf_libs"].replace(";", "\n"),
+            "TF Ver": tf.version.GIT_VERSION,
+            "Built With TF Ver": TF_VERSION,
+            "Built With TF Inc": GLOBAL_CONFIG["tf_include_dir"].replace(";", "\n"),
+            "Built With TF Lib": GLOBAL_CONFIG["tf_libs"].replace(";", "\n"),
}
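Illustratively (paths and versions hypothetical), the TensorFlow block now renders as:

{
    "Backend": "TensorFlow",
    "TF Ver": "v2.15.0-rc1-8-g6887368d6d4",
    "Built With TF Ver": "2.15.0",
    "Built With TF Inc": "/usr/local/lib/python3.11/site-packages/tensorflow/include",
    "Built With TF Lib": "/usr/local/lib/python3.11/site-packages/tensorflow",
}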


4 changes: 2 additions & 2 deletions deepmd/utils/data_system.py
@@ -713,9 +713,9 @@ def print_summary(
# width 65
sys_width = 42
    log.info(
-        f"---Summary of DataSystem: {name:13s}-----------------------------------------------"
+        f"---Summary of DataSystem: {name.capitalize():13s}-----------------------------------------------"
    )
-    log.info("found %d system(s):", nsystems)
+    log.info("Found %d System(s):", nsystems)
log.info(
"%s %6s %6s %6s %9s %3s",
_format_name_length("system", sys_width),
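With name="training", the capitalized header and count line would print roughly as follows ({name.capitalize():13s} pads the name to 13 characters; the system count is hypothetical):

---Summary of DataSystem: Training     -----------------------------------------------
Found 2 System(s):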
53 changes: 39 additions & 14 deletions deepmd/utils/summary.py
@@ -47,13 +47,13 @@ class SummaryPrinter(ABC):
)

    BUILD: ClassVar = {
-        "installed to": "\n".join(deepmd.__path__),
-        "source": GLOBAL_CONFIG["git_summ"],
-        "source branch": GLOBAL_CONFIG["git_branch"],
-        "source commit": GLOBAL_CONFIG["git_hash"],
-        "source commit at": GLOBAL_CONFIG["git_date"],
-        "use float prec": global_float_prec,
-        "build variant": GLOBAL_CONFIG["dp_variant"],
+        "Installed To": "\n".join(deepmd.__path__),
+        "Source": GLOBAL_CONFIG["git_summ"],
+        "Source Branch": GLOBAL_CONFIG["git_branch"],
+        "Source Commit": GLOBAL_CONFIG["git_hash"],
+        "Source Commit At": GLOBAL_CONFIG["git_date"],
+        "Float Precision": global_float_prec.capitalize(),
+        "Build Variant": GLOBAL_CONFIG["dp_variant"].upper(),
}

def __call__(self) -> None:
@@ -64,30 +64,55 @@ def __call__(self) -> None:
if len(nodelist) > 1:
build_info.update(
                {
-                    "world size": str(len(nodelist)),
-                    "node list": ", ".join(set(nodelist)),
+                    "World Size": str(len(nodelist)),
+                    "Node List": ", ".join(set(nodelist)),
}
)
build_info.update(
            {
-                "running on": nodename,
-                "computing device": self.get_compute_device(),
+                "Running On": nodename,
+                "Computing Device": self.get_compute_device().upper(),
}
)
backend = build_info.get("Backend")
device_name = None
try:
if backend == "PyTorch":
import torch

if torch.cuda.is_available():
device_name = torch.cuda.get_device_name(0)
elif backend == "TensorFlow":
import tensorflow as tf

gpus = tf.config.list_physical_devices("GPU")
if gpus:
# Use the first physical GPU device identifier as the device name
device_name = gpus[0].name
elif backend == "Paddle":
import paddle

# Use Paddle's current device string (e.g., "gpu:0") as a device identifier
device_name = paddle.get_device()
except Exception:
# Best-effort device name detection; ignore failures silently
pass
if device_name:
build_info["Device Name"] = device_name
if self.is_built_with_cuda():
env_value = os.environ.get("CUDA_VISIBLE_DEVICES", "unset")
build_info["CUDA_VISIBLE_DEVICES"] = env_value
if self.is_built_with_rocm():
env_value = os.environ.get("HIP_VISIBLE_DEVICES", "unset")
build_info["HIP_VISIBLE_DEVICES"] = env_value
if self.is_built_with_cuda() or self.is_built_with_rocm():
-            build_info["Count of visible GPUs"] = str(self.get_ngpus())
+            build_info["Visible GPU Count"] = str(self.get_ngpus())

intra, inter = get_default_nthreads()
build_info.update(
            {
-                "num_intra_threads": str(intra),
-                "num_inter_threads": str(inter),
+                "NUM_INTRA_THREADS": str(intra),
+                "NUM_INTER_THREADS": str(inter),
}
)
# count the maximum characters in the keys and values
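Putting the __call__ changes together, a CUDA build might emit summary entries along these lines (all values hypothetical; "Device Name" comes from the new best-effort detection, and "Computing Device" is the uppercased get_compute_device() string):

Running On: node-001
Computing Device: CUDA:0
Device Name: NVIDIA A100-SXM4-80GB
CUDA_VISIBLE_DEVICES: 0
Visible GPU Count: 1
NUM_INTRA_THREADS: 0
NUM_INTER_THREADS: 0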
144 changes: 144 additions & 0 deletions source/tests/pt/test_model_summary.py
@@ -0,0 +1,144 @@
# SPDX-License-Identifier: LGPL-3.0-or-later
"""Tests for model summary display functions."""

import unittest
from unittest.mock import (
MagicMock,
)

import torch


class TestGetDescriptorType(unittest.TestCase):
"""Test get_descriptor_type helper function."""

@staticmethod
def get_descriptor_type(model):
"""Replicate the logic from training.py for testing."""
# Standard models have get_descriptor method
if hasattr(model, "get_descriptor"):
descriptor = model.get_descriptor()
serialized = descriptor.serialize()
if isinstance(serialized, dict) and "type" in serialized:
return serialized["type"].upper()
# ZBL models: descriptor is in atomic_model.models[0]
if hasattr(model, "atomic_model") and hasattr(model.atomic_model, "models"):
models = model.atomic_model.models
if models: # Check non-empty
dp_model = models[0]
if hasattr(dp_model, "descriptor"):
serialized = dp_model.descriptor.serialize()
if isinstance(serialized, dict) and "type" in serialized:
return serialized["type"].upper() + " (with ZBL)"
return "UNKNOWN"

def test_standard_model(self):
"""Test descriptor type detection for standard models."""
mock_descriptor = MagicMock()
mock_descriptor.serialize.return_value = {"type": "se_e2_a"}

mock_model = MagicMock()
mock_model.get_descriptor.return_value = mock_descriptor

result = self.get_descriptor_type(mock_model)
self.assertEqual(result, "SE_E2_A")

def test_zbl_model(self):
"""Test descriptor type detection for ZBL models."""
mock_descriptor = MagicMock()
mock_descriptor.serialize.return_value = {"type": "dpa1"}

mock_dp_model = MagicMock()
mock_dp_model.descriptor = mock_descriptor

mock_atomic_model = MagicMock()
mock_atomic_model.models = [mock_dp_model]

mock_model = MagicMock(spec=[]) # No get_descriptor
mock_model.atomic_model = mock_atomic_model

result = self.get_descriptor_type(mock_model)
self.assertEqual(result, "DPA1 (with ZBL)")

def test_empty_models_list(self):
"""Test handling of empty models list in ZBL model."""
mock_atomic_model = MagicMock()
mock_atomic_model.models = []

mock_model = MagicMock(spec=[])
mock_model.atomic_model = mock_atomic_model

result = self.get_descriptor_type(mock_model)
self.assertEqual(result, "UNKNOWN")

def test_missing_type_key(self):
"""Test handling of serialize() without 'type' key."""
mock_descriptor = MagicMock()
mock_descriptor.serialize.return_value = {"other_key": "value"}

mock_model = MagicMock()
mock_model.get_descriptor.return_value = mock_descriptor

result = self.get_descriptor_type(mock_model)
self.assertEqual(result, "UNKNOWN")

def test_serialize_returns_non_dict(self):
"""Test handling of serialize() returning non-dict."""
mock_descriptor = MagicMock()
mock_descriptor.serialize.return_value = "not_a_dict"

mock_model = MagicMock()
mock_model.get_descriptor.return_value = mock_descriptor

result = self.get_descriptor_type(mock_model)
self.assertEqual(result, "UNKNOWN")

def test_unknown_model_structure(self):
"""Test handling of unknown model structure."""
mock_model = MagicMock(spec=[]) # No get_descriptor, no atomic_model
result = self.get_descriptor_type(mock_model)
self.assertEqual(result, "UNKNOWN")


class TestCountParameters(unittest.TestCase):
"""Test count_parameters helper function."""

@staticmethod
def count_parameters(model):
"""Replicate the logic from training.py for testing."""
return sum(p.numel() for p in model.parameters() if p.requires_grad)

def test_all_trainable(self):
"""Test counting when all parameters are trainable."""
with torch.device("cpu"):
model = torch.nn.Linear(10, 5) # 10*5 + 5 = 55 parameters
result = self.count_parameters(model)
self.assertEqual(result, 55)

def test_mixed_trainable(self):
"""Test counting with some frozen parameters."""
with torch.device("cpu"):
model = torch.nn.Sequential(
torch.nn.Linear(10, 5), # 55 params
torch.nn.Linear(5, 3), # 18 params
)
# Freeze first layer
for param in model[0].parameters():
param.requires_grad = False

result = self.count_parameters(model)
self.assertEqual(result, 18) # Only second layer

def test_all_frozen(self):
"""Test counting when all parameters are frozen."""
with torch.device("cpu"):
model = torch.nn.Linear(10, 5)
for param in model.parameters():
param.requires_grad = False

result = self.count_parameters(model)
self.assertEqual(result, 0)


if __name__ == "__main__":
unittest.main()
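Because these tests replicate the helper logic rather than importing the trainer, they can be run standalone:

python source/tests/pt/test_model_summary.py
# or, via pytest:
python -m pytest source/tests/pt/test_model_summary.py -v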