Skip to content
429 changes: 375 additions & 54 deletions deepmd/dpmodel/utils/learning_rate.py

Large diffs are not rendered by default.

20 changes: 4 additions & 16 deletions deepmd/pd/train/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def get_sample():
return get_sample

def get_lr(lr_params: dict[str, Any]) -> BaseLR:
lr_params["stop_steps"] = self.num_steps - self.warmup_steps
lr_params["num_steps"] = self.num_steps
lr_schedule = BaseLR(**lr_params)
return lr_schedule

Expand Down Expand Up @@ -387,11 +387,7 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR:
)

# Learning rate
self.warmup_steps = training_params.get("warmup_steps", 0)
self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0)
assert self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0, (
"Warm up steps must be less than total training steps!"
)
if self.multi_task and config.get("learning_rate_dict", None) is not None:
self.lr_exp = {}
for model_key in self.model_keys:
Expand Down Expand Up @@ -580,18 +576,13 @@ def single_model_finetune(

# TODO add lr warmups for multitask
# author: iProzd
def warm_up_linear(step, warmup_steps):
if step < warmup_steps:
return step / warmup_steps
else:
return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr

# TODO add optimizers for multitask
# author: iProzd
if self.opt_type == "Adam":
self.scheduler = paddle.optimizer.lr.LambdaDecay(
learning_rate=self.lr_exp.start_lr,
lr_lambda=lambda step: warm_up_linear(step, self.warmup_steps),
lr_lambda=lambda step: self.lr_exp.value(step + self.start_step)
/ self.lr_exp.start_lr,
)
self.optimizer = paddle.optimizer.Adam(
learning_rate=self.scheduler, parameters=self.wrapper.parameters()
Expand Down Expand Up @@ -755,10 +746,7 @@ def step(_step_id, task_key="Default") -> None:
fout1.flush()
if self.opt_type == "Adam":
cur_lr = self.scheduler.get_lr()
if _step_id < self.warmup_steps:
pref_lr = _lr.start_lr
else:
pref_lr = cur_lr
pref_lr = cur_lr

# disable synchronization in forward-backward manually
# as derivatives exist in model forward
Expand Down
4 changes: 3 additions & 1 deletion deepmd/pd/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

from .env import (
DEVICE,
GLOBAL_NP_FLOAT_PRECISION,
)
from .env import PRECISION_DICT as PD_PRECISION_DICT

Expand Down Expand Up @@ -239,7 +240,8 @@ def to_numpy_array(
):
if xx is None:
return None
assert xx is not None
if isinstance(xx, (float, int)):
return np.array(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
# Create a reverse mapping of PD_PRECISION_DICT
reverse_precision_dict = {v: k for k, v in PD_PRECISION_DICT.items()}
# Use the reverse mapping to find keys with the desired value
Expand Down
41 changes: 6 additions & 35 deletions deepmd/pt/train/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,7 @@ def get_sample() -> Any:
return get_sample

def get_lr(lr_params: dict[str, Any]) -> BaseLR:
lr_params["stop_steps"] = self.num_steps - self.warmup_steps
lr_params["num_steps"] = self.num_steps
lr_schedule = BaseLR(**lr_params)
return lr_schedule

Expand Down Expand Up @@ -431,27 +431,7 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR:
)

# Learning rate
warmup_steps = training_params.get("warmup_steps", None)
warmup_ratio = training_params.get("warmup_ratio", None)
if warmup_steps is not None:
self.warmup_steps = warmup_steps
elif warmup_ratio is not None:
if not 0 <= warmup_ratio < 1:
raise ValueError(f"warmup_ratio must be in [0, 1), got {warmup_ratio}")
self.warmup_steps = int(warmup_ratio * self.num_steps)
if self.warmup_steps == 0 and warmup_ratio > 0:
log.warning(
f"warmup_ratio {warmup_ratio} results in 0 warmup steps "
f"due to truncation. Consider using a larger ratio or "
f"specify warmup_steps directly."
)
else:
self.warmup_steps = 0
self.warmup_start_factor = training_params.get("warmup_start_factor", 0.0)
self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0)
assert self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0, (
"Warm up steps must be less than total training steps!"
)
if self.multi_task and config.get("learning_rate_dict", None) is not None:
self.lr_exp = {}
for model_key in self.model_keys:
Expand Down Expand Up @@ -697,14 +677,6 @@ def single_model_finetune(

# TODO add lr warmups for multitask
# author: iProzd
def warm_up_linear(step: int, warmup_steps: int) -> float:
if step < warmup_steps:
return self.warmup_start_factor + (1.0 - self.warmup_start_factor) * (
step / warmup_steps
)
else:
return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr

# TODO add optimizers for multitask
# author: iProzd
if self.opt_type in ["Adam", "AdamW"]:
Expand All @@ -725,7 +697,8 @@ def warm_up_linear(step: int, warmup_steps: int) -> float:
self.optimizer.load_state_dict(optimizer_state_dict)
self.scheduler = torch.optim.lr_scheduler.LambdaLR(
self.optimizer,
lambda step: warm_up_linear(step + self.start_step, self.warmup_steps),
lambda step: self.lr_exp.value(step + self.start_step)
/ self.lr_exp.start_lr,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would suggest providing a method for accessing `start_lr`, rather than directly reading the object's data.

)
elif self.opt_type == "LKF":
self.optimizer = LKFOptimizer(
Expand All @@ -748,7 +721,8 @@ def warm_up_linear(step: int, warmup_steps: int) -> float:
self.optimizer.load_state_dict(optimizer_state_dict)
self.scheduler = torch.optim.lr_scheduler.LambdaLR(
self.optimizer,
lambda step: warm_up_linear(step + self.start_step, self.warmup_steps),
lambda step: self.lr_exp.value(step + self.start_step)
/ self.lr_exp.start_lr,
)
else:
raise ValueError(f"Not supported optimizer type '{self.opt_type}'")
Expand Down Expand Up @@ -822,10 +796,7 @@ def step(_step_id: int, task_key: str = "Default") -> None:
fout1.flush()
if self.opt_type in ["Adam", "AdamW", "AdaMuon"]:
cur_lr = self.scheduler.get_last_lr()[0]
if _step_id < self.warmup_steps:
pref_lr = _lr.start_lr
else:
pref_lr = cur_lr
pref_lr = cur_lr
model_pred, loss, more_loss = self.wrapper(
**input_dict, cur_lr=pref_lr, label=label_dict, task_key=task_key
)
Expand Down
9 changes: 7 additions & 2 deletions deepmd/pt/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

from .env import (
DEVICE,
GLOBAL_NP_FLOAT_PRECISION,
)
from .env import PRECISION_DICT as PT_PRECISION_DICT

Expand Down Expand Up @@ -227,18 +228,22 @@ def to_numpy_array(xx: None) -> None: ...


def to_numpy_array(
xx: torch.Tensor | None,
xx: torch.Tensor | np.ndarray | float | None,
) -> np.ndarray | None:
if xx is None:
return None
assert xx is not None
if isinstance(xx, (float, int)):
return np.array(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
if isinstance(xx, np.ndarray):
return xx.astype(GLOBAL_NP_FLOAT_PRECISION)
# Create a reverse mapping of PT_PRECISION_DICT
reverse_precision_dict = {v: k for k, v in PT_PRECISION_DICT.items()}
# Use the reverse mapping to find keys with the desired value
prec = reverse_precision_dict.get(xx.dtype, None)
prec = NP_PRECISION_DICT.get(prec, None)
if prec is None:
raise ValueError(f"unknown precision {xx.dtype}")
assert isinstance(xx, torch.Tensor)
if xx.dtype == torch.bfloat16:
# https://github.com/pytorch/pytorch/issues/109873
xx = xx.float()
Expand Down
2 changes: 1 addition & 1 deletion deepmd/tf/fit/dipole.py
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
----------
loss : dict
the loss dict
lr : LearningRateExp
lr : LearningRateSchedule
the learning rate

Returns
Expand Down
2 changes: 1 addition & 1 deletion deepmd/tf/fit/dos.py
Original file line number Diff line number Diff line change
Expand Up @@ -655,7 +655,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
----------
loss : dict
the loss dict
lr : LearningRateExp
lr : LearningRateSchedule
the learning rate

Returns
Expand Down
2 changes: 1 addition & 1 deletion deepmd/tf/fit/ener.py
Original file line number Diff line number Diff line change
Expand Up @@ -856,7 +856,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
----------
loss : dict
The loss function parameters.
lr : LearningRateExp
lr : LearningRateSchedule
The learning rate.

Returns
Expand Down
2 changes: 1 addition & 1 deletion deepmd/tf/fit/fitting.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
----------
loss : dict
the loss dict
lr : LearningRateExp
lr : LearningRateSchedule
the learning rate

Returns
Expand Down
2 changes: 1 addition & 1 deletion deepmd/tf/fit/polar.py
Original file line number Diff line number Diff line change
Expand Up @@ -863,7 +863,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
----------
loss : dict
the loss dict
lr : LearningRateExp
lr : LearningRateSchedule
the learning rate

Returns
Expand Down
52 changes: 32 additions & 20 deletions deepmd/tf/train/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
import os
import shutil
import time
from typing import (
Any,
)

import google.protobuf.message
import numpy as np
Expand Down Expand Up @@ -52,7 +55,7 @@
load_graph_def,
)
from deepmd.tf.utils.learning_rate import (
LearningRateExp,
LearningRateSchedule,
)
from deepmd.tf.utils.sess import (
run_sess,
Expand Down Expand Up @@ -100,21 +103,18 @@ def _init_param(self, jdata) -> None:
self.model = Model(**model_param)
self.fitting = self.model.get_fitting()

def get_lr_and_coef(lr_param):
def get_lr_and_coef(
lr_param: dict[str, Any],
) -> tuple[LearningRateSchedule, float]:
scale_by_worker = lr_param.get("scale_by_worker", "linear")
if scale_by_worker == "linear":
scale_lr_coef = float(self.run_opt.world_size)
elif scale_by_worker == "sqrt":
scale_lr_coef = np.sqrt(self.run_opt.world_size).real
else:
scale_lr_coef = 1.0
lr_type = lr_param.get("type", "exp")
if lr_type == "exp":
lr = LearningRateExp(
lr_param["start_lr"], lr_param["stop_lr"], lr_param["decay_steps"]
)
else:
raise RuntimeError("unknown learning_rate type " + lr_type)
lr_params = {k: v for k, v in lr_param.items() if k != "scale_by_worker"}
lr = LearningRateSchedule(lr_params)
return lr, scale_lr_coef

# learning rate
Expand Down Expand Up @@ -242,8 +242,13 @@ def build(self, data=None, stop_batch=0, origin_type_map=None, suffix="") -> Non
def _build_lr(self) -> None:
self._extra_train_ops = []
self.global_step = tf.train.get_or_create_global_step()
self.learning_rate = self.lr.build(self.global_step, self.stop_batch)
log.info("built lr")
if self.stop_batch == 0:
# Use constant start_lr when stop_batch is zero (no training)
self.learning_rate = tf.cast(self.lr.start_lr(), GLOBAL_TF_FLOAT_PRECISION)
log.info("built lr (constant start_lr for stop_batch=0)")
else:
self.learning_rate = self.lr.build(self.global_step, self.stop_batch)
log.info("built lr")

def _build_loss(self):
if self.stop_batch == 0:
Expand Down Expand Up @@ -426,14 +431,21 @@ def train(self, train_data=None, valid_data=None) -> None:
elapsed_batch = stop_batch - start_batch
is_first_step = True
self.cur_batch = cur_batch
log.info(
"start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e",
run_sess(self.sess, self.learning_rate),
self.lr.value(cur_batch),
self.lr.decay_steps_,
self.lr.decay_rate_,
self.lr.value(stop_batch),
)
if stop_batch == 0:
lr0 = self.lr.start_lr()
log.info(
"start training at lr %.2e (== %.2e), final lr will be %.2e",
run_sess(self.sess, self.learning_rate),
lr0,
lr0,
)
else:
log.info(
"start training at lr %.2e (== %.2e), final lr will be %.2e",
run_sess(self.sess, self.learning_rate),
self.lr.value(cur_batch),
self.lr.value(stop_batch),
)

prf_options = None
prf_run_metadata = None
Expand Down Expand Up @@ -797,7 +809,7 @@ def _get_place_holders(self, data_dict) -> None:
prec = GLOBAL_ENER_FLOAT_PRECISION
self.place_holders[kk] = tf.placeholder(prec, [None], name="t_" + kk)
self.place_holders["find_" + kk] = tf.placeholder(
tf.float32, name="t_find_" + kk
GLOBAL_TF_FLOAT_PRECISION, name="t_find_" + kk
)

def _init_from_frz_model(self) -> None:
Expand Down
4 changes: 2 additions & 2 deletions deepmd/tf/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
DeepmdDataSystem,
)
from .learning_rate import (
LearningRateExp,
LearningRateSchedule,
)
from .pair_tab import (
PairTab,
Expand All @@ -20,7 +20,7 @@
__all__ = [
"DeepmdData",
"DeepmdDataSystem",
"LearningRateExp",
"LearningRateSchedule",
"PairTab",
"Plugin",
"PluginVariant",
Expand Down
Loading