
Commit 52cd68d

Authored by guyueh1 and terrykong
feat: Using mcore cpu optimizer (#1242)
Signed-off-by: Guyue Huang <guyueh@nvidia.com>
Signed-off-by: Guyue Huang <140554423+guyueh1@users.noreply.github.com>
Co-authored-by: Terry Kong <terrycurtiskong@gmail.com>
1 parent d726c38 commit 52cd68d
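
All of the config and test changes below add the new knobs with offload disabled by default. A minimal opt-in sketch, assuming the keys live under policy.megatron_cfg.optimizer as the diffs show; note that the worker change in this commit asserts optimizer_offload_fraction must be 1.0 whenever optimizer_cpu_offload is enabled:

# Hypothetical override sketch (not part of this commit): enable the mcore CPU optimizer
# by flipping the two new keys; partial (hybrid) offload is rejected by the new assertion.
overrides = {
    "policy": {
        "megatron_cfg": {
            "optimizer": {
                "optimizer_cpu_offload": True,
                "optimizer_offload_fraction": 1.0,
            }
        }
    }
}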

File tree

10 files changed: +55 −2 lines changed

examples/configs/dpo.yaml (4 additions, 0 deletions)

@@ -136,6 +136,10 @@ policy:
 
       clip_grad: ${policy.max_grad_norm}
 
+      # optimizer cpu offload
+      optimizer_cpu_offload: false
+      optimizer_offload_fraction: 0.0
+
     scheduler:
       start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
       end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}

examples/configs/grpo_math_1B.yaml (4 additions, 0 deletions)

@@ -110,6 +110,10 @@ policy:
 
       clip_grad: ${policy.max_grad_norm}
 
+      # optimizer cpu offload
+      optimizer_cpu_offload: false
+      optimizer_offload_fraction: 0.0
+
     scheduler:
       start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
      end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}

examples/configs/rm.yaml (4 additions, 0 deletions)

@@ -105,6 +105,10 @@ policy:
 
       clip_grad: ${policy.max_grad_norm}
 
+      # optimizer cpu offload
+      optimizer_cpu_offload: false
+      optimizer_offload_fraction: 0.0
+
     scheduler:
       start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
       end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}

examples/configs/sft.yaml (4 additions, 0 deletions)

@@ -114,6 +114,10 @@ policy:
 
       clip_grad: ${policy.max_grad_norm}
 
+      # optimizer cpu offload
+      optimizer_cpu_offload: false
+      optimizer_offload_fraction: 0.0
+
     scheduler:
       start_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}
       end_weight_decay: ${policy.megatron_cfg.optimizer.weight_decay}

examples/configs/sft_openmathinstruct2_megatron.yaml (4 additions, 0 deletions)

@@ -62,6 +62,10 @@ policy:
       use_precision_aware_optimizer: false #true ## TODO: precision aware optim not working with fp8. Is this expected?
       weight_decay: 0.01
 
+      # optimizer cpu offload
+      optimizer_cpu_offload: false
+      optimizer_offload_fraction: 0.0
+
     ## recently introduced, our current mcore commit doesn't have this
     #fp8_recipe: delayed
 

nemo_rl/models/policy/__init__.py (5 additions, 0 deletions)

@@ -61,6 +61,11 @@ class MegatronOptimizerConfig(TypedDict):
     use_distributed_optimizer: bool
     use_precision_aware_optimizer: bool
     clip_grad: float
+    # knob to enable optimizer cpu offload
+    optimizer_cpu_offload: bool
+    # knob to set the fraction of parameters to keep on CPU
+    # currently if optimizer_cpu_offload is true, this knob must be 1.0
+    optimizer_offload_fraction: float
 
 
 class MegatronSchedulerConfig(TypedDict):
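
The two comments above double as the contract for the new fields. A small standalone sketch of that contract, mirroring the assertion added to megatron_policy_worker.py below (the helper name is illustrative, not part of the repo):

def validate_offload_settings(optimizer_cfg: dict) -> None:
    # Illustrative helper: hybrid (partly-GPU, partly-CPU) optimizer state is not
    # supported because it conflicts with NeMo RL's optimizer offload/onload between
    # generation and training, so a CPU-offloaded optimizer must offload everything.
    if optimizer_cfg.get("optimizer_cpu_offload", False):
        assert optimizer_cfg.get("optimizer_offload_fraction") == 1.0, (
            "optimizer_offload_fraction must be 1.0 when optimizer_cpu_offload is enabled"
        )

validate_offload_settings({"optimizer_cpu_offload": True, "optimizer_offload_fraction": 1.0})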

nemo_rl/models/policy/megatron_policy_worker.py (23 additions, 2 deletions)

@@ -618,6 +618,19 @@ def __init__(
             "Refer to https://github.com/NVIDIA-NeMo/RL/issues/1164 for latest updates with this issue."
         )
 
+        optimizer_cpu_offload = self.cfg["megatron_cfg"]["optimizer"][
+            "optimizer_cpu_offload"
+        ]
+        optimizer_offload_fraction = self.cfg["megatron_cfg"]["optimizer"][
+            "optimizer_offload_fraction"
+        ]
+        if optimizer_cpu_offload:
+            # Currently, hybrid optimizer (partly on GPU and partly on CPU) is not supported because it conflicts with the way
+            # Nemo-rl handles the optimizer offload/onload between generation and training. So if using CPU optimizer the offload_fraction should be 1.0.
+            assert optimizer_offload_fraction == 1.0, (
+                "Currently for optimizer offloading, only optimizer_offload_fraction=1.0 is supported"
+            )
+
         checkpoint_config = CheckpointConfig(
             save_interval=100,
             save=weights_path,

@@ -1759,7 +1772,11 @@ def prepare_for_training(self, *args, **kwargs):
         self.model.train()
 
         # Move optimizer state to CUDA if it exists
-        if hasattr(self, "optimizer") and self.optimizer is not None:
+        if (
+            hasattr(self, "optimizer")
+            and self.optimizer is not None
+            and (not self.cfg["megatron_cfg"]["optimizer"]["optimizer_cpu_offload"])
+        ):
             if isinstance(self.optimizer, ChainedOptimizer):
                 optimizer_state = self.optimizer.state
             else:

@@ -1786,7 +1803,11 @@ def offload_before_refit(self):
             self.model, "cpu", move_params=False, move_grads=True
         ) # get rid of grad buffers
         torch.randn(1).cuda() # wake up torch allocator
-        if hasattr(self, "optimizer") and self.optimizer is not None:
+        if (
+            hasattr(self, "optimizer")
+            and self.optimizer is not None
+            and (not self.cfg["megatron_cfg"]["optimizer"]["optimizer_cpu_offload"])
+        ):
             # Iterate through the state dictionaries for each parameter group
             if isinstance(self.optimizer, ChainedOptimizer):
                 optimizer_state = self.optimizer.state
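
The hunks above read and validate the new knobs and skip moving optimizer state on and off the GPU when CPU offload is active; the values are presumably forwarded into Megatron-Core's optimizer setup elsewhere in the worker (not shown in this commit). A rough sketch of that hand-off, assuming megatron.core.optimizer.OptimizerConfig exposes matching optimizer_cpu_offload and optimizer_offload_fraction fields in the pinned mcore version:

# Rough sketch, not taken from this commit; the offload-related OptimizerConfig field
# names are assumptions and should be checked against the pinned megatron.core version.
from megatron.core.optimizer import OptimizerConfig

def build_mcore_optimizer_config(optimizer_cfg: dict) -> OptimizerConfig:
    return OptimizerConfig(
        lr=optimizer_cfg["lr"],
        weight_decay=optimizer_cfg["weight_decay"],
        clip_grad=optimizer_cfg["clip_grad"],
        use_distributed_optimizer=optimizer_cfg["use_distributed_optimizer"],
        use_precision_aware_optimizer=optimizer_cfg["use_precision_aware_optimizer"],
        # New knobs: keep the full optimizer state on CPU (fraction must be 1.0).
        optimizer_cpu_offload=optimizer_cfg["optimizer_cpu_offload"],
        optimizer_offload_fraction=optimizer_cfg["optimizer_offload_fraction"],
    )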

tests/unit/models/generation/test_vllm_generation.py (2 additions, 0 deletions)

@@ -192,6 +192,8 @@ def get_basic_megatron_test_config(
                 "use_distributed_optimizer": True,
                 "use_precision_aware_optimizer": True,
                 "clip_grad": 1.0,
+                "optimizer_cpu_offload": False,
+                "optimizer_offload_fraction": 0.0,
             },
             "scheduler": {
                 "start_weight_decay": 0.01,

tests/unit/models/policy/test_megatron_worker.py (2 additions, 0 deletions)

@@ -114,6 +114,8 @@ def create_megatron_test_config(
                 "use_distributed_optimizer": True,
                 "use_precision_aware_optimizer": True,
                 "clip_grad": 1.0,
+                "optimizer_cpu_offload": False,
+                "optimizer_offload_fraction": 0.0,
             },
             "scheduler": {
                 "start_weight_decay": 0.01,

tools/refit_verifier.py (3 additions, 0 deletions)

@@ -232,6 +232,9 @@ def setup_configs(args, tokenizer):
                 "use_distributed_optimizer": True,
                 "use_precision_aware_optimizer": True,
                 "clip_grad": 1.0,
+                # Optimizer CPU offload settings
+                "optimizer_cpu_offload": False,
+                "optimizer_offload_fraction": 0.0,
             },
             "scheduler": {
                 "start_weight_decay": 0.01,
