Skip to content

Commit ea484e3

Browse files
committed
support option to disable creating gloo pg
Signed-off-by: Ananth Subramaniam <[email protected]>
1 parent 6da3d43 commit ea484e3

File tree

3 files changed

+13
-1
lines changed

3 files changed

+13
-1
lines changed

nemo/tron/config.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,9 @@ class DistributedInitConfig:
212212
use_tp_pp_dp_mapping: bool = False
213213
"""If set, distributed ranks initialize order is changed from tp-dp-pp to tp-pp-dp. Make sure EP and CP aren't used with this option enabled"""
214214

215+
use_gloo_process_groups: bool = True
216+
"""If set, create Gloo process groups for communications."""
217+
215218

216219
@dataclass
217220
class ProfilingConfig:
@@ -543,6 +546,14 @@ def __post_init__(self):
543546
self.model_config.use_cpu_initialization or self.dist_config.lazy_mpu_init
544547
)
545548

549+
# Make sure all functionality that requires Gloo process groups is disabled.
550+
if not self.dist_config.use_gloo_process_groups:
551+
if self.optimizer_config.use_distributed_optimizer:
552+
# If using distributed optimizer, must use distributed checkpointing.
553+
# Legacy checkpointing uses Gloo process groups to collect full distributed
554+
# optimizer state in the CPU memory of DP rank 0.
555+
assert self.checkpoint_config.ckpt_format == "torch_dist"
556+
546557
# Scheduler
547558
if self.scheduler_config.lr_decay_iters is None:
548559
self.scheduler_config.lr_decay_iters = self.train_config.train_iters

nemo/tron/init.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ def _initialize_distributed(
229229
),
230230
get_embedding_ranks=get_embedding_ranks,
231231
get_position_embedding_ranks=get_position_embedding_ranks,
232+
create_gloo_process_groups=cfg.dist_config.use_gloo_process_groups,
232233
)
233234
if get_rank_safe() == 0:
234235
print(

nemo/tron/optim.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020

2121
def setup_optimizer(cfg: ConfigContainer, model, no_weight_decay_cond=None, scale_lr_cond=None, lr_mult=1.0):
22-
optimizer = get_megatron_optimizer(cfg.optimizer_config, model, no_weight_decay_cond, scale_lr_cond, lr_mult)
22+
optimizer = get_megatron_optimizer(cfg.optimizer_config, model, no_weight_decay_cond, scale_lr_cond, lr_mult, use_gloo_process_groups=cfg.dist_config.use_gloo_process_groups)
2323
scheduler = _get_scheduler(cfg, optimizer)
2424

2525
return optimizer, scheduler

0 commit comments

Comments
 (0)