
Commit f3e6cc8

Fix checkpoint converter missing parallel group initialization (#3217)
1 parent: adce147

3 files changed: 47 additions & 20 deletions

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -64,7 +64,7 @@ Download = "https://github.com/NVIDIA/Megatron-LM/releases"
 Homepage = "https://github.com/NVIDIA/Megatron-LM/megatron/core"
 
 [project.optional-dependencies]
-mlm = ["flask-restful", "sentencepiece", "tiktoken", "wandb", "transformers"]
+mlm = ["flask-restful", "sentencepiece", "tiktoken", "wandb", "transformers", "accelerate"]
 
 dev = [
     "nvidia-modelopt[torch]; sys_platform != 'darwin'",

tools/checkpoint/saver_base.py

Lines changed: 6 additions & 0 deletions
@@ -170,9 +170,15 @@ def initialize_megatron_env(self):
 
         # For backward compatibility during local parallel states refactoring
         fake_tp_group = _ConverterFakeProcessGroup(size=self.args.target_tensor_parallel_size)
+        fake_pp_group = _ConverterFakeProcessGroup(size=self.args.target_pipeline_parallel_size)
         fake_ep_group = _ConverterFakeProcessGroup(size=self.args.target_expert_parallel_size)
+        fake_dp_group = _ConverterFakeProcessGroup(size=1)
         mpu._TENSOR_MODEL_PARALLEL_GROUP = fake_tp_group
+        mpu._PIPELINE_MODEL_PARALLEL_GROUP = fake_pp_group
         mpu._EXPERT_MODEL_PARALLEL_GROUP = fake_ep_group
+        mpu._DATA_PARALLEL_GROUP = fake_dp_group
+        mpu._DATA_PARALLEL_GROUP_WITH_CP = fake_dp_group
+        mpu._INTRA_PARTIAL_DATA_PARALLEL_GROUP_WITH_CP = fake_dp_group
         fused_kernels.load(self.margs)
 
         try:
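Why this matters: the converter runs in a single process, while Megatron's parallel-state (mpu) queries expect a group object for each parallelism axis, so leaving the pipeline- and data-parallel globals unset would presumably fail the first time anything looks them up. Below is a minimal, self-contained sketch of the fake-group pattern; FakeProcessGroup is a hypothetical stand-in for _ConverterFakeProcessGroup, whose definition is not part of this diff:

# Hypothetical stand-in for _ConverterFakeProcessGroup: it mimics the small
# subset of torch.distributed.ProcessGroup that world-size/rank queries use,
# so a single-process tool can answer "how big is this parallel group?"
# without torch.distributed ever being initialized.
class FakeProcessGroup:
    def __init__(self, size: int, rank: int = 0):
        self._size = size   # pretend world size of the group
        self._rank = rank   # this process's rank within the group

    def size(self) -> int:
        return self._size

    def rank(self) -> int:
        return self._rank


# The converter shards over tensor/pipeline/expert parallelism but runs as
# one process, so the data-parallel groups are trivially size 1 -- the same
# choice the commit makes for the DP, DP-with-CP, and intra-partial-DP
# globals above.
fake_pp_group = FakeProcessGroup(size=4)  # e.g. target_pipeline_parallel_size
fake_dp_group = FakeProcessGroup(size=1)

assert fake_pp_group.size() == 4
assert fake_dp_group.size() == 1 and fake_dp_group.rank() == 0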

uv.lock

Lines changed: 40 additions & 19 deletions
Generated lockfile; diff not rendered by default.
