Skip to content

Commit 982ec64

Browse files
committed
rename tests and examples
Signed-off-by: yaoyu-33 <yaoyu.094@gmail.com>
1 parent fa5af7e commit 982ec64

File tree

5 files changed

+47
-48
lines changed

5 files changed

+47
-48
lines changed

examples/recipes/local_parallel_groups/README.md renamed to examples/recipes/decentralized_pg/README.md

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
# Local Parallel Groups Examples
1+
# Decentralized Process Groups Examples
22

3-
This directory contains examples demonstrating how to use **local parallel groups** (`use_decentralized_pg=True`) in Megatron-Bridge for distributed training.
3+
This directory contains examples demonstrating how to use **decentralized process groups** (`use_decentralized_pg=True`) in Megatron-Bridge for distributed training.
44

55
## Overview
66

@@ -15,20 +15,20 @@ Instead of relying on Megatron-Core's global parallel state (mpu) module, you ca
1515
| File | Description |
1616
|------|-------------|
1717
| `pretrain_qwen3_simple.py` | **Simple**: Use a recipe and enable `use_decentralized_pg=True` |
18-
| `pretrain_qwen3_with_local_parallel_groups.py` | **Advanced**: Manually create process groups with `HyperCommGrid` |
18+
| `pretrain_qwen3_with_decentralized_pg.py` | **Advanced**: Manually create process groups with `HyperCommGrid` |
1919

2020
## Quick Start
2121

2222
### Simple Approach (Recommended)
2323

24-
Just use an existing recipe and enable local parallel groups:
24+
Just use an existing recipe and enable decentralized process groups:
2525

2626
```bash
2727
# 8 GPUs: TP2 x PP2 x DP2
28-
torchrun --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_simple.py
28+
uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
2929

30-
# Or with uv
31-
uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_simple.py
30+
# 4 GPUs: TP2 x PP2 x DP1
31+
uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
3232
```
3333

3434
The key is just two lines:
@@ -42,7 +42,7 @@ cfg = qwen3_4b_pretrain_config(
4242
# ... other settings
4343
)
4444

45-
# Enable local parallel groups
45+
# Enable decentralized process groups
4646
cfg.dist.use_decentralized_pg = True
4747
cfg.dist.use_gloo_process_groups = False # Gloo not supported
4848
```
@@ -53,14 +53,14 @@ For full control over process groups:
5353

5454
```bash
5555
# 8 GPUs: TP2 x PP2 x DP2
56-
torchrun --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py
56+
uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
5757

5858
# 4 GPUs: TP2 x PP2 x DP1
59-
torchrun --nproc_per_node=4 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py \
59+
uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
6060
--tp-size 2 --pp-size 2
6161

6262
# 2 GPUs: TP2 x PP1 x DP1
63-
torchrun --nproc_per_node=2 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py \
63+
uv run python -m torch.distributed.run --nproc_per_node=2 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
6464
--tp-size 2 --pp-size 1
6565
```
6666

examples/recipes/local_parallel_groups/pretrain_qwen3_simple.py renamed to examples/recipes/decentralized_pg/pretrain_qwen3_simple.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@
1515

1616
"""
1717
==============================================================================
18-
Example: Qwen3 Pretraining with Local Parallel Groups (Simple)
18+
Example: Qwen3 Pretraining with Decentralized Process Groups (Simple)
1919
==============================================================================
2020
21-
This example demonstrates the simplest way to enable local parallel groups:
21+
This example demonstrates the simplest way to enable decentralized process groups:
2222
just use an existing recipe and set `cfg.dist.use_decentralized_pg = True`.
2323
2424
The setup() function inside pretrain() will automatically create the
@@ -27,12 +27,10 @@
2727
How to Run
2828
----------
2929
# 8 GPUs: TP2 x PP2 x DP2
30-
torchrun --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_simple.py
31-
32-
uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_simple.py
30+
uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
3331
3432
# 4 GPUs: TP2 x PP2 x DP1
35-
torchrun --nproc_per_node=4 examples/recipes/local_parallel_groups/pretrain_qwen3_simple.py
33+
uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
3634
"""
3735

3836
import torch
@@ -43,7 +41,7 @@
4341

4442

4543
def main() -> None:
46-
"""Run Qwen3 pretraining with local parallel groups enabled."""
44+
"""Run Qwen3 pretraining with decentralized process groups enabled."""
4745
# Get the standard Qwen3 4B pretrain config with overrides
4846
cfg = qwen3_4b_pretrain_config(
4947
# Use mock data for demo
@@ -60,12 +58,14 @@ def main() -> None:
6058
lr_warmup_iters=10,
6159
lr_decay_iters=100,
6260
)
61+
# known issue with share_embeddings_and_output_weights
62+
cfg.model.share_embeddings_and_output_weights = False
6363

6464
# =========================================================================
65-
# KEY: Enable local parallel groups
65+
# KEY: Enable decentralized process groups
6666
# =========================================================================
6767
cfg.dist.use_decentralized_pg = True
68-
cfg.dist.use_gloo_process_groups = False # Gloo not supported with local PG
68+
cfg.dist.use_gloo_process_groups = False # Gloo not supported with decentralized PG
6969

7070
pretrain(config=cfg, forward_step_func=forward_step)
7171

examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py renamed to examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515

1616
"""
1717
==============================================================================
18-
Example: Qwen3 Pretraining with Local Parallel Groups (Advanced/Manual)
18+
Example: Qwen3 Pretraining with Decentralized Process Groups (Advanced/Manual)
1919
==============================================================================
2020
2121
This example demonstrates how to MANUALLY create process groups using
@@ -37,16 +37,14 @@
3737
How to Run
3838
----------
3939
# 8 GPUs: TP2 x PP2 x DP2
40-
torchrun --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py
41-
42-
uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py
40+
uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
4341
4442
# 4 GPUs: TP2 x PP2 x DP1
45-
torchrun --nproc_per_node=4 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py \
43+
uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
4644
--tp-size 2 --pp-size 2
4745
4846
# 2 GPUs: TP2 x PP1 x DP1
49-
torchrun --nproc_per_node=2 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py \
47+
uv run python -m torch.distributed.run --nproc_per_node=2 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
5048
--tp-size 2 --pp-size 1
5149
"""
5250

@@ -95,7 +93,7 @@
9593

9694
def parse_args() -> argparse.Namespace:
9795
"""Parse command-line arguments."""
98-
parser = argparse.ArgumentParser(description="Qwen3 Pretraining with Manual Local Parallel Groups")
96+
parser = argparse.ArgumentParser(description="Qwen3 Pretraining with Manual Decentralized Process Groups")
9997

10098
# Parallelism settings
10199
parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size (default: 2)")
@@ -292,7 +290,7 @@ def create_process_group_collection(
292290
# Build the ProcessGroupCollection
293291
# ===========================================================================
294292
# This is the single object that contains ALL process groups and gets
295-
# passed through function calls in local parallel groups mode.
293+
# passed through function calls in decentralized process groups mode.
296294
pg_collection = ProcessGroupCollection(
297295
# Core parallelism groups
298296
tp=tp_pg,
@@ -420,7 +418,7 @@ def run_training(args: argparse.Namespace, pg_collection: ProcessGroupCollection
420418
# ===========================================================================
421419
# Create output directories
422420
# ===========================================================================
423-
base_dir = tempfile.mkdtemp(prefix="mbridge_local_pg_")
421+
base_dir = tempfile.mkdtemp(prefix="mbridge_decentralized_pg_")
424422
checkpoint_dir = os.path.join(base_dir, "checkpoints")
425423
tensorboard_dir = os.path.join(base_dir, "tensorboard")
426424

@@ -498,7 +496,7 @@ def run_training(args: argparse.Namespace, pg_collection: ProcessGroupCollection
498496
# managing process groups ourselves via pg_collection
499497
dist_cfg = DistributedInitConfig(
500498
use_decentralized_pg=True,
501-
use_gloo_process_groups=False, # Gloo not supported with local PG
499+
use_gloo_process_groups=False, # Gloo not supported with decentralized PG
502500
)
503501

504502
dataset_cfg = MockGPTDatasetConfig(
@@ -635,7 +633,7 @@ def main() -> None:
635633
args = parse_args()
636634

637635
print_rank_0("=" * 70)
638-
print_rank_0("Qwen3 Pretraining with MANUALLY Created Local Parallel Groups")
636+
print_rank_0("Qwen3 Pretraining with MANUALLY Created Decentralized Process Groups")
639637
print_rank_0("=" * 70)
640638
print_rank_0("")
641639
print_rank_0("This example shows how to:")

tests/functional_tests/training/test_local_parallel_groups.py renamed to tests/functional_tests/training/test_decentralized_pg.py

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -68,13 +68,13 @@ def cleanup_megatron_state():
6868
pass
6969

7070

71-
class TestLocalParallelGroupsPretrain:
71+
class TestDecentralizedPgPretrain:
7272
"""
7373
Functional tests for pretraining with use_decentralized_pg enabled.
7474
"""
7575

7676
@pytest.mark.run_only_on("GPU")
77-
def test_pretrain_with_local_parallel_groups(self, tmp_path):
77+
def test_pretrain_with_decentralized_pg(self, tmp_path):
7878
"""
7979
Test end to end training with use_decentralized_pg=True.
8080
@@ -199,7 +199,7 @@ def test_pretrain_with_local_parallel_groups(self, tmp_path):
199199
clear_directories(tmp_path)
200200

201201
@pytest.mark.run_only_on("GPU")
202-
def test_pretrain_with_local_parallel_groups_disabled(self, tmp_path):
202+
def test_pretrain_with_decentralized_pg_disabled(self, tmp_path):
203203
"""
204204
Test end to end training with use_decentralized_pg=False (default).
205205
@@ -322,12 +322,13 @@ def test_pretrain_with_local_parallel_groups_disabled(self, tmp_path):
322322
finally:
323323
clear_directories(tmp_path)
324324

325+
#
325326
@pytest.mark.run_only_on("GPU")
326-
def test_pretrain_with_local_parallel_groups_and_pp(self, tmp_path):
327+
def test_pretrain_with_decentralized_pg_and_pp(self, tmp_path):
327328
"""
328329
Test training with use_decentralized_pg=True and pipeline parallelism.
329330
330-
This test verifies that the local parallel groups feature works correctly
331+
This test verifies that the decentralized process groups feature works correctly
331332
with pipeline parallelism enabled.
332333
"""
333334
initialize_distributed()
@@ -453,11 +454,11 @@ def test_pretrain_with_local_parallel_groups_and_pp(self, tmp_path):
453454
clear_directories(tmp_path)
454455

455456
@pytest.mark.run_only_on("GPU")
456-
def test_pretrain_with_local_parallel_groups_and_cp(self, tmp_path):
457+
def test_pretrain_with_decentralized_pg_and_cp(self, tmp_path):
457458
"""
458459
Test training with use_decentralized_pg=True and context parallelism.
459460
460-
This test verifies that the local parallel groups feature works correctly
461+
This test verifies that the decentralized process groups feature works correctly
461462
with context parallelism enabled.
462463
"""
463464
initialize_distributed()
@@ -583,11 +584,11 @@ def test_pretrain_with_local_parallel_groups_and_cp(self, tmp_path):
583584
clear_directories(tmp_path)
584585

585586
@pytest.mark.run_only_on("GPU")
586-
def test_pretrain_with_local_parallel_groups_combined_parallelism(self, tmp_path):
587+
def test_pretrain_with_decentralized_pg_combined_parallelism(self, tmp_path):
587588
"""
588589
Test training with use_decentralized_pg=True and combined TP+PP.
589590
590-
This test verifies that the local parallel groups feature works correctly
591+
This test verifies that the decentralized process groups feature works correctly
591592
with multiple forms of parallelism enabled simultaneously.
592593
"""
593594
initialize_distributed()
@@ -713,11 +714,11 @@ def test_pretrain_with_local_parallel_groups_combined_parallelism(self, tmp_path
713714
clear_directories(tmp_path)
714715

715716
@pytest.mark.run_only_on("GPU")
716-
def test_pretrain_with_local_parallel_groups_and_tp(self, tmp_path):
717+
def test_pretrain_with_decentralized_pg_and_tp(self, tmp_path):
717718
"""
718719
Test training with use_decentralized_pg=True and tensor parallelism.
719720
720-
This test verifies that the local parallel groups feature works correctly
721+
This test verifies that the decentralized process groups feature works correctly
721722
with tensor parallelism enabled.
722723
"""
723724
initialize_distributed()

tests/unit_tests/training/test_local_parallel_groups.py renamed to tests/unit_tests/training/test_decentralized_pg.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
from megatron.bridge.training.config import DistributedInitConfig
2929

3030

31-
class TestDistributedInitConfigLocalParallelGroups:
31+
class TestDistributedInitConfigDecentralizedPg:
3232
"""Tests for DistributedInitConfig.use_decentralized_pg configuration."""
3333

3434
def test_use_decentralized_pg_default_is_false(self):
@@ -286,7 +286,7 @@ class TestInitializeDistributedBranching:
286286
@patch("torch.cuda.device_count", return_value=1)
287287
@patch("torch.distributed.is_initialized", return_value=True)
288288
@patch("megatron.bridge.training.initialize.get_rank_safe", return_value=0)
289-
def test_uses_hyper_comm_grid_when_local_parallel_groups_enabled(
289+
def test_uses_hyper_comm_grid_when_decentralized_pg_enabled(
290290
self,
291291
mock_get_rank,
292292
mock_is_init,
@@ -330,7 +330,7 @@ def test_uses_hyper_comm_grid_when_local_parallel_groups_enabled(
330330
@patch("torch.cuda.device_count", return_value=1)
331331
@patch("torch.distributed.is_initialized", return_value=True)
332332
@patch("megatron.bridge.training.initialize.get_rank_safe", return_value=0)
333-
def test_uses_mpu_when_local_parallel_groups_disabled(
333+
def test_uses_mpu_when_decentralized_pg_disabled(
334334
self,
335335
mock_get_rank,
336336
mock_is_init,
@@ -380,7 +380,7 @@ def test_uses_mpu_when_local_parallel_groups_disabled(
380380
mock_parallel_state.initialize_model_parallel.assert_called_once()
381381

382382

383-
class TestSetupUsesLocalParallelGroups:
383+
class TestSetupUsesDecentralizedPg:
384384
"""Tests for setup function behavior with use_decentralized_pg."""
385385

386386
def test_config_use_decentralized_pg_enabled(self):
@@ -542,7 +542,7 @@ def test_setup_passes_none_when_use_decentralized_pg_false(self):
542542
assert passed_pg_collection is None
543543

544544

545-
class TestCheckpointingWithLocalParallelGroups:
545+
class TestCheckpointingWithDecentralizedPg:
546546
"""Tests for checkpointing behavior based on use_decentralized_pg setting."""
547547

548548
def test_modelopt_state_save_skipped_when_use_decentralized_pg_true(self):
@@ -584,7 +584,7 @@ def test_modelopt_state_save_executed_when_use_decentralized_pg_false(self):
584584
assert should_save_modelopt is True
585585

586586

587-
class TestTrainTensorShapesAdjustWithLocalParallelGroups:
587+
class TestTrainTensorShapesAdjustWithDecentralizedPg:
588588
"""Tests for train.py tensor shapes adjust function behavior."""
589589

590590
def test_tensor_shapes_adjust_fn_is_none_when_use_decentralized_pg_true(self):

0 commit comments

Comments
 (0)