rename tests and examples

yaoyu-33 · yaoyu-33 · commit 982ec64b48d1 · 2026-01-21T12:21:54.000-08:00
Signed-off-by: yaoyu-33 <yaoyu.094@gmail.com>
diff --git a/examples/recipes/decentralized_pg/README.md b/examples/recipes/decentralized_pg/README.md
@@ -1,6 +1,6 @@
-# Local Parallel Groups Examples
+# Decentralized Process Groups Examples
 
-This directory contains examples demonstrating how to use **local parallel groups** (`use_decentralized_pg=True`) in Megatron-Bridge for distributed training.
+This directory contains examples demonstrating how to use **decentralized process groups** (`use_decentralized_pg=True`) in Megatron-Bridge for distributed training.
 
 ## Overview
 
@@ -15,20 +15,20 @@ Instead of relying on Megatron-Core's global parallel state (mpu) module, you ca
 | File | Description |
 |------|-------------|
 | `pretrain_qwen3_simple.py` | **Simple**: Use a recipe and enable `use_decentralized_pg=True` |
-| `pretrain_qwen3_with_local_parallel_groups.py` | **Advanced**: Manually create process groups with `HyperCommGrid` |
+| `pretrain_qwen3_with_decentralized_pg.py` | **Advanced**: Manually create process groups with `HyperCommGrid` |
 
 ## Quick Start
 
 ### Simple Approach (Recommended)
 
-Just use an existing recipe and enable local parallel groups:
+Just use an existing recipe and enable decentralized process groups:
 
 ```bash
 # 8 GPUs: TP2 x PP2 x DP2
-torchrun --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_simple.py
+uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
 
-# Or with uv
-uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_simple.py
+# 4 GPUs: TP2 x PP2 x DP1
+uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
 ```
 
 The key is just two lines:
@@ -42,7 +42,7 @@ cfg = qwen3_4b_pretrain_config(
     # ... other settings
 )
 
-# Enable local parallel groups
+# Enable decentralized process groups
 cfg.dist.use_decentralized_pg = True
 cfg.dist.use_gloo_process_groups = False  # Gloo not supported
 ```
@@ -53,14 +53,14 @@ For full control over process groups:
 
 ```bash
 # 8 GPUs: TP2 x PP2 x DP2
-torchrun --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py
+uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
 
 # 4 GPUs: TP2 x PP2 x DP1
-torchrun --nproc_per_node=4 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py \
+uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
     --tp-size 2 --pp-size 2
 
 # 2 GPUs: TP2 x PP1 x DP1
-torchrun --nproc_per_node=2 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py \
+uv run python -m torch.distributed.run --nproc_per_node=2 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
     --tp-size 2 --pp-size 1
 ```
 
diff --git a/examples/recipes/decentralized_pg/pretrain_qwen3_simple.py b/examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
@@ -15,10 +15,10 @@
 
 """
 ==============================================================================
-Example: Qwen3 Pretraining with Local Parallel Groups (Simple)
+Example: Qwen3 Pretraining with Decentralized Process Groups (Simple)
 ==============================================================================
 
-This example demonstrates the simplest way to enable local parallel groups:
+This example demonstrates the simplest way to enable decentralized process groups:
 just use an existing recipe and set `cfg.dist.use_decentralized_pg = True`.
 
 The setup() function inside pretrain() will automatically create the
@@ -27,12 +27,10 @@
 How to Run
 ----------
 # 8 GPUs: TP2 x PP2 x DP2
-torchrun --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_simple.py
-
-uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_simple.py
+uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
 
 # 4 GPUs: TP2 x PP2 x DP1
-torchrun --nproc_per_node=4 examples/recipes/local_parallel_groups/pretrain_qwen3_simple.py
+uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_simple.py
 """
 
 import torch
@@ -43,7 +41,7 @@
 
 
 def main() -> None:
-    """Run Qwen3 pretraining with local parallel groups enabled."""
+    """Run Qwen3 pretraining with decentralized process groups enabled."""
     # Get the standard Qwen3 4B pretrain config with overrides
     cfg = qwen3_4b_pretrain_config(
         # Use mock data for demo
@@ -60,12 +58,14 @@ def main() -> None:
         lr_warmup_iters=10,
         lr_decay_iters=100,
     )
+    # Workaround: disable share_embeddings_and_output_weights, which has a known issue (TODO: link tracking issue)
+    cfg.model.share_embeddings_and_output_weights = False
 
     # =========================================================================
-    # KEY: Enable local parallel groups
+    # KEY: Enable decentralized process groups
     # =========================================================================
     cfg.dist.use_decentralized_pg = True
-    cfg.dist.use_gloo_process_groups = False  # Gloo not supported with local PG
+    cfg.dist.use_gloo_process_groups = False  # Gloo not supported with decentralized PG
 
     pretrain(config=cfg, forward_step_func=forward_step)
 
diff --git a/examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py b/examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
@@ -15,7 +15,7 @@
 
 """
 ==============================================================================
-Example: Qwen3 Pretraining with Local Parallel Groups (Advanced/Manual)
+Example: Qwen3 Pretraining with Decentralized Process Groups (Advanced/Manual)
 ==============================================================================
 
 This example demonstrates how to MANUALLY create process groups using
@@ -37,16 +37,14 @@
 How to Run
 ----------
 # 8 GPUs: TP2 x PP2 x DP2
-torchrun --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py
-
-uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py
+uv run python -m torch.distributed.run --nproc_per_node=8 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py
 
 # 4 GPUs: TP2 x PP2 x DP1
-torchrun --nproc_per_node=4 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py \
+uv run python -m torch.distributed.run --nproc_per_node=4 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
     --tp-size 2 --pp-size 2
 
 # 2 GPUs: TP2 x PP1 x DP1
-torchrun --nproc_per_node=2 examples/recipes/local_parallel_groups/pretrain_qwen3_with_local_parallel_groups.py \
+uv run python -m torch.distributed.run --nproc_per_node=2 examples/recipes/decentralized_pg/pretrain_qwen3_with_decentralized_pg.py \
     --tp-size 2 --pp-size 1
 """
 
@@ -95,7 +93,7 @@
 
 def parse_args() -> argparse.Namespace:
     """Parse command-line arguments."""
-    parser = argparse.ArgumentParser(description="Qwen3 Pretraining with Manual Local Parallel Groups")
+    parser = argparse.ArgumentParser(description="Qwen3 Pretraining with Manual Decentralized Process Groups")
 
     # Parallelism settings
     parser.add_argument("--tp-size", type=int, default=2, help="Tensor parallel size (default: 2)")
@@ -292,7 +290,7 @@ def create_process_group_collection(
     # Build the ProcessGroupCollection
     # ===========================================================================
     # This is the single object that contains ALL process groups and gets
-    # passed through function calls in local parallel groups mode.
+    # passed through function calls in decentralized process groups mode.
     pg_collection = ProcessGroupCollection(
         # Core parallelism groups
         tp=tp_pg,
@@ -420,7 +418,7 @@ def run_training(args: argparse.Namespace, pg_collection: ProcessGroupCollection
     # ===========================================================================
     # Create output directories
     # ===========================================================================
-    base_dir = tempfile.mkdtemp(prefix="mbridge_local_pg_")
+    base_dir = tempfile.mkdtemp(prefix="mbridge_decentralized_pg_")
     checkpoint_dir = os.path.join(base_dir, "checkpoints")
     tensorboard_dir = os.path.join(base_dir, "tensorboard")
 
@@ -498,7 +496,7 @@ def run_training(args: argparse.Namespace, pg_collection: ProcessGroupCollection
     # managing process groups ourselves via pg_collection
     dist_cfg = DistributedInitConfig(
         use_decentralized_pg=True,
-        use_gloo_process_groups=False,  # Gloo not supported with local PG
+        use_gloo_process_groups=False,  # Gloo not supported with decentralized PG
     )
 
     dataset_cfg = MockGPTDatasetConfig(
@@ -635,7 +633,7 @@ def main() -> None:
     args = parse_args()
 
     print_rank_0("=" * 70)
-    print_rank_0("Qwen3 Pretraining with MANUALLY Created Local Parallel Groups")
+    print_rank_0("Qwen3 Pretraining with MANUALLY Created Decentralized Process Groups")
     print_rank_0("=" * 70)
     print_rank_0("")
     print_rank_0("This example shows how to:")
diff --git a/tests/functional_tests/training/test_decentralized_pg.py b/tests/functional_tests/training/test_decentralized_pg.py
@@ -68,13 +68,13 @@ def cleanup_megatron_state():
         pass
 
 
-class TestLocalParallelGroupsPretrain:
+class TestDecentralizedPgPretrain:
     """
     Functional tests for pretraining with use_decentralized_pg enabled.
     """
 
     @pytest.mark.run_only_on("GPU")
-    def test_pretrain_with_local_parallel_groups(self, tmp_path):
+    def test_pretrain_with_decentralized_pg(self, tmp_path):
         """
         Test end to end training with use_decentralized_pg=True.
 
@@ -199,7 +199,7 @@ def test_pretrain_with_local_parallel_groups(self, tmp_path):
             clear_directories(tmp_path)
 
     @pytest.mark.run_only_on("GPU")
-    def test_pretrain_with_local_parallel_groups_disabled(self, tmp_path):
+    def test_pretrain_with_decentralized_pg_disabled(self, tmp_path):
         """
         Test end to end training with use_decentralized_pg=False (default).
 
@@ -322,12 +322,13 @@ def test_pretrain_with_local_parallel_groups_disabled(self, tmp_path):
         finally:
             clear_directories(tmp_path)
 
+    #
     @pytest.mark.run_only_on("GPU")
-    def test_pretrain_with_local_parallel_groups_and_pp(self, tmp_path):
+    def test_pretrain_with_decentralized_pg_and_pp(self, tmp_path):
         """
         Test training with use_decentralized_pg=True and pipeline parallelism.
 
-        This test verifies that the local parallel groups feature works correctly
+        This test verifies that the decentralized process groups feature works correctly
         with pipeline parallelism enabled.
         """
         initialize_distributed()
@@ -453,11 +454,11 @@ def test_pretrain_with_local_parallel_groups_and_pp(self, tmp_path):
             clear_directories(tmp_path)
 
     @pytest.mark.run_only_on("GPU")
-    def test_pretrain_with_local_parallel_groups_and_cp(self, tmp_path):
+    def test_pretrain_with_decentralized_pg_and_cp(self, tmp_path):
         """
         Test training with use_decentralized_pg=True and context parallelism.
 
-        This test verifies that the local parallel groups feature works correctly
+        This test verifies that the decentralized process groups feature works correctly
         with context parallelism enabled.
         """
         initialize_distributed()
@@ -583,11 +584,11 @@ def test_pretrain_with_local_parallel_groups_and_cp(self, tmp_path):
             clear_directories(tmp_path)
 
     @pytest.mark.run_only_on("GPU")
-    def test_pretrain_with_local_parallel_groups_combined_parallelism(self, tmp_path):
+    def test_pretrain_with_decentralized_pg_combined_parallelism(self, tmp_path):
         """
         Test training with use_decentralized_pg=True and combined TP+PP.
 
-        This test verifies that the local parallel groups feature works correctly
+        This test verifies that the decentralized process groups feature works correctly
         with multiple forms of parallelism enabled simultaneously.
         """
         initialize_distributed()
@@ -713,11 +714,11 @@ def test_pretrain_with_local_parallel_groups_combined_parallelism(self, tmp_path
             clear_directories(tmp_path)
 
     @pytest.mark.run_only_on("GPU")
-    def test_pretrain_with_local_parallel_groups_and_tp(self, tmp_path):
+    def test_pretrain_with_decentralized_pg_and_tp(self, tmp_path):
         """
         Test training with use_decentralized_pg=True and tensor parallelism.
 
-        This test verifies that the local parallel groups feature works correctly
+        This test verifies that the decentralized process groups feature works correctly
         with tensor parallelism enabled.
         """
         initialize_distributed()
diff --git a/tests/unit_tests/training/test_decentralized_pg.py b/tests/unit_tests/training/test_decentralized_pg.py
@@ -28,7 +28,7 @@
 from megatron.bridge.training.config import DistributedInitConfig
 
 
-class TestDistributedInitConfigLocalParallelGroups:
+class TestDistributedInitConfigDecentralizedPg:
     """Tests for DistributedInitConfig.use_decentralized_pg configuration."""
 
     def test_use_decentralized_pg_default_is_false(self):
@@ -286,7 +286,7 @@ class TestInitializeDistributedBranching:
     @patch("torch.cuda.device_count", return_value=1)
     @patch("torch.distributed.is_initialized", return_value=True)
     @patch("megatron.bridge.training.initialize.get_rank_safe", return_value=0)
-    def test_uses_hyper_comm_grid_when_local_parallel_groups_enabled(
+    def test_uses_hyper_comm_grid_when_decentralized_pg_enabled(
         self,
         mock_get_rank,
         mock_is_init,
@@ -330,7 +330,7 @@ def test_uses_hyper_comm_grid_when_local_parallel_groups_enabled(
     @patch("torch.cuda.device_count", return_value=1)
     @patch("torch.distributed.is_initialized", return_value=True)
     @patch("megatron.bridge.training.initialize.get_rank_safe", return_value=0)
-    def test_uses_mpu_when_local_parallel_groups_disabled(
+    def test_uses_mpu_when_decentralized_pg_disabled(
         self,
         mock_get_rank,
         mock_is_init,
@@ -380,7 +380,7 @@ def test_uses_mpu_when_local_parallel_groups_disabled(
         mock_parallel_state.initialize_model_parallel.assert_called_once()
 
 
-class TestSetupUsesLocalParallelGroups:
+class TestSetupUsesDecentralizedPg:
     """Tests for setup function behavior with use_decentralized_pg."""
 
     def test_config_use_decentralized_pg_enabled(self):
@@ -542,7 +542,7 @@ def test_setup_passes_none_when_use_decentralized_pg_false(self):
         assert passed_pg_collection is None
 
 
-class TestCheckpointingWithLocalParallelGroups:
+class TestCheckpointingWithDecentralizedPg:
     """Tests for checkpointing behavior based on use_decentralized_pg setting."""
 
     def test_modelopt_state_save_skipped_when_use_decentralized_pg_true(self):
@@ -584,7 +584,7 @@ def test_modelopt_state_save_executed_when_use_decentralized_pg_false(self):
         assert should_save_modelopt is True
 
 
-class TestTrainTensorShapesAdjustWithLocalParallelGroups:
+class TestTrainTensorShapesAdjustWithDecentralizedPg:
     """Tests for train.py tensor shapes adjust function behavior."""
 
     def test_tensor_shapes_adjust_fn_is_none_when_use_decentralized_pg_true(self):