
Commit 10e7b74

Phlip79 and claude authored
cleanup: remove unused scatter_gather_tensors_in_pipeline argument (#4140)
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 499266a commit 10e7b74

File tree

3 files changed: +2 -19 lines changed


examples/academic_paper_scripts/sc21/run_figure_18.sh

Lines changed: 2 additions & 15 deletions

@@ -4,25 +4,12 @@
 # Choose the case to run.
 # ================================
 
-# Scatter-gather communication optimization options = [YES, NO].
-SCATTER_GATHER=YES
-
 # Batch size (global batch size) options = [12, 24, 36, ..., 60].
 GBS=12
 
 
 
-
-
-# Set scatter-gather communication optimization options.
-if [ ${SCATTER_GATHER} == "YES" ]; then
-        MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
-elif [ ${SCATTER_GATHER} == "NO" ]; then
-        MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 --no-scatter-gather-tensors-in-pipeline "
-else
-        echo "Invalid configuration"
-        exit 1
-fi
+MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
 
 
 # Other params.
@@ -37,7 +24,7 @@ NNODES=12
 
 
 # Name of the job.
-export JOB_NAME=results_figure_18_scatter_gather_${SCATTER_GATHER}_batch_size_${GBS}
+export JOB_NAME=results_figure_18_batch_size_${GBS}
 
 
 # Import the configs.
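The script change is safe because the removed `if`/`elif` only varied the presence of the now-deleted flag: the `YES` branch produced exactly the string the commit keeps unconditionally. A minimal standalone sketch (the function names are hypothetical, condensed from the script for illustration):

```shell
#!/bin/sh

# Hypothetical condensed reproduction of the removed branch logic:
# the old YES branch built the same MEGATRON_EXTRA_PARAMS string that
# the new script now assigns unconditionally.
old_yes() {
  SCATTER_GATHER=YES
  if [ "${SCATTER_GATHER}" = "YES" ]; then
    MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
  fi
  echo "${MEGATRON_EXTRA_PARAMS}"
}

# The replacement: a single unconditional assignment, as in the commit.
new_always() {
  MEGATRON_EXTRA_PARAMS="--activations-checkpoint-method uniform --num-layers-per-virtual-pipeline-stage 2 "
  echo "${MEGATRON_EXTRA_PARAMS}"
}

# Both paths yield identical params, so the branch could be deleted.
if [ "$(old_yes)" = "$(new_always)" ]; then
  echo "equivalent"   # prints "equivalent"
fi
```

Only the `NO` branch, which appended the deleted `--no-scatter-gather-tensors-in-pipeline` flag, loses a code path, which is the point of the cleanup.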

examples/gpt3/gpt_config.yaml

Lines changed: 0 additions & 1 deletion

@@ -218,7 +218,6 @@ overlap_grad_reduce: False
 align_grad_reduce: True
 overlap_param_gather: False
 align_param_gather: False
-scatter_gather_tensors_in_pipeline: True
 local_rank: null
 lazy_mpu_init: null
 empty_unused_memory_level: 0

megatron/training/arguments.py

Lines changed: 0 additions & 3 deletions

@@ -2696,9 +2696,6 @@ def _add_distributed_args(parser):
                        help='If not set, all PP stages will launch param all-gathers simultaneously. '
                             'Otherwise, each PP stage will independently launch as needed.',
                        dest='align_param_gather')
-    group.add_argument('--no-scatter-gather-tensors-in-pipeline', action='store_false',
-                       help='If not set, use scatter/gather to optimize communication of tensors in pipeline.',
-                       dest='scatter_gather_tensors_in_pipeline')
     group.add_argument('--use-distributed-optimizer', action='store_true',
                        help='Use distributed optimizer.')
     group.add_argument('--use-nccl-ub', action='store_true', dest='nccl_ub',
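The deleted argument used argparse's `action='store_false'` pattern with an explicit `dest`: the destination defaults to `True` and flips to `False` only when the negated flag is passed. A minimal self-contained sketch of that pattern, using the removed flag's definition from the diff above:

```python
import argparse

# Sketch of the argparse pattern behind the removed flag: a
# "--no-<feature>" option with action='store_false' and an explicit
# dest, so the attribute defaults to True and the flag disables it.
parser = argparse.ArgumentParser()
parser.add_argument('--no-scatter-gather-tensors-in-pipeline',
                    action='store_false',
                    help='If not set, use scatter/gather to optimize '
                         'communication of tensors in pipeline.',
                    dest='scatter_gather_tensors_in_pipeline')

default_args = parser.parse_args([])
disabled_args = parser.parse_args(['--no-scatter-gather-tensors-in-pipeline'])
print(default_args.scatter_gather_tensors_in_pipeline)   # True
print(disabled_args.scatter_gather_tensors_in_pipeline)  # False
```

Since nothing in the codebase read `args.scatter_gather_tensors_in_pipeline` any longer, removing the argument (and the matching `gpt_config.yaml` key) drops a setting that had no effect either way.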
