 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
+import math
 import os
-from typing import Callable
+from typing import Callable, Optional

 from torch.distributed.pipelining.schedules import (
     _PipelineSchedule,
...
 # TODO: It's unclear if this API is general enough to be used by other models.
 # If not, we should move it to a Transformer-specific directory.
 def generate_split_points(
-    pipeline_parallel_schedule: str, pp_dim: int, num_layers: int
+    schedule_str: str,
+    layers_per_stage: Optional[int],
+    pp_dim: int,
+    num_layers: int,
+    input_weight: int = 1,
+    output_weight: int = 1,
 ) -> list[str]:
     """
-    Generate a default split point based on the number of layers and
-    pipeline parallel dimension.
+    Generate a list of split points based on the number of layers and
+    pipeline parallel dimension, ensuring the first and last stages have the fewest layers.

     Args:
-        job_config (JobConfig): The job configuration.
+        schedule_str (str): The name of the pipeline schedule.
+        layers_per_stage (int): The number of layers per stage.
         pp_dim (int): The pipeline parallel dimension.
         num_layers (int): The number of layers in the model.
+        input_weight, output_weight (int): The number of layers the input/output modules count as in the layer calculation.

     Returns:
         list[str]: A list of split point FQNs.
     """

-    schedule_class = get_schedule_class(pipeline_parallel_schedule)
-    if issubclass(schedule_class, PipelineScheduleSingle):
-        num_stages_per_rank = 1
-    elif issubclass(schedule_class, PipelineScheduleMulti):
-        # Multi-stage schedules support more than 2 stages per rank, but this is the default if
-        # no pipeline split is specified
-        num_stages_per_rank = 2
+    schedule_class = get_schedule_class(schedule_str)
+    is_single_stage_schedule = issubclass(schedule_class, PipelineScheduleSingle)
+    num_stages_per_rank = 1 if is_single_stage_schedule else 2
+
+    if layers_per_stage is not None:
+        total_stages = math.ceil(num_layers / layers_per_stage)
+        if total_stages % pp_dim != 0:
+            raise ValueError(
+                f"Number of stages ({total_stages}) must be divisible by the pipeline parallel dimension ({pp_dim}). "
+                "Each rank should have the same number of stages."
+            )
+        num_stages_per_rank = total_stages // pp_dim
+
+        if is_single_stage_schedule and num_stages_per_rank != 1:
+            raise ValueError(
+                f"Number of stages per rank ({num_stages_per_rank}) must be 1 for single stage schedules."
+            )
+        elif not is_single_stage_schedule and num_stages_per_rank < 2:
+            raise ValueError(
+                f"Number of stages per rank ({num_stages_per_rank}) must be >= 2 for multi stage schedules."
+            )
     else:
-        raise ValueError(f"Unsupported pipeline schedule: {pipeline_parallel_schedule}")
-    total_stages = pp_dim * num_stages_per_rank
-    if total_stages > num_layers:
-        raise ValueError("Total stages cannot be greater than the number of layers")
-
-    base_interval = num_layers // total_stages
-    extra_layers = num_layers % total_stages
-
-    splits = []
-    current_layer = 0
-    for i in range(total_stages - 1):
-        if i == 0:
-            current_layer += base_interval
-        else:
-            # Middle stages get an extra layer if there are any remaining
-            if extra_layers > 0:
-                current_layer += base_interval + 1
-                extra_layers -= 1
-            else:
-                current_layer += base_interval
-        splits.append("layers." + str(current_layer))
+        total_stages = pp_dim * num_stages_per_rank
+        if total_stages > num_layers:
+            raise ValueError("Total stages cannot be greater than the number of layers")
+
+    # Calculate effective number of layers including input and output weights
+    effective_num_layers = num_layers + input_weight + output_weight
+    base_layers_per_stage = effective_num_layers // total_stages
+
+    splits = [""] * (total_stages - 1)
+    current_layer_index = 0
+
+    # First stage
+    layers_on_first_stage = max(0, base_layers_per_stage - input_weight)
+    current_layer_index += layers_on_first_stage
+    splits[0] = "layers." + str(current_layer_index)
+
+    # Last stage
+    layers_on_last_stage = max(0, base_layers_per_stage - output_weight)
+    splits[-1] = "layers." + str(num_layers - layers_on_last_stage)
+
+    # Middle stages
+    remaining_layers = num_layers - layers_on_first_stage - layers_on_last_stage - 1
+    middle_stages = len(splits) - 2
+    layers_per_middle_stage = remaining_layers // middle_stages
+    # split remainder evenly across middle stages
+    remainder = remaining_layers % middle_stages
+
+    for i in range(1, middle_stages + 1):
+        current_layer_index += layers_per_middle_stage
+        if remainder > 0:
+            current_layer_index += 1
+            remainder -= 1
+        splits[i] = "layers." + str(current_layer_index)
+
     logger.info(
         f"No 'pipeline_parallel_split_points' provided so the generated splits are: {splits} "
         "This may be sub-optimal as the number of layers per stage may be unbalanced."
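
For a quick sense of the updated signature, below is a minimal usage sketch. It is not part of this PR; the schedule name, layer count, and pp_dim are illustrative assumptions chosen so the new validation passes.

# Hypothetical call sites for the updated generate_split_points().
# With layers_per_stage set: ceil(32 / 4) = 8 stages, and 8 % pp_dim == 0,
# so each of the 4 pipeline ranks runs 2 stages of the multi-stage schedule.
splits = generate_split_points(
    schedule_str="Interleaved1F1B",
    layers_per_stage=4,
    pp_dim=4,
    num_layers=32,
)

# With layers_per_stage=None, the schedule class alone decides stages per rank
# (1 for single-stage schedules, 2 for multi-stage), as before this change.
default_splits = generate_split_points("Interleaved1F1B", None, 4, 32)

Both calls return a list of split-point FQNs of the form "layers.N", which the caller uses to cut the model into pipeline stages.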