 device_mesh = None


-class Transformer(nn.Module):
-    def __init__(self, config: TransformerArgs) -> None:
+class TransformerStage(nn.Module):
+    def __init__(self, config: TransformerArgs, stage_idx: int, n_stages: int) -> None:
         super().__init__()
         self.config = config
+        self.stage_idx = stage_idx
+        self.n_stages = n_stages
+        self.layers_per_stage = config.n_layers // n_stages

         # Get device mesh
         global device_mesh
         if device_mesh is None:
             device_mesh = _mesh_resources.get_current_mesh()

-        tok_embeddings = nn.Embedding(config.vocab_size, config.dim)
-        self.tok_embeddings = parallelize_module(
-            tok_embeddings,
-            device_mesh,
-            RowwiseParallel(input_layouts=Replicate()),
-        )
-        self.layers = nn.ModuleList(
-            TransformerBlock(config) for _ in range(config.n_layers)
-        )
-        self.norm = RMSNorm(config.dim, eps=config.norm_eps)
-        self.output = nn.Linear(config.dim, config.vocab_size, bias=False)
+        if stage_idx == 0:
+            tok_embeddings = nn.Embedding(config.vocab_size, config.dim)
+            self.tok_embeddings = parallelize_module(
+                tok_embeddings,
+                device_mesh,
+                RowwiseParallel(input_layouts=Replicate()),
+            )
+
+        # Use ModuleDict so that each layer can be assigned its layer ID in the original model
+        self.layers = nn.ModuleDict()
+        for layer_id in range(self.layers_per_stage * stage_idx, self.layers_per_stage * (stage_idx + 1)):
+            self.layers[str(layer_id)] = TransformerBlock(config)
+
+        if stage_idx == n_stages - 1:
+            self.norm = RMSNorm(config.dim, eps=config.norm_eps)
+            self.output = nn.Linear(config.dim, config.vocab_size, bias=False)

         # self.freqs_cis: Optional[Tensor] = None
         # self.mask_cache: Optional[Tensor] = None
@@ -67,7 +75,7 @@ def setup_caches(self, max_batch_size, max_seq_length):
         max_seq_length = find_multiple(max_seq_length, 8)
         self.max_seq_length = max_seq_length
         self.max_batch_size = max_batch_size
-        for b in self.layers:
+        for b in self.layers.values():
             b.attention.kv_cache = KVCache(
                 max_batch_size, max_seq_length, self.config.n_local_heads, head_dim
             )
@@ -84,19 +92,26 @@ def setup_caches(self, max_batch_size, max_seq_length):
         )
         self.register_buffer("causal_mask", causal_mask, persistent=True)

-    def forward(self, idx: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
+    def forward(self, x: Tensor, input_pos: Optional[Tensor] = None) -> Tensor:
         assert self.freqs_cis is not None, "Caches must be initialized first"
+        if input_pos is None:
+            input_pos = torch.arange(x.shape[1], device=x.device, dtype=torch.long)
         mask = self.causal_mask[None, None, input_pos]
         freqs_cis = self.freqs_cis[input_pos]
-        x: DTensor = self.tok_embeddings(idx)
-        # TODO: sequence parallelize this

-        for _, layer in enumerate(self.layers):
+        if self.stage_idx == 0:
+            x: DTensor = self.tok_embeddings(x)
+            # TODO: sequence parallelize this
+
+        for _, layer in self.layers.items():
             x = layer(x, input_pos, freqs_cis, mask)
-        x = self.norm(x)
-        logits = self.output(x)
-        # print(f"logits shape: {logits.shape}")
-        return logits
+
+        if self.stage_idx == self.n_stages - 1:
+            x = self.norm(x)
+            x = self.output(x)
+
+        # print(f"stage output shape: {x.shape}")
+        return x

     @classmethod
     def from_name(cls, name: str):
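
For context, a minimal sketch of how a stage defined by this class might be built on a single pipeline rank. The driver names below (n_pp_stages, pp_rank) and the default-constructed TransformerArgs are illustrative assumptions, not part of this change:

    # Illustrative sketch only: one process per pipeline stage builds just its slice of the model.
    # TransformerArgs() with defaults and the rank variables are assumptions; substitute the real
    # config and the values provided by the distributed runtime.
    n_pp_stages = 2
    pp_rank = 0  # would normally come from the process's pipeline-parallel rank

    config = TransformerArgs()
    stage = TransformerStage(config, stage_idx=pp_rank, n_stages=n_pp_stages)

    # Stage 0 owns tok_embeddings, the last stage owns norm/output, and every stage owns
    # config.n_layers // n_pp_stages TransformerBlocks, keyed in stage.layers (an nn.ModuleDict)
    # by their layer id in the original, unsplit model.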