
Commit d9ef101

H-Huang authored and pytorchmergebot committed
[PP] Optimize memory usage by releasing output memory earlier (pytorch#153383)
Since `output_chunks` is only used by the last stage, we should not keep every stage's outputs in memory; this allows the output memory to be freed earlier.

Pull Request resolved: pytorch#153383
Approved by: https://github.com/Skylion007, https://github.com/kwen2501
1 parent f1de3f9 commit d9ef101
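
To illustrate the idea outside the diff, here is a minimal, self-contained sketch (not the actual torch.distributed.pipelining implementation; the ToyStage class and its members are hypothetical): a stage already keeps one reference to each microbatch's output in fwd_cache for sending and backward, so only the last stage needs to accumulate outputs for the final merge.

# Hypothetical sketch of the optimization (not the real PipelineStage API).
class ToyStage:
    def __init__(self, module, is_last: bool):
        self.module = module
        self.is_last = is_last
        self.fwd_cache = {}      # chunk_id -> (output_tuple, saved_inputs)
        self.output_chunks = []  # only populated on the last stage

    def forward_one_chunk(self, chunk_id, *args):
        output = self.module(*args)
        output_tuple = output if isinstance(output, tuple) else (output,)
        # Only the last stage keeps outputs around for the final merge;
        # on other stages the output can be freed once it has been sent
        # and its backward has run.
        if self.is_last:
            self.output_chunks.append(output)
        self.fwd_cache[chunk_id] = (output_tuple, args)
        return output

    def get_fwd_send_ops(self, chunk_id):
        # Read the tensors to send from fwd_cache instead of output_chunks,
        # so intermediate stages hold no extra reference to their outputs.
        output_tuple, _ = self.fwd_cache[chunk_id]
        return list(output_tuple)

The only behavioral requirement is that whatever the send path needs is still reachable through fwd_cache, which forward_one_chunk populates in the same call.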

File tree

2 files changed: 62 additions, 5 deletions

test/distributed/pipelining/test_stage.py

Lines changed: 58 additions & 0 deletions
@@ -320,6 +320,64 @@ def test_custom_dw_errors(self):
         with self.assertRaisesRegex(AssertionError, "backward_one_chunk"):
             stage_with_dw_builder.backward_weight_one_chunk(bwd_chunk_id=0)
 
+    @requires_nccl()
+    @skip_but_pass_in_sandcastle_if(not TEST_MULTIGPU, "NCCL test requires 2+ GPUs")
+    def test_output_chunks_memory_usage(self):
+        """Test that output_chunks doesn't store memory for non-last stages."""
+        full_mod = MultiMLP(d_hid, n_layers=self.world_size)
+        full_mod.to(self.device)
+        stage_mod = full_mod.get_submodule(f"layers.{self.rank}")
+        x = torch.randn(batch_size, d_hid, device=self.device)
+        target = torch.randn(batch_size, d_hid, device=self.device)
+        stage = PipelineStage(
+            stage_mod,
+            self.rank,
+            self.world_size,
+            self.device,
+        )
+        self.assertEqual(
+            len(stage.output_chunks), 0, "output_chunks should be empty initially"
+        )
+
+        schedule = ScheduleGPipe(
+            stage, chunks, loss_fn=torch.nn.MSELoss(reduction="sum")
+        )
+
+        def _run_step(x):
+            if self.rank == 0:
+                return schedule.step(x)
+            elif self.rank == self.world_size - 1:
+                return schedule.step(target=target)
+            else:
+                return schedule.step()
+
+        _run_step(x)
+
+        # Verify fwd_cache is empty
+        self.assertEqual(len(stage.fwd_cache), 0, "fwd_cache should be cleared")
+
+        # Check output_chunks state after step
+        if self.rank == self.world_size - 1:
+            self.assertEqual(
+                len(stage.output_chunks),
+                chunks,
+                "Last stage should store output chunks",
+            )
+        else:
+            self.assertEqual(
+                len(stage.output_chunks),
+                0,
+                f"Non-last stage (rank {self.rank}) should not store output chunks",
+            )
+
+        # Clear the schedule and stage caches
+        stage.clear_runtime_states()
+        if self.rank == self.world_size - 1:
+            # The last stage's output_chunks should now be cleared as well
+            self.assertEqual(
+                len(stage.output_chunks), 0, "output_chunks should be empty after clearing"
+            )
+
 
 instantiate_parametrized_tests(StageTest)
 
torch/distributed/pipelining/stage.py

Lines changed: 4 additions & 5 deletions
@@ -433,10 +433,7 @@ def get_fwd_send_ops(self, fwd_chunk_id: int) -> list[dist.P2POp]:
         """
         Get the activation send ops for current stage's forward.
         """
-        output = self.output_chunks[fwd_chunk_id]
-        # Unify output form to tuple for easy correspondance with
-        # `act_send_info`
-        output_tuple = output if type(output) is tuple else (output,)
+        output_tuple, _ = self.fwd_cache[fwd_chunk_id]
 
         ops: list[dist.P2POp] = []
 
@@ -719,7 +716,9 @@ def forward_one_chunk(
         output_tuple = _normalize_model_output_as_tuple(output)
 
         # Prepare for final output merge or reduction
-        self.output_chunks.append(output)
+        # Output chunks is only used for the last stage since we only merge the output of the last stage
+        if self.is_last:
+            self.output_chunks.append(output)
 
         # Save activations and inputs for backward
         flat_args = flatten_args(composite_args)
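
One way to observe the effect on an intermediate rank is to compare peak memory before and after this change; the following is a hedged usage sketch that reuses the `schedule` and `stage` objects from the test above and assumes a CUDA device:

import torch

torch.cuda.reset_peak_memory_stats()
schedule.step()  # rank 0 would pass the input and the last rank the target, as in _run_step above
print(f"peak allocated on this rank: {torch.cuda.max_memory_allocated() / 2**20:.1f} MiB")
print(f"output chunks retained on this rank: {len(stage.output_chunks)}")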
