Commit 142322b

aporialia authored and facebook-github-bot committed

Pipeline Integration

Differential Revision: D78191049

1 parent 01f8654 commit 142322b

3 files changed: +116 −3 lines

torchrec/distributed/benchmark/benchmark_train_pipeline.py

Lines changed: 33 additions & 1 deletion

@@ -20,6 +20,7 @@
 See benchmark_pipeline_utils.py for step-by-step instructions.
 """

+import copy
 from dataclasses import dataclass, field
 from typing import Dict, List, Optional, Type, Union

@@ -349,9 +350,40 @@ def _func_to_benchmark(
     pipeline: TrainPipeline,
 ) -> None:
     dataloader = iter(bench_inputs)
+    i = 0
     while True:
         try:
-            pipeline.progress(dataloader)
+            # import fbvscode
+
+            # fbvscode.set_trace()
+            if i == 3:
+                # Extract the existing sharding plan
+                existing_sharding_plan = pipeline._model.module.sparse.ebc.module_sharding_plan  # pyre-ignore
+                fqn_to_local_shards = "sparse.ebc"
+                # Modify the existing sharding plan (hard-coded to table_0)
+                sharding_param = copy.deepcopy(
+                    existing_sharding_plan["table_0"]
+                )
+                new_device = 1 if sharding_param.ranks[0] == 0 else 0
+                sharding_param.ranks = [new_device]
+                sharding_param.sharding_spec.shards[0].placement = (
+                    torch.distributed._remote_device(
+                        f"rank:{new_device}/cuda:{new_device}"
+                    )
+                )
+
+                new_sharding_plan = {}
+                new_sharding_plan["table_0"] = sharding_param
+                # Reshard
+                pipeline.progress_with_reshard(  # pyre-ignore
+                    dataloader_iter=dataloader,
+                    reshard_params=new_sharding_plan,
+                    sharded_module_fqn=fqn_to_local_shards,
+                )
+                i += 1
+            else:
+                pipeline.progress(dataloader)
+                i += 1
         except StopIteration:
             break
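The benchmark above hard-codes "table_0" and a two-rank flip. As a minimal sketch (not part of this commit), the same plan construction could be factored into a reusable helper; build_flip_plan is a hypothetical name, and it assumes the table is sharded table-wise with a single shard on a single rank, exactly as in the hard-coded branch above.

# Hypothetical helper (not from this commit): build a reshard plan that moves a
# single-shard, table-wise sharded table to the "other" of two ranks.
import copy
from typing import Dict

import torch
from torchrec.distributed.types import ParameterSharding


def build_flip_plan(
    module_sharding_plan: Dict[str, ParameterSharding],
    table_name: str,
    rank_a: int = 0,
    rank_b: int = 1,
) -> Dict[str, ParameterSharding]:
    # Deep-copy so the plan currently attached to the model is left untouched.
    sharding_param = copy.deepcopy(module_sharding_plan[table_name])
    new_rank = rank_b if sharding_param.ranks[0] == rank_a else rank_a
    sharding_param.ranks = [new_rank]
    sharding_param.sharding_spec.shards[0].placement = torch.distributed._remote_device(
        f"rank:{new_rank}/cuda:{new_rank}"
    )
    return {table_name: sharding_param}

With such a helper, the if i == 3: branch would reduce to a single call: pipeline.progress_with_reshard(dataloader_iter=dataloader, reshard_params=build_flip_plan(existing_sharding_plan, "table_0"), sharded_module_fqn="sparse.ebc").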

torchrec/distributed/embeddingbag.py

Lines changed: 1 addition & 1 deletion

@@ -1764,7 +1764,7 @@ def update_shards(
     # Modifies new_opt_state in place and returns it
     optimizer_state = update_optimizer_state_post_resharding(
         old_opt_state=old_optimizer_state,  # pyre-ignore
-        new_opt_state=copy.deepcopy(self._optim.state_dict()),
+        new_opt_state=self._optim.state_dict(),  # undo deep copy?
         ordered_shard_names_and_lengths=local_shard_names_by_src_rank,
         output_tensor=local_optimizer_tensors,
         max_dim_0=max_dim_0,
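The "# undo deep copy?" note leaves this question open in the commit itself. For intuition only (not from this commit, and only an analogy, since TorchRec's KeyedOptimizer keys its state dict by FQN rather than by index): with a plain torch.optim optimizer, state_dict() already returns references to the live state tensors, so wrapping it in copy.deepcopy duplicates every state tensor before update_optimizer_state_post_resharding consumes the dict.

# Toy check with a plain torch.optim optimizer: state_dict() aliases the live state
# tensors, while deepcopy(state_dict()) materializes a second copy of each one.
import copy

import torch

param = torch.nn.Parameter(torch.randn(4))
opt = torch.optim.Adagrad([param], lr=0.1)
param.sum().backward()
opt.step()

live_sum = opt.state[param]["sum"]  # Adagrad's accumulated squared gradients
shallow = opt.state_dict()
deep = copy.deepcopy(opt.state_dict())

assert shallow["state"][0]["sum"] is live_sum      # same tensor object, no extra memory
assert deep["state"][0]["sum"] is not live_sum     # duplicated tensor: extra memory and copy time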

torchrec/distributed/train_pipeline/train_pipelines.py

Lines changed: 82 additions & 1 deletion

@@ -66,7 +66,7 @@
     DataLoadingThread,
     use_context_for_postprocs,
 )
-from torchrec.distributed.types import Awaitable
+from torchrec.distributed.types import Awaitable, ParameterSharding
 from torchrec.pt2.checks import is_torchdynamo_compiling
 from torchrec.pt2.utils import default_pipeline_input_transformer
 from torchrec.sparse.jagged_tensor import KeyedJaggedTensor

@@ -696,6 +696,87 @@ def progress(self, dataloader_iter: Iterator[In]) -> Out:
         self.dequeue_batch()
         return output

+    def progress_with_reshard(
+        self,
+        dataloader_iter: Iterator[In],
+        reshard_params: Dict[str, ParameterSharding],
+        sharded_module_fqn: Optional[str] = None,
+    ) -> Out:
+        """
+        Resharding changes tensor placements, so this step temporarily undoes the pipeline overlap.
+        """
+        # Assume pipeline batches are not empty:
+        # # attach the model just in case the user forgets to call it, especially when the user
+        # # pauses pipeline.progress and detaches the model for another purpose.
+        # if not self._model_attached:
+        #     self.attach(self._model)
+
+        # # filling the pipeline is only needed at the beginning, when the pipeline (batches) is empty
+        # self.fill_pipeline(dataloader_iter)
+
+        # Assume this is not the last batch
+        # # here is the expected stop after exhausting all batches
+        if not self.batches:
+            raise StopIteration
+        # import fbvscode
+
+        # fbvscode.set_trace()
+        # TODO: Remove once Bulk Eval migrated (needed for bwd compat, this class only)
+        self._set_module_context(self.contexts[0])
+
+        if self._model.training:
+            with record_function("## zero_grad ##"):
+                self._optimizer.zero_grad()
+
+        # wait for batches[0] to be available on device; this should always be completed since
+        # the input_dist of batches[0] has been invoked in the previous iter. TODO: fact check
+        self._wait_for_batch()
+
+        # Assume _enqueue_batch_after_forward is False
+        # if not self._enqueue_batch_after_forward:
+        #     # batch i+2: load data and copy to gpu, the dataload iter will first exhaust here
+        #     self.enqueue_batch(dataloader_iter)
+
+        # But reshard after this.
+        # forward
+        with record_function("## forward ##"):
+            losses, output = self._model_fwd(self.batches[0])
+
+        # if self._enqueue_batch_after_forward:
+        #     # batch i+2: load data and copy to gpu, the dataload iter will first exhaust here.
+        #     # Start this step after the forward of batch i, so that the H2D copy doesn't compete
+        #     # for pcie bandwidth with embedding lookup from UVM/UVM_CACHING.
+        #     self.enqueue_batch(dataloader_iter)
+
+        if self._model.training:
+            # backward
+            self._backward(losses)
+
+            self.sync_embeddings(
+                self._model,
+                self._dmp_collection_sync_interval_batches,
+                self.contexts[0],
+            )
+
+            # update
+            with record_function("## optimizer ##"):
+                self._optimizer.step()
+
+        # Reshard
+        self._model.reshard(  # pyre-ignore
+            sharded_module_fqn=sharded_module_fqn,
+            changed_shard_to_params=reshard_params,
+        )
+
+        # Need to reshard before this.
+        if len(self.batches) >= 2:
+            # invoke splits all_to_all comms (first part of input_dist)
+            self.start_sparse_data_dist(self.batches[1], self.contexts[1])
+            # invoke data (values, lengths, etc.) all_to_all comms (second part of input_dist)
+            self.wait_sparse_data_dist(self.contexts[1])
+        self.dequeue_batch()
+        return output
+
     def _create_context(self) -> TrainPipelineContext:
         context = self._context_type(index=self._next_index, version=1)
         self._next_index += 1
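Stripped of the queueing details, the ordering that progress_with_reshard enforces (the "But reshard after this." and "Need to reshard before this." comments above) is roughly the following. This is a schematic sketch only, not the class's actual code path: reshard_step is a hypothetical free function, and the model is assumed to return (losses, output) the way pipelined TorchRec models do.

# Schematic of a single non-overlapped "reshard step": reshard only after the
# optimizer update for batch i, and only before input_dist is (re)started for
# batch i+1, so the next batch's features are routed to the new shard placements.
import torch


def reshard_step(model, optimizer, batch, reshard_params, sharded_module_fqn="sparse.ebc"):
    # forward/backward/update for batch i run entirely under the old placements
    optimizer.zero_grad()
    losses, output = model(batch)
    torch.sum(losses).backward()
    optimizer.step()
    # move shards (and their optimizer state) only after the update is applied;
    # this is the same reshard call the pipeline makes (pyre-ignored there)
    model.reshard(
        sharded_module_fqn=sharded_module_fqn,
        changed_shard_to_params=reshard_params,
    )
    # only now would the pipeline (re)start input_dist for batch i+1
    return output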
