@@ -76,6 +76,34 @@ class SharderType(Enum):
     EMBEDDING_COLLECTION = "embedding_collection"
 
 
+def _gather_all_tensors(
+    local_tensor: torch.Tensor,
+    world_size: int,
+    pg: Optional[dist.ProcessGroup] = None,
+) -> List[torch.Tensor]:
+ """
85
+ Gathers tensors from all processes in a distributed group.
86
+
87
+ This function collects tensors from all processes in the specified
88
+ process group and returns a list of tensors, where each tensor
89
+ corresponds to the data from one process.
90
+
91
+ Args:
92
+ local_tensor (torch.Tensor): The tensor to be gathered from the local process.
93
+ world_size (int): The number of processes in the distributed group.
94
+ pg (Optional[ProcessGroup]): The process group to use for communication.
95
+ If not provided, a default ProcessGroup will be created.
96
+
97
+ Returns:
98
+ List[torch.Tensor]: A list of tensors gathered from all processes.
99
+ """
100
+    all_local_tensors: List[torch.Tensor] = []
+    for _ in range(world_size):
+        all_local_tensors.append(torch.empty_like(local_tensor))
+    dist.all_gather(all_local_tensors, local_tensor, pg)
+    return all_local_tensors
+
+
 def create_test_sharder(
     sharder_type: str,
     sharding_type: str,
@@ -558,14 +586,10 @@ def dynamic_sharding_test(
     )
 
     # TODO: support non-sharded forward with zero batch size KJT
-    all_local_pred_m1 = []
-    for _ in range(world_size):
-        all_local_pred_m1.append(torch.empty_like(local_m1_pred))
-    dist.all_gather(all_local_pred_m1, local_m1_pred, group=ctx.pg)
-    all_local_pred_m2 = []
-    for _ in range(world_size):
-        all_local_pred_m2.append(torch.empty_like(local_m2_pred))
-    dist.all_gather(all_local_pred_m2, local_m2_pred, group=ctx.pg)
+
+    all_local_pred_m1 = _gather_all_tensors(local_m1_pred, world_size, ctx.pg)
+
+    all_local_pred_m2 = _gather_all_tensors(local_m2_pred, world_size, ctx.pg)
 
     # Compare predictions of sharded vs unsharded models.
     if qcomms_config is None:
@@ -895,10 +919,7 @@ def _custom_hook(input: List[torch.Tensor]) -> None:
 
     # TODO: support non-sharded forward with zero batch size KJT
     if not allow_zero_batch_size:
-        all_local_pred = []
-        for _ in range(world_size):
-            all_local_pred.append(torch.empty_like(local_pred))
-        dist.all_gather(all_local_pred, local_pred, group=pg)
+        all_local_pred = _gather_all_tensors(local_pred, world_size, pg)
 
     # Run second training step of the unsharded model.
     assert optim == EmbOptimType.EXACT_SGD
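
For reference, a minimal single-process sketch (not part of this commit) of how the new `_gather_all_tensors` helper is called. The gloo backend, master address/port, single-rank process group, and the commented import path are illustrative assumptions; in the actual tests the helper runs inside an already-initialized multi-rank group.

```python
import os

import torch
import torch.distributed as dist

# from torchrec.distributed.test_utils.test_sharding import _gather_all_tensors  # hypothetical import path

# Single-process sketch: with world_size == 1 the gathered list simply holds a
# copy of the local tensor, but the call pattern matches the multi-rank tests.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group(backend="gloo", rank=0, world_size=1)

local_pred = torch.arange(4, dtype=torch.float32)  # stand-in for a rank's local predictions
gathered = _gather_all_tensors(local_pred, dist.get_world_size())  # pg omitted -> default group

assert len(gathered) == dist.get_world_size()
assert torch.equal(gathered[0], local_pred)

dist.destroy_process_group()
```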