
Commit a31bd76

TroyGarden authored and meta-codesync[bot] committed
explore using a side stream for two data-dependent all_to_all_single comms (#3440)
Summary:
Pull Request resolved: #3440

# context
* table-wise-row-wise (TWRW) sharding takes advantage of high-bandwidth intra-node comms for the data-intensive row-wise sharded embedding table pooling.
* it uses [two output dist components](https://github.com/meta-pytorch/torchrec/blob/release/v1.3.0/torchrec/distributed/sharding/twrw_sharding.py#L479-L490): an intra-node dist and a cross-node dist. The cross-node dist relies on the data/result of the intra-node dist.
* this data dependency creates a blocking situation on the main cuda (compute) stream, as shown below (nccl:_reduce_scatter for the intra-node dist, nccl:_all_to_all for the cross-node dist) {F1982557282}

# experiment
* the correct approach is to use a side stream to process the data-dependent comms
* without a side stream ([trace](https://drive.google.com/file/d/1lpa-NrBD0IWcpskdN1Lwiu0XcSTe01bW/view?usp=sharing)): the first comm blocks the main stream execution {F1982557422}
* with a side stream ([trace](https://drive.google.com/file/d/1FqNpq4yMx9H6vL47S8KX5dvk2PJv_QGa/view?usp=sharing)): both comms are non-blocking on the main stream {F1982557381}

Reviewed By: spmex

Differential Revision: D82002643

fbshipit-source-id: 00ee3e7b20f4ed0b799b3c8a49a3a5f7566f87c1
1 parent eaea63d commit a31bd76
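Before the diff below, here is a minimal, self-contained sketch of the side-stream pattern described in the summary: two data-dependent all_to_all_single comms launched without blocking the main stream. It assumes an already-initialized NCCL process group and equally sized per-rank CUDA tensors; run_dependent_a2a and its parameters are illustrative names only and are not part of benchmark_comms.py.

import torch
import torch.distributed as dist


def run_dependent_a2a(x: torch.Tensor, pg: dist.ProcessGroup) -> torch.Tensor:
    out1 = torch.zeros_like(x)
    out2 = torch.zeros_like(x)

    # comms1 is launched asynchronously from the main stream
    req1 = dist.all_to_all_single(output=out1, input=x, group=pg, async_op=True)

    # comms2 depends on the result of comms1, so only a side stream waits for it
    side_stream = torch.cuda.Stream()
    out2.record_stream(side_stream)  # keep out2's memory reserved for side-stream work
    with torch.cuda.stream(side_stream):
        req1.wait()  # blocks the side stream, not the main stream
        y = torch.sigmoid(out1) + dist.get_rank(pg)
        req2 = dist.all_to_all_single(output=out2, input=y, group=pg, async_op=True)

    # ... independent compute can overlap with both comms on the main stream ...

    req2.wait()  # the main stream only blocks when the dependent result is needed
    return out2

The commit wires this same pattern into benchmark_comms.py as a2a_async_twice, adding record_function ranges and validation checks around each step, as shown in the diff.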

File tree

1 file changed (+81 −2 lines)

torchrec/distributed/benchmark/benchmark_comms.py

Lines changed: 81 additions & 2 deletions
@@ -148,7 +148,7 @@ def a2a_async_base(
             async_op=True,
         )
 
-    with record_function("## comms validation ##"):
+    with record_function("## comms pre-check ##"):
         # pre-check is performed before comms' done
         pre_checks = _validate(post_comms, ctx).to("cpu", non_blocking=True)
         # need this cuda.event to record the device-to-host data transfer
@@ -159,13 +159,15 @@ def a2a_async_base(
         pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)
 
     ev_d2h.synchronize()  # make sure the pre_checks is available from the cpu side
-    with record_function(f"## post-comms compute: pre-check-{pre_checks}##"):
+    with record_function(f"## comms check and pre-check: {pre_checks} ##"):
         # assertion fails without wait(), this wait() makes the main cuda stream wait
         # for the comms to finish, so the post-comms compute will be blocked until
         # the comms is done
         req.wait()
         checks = _validate(post_comms, ctx).to("cpu", non_blocking=True)
         ev_d2h.record()  # record the device-to-host data transfer
+
+    with record_function("## post-comms compute ##"):
         post_comms = _compute(
             dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=post_comms[0]
         )
@@ -176,6 +178,81 @@ def a2a_async_base(
         assert checks
 
 
+# all_to_all_single with async comms and a side stream
+def a2a_async_twice(
+    _batch_inputs: List[Dict[str, Any]],
+    dim: int,
+    num_mul: int,
+    num_concat: int,
+    ctx: MultiProcessContext,
+) -> None:
+    with record_function("## pre-comms compute ##"):
+        pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)
+
+    with record_function("## pre-allocation ##"):
+        # use zeros instead of empty to make sure no previous data is used
+        post_comms1 = torch.zeros_like(pre_comms)
+        post_comms2 = torch.zeros_like(pre_comms)
+
+    with record_function("## comms1 ##"):
+        req1 = dist.all_to_all_single(
+            output=post_comms1,
+            input=pre_comms,
+            group=ctx.pg,
+            async_op=True,
+        )
+
+    with record_function("## comms1 pre-validation ##"):
+        # pre-check is performed before comms1 is done
+        pre_checks1 = _validate(post_comms1, ctx).to("cpu", non_blocking=True)
+        # need this cuda.event to record the device-to-host data transfer
+        ev_d2h = torch.cuda.Event()
+        ev_d2h.record()
+
+    with record_function("## comms2 ##"):
+        side_stream = torch.cuda.Stream()
+        post_comms2.record_stream(side_stream)
+        with torch.cuda.stream(side_stream):
+            req1.wait()  # let the side stream wait for comms1 to finish
+            pre_comms = torch.sigmoid(post_comms1) + ctx.rank
+            req2 = dist.all_to_all_single(
+                output=post_comms2,
+                input=pre_comms,
+                group=ctx.pg,
+                async_op=True,
+            )
+
+    with record_function("## irrelevant compute1 ##"):
+        pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)
+
+    with record_function("## comms2 pre-validation ##"):
+        # pre-check is performed before comms2 is done, actually even before comms2 starts
+        pre_checks2 = _validate(post_comms2, ctx).to("cpu", non_blocking=True)
+        ev_d2h.record()  # record the device-to-host data transfer
+
+    with record_function("## irrelevant compute2 ##"):
+        pre_comms = _compute(dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx)
+
+    ev_d2h.synchronize()  # make sure the pre_checks are available from the cpu side
+    with record_function(f"## comms1 checks and pre-checks1 {pre_checks1} ##"):
+        req1.wait()  # let the main stream wait for comms1 to finish
+        checks1 = _validate(post_comms1, ctx).to("cpu", non_blocking=True)
+    with record_function(f"## comms2 checks and pre-checks2 {pre_checks2} ##"):
+        req2.wait()  # let the main stream wait for comms2 to finish
+        checks2 = _validate(post_comms2, ctx).to("cpu", non_blocking=True)
+        ev_d2h.record()  # record the device-to-host data transfer
+
+    with record_function("## post-comms compute ##"):
+        post_comms2 = _compute(
+            dim=dim, num_mul=num_mul, num_concat=num_concat, ctx=ctx, x=post_comms2[0]
+        )
+
+    with record_function("## assert ##"):
+        # again, make sure the device-to-host data transfer is done before the assertion
+        ev_d2h.synchronize()
+        assert checks1 and checks2
+
+
 # single-rank runner
 def a2a_single_runner(rank: int, world_size: int, arg: AllToAllSingleRunConfig) -> None:
     # Ensure GPUs are available and we have enough of them
@@ -195,6 +272,8 @@ def a2a_single_runner(rank: int, world_size: int, arg: AllToAllSingleRunConfig)
         func = a2a_sync_base
     elif arg.name.startswith("a2a_async_base"):
        func = a2a_async_base
+    elif arg.name.startswith("a2a_async_twice"):
+        func = a2a_async_twice
    else:
        func = a2a_sync_base
 