
Commit f5bf0c1

JKSenthil authored and facebook-github-bot committed
add all gather string util (#1004)
Summary:
Pull Request resolved: #1004
Reviewed By: galrotem
Differential Revision: D75021212
fbshipit-source-id: fcb01621f6a4d8d8d7ef4166363bf4046bbbefc8
1 parent cb31137 commit f5bf0c1

File tree

2 files changed: +67 -0 lines changed


tests/utils/test_distributed.py

Lines changed: 27 additions & 0 deletions
@@ -20,6 +20,7 @@
 from torch.distributed import ProcessGroup
 from torchtnt.utils.distributed import (
     _validate_global_rank_world_size,
+    all_gather_str,
     all_gather_tensors,
     broadcast_str,
     destroy_process_group,
@@ -610,3 +611,29 @@ def _test_broadcast_str() -> None:

         tc = unittest.TestCase()
         tc.assertEqual(broadcasted_val, "foo")
+
+    @skip_if_not_distributed
+    def test_all_gather_str(self) -> None:
+        backend = "gloo"
+        if torch.cuda.is_available():
+            backend = "nccl"
+
+        spawn_multi_process(2, backend, self._test_all_gather_str)
+
+    @staticmethod
+    def _test_all_gather_str() -> None:
+        if torch.cuda.is_available():
+            torch.cuda.set_device(dist.get_rank())
+
+        val = None
+        if dist.get_rank() == 0:
+            val = "foo"
+        else:
+            val = "barzoo"
+
+        # Test case 1: fixed_buffer_size == len(val)
+        vals = all_gather_str(val)
+
+        tc = unittest.TestCase()
+        tc.assertEqual(vals[0], "foo")
+        tc.assertEqual(vals[1], "barzoo")
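
For orientation, here is a minimal standalone sketch of how the new utility could be driven outside the test harness, assuming a two-rank job launched with `torchrun --nproc_per_node=2` and the gloo backend (the script shape and launch command are illustrative, not part of this commit):

```python
# illustrative demo of all_gather_str across two ranks (not part of the commit)
import torch.distributed as dist

from torchtnt.utils.distributed import all_gather_str


def main() -> None:
    # assumes the env:// rendezvous variables set by torchrun
    dist.init_process_group("gloo")

    # each rank contributes a string; lengths may differ across ranks
    vals = all_gather_str(f"rank-{dist.get_rank()}")

    # every rank receives the same rank-ordered list, e.g. ["rank-0", "rank-1"]
    print(vals)

    dist.destroy_process_group()


if __name__ == "__main__":
    main()
```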

torchtnt/utils/distributed.py

Lines changed: 40 additions & 0 deletions
@@ -772,6 +772,46 @@ def broadcast_str(
     return string


+def all_gather_str(
+    val: str, process_group: Optional[dist.ProcessGroup] = None
+) -> List[str]:
+    """
+    Optimized all-gather of a string that avoids invoking all_gather_object,
+    which is subject to hang issues on nccl.
+
+    Args:
+        val: string to include in the all_gather
+        process_group: the process group to gather in
+
+    Returns:
+        List of the strings from all ranks
+
+    Note:
+        Will construct and use a temporary gloo process group to minimize
+        device-to-host transfers.
+
+    TODO: support fixed_buffer_size
+    """
+
+    if not dist.is_available() or not dist.is_initialized():
+        return [val]
+
+    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+    # use gloo so that we avoid gpu->cpu (device to host) transfers
+    # with get_or_create_gloo_pg(process_group) as gloo_pg:
+
+    # encode the string as a uint8 buffer on the local device
+    buffer = torch.frombuffer(val.encode("utf-8"), dtype=torch.uint8).to(device)
+    # use `all_gather_tensors`, which handles gathering tensors of the
+    # same shape but different lengths (strings may differ in length
+    # across ranks)
+    buffer_strings = all_gather_tensors(buffer, group=process_group)
+
+    result = [bytes(buffer.tolist()).decode("utf-8") for buffer in buffer_strings]
+
+    return result
+
+
 @contextmanager
 def get_or_create_gloo_pg(
     candidate_pg: Optional[dist.ProcessGroup] = None,
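
The added code leans on `all_gather_tensors` to reconcile per-rank length differences. As a rough mental model only (not the library's actual implementation), the usual pad-to-max strategy for gathering variable-length tensors looks like the sketch below; `_gather_variable_length` is a hypothetical name:

```python
# hypothetical sketch of a variable-length all-gather (pad-to-max strategy)
from typing import List, Optional

import torch
import torch.distributed as dist


def _gather_variable_length(
    buffer: torch.Tensor, group: Optional[dist.ProcessGroup] = None
) -> List[torch.Tensor]:
    world_size = dist.get_world_size(group=group)

    # step 1: exchange lengths so every rank knows how much to trim later
    local_len = torch.tensor([buffer.numel()], dtype=torch.int64, device=buffer.device)
    lengths = [torch.zeros_like(local_len) for _ in range(world_size)]
    dist.all_gather(lengths, local_len, group=group)

    # step 2: pad to the max length so all_gather sees identically shaped tensors
    max_len = int(max(l.item() for l in lengths))
    padded = torch.zeros(max_len, dtype=buffer.dtype, device=buffer.device)
    padded[: buffer.numel()] = buffer

    gathered = [torch.zeros_like(padded) for _ in range(world_size)]
    dist.all_gather(gathered, padded, group=group)

    # step 3: trim each gathered tensor back to its true length
    return [t[: int(l.item())] for t, l in zip(gathered, lengths)]
```

Each trimmed uint8 buffer then decodes back to a string via `bytes(t.tolist()).decode("utf-8")`, matching the final step in the diff above.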
