@@ -684,6 +684,69 @@ def wrapper(*args: TParams.args, **kwargs: TParams.kwargs) -> TReturn:
684684 return wrapper
685685
686686
def broadcast_str(
    val: Optional[str],
    src: int = 0,
    process_group: Optional[dist.ProcessGroup] = None,
) -> Optional[str]:
    """
    Broadcasts a string from a source rank to all other ranks in a process group.
    Serializes string as sequence of uint8 and broadcasts as a tensor. This avoids
    issues with broadcast_object_list and related apis which use pickle to serialize objects.

    Args:
        val: the string to broadcast
        src: the source rank to broadcast from
        process_group: the process group to broadcast in. Defaults to the WORLD process group.

    Returns:
        The broadcasted string.

    Note:
        This function issues two collective calls, one to broadcast the size of the serialized string and
        one to broadcast the string itself. This can theoretically be limited to one collective call
        by hardcoding maximum buffer size to use, and filling unused buffer slots with preselected
        null tokens. However, this is not implemented to avoid unnecessary complexity.
    """
    # no-op when torch.distributed is unusable or uninitialized
    if not dist.is_available() or not dist.is_initialized():
        return val

    rank = dist.get_rank(group=process_group)

    # device to use when broadcasting the string; NCCL requires GPU tensors
    device = torch.device(
        torch.cuda.current_device()
        if dist.get_backend(process_group) == "nccl"
        else "cpu"
    )

    # dummy instantiation to keep pyre happy
    buffer = torch.empty((1), dtype=torch.uint8)
    buffer_length = torch.empty((1), dtype=torch.int, device=device)
    if rank == src:
        assert (
            val is not None
        ), "Source rank must provide a string to broadcast, got None"

        # Convert string to tensor. Build from a list of byte values rather than
        # torch.frombuffer: frombuffer raises ValueError on a zero-length buffer
        # (so the empty string would fail) and warns about aliasing the
        # read-only `bytes` object. This also allocates directly on `device`,
        # avoiding an extra host-side tensor + copy.
        encoded = val.encode("utf-8")
        buffer = torch.tensor(list(encoded), dtype=torch.uint8, device=device)
        buffer_length = torch.tensor(len(encoded), dtype=torch.int, device=device)

    # first broadcast the buffer length so receiving ranks can allocate the correct amount of memory
    dist.broadcast(buffer_length, src=src, group=process_group)
    if rank != src:
        size = int(buffer_length.item())
        buffer = torch.empty((size), dtype=torch.uint8, device=device)

    # now broadcast string
    dist.broadcast(buffer, src=src, group=process_group)

    # convert tensor back to string
    string = bytes(buffer.tolist()).decode(encoding="utf-8")
    return string
748+
749+
687750@contextmanager
688751def get_or_create_gloo_pg (
689752 candidate_pg : Optional [dist .ProcessGroup ] = None ,
0 commit comments