
Commit 763e572

VieEeEw authored and facebook-github-bot committed
Updated broadcast_str to use the correct tensor size
Summary: Before the fix, the tensor size on the source rank was [] while on the other ranks it was [1]. Broadcasting from [] to [1] is illegal usage; the bug just happened not to cause any failures.

Reviewed By: diego-urgell, JKSenthil, fduwjj

Differential Revision: D77901586

fbshipit-source-id: 2be33d7a2fcd1113995fcff499b6e90b06c0abb3
1 parent 58be584 · commit 763e572
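
The root cause is that torch.tensor((n)) and torch.tensor([n]) do not produce the same shape. A minimal sketch of the difference (illustration only, not code from this commit):

    import torch

    n = 3
    scalar = torch.tensor((n))   # (n) is just n, so this is a 0-dim tensor: shape torch.Size([])
    vector = torch.tensor([n])   # a 1-element tensor: shape torch.Size([1])

    assert scalar.shape == torch.Size([])
    assert vector.shape == torch.Size([1])

    # torch.distributed.broadcast expects the tensor on every rank to have the
    # same shape, so sending a [] tensor from the source rank while the other
    # ranks hold [1] tensors is the illegal usage described above.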

File tree: 2 files changed, +7 −2 lines

tests/utils/test_distributed_gpu.py

Lines changed: 6 additions & 1 deletion
@@ -86,12 +86,17 @@ def test_spawn_multi_process(self) -> None:
     def test_broadcast_str(self) -> None:
         spawn_multi_process(2, "gloo", self._test_broadcast_str)

+    @skip_if_not_gpu
+    @skip_if_not_distributed
+    def test_broadcast_str_gpu(self) -> None:
+        spawn_multi_process(2, "nccl", self._test_broadcast_str)
+
     @staticmethod
     def _test_broadcast_str() -> None:
         """
         Tests that test_broadcast_str works as expected
         """
-
+        init_from_env()
         val = None
         if dist.get_rank() == 0:
             val = "foo"

torchtnt/utils/distributed.py

Lines changed: 1 addition & 1 deletion
@@ -738,7 +738,7 @@ def broadcast_str(
         # convert string to tensor
         buffer = torch.frombuffer(val.encode("utf-8"), dtype=torch.uint8)
         buffer = buffer.to(device=device)
-        buffer_length = torch.tensor((len(buffer)), dtype=torch.int, device=device)
+        buffer_length = torch.tensor([len(buffer)], dtype=torch.int, device=device)

         if fixed_buffer_size is not None:
             if len(buffer) > fixed_buffer_size:
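
For context, a minimal sketch of why both sides of the broadcast need a shape-[1] length tensor. The broadcast_length helper below is hypothetical and for illustration only; it is not the torchtnt implementation:

    import torch
    import torch.distributed as dist

    def broadcast_length(payload_len, src, device):
        # Hypothetical helper, assuming the process group is already initialized.
        if dist.get_rank() == src:
            # Source rank: shape [1], matching what the fixed line above produces.
            # torch.tensor((payload_len)) would instead be a 0-dim tensor.
            length = torch.tensor([payload_len], dtype=torch.int, device=device)
        else:
            # Non-source ranks allocate a placeholder with the same shape [1].
            length = torch.zeros(1, dtype=torch.int, device=device)
        dist.broadcast(length, src=src)
        return int(length.item())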
