Skip to content

Commit 85414c9

Browse files
tushar00jain authored and facebook-github-bot committed
fix bucketized allreduce (meta-pytorch#278)
Summary:
- Update the callback to work with the new ManagedWork.
- Provide an option to use bucketization via an environment variable.

Differential Revision: D84101245
1 parent d596ec7 commit 85414c9

File tree

1 file changed

+12
-10
lines changed

1 file changed

+12
-10
lines changed

torchft/local_sgd.py

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
import logging
1313
import math
14+
import os
1415
from contextlib import nullcontext
1516
from types import TracebackType
1617
from typing import Any, Dict, List, Optional, Tuple, Type
@@ -25,6 +26,8 @@
2526

2627
logger: logging.Logger = logging.getLogger(__name__)
2728

29+
USE_BUCKETIZATION_ENV: str = "TORCHFT_USE_BUCKETIZATION"
30+
2831

2932
def extract_local_tensor(t: torch.Tensor) -> torch.Tensor:
3033
"""
@@ -171,7 +174,7 @@ def _average(self) -> list[torch.Tensor]:
171174

172175

173176
class _StreamingDiLoCoFragment:
174-
bucket_cap_mb: int = 32 * 1024 * 1024
177+
bucket_cap_mb: int = 1 * 1024 * 1024 * 1024
175178
use_bucketization: bool = False
176179

177180
def __init__(
@@ -220,7 +223,11 @@ def __init__(
220223
if bucket_cap_mb is not None:
221224
self.bucket_cap_mb = int(bucket_cap_mb * 1024 * 1024)
222225

223-
self.use_bucketization = use_bucketization
226+
if os.getenv(USE_BUCKETIZATION_ENV, "False") == "True":
227+
self.use_bucketization = True
228+
else:
229+
self.use_bucketization = use_bucketization
230+
224231
self.should_quantize = should_quantize
225232

226233
self._grads: Dict[str, torch.Tensor] = {}
@@ -535,14 +542,9 @@ def _bucketize_and_allreduce(
535542
def callback(
536543
fut: torch.futures.Future[list[torch.Tensor]],
537544
) -> list[torch.Tensor]:
538-
with torch.cuda.stream(self._stream) if self._stream else nullcontext():
539-
nonlocal bucket_tensors, flat_buffer
540-
# Setup stream dependency
541-
fut.wait()
542-
for t, pack_offset, numel in bucket_tensors:
543-
t.copy_(
544-
flat_buffer[pack_offset : pack_offset + numel].view_as(t)
545-
)
545+
nonlocal bucket_tensors, flat_buffer
546+
for t, pack_offset, numel in bucket_tensors:
547+
t.copy_(flat_buffer[pack_offset : pack_offset + numel].view_as(t))
546548

547549
return []
548550

0 commit comments

Comments
 (0)