Re-enable overlapping of stashing kernels

nanz-nv · vasunvidia · commit 6eb84a02b6b5 · 2026-02-19T15:59:47.000-08:00
diff --git a/megatron/core/transformer/moe/paged_stash.py b/megatron/core/transformer/moe/paged_stash.py
@@ -525,7 +525,7 @@ def __init__(self):
         """Initialize the manager with queues and dedicated CUDA streams."""
         # allocate streams and events for synchronization
         self.enabled = False
-        self._pack_stream = torch.cuda.current_stream()#torch.cuda.Stream()
+        self._pack_stream = torch.cuda.Stream()
         # Currently paged stashing is not stream-safe, so use the same stream for packing
         # and unpacking
         self._unpack_stream = self._pack_stream