50 | 50 | Uint64,
51 | 51 | T,
52 | 52 | Integer,
53 |    | - dsl_user_op,
54 |    | - extract_mlir_values,
55 |    | - new_from_mlir_values,
56 |    | - )
57 |    | - # TODO(asamani): remove Int32 from above?
58 |    | - from cutlass.cute.typing import (
59 |    | - Int32,
60 | 53 | Float16,
61 | 54 | BFloat16,
62 | 55 | Float32,
63 | 56 | Float8E4M3FN,
64 | 57 | Float8E5M2,
65 | 58 | Tensor,
   | 59 | + dsl_user_op,
   | 60 | + extract_mlir_values,
   | 61 | + new_from_mlir_values,
66 | 62 | )
   | 63 | +
   | 64 | + # from cutlass.cute.typing import (
   | 65 | + # Int32,
   | 66 | + # Float16,
   | 67 | + # BFloat16,
   | 68 | + # Float32,
   | 69 | + # Float8E4M3FN,
   | 70 | + # Float8E5M2,
   | 71 | + # Tensor,
   | 72 | + # )
67 | 73 | from cutlass._mlir.dialects import llvm
68 | 74 | from flashinfer.utils import get_compute_capability
69 | 75 | from cutlass.utils.static_persistent_tile_scheduler import WorkTileInfo

@@ -1871,13 +1877,10 @@ def kernel(
1871 | 1877 | * cute.size(self.cluster_shape_mn)
1872 | 1878 | + cute.arch.block_idx_in_cluster()
1873 | 1879 | )
1874 |      | - #cute.printf(tile_id)
1875 | 1880 | if warp_idx == self.epilog_warp_id[0]:
1876 | 1881 | cute.arch.cp_async_bulk_wait_group(0, read=False)
1877 | 1882 | # System barrier to make sure that data from each GPU is in memory before allreduce
1878 | 1883 | with cute.arch.elect_one():
1879 |      | - # cute.printf("EPILOGUE: rank=%d warp=%d tile_id=%d num_executed=%d\n",
1880 |      | - # self.rank_id, warp_idx, tile_id, tile_sched.num_tiles_executed)
1881 | 1884 | flag = barrier_flag_mc.iterator + tile_id
1882 | 1885 | cute.arch.fence_acq_rel_gpu()
1883 | 1886 | distributed_helpers.spin_lock_multimem_arrive(flag)

@@ -1996,8 +1999,6 @@ def kernel(
1996 | 1999 | # System barrier to make sure that data from each GPU is in memory before allreduce
1997 | 2000 | if warp_idx == self.all_reduce_warp_id[0]:
1998 | 2001 | with cute.arch.elect_one():
1999 |      | - # cute.printf("ALLREDUCE: rank=%d warp=%d tile_id=%d num_executed=%d\n",
2000 |      | - # self.rank_id, warp_idx, tile_id, tile_sched.num_tiles_executed)
2001 | 2002 | flag = barrier_flag.iterator + tile_id
2002 | 2003 | # TODO: we may use LDG+STG for spin lock instead of ATOMIC_CAS for better performance.
2003 | 2004 | distributed_helpers.spin_lock_wait(flag, num_ranks)

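Taken together, the two hunks above form a per-tile system barrier: once its partial result is globally visible, the epilogue warp on each rank does a multimem arrive on the tile's flag, and the all-reduce warp then spins on that flag until all ranks have arrived. A rough single-process Python model of that handshake (hypothetical names, for illustration only; the real helpers live in distributed_helpers and operate on the barrier_flag tensor in device memory):

num_tiles = 8            # stand-in for the number of output tiles
num_ranks = 4            # stand-in for the number of GPUs in the allreduce
flags = [0] * num_tiles  # models the per-tile barrier_flag tensor

def arrive(tile_id: int) -> None:
    # Models spin_lock_multimem_arrive: each rank bumps the tile's flag
    # (through the multicast pointer) once its data is in memory.
    flags[tile_id] += 1

def wait(tile_id: int) -> None:
    # Models spin_lock_wait: poll until every rank has arrived for this tile.
    while flags[tile_id] < num_ranks:
        pass

for _ in range(num_ranks):   # in the kernel, each rank arrives exactly once per tile
    arrive(0)
wait(0)                      # returns once all ranks have arrived

The fence_acq_rel_gpu issued before the arrive is what orders the producer's stores ahead of the flag update; the model above glosses over that ordering.
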
@@ -2695,6 +2696,7 @@ def can_implement(
2695 | 2696 | b_major: str,
2696 | 2697 | c_major: str,
2697 | 2698 | all_reduce: str = "none",
     | 2699 | + process_group: Optional[torch.distributed.ProcessGroup] = None,
2698 | 2700 | ) -> bool:
2699 | 2701 | """
2700 | 2702 | Check if the gemm can be implemented

@@ -2753,10 +2755,10 @@ def can_implement(
2753 | 2755 | ):
2754 | 2756 | can_implement = False
2755 | 2757 |
2756 |      | - # check for all reduce constraints
2757 |      | - # TODO(asamani): expand the logic for mnnvl support
     | 2758 | + # Check for all reduce constraints
2758 | 2759 | if all_reduce != "none":
2759 |      | - if torch.distributed.get_world_size() not in [2, 4, 8]:
     | 2760 | + # TODO(asamani): expand the logic for mnnvl support
     | 2761 | + if torch.distributed.get_world_size(process_group) not in [2, 4, 8]:
2760 | 2762 | can_implement = False
2761 | 2763 | return can_implement
2762 | 2764 |

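With the new process_group argument, the world-size constraint is checked against the caller's group rather than the default global group; torch.distributed.get_world_size(group) returns the number of ranks in that group (or in the default group when None is passed), so the old behavior is preserved for existing callers. A minimal sketch of the intended call pattern, assuming torch.distributed has already been initialized (the helper name below is hypothetical):

import torch.distributed as dist

def world_size_supported(process_group=None) -> bool:
    # Mirrors the check above: only 2-, 4-, or 8-rank groups are accepted.
    return dist.get_world_size(process_group) in (2, 4, 8)

# e.g. restrict the allreduce to a sub-group covering half of the ranks:
# sub_group = dist.new_group(ranks=list(range(dist.get_world_size() // 2)))
# ok = world_size_supported(sub_group)
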
@@ -2993,9 +2995,6 @@ def __call__(
2993 | 2995 | order=(0, 1, 2) if self._c_major == "m" else (1, 0, 2),
2994 | 2996 | ),
2995 | 2997 | ) if c_mc_ptr is not None else None
2996 |      | - #TODO(asamani): urgent fix this is just for dev
2997 |      | - # this should be calculated based on how many total tiles we need to work
2998 |      | - # on
2999 | 2998 | barrier_flag_tensor = cute.make_tensor(
3000 | 2999 | barrier_flag_ptr,
3001 | 3000 | layout=cute.make_ordered_layout((barrier_flag_size,), order=(0,)),