
Commit 24ac873

Yifu Wang authored and pytorchmergebot committed
[AsyncMM] re-enable and prepare for cutlass 3.5.1 update (pytorch#144011)
Pull Request resolved: pytorch#144011
Approved by: https://github.com/Skylion007, https://github.com/drisspg
1 parent 73a6a40 commit 24ac873

File tree

torch/csrc/distributed/c10d/cuda/AsyncMM.cu
torch/csrc/distributed/c10d/cuda/cutlass/gemm/kernel/persistent_async_input_scheduler.cuh

2 files changed: +33 -33 lines changed

torch/csrc/distributed/c10d/cuda/AsyncMM.cu

Lines changed: 19 additions & 6 deletions

@@ -5,13 +5,15 @@
 #include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
 #include <c10/cuda/CUDAGuard.h>
 
-#if false && !defined(USE_ROCM) && !defined(_WIN32) && defined(CUDA_VERSION) && \
+#if !defined(USE_ROCM) && !defined(_WIN32) && defined(CUDA_VERSION) && \
     CUDA_VERSION >= 12000
 #define BUILD_ASYNC_MM_KERNEL
 #endif
 
 #if defined(BUILD_ASYNC_MM_KERNEL)
 
+// TODO(yifu): remove this once cutlass 3.5.1 upgrade is completed
+#if CUTLASS_VERSION != 351
 // We are going to override the cuTensorMapEncodeTiled driver api with our lazy
 // loader
 static CUresult CUDAAPI nvrtc_cuTensorMapEncodeTiled(
@@ -56,7 +58,19 @@ static CUresult CUDAAPI nvrtc_cuTensorMapEncodeTiled(
 #include <cute/tensor.hpp>
 #undef cuTensorMapEncodeTiled
 // Set everything back to normal
+// clang-format on
+#else
+#include <cutlass/core_io.h>
+#include <cutlass/cutlass.h>
+#include <cutlass/gemm/device/gemm.h>
+#include <cutlass/half.h>
+#include <cutlass/numeric_types.h>
+#include <cutlass/trace.h>
+#include <cutlass/util/host_tensor.h>
+#include <cute/tensor.hpp>
+#endif
 
+#include <cutlass/version.h>
 #include <cutlass/gemm/collective/collective_builder.hpp>
 #include <cutlass/gemm/device/gemm_universal_adapter.h>
 #include <cutlass/epilogue/collective/collective_builder.hpp>
@@ -65,7 +79,6 @@ static CUresult CUDAAPI nvrtc_cuTensorMapEncodeTiled(
 #include <cutlass/gemm/dispatch_policy.hpp>
 #include <cutlass/gemm/kernel/gemm_universal.hpp>
 #include <cutlass/util/packed_stride.hpp>
-// clang-format on
 
 #include <torch/csrc/distributed/c10d/cuda/cutlass/gemm/kernel/persistent_async_input_scheduler.cuh>
 
@@ -107,7 +120,7 @@ at::Tensor async_input_mm_impl(
       cutlass::epilogue::collective::EpilogueTileAuto,
       ElementAccumulator,
       ElementAccumulator,
-      void,
+      ElementC,
       LayoutC,
       AlignmentC,
       ElementC,
@@ -133,7 +146,7 @@ at::Tensor async_input_mm_impl(
       KernelSchedule>::CollectiveOp;
 
   using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
-      Shape<int, int, int, int>,
+      Shape<int, int, int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      cutlass::gemm::PersistentAsyncInputScheduler<KernelSchedule>>;
@@ -171,15 +184,15 @@ at::Tensor async_input_mm_impl(
 
  typename Gemm::Arguments arguments{
      cutlass::gemm::GemmUniversalMode::kGemm,
-      {M, N, K, 1},
+      {M, N, K},
      {
          reinterpret_cast<ElementA*>(a.data_ptr<at::BFloat16>()),
          stride_A,
          reinterpret_cast<ElementB*>(b.data_ptr<at::BFloat16>()),
          stride_B,
      },
      {{1, 1},
-       nullptr,
+       reinterpret_cast<ElementC*>(out.data_ptr<at::BFloat16>()),
       stride_C,
       reinterpret_cast<ElementC*>(out.data_ptr<at::BFloat16>()),
       stride_C},
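
The AsyncMM.cu change re-enables the kernel (dropping the `false &&` from the build guard) and gates the cuTensorMapEncodeTiled lazy-loader override behind a CUTLASS version check, so the file builds both against the current pin and after the 3.5.1 upgrade; the version check in the diff implies that 3.5.1 corresponds to `CUTLASS_VERSION == 351`. Below is a minimal, self-contained sketch of that gating pattern. It is illustrative only: the fallback `#define` exists solely so the sketch compiles without CUTLASS installed, and the printed messages are not from this PR.

```cpp
// Sketch: gate a code path on the CUTLASS version, mirroring the diff above.
// <cutlass/version.h> provides CUTLASS_VERSION; the stand-in definition below
// is a hypothetical placeholder so this example compiles without CUTLASS.
#include <cstdio>

#ifndef CUTLASS_VERSION
#define CUTLASS_VERSION 351 // stand-in for the value from <cutlass/version.h>
#endif

int main() {
#if CUTLASS_VERSION != 351
  // Pre-3.5.1 path: keep the cuTensorMapEncodeTiled lazy-loader override in
  // place before pulling in the CUTLASS/CuTe headers.
  std::printf("pre-3.5.1 path\n");
#else
  // 3.5.1 path: include the CUTLASS headers directly; no driver-API override.
  std::printf("cutlass 3.5.1 path\n");
#endif
  return 0;
}
```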

torch/csrc/distributed/c10d/cuda/cutlass/gemm/kernel/persistent_async_input_scheduler.cuh

Lines changed: 14 additions & 27 deletions

@@ -263,7 +263,9 @@ public:
       cta_m, cta_n
     );
   }
-  // Kernel helper function to get next work ID
+
+  // TODO(yifu): remove this once cutlass 3.5.1 upgrade is completed
+#if CUTLASS_VERSION != 351
   template <class WorkIdPipeline, class WorkIdPipelineState>
   CUTLASS_DEVICE
   auto
@@ -278,19 +280,18 @@ public:
     // Return true to indicate that the WorkID pipeline state should be advanced
     return cute::make_tuple(new_work_tile_info, true);
   }
-
+#else
   CUTLASS_DEVICE
-  static auto
-  work_tile_to_cta_coord(WorkTileInfo work_tile_info) {
-    // Get every cta coord in three dimensions of the cluster
-    auto [cta_m_in_cluster, cta_n_in_cluster, cta_l_in_cluster] = cute::block_id_in_cluster();
-    return make_coord(
-      work_tile_info.M_idx + static_cast<int32_t>(cta_m_in_cluster),
-      work_tile_info.N_idx + static_cast<int32_t>(cta_n_in_cluster),
-      _,
-      work_tile_info.L_idx + static_cast<int32_t>(cta_l_in_cluster)
-    );
+  auto
+  fetch_next_work(WorkTileInfo work_tile_info) {
+    if (continue_current_work(work_tile_info)) {
+      return work_tile_info;
+    }
+
+    advance_to_next_work();
+    return get_current_work();
   }
+#endif
 
   // Given the inputs, computes the physical grid we should launch.
   template<class ProblemShapeMNKL, class BlockShape, class ClusterShape>
@@ -347,20 +348,6 @@ public:
     );
   }
 
-  // Convert CTA-level work tile info to cluster-level tile coord
-  CUTLASS_DEVICE
-  cute::Coord<int,int,int,int>
-  tile_info_to_coord_mnkl(WorkTileInfo work_tile_info) const {
-    // TileScheduler works at CTA-level, kernel works at cluster-level
-    int m_coord = idx2crd(work_tile_info.M_idx / params.cluster_shape_m_,
-                          params.problem_tiles_m_);
-    int n_coord = idx2crd(work_tile_info.N_idx / params.cluster_shape_n_,
-                          params.problem_tiles_n_);
-    int l_coord = idx2crd(work_tile_info.L_idx,
-                          params.problem_tiles_l_);
-    return make_coord(m_coord, n_coord, _, l_coord);
-  }
-
   // Returns whether the block assigned this work should compute the epilogue for the corresponding
   // output tile. For the basic tile scheduler, this is always true.
   CUTLASS_HOST_DEVICE
@@ -471,7 +458,7 @@ public:
   template <class ProblemShape, class ElementAccumulator>
   static cutlass::Status
   initialize_workspace(Arguments const&, void*, cudaStream_t, ProblemShape, KernelHardwareInfo const&,
-    uint32_t, const uint32_t = 1) {
+    uint32_t, const uint32_t = 1, CudaHostAdapter* cuda_adapter = nullptr) {
     return Status::kSuccess;
   }
 public:
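
In the scheduler header, the pre-3.5.1 `fetch_next_work` (the pipelined, `WorkIdPipeline`-based overload) is kept behind `#if CUTLASS_VERSION != 351`, while the `#else` branch adds a simpler `fetch_next_work(WorkTileInfo)` built on `continue_current_work` / `advance_to_next_work` / `get_current_work`; the `work_tile_to_cta_coord` and `tile_info_to_coord_mnkl` helpers are removed, and `initialize_workspace` gains a defaulted `CudaHostAdapter*` argument, presumably to match the updated scheduler interface. A rough stand-alone mock of the new `fetch_next_work` control flow is sketched below; `MockScheduler` and its members are hypothetical stand-ins, not the real CUTLASS scheduler API.

```cpp
// Sketch: the fetch_next_work control flow added for the CUTLASS 3.5.1 path.
// Everything here is a mock; only the shape of fetch_next_work mirrors the diff.
#include <cstdio>

struct WorkTileInfo {
  int M_idx = 0;
  int N_idx = 0;
  bool valid = true;
};

struct MockScheduler {
  int next_tile_ = 0;

  bool continue_current_work(const WorkTileInfo&) const {
    // The real scheduler returns true while the current tile still has work
    // left; this mock always moves on to the next tile.
    return false;
  }

  void advance_to_next_work() { ++next_tile_; }

  WorkTileInfo get_current_work() const {
    // Produce four valid tiles, then an invalid one to stop the loop.
    return WorkTileInfo{next_tile_, 0, next_tile_ < 4};
  }

  // Same control flow as the 3.5.1 branch in the diff: keep the current tile
  // if it is unfinished, otherwise advance and return the scheduler's next tile.
  WorkTileInfo fetch_next_work(WorkTileInfo work_tile_info) {
    if (continue_current_work(work_tile_info)) {
      return work_tile_info;
    }
    advance_to_next_work();
    return get_current_work();
  }
};

int main() {
  MockScheduler sched;
  WorkTileInfo tile = sched.get_current_work();
  while (tile.valid) {
    std::printf("processing tile (M=%d, N=%d)\n", tile.M_idx, tile.N_idx);
    tile = sched.fetch_next_work(tile);
  }
  return 0;
}
```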
