Commit 778d953

Revert "[AsyncMM] re-enable and prepare for cutlass 3.5.1 update (pytorch#144011)"
This reverts commit 24ac873. Reverted pytorch#144011 on behalf of https://github.com/malfet due to: "Not sure what is going on, but lots of builds are failing" ([comment](pytorch#144011 (comment)))
1 parent f4e9aeb · commit 778d953

File tree

2 files changed: +33 −33 lines

torch/csrc/distributed/c10d/cuda/AsyncMM.cu
torch/csrc/distributed/c10d/cuda/cutlass/gemm/kernel/persistent_async_input_scheduler.cuh

torch/csrc/distributed/c10d/cuda/AsyncMM.cu

Lines changed: 6 additions & 19 deletions
@@ -5,15 +5,13 @@
 #include <ATen/cuda/nvrtc_stub/ATenNVRTC.h>
 #include <c10/cuda/CUDAGuard.h>
 
-#if !defined(USE_ROCM) && !defined(_WIN32) && defined(CUDA_VERSION) && \
+#if false && !defined(USE_ROCM) && !defined(_WIN32) && defined(CUDA_VERSION) && \
     CUDA_VERSION >= 12000
 #define BUILD_ASYNC_MM_KERNEL
 #endif
 
 #if defined(BUILD_ASYNC_MM_KERNEL)
 
-// TODO(yifu): remove this once cutlass 3.5.1 upgrade is completed
-#if CUTLASS_VERSION != 351
 // We are going to override the cuTensorMapEncodeTiled driver api with our lazy
 // loader
 static CUresult CUDAAPI nvrtc_cuTensorMapEncodeTiled(
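
Note: the `#if false &&` edit above is a compile-time kill switch: the constant-false clause makes the whole condition unsatisfiable, so `BUILD_ASYNC_MM_KERNEL` is never defined and everything guarded by it drops out of every build while the code stays in-tree. A minimal standalone sketch of the pattern; the fallback body is hypothetical and not taken from this file:

    // Kill-switch sketch: the leading `false &&` can never be satisfied,
    // so only the fallback branch compiles, on every platform.
    #include <stdexcept>

    #if false && defined(CUDA_VERSION) && CUDA_VERSION >= 12000
    #define BUILD_ASYNC_MM_KERNEL
    #endif

    #if defined(BUILD_ASYNC_MM_KERNEL)
    int async_mm() { return 0; }  // real kernel path would live here
    #else
    // Hypothetical fallback: fail loudly at runtime instead of at compile time.
    int async_mm() { throw std::runtime_error("AsyncMM kernel was not built"); }
    #endif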
@@ -58,19 +56,7 @@ static CUresult CUDAAPI nvrtc_cuTensorMapEncodeTiled(
 #include <cute/tensor.hpp>
 #undef cuTensorMapEncodeTiled
 // Set everything back to normal
-// clang-format on
-#else
-#include <cutlass/core_io.h>
-#include <cutlass/cutlass.h>
-#include <cutlass/gemm/device/gemm.h>
-#include <cutlass/half.h>
-#include <cutlass/numeric_types.h>
-#include <cutlass/trace.h>
-#include <cutlass/util/host_tensor.h>
-#include <cute/tensor.hpp>
-#endif
 
-#include <cutlass/version.h>
 #include <cutlass/gemm/collective/collective_builder.hpp>
 #include <cutlass/gemm/device/gemm_universal_adapter.h>
 #include <cutlass/epilogue/collective/collective_builder.hpp>

@@ -79,6 +65,7 @@ static CUresult CUDAAPI nvrtc_cuTensorMapEncodeTiled(
 #include <cutlass/gemm/dispatch_policy.hpp>
 #include <cutlass/gemm/kernel/gemm_universal.hpp>
 #include <cutlass/util/packed_stride.hpp>
+// clang-format on
 
 #include <torch/csrc/distributed/c10d/cuda/cutlass/gemm/kernel/persistent_async_input_scheduler.cuh>

@@ -120,7 +107,7 @@ at::Tensor async_input_mm_impl(
       cutlass::epilogue::collective::EpilogueTileAuto,
       ElementAccumulator,
       ElementAccumulator,
-      ElementC,
+      void,
       LayoutC,
       AlignmentC,
       ElementC,
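
Note: in the CUTLASS 3.x epilogue `CollectiveBuilder`, the slot changed here is the element type of the C source operand, and `void` tells the builder there is no C to read, so the epilogue writes D from the accumulator alone. A sketch of where that slot sits in the builder; `TileShape`, `ClusterShape`, and the `EpilogueScheduleAuto` schedule are assumed stand-ins following common CUTLASS usage, not names read from this file:

    using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
        cutlass::arch::Sm90, cutlass::arch::OpClassTensorOp,
        TileShape, ClusterShape,
        cutlass::epilogue::collective::EpilogueTileAuto,
        ElementAccumulator,   // compute type for epilogue scaling
        ElementAccumulator,   // accumulator element type
        void,                 // C source element type: void = no C operand is loaded
        LayoutC, AlignmentC,  // layout/alignment slots for C are still required
        ElementC,             // D (output) element type
        LayoutC, AlignmentC,
        cutlass::epilogue::collective::EpilogueScheduleAuto>::CollectiveOp;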
@@ -146,7 +133,7 @@ at::Tensor async_input_mm_impl(
       KernelSchedule>::CollectiveOp;
 
   using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
-      Shape<int, int, int>,
+      Shape<int, int, int, int>,
      CollectiveMainloop,
      CollectiveEpilogue,
      cutlass::gemm::PersistentAsyncInputScheduler<KernelSchedule>>;
@@ -184,15 +171,15 @@ at::Tensor async_input_mm_impl(
 
   typename Gemm::Arguments arguments{
       cutlass::gemm::GemmUniversalMode::kGemm,
-      {M, N, K},
+      {M, N, K, 1},
       {
           reinterpret_cast<ElementA*>(a.data_ptr<at::BFloat16>()),
           stride_A,
           reinterpret_cast<ElementB*>(b.data_ptr<at::BFloat16>()),
           stride_B,
       },
       {{1, 1},
-       reinterpret_cast<ElementC*>(out.data_ptr<at::BFloat16>()),
+       nullptr,
        stride_C,
        reinterpret_cast<ElementC*>(out.data_ptr<at::BFloat16>()),
        stride_C},
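
Note: the `{M, N, K, 1}` literal matches the rank-4 `Shape<int, int, int, int>` problem shape restored above; the fourth mode is the batch count L, fixed at 1. And with the C source element type reverted to `void`, the C pointer slot may be left `nullptr`; `out` feeds only the D slot. A field-by-field sketch; the comments and the `mainloop_args` shorthand are ours, following common CUTLASS 3.x conventions:

    typename Gemm::Arguments arguments{
        cutlass::gemm::GemmUniversalMode::kGemm,
        {M, N, K, 1},   // rank-4 problem shape (M, N, K, L), L = batch count = 1
        mainloop_args,  // hypothetical shorthand for {ptr_A, stride_A, ptr_B, stride_B}
        {{1, 1},        // epilogue scale factors (alpha, beta)
         nullptr,       // C source pointer; legal to leave null since ElementC is void
         stride_C,      // C stride, still supplied to satisfy the argument layout
         reinterpret_cast<ElementC*>(out.data_ptr<at::BFloat16>()),  // D output pointer
         stride_C}};    // D reuses C's stride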

torch/csrc/distributed/c10d/cuda/cutlass/gemm/kernel/persistent_async_input_scheduler.cuh

Lines changed: 27 additions & 14 deletions
@@ -263,9 +263,7 @@ public:
       cta_m, cta_n
     );
   }
-
-  // TODO(yifu): remove this once cutlass 3.5.1 upgrade is completed
-  #if CUTLASS_VERSION != 351
+  // Kernel helper function to get next work ID
   template <class WorkIdPipeline, class WorkIdPipelineState>
   CUTLASS_DEVICE
   auto
@@ -280,18 +278,19 @@ public:
     // Return true to indicate that the WorkID pipeline state should be advanced
     return cute::make_tuple(new_work_tile_info, true);
   }
-  #else
-  CUTLASS_DEVICE
-  auto
-  fetch_next_work(WorkTileInfo work_tile_info) {
-    if (continue_current_work(work_tile_info)) {
-      return work_tile_info;
-    }
 
-    advance_to_next_work();
-    return get_current_work();
+  CUTLASS_DEVICE
+  static auto
+  work_tile_to_cta_coord(WorkTileInfo work_tile_info) {
+    // Get every cta coord in three dimensions of the cluster
+    auto [cta_m_in_cluster, cta_n_in_cluster, cta_l_in_cluster] = cute::block_id_in_cluster();
+    return make_coord(
+      work_tile_info.M_idx + static_cast<int32_t>(cta_m_in_cluster),
+      work_tile_info.N_idx + static_cast<int32_t>(cta_n_in_cluster),
+      _,
+      work_tile_info.L_idx + static_cast<int32_t>(cta_l_in_cluster)
+    );
   }
-  #endif
 
   // Given the inputs, computes the physical grid we should launch.
   template<class ProblemShapeMNKL, class BlockShape, class ClusterShape>
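
Note: the restored `work_tile_to_cta_coord` fans one cluster-wide work assignment out to per-CTA tile coordinates: each CTA adds its own position inside the cluster, from `cute::block_id_in_cluster()`, to the scheduler's base (M, N, L) tile index. A plain-integer sketch of that arithmetic; all values are hypothetical:

    struct WorkTile { int M_idx, N_idx, L_idx; };

    int cta_coord_demo() {
      WorkTile tile{4, 6, 0};    // base tile index handed out by the scheduler
      int cta_m_in_cluster = 1;  // this CTA sits at (m=1, n=0) inside a 2x1 cluster
      int cta_n_in_cluster = 0;
      int m = tile.M_idx + cta_m_in_cluster;  // 5: the M tile this CTA computes
      int n = tile.N_idx + cta_n_in_cluster;  // 6: the N tile this CTA computes
      // K stays a wildcard (_) in the real code; the mainloop sweeps all of K.
      return m + n;
    }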
@@ -348,6 +347,20 @@ public:
     );
   }
 
+  // Convert CTA-level work tile info to cluster-level tile coord
+  CUTLASS_DEVICE
+  cute::Coord<int,int,int,int>
+  tile_info_to_coord_mnkl(WorkTileInfo work_tile_info) const {
+    // TileScheduler works at CTA-level, kernel works at cluster-level
+    int m_coord = idx2crd(work_tile_info.M_idx / params.cluster_shape_m_,
+                          params.problem_tiles_m_);
+    int n_coord = idx2crd(work_tile_info.N_idx / params.cluster_shape_n_,
+                          params.problem_tiles_n_);
+    int l_coord = idx2crd(work_tile_info.L_idx,
+                          params.problem_tiles_l_);
+    return make_coord(m_coord, n_coord, _, l_coord);
+  }
+
   // Returns whether the block assigned this work should compute the epilogue for the corresponding
   // output tile. For the basic tile scheduler, this is always true.
   CUTLASS_HOST_DEVICE
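
Note: the division in the restored `tile_info_to_coord_mnkl` is the whole CTA-to-cluster conversion: `M_idx`/`N_idx` count CTA-sized tiles, the kernel wants cluster-sized coordinates, so each index is divided by the cluster extent along its mode (L is not cluster-tiled and passes through). A worked example with hypothetical sizes:

    int cluster_coord_demo() {
      const int cluster_shape_m = 2;          // CTAs per cluster along M
      const int M_idx = 5;                    // CTA-level tile index from the scheduler
      int m_coord = M_idx / cluster_shape_m;  // 2: cluster-level M coordinate
      // CTA tiles 4 and 5 sit in the same cluster, so both map to m_coord == 2.
      return m_coord;
    }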
@@ -458,7 +471,7 @@ public:
   template <class ProblemShape, class ElementAccumulator>
   static cutlass::Status
   initialize_workspace(Arguments const&, void*, cudaStream_t, ProblemShape, KernelHardwareInfo const&,
-    uint32_t, const uint32_t = 1, CudaHostAdapter* cuda_adapter = nullptr) {
+    uint32_t, const uint32_t = 1) {
     return Status::kSuccess;
   }
 public:
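
Note: the trailing `CudaHostAdapter*` parameter dropped here belongs to the newer CUTLASS workspace-hook interface; the pre-3.5.1 interface this revert restores does not take it, and a custom scheduler must match whichever shape the kernel layer calls even when the hook is a no-op. A paraphrased sketch of the restored no-op hook; the signature shape follows the hunk above, not CUTLASS headers:

    template <class ProblemShape, class ElementAccumulator>
    static cutlass::Status
    initialize_workspace(Arguments const&, void* /*workspace*/, cudaStream_t,
                         ProblemShape, KernelHardwareInfo const&,
                         uint32_t, const uint32_t = 1
                         /* newer CUTLASS adds: CudaHostAdapter* cuda_adapter = nullptr */) {
      return cutlass::Status::kSuccess;  // this scheduler needs no device workspace
    }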
