Skip to content

Commit 06e560d

Browse files
authored
Blockwise/Groupwise kernel improvement and programmatic dependent launch enablement (#2161)
Co-authored-by: dePaul Miller <23461061+depaulmillz@users.noreply.github.com>
1 parent df18f5e commit 06e560d

File tree

3 files changed

+24
-3
lines changed

3 files changed

+24
-3
lines changed

CMakeLists.txt

Lines changed: 15 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -382,7 +382,21 @@ endif()
382382

383383
if (CUTLASS_ENABLE_GDC_FOR_SM90)
384384
message(STATUS "Grid Dependency Control (GDC) is enabled for SM90 kernels (required for programmatic dependent launches).")
385-
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1)
385+
list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM90=1)
386+
endif()
387+
388+
if (NOT DEFINED CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT)
389+
set(CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT ON)
390+
endif()
391+
392+
set(CUTLASS_ENABLE_GDC_FOR_SM100
393+
${CUTLASS_ENABLE_GDC_FOR_SM100_DEFAULT}
394+
CACHE BOOL
395+
"Enables Grid Dependency Control (GDC) for SM100 kernels (required for PDL).")
396+
397+
if (CUTLASS_ENABLE_GDC_FOR_SM100)
398+
message(STATUS "Grid Dependency Control (GDC) is enabled for SM100 kernels (required for programmatic dependent launches).")
399+
list(APPEND CUTLASS_CUDA_FLAGS -DCUTLASS_ENABLE_GDC_FOR_SM100=1)
386400
endif()
387401

388402
set(CUTLASS_ENABLE_SYNCLOG OFF CACHE BOOL "Enable synchronization event logging for race condition debugging. WARNING: This redefines __syncthreads() and __syncwarp() in all downstream code!")

include/cutlass/arch/grid_dependency_control.h

Lines changed: 5 additions & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -46,6 +46,11 @@
4646
defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 && defined(__CUDA_ARCH_FEAT_SM90_ALL))
4747
#define CUTLASS_GDC_ENABLED
4848
#endif
49+
#if (defined(CUTLASS_ENABLE_GDC_FOR_SM100) && \
50+
__CUDACC_VER_MAJOR__ >= 12 && \
51+
defined(__CUDA_ARCH__) && __CUDA_ARCH__ == 1000 && defined(__CUDA_ARCH_FEAT_SM100_ALL))
52+
#define CUTLASS_GDC_ENABLED
53+
#endif
4954
#endif
5055

5156
namespace cutlass {

include/cutlass/gemm/collective/sm100_mma_warpspecialized_blockwise_scaling.hpp

Lines changed: 4 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -281,13 +281,15 @@ struct CollectiveMma<
281281
static constexpr int LeadingScalesPerTileSFA = size<0,1>(LayoutSFA{}.stride()) == 1 ? ScaleMsPerTile : ScaleKsPerTile;
282282
using ScaleCopyTypeA = cute::uint_byte_t<cute::min(static_cast<int>(sizeof(ElementAccumulator)) * LeadingScalesPerTileSFA, 16)>;
283283
using SmemScalingCopyAtomA = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ScaleCopyTypeA>, ElementAccumulator>;
284+
static constexpr int ElementsPerSFACopy = static_cast<int>(sizeof(ScaleCopyTypeA) / sizeof(ElementAccumulator));
284285

285286
static constexpr int LeadingScalesPerTileSFB = size<0,1>(LayoutSFB{}.stride()) == 1 ? ScaleNsPerTile : ScaleKsPerTile;
286287
using ScaleCopyTypeB = cute::uint_byte_t<cute::min(static_cast<int>(sizeof(ElementAccumulator)) * LeadingScalesPerTileSFB, 16)>;
287288
using SmemScalingCopyAtomB = Copy_Atom<SM80_CP_ASYNC_CACHEALWAYS<ScaleCopyTypeB>, ElementAccumulator>;
289+
static constexpr int ElementsPerSFBCopy = static_cast<int>(sizeof(ScaleCopyTypeB) / sizeof(ElementAccumulator));
288290

289-
using TiledCopyScaleA = decltype(make_tiled_copy(SmemScalingCopyAtomA{}, Layout<Shape<_1>>{}, Layout<Shape<Int<LeadingScalesPerTileSFA>>>{}));
290-
using TiledCopyScaleB = decltype(make_tiled_copy(SmemScalingCopyAtomB{}, Layout<Shape<_1>>{}, Layout<Shape<Int<LeadingScalesPerTileSFB>>>{}));
291+
using TiledCopyScaleA = decltype(make_tiled_copy(SmemScalingCopyAtomA{}, Layout<Shape<_1>>{}, Layout<Shape<Int<ElementsPerSFACopy>>>{}));
292+
using TiledCopyScaleB = decltype(make_tiled_copy(SmemScalingCopyAtomB{}, Layout<Shape<_1>>{}, Layout<Shape<Int<ElementsPerSFBCopy>>>{}));
291293

292294
struct SharedStorage {
293295
struct TensorStorage : cute::aligned_struct<128, _0> {

0 commit comments

Comments
 (0)