
Commit 2f40358

[TMA] Move ptxas bug workaround to TMAToLLVM and gate on ptx version (#5408)
There is a bug in the version of `ptxas` we have that treats `global_stride` values as if they were 16-byte strides rather than single-byte strides (presumably because the tensormap structure packs them in that format). This change centralizes the workaround at a single point in the code and gates it on the current PTX version.
1 parent 3563aec commit 2f40358
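For illustration, a minimal standalone sketch of the idea (the helper name is hypothetical, not the Triton code): the stride written into the descriptor stays in bytes on fixed toolchains, and is only shifted right by 4 (divided by 16) when the bundled `ptxas` still interprets it in 16-byte units.

```c++
#include <cstdint>

// Hypothetical sketch of the gating logic, not the actual lowering code.
// On affected toolchains (PTX version <= 8.5 in this change), the global
// stride stored in the tensormap must be pre-divided by 16, so the byte
// stride is arithmetically shifted right by 4.
int64_t encodeGlobalStride(int64_t strideInBytes, int ptxVersion) {
  bool needsStrideWorkaround = ptxVersion <= 85; // 85 == PTX ISA 8.5
  return needsStrideWorkaround ? (strideInBytes >> 4) : strideInBytes;
}
```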

File tree

7 files changed: +238 / -244 lines


include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 1 addition & 0 deletions
@@ -53,6 +53,7 @@ using namespace mlir::triton;
 #define fmin(...) rewriter.create<LLVM::MinNumOp>(loc, __VA_ARGS__)
 #define shl(...) rewriter.create<LLVM::ShlOp>(loc, __VA_ARGS__)
 #define lshr(...) rewriter.create<LLVM::LShrOp>(loc, __VA_ARGS__)
+#define ashr(...) rewriter.create<LLVM::AShrOp>(loc, __VA_ARGS__)
 #define and_(...) rewriter.create<LLVM::AndOp>(loc, __VA_ARGS__)
 #define xor_(...) rewriter.create<LLVM::XOrOp>(loc, __VA_ARGS__)
 #define or_(...) rewriter.create<LLVM::OrOp>(loc, __VA_ARGS__)
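The new `ashr` macro mirrors the neighboring helpers: it expands to an `LLVM::AShrOp` built through the local `rewriter` and `loc`. A rough usage sketch, assuming a 64-bit `stride` Value and the `i64_val` constant helper available in the lowering code:

```c++
// Sketch only: assumes `rewriter`, `loc`, and a 64-bit `stride` Value are in
// scope inside a conversion pattern, as they are where the macro is used.
Value packed = ashr(stride, i64_val(4));
// ...which is shorthand for:
Value packed2 = rewriter.create<LLVM::AShrOp>(loc, stride, i64_val(4));
```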

include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h

Lines changed: 0 additions & 5 deletions
@@ -52,11 +52,6 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
       loc, builder.getI64Type(), builder.getI64IntegerAttr(elemSize));
   Value globalStride = builder.template create<arith::MulIOp>(
       loc, op.getStrides()[0], elemSizeVal);
-  // TODO: Workaround for ptxas bug, remove when we update ptxas
-  Value four = builder.template create<arith::ConstantOp>(
-      loc, builder.getI64Type(), builder.getI64IntegerAttr(4));
-  globalStride =
-      builder.template create<arith::ShRSIOp>(loc, globalStride, four);
 
   int elemTypeEnum;
   switch (elemSize) {

test/TritonGPU/samples/simulated-grouped-gemm.mlir

Lines changed: 227 additions & 234 deletions
Large diff not rendered.

test/TritonNvidiaGPU/tma_lowering.mlir

Lines changed: 1 addition & 2 deletions
@@ -38,8 +38,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 // CHECK-LABEL: make_tensor_descriptor
 // CHECK: %0 = arith.extsi %arg2 : i32 to i64
 // CHECK: %1 = ttg.global_scratch_alloc {alignment = 128 : i32, nbytes = 128 : i32} : !tt.ptr<i8>
-// CHECK: %2 = arith.shrsi %0, %c4_i64 : i64
-// CHECK: tt.experimental_tensormap_create %1, %arg0, [%c32_i32, %c8_i32], [%arg2, %arg1], [%2], [%c1_i32, %c1_i32] {elem_type = 0 : i32, fill_mode = 0 : i32, interleave_layout = 0 : i32, swizzle_mode = 1 : i32} : (!tt.ptr<i8>, !tt.ptr<i8>, i32, i32, i32, i32, i64, i32, i32) -> ()
+// CHECK: tt.experimental_tensormap_create %1, %arg0, [%c32_i32, %c8_i32], [%arg2, %arg1], [%0], [%c1_i32, %c1_i32] {elem_type = 0 : i32, fill_mode = 0 : i32, interleave_layout = 0 : i32, swizzle_mode = 1 : i32} : (!tt.ptr<i8>, !tt.ptr<i8>, i32, i32, i32, i32, i64, i32, i32) -> ()
 // CHECK: tt.experimental_tensormap_fenceproxy_acquire %1 : !tt.ptr<i8>
 // CHECK: tt.reinterpret_tensor_descriptor %1 : !tt.ptr<i8> to !tt.tensordesc<tensor<8x32xi8>>
 tt.func public @make_tensor_descriptor(%arg0: !tt.ptr<i8> {tt.divisibility = 16 : i32}, %arg1: i32 {tt.divisibility = 16 : i32}, %arg2: i32 {tt.divisibility = 16 : i32} ) -> !tt.tensordesc<tensor<8x32xi8>> {

third_party/nvidia/language/cuda/_experimental_tma.py

Lines changed: 0 additions & 2 deletions
@@ -68,8 +68,6 @@ def experimental_device_tensormap_create2d(
     element_size = element_ty.primitive_bitwidth // 8
     element_size_t = core.full([], element_size, core.int64, _builder=_builder)
     global_stride = semantic.mul(element_size_t, global_size[-1], True, _builder)
-    # Undocumented, but global_stride seems to be divided by 16
-    global_stride = semantic.ashr(global_stride, semantic.to_tensor(4, _builder), _builder)
 
     contig_dim_size_in_bytes = element_size * load_size[-1]
     if contig_dim_size_in_bytes > 128:

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TMAToLLVM.cpp

Lines changed: 7 additions & 1 deletion
@@ -249,6 +249,7 @@ struct ExperimentalTensormapCreateOpConversion
     Location loc = op->getLoc();
     auto ctx = getContext();
 
+    bool needsStrideWorkaround = targetInfo.getPtxVersion() <= 85;
     auto smemBase = LLVM::getSharedMemoryBase(loc, rewriter, targetInfo, op);
 
     zero_fill_tma(loc, ctx, rewriter, targetInfo, smemBase);
@@ -264,8 +265,13 @@ struct ExperimentalTensormapCreateOpConversion
                                    op.getGlobalDim()[i]);
     }
     for (int i = 0; i + 1 < op.getRank(); ++i) {
+      auto strideVal = op.getGlobalStride()[i];
+      if (needsStrideWorkaround) {
+        // Workaround for a ptxas bug
+        strideVal = ashr(strideVal, i64_val(4));
+      }
       tensormap_replace_global_stride(loc, ctx, rewriter, smemBase, i,
-                                      op.getGlobalStride()[i]);
+                                      strideVal);
     }
     for (int i = 0; i < op.getRank(); ++i) {
       tensormap_replace_element_stride(loc, ctx, rewriter, smemBase, i,

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TargetInfo.h

Lines changed: 2 additions & 0 deletions
@@ -62,6 +62,8 @@ class TargetInfo : public mlir::triton::TargetInfoBase {
 
   bool supportVectorizedAtomics() const override;
 
+  int getPtxVersion() const { return ptxVersion; }
+
 private:
   int computeCapability;
   int ptxVersion;
