Skip to content

Commit f906b9b

Browse files
authored
[AMD] Make buffer op stride optional (#5908)
We may not be able to deduce it; in that case we cannot set the swizzling factor. Using zero is confusing for such cases, as it can mean broadcasting; we want to explicitly use nullptr instead. Also cleaned up some style nits along the way.
1 parent 56a9adf commit f906b9b

File tree

4 files changed

+84
-70
lines changed

4 files changed

+84
-70
lines changed

test/Conversion/amd/buffer_load_store.mlir

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
99
// CHECK: %[[offset:.*]] = llvm.select %[[c_mask]]
1010
// CHECK: %[[aux:.*]] = llvm.mlir.constant(3 : i32) : i32
1111
// CHECK: rocdl.raw.ptr.buffer.load {{.*}}, %[[offset]], {{.*}}, %[[aux]]
12-
%c0 = arith.constant 0 : i32
13-
%ret = amdgpu.buffer_load %arg0[%offset] cacheModifier = cs stride = %c0 : tensor<128xf32, #blocked0>
12+
%ret = amdgpu.buffer_load %arg0[%offset] cacheModifier = cs : tensor<128xf32, #blocked0>
1413
tt.return
1514
}
1615
}

test/TritonGPU/amd/amd-convert-buffer-ops.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -566,7 +566,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
566566
%5 = tt.addptr %arg0, %1 : !tt.ptr<f32>, i32
567567
%6 = tt.splat %5 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked>
568568
%7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
569-
// CHECK: %[[loaded:.*]] = amdgpu.buffer_atomic_rmw fadd, acq_rel, gpu, %arg1, %[[scalar_ptr]][%[[offset]]] stride = %c0_i32
569+
// CHECK: %[[loaded:.*]] = amdgpu.buffer_atomic_rmw fadd, acq_rel, gpu, %arg1, %[[scalar_ptr]][%[[offset]]]
570570
%8 = tt.atomic_rmw fadd, acq_rel, gpu, %7, %arg1 : (tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked>
571571
tt.return %8 : tensor<1024xf32, #blocked>
572572
}

third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td

Lines changed: 76 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,7 @@ include "TritonAMDGPUAttrDefs.td"
4141

4242

4343
class TT_AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
44-
Op<TritonAMDGPU_Dialect, mnemonic, !listconcat(traits, [])> {
45-
}
44+
Op<TritonAMDGPU_Dialect, mnemonic, !listconcat(traits, [])>;
4645

4746
//
4847
// Interfaces
@@ -53,8 +52,7 @@ def GlobalMemory : Resource<"::mlir::triton::GlobalMemory">;
5352
// ExtractSliceOp
5453
//===----------------------------------------------------------------------===//
5554

56-
def ExtractSliceOp
57-
: TT_AMDGPU_Op<"extract_slice", [Pure]> {
55+
def ExtractSliceOp : TT_AMDGPU_Op<"extract_slice", [Pure]> {
5856
let summary = "extract slice operation";
5957
let description = [{
6058
The "extract_slice" operation enables extracting a slice of a tensor in
@@ -92,8 +90,10 @@ def ExtractSliceOp
9290
size of the slice is determined by the result type.
9391
}];
9492

95-
let arguments = (ins AnyRankedTensor:$source,
96-
DenseI64ArrayAttr:$static_offsets);
93+
let arguments = (ins
94+
AnyRankedTensor:$source,
95+
DenseI64ArrayAttr:$static_offsets
96+
);
9797
let results = (outs AnyRankedTensor:$result);
9898

9999
let builders = [
@@ -117,6 +117,10 @@ def ExtractSliceOp
117117
let hasVerifier = 1;
118118
}
119119

120+
//===----------------------------------------------------------------------===//
121+
// InstructionSchedHint
122+
//===----------------------------------------------------------------------===//
123+
120124
def InstructionSchedHint : TT_AMDGPU_Op<"instruction_sched_hint", []> {
121125
let summary = "A placeholder op for instruction scheduling hints within a basic block";
122126
let description = [{
@@ -156,8 +160,11 @@ def InstructionSchedHint : TT_AMDGPU_Op<"instruction_sched_hint", []> {
156160
let assemblyFormat = [{ attr-dict }];
157161
}
158162

159-
def CondBarrierOp : TT_AMDGPU_Op<"cond_barrier">,
160-
Arguments<(ins I1:$pred)> {
163+
//===----------------------------------------------------------------------===//
164+
// CondBarrierOp
165+
//===----------------------------------------------------------------------===//
166+
167+
def CondBarrierOp : TT_AMDGPU_Op<"cond_barrier"> {
161168
let summary = "Conditionally set barriers to synchronize partial threads in a block";
162169

163170
let description = [{
@@ -170,22 +177,25 @@ def CondBarrierOp : TT_AMDGPU_Op<"cond_barrier">,
170177
NB. This doesn't set any memory fence.
171178
}];
172179

180+
let arguments = (ins I1:$pred);
181+
173182
let assemblyFormat = "$pred attr-dict";
174183
}
175184

176-
//
177-
// AMD Buffer operations.
178-
//
185+
//===----------------------------------------------------------------------===//
186+
// BufferLoadOp
187+
//===----------------------------------------------------------------------===//
188+
179189
def BufferLoadOp : TT_AMDGPU_Op<"buffer_load", [
180190
SameLoadStoreOperandsAndResultEncoding,
181191
AttrSizedOperandSegments,
182192
MemoryEffects<[MemRead<GlobalMemory>]>,
183193
TypesMatchWith<"result element type matches the pointed type of ptr", "result", "ptr", "getPointerTypeToElement($_self)">,
184194
TypesMatchWith<"result and offsets have the same shape", "result", "offsets", "getI32SameShape($_self)">,
185195
TypesMatchWith<"result and mask have the same shape", "result", "mask", "getI1SameShape($_self)",
186-
"($_op.getOperands().size() <= 3) || std::equal_to<>()">,
196+
"(cast<BufferLoadOp>($_op).getMask() == nullptr) || std::equal_to<>()">,
187197
TypesMatchWith<"result and other have the same type", "result", "other", "$_self",
188-
"($_op.getOperands().size() <= 4) || std::equal_to<>()">,
198+
"(cast<BufferLoadOp>($_op).getOther() == nullptr) || std::equal_to<>()">,
189199
]>{
190200
let summary = "Load from a scalar base pointer and a tensor offset";
191201
let description = [{
@@ -201,11 +211,10 @@ def BufferLoadOp : TT_AMDGPU_Op<"buffer_load", [
201211
when it converts to the buffer ops because it is important for optimizing
202212
the cache memory access.
203213
}];
204-
let arguments = (
205-
ins
214+
let arguments = (ins
206215
TT_Ptr:$ptr,
207216
I32Tensor:$offsets,
208-
I32:$stride,
217+
Optional<I32>:$stride,
209218
DefaultValuedAttr<TT_CacheModifierAttr, "::mlir::triton::CacheModifier::NONE">:$cache,
210219
Optional<TT_BoolTensor>:$mask,
211220
Optional<TT_Tensor>:$other
@@ -215,24 +224,29 @@ def BufferLoadOp : TT_AMDGPU_Op<"buffer_load", [
215224
let assemblyFormat = [{
216225
$ptr `[` $offsets `]` (`,` $mask^)? (`,` $other^)?
217226
oilist(`cacheModifier` `=` $cache)
218-
`stride` `=` $stride
227+
(`stride` `=` $stride^)?
219228
attr-dict `:` type($result)
220229
}];
221230
}
222231

232+
//===----------------------------------------------------------------------===//
233+
// BufferAtomicRMWOp
234+
//===----------------------------------------------------------------------===//
235+
223236
def BufferAtomicRMWOp : TT_AMDGPU_Op<"buffer_atomic_rmw", [
237+
AttrSizedOperandSegments,
224238
SameLoadStoreOperandsAndResultEncoding,
225239
MemoryEffects<[MemRead<GlobalMemory>]>,
226240
MemoryEffects<[MemWrite<GlobalMemory>]>,
227241
TypesMatchWith<"result element type matches the value type", "result", "value", "$_self">,
228242
TypesMatchWith<"result element type matches the pointed type of ptr", "result", "ptr", "getPointerTypeToElement($_self)">,
229243
TypesMatchWith<"result and offsets have the same shape", "result", "offsets", "getI32SameShape($_self)">,
230244
TypesMatchWith<"result and mask have the same shape", "result", "mask", "getI1SameShape($_self)",
231-
"($_op.getOperands().size() <= 4) || std::equal_to<>()">,
245+
"(cast<BufferAtomicRMWOp>($_op).getMask() == nullptr) || std::equal_to<>()">,
232246
TypesMatchWith<"value element type matches the pointed type of ptr", "value", "ptr", "getPointerTypeToElement($_self)">,
233247
TypesMatchWith<"value and offsets have the same shape", "value", "offsets", "getI32SameShape($_self)">,
234248
TypesMatchWith<"value and mask have the same shape", "value", "mask", "getI1SameShape($_self)",
235-
"($_op.getOperands().size() <= 4) || std::equal_to<>()">,
249+
"(cast<BufferAtomicRMWOp>($_op).getMask() == nullptr) || std::equal_to<>()">,
236250
]>{
237251
let summary = "Atomic RMW op which reads, modifies, and writes to a scalar base pointer and a tensor offset";
238252
let description = [{
@@ -246,13 +260,12 @@ def BufferAtomicRMWOp : TT_AMDGPU_Op<"buffer_atomic_rmw", [
246260
the address difference between the first elements of each row in bytes. Compiler tries to obtain the `stride`
247261
when it converts to the buffer ops because it is important for optimizing the cache memory access.
248262
}];
249-
let arguments = (
250-
ins
263+
let arguments = (ins
251264
TT_AtomicRMWAttr:$atomic_rmw_op,
252265
TT_Ptr:$ptr,
253266
I32Tensor:$offsets,
254267
TT_Tensor:$value,
255-
I32:$stride,
268+
Optional<I32>:$stride,
256269
TT_MemSemanticAttr:$sem,
257270
TT_MemSyncScopeAttr:$scope,
258271
Optional<TT_BoolTensor>:$mask
@@ -261,46 +274,23 @@ def BufferAtomicRMWOp : TT_AMDGPU_Op<"buffer_atomic_rmw", [
261274

262275
let assemblyFormat = [{
263276
$atomic_rmw_op `,` $sem `,` $scope `,` $value `,` $ptr `[` $offsets `]` (`,` $mask^)?
264-
`stride` `=` $stride
277+
(`stride` `=` $stride^)?
265278
attr-dict `:` type($result)
266279
}];
267280
}
268281

269-
def TTG_UpcastMXFPOp : TT_AMDGPU_Op<"upcast_mxfp", [Pure]> {
270-
let summary = "Convert an mxfp tensor to bf16/fp16";
271-
272-
let hasVerifier = 1;
273-
274-
let description = [{
275-
Compute the bf16 encoded in the given mxfp number as per
276-
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
277-
}];
278-
let arguments = (
279-
ins
280-
TT_Tensor:$src,
281-
TT_Tensor:$scale,
282-
TT_ScaleDotElemTypeAttr:$fp_type,
283-
BoolAttr:$fastMath
284-
);
285-
let results = (outs TT_Tensor:$result);
286-
287-
let assemblyFormat = [{
288-
$src `,` $scale `fp_type` `=` $fp_type attr-dict `:` type($src) `,` type($scale) `->` type($result)
289-
}];
290-
291-
let extraClassDeclaration = [{
292-
static RankedTensorType deduceOutputType(
293-
TypedValue<RankedTensorType> inputTensor, ScaleDotElemType inputElemType, Type outputElemType);
294-
}];
295-
}
282+
//===----------------------------------------------------------------------===//
283+
// BufferStoreOp
284+
//===----------------------------------------------------------------------===//
296285

297286
def BufferStoreOp : TT_AMDGPU_Op<"buffer_store", [
287+
AttrSizedOperandSegments,
298288
SameLoadStoreOperandsEncoding,
299289
MemoryEffects<[MemWrite<GlobalMemory>]>,
300290
TypesMatchWith<"value element type matches the pointed type of ptr", "value", "ptr", "getPointerTypeToElement($_self)">,
301291
TypesMatchWith<"value and offsets have the same shape", "value", "offsets", "getI32SameShape($_self)">,
302292
TypesMatchWith<"value and mask have the same shape", "value", "mask", "getI1SameShape($_self)",
303-
"($_op.getOperands().size() <= 4) || std::equal_to<>()">,
293+
"(cast<BufferStoreOp>($_op).getMask() == nullptr) || std::equal_to<>()">,
304294
]>{
305295
let summary = "Store into scalar base pointer and a tensor offset";
306296
let description = [{
@@ -316,22 +306,53 @@ def BufferStoreOp : TT_AMDGPU_Op<"buffer_store", [
316306
when it converts to the buffer ops because it is important for optimizing
317307
the cache memory access.
318308
}];
319-
let arguments = (
320-
ins
309+
let arguments = (ins
321310
TT_Tensor:$value,
322311
TT_Ptr:$ptr,
323312
I32Tensor:$offsets,
324-
I32:$stride,
313+
Optional<I32>:$stride,
325314
DefaultValuedAttr<TT_CacheModifierAttr, "mlir::triton::CacheModifier::NONE">:$cache,
326315
Optional<TT_BoolTensor>:$mask
327316
);
328317

329318
let assemblyFormat = [{
330319
$value `,` $ptr `[` $offsets `]` (`,` $mask^)?
331320
oilist(`cacheModifier` `=` $cache)
332-
`stride` `=` $stride
321+
(`stride` `=` $stride^)?
333322
attr-dict `:` type($value)
334323
}];
335324
}
336325

326+
//===----------------------------------------------------------------------===//
327+
// UpcastMXFPOp
328+
//===----------------------------------------------------------------------===//
329+
330+
def TTG_UpcastMXFPOp : TT_AMDGPU_Op<"upcast_mxfp", [Pure]> {
331+
let summary = "Convert an mxfp tensor to bf16/fp16";
332+
333+
let hasVerifier = 1;
334+
335+
let description = [{
336+
Compute the bf16 encoded in the given mxfp number as per
337+
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
338+
}];
339+
let arguments = (
340+
ins
341+
TT_Tensor:$src,
342+
TT_Tensor:$scale,
343+
TT_ScaleDotElemTypeAttr:$fp_type,
344+
BoolAttr:$fastMath
345+
);
346+
let results = (outs TT_Tensor:$result);
347+
348+
let assemblyFormat = [{
349+
$src `,` $scale `fp_type` `=` $fp_type attr-dict `:` type($src) `,` type($scale) `->` type($result)
350+
}];
351+
352+
let extraClassDeclaration = [{
353+
static RankedTensorType deduceOutputType(
354+
TypedValue<RankedTensorType> inputTensor, ScaleDotElemType inputElemType, Type outputElemType);
355+
}];
356+
}
357+
337358
#endif

third_party/amd/lib/TritonAMDGPUTransforms/ConvertToBufferOps.cpp

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -237,22 +237,16 @@ Value getBlockStride(Location loc, Value offset, PatternRewriter &rewriter) {
237237
// canonicalize pointer pass sets block stride via
238238
// `offset:add-broadcast-muli-splat`, backtrace that pattern to reach the
239239
// stride.
240-
if (auto maybeAdd = offset.getDefiningOp<arith::AddIOp>()) {
241-
for (auto addOpr : maybeAdd.getOperands()) {
240+
if (auto maybeAdd = offset.getDefiningOp<arith::AddIOp>())
241+
for (auto addOpr : maybeAdd.getOperands())
242242
if (auto maybeBC = addOpr.getDefiningOp<tt::BroadcastOp>()) {
243243
auto bcSrc = maybeBC.getSrc();
244-
if (auto maybeMul = bcSrc.getDefiningOp<arith::MulIOp>()) {
245-
for (auto mulOpr : maybeMul.getOperands()) {
246-
if (auto maybeSplat = mulOpr.getDefiningOp<tt::SplatOp>()) {
244+
if (auto maybeMul = bcSrc.getDefiningOp<arith::MulIOp>())
245+
for (auto mulOpr : maybeMul.getOperands())
246+
if (auto maybeSplat = mulOpr.getDefiningOp<tt::SplatOp>())
247247
return maybeSplat.getSrc();
248-
}
249-
}
250-
}
251248
}
252-
}
253-
}
254-
return rewriter.create<arith::ConstantIntOp>(loc, 0, 32);
255-
;
249+
return nullptr;
256250
}
257251

258252
} // namespace

0 commit comments

Comments
 (0)