
Commit caac355

Merge commit '2a04155bd063630a2b59b0d437d922b12828fbbd'
2 parents 71a23b2 + 2a04155 commit caac355

File tree: 12 files changed (+274, -71 lines)

RELEASE.md

Lines changed: 50 additions & 1 deletion
@@ -1,4 +1,53 @@
-# Release Process
+# Releasing Triton
+
+Triton releases provide a stable snapshot of the code base, packaged as a binary that can easily be consumed through PyPI. Releases are also the points at which we, the development team, signal to the community which new features are available, what improvements have been made, and which upcoming changes may affect them (i.e. breaking changes).
+
+## Release Compatibility Matrix
+
+Following is the release compatibility matrix for Triton releases:
+
+| Triton version | Python version | Manylinux version |
+| --- | --- | --- |
+| 3.2.0 | >=3.9, <=3.13 | glibc 2.17+ x86-64 |
+| 3.1.0 | >=3.8, <=3.12 | glibc 2.17+ x86-64 |
+| 3.0.0 | >=3.8, <=3.12 | glibc 2.17+ x86-64 |
+| 2.3.1 | >=3.7, <=3.12 | glibc 2.17+ x86-64 |
+| 2.3.0 | >=3.7, <=3.12 | glibc 2.17+ x86-64 |
+| 2.2.0 | >=3.7, <=3.12 | glibc 2.17+ x86-64 |
+| 2.1.0 | >=3.7, <=3.11 | glibc 2.17+ x86-64 |
+| 2.0.0 | >=3.6, <=3.11 | glibc 2.17+ x86-64 |
+| 1.1.1 | >=3.6, <=3.9 | glibc 2.17+ x86-64 |
+| 1.1.0 | >=3.6, <=3.9 | glibc 2.17+ x86-64 |
+| 1.0.0 | >=3.6, <=3.9 | glibc 2.17+ x86-64 |
+
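As a quick illustration of how to read this matrix (editorial, not part of the diff), the sketch below checks the running interpreter against the published range for a given release; the `SUPPORTED` dict transcribes two rows of the table above.

```python
import sys

# Transcribed from the compatibility matrix above (two rows shown for brevity).
SUPPORTED = {
    "3.2.0": ((3, 9), (3, 13)),
    "3.1.0": ((3, 8), (3, 12)),
}

def python_ok(triton_version: str) -> bool:
    low, high = SUPPORTED[triton_version]
    return low <= sys.version_info[:2] <= high

print(python_ok("3.2.0"))  # True on any Python from 3.9 through 3.13
```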
+## Release Cadence
+
+Following is the release cadence for 2024/2025. All future release dates below are tentative. Please note: patch releases are optional.
+
+| Minor version | Release branch cut | Release date | Patch release date |
+| --- | --- | --- | --- |
+| 3.5.0 | Sep 2025 | Oct 2025 | --- |
+| 3.4.0 | Jun 2025 | Jul 2025 | --- |
+| 3.3.0 | Feb/Mar 2025 | Apr 2025 | --- |
+| 3.2.0 | Dec 2024 | Jan 2025 | --- |
+| 3.1.0 | Jun 2024 | Oct 2024 | --- |
+| 3.0.0 | Jun 2024 | Jul 2024 | --- |
+| 2.3.0 | Dec 2023 | Apr 2024 | May 2024 |
+| 2.2.0 | Dec 2023 | Jan 2024 | --- |
+
+## Release Cherry-Pick Criteria
+
+After the branch cut, we finalize the release branch using clear criteria for which cherry-picks are allowed in. (A cherry-pick is the process of landing a PR on the release branch after the branch cut.) Cherry-picks are deliberately limited so that the team has sufficient time to complete a thorough round of testing on a stable code base. The allowed categories are:
+
+* Regression fixes that address a functional or performance regression against the most recent release (e.g. 3.2 for the 3.3 release)
+* Critical fixes for severe issues such as silent incorrectness, broken backwards compatibility, crashes, deadlocks, and (large) memory leaks
+* Fixes to new features introduced in the most recent release (e.g. 3.2 for the 3.3 release)
+* Documentation improvements
+* Release-branch-specific changes (e.g. changing version identifiers, CI fixes)
+
+Please note: **no feature work is allowed in cherry-picks**. All PRs considered for cherry-picking must first be merged on trunk; the only exception is release-branch-specific changes. An issue for tracking cherry-picks to the release branch is created after the branch cut. **Only issues marked 'cherry-picks' in the issue tracker will be considered for the release.**
+
+# Intel Release Process
 
 Intel XPU Backend for Triton releases are aligned to the upstream `triton-lang/triton` project and to `PyTorch`. To make a release:

lib/Analysis/Utility.cpp

Lines changed: 1 addition & 1 deletion
@@ -302,7 +302,7 @@ bool ScanLoweringHelper::isSupported() {
 }
 
 unsigned ScanLoweringHelper::getScratchSizeInElems() {
-  unsigned numWarps = lookupNumWarps(scanOp);
+  unsigned numWarps = product(getEncoding().getWarpsPerCTA());
   unsigned numNonAxisElementsPerWarp =
       getNonAxisNumThreadsPerWarp() * getNonAxisNumElementsPerThread();
   unsigned numElements = numWarps * numNonAxisElementsPerWarp *
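The replacement derives the warp count from the scan result's own layout rather than looking it up from the surrounding op. A minimal Python rendering of the quantity now computed, assuming a layout carrying a `warps_per_cta` tuple (hypothetical name):

```python
from math import prod

warps_per_cta = (2, 2)           # hypothetical layout attribute: a 2x2 warp grid
num_warps = prod(warps_per_cta)  # what product(getEncoding().getWarpsPerCTA()) computes
print(num_warps)                 # -> 4
```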

lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp

Lines changed: 1 addition & 12 deletions
@@ -372,18 +372,7 @@ static std::optional<ttg::SharedEncodingTrait>
 getSharedEncoding(Operation *loadOp, bool isTMALoad) {
   auto ty = cast<RankedTensorType>(loadOp->getResultTypes()[0]);
   auto ctaLayout = ttg::getCTALayout(ty.getEncoding());
-  auto blockedOrder = ttg::getOrder(ty.getEncoding());
-  SmallVector<unsigned> order;
-  if (blockedOrder.size() == 3) {
-    for (unsigned i = 0; i < blockedOrder.size(); ++i) {
-      if (blockedOrder[i] == 0)
-        continue;
-      order.push_back(blockedOrder[i]);
-    }
-    order.push_back(0);
-  } else {
-    order = blockedOrder;
-  }
+  auto order = ttg::getOrder(ty.getEncoding());
 
   ttg::SharedEncodingTrait localAllocEnc;
   if (llvm::any_of(loadOp->getUsers(), [&](Operation *user) {
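For reference, here is the deleted special case in Python form (illustrative only): for rank-3 layouts the old code rotated dimension 0 to the end of the order, while the new code simply takes the layout's order unchanged.

```python
def old_order(blocked_order):
    # Removed behavior: for rank-3 orders, move dim 0 to the back.
    if len(blocked_order) == 3:
        return [d for d in blocked_order if d != 0] + [0]
    return list(blocked_order)

print(old_order([2, 0, 1]))  # -> [2, 1, 0]
print(old_order([1, 0]))     # -> [1, 0] (non-rank-3 orders pass through)
```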

python/triton/runtime/jit.py

Lines changed: 6 additions & 2 deletions
@@ -587,6 +587,9 @@ def run(self, *args, grid, warmup, **kwargs):
                        *bound_args.values())
         return kernel
 
+    def repr(self, _):
+        return self._fn_name if self._repr is None else self._repr(_)
+
     def __init__(self, fn, version=None, do_not_specialize=None, do_not_specialize_on_alignment=None, debug=None,
                  noinline=None, repr=None, launch_metadata=None):
         do_not_specialize = do_not_specialize if do_not_specialize else []
@@ -599,7 +602,8 @@ def __init__(self, fn, version=None, do_not_specialize=None, do_not_specialize_on_alignment=None, debug=None,
         self.do_not_specialize = do_not_specialize
         self.do_not_specialize_on_alignment = do_not_specialize_on_alignment
         self.starting_line_number = inspect.getsourcelines(fn)[1]
-        self.repr = lambda _: fn.__name__ if repr is None else repr(_)
+        self._repr = repr
+        self._fn_name = fn.__name__
         self.launch_metadata = launch_metadata
 
         self.params = []
@@ -613,7 +617,7 @@ def __init__(self, fn, version=None, do_not_specialize=None, do_not_specialize_on_alignment=None, debug=None,
             src = src[re.search(r"^def\s+\w+\s*\(", src, re.MULTILINE).start():]
         self._unsafe_update_src(src)
         # cache of just-in-time compiled kernels
-        self.device_caches = defaultdict(lambda: self.create_binder())
+        self.device_caches = defaultdict(self.create_binder)
         self.hash = None
 
         # Map of global variables used by the function and any functions it
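Both hunks replace closures over local variables with plain attributes and a bound method. A minimal standalone sketch of the resulting pattern (hypothetical class, not Triton's actual code):

```python
from collections import defaultdict

class Kernel:
    def __init__(self, fn, repr=None):
        # Keep the pieces as attributes instead of capturing fn/repr in a lambda.
        self._repr = repr
        self._fn_name = fn.__name__
        # A bound method is already a zero-argument callable, so it can be
        # passed to defaultdict directly -- no wrapping lambda needed.
        self.device_caches = defaultdict(self.create_binder)

    def repr(self, spec):
        return self._fn_name if self._repr is None else self._repr(spec)

    def create_binder(self):
        return {}

k = Kernel(fn=print)
print(k.repr(None))            # -> "print" (falls back to the function name)
_ = k.device_caches["cuda:0"]  # factory runs on first access
```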

test/Analysis/test-allocation.mlir

Lines changed: 10 additions & 0 deletions
@@ -615,4 +615,14 @@ tt.func @call_graph_2(%A : !tt.ptr<f16>, %cond : i1) {
   // CHECK-NEXT: size = 1024
 }
 
+// CHECK-LABEL: scan_alloc
+tt.func @scan_alloc(%x : tensor<8x16xf32, #AL>) {
+  // CHECK: offset = 0, size = 512
+  %a = "tt.scan"(%x) <{axis = 0 : i32, reverse = false}>({
+  ^bb0(%arg0: f32, %arg1: f32):
+    %add = arith.addf %arg0, %arg1 : f32
+    tt.scan.return %add : f32
+  }) : (tensor<8x16xf32, #AL>) -> tensor<8x16xf32, #AL>
+  tt.return
+}
 }
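A quick sanity check of the `size = 512` expectation, assuming the scan scratch here holds one f32 per element of the 8x16 input:

```python
rows, cols, bytes_per_f32 = 8, 16, 4
print(rows * cols * bytes_per_f32)  # -> 512, matching the CHECK line above
```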

test/Conversion/amd/buffer_load_store.mlir

Lines changed: 1 addition & 2 deletions
@@ -9,8 +9,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32} {
     // CHECK: %[[offset:.*]] = llvm.select %[[c_mask]]
     // CHECK: %[[aux:.*]] = llvm.mlir.constant(3 : i32) : i32
     // CHECK: rocdl.raw.ptr.buffer.load {{.*}}, %[[offset]], {{.*}}, %[[aux]]
-    %c0 = arith.constant 0 : i32
-    %ret = amdgpu.buffer_load %arg0[%offset] cacheModifier = cs stride = %c0 : tensor<128xf32, #blocked0>
+    %ret = amdgpu.buffer_load %arg0[%offset] cacheModifier = cs : tensor<128xf32, #blocked0>
     tt.return
   }
 }

test/TritonGPU/amd/amd-convert-buffer-ops.mlir

Lines changed: 1 addition & 1 deletion
@@ -566,7 +566,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
     %5 = tt.addptr %arg0, %1 : !tt.ptr<f32>, i32
     %6 = tt.splat %5 : !tt.ptr<f32> -> tensor<1024x!tt.ptr<f32>, #blocked>
     %7 = tt.addptr %6, %4 : tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xi32, #blocked>
-    // CHECK: %[[loaded:.*]] = amdgpu.buffer_atomic_rmw fadd, acq_rel, gpu, %arg1, %[[scalar_ptr]][%[[offset]]] stride = %c0_i32
+    // CHECK: %[[loaded:.*]] = amdgpu.buffer_atomic_rmw fadd, acq_rel, gpu, %arg1, %[[scalar_ptr]][%[[offset]]]
     %8 = tt.atomic_rmw fadd, acq_rel, gpu, %7, %arg1 : (tensor<1024x!tt.ptr<f32>, #blocked>, tensor<1024xf32, #blocked>) -> tensor<1024xf32, #blocked>
     tt.return %8 : tensor<1024xf32, #blocked>
 }

third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td

Lines changed: 77 additions & 28 deletions
@@ -41,8 +41,7 @@ include "TritonAMDGPUAttrDefs.td"
 
 
 class TT_AMDGPU_Op<string mnemonic, list<Trait> traits = []> :
-    Op<TritonAMDGPU_Dialect, mnemonic, !listconcat(traits, [])> {
-}
+    Op<TritonAMDGPU_Dialect, mnemonic, !listconcat(traits, [])>;
 
 //
 // Interfaces
@@ -53,8 +52,7 @@ def GlobalMemory : Resource<"::mlir::triton::GlobalMemory">;
 // ExtractSliceOp
 //===----------------------------------------------------------------------===//
 
-def ExtractSliceOp
-    : TT_AMDGPU_Op<"extract_slice", [Pure]> {
+def ExtractSliceOp : TT_AMDGPU_Op<"extract_slice", [Pure]> {
   let summary = "extract slice operation";
   let description = [{
     The "extract_slice" operation enables extracting a slice of a tensor in
@@ -92,8 +90,10 @@ def ExtractSliceOp
     size of the slice is determined by the result type.
   }];
 
-  let arguments = (ins AnyRankedTensor:$source,
-                       DenseI64ArrayAttr:$static_offsets);
+  let arguments = (ins
+    AnyRankedTensor:$source,
+    DenseI64ArrayAttr:$static_offsets
+  );
   let results = (outs AnyRankedTensor:$result);
 
   let builders = [
@@ -117,6 +117,10 @@ def ExtractSliceOp
   let hasVerifier = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// InstructionSchedHint
+//===----------------------------------------------------------------------===//
+
 def InstructionSchedHint : TT_AMDGPU_Op<"instruction_sched_hint", []> {
   let summary = "A placeholder op for instruction scheduling hints within a basic block";
   let description = [{
@@ -156,8 +160,11 @@ def InstructionSchedHint : TT_AMDGPU_Op<"instruction_sched_hint", []> {
   let assemblyFormat = [{ attr-dict }];
 }
 
-def CondBarrierOp : TT_AMDGPU_Op<"cond_barrier">,
-    Arguments<(ins I1:$pred)> {
+//===----------------------------------------------------------------------===//
+// CondBarrierOp
+//===----------------------------------------------------------------------===//
+
+def CondBarrierOp : TT_AMDGPU_Op<"cond_barrier"> {
   let summary = "Conditionally set barriers to synchronize partial threads in a block";
 
   let description = [{
@@ -170,22 +177,25 @@ def CondBarrierOp : TT_AMDGPU_Op<"cond_barrier">,
     NB. This doesn't set any memory fence.
   }];
 
+  let arguments = (ins I1:$pred);
+
   let assemblyFormat = "$pred attr-dict";
 }
 
-//
-// AMD Buffer operations.
-//
+//===----------------------------------------------------------------------===//
+// BufferLoadOp
+//===----------------------------------------------------------------------===//
+
 def BufferLoadOp : TT_AMDGPU_Op<"buffer_load", [
     SameLoadStoreOperandsAndResultEncoding,
     AttrSizedOperandSegments,
    MemoryEffects<[MemRead<GlobalMemory>]>,
    TypesMatchWith<"result element type matches the pointed type of ptr", "result", "ptr", "getPointerTypeToElement($_self)">,
    TypesMatchWith<"result and offsets have the same shape", "result", "offsets", "getI32SameShape($_self)">,
    TypesMatchWith<"result and mask have the same shape", "result", "mask", "getI1SameShape($_self)",
-                   "($_op.getOperands().size() <= 3) || std::equal_to<>()">,
+                   "(cast<BufferLoadOp>($_op).getMask() == nullptr) || std::equal_to<>()">,
    TypesMatchWith<"result and other have the same type", "result", "other", "$_self",
-                   "($_op.getOperands().size() <= 4) || std::equal_to<>()">,
+                   "(cast<BufferLoadOp>($_op).getOther() == nullptr) || std::equal_to<>()">,
 ]>{
   let summary = "Load from a scalar base pointer and a tensor offset";
   let description = [{
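Why the verifier predicates changed: once `$stride` becomes optional (see the hunks below), an operand-count test like `getOperands().size() <= 3` no longer tells you whether `$mask` is present. A Python illustration of the ambiguity (hypothetical operand lists, not real API):

```python
# Operand stack for buffer_load: (ptr, offsets, stride?, mask?, other?).
with_mask_no_stride = ["ptr", "offsets", "mask"]    # mask present
with_stride_no_mask = ["ptr", "offsets", "stride"]  # mask absent

# Old-style predicate: "a mask exists iff there are more than 3 operands".
for ops in (with_mask_no_stride, with_stride_no_mask):
    print(len(ops) > 3)  # False both times, even though one has a mask

# Querying the named operand directly (the getMask() == nullptr check) stays
# correct no matter which combination of optional operands is present.
```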
@@ -201,11 +211,10 @@ def BufferLoadOp : TT_AMDGPU_Op<"buffer_load", [
     when it converts to the buffer ops because it is important for optimizing
     the cache memory access.
   }];
-  let arguments = (
-    ins
+  let arguments = (ins
     TT_Ptr:$ptr,
     I32Tensor:$offsets,
-    I32:$stride,
+    Optional<I32>:$stride,
     DefaultValuedAttr<TT_CacheModifierAttr, "::mlir::triton::CacheModifier::NONE">:$cache,
     Optional<TT_BoolTensor>:$mask,
     Optional<TT_Tensor>:$other
@@ -215,24 +224,29 @@ def BufferLoadOp : TT_AMDGPU_Op<"buffer_load", [
   let assemblyFormat = [{
     $ptr `[` $offsets `]` (`,` $mask^)? (`,` $other^)?
     oilist(`cacheModifier` `=` $cache)
-    `stride` `=` $stride
+    (`stride` `=` $stride^)?
     attr-dict `:` type($result)
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// BufferAtomicRMWOp
+//===----------------------------------------------------------------------===//
+
 def BufferAtomicRMWOp : TT_AMDGPU_Op<"buffer_atomic_rmw", [
+    AttrSizedOperandSegments,
     SameLoadStoreOperandsAndResultEncoding,
     MemoryEffects<[MemRead<GlobalMemory>]>,
     MemoryEffects<[MemWrite<GlobalMemory>]>,
     TypesMatchWith<"result element type matches the value type", "result", "value", "$_self">,
     TypesMatchWith<"result element type matches the pointed type of ptr", "result", "ptr", "getPointerTypeToElement($_self)">,
     TypesMatchWith<"result and offsets have the same shape", "result", "offsets", "getI32SameShape($_self)">,
     TypesMatchWith<"result and mask have the same shape", "result", "mask", "getI1SameShape($_self)",
-                   "($_op.getOperands().size() <= 4) || std::equal_to<>()">,
+                   "(cast<BufferAtomicRMWOp>($_op).getMask() == nullptr) || std::equal_to<>()">,
     TypesMatchWith<"value element type matches the pointed type of ptr", "value", "ptr", "getPointerTypeToElement($_self)">,
     TypesMatchWith<"value and offsets have the same shape", "value", "offsets", "getI32SameShape($_self)">,
     TypesMatchWith<"value and mask have the same shape", "value", "mask", "getI1SameShape($_self)",
-                   "($_op.getOperands().size() <= 4) || std::equal_to<>()">,
+                   "(cast<BufferAtomicRMWOp>($_op).getMask() == nullptr) || std::equal_to<>()">,
 ]>{
   let summary = "Atomic RMW op which reads, modifies, and writes to a scalar base pointer and a tensor offset";
   let description = [{
@@ -246,13 +260,12 @@ def BufferAtomicRMWOp : TT_AMDGPU_Op<"buffer_atomic_rmw", [
     the address difference between the first elements of each row in bytes. Compiler tries to obtain the `stride`
     when it converts to the buffer ops because it is important for optimizing the cache memory access.
   }];
-  let arguments = (
-    ins
+  let arguments = (ins
     TT_AtomicRMWAttr:$atomic_rmw_op,
     TT_Ptr:$ptr,
     I32Tensor:$offsets,
     TT_Tensor:$value,
-    I32:$stride,
+    Optional<I32>:$stride,
     TT_MemSemanticAttr:$sem,
     TT_MemSyncScopeAttr:$scope,
     Optional<TT_BoolTensor>:$mask
@@ -261,18 +274,23 @@ def BufferAtomicRMWOp : TT_AMDGPU_Op<"buffer_atomic_rmw", [
 
   let assemblyFormat = [{
     $atomic_rmw_op `,` $sem `,` $scope `,` $value `,` $ptr `[` $offsets `]` (`,` $mask^)?
-    `stride` `=` $stride
+    (`stride` `=` $stride^)?
     attr-dict `:` type($result)
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// BufferStoreOp
+//===----------------------------------------------------------------------===//
+
 def BufferStoreOp : TT_AMDGPU_Op<"buffer_store", [
+    AttrSizedOperandSegments,
     SameLoadStoreOperandsEncoding,
     MemoryEffects<[MemWrite<GlobalMemory>]>,
     TypesMatchWith<"value element type matches the pointed type of ptr", "value", "ptr", "getPointerTypeToElement($_self)">,
     TypesMatchWith<"value and offsets have the same shape", "value", "offsets", "getI32SameShape($_self)">,
     TypesMatchWith<"value and mask have the same shape", "value", "mask", "getI1SameShape($_self)",
-                   "($_op.getOperands().size() <= 4) || std::equal_to<>()">,
+                   "(cast<BufferStoreOp>($_op).getMask() == nullptr) || std::equal_to<>()">,
 ]>{
   let summary = "Store into scalar base pointer and a tensor offset";
   let description = [{
@@ -288,22 +306,53 @@ def BufferStoreOp : TT_AMDGPU_Op<"buffer_store", [
     when it converts to the buffer ops because it is important for optimizing
     the cache memory access.
   }];
-  let arguments = (
-    ins
+  let arguments = (ins
     TT_Tensor:$value,
     TT_Ptr:$ptr,
     I32Tensor:$offsets,
-    I32:$stride,
+    Optional<I32>:$stride,
     DefaultValuedAttr<TT_CacheModifierAttr, "mlir::triton::CacheModifier::NONE">:$cache,
     Optional<TT_BoolTensor>:$mask
   );
 
   let assemblyFormat = [{
     $value `,` $ptr `[` $offsets `]` (`,` $mask^)?
     oilist(`cacheModifier` `=` $cache)
-    `stride` `=` $stride
+    (`stride` `=` $stride^)?
     attr-dict `:` type($value)
   }];
 }
 
+//===----------------------------------------------------------------------===//
+// UpcastMXFPOp
+//===----------------------------------------------------------------------===//
+
+def TTG_UpcastMXFPOp : TT_AMDGPU_Op<"upcast_mxfp", [Pure]> {
+  let summary = "Convert an mxfp tensor to bf16/fp16";
+
+  let hasVerifier = 1;
+
+  let description = [{
+    Compute the bf16 encoded in the given mxfp number as per
+    https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+  }];
+  let arguments = (
+    ins
+    TT_Tensor:$src,
+    TT_Tensor:$scale,
+    TT_ScaleDotElemTypeAttr:$fp_type,
+    BoolAttr:$fastMath
+  );
+  let results = (outs TT_Tensor:$result);
+
+  let assemblyFormat = [{
+    $src `,` $scale `fp_type` `=` $fp_type attr-dict `:` type($src) `,` type($scale) `->` type($result)
+  }];
+
+  let extraClassDeclaration = [{
+    static RankedTensorType deduceOutputType(
+        TypedValue<RankedTensorType> inputTensor, ScaleDotElemType inputElemType, Type outputElemType);
+  }];
+}
+
 #endif
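To make the new op's semantics concrete: per the OCP MX spec it references, each block of mxfp elements shares a single E8M0 scale, and the upcast multiplies the decoded elements by that scale. A rough Python sketch, assuming an E8M0 bias of 127 with NaN at the all-ones encoding (this is not the backend's actual decoder):

```python
def upcast_mx_block(elements, scale_e8m0):
    # E8M0 scale: 2**(x - 127) for x in 0..254; 255 encodes NaN.
    scale = float("nan") if scale_e8m0 == 255 else 2.0 ** (scale_e8m0 - 127)
    return [e * scale for e in elements]

# A shared scale byte of 128 means multiply the block by 2**(128 - 127) = 2.
print(upcast_mx_block([0.5, 1.0, 1.5], 128))  # -> [1.0, 2.0, 3.0]
```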
