@@ -162,3 +162,123 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
     tt.return
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [8, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 4, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: buffer_load_swizzled_simple
+  tt.func public @buffer_load_swizzled_simple(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
+                                              %arg1: !tt.ptr<f16>,
+                                              %arg2: tensor<16x64xi32, #blocked>,
+                                              %arg3: !ttg.memdesc<16x64xf16, #shared, #smem, mutable>) {
+    // Each thread needs to load 2 elements and we load 1 element (sizePerThread) per buffer load instruction
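+    // (16x64 elements over a [1, 1] x [1, 64] x [8, 1] = 8x64 lane tile -> 2 elements per thread, hence the two loads checked below)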
+    // COMMON: rocdl.make.buffer.rsrc
+    // COMMON-NOT: rocdl.make.buffer.rsrc
+    // COMMON: rocdl.ds_bpermute
+    // COMMON: rocdl.raw.ptr.buffer.load.lds
+    // COMMON: rocdl.ds_bpermute
+    // COMMON: rocdl.raw.ptr.buffer.load.lds
+    // COMMON-NOT: rocdl.raw.ptr.buffer.load.lds
+    %65 = amdgpu.buffer_load_to_local %arg1[%arg2] into %arg3 {OpIdx = #amdgpu.OpIdx<1>} : <f16>[tensor<16x64xi32, #blocked>] -> <16x64xf16, #shared, #smem, mutable>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 2, maxPhase = 8, order = [1, 0]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: buffer_load_to_local_swizzled_mask_other
+  tt.func public @buffer_load_to_local_swizzled_mask_other(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
+                                                           %arg1: !tt.ptr<f16>,
+                                                           %arg2: tensor<32x32xi32, #blocked>,
+                                                           %arg3: !ttg.memdesc<32x32xf16, #shared, #smem, mutable>,
+                                                           %arg4: i32) {
+    // We need the splat to allow the AxisAnalysis to work during lowering
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<32x32xf16, #blocked>
+    %c0_i32 = arith.constant 0 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %c31_i32 = arith.constant 31 : i32
+    %1 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<32x32x!tt.ptr<f16>, #blocked>
+    %29 = arith.addi %arg4, %c31_i32 : i32
+    %30 = arith.divsi %29, %c32_i32 : i32
+    %31 = arith.cmpi sgt, %30, %c0_i32 : i32
+
+    %51 = tt.make_range {end = 32 : i32, start = 0 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %52 = tt.expand_dims %51 {axis = 1 : i32} : tensor<32xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xi32, #blocked>
+    %65 = tt.splat %arg4 : i32 -> tensor<32x1xi32, #blocked>
+    %66 = arith.cmpi slt, %52, %65 : tensor<32x1xi32, #blocked>
+    %67 = tt.broadcast %66 : tensor<32x1xi1, #blocked> -> tensor<32x32xi1, #blocked>
+
+    %70 = tt.splat %31 : i1 -> tensor<32x32xi1, #blocked>
+    %71 = arith.andi %70, %67 : tensor<32x32xi1, #blocked>
+
+    // Each thread needs to load 4 elements and we load 1 element (sizePerThread) per buffer load instruction
+    // Note that the mask/other alignment is 1, so we need 4 conditionals
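+    // (32x32 elements over a [1, 1] x [2, 32] x [4, 1] = 8x32 lane tile -> 4 elements per thread, each with its own ballot and predicated store)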
+
+    // COMMON: rocdl.ds_bpermute
+    // COMMON: rocdl.ballot
+    // COMMON: rocdl.raw.ptr.buffer.load.lds
+    // COMMON: _predicated_store
+
+    // COMMON: rocdl.ds_bpermute
+    // COMMON: rocdl.ballot
+    // COMMON: rocdl.raw.ptr.buffer.load.lds
+    // COMMON: _predicated_store
+
+    // COMMON: rocdl.ds_bpermute
+    // COMMON: rocdl.ballot
+    // COMMON: rocdl.raw.ptr.buffer.load.lds
+    // COMMON: _predicated_store
+
+    // COMMON: rocdl.ds_bpermute
+    // COMMON: rocdl.ballot
+    // COMMON: rocdl.raw.ptr.buffer.load.lds
+    // COMMON: _predicated_store
+
+    // COMMON-NOT: rocdl.ds_bpermute
+    // COMMON-NOT: rocdl.ballot
+    // COMMON-NOT: rocdl.raw.ptr.buffer.load.lds
+    // COMMON-NOT: _predicated_store
+
+    amdgpu.buffer_load_to_local %arg1[%arg2] mask = %67 other = %cst_0 into %arg3 {OpIdx = #amdgpu.OpIdx<1>} : <f16>[tensor<32x32xi32, #blocked>] tensor<32x32xf16, #blocked> -> <32x32xf16, #shared, #smem, mutable>
+    tt.return
+  }
+}
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 32], order = [0, 1]}>
+#shared = #ttg.swizzled_shared<{vec = 8, perPhase = 4, maxPhase = 16, order = [0, 1]}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, ttg.shared = 0 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  // COMMON-LABEL: buffer_load_to_local_swizzled_vectorized_8xf16
+  tt.func public @buffer_load_to_local_swizzled_vectorized_8xf16(%arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32}, %arg2: !ttg.memdesc<64x64xf16, #shared, #smem, mutable>) {
+    %cst = arith.constant dense<64> : tensor<1x64xi32, #blocked>
+    %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    %2 = tt.expand_dims %0 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked>
+    %3 = tt.broadcast %2 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked>
+    %4 = tt.expand_dims %1 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
+    %5 = arith.muli %4, %cst : tensor<1x64xi32, #blocked>
+    %6 = tt.broadcast %5 : tensor<1x64xi32, #blocked> -> tensor<64x64xi32, #blocked>
+    %7 = arith.addi %3, %6 : tensor<64x64xi32, #blocked>
+
+    // Each thread needs to load 8 elements and we load 8 elements (sizePerThread) per buffer load instruction
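+    // (sizePerThread = [8, 1] with order = [0, 1] gives 8 contiguous f16 = 16 bytes per lane, so a single vectorized load per thread is expected on gfx950)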
+    // GFX950: rocdl.make.buffer.rsrc
+    // GFX950: rocdl.ds_bpermute
+    // GFX950: rocdl.raw.ptr.buffer.load.lds
+    // GFX950-NOT: rocdl.raw.ptr.buffer.load.lds
+
+    // GFX942 does not support vectorization > 4 bytes, so we cannot lower it
+    // GFX942-NOT: rocdl.raw.ptr.buffer.load.lds
+    // GFX942: amdgpu.buffer_load_to_local
+    %8 = amdgpu.buffer_load_to_local %arg1[%7] into %arg2 : <f16>[tensor<64x64xi32, #blocked>] -> <64x64xf16, #shared, #smem, mutable>
+    tt.return
+  }
+}