intel
diff --git a/test/Conversion/tritongpu_to_llvm_blackwell.mlir b/test/Conversion/tritongpu_to_llvm_blackwell.mlir
Lines changed: 27 additions & 0 deletions
@@ -341,3 +341,30 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
     tt.return
   }
 }
+
+// -----
+
+// Test case: a zero-filled 128x256xf32 tensor is placed in tensor memory via ttng.tmem_alloc and read back via ttng.tmem_load; the expected output is four 32x32b.x64 tcgen05 stores, a store wait, four 32x32b.x64 tcgen05 loads, then a load wait, in that order.
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}> // NOTE(review): #blocked is not referenced by the IR in this test case; only #blocked1 is.
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 256], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
+#tmem = #ttng.tensor_memory_encoding<blockM = 128, blockN = 256, unpacked = true>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 65544 : i32, ttg.target = "cuda:100", ttg.tensor_memory_size = 128 : i32, "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: @tensor_memory_ld_128x256
+  // CHECK: tcgen05.st.sync.aligned.32x32b.x64.b32
+  // CHECK: tcgen05.st.sync.aligned.32x32b.x64.b32
+  // CHECK: tcgen05.st.sync.aligned.32x32b.x64.b32
+  // CHECK: tcgen05.st.sync.aligned.32x32b.x64.b32
+  // CHECK: tcgen05.wait::st.sync.aligned
+  // CHECK: tcgen05.ld.sync.aligned.32x32b.x64.b32
+  // CHECK: tcgen05.ld.sync.aligned.32x32b.x64.b32
+  // CHECK: tcgen05.ld.sync.aligned.32x32b.x64.b32
+  // CHECK: tcgen05.ld.sync.aligned.32x32b.x64.b32
+  // CHECK: tcgen05.wait::ld.sync.aligned
+  tt.func public @tensor_memory_ld_128x256(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f16>) { // %arg0..%arg2 are not used in the body
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #blocked1>
+    %0 = ttng.tmem_alloc %cst_0 {tensor_memory_col_offset = 0 : i32, tensor_memory_row_offset = 0 : i32} : (tensor<128x256xf32, #blocked1>) -> !ttg.memdesc<128x256xf32, #tmem, #ttng.tensor_memory, mutable>
+    %20 = ttng.tmem_load %0 : !ttg.memdesc<128x256xf32, #tmem, #ttng.tensor_memory, mutable> -> tensor<128x256xf32, #blocked1> // %20 is not used after this point
+    tt.return
+  }
+}