[Intel] Rework load-store redundant data masking (#3896)

alexbaden · web-flow · commit 9facf00e6736 · 2025-04-11T20:38:10.000-04:00
Upstream updated the masking logic used when generating vectorized loads/stores to rely on linear layout free variable masks (see https://github.com/triton-lang/triton/pull/5432/files#diff-7147172e5de66b21e7447a220f435a703df5302dd91a41a5be9683e5396652c5). The free variable mask essentially finds the inputs to the linear layout that do not change the output; these inputs are duplicates and should be masked when doing loads or stores. This PR applies that work to the Intel backend. Previously, we were ignoring duplicates in the registers, which could produce incorrect results if the tensor size of the load/store operand was smaller than the DPAS tile size in a particular dimension. A nice side effect is that we generate fewer stores: for the B operand in the float16 128x8 test case, the LLVM IR goes from 32 `<1 x i16>` store instructions previously to 16 instructions now. Close #3841
diff --git a/python/test/unit/intel/test_block_load.py b/python/test/unit/intel/test_block_load.py
@@ -15,8 +15,6 @@
 def test_block_load_dpas_layout(M, N, dtype_str, transpose, device, tmp_path: pathlib.Path):
     # modify the layouts to ensure the correct OCL/SPIRV intrinsic is called for each datatype
     if dtype_str == "int8":
-        if M == 128 and N == 16 or N == 8:
-            pytest.skip("TODO: test fails verification")
         A_width = 2
         B_width = 4
         layouts = "#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 4, threadsPerWarp = 16, warpsPerCTA = [1, 4], repCluster = [1, 2], A = [8, 32], B = [32, 32], C = [8, 32]}>"
@@ -25,8 +23,6 @@ def test_block_load_dpas_layout(M, N, dtype_str, transpose, device, tmp_path: pa
         B_width = 1
         layouts = "#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>"
     else:
-        if M == 128 and N == 8:
-            pytest.skip("TODO: test fails verification")
         A_width = 1
         B_width = 2
         layouts = "#mma = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>"
diff --git a/test/Conversion/intel/tritongpu_to_gen.mlir b/test/Conversion/intel/tritongpu_to_gen.mlir
@@ -676,6 +676,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
     // CHECK-NEXT: [[CST_0:%.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK-NEXT: [[IE1:%.*]] = llvm.insertelement [[BCAST0]], [[VEC1]][[[CST_0]] : i32] : vector<1xf32>
     // CHECK-NEXT: [[BCAST1:%.*]] = llvm.bitcast [[IE1]] : vector<1xf32> to i32
+    // CHECK-NEXT: [[TRUE1:%.*]] = llvm.mlir.constant(true) : i1
     // CHECK-NEXT: [[AND1:%.*]] = llvm.and {{.*}}, [[ARG2_0]] : i1
     // CHECK-NEXT: [[VEC2:%.*]] = llvm.mlir.undef : vector<1xi32>
     // CHECK-NEXT: [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -1059,17 +1060,23 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.target" = "xpu", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: atomic_cas_f32_scalar_no_store
   tt.func @atomic_cas_f32_scalar_no_store(%ptr : !tt.ptr<f32>, %cmp : f32, %val : f32) {
-    // CHECK:      [[TRUE:%.*]] = llvm.mlir.constant(true) : i1
-    // CHECK:      [[CMP0:%.*]] = llvm.icmp "eq"
-    // CHECK:      [[MASK0:%.*]] = llvm.and [[TRUE]], [[CMP0]]
-    // CHECK:      [[CMP:%.*]] = llvm.icmp "eq"
-    // CHECK:      [[MASK:%.*]] = llvm.and [[MASK0]], [[CMP]]
-    // CHECK:      [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK:      [[ZERO0:%.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK:      [[TRUE:%.*]] = llvm.mlir.constant(-1 : i32) : i32
+    // CHECK:      [[MASKLANE:%.*]] = llvm.and
+    // CHECK-NEXT: [[CMPLANE:%.*]] = llvm.icmp "eq" [[MASKLANE]], [[ZERO0]]
+    // CHECK:      [[MASKWARP:%.*]] = llvm.and
+    // CHECK-NEXT: [[CMPWARP:%.*]] = llvm.icmp "eq" [[MASKWARP]], [[ZERO0]]
+    // CHECK-NEXT: [[MASKWARPANDLANE:%.*]] = llvm.and [[CMPLANE]], [[CMPWARP]]
+    // CHECK:      llvm.mlir.constant(-1 : i32) : i32
+    // CHECK:      [[MASKBLOCK:%.*]] = llvm.and
+    // CHECK-NEXT: [[CMPBLOCK:%.*]] = llvm.icmp "eq" [[MASKBLOCK]], [[ZERO0]]
+    // CHECK-NEXT: [[MASK:%.*]] = llvm.and [[MASKWARPANDLANE]], [[CMPBLOCK]]
+    // CHECK:      [[ZERO1:%.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK:      [[WGSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32
     // CHECK:      [[WGMEMSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32
     // CHECK:      [[GLOBAL:%.*]] = llvm.mlir.constant(528 : i32) : i32
     // CHECK:      llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[WGSCOPE]], [[WGMEMSCOPE]], [[GLOBAL]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> ()
-    // CHECK-NEXT: llvm.cond_br [[MASK]], ^bb1, ^bb2([[ZERO]] : i32)
+    // CHECK-NEXT: llvm.cond_br [[MASK]], ^bb1, ^bb2([[ZERO1]] : i32)
     // CHECK-NEXT: ^bb1:
     // CHECK-NEXT:   [[BCAST1:%.*]] = llvm.bitcast %arg1 : f32 to i32
     // CHECK-NEXT:   [[BCAST2:%.*]] = llvm.bitcast %arg2 : f32 to i32
@@ -1089,13 +1096,19 @@ module attributes {"ttg.target" = "xpu", "ttg.num-ctas" = 1 : i32, "ttg.num-warp
   // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return}
   // CHECK-LABEL: atomic_cas_f32_scalar
   tt.func @atomic_cas_f32_scalar(%ptr : !tt.ptr<f32>, %cmp : f32, %val : f32) {
-    // CHECK:      [[TRUE:%.*]] = llvm.mlir.constant(true) : i1
-    // CHECK:      [[CMP0:%.*]] = llvm.icmp "eq"
-    // CHECK:      [[MASK0:%.*]] = llvm.and [[TRUE]], [[CMP0]]
-    // CHECK:      [[CMP:%.*]] = llvm.icmp "eq"
-    // CHECK:      [[MASK:%.*]] = llvm.and [[MASK0]], [[CMP]]
-    // CHECK:      [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK-NEXT: llvm.cond_br [[MASK]], ^bb1, ^bb2([[ZERO]] : i32)
+    // CHECK:      [[ZERO0:%.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK:      [[TRUE:%.*]] = llvm.mlir.constant(-1 : i32) : i32
+    // CHECK:      [[MASKLANE:%.*]] = llvm.and
+    // CHECK-NEXT: [[CMPLANE:%.*]] = llvm.icmp "eq" [[MASKLANE]], [[ZERO0]]
+    // CHECK:      [[MASKWARP:%.*]] = llvm.and
+    // CHECK-NEXT: [[CMPWARP:%.*]] = llvm.icmp "eq" [[MASKWARP]], [[ZERO0]]
+    // CHECK-NEXT: [[MASKWARPANDLANE:%.*]] = llvm.and [[CMPLANE]], [[CMPWARP]]
+    // CHECK:      llvm.mlir.constant(-1 : i32) : i32
+    // CHECK:      [[MASKBLOCK:%.*]] = llvm.and
+    // CHECK-NEXT: [[CMPBLOCK:%.*]] = llvm.icmp "eq" [[MASKBLOCK]], [[ZERO0]]
+    // CHECK-NEXT: [[MASK:%.*]] = llvm.and [[MASKWARPANDLANE]], [[CMPBLOCK]]
+    // CHECK:      [[ZERO1:%.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK-NEXT: llvm.cond_br [[MASK]], ^bb1, ^bb2([[ZERO1]] : i32)
     // CHECK-NEXT: ^bb1:
     // CHECK-NEXT:   [[BCAST1:%.*]] = llvm.bitcast %arg1 : f32 to i32
     // CHECK-NEXT:   [[BCAST2:%.*]] = llvm.bitcast %arg2 : f32 to i32
@@ -1131,14 +1144,12 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
     // CHECK-NEXT: [[EV1_ARG2:%.*]] = llvm.extractvalue %arg2[1] : !llvm.struct<(f32, f32)>
     // CHECK:      [[EV0_ARG0:%.*]] = llvm.extractvalue %arg0[0] : !llvm.struct<(ptr<1>, ptr<1>)>
     // CHECK-NEXT: [[EV1_ARG0:%.*]] = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr<1>, ptr<1>)>
-    // CHECK:      llvm.mlir.constant(true) : i1
-    // CHECK:      [[CST_TRUE:%.*]] = llvm.mlir.constant(true) : i1
-    // CHECK:      [[PRED0:%.*]] = llvm.and [[CST_TRUE]], {{.*}} : i1
-    // CHECK-NEXT: [[UNDEF1:%.*]] = llvm.mlir.undef : vector<1xf32>
+    // CHECK:      [[EV0_ARG1:%.*]] = llvm.extractvalue %arg1[0] : !llvm.struct<(i1, i1)>
+    // CHECK-NEXT: [[EV1_ARG1:%.*]] = llvm.extractvalue %arg1[1] : !llvm.struct<(i1, i1)>
+    // CHECK:      [[UNDEF1:%.*]] = llvm.mlir.undef : vector<1xf32>
     // CHECK:      [[IE1:%.*]] = llvm.insertelement [[EV0_ARG2]], [[UNDEF1]][{{.*}} : i64] : vector<1xf32>
-    // CHECK-NEXT: [[PRED1:%.*]] = llvm.and [[PRED0]], {{.*}} : i1
     // CHECK-NEXT: [[ZERO1:%.*]] = llvm.mlir.constant(0.000000e+00 : f32) : f32
-    // CHECK:      llvm.cond_br [[PRED1]], ^bb1, ^bb2([[ZERO1]] : f32)
+    // CHECK:      llvm.cond_br [[EV0_ARG1]], ^bb1, ^bb2([[ZERO1]] : f32)
     // CHECK-NEXT: ^bb1:
     // CHECK-NEXT:   [[BCAST2:%.*]] = llvm.bitcast [[IE1]] : vector<1xf32> to f32
     // CHECK-NEXT:   [[RMW_RES1:%.*]] = llvm.atomicrmw fadd [[EV0_ARG0]], [[BCAST2]] monotonic : !llvm.ptr<1>, f32
@@ -1147,13 +1158,12 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
     // CHECK-NEXT:   [[RMW_CAST:%.*]] = llvm.bitcast [[RMW_PHI1]] : f32 to f32
     // CHECK-NEXT:   [[UNDEF2:%.*]] = llvm.mlir.undef : vector<1xf32>
     // CHECK:        [[IE2:%.*]] = llvm.insertelement [[EV1_ARG2]], [[UNDEF2]][{{.*}} : i64] : vector<1xf32>
-    // CHECK-NEXT:   [[PRED2:%.*]] = llvm.and [[PRED0]], {{.*}} : i1
     // CHECK-NEXT:   [[ZERO2:%.*]] = llvm.mlir.constant(0.000000e+00 : f32) : f32
     // CHECK:        [[WGSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32
     // CHECK:        [[WGMEMSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32
     // CHECK:        [[GLOBAL:%.*]] = llvm.mlir.constant(528 : i32) : i32
     // CHECK:        llvm.call spir_funccc @_Z22__spirv_ControlBarrieriii([[WGSCOPE]], [[WGMEMSCOPE]], [[GLOBAL]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> ()
-    // CHECK-NEXT:   llvm.cond_br [[PRED2]], ^bb3, ^bb4([[ZERO2]] : f32)
+    // CHECK-NEXT:   llvm.cond_br [[EV1_ARG1]], ^bb3, ^bb4([[ZERO2]] : f32)
     // CHECK-NEXT: ^bb3:
     // CHECK-NEXT:   [[BCAST2:%.*]] = llvm.bitcast [[IE2]] : vector<1xf32> to f32
     // CHECK-NEXT:   [[RMW_RES2:%.*]] = llvm.atomicrmw fadd [[EV1_ARG0]], [[BCAST2]] monotonic : !llvm.ptr<1>, f32
@@ -1169,14 +1179,19 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
 module attributes {"ttg.target" = "xpu", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: atomic_add_f32_scalar_no_store
   tt.func @atomic_add_f32_scalar_no_store(%arg0 : !tt.ptr<f32>, %arg1 : i1, %arg2 : f32) {
-    // CHECK:      [[CST_TRUE:%.*]] = llvm.mlir.constant(true) : i1
-    // CHECK:      [[CMP:%.*]] = llvm.icmp "eq"
-    // CHECK-NEXT: [[AND:%.*]] = llvm.and [[CST_TRUE]], [[CMP]]  : i1
-    // CHECK:      [[CMP1:%.*]] = llvm.icmp "eq"
-    // CHECK-NEXT: [[AND1:%.*]] = llvm.and [[AND]], [[CMP1]]  : i1
-    // CHECK:      [[UNDEF1:%.*]] = llvm.mlir.undef : vector<1xf32>
+    // CHECK:      [[ZERO0:%.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK:      [[MASKLANE:%.*]] = llvm.and
+    // CHECK-NEXT: [[CMPLANE:%.*]] = llvm.icmp "eq" [[MASKLANE]], [[ZERO0]]
+    // CHECK:      [[MASKWARP:%.*]] = llvm.and
+    // CHECK-NEXT: [[CMPWARP:%.*]] = llvm.icmp "eq" [[MASKWARP]], [[ZERO0]]
+    // CHECK-NEXT: [[MASKWARPANDLANE:%.*]] = llvm.and [[CMPLANE]], [[CMPWARP]]
+    // CHECK:      llvm.mlir.constant(-1 : i32) : i32
+    // CHECK:      [[MASKBLOCK:%.*]] = llvm.and
+    // CHECK-NEXT: [[CMPBLOCK:%.*]] = llvm.icmp "eq" [[MASKBLOCK]], [[ZERO0]]
+    // CHECK-NEXT: [[MASK:%.*]] = llvm.and [[MASKWARPANDLANE]], [[CMPBLOCK]]
+    // CHECK-NEXT: [[UNDEF1:%.*]] = llvm.mlir.undef : vector<1xf32>
     // CHECK:      [[IE1:%.*]] = llvm.insertelement %arg2, [[UNDEF1]][{{.*}} : i64] : vector<1xf32>
-    // CHECK:      [[PRED:%.*]] = llvm.and [[AND1]], %arg1  : i1
+    // CHECK:      [[PRED:%.*]] = llvm.and %arg1, [[MASK]]  : i1
     // CHECK-NEXT: [[ZERO:%.*]] = llvm.mlir.constant(0.000000e+00 : f32) : f32
     // CHECK:      [[WGSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32
     // CHECK:      [[WGMEMSCOPE:%.*]] = llvm.mlir.constant(2 : i32) : i32
@@ -1200,14 +1215,19 @@ module attributes {"ttg.target" = "xpu", "ttg.num-ctas" = 1 : i32, "ttg.num-warp
   // CHECK: llvm.func spir_funccc @_Z7barrierj(i32) attributes {convergent, no_unwind, will_return}
   // CHECK-LABEL: atomic_add_f32_scalar
   tt.func @atomic_add_f32_scalar(%arg0 : !tt.ptr<f32>, %arg1 : i1, %arg2 : f32) {
-    // CHECK:      [[CST_TRUE:%.*]] = llvm.mlir.constant(true) : i1
-    // CHECK:      [[CMP:%.*]] = llvm.icmp "eq"
-    // CHECK-NEXT: [[AND:%.*]] = llvm.and [[CST_TRUE]], [[CMP]]  : i1
-    // CHECK:      [[CMP1:%.*]] = llvm.icmp "eq"
-    // CHECK-NEXT: [[AND1:%.*]] = llvm.and [[AND]], [[CMP1]]  : i1
-    // CHECK:      [[UNDEF1:%.*]] = llvm.mlir.undef : vector<1xf32>
+    // CHECK:      [[ZERO0:%.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK:      [[MASKLANE:%.*]] = llvm.and
+    // CHECK-NEXT: [[CMPLANE:%.*]] = llvm.icmp "eq" [[MASKLANE]], [[ZERO0]]
+    // CHECK:      [[MASKWARP:%.*]] = llvm.and
+    // CHECK-NEXT: [[CMPWARP:%.*]] = llvm.icmp "eq" [[MASKWARP]], [[ZERO0]]
+    // CHECK-NEXT: [[MASKWARPANDLANE:%.*]] = llvm.and [[CMPLANE]], [[CMPWARP]]
+    // CHECK:      llvm.mlir.constant(-1 : i32) : i32
+    // CHECK:      [[MASKBLOCK:%.*]] = llvm.and
+    // CHECK-NEXT: [[CMPBLOCK:%.*]] = llvm.icmp "eq" [[MASKBLOCK]], [[ZERO0]]
+    // CHECK-NEXT: [[MASK:%.*]] = llvm.and [[MASKWARPANDLANE]], [[CMPBLOCK]]
+    // CHECK-NEXT: [[UNDEF1:%.*]] = llvm.mlir.undef : vector<1xf32>
     // CHECK:      [[IE1:%.*]] = llvm.insertelement %arg2, [[UNDEF1]][{{.*}} : i64] : vector<1xf32>
-    // CHECK:      [[PRED:%.*]] = llvm.and [[AND1]], %arg1  : i1
+    // CHECK:      [[PRED:%.*]] = llvm.and %arg1, [[MASK]]  : i1
     // CHECK-NEXT: [[ZERO:%.*]] = llvm.mlir.constant(0.000000e+00 : f32) : f32
     // CHECK-NEXT: llvm.cond_br [[PRED]], ^bb1, ^bb2([[ZERO]] : f32)
     // CHECK-NEXT: ^bb1:
@@ -1295,22 +1315,22 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
     // CHECK-NEXT: [[ARG0_1:%.*]] = llvm.extractvalue %arg0[1] : !llvm.struct<(ptr<1>, ptr<1>)>
     // CHECK-NEXT: [[ARG1_0:%.*]] = llvm.extractvalue %arg1[0] : !llvm.struct<(f32, f32)>
     // CHECK-NEXT: [[ARG1_1:%.*]] = llvm.extractvalue %arg1[1] : !llvm.struct<(f32, f32)>
-    // CHECK:      llvm.mlir.constant(true) : i1
     // CHECK:      [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32
-    // CHECK-NEXT: llvm.call spir_funccc @_Z12get_local_idj([[ZERO]]) {{.*}} : (i32) -> i64
-    // CHECK:      [[TRUE1:%.*]] = llvm.mlir.constant(true) : i1
-    // CHECK:      [[TRUE2:%.*]] = llvm.mlir.constant(true) : i1
-    // CHECK:      [[PRED:%.*]] = llvm.and [[TRUE1]], [[TRUE2]] : i1
+    // CHECK:      [[ZERO1:%.*]] = llvm.mlir.constant(0 : i32) : i32
+    // CHECK-NEXT: llvm.call spir_funccc @_Z12get_local_idj([[ZERO1]]) {{.*}} : (i32) -> i64
+    // CHECK:      [[PRED:%.*]]  = llvm.mlir.constant(true) : i1
     // CHECK:      llvm.cond_br [[PRED]], ^bb1, ^bb2
     // CHECK-NEXT: ^bb1:
     // CHECK-NEXT:   [[BCAST:%.*]] = llvm.bitcast [[ARG0_0]] : !llvm.ptr<1> to !llvm.ptr<1>
     // CHECK-NEXT:   llvm.store {{.*}}, [[BCAST]] {alignment = 4 : i64} : vector<1xi32>, !llvm.ptr<1>
     // CHECK-NEXT:   llvm.br ^bb2
     // CHECK-NEXT: ^bb2:
+    // CHECK:        llvm.mlir.undef : vector<1xf32>
+    // CHECK:        [[PRED2:%.*]] = llvm.mlir.constant(true) : i1
     // CHECK:        [[VEC:%.*]] = llvm.mlir.undef : vector<1xi32>
     // CHECK-NEXT:   [[ZERO:%.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK-NEXT:   [[IE1:%.*]] = llvm.insertelement {{.*}}, [[VEC]][[[ZERO]] : i32] : vector<1xi32>
-    // CHECK:        llvm.cond_br [[PRED]], ^bb3, ^bb4
+    // CHECK:        llvm.cond_br [[PRED2]], ^bb3, ^bb4
     // CHECK-NEXT: ^bb3:
     // CHECK-NEXT:   [[BCAST1:%.*]] = llvm.bitcast [[ARG0_1]] : !llvm.ptr<1> to !llvm.ptr<1>
     // CHECK-NEXT:   llvm.store [[IE1]], [[BCAST1]] {alignment = 4 : i64} : vector<1xi32>, !llvm.ptr<1>
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp