[LoadOpToBlockIOConversion] Improve codegen for other (#5141)

whitneywhtsang · web-flow · commit 8e26be5595b5 · 2025-09-19T00:01:31.000-04:00
Optimizes block load lowering when the optional "other" value is a
constant splat. A non-zero splat is now materialized as a single
constant that is reused for every element, and a zero splat produces no
"other" elements at all; only values that are not constant splats are
still unpacked into individual LLVM elements. Both integer and
floating-point splat constants are handled. The tests are updated
accordingly by removing the llvm.select expectations tied to the old
expansion path.

---------

Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/test/TritonIntelGPU/tensor-pointer-load-block-2d.mlir b/test/TritonIntelGPU/tensor-pointer-load-block-2d.mlir
@@ -266,7 +266,6 @@ module attributes {ttig.support_sg_2d_block, "ttg.num-warps" = 8 : i32, "ttg.thr
     // CHECK: %[[PRED_BOOL:.*]] =  llvm.trunc %[[PRED]] : i8 to i1
     // CHECK: %[[BASE_Y_0:.*]] = llvm.select %[[PRED_BOOL]], %[[CST0_0]], %[[BLOCK_SHAPE_Y]] : i1, i32
     // CHECK: %[[LOAD_0:.*]] = triton_gen.2Dblockload {{.*}}, %[[BASE_Y_0]] {elem_size_in_bits = 16, tile_width = 16, tile_height = 16, v_blocks = 2
-    // CHECK: llvm.select {{.*}}, %[[LOAD_0]], {{.*}} : i1, vector<32xf16>
 
     // CHECK: %[[CST0_0:.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK: %[[CST0_1:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -275,7 +274,6 @@ module attributes {ttig.support_sg_2d_block, "ttg.num-warps" = 8 : i32, "ttg.thr
     // CHECK: %[[PRED_BOOL:.*]] =  llvm.trunc %[[PRED]] : i8 to i1
     // CHECK: %[[BASE_Y_1:.*]] = llvm.select %[[PRED_BOOL]], %[[CST0_0]], %[[BLOCK_SHAPE_Y]] : i1, i32
     // CHECK: %[[LOAD_1:.*]] = triton_gen.2Dblockload {{.*}}, %[[BASE_Y_1]]  {elem_size_in_bits = 16, tile_width = 16, tile_height = 16, v_blocks = 2
-    // CHECK: llvm.select {{.*}}, %[[LOAD_1]], {{.*}} : i1, vector<32xf16>
 
     // CHECK: %[[CST0_0:.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK: %[[CST0_1:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -284,7 +282,6 @@ module attributes {ttig.support_sg_2d_block, "ttg.num-warps" = 8 : i32, "ttg.thr
     // CHECK: %[[PRED_BOOL:.*]] =  llvm.trunc %[[PRED]] : i8 to i1
     // CHECK: %[[BASE_Y_2:.*]] = llvm.select %[[PRED_BOOL]], %[[CST0_0]], %[[BLOCK_SHAPE_Y]] : i1, i32
     // CHECK: %[[LOAD_2:.*]] = triton_gen.2Dblockload {{.*}}, %[[BASE_Y_2]]  {elem_size_in_bits = 16, tile_width = 16, tile_height = 16, v_blocks = 2
-    // CHECK: llvm.select {{.*}}, %[[LOAD_2]], {{.*}} : i1, vector<32xf16>
 
     // CHECK: %[[CST0_0:.*]] = llvm.mlir.constant(0 : i32) : i32
     // CHECK: %[[CST0_1:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -293,7 +290,6 @@ module attributes {ttig.support_sg_2d_block, "ttg.num-warps" = 8 : i32, "ttg.thr
     // CHECK: %[[PRED_BOOL:.*]] =  llvm.trunc %[[PRED]] : i8 to i1
     // CHECK: %[[BASE_Y_3:.*]] = llvm.select %[[PRED_BOOL]], %[[CST0_0]], %[[BLOCK_SHAPE_Y]] : i1, i32
     // CHECK: %[[LOAD_3:.*]] = triton_gen.2Dblockload {{.*}}, %[[BASE_Y_3]]  {elem_size_in_bits = 16, tile_width = 16, tile_height = 16, v_blocks = 2
-    // CHECK: llvm.select {{.*}}, %[[LOAD_3]], {{.*}} : i1, vector<32xf16>
     %11 = tt.load %10, %a_mask, %a_other {ttig.block_io = "row_major"} : tensor<256x64x!tt.ptr<f16>, #mma>
 
     tt.return
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -1863,7 +1863,6 @@ struct LoadOpToBlockIOConversion
       return rewriteTensorPointerLoad(op, adaptor, rewriter);
 
     Value mask = op.getMask();
-    Value other = op.getOther();
     Type resultType = op.getType();
     auto tensorType = cast<RankedTensorType>(resultType);
 
@@ -2056,16 +2055,12 @@ struct LoadOpToBlockIOConversion
     unsigned instWidth = dpasInstShape[threadOrder[rank - 2]];
     unsigned instHeight = dpasInstShape[threadOrder[rank - 1]];
 
-    bool otherIsSplatConstInt = false;
-    int64_t splatVal = 0;
-
     std::map<SmallVector<unsigned>, Value> ptrs;
     std::map<SmallVector<unsigned>, Value> masks;
     std::map<SmallVector<unsigned>, Value> others;
 
     Value llPtr = adaptor.getPtr();
     Value llMask = adaptor.getMask();
-    Value llOther = adaptor.getOther();
 
     SmallVector<Value> ptrElems, maskElems, otherElems;
     // Get the LLVM values for pointers
@@ -2101,16 +2096,30 @@ struct LoadOpToBlockIOConversion
       return failure();
 
     // Get the LLVM values for `other`
+    Value other = op.getOther();
+    Value llOther = adaptor.getOther();
     DenseElementsAttr constAttr;
-    if (other && isa<IntegerType>(eltTy) &&
-        matchPattern(other, m_Constant(&constAttr)) && constAttr.isSplat() &&
-        isa<IntegerType>(constAttr.getElementType())) {
-      otherIsSplatConstInt = true;
-      splatVal = constAttr.getSplatValue<APInt>().getSExtValue();
-    }
-    if (other) {
-      otherElems = unpackLLElements(loc, llOther, rewriter);
-    }
+    if (other)
+      if (matchPattern(other, m_Constant(&constAttr)) && constAttr.isSplat()) {
+        Type elemTy = constAttr.getElementType();
+        auto handleSplatValue = [&](auto splatVal) {
+          if (!splatVal.isZero()) {
+            otherElems = SmallVector<Value>(
+                numElems,
+                rewriter.create<LLVM::ConstantOp>(loc, elemTy, splatVal));
+          }
+        };
+
+        TypeSwitch<mlir::Type>(elemTy)
+            .Case<FloatType>([&](FloatType) {
+              handleSplatValue(constAttr.getSplatValue<APFloat>());
+            })
+            .Case<IntegerType>([&](IntegerType) {
+              handleSplatValue(constAttr.getSplatValue<APInt>());
+            });
+      } else {
+        otherElems = unpackLLElements(loc, llOther, rewriter);
+      }
 
     // re-arrange the ptrs and masks to for large 2D block IO.
     // Layout is unrelated to the scalar type.