
Commit 20be069

[intel] improve pitch and width constexpr folding (#5489)
This PR improves constant expression folding for pitch and width parameters in Intel GPU block I/O operations. The changes introduce a more robust constant evaluation mechanism that handles multiple levels of type casts and operation folding, addressing issue #5338.
1 parent c860a38 commit 20be069
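The core of the change is easiest to see in isolation. Below is a hedged, dependency-free C++ model of the new helper; Expr, Kind, skipCast, foldOnce, and getFoldedConstant are illustrative stand-ins rather than MLIR or Triton APIs. It shows the depth-limited loop that alternately peels a cast and folds the defining operation until a constant emerges; the real implementation lands in third_party/intel/lib/Utils/Utility.cpp below.

#include <cstdint>
#include <memory>
#include <optional>

// Stand-in "IR": a value is either a constant, a cast of another value, or a
// multiply of two values. This models only enough structure for the sketch.
enum class Kind { Constant, Cast, Mul };

struct Expr {
  Kind kind;
  int64_t value = 0;              // meaningful only when kind == Constant
  std::shared_ptr<Expr> lhs, rhs; // operand(s) for Cast and Mul
};
using ExprPtr = std::shared_ptr<Expr>;

// Mirrors skipCasts() from the patch: peel one cast-like wrapper, if any.
ExprPtr skipCast(const ExprPtr &e) {
  return e->kind == Kind::Cast ? e->lhs : e;
}

// Mirrors foldValue(): fold one operation level when its operands are constant.
ExprPtr foldOnce(const ExprPtr &e) {
  if (e->kind == Kind::Mul && e->lhs->kind == Kind::Constant &&
      e->rhs->kind == Kind::Constant) {
    auto c = std::make_shared<Expr>();
    c->kind = Kind::Constant;
    c->value = e->lhs->value * e->rhs->value;
    return c;
  }
  return e;
}

// Mirrors getFoldedConstantValue(Value v, int depth = 8): iterate until a
// constant is reached, no progress is made, or the depth budget runs out.
std::optional<int64_t> getFoldedConstant(ExprPtr e, int depth = 8) {
  for (int i = 0; i < depth; ++i) {
    if (e->kind == Kind::Constant)
      return e->value;
    ExprPtr next = foldOnce(skipCast(e));
    if (next == e)
      break; // no progress; give up, as the real helper does
    e = next;
  }
  return std::nullopt;
}

With this model, a value shaped like cast(cast(mul(const, const))) resolves after a few iterations, whereas the previous helper folded only a single operation and relied on the call site to skip truncations by hand.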

File tree

- test/TritonIntelGPU/blockptr_load.mlir
- third_party/intel/include/Utils/Utility.h
- third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
- third_party/intel/lib/Utils/Utility.cpp

4 files changed: +56 -31 lines

test/TritonIntelGPU/blockptr_load.mlir

Lines changed: 2 additions & 2 deletions

@@ -136,7 +136,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32,
 // CHECK: %[[OFFSET_0:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
 // CHECK: %[[OFFSET_1:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
 // CHECK: %[[WIDTH_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
-// CHECK: %[[HEIGHT_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
+// CHECK: %[[HEIGHT_i64:.*]] = llvm.extractvalue %[[VAL_11]][3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
 // CHECK: %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue %[[VAL_12]][4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
 // CHECK: %[[COL_STRIDE_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
 // CHECK: %[[BASE:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
@@ -199,7 +199,7 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 16 : i32,
 // CHECK: %[[OFFSET_0:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][0] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
 // CHECK: %[[OFFSET_1:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][1] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
 // CHECK: %[[WIDTH_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][2] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
-// CHECK: %[[HEIGHT_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
+// CHECK: %[[HEIGHT_i64:.*]] = llvm.extractvalue %[[VAL_10]][3] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
 // CHECK: %[[ROW_STRIDE_i64:.*]] = llvm.extractvalue %[[VAL_11]][4] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
 // CHECK: %[[COL_STRIDE_i64:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][5] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>
 // CHECK: %[[BASE:.*]] = llvm.extractvalue %[[BLOCK_POINTER]][6] : !llvm.struct<(i32, i32, i64, i64, i64, i64, ptr<1>)>

third_party/intel/include/Utils/Utility.h

Lines changed: 2 additions & 2 deletions

@@ -19,9 +19,9 @@ Value findOrCreateIntConstant(Location loc, int val, unsigned bitWidth,
 std::optional<mlir::triton::MakeTensorPtrOp>
 findDefiningMakeTensorPtrOp(Value val);

-// This function folds the `op` operation and returns the constant value if it
+// This function folds the `v` value and returns the constant value if it
 // has successfully folded to a constant. Otherwise, it returns `std::nullopt`.
-std::optional<int64_t> getFoldedConstantValue(Operation *op);
+std::optional<int64_t> getFoldedConstantValue(Value v, int depth = 8);

 // Return true if the `val` value is a constant containing a value equal to
 // expected.
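For context, here is a minimal caller-side sketch of the revised interface. The wrapper checkFoldsToAtLeast and its arguments are hypothetical; only getFoldedConstantValue, its Value parameter, and the default depth of 8 come from the header above.

#include "intel/include/Utils/Utility.h"

#include <cstdint>
#include <optional>

// Hypothetical helper: true when `v` folds to a constant >= `minVal`;
// a value that does not fold to a constant yields false.
static bool checkFoldsToAtLeast(mlir::Value v, int64_t minVal) {
  std::optional<int64_t> c =
      mlir::triton::intel::getFoldedConstantValue(v); // default depth = 8
  return c && *c >= minVal;
}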

third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 9 additions & 12 deletions

@@ -1629,23 +1629,20 @@ struct LoadOpToBlockIOConversion
       std::swap(baseWidth, baseHeight);
     }
     // HW requires the pitch to be at least 64 bytes.
-    std::function<Value(Value)> skipTrunc = [&](Value v) {
-      if (dyn_cast_or_null<LLVM::TruncOp>(v.getDefiningOp()))
-        return skipTrunc(v.getDefiningOp()->getOperand(0));
-      return v;
-    };
-    if (Operation *op = skipTrunc(pitch).getDefiningOp()) {
-      std::optional<int64_t> pitchConst =
-          mlir::triton::intel::getFoldedConstantValue(op);
-      if (pitchConst.has_value()) {
-        if ((*pitchConst * elemSizeInBits / 8) < 64)
-          return failure();
-      }
+    if (auto pitchConst = mlir::triton::intel::getFoldedConstantValue(pitch)) {
+      if ((*pitchConst * elemSizeInBits / 8) < 64)
+        return failure();
     }

     baseWidth = b.trunc(i32_ty, baseWidth);
     baseHeight = b.trunc(i32_ty, baseHeight);

+    if (auto widthConst =
+            mlir::triton::intel::getFoldedConstantValue(baseWidth)) {
+      if ((*widthConst * elemSizeInBits / 8) < 64)
+        return failure();
+    }
+
     const unsigned originalElemBits = elemSizeInBits;
     if (isTransposeRequired) {
       // adjust the block io parameter to align HW's limitations on
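As a worked example of the 64-byte bound enforced above (the formula and threshold come straight from this hunk): with 16-bit elements, elemSizeInBits = 16, so a folded pitch or width of 32 elements is 32 * 16 / 8 = 64 bytes and passes, while 16 elements is only 32 bytes and makes the pattern return failure(). When the pitch or width does not fold to a constant, the corresponding check is simply skipped.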

third_party/intel/lib/Utils/Utility.cpp

Lines changed: 43 additions & 15 deletions

@@ -1,5 +1,6 @@
 #include "intel/include/Utils/Utility.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/Interfaces/LoopLikeInterface.h"
@@ -104,28 +105,55 @@ std::optional<tt::MakeTensorPtrOp> findDefiningMakeTensorPtrOp(Value val) {
   return std::nullopt;
 }

-std::optional<int64_t> getFoldedConstantValue(Operation *op) {
-  SmallVector<OpFoldResult> results;
-  if (failed(op->fold(results)))
-    return std::nullopt;
+static Value skipCasts(Value v) {
+  Operation *def = v.getDefiningOp();
+  if (def &&
+      isa<LLVM::TruncOp, LLVM::SExtOp, LLVM::ZExtOp, LLVM::BitcastOp>(def))
+    return def->getOperand(0);
+  return v;
+}
+
+static Value foldValue(Value v) {
+  if (Operation *def = v.getDefiningOp()) {
+    SmallVector<OpFoldResult> results;
+
+    if (failed(def->fold(results)))
+      return v;

-  // If fold succeeded but `results` is empty, we give a second try, after the
-  // operands have been switched during the first call to `fold()`.
-  if (results.empty()) {
-    if (failed(op->fold(results)))
-      return std::nullopt;
+    // If fold succeeded but `results` is empty, we give a second try, after the
+    // operands have been switched during the first call to `fold()`.
+    if (results.empty()) {
+      if (failed(def->fold(results)))
+        return v;
+    }
+
+    if (results.size() == 1) {
+      if (auto val = dyn_cast_or_null<Value>(results[0]))
+        return val;
+    }
   }
+  return v;
+}

-  if (results.size() != 1)
-    return std::nullopt;
+std::optional<int64_t> getFoldedConstantValue(Value v, int depth) {
+  for (int i = 0; i < depth; ++i) {
+    if (auto res = getConstantIntValue(v))
+      return res;
+
+    Value newV = skipCasts(v);
+    newV = foldValue(newV);
+
+    if (newV == v)
+      break;

-  return getConstantIntValue(results[0]);
+    v = newV;
+  }
+
+  return std::nullopt;
 }

 bool isConstant(Value val, int64_t expected) {
-  if (auto defOp = val.getDefiningOp())
-    return (getFoldedConstantValue(defOp) == expected);
-  return false;
+  return (getFoldedConstantValue(val) == expected);
 }

 Value getFinalValue(Value value) {
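Two behavioral notes on the rewritten helper, both visible in the hunk above: the walk is bounded by depth (default 8 per the header) and also exits early once an iteration makes no progress (newV == v), returning std::nullopt in either case; and the simplified isConstant keeps its old behavior for values with no defining op, since the loop then stops on the first iteration and the comparison against expected fails.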
