Fix build and test failures after llvm/llvm-project@bc773632355b

anmyachev · whitneywhtsang · commit be91d90999a9 · 2025-09-04T05:03:26.000Z
Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
diff --git a/test/TritonIntelGPU/split-barrier.mlir b/test/TritonIntelGPU/split-barrier.mlir
@@ -19,9 +19,9 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     %22 = tt.make_tensor_ptr %arg1, [%c0_i64, %c0_i64], [%c0_i64, %c0_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #dot1>>
     // CHECK:      scf.for %[[V:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args({{.*}}) -> (tensor<128x256xf32, #mma>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>)
     // CHECK:      ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
-    // CHECK-NEXT: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
+    // CHECK:      ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
     // CHECK:      ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
-    // CHECK-NEXT: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
+    // CHECK:      ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
     // CHECK:      scf.for %[[IV:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args({{.*}}) -> (tensor<128x256xf32, #mma>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>)
     // WORKGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = WorkGroup, memory_scope = WorkGroup}
     // SUBGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = SubGroup, memory_scope = SubGroup}
@@ -66,9 +66,9 @@ module attributes {"ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32
     %22 = tt.make_tensor_ptr %arg1, [%c0_i64, %c0_i64], [%c0_i64, %c0_i64], [%c0_i32, %c0_i32] {order = array<i32: 1, 0>} : <tensor<64x256xf16, #dot1>>
 
     // CHECK:      ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
-    // CHECK-NEXT: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
+    // CHECK:      ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
     // CHECK:      ttig.prefetch {{.*}} : !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #[[$DPAS]], kWidth = 1}>>>
-    // CHECK-NEXT: ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
+    // CHECK:      ttig.prefetch {{.*}} : !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #[[$DPAS]], kWidth = 2}>>>
     // CHECK:      scf.for %[[IV:.*]] = {{.*}} to {{.*}} step {{.*}} iter_args({{.*}}) -> (tensor<128x256xf32, #mma>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>, !tt.ptr<tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 1}>>>, !tt.ptr<tensor<64x256xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>>)
     // WORKGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = WorkGroup, memory_scope = WorkGroup}
     // SUBGROUP_SCOPE-NEXT: triton_gen.split_barrier_arrive {execution_scope = SubGroup, memory_scope = SubGroup}
diff --git a/third_party/intel/lib/Dialect/Triton/Transforms/RemoveMasks.cpp b/third_party/intel/lib/Dialect/Triton/Transforms/RemoveMasks.cpp
@@ -120,8 +120,8 @@ class CanonicalMaskValidator final : public MaskValidatorBase {
           cast<arith::ConstantIntOp>(maskInfo.N.getDefiningOp()).value();
       unsigned END = maskInfo.END;
       bool cond = UB == ((N - END) / END) + 1;
-      return builder.create<arith::ConstantIntOp>(forOp.getLoc(), cond,
-                                                  builder.getI1Type());
+      return builder.create<arith::ConstantIntOp>(forOp.getLoc(),
+                                                  builder.getI1Type(), cond);
     }
 
     auto divOp = cast<arith::DivSIOp>(defOp);
@@ -276,8 +276,8 @@ class InvariantMaskValidator final : public MaskValidatorBase {
       [[maybe_unused]] auto rangeOp = cast<tt::MakeRangeOp>(rhs);
       assert(rangeOp.getStart() < rangeOp.getEnd() && "Invalid range");
       unsigned start = rangeOp.getStart();
-      auto cstOp = builder.createOrFold<arith::ConstantIntOp>(loc, start,
-                                                              lhsVal.getType());
+      auto cstOp = builder.createOrFold<arith::ConstantIntOp>(
+          loc, lhsVal.getType(), start);
       return builder.createOrFold<arith::CmpIOp>(loc, arith::CmpIPredicate::slt,
                                                  lhsVal, cstOp);
     }
diff --git a/third_party/intel/lib/TritonGENToLLVM/Attributes.cpp b/third_party/intel/lib/TritonGENToLLVM/Attributes.cpp
@@ -10,6 +10,7 @@
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "llvm/IR/Value.h"
 #include "llvm/Support/Debug.h"
 
 #define DEBUG_TYPE "attributes"
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/DistributeToWarps.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/DistributeToWarps.cpp
@@ -238,7 +238,7 @@ void distributeMakeRangeOp(tt::MakeRangeOp op, Value warpId) {
   // case, we would need to determine a dimension-specific offset similar to
   // `tt.make_tensor_ptr`' distribution pattern.
   auto elemTy = convTy.getElementType();
-  auto numElemsConst = b.create<arith::ConstantIntOp>(loc, numElems, elemTy);
+  auto numElemsConst = b.create<arith::ConstantIntOp>(loc, elemTy, numElems);
   auto rangeOffset = b.create<arith::MulIOp>(loc, warpId, numElemsConst);
   auto splat = b.create<tt::SplatOp>(loc, convTy, rangeOffset);
   auto newRange = b.create<arith::AddIOp>(loc, subRange, splat);
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/MatchTargetSize.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/MatchTargetSize.cpp
@@ -1253,7 +1253,7 @@ void MatchTargetSizePass::transformMakeRangeOp(tt::MakeRangeOp op) {
   SmallVector<Value> subRanges;
   for (int i = 0; i < end / subgroupSize; ++i) {
     Value offset =
-        b.create<arith::ConstantIntOp>(loc, i * subgroupSize, elemTy);
+        b.create<arith::ConstantIntOp>(loc, elemTy, i * subgroupSize);
     Value offsetTensor = b.create<tt::SplatOp>(loc, subRangeTy, offset);
     subRanges.push_back(b.create<arith::AddIOp>(loc, subRange, offsetTensor));
   }

Original file line number	Diff line number	Diff line change
`@@ -1253,7 +1253,7 @@ void MatchTargetSizePass::transformMakeRangeOp(tt::MakeRangeOp op) {`
`1253`	`1253`	`SmallVector<Value> subRanges;`
`1254`	`1254`	`for (int i = 0; i < end / subgroupSize; ++i) {`
`1255`	`1255`	`Value offset =`
`1256`		`- b.create<arith::ConstantIntOp>(loc, i * subgroupSize, elemTy);`
	`1256`	`+ b.create<arith::ConstantIntOp>(loc, elemTy, i * subgroupSize);`
`1257`	`1257`	`Value offsetTensor = b.create<tt::SplatOp>(loc, subRangeTy, offset);`
`1258`	`1258`	`subRanges.push_back(b.create<arith::AddIOp>(loc, subRange, offsetTensor));`
`1259`	`1259`	`}`