intel
diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp
Lines changed: 53 additions & 44 deletions
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/BF16Casts.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/BF16Casts.cpp
Lines changed: 26 additions & 24 deletions
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/ControlFlowOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/ControlFlowOpToLLVM.cpp
Lines changed: 3 additions & 2 deletions
@@ -183,6 +183,7 @@ createGenISA2DBlockRead(TritonGEN::Matrix2DBlockLoadOp op,
   MLIRContext *ctx = rewriter.getContext();
   VectorType resType = op.getRes().getType();
   Location loc = op->getLoc();
+  auto b = TritonLLVMOpBuilder(loc, rewriter);
 
   Value ptr = op.getPtr();
   Value baseWidth = op.getBaseWidth();
@@ -199,7 +200,7 @@ createGenISA2DBlockRead(TritonGEN::Matrix2DBlockLoadOp op,
 
   // The IGC intrinsic requires the first argument be int64
   ptr = rewriter.create<LLVM::PtrToIntOp>(loc, int64Ty, ptr);
-  Value one = i32_val(1);
+  Value one = b.i32_val(1);
 
   SmallVector<Type> argTypes{int64Ty,
                              baseWidth.getType(),
@@ -216,18 +217,18 @@ createGenISA2DBlockRead(TritonGEN::Matrix2DBlockLoadOp op,
                              int32Ty};
 
   SmallVector<Value> args{ptr,
-                          sub(baseWidth, one),
-                          sub(baseHeight, one),
-                          sub(basePitch, one),
+                          b.sub(baseWidth, one),
+                          b.sub(baseHeight, one),
+                          b.sub(basePitch, one),
                           x,
                           y,
-                          i32_val(op.getElemSizeInBits()),
-                          i32_val(op.getTileWidth()),
-                          i32_val(op.getTileHeight()),
-                          i32_val(op.getVBlocks()),
-                          i1_val(op.getTranspose()),
-                          i1_val(op.getVnniTransform()),
-                          i32_val(static_cast<int>(op.getCacheControl()))};
+                          b.i32_val(op.getElemSizeInBits()),
+                          b.i32_val(op.getTileWidth()),
+                          b.i32_val(op.getTileHeight()),
+                          b.i32_val(op.getVBlocks()),
+                          b.i1_val(op.getTranspose()),
+                          b.i1_val(op.getVnniTransform()),
+                          b.i32_val(static_cast<int>(op.getCacheControl()))};
 
   LLVM::CallOp call = createDeviceFunctionCall(
       rewriter, funcName, resType, argTypes, args, {}, noUnwindWillReturnAttrs);
@@ -291,6 +292,7 @@ createGenISA2DBlockWrite(TritonGEN::Matrix2DBlockStoreOp op,
                          ConversionPatternRewriter &rewriter) {
   MLIRContext *ctx = rewriter.getContext();
   Location loc = op->getLoc();
+  auto b = TritonLLVMOpBuilder(loc, rewriter);
 
   // The IGC intrinsic requires the first argument be int64
   Value ptr = op.getPtr();
@@ -305,7 +307,7 @@ createGenISA2DBlockWrite(TritonGEN::Matrix2DBlockStoreOp op,
   VectorType storeValType = op.getStoredVal().getType();
   std::string funcName =
       "llvm.genx.GenISA.LSC2DBlockWrite." + getGenISATypeMangling(storeValType);
-  Value one = i32_val(1);
+  Value one = b.i32_val(1);
 
   SmallVector<Type> argTypes{
       int_ty(64),          baseWidth.getType(), baseHeight.getType(),
@@ -314,18 +316,18 @@ createGenISA2DBlockWrite(TritonGEN::Matrix2DBlockStoreOp op,
       int_ty(32),          int_ty(1),           int_ty(1),
       int_ty(32),          storeVal.getType()};
   SmallVector<Value> args{ptr,
-                          sub(baseWidth, one),
-                          sub(baseHeight, one),
-                          sub(basePitch, one),
+                          b.sub(baseWidth, one),
+                          b.sub(baseHeight, one),
+                          b.sub(basePitch, one),
                           x,
                           y,
-                          i32_val(op.getElemSizeInBits()),
-                          i32_val(op.getTileWidth()),
-                          i32_val(op.getTileHeight()),
-                          i32_val(op.getVBlocks()),
-                          i1_val(false), // transpose
-                          i1_val(false), // vnniTransform
-                          i32_val(static_cast<int>(op.getCacheControl())),
+                          b.i32_val(op.getElemSizeInBits()),
+                          b.i32_val(op.getTileWidth()),
+                          b.i32_val(op.getTileHeight()),
+                          b.i32_val(op.getVBlocks()),
+                          b.i1_val(false), // transpose
+                          b.i1_val(false), // vnniTransform
+                          b.i32_val(static_cast<int>(op.getCacheControl())),
                           storeVal};
 
   LLVM::CallOp call =
@@ -339,6 +341,7 @@ createGenISA2DBlockPrefetch(TritonGEN::Matrix2DBlockPrefetchOp op,
                             ConversionPatternRewriter &rewriter) {
   MLIRContext *ctx = rewriter.getContext();
   Location loc = op->getLoc();
+  auto b = TritonLLVMOpBuilder(loc, rewriter);
 
   // The IGC intrinsic requires the first argument be int64
   Value ptr = op.getPtr();
@@ -348,7 +351,7 @@ createGenISA2DBlockPrefetch(TritonGEN::Matrix2DBlockPrefetchOp op,
   Value basePitch = op.getBasePitch();
   Value x = op.getX();
   Value y = op.getY();
-  Value one = i32_val(1);
+  Value one = b.i32_val(1);
 
   SmallVector<Type> argTypes{
       int_ty(64),          baseWidth.getType(), baseHeight.getType(),
@@ -357,18 +360,18 @@ createGenISA2DBlockPrefetch(TritonGEN::Matrix2DBlockPrefetchOp op,
       int_ty(32),          int_ty(1),           int_ty(1),
       int_ty(32)};
   SmallVector<Value> args{ptr,
-                          sub(baseWidth, one),
-                          sub(baseHeight, one),
-                          sub(basePitch, one),
+                          b.sub(baseWidth, one),
+                          b.sub(baseHeight, one),
+                          b.sub(basePitch, one),
                           x,
                           y,
-                          i32_val(op.getElemSizeInBits()),
-                          i32_val(op.getTileWidth()),
-                          i32_val(op.getTileHeight()),
-                          i32_val(op.getVBlocks()),
-                          i1_val(false), // transpose
-                          i1_val(false), // vnniTransform
-                          i32_val(static_cast<int>(op.getCacheControl()))};
+                          b.i32_val(op.getElemSizeInBits()),
+                          b.i32_val(op.getTileWidth()),
+                          b.i32_val(op.getTileHeight()),
+                          b.i32_val(op.getVBlocks()),
+                          b.i1_val(false), // transpose
+                          b.i1_val(false), // vnniTransform
+                          b.i32_val(static_cast<int>(op.getCacheControl()))};
 
   const StringLiteral funcName = "llvm.genx.GenISA.LSC2DBlockPrefetch.isVoid";
   return createDeviceFunctionCall(rewriter, funcName, void_ty(ctx), {argTypes},
@@ -485,11 +488,12 @@ struct TritonMatrix2DBlockLoadLowering
                   ConversionPatternRewriter &rewriter) const override {
     MLIRContext *ctx = rewriter.getContext();
     Location loc = op->getLoc();
+    auto b = TritonLLVMOpBuilder(loc, rewriter);
     VectorType resType = op.getRes().getType();
 
     auto dest = rewriter.create<LLVM::AllocaOp>(
         loc, ptr_ty(ctx), resType.getElementType(),
-        i32_val(resType.getNumElements()));
+        b.i32_val(resType.getNumElements()));
     std::string fnName = "intel_sub_group_2d_block_read_";
     if (op.getVnniTransform())
       fnName += "transform_";
@@ -503,9 +507,10 @@ struct TritonMatrix2DBlockLoadLowering
     fnName +=
         intel::getTypeMangling(resType.getElementType(), /*isUnsigned=*/true);
     VectorType vecType = vec_ty(i32_ty, 2);
-    Value byteCoord = insert_element(
-        vecType, insert_element(vecType, undef(vecType), op.getX(), i32_val(0)),
-        op.getY(), i32_val(1));
+    Value byteCoord = b.insert_element(
+        vecType,
+        b.insert_element(vecType, b.undef(vecType), op.getX(), b.i32_val(0)),
+        op.getY(), b.i32_val(1));
     SmallVector<Type> argTypes{ptr_ty(ctx, 1), i32_ty,  i32_ty,
                                i32_ty,         vecType, ptr_ty(ctx)};
     SmallVector<Value> args{op.getPtr(),        op.getBaseWidth(),
@@ -545,11 +550,12 @@ struct TritonMatrix2DBlockStoreLowering
                   ConversionPatternRewriter &rewriter) const override {
     MLIRContext *ctx = rewriter.getContext();
     Location loc = op->getLoc();
+    auto b = TritonLLVMOpBuilder(loc, rewriter);
 
     VectorType storeValType = op.getStoredVal().getType();
     auto storeValPtr = rewriter.create<LLVM::AllocaOp>(
         loc, ptr_ty(ctx), storeValType.getElementType(),
-        i32_val(storeValType.getNumElements()));
+        b.i32_val(storeValType.getNumElements()));
     rewriter.create<LLVM::StoreOp>(loc, op.getStoredVal(), storeValPtr);
 
     std::string fnName = "intel_sub_group_2d_block_write_";
@@ -565,9 +571,10 @@ struct TritonMatrix2DBlockStoreLowering
                                          : "h";
 
     VectorType vecType = vec_ty(i32_ty, 2);
-    Value byteCoord = insert_element(
-        vecType, insert_element(vecType, undef(vecType), op.getX(), i32_val(0)),
-        op.getY(), i32_val(1));
+    Value byteCoord = b.insert_element(
+        vecType,
+        b.insert_element(vecType, b.undef(vecType), op.getX(), b.i32_val(0)),
+        op.getY(), b.i32_val(1));
     SmallVector<Type> argTypes{ptr_ty(ctx, 1), i32_ty,  i32_ty,
                                i32_ty,         vecType, ptr_ty(ctx)};
     SmallVector<Value> args{op.getPtr(),        op.getBaseWidth(),
@@ -607,16 +614,18 @@ struct TritonMatrix2DBlockPrefetchLowering
                   ConversionPatternRewriter &rewriter) const override {
     MLIRContext *ctx = rewriter.getContext();
     Location loc = op->getLoc();
+    auto b = TritonLLVMOpBuilder(loc, rewriter);
     std::string fnName = "intel_sub_group_2d_block_prefetch_";
     fnName += std::to_string(op.getElemSizeInBits()) + "b_" +
               std::to_string(op.getTileHeight()) + "r" +
               std::to_string(op.getTileWidth()) + "x" +
               std::to_string(op.getVBlocks()) + "c";
     fnName = "_Z" + std::to_string(fnName.size()) + fnName + "PU3AS1viiiDv2_i";
     VectorType vecType = vec_ty(i32_ty, 2);
-    Value byteCoord = insert_element(
-        vecType, insert_element(vecType, undef(vecType), op.getX(), i32_val(0)),
-        op.getY(), i32_val(1));
+    Value byteCoord = b.insert_element(
+        vecType,
+        b.insert_element(vecType, b.undef(vecType), op.getX(), b.i32_val(0)),
+        op.getY(), b.i32_val(1));
     SmallVector<Type> argTypes{ptr_ty(ctx, 1), i32_ty, i32_ty, i32_ty, vecType};
     SmallVector<Value> args{op.getPtr(), op.getBaseWidth(), op.getBaseHeight(),
                             op.getBasePitch(), byteCoord};
 
@@ -75,6 +75,7 @@ struct TruncBF16 : ConvertOpToLLVMPattern<arith::TruncFOp> {
 namespace mlir::triton::intel {
 Value convertBf16ToFp32(Location loc, ConversionPatternRewriter &rewriter,
                         Value v) {
+  auto b = TritonLLVMOpBuilder(loc, rewriter);
   if (auto definingOp = v.getDefiningOp()) {
     auto moduleOp = definingOp->getParentWithTrait<OpTrait::SymbolTable>();
     if (moduleOp->hasAttr(triton::gpu::intel::TritonIntelGPUDialect::
@@ -86,19 +87,20 @@ Value convertBf16ToFp32(Location loc, ConversionPatternRewriter &rewriter,
       auto ext_func = triton::gpu::intel::lookupOrCreateSPIRVFn(moduleOp, name,
                                                                 inTy, outTy);
       auto call = triton::gpu::intel::createSPIRVBuiltinCall(
-          loc, rewriter, ext_func, bitcast(v, inTy).getResult());
+          loc, rewriter, ext_func, b.bitcast(v, inTy).getResult());
       return call.getResult();
     }
   }
 
-  auto as_int16 = bitcast(v, i16_ty);
-  auto as_int32 = zext(i32_ty, as_int16);
-  auto shifted = shl(i32_ty, as_int32, i32_val(16));
-  return (bitcast(shifted, f32_ty));
+  auto as_int16 = b.bitcast(v, i16_ty);
+  auto as_int32 = b.zext(i32_ty, as_int16);
+  auto shifted = b.shl(i32_ty, as_int32, b.i32_val(16));
+  return (b.bitcast(shifted, f32_ty));
 }
 
 Value convertFp32ToBf16(Location loc, ConversionPatternRewriter &rewriter,
                         Value v, RoundingMode rounding) {
+  auto b = TritonLLVMOpBuilder(loc, rewriter);
   if (auto definingOp = v.getDefiningOp()) {
     auto moduleOp = definingOp->getParentWithTrait<OpTrait::SymbolTable>();
     if (moduleOp->hasAttr(triton::gpu::intel::TritonIntelGPUDialect::
@@ -114,36 +116,36 @@ Value convertFp32ToBf16(Location loc, ConversionPatternRewriter &rewriter,
           moduleOp, name, inTy, funcOutTy);
       auto call = triton::gpu::intel::createSPIRVBuiltinCall(loc, rewriter,
                                                              trunc_func, v);
-      return bitcast(call.getResult(), outTy);
+      return b.bitcast(call.getResult(), outTy);
     }
   }
 
   assert(!isa<VectorType>(v.getType()) && "Not yet supported");
 
-  auto as_uint32 = bitcast(v, i32_ty);
+  auto as_uint32 = b.bitcast(v, i32_ty);
   auto check_exponent =
-      and_(i32_ty, xor_(i32_ty, as_uint32, i32_val(0xffffffff)),
-           i32_val(0x7f800000));
-  auto exponent_not_all1s = icmp_ne(check_exponent, i32_val(0));
-  auto exponent_all1s = icmp_eq(check_exponent, i32_val(0));
+      b.and_(i32_ty, b.xor_(i32_ty, as_uint32, b.i32_val(0xffffffff)),
+             b.i32_val(0x7f800000));
+  auto exponent_not_all1s = b.icmp_ne(check_exponent, b.i32_val(0));
+  auto exponent_all1s = b.icmp_eq(check_exponent, b.i32_val(0));
   Value rounded = as_uint32;
   if (rounding == RoundingMode::RTNE) {
-    rounded =
-        add(i32_ty, i32_val(0x7fff),
-            and_(i32_ty, lshr(i32_ty, as_uint32, i32_val(16)), i32_val(1)));
-    rounded = add(i32_ty, rounded, as_uint32);
-    rounded = select(exponent_not_all1s, rounded, as_uint32);
+    rounded = b.add(
+        i32_ty, b.i32_val(0x7fff),
+        b.and_(i32_ty, b.lshr(i32_ty, as_uint32, b.i32_val(16)), b.i32_val(1)));
+    rounded = b.add(i32_ty, rounded, as_uint32);
+    rounded = b.select(exponent_not_all1s, rounded, as_uint32);
   }
 
-  auto preserve_nan =
-      and_(i1_ty, exponent_all1s,
-           icmp_ne(and_(i32_ty, as_uint32, i32_val(0xffff)), i32_val(0)));
-  auto nan = or_(i32_ty, as_uint32, i32_val(0x10000));
-  Value res = select(preserve_nan, nan, rounded);
+  auto preserve_nan = b.and_(
+      i1_ty, exponent_all1s,
+      b.icmp_ne(b.and_(i32_ty, as_uint32, b.i32_val(0xffff)), b.i32_val(0)));
+  auto nan = b.or_(i32_ty, as_uint32, b.i32_val(0x10000));
+  Value res = b.select(preserve_nan, nan, rounded);
 
-  auto shifted = lshr(i32_ty, res, i32_val(16));
-  auto truncated = trunc(i16_ty, shifted);
-  return bitcast(truncated, bf16_ty);
+  auto shifted = b.lshr(i32_ty, res, b.i32_val(16));
+  auto truncated = b.trunc(i16_ty, shifted);
+  return b.bitcast(truncated, bf16_ty);
 }
 
 void populateBF16CastsLLVMPatterns(LLVMTypeConverter &typeConverter,
 
@@ -38,9 +38,10 @@ struct ReturnOpConversion
         Value packedResults =
             rewriter.create<LLVM::UndefOp>(op.getLoc(), packedResultsTy);
         auto loc = op.getLoc();
+        auto b = TritonLLVMOpBuilder(loc, rewriter);
         for (auto it : llvm::enumerate(adaptor.getOperands())) {
-          packedResults = insert_val(packedResultsTy, packedResults, it.value(),
-                                     it.index());
+          packedResults = b.insert_val(packedResultsTy, packedResults,
+                                       it.value(), it.index());
         }
         newOp = rewriter.create<LLVM::ReturnOp>(op.getLoc(), packedResults);
       }
Original file line number	Diff line number	Diff line change
`@@ -38,9 +38,10 @@ struct ReturnOpConversion`
`38`	`38`	`Value packedResults =`
`39`	`39`	`rewriter.create<LLVM::UndefOp>(op.getLoc(), packedResultsTy);`
`40`	`40`	`auto loc = op.getLoc();`
	`41`	`+ auto b = TritonLLVMOpBuilder(loc, rewriter);`
`41`	`42`	`for (auto it : llvm::enumerate(adaptor.getOperands())) {`
`42`		`- packedResults = insert_val(packedResultsTy, packedResults, it.value(),`
`43`		`- it.index());`
	`43`	`+ packedResults = b.insert_val(packedResultsTy, packedResults,`
	`44`	`+ it.value(), it.index());`
`44`	`45`	`}`
`45`	`46`	`newOp = rewriter.create<LLVM::ReturnOp>(op.getLoc(), packedResults);`
`46`	`47`	`}`