diff --git a/cmake/llvm-hash.txt b/cmake/llvm-hash.txt
index 8516e80da2..1bb3165bbb 100644
--- a/cmake/llvm-hash.txt
+++ b/cmake/llvm-hash.txt
@@ -1 +1 @@
-f6ded0be897e2878612dd903f7e8bb85448269e5
+49d5bb0ad0cb31410184c462801c5049ad671517
diff --git a/lib/Dialect/TritonGPU/IR/Ops.cpp b/lib/Dialect/TritonGPU/IR/Ops.cpp
index f324eb99cb..ece3cbdf81 100644
--- a/lib/Dialect/TritonGPU/IR/Ops.cpp
+++ b/lib/Dialect/TritonGPU/IR/Ops.cpp
@@ -931,8 +931,9 @@ void WarpSpecializeOp::getSuccessorRegions(
     return;
   }
   // And the default region branches transparently back to the parent.
-  assert(src.getRegionOrNull() == &getDefaultRegion());
-  successors.push_back(RegionSuccessor(getResults()));
+  assert(src.getTerminatorPredecessorOrNull()->getParentRegion() ==
+         &getDefaultRegion());
+  successors.push_back(RegionSuccessor(getOperation(), getResults()));
 }
 
 LogicalResult WarpSpecializeOp::verify() {
diff --git a/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionLoops.cpp b/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionLoops.cpp
index 61508a0d93..603a2c2310 100644
--- a/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionLoops.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionLoops.cpp
@@ -338,7 +338,7 @@ void cloneOpsInBlock(Block *block, SmallVector &builders,
             builder.mapping.lookupOrDefault(yieldOp.getOperand(i)));
       }
 
-      builder.create<scf::YieldOp>(op->getLoc(), newYieldOperands);
+      scf::YieldOp::create(builder, op->getLoc(), newYieldOperands);
     }
   } else {
     assert(hasPartition(op));
@@ -449,7 +449,7 @@ LogicalResult triton::gpu::partitionLoop(scf::ForOp loop) {
   for (auto [b, region, partition] : llvm::zip(
            builders, wgOp.getPartitionRegions(), partitions.getPartitions())) {
     if (!llvm::is_contained(getPartitionIds(loop), b.partitionId)) {
-      b.create<nvws::WarpGroupYieldOp>(wgOp.getLoc(), SmallVector<Value>{});
+      nvws::WarpGroupYieldOp::create(b, wgOp.getLoc(), SmallVector<Value>{});
      continue;
    }
    auto newForOp = *region.front().getOps<scf::ForOp>().begin();
diff --git a/python/src/llvm.cc b/python/src/llvm.cc
index f5d379aae6..e9180728b9 100644
--- a/python/src/llvm.cc
+++ b/python/src/llvm.cc
@@ -57,7 +57,6 @@ createTargetMachine(llvm::Module *module, std::string proc,
   bool disableLLVMOpt = mlir::triton::tools::getBoolEnv("DISABLE_LLVM_OPT");
   if (enable_fp_fusion)
     opt.AllowFPOpFusion = llvm::FPOpFusion::Fast;
-  opt.UnsafeFPMath = false;
   opt.NoInfsFPMath = false;
   opt.NoNaNsFPMath = true;
   opt.TrapUnreachable = true;
diff --git a/test/Conversion/tritonnvidiagpu_to_llvm.mlir b/test/Conversion/tritonnvidiagpu_to_llvm.mlir
index 0e9690b5d5..f5a0b7eb06 100644
--- a/test/Conversion/tritonnvidiagpu_to_llvm.mlir
+++ b/test/Conversion/tritonnvidiagpu_to_llvm.mlir
@@ -215,9 +215,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
 // CHECK-LABEL: async_copy_mbarrier_arrive
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
   tt.func public @async_copy_mbarrier_arrive(%arg0: !ttg.memdesc<1xi64, #shared, #ttg.shared_memory>) attributes { noinline = false } {
-    // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} : !llvm.ptr<3>
+    // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} : !llvm.ptr<3>
     ttng.async_copy_mbarrier_arrive %arg0 : !ttg.memdesc<1xi64, #shared, #ttg.shared_memory>
-    // CHECK: nvvm.cp.async.mbarrier.arrive.shared %{{.*}} {noinc = true} : !llvm.ptr<3>
+    // CHECK: nvvm.cp.async.mbarrier.arrive %{{.*}} {noinc = true} : !llvm.ptr<3>
     ttng.async_copy_mbarrier_arrive %arg0 { noIncrement } : !ttg.memdesc<1xi64, #shared, #ttg.shared_memory>
     tt.return
   }
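// The dominant change in this patch is a mechanical builder-API migration:
// MLIR now exposes a static `create` entry point on every op, so the
// member-template call `builder.create<OpTy>(loc, ...)` becomes
// `OpTy::create(builder, loc, ...)`. A minimal sketch of the two forms
// (illustrative only; `builder`, `loc`, and `operands` are assumed to be in
// scope):
//
//   builder.create<scf::YieldOp>(loc, operands);   // old member-template form
//   scf::YieldOp::create(builder, loc, operands);  // new static-create form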
diff --git a/third_party/amd/include/Analysis/RangeAnalysis.h b/third_party/amd/include/Analysis/RangeAnalysis.h
index df5ff673ba..e042ecde06 100644
--- a/third_party/amd/include/Analysis/RangeAnalysis.h
+++ b/third_party/amd/include/Analysis/RangeAnalysis.h
@@ -84,7 +84,7 @@ struct TritonIntegerRangeAnalysis : dataflow::IntegerRangeAnalysis {
   /// the loop operands and all users and all users of the results of the loop.
   void visitRegionSuccessors(
       ProgramPoint *point, RegionBranchOpInterface branch,
-      RegionBranchPoint successor,
+      RegionSuccessor successor,
       ArrayRef<dataflow::AbstractSparseLattice *> abstractLattices) override;
 
   /// Collect all operands that participate in assumptions (see description of
diff --git a/third_party/amd/lib/Analysis/RangeAnalysis.cpp b/third_party/amd/lib/Analysis/RangeAnalysis.cpp
index fd0eb9ab7b..b1057774cb 100644
--- a/third_party/amd/lib/Analysis/RangeAnalysis.cpp
+++ b/third_party/amd/lib/Analysis/RangeAnalysis.cpp
@@ -630,7 +630,7 @@ void TritonIntegerRangeAnalysis::initializeFuncOp(tt::FuncOp op) {
 
 void TritonIntegerRangeAnalysis::visitRegionSuccessors(
     ProgramPoint *point, RegionBranchOpInterface branch,
-    RegionBranchPoint successor,
+    RegionSuccessor successor,
     ArrayRef<dataflow::AbstractSparseLattice *> abstractLattices) {
   LLVM_DEBUG({
     DBGS() << "Visit Region Succesors of ";
@@ -715,10 +715,11 @@ void TritonIntegerRangeAnalysis::visitRegionSuccessors(
     if (!inputs.empty()) {
       firstIndex = cast<OpResult>(inputs.front()).getResultNumber();
     }
-    visitNonControlFlowArguments(branch,
-                                 RegionSuccessor(branch->getResults().slice(
-                                     firstIndex, inputs.size())),
-                                 lattices, firstIndex);
+    visitNonControlFlowArguments(
+        branch,
+        RegionSuccessor(
+            branch, branch->getResults().slice(firstIndex, inputs.size())),
+        lattices, firstIndex);
   } else {
     if (!inputs.empty()) {
       firstIndex = cast<BlockArgument>(inputs.front()).getArgNumber();
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp
index 5c71425db8..f95b3a1bba 100644
--- a/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp
+++ b/third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp
@@ -481,8 +481,8 @@ class LocalBarrierOpConversion
     //   amdgpu::MemoryCounterWaitOp will lower s_waitcnt
     // - s_barrier syncronizes the execution for the CTA
     auto dsAttr = rewriter.getI32IntegerAttr(0);
-    rewriter.create<amdgpu::MemoryCounterWaitOp>(
-        op->getLoc(), /* load= */ nullptr, /* store= */ nullptr,
+    amdgpu::MemoryCounterWaitOp::create(
+        rewriter, op->getLoc(), /* load= */ nullptr, /* store= */ nullptr,
         /* ds= */ dsAttr);
 
     rewriter.replaceOpWithNewOp(op);
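// The RangeAnalysis changes above track the updated upstream dataflow API: the
// hook now receives the RegionSuccessor itself rather than a
// RegionBranchPoint, and a RegionSuccessor built from result values also names
// the branching op. A minimal sketch of the new construction (`branch`,
// `firstIndex`, and `n` are assumed placeholders):
//
//   // old: RegionSuccessor(branch->getResults().slice(firstIndex, n))
//   RegionSuccessor successor(branch,
//                             branch->getResults().slice(firstIndex, n));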
diff --git a/third_party/intel/cmake/FindSPIRVToLLVMTranslator.cmake b/third_party/intel/cmake/FindSPIRVToLLVMTranslator.cmake
index 741c68429e..d059f86e02 100644
--- a/third_party/intel/cmake/FindSPIRVToLLVMTranslator.cmake
+++ b/third_party/intel/cmake/FindSPIRVToLLVMTranslator.cmake
@@ -75,76 +75,6 @@ if (NOT SPIRVToLLVMTranslator_FOUND)
     if(NOT PATCH_RESULT EQUAL 0)
       message(FATAL_ERROR "Failed to apply 3388.patch to SPIRV-LLVM-Translator")
     endif()
-
-    # FIXME: Don't apply patch when LLVM commit update to 6cba572.
-    execute_process(
-      COMMAND git apply --check ${CMAKE_CURRENT_LIST_DIR}/revert_3385.patch
-      WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
-      ERROR_QUIET
-      RESULT_VARIABLE PATCH_RESULT
-    )
-    if(PATCH_RESULT EQUAL 0)
-      execute_process(
-        COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/revert_3385.patch
-        WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
-        RESULT_VARIABLE PATCH_RESULT
-      )
-    else()
-      execute_process( # Check if the patch is already applied
-        COMMAND git apply --reverse --check ${CMAKE_CURRENT_LIST_DIR}/revert_3385.patch
-        WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
-        RESULT_VARIABLE PATCH_RESULT
-      )
-    endif()
-    if(NOT PATCH_RESULT EQUAL 0)
-      message(FATAL_ERROR "Failed to apply revert_3385.patch to SPIRV-LLVM-Translator")
-    endif()
-
-    # FIXME: Don't apply patch when LLVM commit update to 573ca36.
-    execute_process(
-      COMMAND git apply --check ${CMAKE_CURRENT_LIST_DIR}/revert_3406.patch
-      WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
-      ERROR_QUIET
-      RESULT_VARIABLE PATCH_RESULT
-    )
-    if(PATCH_RESULT EQUAL 0)
-      execute_process(
-        COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/revert_3406.patch
-        WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
-        RESULT_VARIABLE PATCH_RESULT
-      )
-    else()
-      execute_process( # Check if the patch is already applied
-        COMMAND git apply --reverse --check ${CMAKE_CURRENT_LIST_DIR}/revert_3406.patch
-        WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
-        RESULT_VARIABLE PATCH_RESULT
-      )
-    endif()
-    if(NOT PATCH_RESULT EQUAL 0)
-      message(FATAL_ERROR "Failed to apply revert_3406.patch to SPIRV-LLVM-Translator")
-    endif()
-    execute_process(
-      COMMAND git apply --check ${CMAKE_CURRENT_LIST_DIR}/revert_3407.patch
-      WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
-      ERROR_QUIET
-      RESULT_VARIABLE PATCH_RESULT
-    )
-    if(PATCH_RESULT EQUAL 0)
-      execute_process(
-        COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/revert_3407.patch
-        WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
-        RESULT_VARIABLE PATCH_RESULT
-      )
-    else()
-      execute_process( # Check if the patch is already applied
-        COMMAND git apply --reverse --check ${CMAKE_CURRENT_LIST_DIR}/revert_3407.patch
-        WORKING_DIRECTORY ${spirv-llvm-translator_SOURCE_DIR}
-        RESULT_VARIABLE PATCH_RESULT
-      )
-    endif()
-    if(NOT PATCH_RESULT EQUAL 0)
-      message(FATAL_ERROR "Failed to apply revert_3407.patch to SPIRV-LLVM-Translator")
-    endif()
   endif()
 
   set(SPIRVToLLVMTranslator_INCLUDE_DIR "${SPIRVToLLVMTranslator_SOURCE_DIR}/include"
diff --git a/third_party/intel/cmake/revert_3385.patch b/third_party/intel/cmake/revert_3385.patch
deleted file mode 100644
index b29bb82531..0000000000
--- a/third_party/intel/cmake/revert_3385.patch
+++ /dev/null
@@ -1,15 +0,0 @@
-diff --git a/lib/SPIRV/LLVMToSPIRVDbgTran.cpp b/lib/SPIRV/LLVMToSPIRVDbgTran.cpp
-index d0648000..e6cc346d 100644
---- a/lib/SPIRV/LLVMToSPIRVDbgTran.cpp
-+++ b/lib/SPIRV/LLVMToSPIRVDbgTran.cpp
-@@ -587,8 +587,8 @@ SPIRVEntry *LLVMToSPIRVDbgTran::transDbgCompileUnit(const DICompileUnit *CU) {
-   if (isNonSemanticDebugInfo())
-     generateBuildIdentifierAndStoragePath(CU);
- 
--  auto DwarfLang = static_cast(
--      CU->getSourceLanguage().getUnversionedName());
-+  auto DwarfLang =
-+      static_cast(CU->getSourceLanguage());
-   Ops[LanguageIdx] =
-       BM->getDebugInfoEIS() == SPIRVEIS_NonSemantic_Shader_DebugInfo_200
-           ? convertDWARFSourceLangToSPIRVNonSemanticDbgInfo(DwarfLang)
diff --git a/third_party/intel/cmake/revert_3406.patch b/third_party/intel/cmake/revert_3406.patch
deleted file mode 100644
index ea05a000ad..0000000000
--- a/third_party/intel/cmake/revert_3406.patch
+++ /dev/null
@@ -1,69 +0,0 @@
-diff --git a/lib/SPIRV/SPIRVWriter.cpp b/lib/SPIRV/SPIRVWriter.cpp
-index f0c61024..b017e663 100644
---- a/lib/SPIRV/SPIRVWriter.cpp
-+++ b/lib/SPIRV/SPIRVWriter.cpp
-@@ -5120,9 +5120,10 @@ SPIRVValue *LLVMToSPIRVBase::transIntrinsicInst(IntrinsicInst *II,
-   }
-   SPIRVType *Ty = transScavengedType(II);
-   auto *PtrVector = transValue(II->getArgOperand(0), BB);
--  uint32_t Alignment = II->getParamAlign(0).valueOrOne().value();
--  auto *Mask = transValue(II->getArgOperand(1), BB);
--  auto *FillEmpty = transValue(II->getArgOperand(2), BB);
-+  uint32_t Alignment =
-+      cast<ConstantInt>(II->getArgOperand(1))->getZExtValue();
-+  auto *Mask = transValue(II->getArgOperand(2), BB);
-+  auto *FillEmpty = transValue(II->getArgOperand(3), BB);
-   std::vector<SPIRVWord> Ops = {PtrVector->getId(), Alignment, Mask->getId(),
-                                 FillEmpty->getId()};
-   return BM->addInstTemplate(internal::OpMaskedGatherINTEL, Ops, BB, Ty);
-@@ -5139,8 +5140,9 @@ SPIRVValue *LLVMToSPIRVBase::transIntrinsicInst(IntrinsicInst *II,
-   }
-   auto *InputVector = transValue(II->getArgOperand(0), BB);
-   auto *PtrVector = transValue(II->getArgOperand(1), BB);
--  uint32_t Alignment = II->getParamAlign(1).valueOrOne().value();
--  auto *Mask = transValue(II->getArgOperand(2), BB);
-+  uint32_t Alignment =
-+      cast<ConstantInt>(II->getArgOperand(2))->getZExtValue();
-+  auto *Mask = transValue(II->getArgOperand(3), BB);
-   std::vector<SPIRVWord> Ops = {InputVector->getId(), PtrVector->getId(),
-                                 Alignment, Mask->getId()};
-   return BM->addInstTemplate(internal::OpMaskedScatterINTEL, Ops, BB,
-diff --git a/test/extensions/INTEL/SPV_INTEL_masked_gather_scatter/intel-gather-scatter.ll b/test/extensions/INTEL/SPV_INTEL_masked_gather_scatter/intel-gather-scatter.ll
-index 02e6c961..2db4f044 100644
---- a/test/extensions/INTEL/SPV_INTEL_masked_gather_scatter/intel-gather-scatter.ll
-+++ b/test/extensions/INTEL/SPV_INTEL_masked_gather_scatter/intel-gather-scatter.ll
-@@ -42,11 +42,11 @@
- 
- ; CHECK-LLVM: %[[#VECGATHER:]] = load <4 x ptr addrspace(4)>, ptr
- ; CHECK-LLVM: %[[#VECSCATTER:]] = load <4 x ptr addrspace(4)>, ptr
--; CHECK-LLVM: %[[GATHER:[a-z0-9]+]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p4(<4 x ptr addrspace(4)> align 4 %[[#VECGATHER]], <4 x i1> , <4 x i32> )
--; CHECK-LLVM: call void @llvm.masked.scatter.v4i32.v4p4(<4 x i32> %[[GATHER]], <4 x ptr addrspace(4)> align 4 %[[#VECSCATTER]], <4 x i1> splat (i1 true))
-+; CHECK-LLVM: %[[GATHER:[a-z0-9]+]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p4(<4 x ptr addrspace(4)> %[[#VECGATHER]], i32 4, <4 x i1> , <4 x i32> )
-+; CHECK-LLVM: call void @llvm.masked.scatter.v4i32.v4p4(<4 x i32> %[[GATHER]], <4 x ptr addrspace(4)> %[[#VECSCATTER]], i32 4, <4 x i1> splat (i1 true))
- 
--; CHECK-LLVM-DAG: declare <4 x i32> @llvm.masked.gather.v4i32.v4p4(<4 x ptr addrspace(4)>, <4 x i1>, <4 x i32>)
--; CHECK-LLVM-DAG: declare void @llvm.masked.scatter.v4i32.v4p4(<4 x i32>, <4 x ptr addrspace(4)>, <4 x i1>)
-+; CHECK-LLVM-DAG: declare <4 x i32> @llvm.masked.gather.v4i32.v4p4(<4 x ptr addrspace(4)>, i32 immarg, <4 x i1>, <4 x i32>)
-+; CHECK-LLVM-DAG: declare void @llvm.masked.scatter.v4i32.v4p4(<4 x i32>, <4 x ptr addrspace(4)>, i32 immarg, <4 x i1>)
- 
- target datalayout = "e-p:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
- target triple = "spir"
-@@ -58,14 +58,14 @@ entry:
-   %arg1 = alloca <4 x ptr addrspace(4)>
-   %0 = load <4 x ptr addrspace(4)>, ptr %arg0
-   %1 = load <4 x ptr addrspace(4)>, ptr %arg1
--  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p4(<4 x ptr addrspace(4)> align 4 %0, <4 x i1> , <4 x i32> )
--  call void @llvm.masked.scatter.v4i32.v4p4(<4 x i32> %res, <4 x ptr addrspace(4)> align 4 %1, <4 x i1> splat (i1 true))
-+  %res = call <4 x i32> @llvm.masked.gather.v4i32.v4p4(<4 x ptr addrspace(4)> %0, i32 4, <4 x i1> , <4 x i32> )
-+  call void @llvm.masked.scatter.v4i32.v4p4(<4 x i32> %res, <4 x ptr addrspace(4)> %1, i32 4, <4 x i1> splat (i1 true))
-   ret void
- }
- 
--declare <4 x i32> @llvm.masked.gather.v4i32.v4p4(<4 x ptr addrspace(4)>, <4 x i1>, <4 x i32>)
-+declare <4 x i32> @llvm.masked.gather.v4i32.v4p4(<4 x ptr addrspace(4)>, i32, <4 x i1>, <4 x i32>)
- 
--declare void @llvm.masked.scatter.v4i32.v4p4(<4 x i32>, <4 x ptr addrspace(4)>, <4 x i1>)
-+declare void @llvm.masked.scatter.v4i32.v4p4(<4 x i32>, <4 x ptr addrspace(4)>, i32, <4 x i1>)
- 
- !llvm.module.flags = !{!0}
- !opencl.spir.version = !{!1}
diff --git a/third_party/intel/cmake/revert_3407.patch b/third_party/intel/cmake/revert_3407.patch
deleted file mode 100644
index a3e4e94306..0000000000
--- a/third_party/intel/cmake/revert_3407.patch
+++ /dev/null
@@ -1,66 +0,0 @@
-diff --git a/lib/SPIRV/OCLToSPIRV.cpp b/lib/SPIRV/OCLToSPIRV.cpp
-index 05a60ec4..2866141c 100644
---- a/lib/SPIRV/OCLToSPIRV.cpp
-+++ b/lib/SPIRV/OCLToSPIRV.cpp
-@@ -152,8 +152,8 @@ getAtomicPointerMemorySemanticsMemoryMask(const Value *Ptr,
- 
- static size_t getOCLCpp11AtomicMaxNumOps(StringRef Name) {
-   return StringSwitch<size_t>(Name)
--      .Cases({"load", "flag_test_and_set", "flag_clear"}, 3)
--      .Cases({"store", "exchange"}, 4)
-+      .Cases("load", "flag_test_and_set", "flag_clear", 3)
-+      .Cases("store", "exchange", 4)
-       .StartsWith("compare_exchange", 6)
-       .StartsWith("fetch", 4)
-       .Default(0);
-diff --git a/lib/SPIRV/SPIRVUtil.cpp b/lib/SPIRV/SPIRVUtil.cpp
-index b747e423..14fc6d05 100644
---- a/lib/SPIRV/SPIRVUtil.cpp
-+++ b/lib/SPIRV/SPIRVUtil.cpp
-@@ -620,11 +620,11 @@ static std::string demangleBuiltinOpenCLTypeName(StringRef MangledStructName) {
- /// floating point type.
- static Type *parsePrimitiveType(LLVMContext &Ctx, StringRef Name) {
-   return StringSwitch<Type *>(Name)
--      .Cases({"char", "signed char", "unsigned char"}, Type::getInt8Ty(Ctx))
--      .Cases({"short", "unsigned short"}, Type::getInt16Ty(Ctx))
--      .Cases({"int", "unsigned int"}, Type::getInt32Ty(Ctx))
--      .Cases({"long", "unsigned long"}, Type::getInt64Ty(Ctx))
--      .Cases({"long long", "unsigned long long"}, Type::getInt64Ty(Ctx))
-+      .Cases("char", "signed char", "unsigned char", Type::getInt8Ty(Ctx))
-+      .Cases("short", "unsigned short", Type::getInt16Ty(Ctx))
-+      .Cases("int", "unsigned int", Type::getInt32Ty(Ctx))
-+      .Cases("long", "unsigned long", Type::getInt64Ty(Ctx))
-+      .Cases("long long", "unsigned long long", Type::getInt64Ty(Ctx))
-       .Case("half", Type::getHalfTy(Ctx))
-       .Case("float", Type::getFloatTy(Ctx))
-       .Case("double", Type::getDoubleTy(Ctx))
-diff --git a/lib/SPIRV/SPIRVWriter.cpp b/lib/SPIRV/SPIRVWriter.cpp
-index b017e663..3fd8f21b 100644
---- a/lib/SPIRV/SPIRVWriter.cpp
-+++ b/lib/SPIRV/SPIRVWriter.cpp
-@@ -5397,16 +5397,16 @@ LLVMToSPIRVBase::getFPBuiltinType(IntrinsicInst *II, StringRef &OpName) {
-   OpName = Name.split('.').first;
-   FPBuiltinType Type =
-       StringSwitch<FPBuiltinType>(OpName)
--          .Cases({"fadd", "fsub", "fmul", "fdiv", "frem"},
-+          .Cases("fadd", "fsub", "fmul", "fdiv", "frem",
-                  FPBuiltinType::REGULAR_MATH)
--          .Cases({"sin", "cos", "tan"}, FPBuiltinType::EXT_1OPS)
--          .Cases({"sinh", "cosh", "tanh"}, FPBuiltinType::EXT_1OPS)
--          .Cases({"asin", "acos", "atan"}, FPBuiltinType::EXT_1OPS)
--          .Cases({"asinh", "acosh", "atanh"}, FPBuiltinType::EXT_1OPS)
--          .Cases({"exp", "exp2", "exp10", "expm1"}, FPBuiltinType::EXT_1OPS)
--          .Cases({"log", "log2", "log10", "log1p"}, FPBuiltinType::EXT_1OPS)
--          .Cases({"sqrt", "rsqrt", "erf", "erfc"}, FPBuiltinType::EXT_1OPS)
--          .Cases({"atan2", "pow", "hypot", "ldexp"}, FPBuiltinType::EXT_2OPS)
-+          .Cases("sin", "cos", "tan", FPBuiltinType::EXT_1OPS)
-+          .Cases("sinh", "cosh", "tanh", FPBuiltinType::EXT_1OPS)
-+          .Cases("asin", "acos", "atan", FPBuiltinType::EXT_1OPS)
-+          .Cases("asinh", "acosh", "atanh", FPBuiltinType::EXT_1OPS)
-+          .Cases("exp", "exp2", "exp10", "expm1", FPBuiltinType::EXT_1OPS)
-+          .Cases("log", "log2", "log10", "log1p", FPBuiltinType::EXT_1OPS)
-+          .Cases("sqrt", "rsqrt", "erf", "erfc", FPBuiltinType::EXT_1OPS)
-+          .Cases("atan2", "pow", "hypot", "ldexp", FPBuiltinType::EXT_2OPS)
-           .Case("sincos", FPBuiltinType::EXT_3OPS)
-           .Default(FPBuiltinType::UNKNOWN);
-   return Type;
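// The three revert_*.patch files deleted above temporarily undid upstream
// SPIRV-LLVM-Translator changes until the pinned LLVM commit caught up; with
// the new llvm-hash they are no longer needed. One of the reverted changes is
// llvm::StringSwitch moving from variadic Cases(...) overloads to an
// initializer-list form. A minimal sketch of the newer form (assuming
// llvm/ADT/StringSwitch.h and a StringRef `name`):
//
//   size_t maxNumOps =
//       llvm::StringSwitch<size_t>(name)
//           .Cases({"load", "flag_test_and_set", "flag_clear"}, 3)
//           .Cases({"store", "exchange"}, 4)
//           .Default(0);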
diff --git a/third_party/intel/lib/Dialect/Triton/Transforms/FuseReshape.cpp b/third_party/intel/lib/Dialect/Triton/Transforms/FuseReshape.cpp
index 7d96f86b25..c5f147a19b 100644
--- a/third_party/intel/lib/Dialect/Triton/Transforms/FuseReshape.cpp
+++ b/third_party/intel/lib/Dialect/Triton/Transforms/FuseReshape.cpp
@@ -144,22 +144,22 @@ class FuseReshapeWithLoad : public tt::intel::Fuser {
     unsigned newInnermostDimIdx = (innermostDimIdx - 1);
     unsigned newOutermostDimIdx = !newInnermostDimIdx;
 
-    auto div = builder.create<arith::DivUIOp>(loc, strides[0],
-                                              newStrides[newOutermostDimIdx]);
+    auto div = arith::DivUIOp::create(builder, loc, strides[0],
+                                      newStrides[newOutermostDimIdx]);
 
-    newShape[newOutermostDimIdx] = builder.create<arith::AddIOp>(
-        loc, builder.create<arith::MulIOp>(loc, shapes[0], div),
+    newShape[newOutermostDimIdx] = arith::AddIOp::create(
+        builder, loc, arith::MulIOp::create(builder, loc, shapes[0], div),
         newShape[newOutermostDimIdx]);
 
-    newOffsets[newOutermostDimIdx] = builder.create<arith::AddIOp>(
-        loc,
-        builder.create<arith::MulIOp>(
-            loc, offsets[0],
-            builder.create<arith::TruncIOp>(loc, offsets[0].getType(), div)),
+    newOffsets[newOutermostDimIdx] = arith::AddIOp::create(
+        builder, loc,
+        arith::MulIOp::create(
+            builder, loc, offsets[0],
+            arith::TruncIOp::create(builder, loc, offsets[0].getType(), div)),
         newOffsets[newOutermostDimIdx]);
 
-    Value ptr = builder.create<tt::MakeTensorPtrOp>(
-        loc, newPtrType, makeTensorPtrOp.getBase(), newShape, newStrides,
-        newOffsets,
+    Value ptr = tt::MakeTensorPtrOp::create(
+        builder, loc, newPtrType, makeTensorPtrOp.getBase(), newShape,
+        newStrides, newOffsets,
         DenseI32ArrayAttr::get(
             builder.getContext(),
             makeTensorPtrOp.getOrderAttr().asArrayRef().drop_front()));
@@ -306,8 +306,8 @@ class FuseReshapeWithLoad : public tt::intel::Fuser {
     if (auto advanceOp = dyn_cast<tt::AdvanceOp>(user)) {
       OpBuilder rewriter(advanceOp);
       SmallVector<Value> newOffsets(advanceOp.getOffsets().drop_front());
-      auto newAdvanceOp = rewriter.create<tt::AdvanceOp>(loc, newVal.getType(),
-                                                         newVal, newOffsets);
+      auto newAdvanceOp = tt::AdvanceOp::create(rewriter, loc, newVal.getType(),
+                                                newVal, newOffsets);
      mapping.map(static_cast<Operation *>(advanceOp),
                  static_cast<Operation *>(newAdvanceOp));
      LLVM_DEBUG(llvm::dbgs().indent(2)
@@ -319,10 +319,11 @@ class FuseReshapeWithLoad : public tt::intel::Fuser {
 
     if (auto loadOp = dyn_cast<tt::LoadOp>(user)) {
       OpBuilder rewriter(loadOp);
-      auto newLoadOp = rewriter.create<tt::LoadOp>(
-          loadOp.getLoc(), newVal, loadOp.getMask(), loadOp.getOther(),
-          loadOp.getBoundaryCheckAttr(), loadOp.getPaddingAttr(),
-          loadOp.getCache(), loadOp.getEvict(), loadOp.getIsVolatile());
+      auto newLoadOp = tt::LoadOp::create(
+          rewriter, loadOp.getLoc(), newVal, loadOp.getMask(),
+          loadOp.getOther(), loadOp.getBoundaryCheckAttr(),
+          loadOp.getPaddingAttr(), loadOp.getCache(), loadOp.getEvict(),
+          loadOp.getIsVolatile());
       newLoadOp->setAttrs(loadOp->getAttrs());
       mapping.map(static_cast<Operation *>(loadOp),
                   static_cast<Operation *>(newLoadOp));
diff --git a/third_party/intel/lib/Dialect/Triton/Transforms/RemoveMasks.cpp b/third_party/intel/lib/Dialect/Triton/Transforms/RemoveMasks.cpp
index 2e7fdd434a..662c2e5a19 100644
--- a/third_party/intel/lib/Dialect/Triton/Transforms/RemoveMasks.cpp
+++ b/third_party/intel/lib/Dialect/Triton/Transforms/RemoveMasks.cpp
@@ -34,14 +34,14 @@ static Operation *dropMask(Operation *op, bool maskVal) {
   TypeSwitch<Operation *>(op)
       .Case<tt::LoadOp>([&](auto loadOp) {
         if (maskVal) {
-          auto newLoadOp = builder.create<tt::LoadOp>(
-              loc, loadOp.getPtr(), loadOp.getBoundaryCheck(),
+          auto newLoadOp = tt::LoadOp::create(
+              builder, loc, loadOp.getPtr(), loadOp.getBoundaryCheck(),
               loadOp.getPadding(), loadOp.getCache(), loadOp.getEvict(),
               loadOp.getIsVolatile());
           loadOp->replaceAllUsesWith(newLoadOp);
         } else {
           Operation *cstOp =
-              builder.create<arith::ConstantOp>(loc, loadOp.getOther());
+              arith::ConstantOp::create(builder, loc, loadOp.getOther());
           loadOp->replaceAllUsesWith(cstOp);
         }
       })
@@ -270,8 +270,8 @@ class CanonicalMaskValidator final : public MaskValidatorBase {
           cast<arith::ConstantIntOp>(maskInfo.N.getDefiningOp()).value();
       unsigned END = maskInfo.END;
       bool cond = UB == ((N - END) / END) + 1;
-      return builder.create<arith::ConstantIntOp>(forOp.getLoc(),
-                                                  builder.getI1Type(), cond);
+      return arith::ConstantIntOp::create(builder, forOp.getLoc(),
+                                          builder.getI1Type(), cond);
     }
 
     auto divOp = cast<arith::DivSIOp>(defOp);
@@ -282,12 +282,12 @@ class CanonicalMaskValidator final : public MaskValidatorBase {
 
     Value zero = tt::intel::findOrCreateIntConstant(
         loc, 0, lhs.getType().getIntOrFloatBitWidth(), builder);
-    Value cmp1 = builder.create<arith::CmpIOp>(
-        loc, arith::CmpIPredicate::eq,
-        builder.create<arith::RemSIOp>(loc, lhs, rhs), zero);
-    Value cmp2 =
-        builder.create<arith::CmpIOp>(loc, arith::CmpIPredicate::sgt, lhs, rhs);
-    return builder.create<arith::AndIOp>(loc, cmp1, cmp2);
+    Value cmp1 = arith::CmpIOp::create(
+        builder, loc, arith::CmpIPredicate::eq,
+        arith::RemSIOp::create(builder, loc, lhs, rhs), zero);
+    Value cmp2 = arith::CmpIOp::create(builder, loc, arith::CmpIPredicate::sgt,
+                                       lhs, rhs);
+    return arith::AndIOp::create(builder, loc, cmp1, cmp2);
   }
 
   virtual std::string getName() const { return "CanonicalMaskValidator"; }
@@ -548,8 +548,8 @@ class LoopVersioner {
     // Create the versioning branch.
     OpBuilder builder(forOp);
     Location loc = forOp.getLoc();
-    auto ifOp = builder.create<scf::IfOp>(loc, getUsedResults(forOp), verCond,
-                                          /*withThenRegion=*/true);
+    auto ifOp = scf::IfOp::create(builder, loc, getUsedResults(forOp), verCond,
+                                  /*withThenRegion=*/true);
 
     // Clone the original loop into the 2 if branches.
     IRMapping map;
@@ -571,16 +571,16 @@ class LoopVersioner {
     };
 
     // Create the yield operations for the two if branches.
-    thenB.create<scf::YieldOp>(loc, pruneUnusedResults(forOp, thenForLoop));
-    elseB.create<scf::YieldOp>(loc, pruneUnusedResults(forOp, elseForLoop));
+    scf::YieldOp::create(thenB, loc, pruneUnusedResults(forOp, thenForLoop));
+    scf::YieldOp::create(elseB, loc, pruneUnusedResults(forOp, elseForLoop));
 
     // Drop the mask from candidate masked operations in the "then" region.
     for (Operation *maskedOp : collector.getMaskedOps()) {
       Operation *mappedOp = map.lookup(maskedOp);
       if (auto loadOp = dyn_cast<tt::LoadOp>(mappedOp)) {
         OpBuilder builder(mappedOp);
-        auto newLoad = builder.create<tt::LoadOp>(
-            loadOp.getLoc(), loadOp.getPtr(), loadOp.getCache(),
+        auto newLoad = tt::LoadOp::create(
+            builder, loadOp.getLoc(), loadOp.getPtr(), loadOp.getCache(),
             loadOp.getEvict(), loadOp.getIsVolatile());
         mappedOp->replaceAllUsesWith(newLoad);
         mappedOp->erase();
@@ -623,11 +623,11 @@ class LoopVersioner {
     for (; it != maskConds.end(); ++it) {
       Value nextCond = (*it)->getResult(0);
       Value cond = maskValidator.getVersioningCond(forOp, nextCond);
-      verCond = builder.create<arith::AndIOp>(loc, verCond, cond);
+      verCond = arith::AndIOp::create(builder, loc, verCond, cond);
     }
 
-    auto ifOp = builder.create<scf::IfOp>(loc, forOp.getResultTypes(), verCond,
-                                          /*withThenRegion=*/true);
+    auto ifOp = scf::IfOp::create(builder, loc, forOp.getResultTypes(), verCond,
+                                  /*withThenRegion=*/true);
 
     // Clone the original loop into the 2 if branches.
     IRMapping map;
@@ -638,8 +638,8 @@ class LoopVersioner {
 
     // Create the yield operations for the two if branches.
     if (!thenForLoop->getResults().empty()) {
-      thenB.create<scf::YieldOp>(loc, thenForLoop->getResults());
-      elseB.create<scf::YieldOp>(loc, elseForLoop->getResults());
+      scf::YieldOp::create(thenB, loc, thenForLoop->getResults());
+      scf::YieldOp::create(elseB, loc, elseForLoop->getResults());
     }
 
     // Drop the mask from candidate masked operations in the "then" region's
@@ -648,8 +648,8 @@ class LoopVersioner {
       Operation *mappedOp = map.lookup(maskedOp);
       if (auto loadOp = dyn_cast<tt::LoadOp>(mappedOp)) {
         OpBuilder builder(mappedOp);
-        auto newLoad = builder.create<tt::LoadOp>(
-            loadOp.getLoc(), loadOp.getPtr(), loadOp.getCache(),
+        auto newLoad = tt::LoadOp::create(
+            builder, loadOp.getLoc(), loadOp.getPtr(), loadOp.getCache(),
             loadOp.getEvict(), loadOp.getIsVolatile());
         mappedOp->replaceAllUsesWith(newLoad);
         mappedOp->erase();
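// The mask-removal versioning above and the stride versioning below share one
// shape: build an i1 condition, wrap the loop in an scf.if, clone the loop
// into each branch, and yield the clones' results. A condensed sketch of that
// shape, mirroring the code in this patch (`forOp` and `verCond` are assumed
// to exist):
//
//   OpBuilder builder(forOp);
//   Location loc = forOp.getLoc();
//   auto ifOp = scf::IfOp::create(builder, loc, forOp.getResultTypes(),
//                                 verCond, /*withThenRegion=*/true);
//   IRMapping map;
//   OpBuilder thenB = ifOp.getThenBodyBuilder();
//   Operation *thenForLoop = thenB.clone(*forOp.getOperation(), map);
//   scf::YieldOp::create(thenB, loc, thenForLoop->getResults());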
diff --git a/third_party/intel/lib/Dialect/Triton/Transforms/StrideVersioning.cpp b/third_party/intel/lib/Dialect/Triton/Transforms/StrideVersioning.cpp
index 0391ed9426..4693c64f98 100644
--- a/third_party/intel/lib/Dialect/Triton/Transforms/StrideVersioning.cpp
+++ b/third_party/intel/lib/Dialect/Triton/Transforms/StrideVersioning.cpp
@@ -128,19 +128,20 @@ class LoopVersioner {
     SmallVector<Value> versioningConds;
     for (Operation *makeTensorPtrOp : makeTensorPtrOps) {
       Value stride = makeTensorPtrToStride[makeTensorPtrOp];
-      versioningConds.emplace_back(builder.create<arith::CmpIOp>(
-          loc, arith::CmpIPredicate::eq, stride, oneVal));
+      versioningConds.emplace_back(arith::CmpIOp::create(
+          builder, loc, arith::CmpIPredicate::eq, stride, oneVal));
     }
     assert(!versioningConds.empty() &&
            "Expecting at least one versioning condition");
     Value verCond = versioningConds.front();
     for (unsigned i = 1; i < versioningConds.size(); ++i)
-      verCond = builder.create<arith::AndIOp>(loc, verCond, versioningConds[i]);
+      verCond =
+          arith::AndIOp::create(builder, loc, verCond, versioningConds[i]);
 
     // Version the loop.
-    auto ifOp = builder.create<scf::IfOp>(loc, forOp.getResultTypes(), verCond,
-                                          /*withThenRegion=*/true);
+    auto ifOp = scf::IfOp::create(builder, loc, forOp.getResultTypes(), verCond,
+                                  /*withThenRegion=*/true);
     IRMapping map;
     OpBuilder thenB = ifOp.getThenBodyBuilder();
     Operation *thenForLoop = thenB.clone(*forOp.getOperation(), map);
@@ -149,8 +150,8 @@ class LoopVersioner {
 
     // Create the yield operations for the two if branches.
     if (!thenForLoop->getResults().empty()) {
-      thenB.create<scf::YieldOp>(loc, thenForLoop->getResults());
-      elseB.create<scf::YieldOp>(loc, elseForLoop->getResults());
+      scf::YieldOp::create(thenB, loc, thenForLoop->getResults());
+      scf::YieldOp::create(elseB, loc, elseForLoop->getResults());
     }
 
     // Now that the loop has been versioned, replace the uses of the original
diff --git a/third_party/intel/lib/Dialect/Triton/Transforms/TensorDescToBlockPointer.cpp b/third_party/intel/lib/Dialect/Triton/Transforms/TensorDescToBlockPointer.cpp
index 37c2d6223e..b44c59e73f 100644
--- a/third_party/intel/lib/Dialect/Triton/Transforms/TensorDescToBlockPointer.cpp
+++ b/third_party/intel/lib/Dialect/Triton/Transforms/TensorDescToBlockPointer.cpp
@@ -134,8 +134,8 @@ struct TritonIntelTensorDescToBlockPointer
   });
 
   auto makeTensorPtrOp = [&]() {
-    auto makeTensorPtr = builder.create<tt::MakeTensorPtrOp>(
-        loc, base, shape, strides, offsets, sizes,
+    auto makeTensorPtr = tt::MakeTensorPtrOp::create(
+        builder, loc, base, shape, strides, offsets, sizes,
         builder.getDenseI32ArrayAttr({1, 0}));
     return makeTensorPtr;
   };
@@ -249,7 +249,7 @@ struct TritonIntelTensorDescToBlockPointer
   auto ptrType = cast<tt::PointerType>(operand.getType());
   auto tensorType = cast<RankedTensorType>(ptrType.getPointeeType());
   Value ptr =
-      builder.create<tt::AdvanceOp>(loc, ptrType, operand, op.getIndices());
+      tt::AdvanceOp::create(builder, loc, ptrType, operand, op.getIndices());
 
   SmallVector<int32_t> boundaryCheck;
   for (size_t i = 0; i < tensorType.getRank(); ++i)
diff --git a/third_party/intel/lib/GPUToTritonGEN/OpToFuncCallLowering.h b/third_party/intel/lib/GPUToTritonGEN/OpToFuncCallLowering.h
index 90bdbf0b7c..f6ad3339b7 100644
--- a/third_party/intel/lib/GPUToTritonGEN/OpToFuncCallLowering.h
+++ b/third_party/intel/lib/GPUToTritonGEN/OpToFuncCallLowering.h
@@ -72,8 +72,8 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
       return success();
     }
 
-    Value truncated = rewriter.create<LLVM::FPTruncOp>(
-        op->getLoc(), adaptor.getOperands().front().getType(),
+    Value truncated = LLVM::FPTruncOp::create(
+        rewriter, op->getLoc(), adaptor.getOperands().front().getType(),
         callOp.getResult());
     rewriter.replaceOp(op, {truncated});
     return success();
@@ -85,8 +85,9 @@ struct OpToFuncCallLowering : public ConvertOpToLLVMPattern<SourceOp> {
     if (!isa(type))
       return operand;
 
-    return rewriter.create<LLVM::FPExtOp>(
-        operand.getLoc(), Float32Type::get(rewriter.getContext()), operand);
+    return LLVM::FPExtOp::create(rewriter, operand.getLoc(),
+                                 Float32Type::get(rewriter.getContext()),
+                                 operand);
   }
 
   StringRef getFunctionName(Type type) const {
diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp
index 4a52a1fd35..985a20a086 100644
--- a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp
+++ b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp
@@ -32,7 +32,6 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/TypeSwitch.h"
-#include "llvm/ADT/identity.h"
 #include "llvm/Support/ErrorHandling.h"
 
 #include "triton/Conversion/TritonGPUToLLVM/Utility.h"
@@ -332,7 +331,7 @@ createGenISA2DBlockRead(TritonGEN::Matrix2DBlockLoadOp op,
   auto [ptr, baseWidth, x] = computeAlignedBasePtrWidthAndOffset(op, rewriter);
 
   // The IGC intrinsic requires the first argument be int64
-  ptr = rewriter.create<LLVM::PtrToIntOp>(loc, int64Ty, ptr);
+  ptr = LLVM::PtrToIntOp::create(rewriter, loc, int64Ty, ptr);
 
   SmallVector<Type> argTypes{int64Ty, baseWidth.getType(),
@@ -439,7 +438,7 @@ createGenISA2DBlockWrite(TritonGEN::Matrix2DBlockStoreOp op,
   auto [ptr, baseWidth, x] = computeAlignedBasePtrWidthAndOffset(op, rewriter);
 
   // The IGC intrinsic requires the first argument be int64
-  ptr = rewriter.create<LLVM::PtrToIntOp>(loc, int_ty(64), ptr);
+  ptr = LLVM::PtrToIntOp::create(rewriter, loc, int_ty(64), ptr);
 
   SmallVector<Type> argTypes{
       int_ty(64), baseWidth.getType(), baseHeight.getType(),
@@ -482,7 +481,7 @@ createGenISA2DBlockPrefetch(TritonGEN::Matrix2DBlockPrefetchOp op,
   auto [ptr, baseWidth, x] = computeAlignedBasePtrWidthAndOffset(op, rewriter);
 
   // The IGC intrinsic requires the first argument be int64
-  ptr = rewriter.create<LLVM::PtrToIntOp>(loc, int_ty(64), ptr);
+  ptr = LLVM::PtrToIntOp::create(rewriter, loc, int_ty(64), ptr);
 
   SmallVector<Type> argTypes{
       int_ty(64), baseWidth.getType(), baseHeight.getType(),
@@ -543,10 +542,10 @@ createAssertNot(ConversionPatternRewriter &rewriter,
   Block *ifBlock = rewriter.createBlock(prevBlock->getParent());
   rewriter.setInsertionPointToStart(ifBlock);
   emitter.assertFail(rewriter, loc, message, file, func, line);
-  rewriter.create<LLVM::BrOp>(loc, thenBlock);
+  LLVM::BrOp::create(rewriter, loc, thenBlock);
 
   rewriter.setInsertionPointToEnd(prevBlock);
-  rewriter.create<LLVM::CondBrOp>(loc, condition, ifBlock, thenBlock);
+  LLVM::CondBrOp::create(rewriter, loc, condition, ifBlock, thenBlock);
 
   rewriter.setInsertionPointToStart(thenBlock);
 }
@@ -570,51 +569,53 @@ static void create2DBlockAssertsImpl(
   Value cMaxAlign = b.i32_val(std::max(4u, elemSize));
 
   Value wTooLarge =
-      rewriter.create<ICmpOp>(loc, ICmpPredicate::ugt, baseWidth, c24m1);
+      ICmpOp::create(rewriter, loc, ICmpPredicate::ugt, baseWidth, c24m1);
   createAssertNot(rewriter, emitter, wTooLarge,
                   "2nd operand (base width) should be <= 24 bits");
 
   Value wTooSmall =
-      rewriter.create<ICmpOp>(loc, ICmpPredicate::ult, baseWidth, c64);
+      ICmpOp::create(rewriter, loc, ICmpPredicate::ult, baseWidth, c64);
   createAssertNot(rewriter, emitter, wTooSmall,
                   "2nd operand (base width) should be >= 64");
 
-  Value wRem = rewriter.create<URemOp>(loc, baseWidth, cMaxAlign);
-  Value wNotAligned = rewriter.create<ICmpOp>(loc, ICmpPredicate::ne, wRem, c0);
+  Value wRem = URemOp::create(rewriter, loc, baseWidth, cMaxAlign);
+  Value wNotAligned =
+      ICmpOp::create(rewriter, loc, ICmpPredicate::ne, wRem, c0);
   createAssertNot(
       rewriter, emitter, wNotAligned,
       "2nd operand (base width) should be aligned to MAX(4, element_size)");
 
   Value hTooLarge =
-      rewriter.create<ICmpOp>(loc, ICmpPredicate::ugt, baseHeight, c24m1);
+      ICmpOp::create(rewriter, loc, ICmpPredicate::ugt, baseHeight, c24m1);
   createAssertNot(rewriter, emitter, hTooLarge,
                   "3rd operand (base height) should be <= 24 bits");
 
   Value pTooLarge =
-      rewriter.create<ICmpOp>(loc, ICmpPredicate::ugt, basePitch, c24m1);
+      ICmpOp::create(rewriter, loc, ICmpPredicate::ugt, basePitch, c24m1);
   createAssertNot(rewriter, emitter, pTooLarge,
                   "4th operand (base pitch) should be <= 24 bits");
 
   Value pTooSmall =
-      rewriter.create<ICmpOp>(loc, ICmpPredicate::ult, basePitch, c64);
+      ICmpOp::create(rewriter, loc, ICmpPredicate::ult, basePitch, c64);
   createAssertNot(rewriter, emitter, pTooSmall,
                   "4th operand (base pitch) should be >= 64");
 
-  Value pRem = rewriter.create<URemOp>(loc, basePitch, c16);
-  Value pNotAligned = rewriter.create<ICmpOp>(loc, ICmpPredicate::ne, pRem, c0);
+  Value pRem = URemOp::create(rewriter, loc, basePitch, c16);
+  Value pNotAligned =
+      ICmpOp::create(rewriter, loc, ICmpPredicate::ne, pRem, c0);
   createAssertNot(rewriter, emitter, pNotAligned,
                   "4th operand (base pitch) should be a multiple of 16 bytes");
 
   Value pLessThanWidth =
-      rewriter.create<ICmpOp>(loc, ICmpPredicate::ult, basePitch, baseWidth);
+      ICmpOp::create(rewriter, loc, ICmpPredicate::ult, basePitch, baseWidth);
   createAssertNot(
       rewriter, emitter, pLessThanWidth,
       "4th operand (base pitch) should be >= 2nd operand (base width)");
 
-  Value offsetBytes = rewriter.create<MulOp>(loc, x, cElemSize);
-  Value offsetRem = rewriter.create<URemOp>(loc, offsetBytes, c4);
+  Value offsetBytes = MulOp::create(rewriter, loc, x, cElemSize);
+  Value offsetRem = URemOp::create(rewriter, loc, offsetBytes, c4);
   Value badOffset =
-      rewriter.create<ICmpOp>(loc, ICmpPredicate::ne, offsetRem, c0);
+      ICmpOp::create(rewriter, loc, ICmpPredicate::ne, offsetRem, c0);
   createAssertNot(
       rewriter, emitter, badOffset,
       "5th operand (x) should be properly aligned for the element size");
@@ -669,7 +670,7 @@ struct TritonMatrixDPASLowering
     VectorType aTy = VectorType::get(
         bitWidth / packedAType.getIntOrFloatBitWidth(), packedAType);
     if (aOrigTy != aTy)
-      a = rewriter.create<LLVM::BitcastOp>(loc, aTy, a);
+      a = LLVM::BitcastOp::create(rewriter, loc, aTy, a);
 
     Value b = op.getB();
     VectorType bOrigTy = cast<VectorType>(b.getType());
@@ -678,7 +679,7 @@ struct TritonMatrixDPASLowering
     VectorType bTy = VectorType::get(
         bitWidth / packedBType.getIntOrFloatBitWidth(), packedBType);
     if (bOrigTy != bTy)
-      b = rewriter.create<LLVM::BitcastOp>(loc, bTy, b);
+      b = LLVM::BitcastOp::create(rewriter, loc, bTy, b);
 
     Value c = op.getC();
     VectorType cOrigTy = cast<VectorType>(c.getType());
@@ -689,7 +690,7 @@ struct TritonMatrixDPASLowering
                       ? VectorType::get(cOrigTy.getShape(), int16Ty)
                       : cOrigTy;
     if (cOrigTy != cTy)
-      c = rewriter.create<LLVM::BitcastOp>(loc, cTy, c);
+      c = LLVM::BitcastOp::create(rewriter, loc, cTy, c);
 
     std::string fnName = "__spirv_SubgroupMatrixMultiplyAccumulateINTEL";
     SmallVector<Type> argTypes{int32Ty, aTy, bTy, cTy, int32Ty};
@@ -713,7 +714,7 @@ struct TritonMatrixDPASLowering
                       rewriter, fnName, cTy, argTypes, args, {}, funcAttrs)
                       ->getResult(0);
     if (cOrigTy != cTy)
-      result = rewriter.create<LLVM::BitcastOp>(loc, cOrigTy, result);
+      result = LLVM::BitcastOp::create(rewriter, loc, cOrigTy, result);
 
     rewriter.replaceOp(op, result);
     return success();
@@ -787,9 +788,9 @@ struct TritonMatrix2DBlockLoadLowering
     auto b = TritonLLVMOpBuilder(loc, rewriter);
 
     VectorType resType = op.getRes().getType();
-    auto dest = rewriter.create<LLVM::AllocaOp>(
-        loc, ptr_ty(ctx), resType.getElementType(),
-        b.i32_val(resType.getNumElements()));
+    auto dest = LLVM::AllocaOp::create(rewriter, loc, ptr_ty(ctx),
+                                       resType.getElementType(),
+                                       b.i32_val(resType.getNumElements()));
     std::string fnName = "__spirv_Subgroup2DBlockLoad";
     if (op.getVnniTransform())
       fnName += "Transform";
@@ -838,7 +839,7 @@ struct TritonMatrix2DBlockLoadLowering
                              *optCacheControls);
     }
 
-    rewriter.replaceOp(op, rewriter.create<LLVM::LoadOp>(loc, resType, dest));
+    rewriter.replaceOp(op, LLVM::LoadOp::create(rewriter, loc, resType, dest));
 
     return success();
   }
@@ -873,10 +874,10 @@ struct TritonMatrix2DBlockStoreLowering
     auto b = TritonLLVMOpBuilder(loc, rewriter);
 
     VectorType storeValType = op.getStoredVal().getType();
-    auto storeValPtr = rewriter.create<LLVM::AllocaOp>(
-        loc, ptr_ty(ctx), storeValType.getElementType(),
+    auto storeValPtr = LLVM::AllocaOp::create(
+        rewriter, loc, ptr_ty(ctx), storeValType.getElementType(),
         b.i32_val(storeValType.getNumElements()));
-    rewriter.create<LLVM::StoreOp>(loc, op.getStoredVal(), storeValPtr);
+    LLVM::StoreOp::create(rewriter, loc, op.getStoredVal(), storeValPtr);
 
     std::string fnName = "__spirv_Subgroup2DBlockStoreINTEL";
@@ -1026,7 +1027,7 @@ static std::string getSubGroupBlockManglingName(OpType op, Type type) {
       TypeSwitch<Type, Type>(type)
           .Case([](VectorType vecType) { return vecType.getElementType(); })
           // Scalar case
-          .Default(llvm::identity<Type>());
+          .Default([](Type t) { return t; });
   const unsigned numElems =
       TypeSwitch<Type, unsigned>(type)
           .Case([](VectorType vecType) { return vecType.getNumElements(); })
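// llvm/ADT/identity.h was dropped upstream, so the scalar fallback in the
// mangling helper above becomes an explicit identity lambda. A sketch of the
// resulting TypeSwitch (the template arguments are assumed for illustration):
//
//   Type elemTy =
//       TypeSwitch<Type, Type>(type)
//           .Case([](VectorType vecType) { return vecType.getElementType(); })
//           .Default([](Type t) { return t; });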
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/BF16Casts.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/BF16Casts.cpp
index 13b9997bd0..225c60c1e3 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/BF16Casts.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/BF16Casts.cpp
@@ -106,7 +106,7 @@ Value convertBf16ToFp32(Location loc, ConversionPatternRewriter &rewriter,
     return call.getResult();
   }
 
-  return rewriter.create<LLVM::FPExtOp>(loc, f32_ty, v);
+  return LLVM::FPExtOp::create(rewriter, loc, f32_ty, v);
 }
 }
@@ -145,7 +145,7 @@ Value convertFp32ToBf16(Location loc, ConversionPatternRewriter &rewriter,
     return b.bitcast(call.getResult(), outTy);
   }
 
-  return rewriter.create<LLVM::FPTruncOp>(loc, bf16_ty, v);
+  return LLVM::FPTruncOp::create(rewriter, loc, bf16_ty, v);
 }
 }
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
index 4b01f8ae7d..94e448667e 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -179,9 +179,8 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
       for (Value val : inVals) {
        for (int32_t i = 0; i < numElems; ++i) {
          res.push_back(
-              rewriter
-                  .create<mlir::gpu::ShuffleOp>(loc, val, b.i32_val(i), width,
-                                                mlir::gpu::ShuffleMode::IDX)
+              mlir::gpu::ShuffleOp::create(rewriter, loc, val, b.i32_val(i),
+                                           width, mlir::gpu::ShuffleMode::IDX)
                  .getShuffleResult());
        }
      }
@@ -192,9 +191,8 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
       for (int32_t i = 0; i < numElems; ++i) {
        for (Value val : inVals) {
          res.push_back(
-              rewriter
-                  .create<mlir::gpu::ShuffleOp>(loc, val, b.i32_val(i), width,
-                                                mlir::gpu::ShuffleMode::IDX)
+              mlir::gpu::ShuffleOp::create(rewriter, loc, val, b.i32_val(i),
+                                           width, mlir::gpu::ShuffleMode::IDX)
                  .getShuffleResult());
        }
      }
@@ -339,12 +337,12 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     unsigned offsetBitWidth = offsetType.getIntOrFloatBitWidth();
     Value subGroupId = getValueOrCreateCastToIndexLike(
         rewriter, loc, offsetType,
-        rewriter.create<mlir::gpu::SubgroupIdOp>(
-            loc, /*upper_bound=*/IntegerAttr{}));
+        mlir::gpu::SubgroupIdOp::create(rewriter, loc,
+                                        /*upper_bound=*/IntegerAttr{}));
     Value subGroupLocalId = getValueOrCreateCastToIndexLike(
         rewriter, loc, offsetType,
-        rewriter.create<mlir::gpu::LaneIdOp>(loc,
-                                             /*upper_bound=*/IntegerAttr{}));
+        mlir::gpu::LaneIdOp::create(rewriter, loc,
+                                    /*upper_bound=*/IntegerAttr{}));
     Value subGroupOffset =
         b.mul(subGroupId, b.int_val(offsetBitWidth, rowLength * numRows));
     Value subGroupBasePtr =
@@ -353,7 +351,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     Value base = subGroupBasePtr;
     // Store in matrix, transposed
     for (Value val : inVals) {
-      rewriter.create<TritonGEN::SubGroupBlockWriteOp>(loc, base, val);
+      TritonGEN::SubGroupBlockWriteOp::create(rewriter, loc, base, val);
       base = b.gep(base.getType(), elementType, base,
                    ArrayRef<LLVM::GEPArg>{rowLength},
                    LLVM::GEPNoWrapFlags::inbounds);
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp
index 71f1a20b9f..c2c0799bc2 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/DotOpToLLVM/DPAS.cpp
@@ -190,8 +190,8 @@ class DotOpDPASConversionHelper {
           TritonGEN::PrecisionTypeAttr::get(B.getContext(), BPrecision);
       auto RC = IntegerAttr::get(rewriter.getIntegerType(32),
                                  dpasEncoding.getRepeatCount());
-      fc.at({b, m, n}) = rewriter.create<TritonGEN::MatrixDPASOp>(
-          loc, dTy, tb.bitcast(valc, cTy), tb.bitcast(valA, aTy),
+      fc.at({b, m, n}) = TritonGEN::MatrixDPASOp::create(
+          rewriter, loc, dTy, tb.bitcast(valc, cTy), tb.bitcast(valA, aTy),
           tb.bitcast(valB, bTy), pA, pB, RC);
     };
@@ -350,7 +350,7 @@ class DotOpDPASConversionHelper {
     for (int j = 0; j < inner; ++j) {
       for (int repOuter = 0; repOuter < repClusterOuter; ++repOuter) {
         for (int repInner = 0; repInner < repClusterInner; ++repInner) {
-          Value matVal = rewriter.create<LLVM::UndefOp>(loc, dotOpTy);
+          Value matVal = LLVM::UndefOp::create(rewriter, loc, dotOpTy);
           if (numElemsPerOperand != 1)
             for (int k = 0; k < numElemsPerOperand; ++k)
               matVal = tb.insert_element(dotOpTy, matVal, elems[offset++],
@@ -358,7 +358,7 @@ class DotOpDPASConversionHelper {
           else
             matVal = elems[offset++];
           if (isFToTF32Enabled)
-            matVal = rewriter.create<TritonGEN::FToTf32Op>(loc, matVal)
+            matVal = TritonGEN::FToTf32Op::create(rewriter, loc, matVal)
                          .getResult();
           vals[{b, i * repClusterOuter + repOuter,
                 j * repClusterInner + repInner}] = matVal;
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/ElementwiseOpToLLVM.cpp
index 95d4cce003..7c35faf7dd 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/ElementwiseOpToLLVM.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/ElementwiseOpToLLVM.cpp
@@ -490,7 +490,7 @@ static SmallVector<Value> Fp_to_Fp8_RTNE(Location loc,
   // if (dstExp) dstMan = srcMan * 2^(DstMBits - SrcMBits)
   // else dstMan = src * 2^(DstMBits + DstBias - 1)
   Value scale = fval(1.0 / static_cast<double>(1 << (SrcMBits - DstMBits)));
-  man = b.create<LLVM::UIToFPOp>(srcTy, man);
+  man = LLVM::UIToFPOp::create(b, b.getLoc(), srcTy, man);
   if constexpr (SrcBias != DstBias) {
     exp = b.smax(b.sub(exp, ival(SrcBias - DstBias)), zero);
     Value isSubnorm = b.icmp_eq(exp, zero);
@@ -500,7 +500,8 @@ static SmallVector<Value> Fp_to_Fp8_RTNE(Location loc,
         scale);
   }
   man = b.fmul(man, scale, LLVM::FastmathFlags::fast);
-  man = b.create<LLVM::FPToUIOp>(srcITy, b.create<LLVM::NearbyintOp>(man));
+  man = LLVM::FPToUIOp::create(b, b.getLoc(), srcITy,
+                               LLVM::NearbyintOp::create(b, b.getLoc(), man));
   val = b.add(b.shl(exp, ival(DstMBits)), man);
   val = b.umin(ival(DST_MAX), val);
 
@@ -751,7 +752,7 @@ appendOrGetExternFuncOp(ConversionPatternRewriter &rewriter, Operation *op,
 
   auto parent = op->getParentOfType<LLVMFuncOp>();
   OpBuilder b(parent);
-  auto ret = b.create<LLVMFuncOp>(op->getLoc(), funcName, funcType);
+  auto ret = LLVMFuncOp::create(b, op->getLoc(), funcName, funcType);
   ret.getOperation()->setAttr("libname",
                               StringAttr::get(op->getContext(), libname));
   ret.getOperation()->setAttr("libpath",
@@ -784,7 +785,7 @@ struct FpToFpOpConversion
   static Value convertFp16ToFp32(Location loc,
                                  ConversionPatternRewriter &rewriter,
                                  const Value &v) {
-    return rewriter.create<LLVM::FPExtOp>(loc, f32_ty, v);
+    return LLVM::FPExtOp::create(rewriter, loc, f32_ty, v);
   }
 
   static Value convertFp32ToFp16(Location loc,
@@ -792,8 +793,8 @@ struct FpToFpOpConversion
                                  const Value &v,
                                  const triton::RoundingMode rounding) {
     MLIRContext *ctx = rewriter.getContext();
-    return rewriter.create<LLVM::ConstrainedFPTruncIntr>(
-        loc, f16_ty, v,
+    return LLVM::ConstrainedFPTruncIntr::create(
+        rewriter, loc, f16_ty, v,
         LLVM::RoundingModeAttr::get(
             ctx, LLVM::intel::convertTritonRoundingModeToLLVM(rounding)),
         arith::getLLVMDefaultFPExceptionBehavior(*ctx));
@@ -975,7 +976,7 @@ struct ElementwiseOpConversion
            !getElementType(operands[0][1]).isBF16()) &&
           "unsupported conversion");
     return {
-        rewriter.create<DestOp>(loc, elemTy, operands[0][0], operands[0][1])};
+        DestOp::create(rewriter, loc, elemTy, operands[0][0], operands[0][1])};
   }
 };
 
@@ -992,16 +993,18 @@ struct SIToFPOpConversion
     Type inElemTy = getElementType(op.getIn());
     Type outElemTy = getElementType(op.getOut());
     if (outElemTy.isBF16() && inElemTy.isInteger(8) && operands.size() >= 4) {
-      auto value = rewriter.create<LLVM::SIToFPOp>(loc, f32_ty, operands[0][0]);
+      auto value =
+          LLVM::SIToFPOp::create(rewriter, loc, f32_ty, operands[0][0]);
       return {
           intel::convertFp32ToBf16(loc, rewriter, value, RoundingMode::RTNE)};
     } else if (outElemTy.isBF16()) {
-      auto value = rewriter.create<LLVM::SIToFPOp>(loc, f32_ty, operands[0][0]);
+      auto value =
+          LLVM::SIToFPOp::create(rewriter, loc, f32_ty, operands[0][0]);
       return {
           intel::convertFp32ToBf16(loc, rewriter, value, RoundingMode::RTNE)};
     }
 
-    return {rewriter.create<LLVM::SIToFPOp>(loc, elemTy, operands[0][0])};
+    return {LLVM::SIToFPOp::create(rewriter, loc, elemTy, operands[0][0])};
   }
 };
 
@@ -1018,10 +1021,10 @@ struct FPToSIOpConversion
     auto inElemTy = getElementType(op.getIn());
     if (inElemTy.isBF16()) {
       auto value = intel::convertBf16ToFp32(loc, rewriter, operands[0][0]);
-      return {rewriter.create<LLVM::FPToSIOp>(loc, elemTy, value)};
+      return {LLVM::FPToSIOp::create(rewriter, loc, elemTy, value)};
     }
 
-    return {rewriter.create<LLVM::FPToSIOp>(loc, elemTy, operands[0][0])};
+    return {LLVM::FPToSIOp::create(rewriter, loc, elemTy, operands[0][0])};
   }
 };
 
@@ -1042,7 +1045,7 @@ struct ExtFOpConversion
       return {intel::convertBf16ToFp32(loc, rewriter, operands[0][0])};
     }
 
-    return {rewriter.create<LLVM::FPExtOp>(loc, elemTy, operands[0][0])};
+    return {LLVM::FPExtOp::create(rewriter, loc, elemTy, operands[0][0])};
   }
 };
 
@@ -1064,7 +1067,7 @@ struct TruncFOpConversion
               intel::convertFp32ToBf16(loc, rewriter, operands[0][0],
                                        RoundingMode::RTNE)};
     }
 
-    return {rewriter.create<LLVM::FPTruncOp>(loc, elemTy, operands[0][0])};
+    return {LLVM::FPTruncOp::create(rewriter, loc, elemTy, operands[0][0])};
   }
 };
 
@@ -1125,14 +1128,14 @@ struct AbsFOpConversion
       assert(num_bits <= 16);
       auto mask = (1u << (num_bits - 1u)) - 1u;
       auto maskAttr = rewriter.getIntegerAttr(elemTy, mask);
-      auto maskConst = rewriter.create<LLVM::ConstantOp>(loc, maskAttr);
+      auto maskConst = LLVM::ConstantOp::create(rewriter, loc, maskAttr);
       Value res = b.and_(v, maskConst);
       if (llvm::isa<FloatType>(origTy))
         res = b.bitcast(res, origTy);
       return {res};
     }
 
-    return {rewriter.create<LLVM::FAbsOp>(loc, elemTy, v)};
+    return {LLVM::FAbsOp::create(rewriter, loc, elemTy, v)};
   }
 };
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/Fp4ToFpOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/Fp4ToFpOpToLLVM.cpp
index 97edc925db..a6d4df9bad 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/Fp4ToFpOpToLLVM.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/Fp4ToFpOpToLLVM.cpp
@@ -40,7 +40,7 @@ class CachingBuilder : public TritonLLVMOpBuilder {
     auto it = cache.find(key);
     if (it != cache.end())
       return it->second;
-    auto cst = builder->create<LLVM::ConstantOp>(loc, type, attr);
+    auto cst = LLVM::ConstantOp::create(*builder, loc, type, attr);
     cache[key] = cst;
     return cst;
   }
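// The elementwise conversions above keep routing bf16 through f32: an integer
// source is first widened with LLVM::SIToFPOp to f32 and then rounded to bf16
// with RTNE, since no direct int-to-bf16 conversion is assumed here. A
// condensed sketch (`intVal` is an assumed placeholder):
//
//   Value f32Val = LLVM::SIToFPOp::create(rewriter, loc, f32_ty, intVal);
//   Value bf16Val =
//       intel::convertFp32ToBf16(loc, rewriter, f32Val, RoundingMode::RTNE);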
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/HistogramOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/HistogramOpToLLVM.cpp
index 6ce6f4b1f0..bc043d0f62 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/HistogramOpToLLVM.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/HistogramOpToLLVM.cpp
@@ -64,8 +64,8 @@ static SmallVector<Value> computeWarpLevelHistogram(
     }
     // at this point, 'bin_mask' tells you which elements are in the kth bin
     // owned by this thread.
-    Value bitCount = rewriter.create<LLVM::CtPopOp>(
-        loc, int_ty(numThreadPerWarp), binMask);
+    Value bitCount = LLVM::CtPopOp::create(rewriter, loc,
+                                           int_ty(numThreadPerWarp), binMask);
     if (numThreadPerWarp > 32)
       bitCount = b.trunc(i32_ty, bitCount);
     else if (numThreadPerWarp < 32)
@@ -78,8 +78,8 @@ static SmallVector<Value> computeWarpLevelHistogram(
 
 static void atomicAdd(Value ptr, Value val, Location loc,
                       ConversionPatternRewriter &rewriter) {
-  rewriter.create<LLVM::AtomicRMWOp>(loc, LLVM::AtomicBinOp::add, ptr, val,
-                                     LLVM::AtomicOrdering::monotonic);
+  LLVM::AtomicRMWOp::create(rewriter, loc, LLVM::AtomicBinOp::add, ptr, val,
+                            LLVM::AtomicOrdering::monotonic);
 }
 
 static SmallVector<Value> computeCrossWarpHistogram(
@@ -113,7 +113,7 @@ static SmallVector<Value> computeCrossWarpHistogram(
       atomicAdd(sharedMemPtr, warpLevelHistogramValue, loc, rewriter);
     }
     if (afterAtomics) {
-      rewriter.create<LLVM::BrOp>(loc, afterAtomics);
+      LLVM::BrOp::create(rewriter, loc, afterAtomics);
       rewriter.setInsertionPointToStart(afterAtomics);
     }
     b.barrier();
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
index c45655d1e6..074ce14ca8 100644
--- a/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
+++ b/third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp
@@ -281,8 +281,8 @@ struct LoadStoreConversionBase {
     Value other;
     switch (*padding) {
     case PaddingOption::PAD_ZERO:
-      other = rewriter.create<LLVM::ConstantOp>(
-          loc, valueElemTy, rewriter.getZeroAttr(valueElemTy));
+      other = LLVM::ConstantOp::create(rewriter, loc, valueElemTy,
+                                       rewriter.getZeroAttr(valueElemTy));
       break;
     case PaddingOption::PAD_NAN: {
@@ -290,8 +290,9 @@ struct LoadStoreConversionBase {
              "Expect element type to be non-integer type");
       auto apNaN = llvm::APFloat::getNaN(
           cast<FloatType>(valueElemTy).getFloatSemantics());
-      other = rewriter.create<LLVM::ConstantOp>(
-          loc, valueElemTy, rewriter.getFloatAttr(valueElemTy, apNaN));
+      other =
+          LLVM::ConstantOp::create(rewriter, loc, valueElemTy,
+                                   rewriter.getFloatAttr(valueElemTy, apNaN));
     } break;
     }
@@ -784,10 +785,10 @@ struct PrefetchOpConversion
     StringAttr kWarp = S("warp");
     StringAttr kBlock = S("block");
 
-    Value warpId = rewriter.create<arith::IndexCastOp>(
-        loc, i32_ty,
-        rewriter.create<mlir::gpu::SubgroupIdOp>(loc,
-                                                 /*upperBound=*/nullptr));
+    Value warpId = arith::IndexCastOp::create(
+        rewriter, loc, i32_ty,
+        mlir::gpu::SubgroupIdOp::create(rewriter, loc,
+                                        /*upperBound=*/nullptr));
 
     for (unsigned tile = 0; tile < numTilesPerWarp; ++tile) {
       unsigned off = tile * tileSizeInElem;
@@ -797,8 +798,8 @@ struct PrefetchOpConversion
       Value offsetX = b.add(offsets[1].second, offsetBaseX);
       Value offsetY = b.add(offsets[0].second, offsetBaseY);
 
-      auto newOp = rewriter.create<TritonGEN::Matrix2DBlockPrefetchOp>(
-          loc,
+      auto newOp = TritonGEN::Matrix2DBlockPrefetchOp::create(
+          rewriter, loc,
           /*ptr*/ base,
           /*base_width*/ baseWidth,
           /*base_height*/ baseHeight,
@@ -1000,8 +1001,8 @@ struct PrefetchOpConversion
       Value addr = targetInfo.shuffleIdx(
           rewriter, loc, baseAddrs[{offsetM, offsetN}], 0);
 
-      auto newOp = rewriter.create<TritonGEN::Matrix2DBlockPrefetchOp>(
-          loc,
+      auto newOp = TritonGEN::Matrix2DBlockPrefetchOp::create(
+          rewriter, loc,
           /*ptr*/ addr,
           /*base_width*/ baseWidth,
           /*base_height*/ baseHeight,
@@ -1162,9 +1163,9 @@ struct LoadOpToBlockIOConversion
     unsigned threadsPerWarp =
         product(getThreadsPerWarp(dpasLayout, tensorShape));
-    Value warpId = rewriter.create<arith::IndexCastOp>(
-        loc, i32_ty,
-        rewriter.create<mlir::gpu::SubgroupIdOp>(loc, /*upperBound=*/nullptr));
+    Value warpId = arith::IndexCastOp::create(
+        rewriter, loc, i32_ty,
+        mlir::gpu::SubgroupIdOp::create(rewriter, loc, /*upperBound=*/nullptr));
     SmallVector<Value> multiDimWarpId =
         delinearize(rewriter, loc, warpId, warpsPerCTA, dpasWarpsOrder);
@@ -1246,8 +1247,8 @@ struct LoadOpToBlockIOConversion
               b.add(warpId1Offset,
                     b.i32_val(n * replicaStride[1] + repN * tileWidth));
-          auto load2dOp = rewriter.create<TritonGEN::Matrix2DBlockLoadOp>(
-              loc, load2DGenXType,
+          auto load2dOp = TritonGEN::Matrix2DBlockLoadOp::create(
+              rewriter, loc, load2DGenXType,
               /*ptr*/ base,
               /*base_width*/ b.mul(baseWidth, elemSizeInBytes),
               /*base_height*/ baseHeight,
@@ -1762,8 +1763,8 @@ struct LoadOpToBlockIOConversion
         offsetX = b.udiv(offsetX, b.i32_val(32 / originalElemBits));
       }
 
-      auto load2dOp = rewriter.create<TritonGEN::Matrix2DBlockLoadOp>(
-          loc, load2DGenXType,
+      auto load2dOp = TritonGEN::Matrix2DBlockLoadOp::create(
+          rewriter, loc, load2DGenXType,
           /*ptr*/ base,
           /*base_width*/ b.mul(baseWidth, elemSizeInBytes),
           /*base_height*/ baseHeight,
@@ -1815,8 +1816,9 @@ struct LoadOpToBlockIOConversion
             });
           }
           DenseI32ArrayAttr attr = rewriter.getDenseI32ArrayAttr(indices);
-          Value loadVal = rewriter.create<LLVM::ShuffleVectorOp>(
-              loc, packedDPASOperandType, load2dOp, load2dOp, attr);
+          Value loadVal = LLVM::ShuffleVectorOp::create(
+              rewriter, loc, packedDPASOperandType, load2dOp, load2dOp,
+              attr);
 
           // Save the decomposed vals to the map;
           switch (opIdx) {
@@ -2068,7 +2070,7 @@ struct LoadOpToBlockIOConversion
       if (!splatVal.isZero()) {
         otherElems = SmallVector<Value>(
             numElems,
-            rewriter.create<LLVM::ConstantOp>(loc, elemTy, splatVal));
+            LLVM::ConstantOp::create(rewriter, loc, elemTy, splatVal));
       }
     };
@@ -2240,10 +2242,10 @@ struct LoadOpToBlockIOConversion
       } break;
      }
    }
-    Value warpId = rewriter.create<arith::IndexCastOp>(
-        loc, i32_ty,
-        rewriter.create<mlir::gpu::SubgroupIdOp>(loc,
-                                                 /*upperBound=*/nullptr));
+    Value warpId = arith::IndexCastOp::create(
+        rewriter, loc, i32_ty,
+        mlir::gpu::SubgroupIdOp::create(rewriter, loc,
+                                        /*upperBound=*/nullptr));
 
     SmallVector<Value> unpackedLoadedVals(numElems);
     for (size_t elemIdx = 0; elemIdx < numElems; elemIdx += numElemsPerLoad) {
@@ -2290,8 +2292,8 @@ struct LoadOpToBlockIOConversion
       }
       assert(numPackedVals > 0 && "numPackedVals should be greater than zero.");
 
-      Value ret = rewriter.create<TritonGEN::Matrix2DBlockLoadOp>(
-          loc, load2DGenXType,
+      Value ret = TritonGEN::Matrix2DBlockLoadOp::create(
+          rewriter, loc, load2DGenXType,
           /*ptr*/ addrElem,
           /*base_width*/ adjustedBaseWidth,
           /*base_height*/ baseHeight,
@@ -2344,8 +2346,8 @@ struct LoadOpToBlockIOConversion
           shuffleIndices[valueIndex] = firstIndexVecIdx;
        }
        DenseI32ArrayAttr attr = rewriter.getDenseI32ArrayAttr(shuffleIndices);
-        ret = rewriter.create<LLVM::ShuffleVectorOp>(
-            loc, load2DGenXType, firstIndexVec, firstIndexVec, attr);
+        ret = LLVM::ShuffleVectorOp::create(rewriter, loc, load2DGenXType,
+                                            firstIndexVec, firstIndexVec, attr);
      }
 
       unsigned numElemsPerUnpackedType =
@@ -2365,8 +2367,8 @@ struct LoadOpToBlockIOConversion
             indices[i] = opsIdx * numValsPerDPASOperand + i;
          }
          DenseI32ArrayAttr attr = rewriter.getDenseI32ArrayAttr(indices);
-          Value dpasOperand = rewriter.create<LLVM::ShuffleVectorOp>(
-              loc, packedDPASOperandType, ret, ret, attr);
+          Value dpasOperand = LLVM::ShuffleVectorOp::create(
+              rewriter, loc, packedDPASOperandType, ret, ret, attr);
 
           unpackedVal = b.bitcast(dpasOperand, unpackedType);
@@ -2525,8 +2527,8 @@ struct LoadOpConversion : public ConvertOpToLLVMPattern<triton::LoadOp>,
 
       Value other_ = b.undef(retTy);
       if (otherElems.empty()) {
-        other_ = rewriter.create<LLVM::ConstantOp>(loc, retTy,
-                                                   rewriter.getZeroAttr(retTy));
+        other_ = LLVM::ConstantOp::create(rewriter, loc, retTy,
+                                          rewriter.getZeroAttr(retTy));
      } else {
        for (size_t ii = 0; ii < nWords; ++ii) {
          size_t size = width / valueElemNBits;
@@ -2571,8 +2573,8 @@ struct LoadOpConversion : public ConvertOpToLLVMPattern<triton::LoadOp>,
     if (!pred)
       ret = createLoadWithAttrs()[0];
     else if (canUsePredicatedInstructions(op))
-      ret = rewriter.create<TritonGEN::PredicatedLoadOp>(
-          loc, retTy, addrElem, b.i64_val(alignment), pred, other_);
+      ret = TritonGEN::PredicatedLoadOp::create(
+          rewriter, loc, retTy, addrElem, b.i64_val(alignment), pred, other_);
     else {
       Block &endBlock = LLVM::intel::createPredicatedBlock(
           rewriter, loc, pred, SmallVector<Value>{other_},
@@ -2668,10 +2670,10 @@ struct StoreOpToBlockIOConversion
     Location loc = op.getLoc();
     auto b = TritonLLVMOpBuilder(loc, rewriter);
     MLIRContext *ctx = rewriter.getContext();
-    Value warpId = rewriter.create<arith::IndexCastOp>(
-        loc, i32_ty,
-        rewriter.create<mlir::gpu::SubgroupIdOp>(loc,
-                                                 /*upperBound=*/nullptr));
+    Value warpId = arith::IndexCastOp::create(
+        rewriter, loc, i32_ty,
+        mlir::gpu::SubgroupIdOp::create(rewriter, loc,
+                                        /*upperBound=*/nullptr));
 
     Value llPtr = adaptor.getPtr();
@@ -2861,7 +2863,7 @@ struct StoreOpToBlockIOConversion
       assert(numPackedVals > 0 && "numPackedVals should be greater than zero.");
 
       // Compose the matrix by stacking the scalar into vector.
-      Value storeVal = rewriter.create<LLVM::UndefOp>(loc, store2DComposeType);
+      Value storeVal = LLVM::UndefOp::create(rewriter, loc, store2DComposeType);
       for (size_t i = 0; i < numElemsPerStore; ++i) {
         unsigned registerIdx =
             regMapping.apply({{kRegister, valIdx + i}})[0].second;
@@ -2871,8 +2873,8 @@ struct StoreOpToBlockIOConversion
       if (store2DComposeType != store2DGenXType)
         storeVal = b.bitcast(storeVal, store2DGenXType);
 
-      auto newOp = rewriter.create<TritonGEN::Matrix2DBlockStoreOp>(
-          loc, addrElem, adjustedBaseWidth, adjustedBaseHeight, pitch,
+      auto newOp = TritonGEN::Matrix2DBlockStoreOp::create(
+          rewriter, loc, addrElem, adjustedBaseWidth, adjustedBaseHeight, pitch,
          // offsetX was in terms of original elements. The 2d block io requires
          // offsetX to be in terms of packed elements.
          b.udiv(offsetX, b.i32_val(numPackedVals)), offsetY,
@@ -3023,8 +3025,8 @@ struct StoreOpConversion
       if (!maskVal)
         auto _ = createStoreWithAttrs();
       else if (canUsePredicatedInstructions(op))
-        rewriter.create<TritonGEN::PredicatedStoreOp>(
-            loc, addrElem, vecWord, b.i64_val(alignment), maskVal);
+        TritonGEN::PredicatedStoreOp::create(rewriter, loc, addrElem, vecWord,
+                                             b.i64_val(alignment), maskVal);
       else
         LLVM::intel::createPredicatedBlock(rewriter, loc, maskVal,
                                            createStoreWithAttrs);
@@ -3119,18 +3121,18 @@ struct AtomicCASOpConversion
       ret = endBlock->getArgument(0);
     } else {
       if (op.getResult().use_empty())
-        rewriter.create<TritonGEN::BarrierOp>(loc,
-                                              TritonGEN::MemFence::GLOBAL);
+        TritonGEN::BarrierOp::create(rewriter, loc,
+                                     TritonGEN::MemFence::GLOBAL);
 
       auto createAtomicCASInstruction = [&]() -> SmallVector<Value> {
         Value localCasCmp = b.bitcast(casCmp, zero.getType());
         Value localCasVal = b.bitcast(casVal, zero.getType());
 
-        auto cmpxchg = rewriter.create<LLVM::AtomicCmpXchgOp>(
-            loc, casPtr, localCasCmp, localCasVal, successOrdering,
+        auto cmpxchg = LLVM::AtomicCmpXchgOp::create(
+            rewriter, loc, casPtr, localCasCmp, localCasVal, successOrdering,
             failureOrdering);
         Value newLoaded =
-            rewriter.create<LLVM::ExtractValueOp>(loc, cmpxchg, 0);
+            LLVM::ExtractValueOp::create(rewriter, loc, cmpxchg, 0);
 
         return SmallVector<Value>{newLoaded};
       };
@@ -3180,7 +3182,7 @@ struct AtomicCASOpConversion
       Block *endBlock = rewriter.splitBlock(headerBlock, headerBlock->begin());
 
       rewriter.setInsertionPointToEnd(insertionBlock);
-      rewriter.create<cf::CondBranchOp>(loc, mask, headerBlock, endBlock, ops);
+      cf::CondBranchOp::create(rewriter, loc, mask, headerBlock, endBlock, ops);
 
       rewriter.setInsertionPointToStart(headerBlock);
       casCmp = b.bitcast(casCmp, i16_ty);
@@ -3200,8 +3202,8 @@ struct AtomicCASOpConversion
       auto origValInt =
           bodyBlock->addArgument(firstValInt.getType(), firstValInt.getLoc());
rewriter.setInsertionPointToEnd(headerBlock); - rewriter.create<cf::BranchOp>(loc, bodyBlock, - SmallVector<Value>{firstValInt}); + cf::BranchOp::create(rewriter, loc, bodyBlock, + SmallVector<Value>{firstValInt}); rewriter.setInsertionPointToStart(bodyBlock); auto origValVec = b.bitcast(origValInt, vec_ty(i16_ty, 2)); @@ -3213,23 +3215,23 @@ struct AtomicCASOpConversion rewriter.splitBlock(bodyBlock, rewriter.getInsertionPoint()); rewriter.setInsertionPointToEnd(bodyBlock); SmallVector<Value> exitOps = {origVal}; - rewriter.create<cf::CondBranchOp>(loc, isEqual, casBlock, ValueRange{}, - endBlock, exitOps); + cf::CondBranchOp::create(rewriter, loc, isEqual, casBlock, ValueRange{}, + endBlock, exitOps); rewriter.setInsertionPointToStart(casBlock); Value newValVec = b.insert_element(origValVec, casVal, elemIndex); Value newValInt = b.bitcast(newValVec, i32_ty); - auto cmpxchg = rewriter.create<LLVM::AtomicCmpXchgOp>( - loc, alignedPtr, origValInt, newValInt, LLVM::AtomicOrdering::acq_rel, - LLVM::AtomicOrdering::monotonic); + auto cmpxchg = LLVM::AtomicCmpXchgOp::create( + rewriter, loc, alignedPtr, origValInt, newValInt, + LLVM::AtomicOrdering::acq_rel, LLVM::AtomicOrdering::monotonic); auto newLoaded = b.extract_val(cmpxchg, 0); auto done = b.extract_val(cmpxchg, 1); SmallVector<Value> endOps = {origVal}; - rewriter.create<cf::CondBranchOp>(loc, done, endBlock, endOps, bodyBlock, - SmallVector<Value>{newLoaded}); + cf::CondBranchOp::create(rewriter, loc, done, endBlock, endOps, bodyBlock, + SmallVector<Value>{newLoaded}); for (Value op : ops) endBlock->addArgument(op.getType(), op.getLoc()); @@ -3350,8 +3352,8 @@ struct AtomicRMWOpConversion ret = endBlock->getArgument(0); } else { if (op.getResult().use_empty()) - rewriter.create<TritonGEN::BarrierOp>(loc, - TritonGEN::MemFence::GLOBAL); + TritonGEN::BarrierOp::create(rewriter, loc, + TritonGEN::MemFence::GLOBAL); auto createAtomicBinOpInstruction = [&]() -> SmallVector<Value> { std::optional<LLVM::AtomicBinOp> rmwKind = @@ -3360,8 +3362,8 @@ struct AtomicRMWOpConversion llvm_unreachable("Unhandled RMWOp in case statement"); rmwVal = b.bitcast(rmwVal, valueElemTy); - auto atomRMW = rewriter.create<LLVM::AtomicRMWOp>( - loc, *rmwKind, rmwPtr, rmwVal, llvmMemOrdering); + auto atomRMW = LLVM::AtomicRMWOp::create( + rewriter, loc, *rmwKind, rmwPtr, rmwVal, llvmMemOrdering); return {atomRMW.getRes()}; }; @@ -3419,7 +3421,8 @@ struct AtomicRMWOpConversion rewriter.splitBlock(insertionBlock, rewriter.getInsertionPoint()); Block *endBlock = rewriter.splitBlock(headerBlock, headerBlock->begin()); rewriter.setInsertionPointToEnd(insertionBlock); - rewriter.create<cf::CondBranchOp>(loc, rmwMask, headerBlock, endBlock, ops); + cf::CondBranchOp::create(rewriter, loc, rmwMask, headerBlock, endBlock, + ops); rewriter.setInsertionPointToStart(headerBlock); rmwVal = b.bitcast(rmwVal, valueElemTy); @@ -3444,8 +3447,8 @@ struct AtomicRMWOpConversion auto origValInt = bodyBlock->addArgument(firstValInt.getType(), firstValInt.getLoc()); rewriter.setInsertionPointToEnd(headerBlock); - rewriter.create<cf::BranchOp>(loc, bodyBlock, - SmallVector<Value>{firstValInt}); + cf::BranchOp::create(rewriter, loc, bodyBlock, + SmallVector<Value>{firstValInt}); rewriter.setInsertionPointToEnd(bodyBlock); // Extract value for modification.
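// The hunks below are typical of this entire patch: every op construction
// moves from OpBuilder's member template to the op's static create method.
// A minimal sketch of the two spellings, assuming an OpBuilder `rewriter`,
// a Location `loc`, and two float Values `lhs`/`rhs` (illustrative only,
// not code from the patch):
//
//   Value sum = rewriter.create<LLVM::FAddOp>(loc, lhs, rhs);   // old form
//   Value sum = LLVM::FAddOp::create(rewriter, loc, lhs, rhs);  // new form
//
// Both build the same operation; only the builder entry point changes.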
@@ -3456,13 +3459,13 @@ struct AtomicRMWOpConversion Value newVal = nullptr; switch (atomicOp) { case RMWOp::FADD: - newVal = rewriter.create<LLVM::FAddOp>(loc, origVal, rmwVal); + newVal = LLVM::FAddOp::create(rewriter, loc, origVal, rmwVal); break; case RMWOp::MAX: - newVal = rewriter.create<LLVM::MaximumOp>(loc, origVal, rmwVal); + newVal = LLVM::MaximumOp::create(rewriter, loc, origVal, rmwVal); break; case RMWOp::MIN: - newVal = rewriter.create<LLVM::MinimumOp>(loc, origVal, rmwVal); + newVal = LLVM::MinimumOp::create(rewriter, loc, origVal, rmwVal); break; case RMWOp::XCHG: newVal = rmwVal; @@ -3479,14 +3482,15 @@ struct AtomicRMWOpConversion // Execute cmpxchg and loop back if it fails. auto successOrdering = LLVM::AtomicOrdering::acq_rel; auto failureOrdering = LLVM::AtomicOrdering::monotonic; - auto cmpxchg = rewriter.create<LLVM::AtomicCmpXchgOp>( - loc, alignPtr, origValInt, newValInt, successOrdering, failureOrdering); + auto cmpxchg = LLVM::AtomicCmpXchgOp::create( + rewriter, loc, alignPtr, origValInt, newValInt, successOrdering, + failureOrdering); auto newLoaded = b.extract_val(cmpxchg, 0); auto done = b.extract_val(cmpxchg, 1); assert(ops.size() == (size_t)1); SmallVector<Value> endOps = {origVal}; - rewriter.create<cf::CondBranchOp>(loc, done, endBlock, endOps, bodyBlock, - SmallVector<Value>{newLoaded}); + cf::CondBranchOp::create(rewriter, loc, done, endBlock, endOps, bodyBlock, + SmallVector<Value>{newLoaded}); for (Value op : ops) endBlock->addArgument(op.getType(), op.getLoc()); diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h index 7dfad56f23..8805af0b90 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h @@ -119,8 +119,9 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> { } // 3.
Add the new arguments to the region - auto amendedFuncOp = rewriter.create<triton::FuncOp>( - funcOp.getLoc(), funcOp.getName(), amendedFuncTy, amendedAttrs); + auto amendedFuncOp = + triton::FuncOp::create(rewriter, funcOp.getLoc(), funcOp.getName(), + amendedFuncTy, amendedAttrs); Region &region = funcOp.getBody(); if (!isKernel) region.addArgument(sharedPtrTy, loc); diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/SPIRVTargetInfo.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/SPIRVTargetInfo.cpp index 8d527980bf..ebd1862110 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/SPIRVTargetInfo.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/SPIRVTargetInfo.cpp @@ -25,13 +25,13 @@ Value createSPIRVGroupOp(RewriterBase &rewriter, Location loc, Type resultTy, Value clusterSize; if (numLanesToReduce != warpSize) { spvGroupOp = spirv::GroupOperation::ClusteredReduce; - clusterSize = rewriter.create<arith::ConstantOp>( - loc, rewriter.getI32Type(), - rewriter.getI32IntegerAttr(numLanesToReduce)); + clusterSize = + arith::ConstantOp::create(rewriter, loc, rewriter.getI32Type(), + rewriter.getI32IntegerAttr(numLanesToReduce)); } - return rewriter.create<GroupOp>(loc, resTy, spirv::Scope::Subgroup, - spvGroupOp, acc, clusterSize); + return GroupOp::create(rewriter, loc, resTy, spirv::Scope::Subgroup, + spvGroupOp, acc, clusterSize); } } // namespace diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/SPMDOpToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/SPMDOpToLLVM.cpp index 32f9c683e2..be7885e17e 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/SPMDOpToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/SPMDOpToLLVM.cpp @@ -19,7 +19,7 @@ struct GetNumProgramsOpConversion Location loc = op->getLoc(); assert(op.getAxisAsInt() < 3); Value blockId = - rewriter.create<::mlir::gpu::GridDimOp>(loc, dims[op.getAxisAsInt()]); + ::mlir::gpu::GridDimOp::create(rewriter, loc, dims[op.getAxisAsInt()]); rewriter.replaceOpWithNewOp<arith::IndexCastOp>(op, i32_ty, blockId); return success(); } diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp index 3a466f9874..d7d751613c 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp @@ -99,8 +99,8 @@ Value TargetInfo::permute(RewriterBase &rewriter, Location loc, Value a, Value TargetInfo::programId(RewriterBase &rewriter, Location loc, ModuleOp moduleOp, ProgramIDDim axis) const { Value blockId = - rewriter.create<::mlir::gpu::BlockIdOp>(loc, mlir::gpu::Dimension(axis)); - return rewriter.create<arith::IndexCastOp>(loc, i32_ty, blockId); + ::mlir::gpu::BlockIdOp::create(rewriter, loc, mlir::gpu::Dimension(axis)); + return arith::IndexCastOp::create(rewriter, loc, i32_ty, blockId); } bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc, diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp index 35f8713d0f..9ca921061b 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp @@ -154,8 +154,8 @@ struct ConvertTritonGPUToLLVM // Ask for 16B alignment on global_smem because that's the largest we should // ever need (4xi32).
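// For reference, the global built below should print roughly as follows
// (a sketch assuming elemTy is i8 and the GENX workgroup address space is 3;
// the exact assembly syntax varies across MLIR versions):
//
//   llvm.mlir.global external @global_smem()
//       {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>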
auto arrayTy = LLVM::LLVMArrayType::get(elemTy, 0); - auto global = b.create<LLVM::GlobalOp>( - loc, arrayTy, /*isConstant=*/false, LLVM::Linkage::External, + auto global = LLVM::GlobalOp::create( + b, loc, arrayTy, /*isConstant=*/false, LLVM::Linkage::External, "global_smem", /*value=*/Attribute(), /*alignment=*/16, // Add ROCm support. static_cast<unsigned>(TritonGEN::TritonGENMemorySpace::kWorkgroup)); diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp index d61fe1e593..d9bfc52023 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp @@ -52,7 +52,7 @@ static Value shuffleCommonImpl(Location loc, RewriterBase &rewriter, Value val, i.getDefiningOp()->getParentOfType<ModuleOp>()); Value widthConstant = b.i32_val(width); Value result = - rewriter.create<mlir::gpu::ShuffleOp>(loc, val, i, widthConstant, mode) + mlir::gpu::ShuffleOp::create(rewriter, loc, val, i, widthConstant, mode) .getShuffleResult(); if (shuffleType != valType) { diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h index 7fbae8e2a0..0445e459b5 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h @@ -39,7 +39,7 @@ Block &createPredicatedBlock(RewriterBase &rewriter, Location loc, Value cond, Block *endBlock = rewriter.splitBlock(thenBlock, thenBlock->begin()); rewriter.setInsertionPointToEnd(insertionBlock); - rewriter.create<cf::CondBranchOp>(loc, cond, thenBlock, endBlock, ops); + cf::CondBranchOp::create(rewriter, loc, cond, thenBlock, endBlock, ops); rewriter.setInsertionPointToStart(thenBlock); auto thenOps = thenOpsFn(); @@ -52,9 +52,9 @@ Block &createPredicatedBlock(RewriterBase &rewriter, Location loc, Value cond, "type mismatch found"); if (thenOps.empty()) - rewriter.create<cf::BranchOp>(loc, endBlock); + cf::BranchOp::create(rewriter, loc, endBlock); else - rewriter.create<cf::BranchOp>(loc, endBlock, thenOps); + cf::BranchOp::create(rewriter, loc, endBlock, thenOps); for (Value op : thenOps) endBlock->addArgument(op.getType(), op.getLoc()); diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/XeAsmFormat.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/XeAsmFormat.cpp index 7ffa9b9c31..f8e7c5978b 100644 --- a/third_party/intel/lib/TritonIntelGPUToLLVM/XeAsmFormat.cpp +++ b/third_party/intel/lib/TritonIntelGPUToLLVM/XeAsmFormat.cpp @@ -98,12 +98,12 @@ mlir::Value XeBuilder::launch(OpBuilder &rewriter, Location loc, Type resTy, bool hasSideEffect, bool isAlignStack, ArrayRef<Attribute> attrs) const { auto *ctx = rewriter.getContext(); - auto inlineAsm = rewriter.create<LLVM::InlineAsmOp>( - loc, resTy, getAllMLIRArgs(), // operands - dump(), // asm_string - getConstraints(), // constraints - hasSideEffect, // has_side_effects - isAlignStack, // is_align_stack + auto inlineAsm = LLVM::InlineAsmOp::create( + rewriter, loc, resTy, getAllMLIRArgs(), // operands + dump(), // asm_string + getConstraints(), // constraints + hasSideEffect, // has_side_effects + isAlignStack, // is_align_stack LLVM::TailCallKind::None, LLVM::AsmDialectAttr::get(ctx, LLVM::AsmDialect::AD_ATT), // asm_dialect diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp index 750ec4b660..9bdf6b4743 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp @@ -199,8 +199,8 @@ class BlockedToDPAS : public
OpRewritePattern<tt::DotOp> { // convert accumulator TensorValue oldAcc = dotOp.getC(); - auto newAcc = rewriter.create<ttg::ConvertLayoutOp>(oldAcc.getLoc(), - newRetType, oldAcc); + auto newAcc = ttg::ConvertLayoutOp::create(rewriter, oldAcc.getLoc(), + newRetType, oldAcc); // opA is packed to i16 for scalar types < 16 bits; opB is packed to i32. auto newAEncoding = ttg::DotOperandEncodingAttr::get( oldAType.getContext(), 0, newRetType.getEncoding(), @@ -213,11 +213,11 @@ class BlockedToDPAS : public OpRewritePattern<tt::DotOp> { auto newBType = RankedTensorType::get( oldBType.getShape(), oldBType.getElementType(), newBEncoding); - a = rewriter.create<ttg::ConvertLayoutOp>(a.getLoc(), newAType, a); - b = rewriter.create<ttg::ConvertLayoutOp>(b.getLoc(), newBType, b); - auto newDot = rewriter.create<tt::DotOp>(dotOp.getLoc(), newRetType, a, b, - newAcc, dotOp.getInputPrecision(), - dotOp.getMaxNumImpreciseAcc()); + a = ttg::ConvertLayoutOp::create(rewriter, a.getLoc(), newAType, a); + b = ttg::ConvertLayoutOp::create(rewriter, b.getLoc(), newBType, b); + auto newDot = tt::DotOp::create(rewriter, dotOp.getLoc(), newRetType, a, b, + newAcc, dotOp.getInputPrecision(), + dotOp.getMaxNumImpreciseAcc()); rewriter.replaceOpWithNewOp<ttg::ConvertLayoutOp>(dotOp, oldRetType, newDot.getResult()); @@ -234,17 +234,18 @@ static Value promoteOperand(OpBuilder &builder, Location loc, Value operand, return llvm::TypeSwitch<Type, Value>(elemType) .Case<FloatType>([&](auto) { - return builder.create<tt::FpToFpOp>(loc, tensorPromotedType, operand); + return tt::FpToFpOp::create(builder, loc, tensorPromotedType, operand); }) .Case<IntegerType>([&](auto) { unsigned tgtBitWidth = elemType.getIntOrFloatBitWidth(), valBitWidth = cast<RankedTensorType>(operand.getType()) .getElementTypeBitWidth(); - Operation *castOp = (valBitWidth <= tgtBitWidth) - ? builder.create<arith::ExtSIOp>( - loc, tensorPromotedType, operand) - : builder.create<arith::TruncIOp>( - loc, tensorPromotedType, operand); + Operation *castOp = + (valBitWidth <= tgtBitWidth) + ?
arith::ExtSIOp::create(builder, loc, tensorPromotedType, + operand) + : arith::TruncIOp::create(builder, loc, tensorPromotedType, + operand); return castOp->getResult(0); }); } @@ -298,7 +299,7 @@ updateUsers(Value result, const SetVector<Operation *> &slice) { OpBuilder builder(result.getContext()); builder.setInsertionPointAfterValue(result); auto transOp = - builder.create<tt::TransOp>(result.getLoc(), result, ArrayRef<int32_t>({1, 0})); + tt::TransOp::create(builder, result.getLoc(), result, ArrayRef<int32_t>({1, 0})); result.replaceUsesWithIf(transOp.getResult(), [&](OpOperand &operand) { return operand.getOwner() != transOp.getOperation() && slice.count(operand.getOwner()) == 0; @@ -376,8 +377,8 @@ static void sinkTransposeOp(tt::TransOp input) { queue.push_back(argTrans.value()); OpBuilder builder(forOp); OpOperand &init = forOp.getInitsMutable()[operand.getOperandNumber()]; - auto initTranspose = builder.create<tt::TransOp>( - forOp.getLoc(), init.get(), ArrayRef<int32_t>({1, 0})); + auto initTranspose = tt::TransOp::create( + builder, forOp.getLoc(), init.get(), ArrayRef<int32_t>({1, 0})); init.set(initTranspose); } } @@ -392,18 +393,18 @@ static tt::TransOp transposeDotOp(tt::DotScaledOp dotOp) { Value lhs = dotOp.getA(); std::array<int, 2> transOrder = {1, 0}; auto lhsTransposed = - builder.create<tt::TransOp>(lhs.getLoc(), lhs, transOrder); + tt::TransOp::create(builder, lhs.getLoc(), lhs, transOrder); Value rhs = dotOp.getB(); auto rhsTransposed = - builder.create<tt::TransOp>(rhs.getLoc(), rhs, transOrder); + tt::TransOp::create(builder, rhs.getLoc(), rhs, transOrder); Value c = dotOp.getC(); - auto cTransposed = builder.create<tt::TransOp>(c.getLoc(), c, transOrder); - auto result = builder.create<tt::DotScaledOp>( - dotOp.getLoc(), cTransposed.getType(), rhsTransposed, lhsTransposed, - cTransposed, dotOp.getBScale(), dotOp.getAScale(), dotOp.getBElemType(), - dotOp.getAElemType(), dotOp.getFastMath()); + auto cTransposed = tt::TransOp::create(builder, c.getLoc(), c, transOrder); + auto result = tt::DotScaledOp::create( + builder, dotOp.getLoc(), cTransposed.getType(), rhsTransposed, + lhsTransposed, cTransposed, dotOp.getBScale(), dotOp.getAScale(), + dotOp.getBElemType(), dotOp.getAElemType(), dotOp.getFastMath()); auto transOp = - builder.create<tt::TransOp>(result.getLoc(), result, transOrder); + tt::TransOp::create(builder, result.getLoc(), result, transOrder); dotOp.replaceAllUsesWith(transOp.getOperation()); dotOp.erase(); return transOp; diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp index c690aaffff..8877ff597d 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Coalesce.cpp @@ -339,8 +339,8 @@ struct CoalescePass if (tensorType && !isa(tensorType.getEncoding())) { RankedTensorType newType = getNewType(tensorType, encoding); - newArgs.push_back(builder.create<ttg::ConvertLayoutOp>( - op->getLoc(), newType, operand)); + newArgs.push_back(ttg::ConvertLayoutOp::create(builder, op->getLoc(), + newType, operand)); } else { assert(tt::isTensorPointerType(operand.getType()) && "Expecting operand to have blocked pointer type"); @@ -378,8 +378,8 @@ struct CoalescePass for (size_t i = 0; i < op->getNumResults(); i++) { Value newResult = newOp->getResult(i); if (newTypes[i] != op->getResultTypes()[i]) { - newResult = builder.create<ttg::ConvertLayoutOp>( - op->getLoc(), op->getResult(i).getType(), newResult); + newResult = ttg::ConvertLayoutOp::create( + builder, op->getLoc(), op->getResult(i).getType(), newResult); } op->getResult(i).replaceAllUsesWith(newResult); } diff --git
a/third_party/intel/lib/TritonIntelGPUTransforms/DecomposeScaledBlocked.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/DecomposeScaledBlocked.cpp index 7adf9aa1f7..63e3844e89 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/DecomposeScaledBlocked.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/DecomposeScaledBlocked.cpp @@ -44,7 +44,7 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> { vType.getElementType()); auto retTy = RankedTensorType::get(vType.getShape(), vType.getElementType(), encoding); - return rewriter.create<ConvertLayoutOp>(loc, retTy, v); + return ConvertLayoutOp::create(rewriter, loc, retTy, v); }; auto scaledA = scaleArg(rewriter, scaledDotOp, 0, computeType); @@ -52,8 +52,8 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> { auto scaledB = scaleArg(rewriter, scaledDotOp, 1, computeType); scaledB = cvtDotOperand(scaledB, 1); auto newDot = - rewriter.create<DotOp>(scaledDotOp.getLoc(), scaledA, scaledB, - scaledDotOp.getC(), InputPrecision::TF32, 0); + DotOp::create(rewriter, scaledDotOp.getLoc(), scaledA, scaledB, + scaledDotOp.getC(), InputPrecision::TF32, 0); rewriter.replaceOpWithNewOp<ConvertLayoutOp>(scaledDotOp, scaledDotOp.getType(), newDot); @@ -84,20 +84,20 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> { auto intType = rewriter.getIntegerType(intWidth); auto zexted = - rewriter.create<arith::ExtUIOp>(loc, scaleTy.clone(intType), scale); + arith::ExtUIOp::create(rewriter, loc, scaleTy.clone(intType), scale); // getFpMantissaWidth() returns the number of bits in the mantissa plus the // sign bit! int shiftValue = largeFpType.getFPMantissaWidth() - 1; auto shiftConst = - rewriter.create<arith::ConstantIntOp>(loc, shiftValue, intWidth); + arith::ConstantIntOp::create(rewriter, loc, shiftValue, intWidth); auto shift = - rewriter.create<SplatOp>(loc, scaleTy.clone(intType), shiftConst); - auto shlRes = rewriter.create<arith::ShLIOp>(loc, zexted, shift); + SplatOp::create(rewriter, loc, scaleTy.clone(intType), shiftConst); + auto shlRes = arith::ShLIOp::create(rewriter, loc, zexted, shift); Value scaleFP = - rewriter.create<BitcastOp>(loc, scaleTy.clone(largeFpType), shlRes); + BitcastOp::create(rewriter, loc, scaleTy.clone(largeFpType), shlRes); if (largeFpType != computeType) { - scaleFP = rewriter.create<arith::TruncFOp>( - loc, scaleTy.clone(computeType), scaleFP); + scaleFP = arith::TruncFOp::create(rewriter, loc, + scaleTy.clone(computeType), scaleFP); } return cast<TypedValue<RankedTensorType>>(scaleFP); } @@ -124,24 +124,24 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> { auto sliceEnc = SliceEncodingAttr::get(ctx, rank, blockedEnc); auto sliceType = RankedTensorType::get( scaleTy.getShape(), scaleTy.getElementType(), sliceEnc); - scale = rewriter.create<ConvertLayoutOp>(loc, sliceType, scale); + scale = ConvertLayoutOp::create(rewriter, loc, sliceType, scale); } - auto expandScale = rewriter.create<ExpandDimsOp>(loc, scale, rank); + auto expandScale = ExpandDimsOp::create(rewriter, loc, scale, rank); // 2.2) Broadcast the dimension to size 32 auto scaleShape = to_vector(scaleTy.getShape()); scaleShape.push_back(32); - auto broadcastScale = rewriter.create<BroadcastOp>( - loc, expandScale.getType().clone(scaleShape), expandScale); + auto broadcastScale = BroadcastOp::create( + rewriter, loc, expandScale.getType().clone(scaleShape), expandScale); // 2.3) Transpose the dimension to the scaled dimension auto transposeOrder = llvm::to_vector(llvm::seq(rank)); transposeOrder.insert(transposeOrder.begin() + dim + 1, rank); auto transposedScale = - rewriter.create<TransOp>(loc, broadcastScale, transposeOrder); + TransOp::create(rewriter, loc, broadcastScale, transposeOrder); // 2.4) Reshape to the shape of v
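// Worked example for steps 2.1-2.4 (illustrative shapes, not from the
// patch): a scale of shape [M, K/32] applied along dim = 1 of an [M, K]
// operand evolves as
//   expand_dims: [M, K/32]     -> [M, K/32, 1]
//   broadcast:   [M, K/32, 1]  -> [M, K/32, 32]
//   transpose:   the identity order [0, 1, 2] here; it only reorders when
//                the scaled dim is not the innermost one
//   reshape:     [M, K/32, 32] -> [M, K]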
scaleShape.pop_back(); scaleShape[dim] *= 32; auto reshapeScale = - rewriter.create<ReshapeOp>(loc, scaleShape, transposedScale); + ReshapeOp::create(rewriter, loc, scaleShape, transposedScale); return reshapeScale; } @@ -155,30 +155,29 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> { // Scale is NaN auto scaleTy = scale.getType(); - auto constFF = rewriter.create<arith::ConstantOp>( - loc, scaleTy, + auto constFF = arith::ConstantOp::create( + rewriter, loc, scaleTy, DenseElementsAttr::get(scaleTy, APInt(scaleTy.getElementTypeBitWidth(), 0xff))); auto scaleIsNan = cast<TypedValue<RankedTensorType>>( - rewriter - .create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq, scale, - constFF) + arith::CmpIOp::create(rewriter, loc, arith::CmpIPredicate::eq, scale, + constFF) .getResult()); auto cond = broadcastScale(rewriter, scaledDotOp, mod, scaleIsNan, dim); // Make the scale-is-NaN mask compatible with mxfp auto condTy = cond.getType(); condTy = RankedTensorType::get(condTy.getShape(), condTy.getElementType(), mxfp.getType().getEncoding()); - cond = rewriter.create<ConvertLayoutOp>(loc, condTy, cond); + cond = ConvertLayoutOp::create(rewriter, loc, condTy, cond); // Create NaN auto mxfpTy = mxfp.getType(); auto nan = APFloat::getNaN( cast<FloatType>(mxfpTy.getElementType()).getFloatSemantics()); - auto constNan = rewriter.create<arith::ConstantOp>( - loc, mxfpTy, DenseElementsAttr::get(mxfpTy, nan)); + auto constNan = arith::ConstantOp::create( + rewriter, loc, mxfpTy, DenseElementsAttr::get(mxfpTy, nan)); - auto result = rewriter.create<arith::SelectOp>(loc, cond, constNan, mxfp); + auto result = arith::SelectOp::create(rewriter, loc, cond, constNan, mxfp); return cast<TypedValue<RankedTensorType>>(result.getResult()); } @@ -208,11 +207,11 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> { (opIdx == 1 && resShape[rank - 1] != vShape[rank - 1])) { packDim = (packDim + 1) % 2; } - v = rewriter.create<Fp4ToFpOp>(loc, v, computeType, packDim); + v = Fp4ToFpOp::create(rewriter, loc, v, computeType, packDim); } else { auto vType16 = v.getType().clone(computeType); v = cast<TypedValue<RankedTensorType>>( - rewriter.create<FpToFpOp>(loc, vType16, v).getResult()); + FpToFpOp::create(rewriter, loc, vType16, v).getResult()); } if (!scale) return v; @@ -222,7 +221,7 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> { // this parameter transposed, as we do with the mxfp.
if (opIdx == 1) { auto order = getTransposeOrder(rank); - scale = rewriter.create<TransOp>(loc, scale, order); + scale = TransOp::create(rewriter, loc, scale, order); } // 1) Cast scale to compute type (fp16/bf16) @@ -232,11 +231,11 @@ class DecomposeScaledBlocked : public OpRewritePattern<DotScaledOp> { auto reshapeScale = broadcastScale(rewriter, scaledDotOp, mod, scale16, kDim); reshapeScale = - rewriter.create<ConvertLayoutOp>(loc, v.getType(), reshapeScale); + ConvertLayoutOp::create(rewriter, loc, v.getType(), reshapeScale); // 3) Multiply auto mxfp = cast<TypedValue<RankedTensorType>>( - rewriter.create<arith::MulFOp>(loc, v, reshapeScale).getResult()); + arith::MulFOp::create(rewriter, loc, v, reshapeScale).getResult()); // Skip NaN checks if fastMath if (fastMath) diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeDotOperands.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeDotOperands.cpp index fcbcff5bde..3b3deed37e 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeDotOperands.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeDotOperands.cpp @@ -124,9 +124,10 @@ class FuseTransWithLoad : public tt::intel::Fuser { SmallVector<Value> newOffsets(llvm::reverse(makeTensorPtrOp.getOffsets())); OpBuilder builder(makeTensorPtrOp); - Value ptr = builder.create<tt::MakeTensorPtrOp>( - makeTensorPtrOp.getLoc(), newPtrType, makeTensorPtrOp.getBase(), - newShape, newStrides, newOffsets, makeTensorPtrOp.getOrderAttr()); + Value ptr = tt::MakeTensorPtrOp::create( + builder, makeTensorPtrOp.getLoc(), newPtrType, + makeTensorPtrOp.getBase(), newShape, newStrides, newOffsets, + makeTensorPtrOp.getOrderAttr()); LLVM_DEBUG(llvm::dbgs() << "newMakeTensorPtrOp:\n " << ptr << "\n"); // ... and propagate it through the def-use chain. @@ -315,8 +316,8 @@ class FuseTransWithLoad : public tt::intel::Fuser { if (auto advanceOp = dyn_cast<tt::AdvanceOp>(user)) { OpBuilder rewriter(advanceOp); SmallVector<Value> newOffsets(llvm::reverse(advanceOp.getOffsets())); - auto newAdvanceOp = rewriter.create<tt::AdvanceOp>(loc, newVal.getType(), - newVal, newOffsets); + auto newAdvanceOp = tt::AdvanceOp::create(rewriter, loc, newVal.getType(), + newVal, newOffsets); LLVM_DEBUG(llvm::dbgs().indent(2) << "newAdvanceOp: " << newAdvanceOp << "\n"); cleanUp.insert(advanceOp); @@ -326,10 +327,11 @@ class FuseTransWithLoad : public tt::intel::Fuser { if (auto loadOp = dyn_cast<tt::LoadOp>(user)) { OpBuilder rewriter(loadOp); - auto newLoadOp = rewriter.create<tt::LoadOp>( - loadOp.getLoc(), newVal, loadOp.getMask(), loadOp.getOther(), - loadOp.getBoundaryCheckAttr(), loadOp.getPaddingAttr(), - loadOp.getCache(), loadOp.getEvict(), loadOp.getIsVolatile()); + auto newLoadOp = tt::LoadOp::create( + rewriter, loadOp.getLoc(), newVal, loadOp.getMask(), + loadOp.getOther(), loadOp.getBoundaryCheckAttr(), + loadOp.getPaddingAttr(), loadOp.getCache(), loadOp.getEvict(), + loadOp.getIsVolatile()); StringRef blockIOAttrName = ttgi::TritonIntelGPUDialect::getBlockIOAttrName(); diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeReductionLocality.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeReductionLocality.cpp index b8e863221c..fe369ff490 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeReductionLocality.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/OptimizeReductionLocality.cpp @@ -360,17 +360,17 @@ struct DpasOperandPattern final : OpRewritePattern<ReduceOp> { // Although this is a NOP, we have to pass allow_reorder=true as static // analysis will fail to infer it.
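// In IR terms, the reshape emitted below looks roughly like (a sketch; the
// exact assembly form depends on the Triton version):
//   %r = tt.reshape %v allow_reorder efficient_layout
//            : tensor<...> -> tensor<...>
// allow_reorder licenses the compiler to ignore element order, which is
// safe here because the reshape is a NOP on the underlying layout.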
- return rewriter.create<ReshapeOp>(op.getLoc(), - static_cast<RankedTensorType>(type), val, - /*allow_reorder=*/true, - /*efficient_layout=*/true); + return ReshapeOp::create(rewriter, op.getLoc(), + static_cast<RankedTensorType>(type), val, + /*allow_reorder=*/true, + /*efficient_layout=*/true); } Value performReduction(ReduceOp op, PatternRewriter &rewriter, Value val, int axis) const { assert(axis >= 0 && "Expecting positive axis"); - auto newOp = rewriter.create<ReduceOp>(op.getLoc(), val, /*axis=*/axis); + auto newOp = ReduceOp::create(rewriter, op.getLoc(), val, /*axis=*/axis); auto &newCombineOp = newOp.getCombineOp(); rewriter.cloneRegionBefore(op.getCombineOp(), newCombineOp, newCombineOp.end()); @@ -414,8 +414,8 @@ struct DpasOperandPattern final : OpRewritePattern<ReduceOp> { type.setEncoding(encoding); - return rewriter.create<ConvertLayoutOp>( - op.getLoc(), static_cast<RankedTensorType>(type), val); + return ConvertLayoutOp::create(rewriter, op.getLoc(), + static_cast<RankedTensorType>(type), val); } Value reshapeForFinalReduction(ReduceOp op, PatternRewriter &rewriter, @@ -451,10 +451,10 @@ struct DpasOperandPattern final : OpRewritePattern<ReduceOp> { // Although this is a NOP, we have to pass allow_reorder=true as static // analysis will fail to infer it. - return rewriter.create<ReshapeOp>(op.getLoc(), - static_cast<RankedTensorType>(type), val, - /*allow_reorder=*/true, - /*efficient_layout=*/true); + return ReshapeOp::create(rewriter, op.getLoc(), + static_cast<RankedTensorType>(type), val, + /*allow_reorder=*/true, + /*efficient_layout=*/true); } Value performFinalElementwiseReduction(ReduceOp op, PatternRewriter &rewriter, @@ -490,16 +490,16 @@ struct DpasOperandPattern final : OpRewritePattern<ReduceOp> { type.setEncoding(SliceEncodingAttr::get(getContext(), 0, parentEncoding)); - return rewriter.create<ReshapeOp>(op.getLoc(), - static_cast<RankedTensorType>(type), val, - /*allow_reorder=*/true, - /*efficient_layout=*/true); + return ReshapeOp::create(rewriter, op.getLoc(), + static_cast<RankedTensorType>(type), val, + /*allow_reorder=*/true, + /*efficient_layout=*/true); } Value convertLayoutToOriginalType(ReduceOp op, PatternRewriter &rewriter, Value val) const { - return rewriter.create<ConvertLayoutOp>( - op.getLoc(), op.getResult().front().getType(), val); + return ConvertLayoutOp::create(rewriter, op.getLoc(), + op.getResult().front().getType(), val); } }; diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp index 7d4ae2e1e4..8d765c86ea 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/MatmulLoopPipeline.cpp @@ -44,7 +44,7 @@ static void appendToYield(scf::ForOp forOp, ArrayRef<Value> newOperands) { operands.append(newOperands.begin(), newOperands.end()); OpBuilder builder(yieldOp); - builder.create<scf::YieldOp>(yieldOp->getLoc(), operands); + scf::YieldOp::create(builder, yieldOp->getLoc(), operands); yieldOp->erase(); } @@ -89,9 +89,9 @@ static ttg::DotOperandEncodingAttr allTransitiveUsesHaveDotEncoding(Value val) { static void createPrefetchOp(scf::ForOp &forOp, tt::LoadOp loadOp) { OpBuilder builder(forOp); builder.setInsertionPoint(loadOp); - auto prefetchOp = builder.create<ttgi::PrefetchOp>( - loadOp->getLoc(), loadOp.getPtr(), loadOp.getMask(), loadOp.getCache(), - loadOp.getEvict(), loadOp.getIsVolatile()); + auto prefetchOp = ttgi::PrefetchOp::create( + builder, loadOp->getLoc(), loadOp.getPtr(), loadOp.getMask(), + loadOp.getCache(), loadOp.getEvict(), loadOp.getIsVolatile()); // inherit attributes from the load operation auto attrs = loadOp->getAttrDictionary(); diff --git
a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp index f2a498ffd5..75e4b073c1 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Pipeliner/SoftwarePipeliner.cpp @@ -66,12 +66,12 @@ static void pipelineLoop( OpBuilder b(loop); Location loc = loop.getLoc(); b.setInsertionPointToStart(loop.getBody()); - b.create<triton::TritonGEN::SplitBarrierArriveOp>(loc, *barrierScope, - *barrierScope); + triton::TritonGEN::SplitBarrierArriveOp::create(b, loc, *barrierScope, + *barrierScope); auto yield = cast<scf::YieldOp>(loop.getBody()->getTerminator()); b.setInsertionPoint(yield); - b.create<triton::TritonGEN::SplitBarrierWaitOp>(loc, *barrierScope, - *barrierScope); + triton::TritonGEN::SplitBarrierWaitOp::create(b, loc, *barrierScope, + *barrierScope); } } diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/ReduceDataDuplication.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/ReduceDataDuplication.cpp index d93e5bacdc..d0c6771cf9 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/ReduceDataDuplication.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/ReduceDataDuplication.cpp @@ -71,10 +71,10 @@ class TritonIntelGPUReduceDataDuplicationPass mod.getContext(), dstDotOp, srcType.getShape(), sharedOrder, triton::gpu::getCTALayout(srcEncoding), srcType.getElementType()), sharedMemorySpace); - auto tmp = builder.create<triton::gpu::LocalAllocOp>( - cvtOp.getLoc(), tmpType, cvtOp.getSrc()); - auto newConvert = builder.create<triton::gpu::LocalLoadOp>(cvtOp.getLoc(), - dstType, tmp); + auto tmp = triton::gpu::LocalAllocOp::create(builder, cvtOp.getLoc(), + tmpType, cvtOp.getSrc()); + auto newConvert = triton::gpu::LocalLoadOp::create( + builder, cvtOp.getLoc(), dstType, tmp); cvtOp.replaceAllUsesWith(newConvert.getResult()); cvtOp.erase(); }); diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/ReduceVariableLiveness.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/ReduceVariableLiveness.cpp index 57772ba44f..ead4528d8e 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/ReduceVariableLiveness.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/ReduceVariableLiveness.cpp @@ -143,9 +143,9 @@ void createPrefetchOp(tt::LoadOp loadOp) { // TODO: Add prefetchOp after the last dependency between ptr and mask, // if this is extended to support masks.
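// createPrefetchOp clones only the load's address computation into a
// ttgi::PrefetchOp, i.e. it conceptually inserts a cache-warming
// `triton_intel_gpu.prefetch %ptr` ahead of the real load (op names are the
// in-tree ones; the assembly form is a sketch, not shown in this patch);
// the attribute copy below keeps the two ops matched for later passes.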
builder.setInsertionPointAfter(op); - auto prefetchOp = builder.create<ttgi::PrefetchOp>( - loadOp->getLoc(), loadOp.getPtr(), loadOp.getCache(), loadOp.getEvict(), - loadOp.getIsVolatile()); + auto prefetchOp = ttgi::PrefetchOp::create( + builder, loadOp->getLoc(), loadOp.getPtr(), loadOp.getCache(), + loadOp.getEvict(), loadOp.getIsVolatile()); // inherit attributes from the load operation auto attrs = loadOp->getAttrDictionary(); @@ -205,8 +205,8 @@ bool optimizeDotOperands(scf::ForOp forOp, SmallVector<Value> &prefetchedValue, } b.setInsertionPoint(insertBeforeOp); auto newLoad = cast<tt::LoadOp>(b.clone(*loadOp.getOperation())); - auto newCvt = b.create<ttg::ConvertLayoutOp>(tensorV.getLoc(), tensorType, - newLoad.getResult()); + auto newCvt = ttg::ConvertLayoutOp::create(b, tensorV.getLoc(), tensorType, + newLoad.getResult()); dotOp.setOperand(opId, newCvt.getResult()); // Update other users in the same loop, if any diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp index 807b97066f..9c04b7bbf5 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp @@ -531,8 +531,8 @@ Value LayoutPropagation::getValueAs(Value value, Attribute encoding) { OpBuilder rewriter(value.getContext()); rewriter.setInsertionPointAfterValue(rewrittenValue); auto tmpType = tensorType.cloneWithEncoding(encoding); - Value converted = rewriter.create<ConvertLayoutOp>(value.getLoc(), tmpType, - rewrittenValue); + Value converted = ConvertLayoutOp::create(rewriter, value.getLoc(), tmpType, + rewrittenValue); // TODO: we could cache the conversion. return converted; } @@ -588,9 +588,9 @@ Operation *LayoutPropagation::rewriteForOp(scf::ForOp forOp) { getValueAs(operand, *layouts[result].encodings.begin()); operands.push_back(convertedOperand); } - auto newForOp = rewriter.create<scf::ForOp>( - forOp.getLoc(), forOp.getLowerBound(), forOp.getUpperBound(), - forOp.getStep(), operands); + auto newForOp = + scf::ForOp::create(rewriter, forOp.getLoc(), forOp.getLowerBound(), + forOp.getUpperBound(), forOp.getStep(), operands); newForOp->setAttrs(forOp->getAttrs()); newForOp.getBody()->getOperations().splice( newForOp.getBody()->getOperations().begin(), @@ -639,7 +639,7 @@ Operation *LayoutPropagation::rewriteWhileOp(scf::WhileOp whileOp) { } auto newWhileOp = - rewriter.create<scf::WhileOp>(whileOp.getLoc(), returnTypes, operands); + scf::WhileOp::create(rewriter, whileOp.getLoc(), returnTypes, operands); SmallVector<Type> argsTypesBefore; for (Value operand : operands) argsTypesBefore.push_back(operand.getType()); @@ -686,8 +686,8 @@ Operation *LayoutPropagation::rewriteIfOp(scf::IfOp ifOp) { Attribute encoding = *(it->second.encodings.begin()); newResultTypes[i] = origType.cloneWithEncoding(encoding); } - auto newIfOp = rewriter.create<scf::IfOp>(ifOp.getLoc(), newResultTypes, - ifOp.getCondition(), true, true); + auto newIfOp = scf::IfOp::create(rewriter, ifOp.getLoc(), newResultTypes, + ifOp.getCondition(), true, true); newIfOp.getThenRegion().takeBody(ifOp.getThenRegion()); newIfOp.getElseRegion().takeBody(ifOp.getElseRegion()); for (auto [oldResult, newResult] : @@ -769,8 +769,8 @@ static void updateAdvanceOpChain(AdvanceOp advanceOp, StoreOp storeOp, Value makeTensorPtrOp, Value dataToStore) { OpBuilder rewriter(advanceOp); auto newAdvanceOp = - rewriter.create<AdvanceOp>(advanceOp.getLoc(), makeTensorPtrOp.getType(), - makeTensorPtrOp, advanceOp.getOffsets()); + AdvanceOp::create(rewriter, advanceOp.getLoc(),
makeTensorPtrOp.getType(), + makeTensorPtrOp, advanceOp.getOffsets()); SmallVector<Operation *> advanceOpUsers(advanceOp->getUsers()); for (Operation *user : advanceOpUsers) { @@ -848,8 +848,8 @@ bool LayoutPropagation::rewriteTensorPtrStoreOp(StoreOp storeOp) { // Create a new MakeTensorPtrOp with the new layout. OpBuilder rewriter(makeTensorPtrOp); - Value newMakeTensorPtrOp = rewriter.create<MakeTensorPtrOp>( - makeTensorPtrOp.getLoc(), newPtrType, makeTensorPtrOp.getBase(), + Value newMakeTensorPtrOp = MakeTensorPtrOp::create( + rewriter, makeTensorPtrOp.getLoc(), newPtrType, makeTensorPtrOp.getBase(), makeTensorPtrOp.getShape(), makeTensorPtrOp.getStrides(), makeTensorPtrOp.getOffsets(), makeTensorPtrOp.getOrderAttr()); @@ -941,7 +941,7 @@ Operation *LayoutPropagation::rewriteOp(Operation *op) { Value src = getValueAs(convertOp.getSrc(), srcEncoding); auto tensorType = cast<RankedTensorType>(op->getResult(0).getType()); auto newType = tensorType.cloneWithEncoding(encoding); - auto cvt = rewriter.create<ConvertLayoutOp>(op->getLoc(), newType, src); + auto cvt = ConvertLayoutOp::create(rewriter, op->getLoc(), newType, src); map(op->getResult(0), cvt.getResult()); return cvt.getOperation(); } @@ -949,8 +949,8 @@ Operation *LayoutPropagation::rewriteOp(Operation *op) { Operation *newOp = rewriter.clone(*op); auto tensorType = cast<RankedTensorType>(op->getResult(0).getType()); auto newType = tensorType.cloneWithEncoding(encoding); - auto cvt = rewriter.create<ConvertLayoutOp>(op->getLoc(), newType, - newOp->getResult(0)); + auto cvt = ConvertLayoutOp::create(rewriter, op->getLoc(), newType, + newOp->getResult(0)); map(op->getResult(0), cvt.getResult()); return cvt.getOperation(); } @@ -1041,9 +1041,9 @@ void LayoutRematerialization::reduceLoopCarriedValues() { TypeSwitch<Operation *>(user) .Case<LoadOp>([&](auto loadOp) { auto newLoadOp = - rewriter.create<LoadOp>(loc, rematRes, loadOp->getAttrs()); - auto convOp = rewriter.create<ConvertLayoutOp>( - loc, loadOp.getType(), newLoadOp.getResult()); + LoadOp::create(rewriter, loc, rematRes, loadOp->getAttrs()); + auto convOp = ConvertLayoutOp::create( + rewriter, loc, loadOp.getType(), newLoadOp.getResult()); loadOp->replaceAllUsesWith(convOp); opToDelete.insert(loadOp); LLVM_DEBUG({ @@ -1061,9 +1061,9 @@ void LayoutRematerialization::reduceLoopCarriedValues() { .getEncoding(); RankedTensorType newDataType = dataType.cloneWithEncoding(encoding); auto convOp = - rewriter.create<ConvertLayoutOp>(loc, newDataType, data); - auto newStoreOp = rewriter.create<StoreOp>( - loc, rematRes, convOp, storeOp.getBoundaryCheck(), + ConvertLayoutOp::create(rewriter, loc, newDataType, data); + auto newStoreOp = StoreOp::create( + rewriter, loc, rematRes, convOp, storeOp.getBoundaryCheck(), storeOp.getCache(), storeOp.getEvict()); opToDelete.insert(storeOp); LLVM_DEBUG({ @@ -1073,8 +1073,9 @@ void LayoutRematerialization::reduceLoopCarriedValues() { }); }) .Case<AdvanceOp>([&](auto advanceOp) { - auto newAdvanceOp = rewriter.create<AdvanceOp>( - loc, rematRes.getType(), rematRes, advanceOp.getOffsets()); + auto newAdvanceOp = + AdvanceOp::create(rewriter, loc, rematRes.getType(), rematRes, + advanceOp.getOffsets()); opToDelete.insert(advanceOp); LLVM_DEBUG({ DBGS() << "Replaced:\n\t" << *advanceOp << "\n" @@ -1275,7 +1276,7 @@ void LayoutRematerialization::rewriteSlice(SetVector<Value> &slice, for (int operandIdx : operandsToRewrite) { yieldOperands.push_back(mapping.lookup(yieldOp.getOperand(operandIdx))); } - builder.create<scf::YieldOp>(op->getLoc(), yieldOperands); + scf::YieldOp::create(builder, op->getLoc(), yieldOperands); op->erase(); continue; } @@ -1283,8 +1284,8 @@ void LayoutRematerialization::rewriteSlice(SetVector<Value> &slice, Operation *newOp =
builder.clone(*op); auto tensorType = cast<RankedTensorType>(op->getResult(0).getType()); auto newType = tensorType.cloneWithEncoding(layout[op->getResult(0)]); - auto cvt = builder.create<ConvertLayoutOp>(op->getLoc(), newType, - newOp->getResult(0)); + auto cvt = ConvertLayoutOp::create(builder, op->getLoc(), newType, + newOp->getResult(0)); mapping.map(op->getResult(0), cvt.getResult()); addRematValue(op->getResult(0), layout[op->getResult(0)], cvt.getResult()); @@ -1735,8 +1736,8 @@ void LayoutRematerialization::hoistConvertDotOperand( if (!type) continue; auto newType = type.cloneWithEncoding(layout[loadOp->getResult(0)]); - auto newConvertOp = builder.create<ConvertLayoutOp>( - convertOp.getLoc(), newType, loadOp->getResult(0)); + auto newConvertOp = ConvertLayoutOp::create(builder, convertOp.getLoc(), + newType, loadOp->getResult(0)); mapping.map(loadOp->getResult(0), newConvertOp.getResult()); } @@ -1837,8 +1838,8 @@ void LayoutRematerialization::hoistConvertOnTopOfExtOrBroadcast( auto tensorType = cast<RankedTensorType>(extOrBroadcastOp->getOperand(0).getType()); auto newType = tensorType.cloneWithEncoding(srcEncoding); - auto newConvertOp = builder.create<ConvertLayoutOp>( - convertOp.getLoc(), newType, extOrBroadcastOp->getOperand(0)); + auto newConvertOp = ConvertLayoutOp::create( + builder, convertOp.getLoc(), newType, extOrBroadcastOp->getOperand(0)); Operation *newExtOrBroadcast = builder.clone(*extOrBroadcastOp); newExtOrBroadcast->setOperand(0, newConvertOp.getResult()); auto oldExtOrBroadcastType = @@ -1951,7 +1952,7 @@ void LayoutRematerialization::hoistConvertIntoConditionals( auto hoistRemat = [&](OpBuilder &b, Value v, Attribute encoding) { auto tensorType = cast<RankedTensorType>(v.getType()); auto newType = tensorType.cloneWithEncoding(encoding); - Value newCvt = b.create<ConvertLayoutOp>(convertOp.getLoc(), newType, v); + Value newCvt = ConvertLayoutOp::create(b, convertOp.getLoc(), newType, v); mapping.map(v, newCvt); slice.remove(v); diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteStackPtr.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteStackPtr.cpp index 70d4bd5d9d..5dbb629f7e 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/RewriteStackPtr.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/RewriteStackPtr.cpp @@ -64,7 +64,7 @@ struct TritonIntelGPURewriteStackPtrPass builder.setInsertionPoint(addressOp); Value newValue; if (usePoison) { - newValue = builder.create<LLVM::PoisonOp>(addressOp.getLoc(), ptrTy); + newValue = LLVM::PoisonOp::create(builder, addressOp.getLoc(), ptrTy); } else { auto funcOp = addressOp->getParentOfType<LLVM::LLVMFuncOp>(); assert(funcOp && "AddressOfOp must be inside a function"); diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp index 9c9ca9d9f2..879e9aee25 100644 --- a/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp +++ b/third_party/intel/lib/TritonIntelGPUTransforms/Utility.cpp @@ -324,8 +324,8 @@ LLVM::LLVMFuncOp lookupOrCreateSPIRVFn(Operation *symbolTable, StringRef name, SymbolTable::lookupSymbolIn(symbolTable, name)); if (!func) { OpBuilder b(symbolTable->getRegion(0)); - func = b.create<LLVM::LLVMFuncOp>( - symbolTable->getLoc(), name, + func = LLVM::LLVMFuncOp::create( + b, symbolTable->getLoc(), name, LLVM::LLVMFunctionType::get(resultType, paramTypes)); func.setCConv(LLVM::cconv::CConv::SPIR_FUNC); } @@ -335,7 +335,7 @@ LLVM::LLVMFuncOp lookupOrCreateSPIRVFn(Operation *symbolTable, StringRef name, LLVM::CallOp createSPIRVBuiltinCall(Location loc, ConversionPatternRewriter &rewriter, LLVM::LLVMFuncOp func, ValueRange args) { - auto call =
rewriter.create<LLVM::CallOp>(loc, func, args); + auto call = LLVM::CallOp::create(rewriter, loc, func, args); call.setCConv(func.getCConv()); return call; } diff --git a/third_party/intel/lib/Utils/LLVMIntr.cpp b/third_party/intel/lib/Utils/LLVMIntr.cpp index 5334082e1e..d5c39e3b93 100644 --- a/third_party/intel/lib/Utils/LLVMIntr.cpp +++ b/third_party/intel/lib/Utils/LLVMIntr.cpp @@ -31,7 +31,7 @@ LLVM::CallOp createDeviceFunctionCall( if (!passthroughAttrs.getFnAttributes().empty()) funcOp->setAttrs(passthroughAttrs.getFnAttributes().getDictionary(ctx)); - auto callOp = rewriter.create<LLVM::CallOp>(loc, funcOp, args); + auto callOp = LLVM::CallOp::create(rewriter, loc, funcOp, args); callOp->setAttrs(funcOp->getAttrs()); return callOp; diff --git a/third_party/intel/lib/Utils/LibCallEmitter.cpp b/third_party/intel/lib/Utils/LibCallEmitter.cpp index 35297bc26f..b722aedc3e 100644 --- a/third_party/intel/lib/Utils/LibCallEmitter.cpp +++ b/third_party/intel/lib/Utils/LibCallEmitter.cpp @@ -50,8 +50,9 @@ static LLVM::LLVMFuncOp getSpirvPrintfDeclaration(RewriterBase &rewriter) { ConversionPatternRewriter::InsertionGuard guard(rewriter); rewriter.setInsertionPointToStart(moduleOp.getBody()); - auto printFunc = rewriter.create<LLVM::LLVMFuncOp>( - UnknownLoc::get(context), funcName, funcType, LLVM::Linkage::External, + auto printFunc = LLVM::LLVMFuncOp::create( + rewriter, UnknownLoc::get(context), funcName, funcType, + LLVM::Linkage::External, /*dsoLocal*/ false, LLVM::CConv::SPIR_FUNC, /*comdat=*/SymbolRefAttr{}); printFunc->setAttr("nounwind", rewriter.getUnitAttr()); @@ -77,8 +78,8 @@ static LLVM::LLVMFuncOp getAssertfailDeclaration(RewriterBase &rewriter) { RewriterBase::InsertionGuard guard(rewriter); rewriter.setInsertionPointToStart(moduleOp.getBody()); - auto func = rewriter.create<LLVM::LLVMFuncOp>(UnknownLoc::get(ctx), funcName, - funcType); + auto func = LLVM::LLVMFuncOp::create(rewriter, UnknownLoc::get(ctx), funcName, + funcType); func.setCConv(LLVM::cconv::CConv::SPIR_FUNC); return func; } @@ -91,7 +92,7 @@ Value LibCallEmitter::getGlobalStringStart(Location loc, RewriterBase &rewriter, getGlobalString(loc, rewriter, name, value, addressSpace); MLIRContext *ctx = rewriter.getContext(); Type globalPtrType = ptr_ty(ctx, addressSpace); - Value globalPtr = rewriter.create<LLVM::AddressOfOp>(loc, global); + Value globalPtr = LLVM::AddressOfOp::create(rewriter, loc, global); return b.gep(globalPtrType, i8_ty, globalPtr, LLVM::GEPArg{0}); } @@ -115,8 +116,8 @@ LLVM::GlobalOp LibCallEmitter::getGlobalString(Location loc, auto createGlobal = [&](StringRef name) { RewriterBase::InsertionGuard guard(rewriter); rewriter.setInsertionPointToStart(moduleOp.getBody()); - return rewriter.create<LLVM::GlobalOp>( - rewriter.getUnknownLoc(), globalType, + return LLVM::GlobalOp::create( + rewriter, rewriter.getUnknownLoc(), globalType, /*isConstant=*/true, LLVM::Linkage::Internal, name, valueAttr, /*alignment=*/0, addressSpace); }; diff --git a/third_party/intel/unittest/Conversion/TritonIntelGPUToLLVM/XeAsmFormatTest.cpp b/third_party/intel/unittest/Conversion/TritonIntelGPUToLLVM/XeAsmFormatTest.cpp index 315ee9a690..891835cffd 100644 --- a/third_party/intel/unittest/Conversion/TritonIntelGPUToLLVM/XeAsmFormatTest.cpp +++ b/third_party/intel/unittest/Conversion/TritonIntelGPUToLLVM/XeAsmFormatTest.cpp @@ -24,10 +24,10 @@ class XeAsmFormatTest : public ::testing::Test { builder.setInsertionPointToStart(&block); // a b1 value for predicate.
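// The test block below materializes its asm operands as arith constants:
// v[0] is a 1-bit predicate and v[1..numValues] are 32-bit values. In IR,
// a sketch of what gets built: %pred = arith.constant 1 : i1 and
// %v = arith.constant <i> : i32 for each i.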
- v[0] = builder.create<arith::ConstantIntOp>(builder.getUnknownLoc(), 1, 1); + v[0] = arith::ConstantIntOp::create(builder, builder.getUnknownLoc(), 1, 1); for (int i = 0; i < numValues; i++) { v[i + 1] = - builder.create<arith::ConstantIntOp>(builder.getUnknownLoc(), i, 32); + arith::ConstantIntOp::create(builder, builder.getUnknownLoc(), i, 32); } } diff --git a/third_party/nvidia/lib/Dialect/NVWS/Transforms/LowerAref.cpp b/third_party/nvidia/lib/Dialect/NVWS/Transforms/LowerAref.cpp index 81c88c7676..5ef27f5f02 100644 --- a/third_party/nvidia/lib/Dialect/NVWS/Transforms/LowerAref.cpp +++ b/third_party/nvidia/lib/Dialect/NVWS/Transforms/LowerAref.cpp @@ -305,10 +305,9 @@ void createTMALoad(triton::nvws::DescriptorLoadOp op, PatternRewriter &rewriter, } } } - auto newLoadOp = - rewriter.create<triton::nvidia_gpu::AsyncTMACopyGlobalToLocalOp>( - op.getLoc(), op.getDesc(), indices, barrierAlloc, op.getResult(), - pred); + auto newLoadOp = triton::nvidia_gpu::AsyncTMACopyGlobalToLocalOp::create( + rewriter, op.getLoc(), op.getDesc(), indices, barrierAlloc, + op.getResult(), pred); assignStageCluster(newLoadOp, getPartitionWsTagIds(op), getStageCluster(op), rewriter); }; diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp index 8860ce6d59..054d95cc7e 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ConvertWarpSpecializeToLLVM.cpp @@ -91,8 +91,8 @@ static void createBarrier(TritonLLVMIRRewriter &b, unsigned barIdx, if (numThreads == 32) LLVM::NVIDIA::createSyncWarp(b.getLoc(), b); else - NVVM::BarrierOp::create(b, b.getLoc(), b.i32_val(barIdx), - b.i32_val(numThreads)); + NVVM::BarrierOp::create(b, b.getLoc(), TypeRange{}, b.i32_val(barIdx), + b.i32_val(numThreads), {}, Value{}); } static void createAllBarrier(TritonLLVMIRRewriter &b, unsigned barIdx) { diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ElementwiseOpToLLVM.cpp index 3cea4e1844..d62d399596 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ElementwiseOpToLLVM.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/ElementwiseOpToLLVM.cpp @@ -650,7 +650,7 @@ struct ExpOpConversionApprox Value prod = b.fmul(f32_ty, operands[0][0], b.f32_val(log2e)); Type resultTy = operands[0][0].getType(); - StringRef name = "llvm.nvvm.ex2.approx.f"; + StringRef name = "llvm.nvvm.ex2.approx.f32"; auto callOp = LLVM::createLLVMIntrinsicCallOp(rewriter, loc, name, resultTy, {prod}); return {callOp.getResult(0)}; diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp index 5d9bea6e27..fa0bd25600 100644 --- a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp +++ b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/LoadStoreOpToLLVM.cpp @@ -1803,8 +1803,8 @@ struct AsyncCopyMbarrierArriveOpConversion typeConverter->convertType(op.getBarrier().getType().getElementType()), rewriter); TritonLLVMOpBuilder b(loc, rewriter); - NVVM::CpAsyncMBarrierArriveSharedOp::create(rewriter, loc, - barrierMemObj.getBase(), noinc); + NVVM::CpAsyncMBarrierArriveOp::create(rewriter, loc, + barrierMemObj.getBase(), noinc); op->erase(); return success(); } diff --git a/third_party/proton/test/test_instrumentation.py b/third_party/proton/test/test_instrumentation.py index 271b0ff835..6eb6ae9235 100644 --- a/third_party/proton/test/test_instrumentation.py +++
b/third_party/proton/test/test_instrumentation.py @@ -15,6 +15,7 @@ is_cuda, is_hip, is_hip_cdna2, + is_hip_cdna4, supports_tma, supports_ws, ) @@ -643,6 +644,7 @@ def foo(x, y, size: tl.constexpr): assert trace_events[-1]["args"]["call_stack"][-2] == "test" +@pytest.mark.skipif(is_hip_cdna4(), reason="nondeterministic failure") def test_globaltime(tmp_path: pathlib.Path): temp_file = tmp_path / "test_globaltime.chrome_trace" mode = proton.mode.Default(