From 7f50dcfa231e65fd6ff47a828f5c56f48a88142b Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Mon, 18 Nov 2024 13:47:27 -0700 Subject: [PATCH 01/18] Prevent UB in div/rem instructions during optimization --- third_party/intel/triton_xpu.cc | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index e6d13915ee..bf65bc1a50 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -6,6 +6,7 @@ #include "llvm/Passes/PassPlugin.h" #include "llvm/Passes/StandardInstrumentations.h" #include "llvm/Transforms/InstCombine/InstCombine.h" +#include "llvm/Transforms/Scalar/DivRemPairs.h" #include "intel/include/Dialect/TritonGEN/IR/TritonGENDialect.h" #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h" @@ -204,6 +205,17 @@ void init_triton_intel(py::module &&m) { fpm.addPass(BreakStructPhiNodesPass()); fpm.addPass(InstCombinePass()); }); + pb.registerPeepholeEPCallback( + [&](llvm::FunctionPassManager &fpm, llvm::OptimizationLevel level) { + // The Triton masked load pattern can generate instances where the + // mask false path appears to cause undefined behavior during + // computation. Even though the result of that behavior will never be + // used, LLVM can choose to optimize away the false path resulting in + // an incorrect result for the kernel. Adding `DivRemPairsPass` + // introduces freeze instructions which prevent UB from leaking into + // div/rem instructions. + fpm.addPass(DivRemPairsPass()); + }); mpm.addPass(pb.buildPerModuleDefaultPipeline(opt)); mpm.run(*mod, mam); }); From 1dd55a48ad7a0c5fc4ca8df05abc831faaf4197a Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Mon, 18 Nov 2024 19:32:07 -0700 Subject: [PATCH 02/18] Add regression test 1/? --- python/test/regression/test_divide.py | 75 +++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 python/test/regression/test_divide.py diff --git a/python/test/regression/test_divide.py b/python/test/regression/test_divide.py new file mode 100644 index 0000000000..57f0a8ea84 --- /dev/null +++ b/python/test/regression/test_divide.py @@ -0,0 +1,75 @@ +import torch +aten = torch.ops.aten + +import pytest + +import triton +import triton.language as tl + + +def patch_kernel(template, to_replace): + kernel = triton.JITFunction(template.fn) + for key, value in to_replace.items(): + kernel.src = kernel.src.replace(key, value) + return kernel + +def test_divide(device): + # regression test for various division cases + + @triton.jit + def divide_kernel(a, b, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, xnumel, XBLOCK: tl.constexpr): + xoffset = tl.program_id(0) * XBLOCK + xindex = xoffset + tl.arange(0, XBLOCK)[:] + xmask = xindex < xnumel + x0 = xindex + tmp0 = tl.load(a + (x0), xmask) + tmp2 = tl.load(b + (x0), xmask) + # custom bits + tmp1 = tmp0.to(tl.float32) + tmp3 = tmp2.to(tl.float32) + tmp4 = tmp1 / tmp3 + tmp5 = tl.where((tmp0 < 0) != (tmp2 < 0), tl.where(tmp0 % tmp2 != 0, tmp0 // tmp2 - 1, tmp0 // tmp2), tmp0 // tmp2) + tmp6 = tmp0 // tmp2 + tl.store(out_ptr0 + (x0), tmp4, xmask) + tl.store(out_ptr1 + (x0), tmp5, xmask) + tl.store(out_ptr2 + (x0), tmp6, xmask) + tl.store(out_ptr3 + (x0), tmp4, xmask) + tl.store(out_ptr4 + (x0), tmp5, xmask) + + torch.manual_seed(0) + + def launch_triton(a, b): + output0 = torch.empty_like(a) + output1 = torch.empty_like(a) + output2 = torch.empty_like(a) + output3 = torch.empty_like(a) + output4 = torch.empty_like(a) + + n_elements = output0.numel() + + grid = lambda meta: (triton.cdiv(n_elements, meta['XBLOCK']), ) + + divide_kernel[grid](a, b, output0, output1, output2, output3, output4, n_elements, XBLOCK=128) + + return (output0, output1, output2, output3, output4) + + def launch_torch(a, b): + return ( + aten.div(a, b, rounding_mode=None), + aten.div(a, b, rounding_mode="floor"), + aten.div(a, b, rounding_mode="trunc"), + a / b, + a // b, + ) + + a = torch.randint(2**32, 2**40, [100, 100], device=device) + b = torch.randint(-10, -1, [100, 100], device=device) + + for iter in range(100): + triton_result = launch_triton(a, b) + torch_result = launch_torch(a, b) + + for i in range(5): + torch.testing.assert_close(triton_result[i], torch_result[i], check_dtype=False, msg=lambda msg: f"Iteration {iter}, {i} failed\n{msg}") + + From c728635a1366e2b5684e0de0c5e7daf2e8d8bf44 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Mon, 18 Nov 2024 20:12:42 -0700 Subject: [PATCH 03/18] Parametrize test_divide (2/?) --- python/test/regression/test_divide.py | 43 +++++++++++++++------------ 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/python/test/regression/test_divide.py b/python/test/regression/test_divide.py index 57f0a8ea84..44a46765db 100644 --- a/python/test/regression/test_divide.py +++ b/python/test/regression/test_divide.py @@ -12,8 +12,11 @@ def patch_kernel(template, to_replace): for key, value in to_replace.items(): kernel.src = kernel.src.replace(key, value) return kernel - -def test_divide(device): + +@pytest.mark.parametrize("float_div", [True, False]) +@pytest.mark.parametrize("floor", [True, False]) +@pytest.mark.parametrize("trunc", [True, False]) +def test_divide(float_div, floor, trunc, device): # regression test for various division cases @triton.jit @@ -30,36 +33,38 @@ def divide_kernel(a, b, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, xnumel tmp4 = tmp1 / tmp3 tmp5 = tl.where((tmp0 < 0) != (tmp2 < 0), tl.where(tmp0 % tmp2 != 0, tmp0 // tmp2 - 1, tmp0 // tmp2), tmp0 // tmp2) tmp6 = tmp0 // tmp2 - tl.store(out_ptr0 + (x0), tmp4, xmask) - tl.store(out_ptr1 + (x0), tmp5, xmask) - tl.store(out_ptr2 + (x0), tmp6, xmask) - tl.store(out_ptr3 + (x0), tmp4, xmask) - tl.store(out_ptr4 + (x0), tmp5, xmask) + GENERATE_OUTPUTS_HERE torch.manual_seed(0) + outputs_float_div = "tl.store(out_ptr0 + (x0), tmp4, xmask)\n tl.store(out_ptr3 + (x0), tmp4, xmask)" if float_div is True else "" + outputs_floor = "\n tl.store(out_ptr1 + (x0), tmp5, xmask)\n tl.store(out_ptr4 + (x0), tmp5, xmask)" if floor is True else "" + outputs_trunc = "\n tl.store(out_ptr2 + (x0), tmp6, xmask)" if trunc is True else "" + + divide_kernel = patch_kernel(divide_kernel, {"GENERATE_OUTPUTS_HERE": f"{outputs_float_div}\n{outputs_floor}\n{outputs_trunc}"}) + def launch_triton(a, b): - output0 = torch.empty_like(a) - output1 = torch.empty_like(a) - output2 = torch.empty_like(a) - output3 = torch.empty_like(a) - output4 = torch.empty_like(a) + output0 = torch.zeros_like(a) + output1 = torch.zeros_like(a) + output2 = torch.zeros_like(a) + output3 = torch.zeros_like(a) + output4 = torch.zeros_like(a) n_elements = output0.numel() grid = lambda meta: (triton.cdiv(n_elements, meta['XBLOCK']), ) - + divide_kernel[grid](a, b, output0, output1, output2, output3, output4, n_elements, XBLOCK=128) return (output0, output1, output2, output3, output4) def launch_torch(a, b): return ( - aten.div(a, b, rounding_mode=None), - aten.div(a, b, rounding_mode="floor"), - aten.div(a, b, rounding_mode="trunc"), - a / b, - a // b, + aten.div(a, b, rounding_mode=None) if float_div is True else torch.zeros_like(a), + aten.div(a, b, rounding_mode="floor") if floor is True else torch.zeros_like(a), + aten.div(a, b, rounding_mode="trunc") if trunc is True else torch.zeros_like(a), + a / b if float_div is True else torch.zeros_like(a), + a // b if floor is True else torch.zeros_like(a), ) a = torch.randint(2**32, 2**40, [100, 100], device=device) @@ -70,6 +75,6 @@ def launch_torch(a, b): torch_result = launch_torch(a, b) for i in range(5): - torch.testing.assert_close(triton_result[i], torch_result[i], check_dtype=False, msg=lambda msg: f"Iteration {iter}, {i} failed\n{msg}") + torch.testing.assert_close(triton_result[i], torch_result[i], check_dtype=False, msg=lambda msg: f"Float: {float_div}, Floor: {floor}, Trunc: {trunc}\nIteration {iter}, {i} failed\n{msg}") From e36f345301ba0a4b955554794014c3e32184b2fa Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Tue, 19 Nov 2024 14:57:58 -0700 Subject: [PATCH 04/18] fixup format in test_divide --- python/test/regression/test_divide.py | 44 +++++++++++++++------------ 1 file changed, 24 insertions(+), 20 deletions(-) diff --git a/python/test/regression/test_divide.py b/python/test/regression/test_divide.py index 44a46765db..a9c1da053c 100644 --- a/python/test/regression/test_divide.py +++ b/python/test/regression/test_divide.py @@ -1,10 +1,11 @@ +# flake8: noqa: F821, F841 import torch -aten = torch.ops.aten - -import pytest +import pytest import triton -import triton.language as tl +import triton.language as tl + +aten = torch.ops.aten def patch_kernel(template, to_replace): @@ -13,13 +14,14 @@ def patch_kernel(template, to_replace): kernel.src = kernel.src.replace(key, value) return kernel + @pytest.mark.parametrize("float_div", [True, False]) @pytest.mark.parametrize("floor", [True, False]) @pytest.mark.parametrize("trunc", [True, False]) def test_divide(float_div, floor, trunc, device): - # regression test for various division cases + # regression test for various division cases - @triton.jit + @triton.jit def divide_kernel(a, b, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, xnumel, XBLOCK: tl.constexpr): xoffset = tl.program_id(0) * XBLOCK xindex = xoffset + tl.arange(0, XBLOCK)[:] @@ -27,11 +29,12 @@ def divide_kernel(a, b, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, xnumel x0 = xindex tmp0 = tl.load(a + (x0), xmask) tmp2 = tl.load(b + (x0), xmask) - # custom bits + # custom bits tmp1 = tmp0.to(tl.float32) tmp3 = tmp2.to(tl.float32) tmp4 = tmp1 / tmp3 - tmp5 = tl.where((tmp0 < 0) != (tmp2 < 0), tl.where(tmp0 % tmp2 != 0, tmp0 // tmp2 - 1, tmp0 // tmp2), tmp0 // tmp2) + tmp5 = tl.where((tmp0 < 0) != (tmp2 < 0), tl.where(tmp0 % tmp2 != 0, tmp0 // tmp2 - 1, tmp0 // tmp2), + tmp0 // tmp2) tmp6 = tmp0 // tmp2 GENERATE_OUTPUTS_HERE @@ -41,7 +44,8 @@ def divide_kernel(a, b, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, xnumel outputs_floor = "\n tl.store(out_ptr1 + (x0), tmp5, xmask)\n tl.store(out_ptr4 + (x0), tmp5, xmask)" if floor is True else "" outputs_trunc = "\n tl.store(out_ptr2 + (x0), tmp6, xmask)" if trunc is True else "" - divide_kernel = patch_kernel(divide_kernel, {"GENERATE_OUTPUTS_HERE": f"{outputs_float_div}\n{outputs_floor}\n{outputs_trunc}"}) + divide_kernel = patch_kernel(divide_kernel, + {"GENERATE_OUTPUTS_HERE": f"{outputs_float_div}\n{outputs_floor}\n{outputs_trunc}"}) def launch_triton(a, b): output0 = torch.zeros_like(a) @@ -57,15 +61,15 @@ def launch_triton(a, b): divide_kernel[grid](a, b, output0, output1, output2, output3, output4, n_elements, XBLOCK=128) return (output0, output1, output2, output3, output4) - + def launch_torch(a, b): - return ( - aten.div(a, b, rounding_mode=None) if float_div is True else torch.zeros_like(a), - aten.div(a, b, rounding_mode="floor") if floor is True else torch.zeros_like(a), - aten.div(a, b, rounding_mode="trunc") if trunc is True else torch.zeros_like(a), - a / b if float_div is True else torch.zeros_like(a), - a // b if floor is True else torch.zeros_like(a), - ) + return ( + aten.div(a, b, rounding_mode=None) if float_div is True else torch.zeros_like(a), + aten.div(a, b, rounding_mode="floor") if floor is True else torch.zeros_like(a), + aten.div(a, b, rounding_mode="trunc") if trunc is True else torch.zeros_like(a), + a / b if float_div is True else torch.zeros_like(a), + a // b if floor is True else torch.zeros_like(a), + ) a = torch.randint(2**32, 2**40, [100, 100], device=device) b = torch.randint(-10, -1, [100, 100], device=device) @@ -75,6 +79,6 @@ def launch_torch(a, b): torch_result = launch_torch(a, b) for i in range(5): - torch.testing.assert_close(triton_result[i], torch_result[i], check_dtype=False, msg=lambda msg: f"Float: {float_div}, Floor: {floor}, Trunc: {trunc}\nIteration {iter}, {i} failed\n{msg}") - - + torch.testing.assert_close( + triton_result[i], torch_result[i], check_dtype=False, msg=lambda msg: + f"Float: {float_div}, Floor: {floor}, Trunc: {trunc}\nIteration {iter}, {i} failed\n{msg}") From e0310dd1fb5b3cee439e34e2e8947739b5073fa2 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Tue, 19 Nov 2024 20:33:54 -0700 Subject: [PATCH 05/18] LLVM freeze instruction between mask and div 1/? --- third_party/intel/lib/LLVMIR/CMakeLists.txt | 6 ++ .../lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 63 +++++++++++++++++++ third_party/intel/lib/LLVMIR/LLVMPasses.h | 11 ++++ 3 files changed, 80 insertions(+) create mode 100644 third_party/intel/lib/LLVMIR/CMakeLists.txt create mode 100644 third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp create mode 100644 third_party/intel/lib/LLVMIR/LLVMPasses.h diff --git a/third_party/intel/lib/LLVMIR/CMakeLists.txt b/third_party/intel/lib/LLVMIR/CMakeLists.txt new file mode 100644 index 0000000000..408e229590 --- /dev/null +++ b/third_party/intel/lib/LLVMIR/CMakeLists.txt @@ -0,0 +1,6 @@ +add_triton_library(TritonIntelLLVMIR + LLVMIRFreezeMaskedDivRem.cpp + + DEPENDS + LLVMIRIncGen + ) \ No newline at end of file diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp new file mode 100644 index 0000000000..d224adb881 --- /dev/null +++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp @@ -0,0 +1,63 @@ +#include "LLVMPasses.h" +#include "llvm/IR/Instructions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/Dominators.h" + +using namespace llvm; + +static bool processPhiNode(PHINode *phiNode, BasicBlock& BB) { + llvm::errs() << "YOLO: " << *phiNode << "\n"; + + const auto phiHasNullValue = any_of(phiNode->incoming_values(), [](Use& U) { + if (Constant *C = dyn_cast(&U)) { + return C->isNullValue(); + } + return false; + }); + + if (phiHasNullValue) { + auto FindUse = llvm::find_if(phiNode->users(), [](auto *U) { + auto *Use = cast(U); + llvm::errs() << "User: " << *Use << "\n"; + return (Use->getOpcode() == Instruction::SDiv); + }); + if (FindUse == phiNode->user_end()) { + llvm::errs() << "no div :(\n"; + return false; + } + auto *Use = cast(*FindUse); + assert() + llvm::errs() << "Got our user! " << *Use << "\n"; + + // insert freeze between phi and sdiv + // + } + return false; +} + +static bool runOnFunction(Function& F, const TargetTransformInfo &TTI, + const DominatorTree &DT) { + bool Changed = false; + + SmallVector PhiNodes; + for (BasicBlock &BB : F) { + for (Instruction &inst : BB) { + if (PHINode *phiNode = dyn_cast(&inst)) { + Changed |= processPhiNode(phiNode, BB); + continue; + } + break; + } + } + + return Changed; +} + + PreservedAnalyses FreezeMaskedDivRemPass::run(Function &F, FunctionAnalysisManager &FAM) { + TargetTransformInfo &TTI = FAM.getResult(F); + DominatorTree &DT = FAM.getResult(F); + const auto b = runOnFunction(F, TTI, DT); + + return b ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} \ No newline at end of file diff --git a/third_party/intel/lib/LLVMIR/LLVMPasses.h b/third_party/intel/lib/LLVMIR/LLVMPasses.h new file mode 100644 index 0000000000..5e70c38e22 --- /dev/null +++ b/third_party/intel/lib/LLVMIR/LLVMPasses.h @@ -0,0 +1,11 @@ +#include "llvm/IR/PassManager.h" +#include "llvm/Pass.h" + +namespace llvm { + +struct FreezeMaskedDivRemPass : PassInfoMixin { + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); + static StringRef name() { return "FreezeMaskedDivRemPass"; } +}; + +} \ No newline at end of file From 2243b25ee39a240fe8ebf4481834eb3b717d8b76 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Tue, 19 Nov 2024 20:34:53 -0700 Subject: [PATCH 06/18] LLVM freeze instruction between mask and div 2/? --- third_party/intel/lib/CMakeLists.txt | 1 + third_party/intel/triton_xpu.cc | 8 +++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/third_party/intel/lib/CMakeLists.txt b/third_party/intel/lib/CMakeLists.txt index b2d8e610d0..2b58d7f122 100644 --- a/third_party/intel/lib/CMakeLists.txt +++ b/third_party/intel/lib/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(Analysis) add_subdirectory(Dialect) add_subdirectory(GPUToTritonGEN) +add_subdirectory(LLVMIR) add_subdirectory(Target) add_subdirectory(TritonAnnotateModule) add_subdirectory(TritonGENToLLVM) diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index bf65bc1a50..582d21a3ad 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -17,6 +17,7 @@ #include "intel/include/TritonAnnotateModule/Passes.h" #include "intel/include/TritonIntelGPUToLLVM/Passes.h" #include "intel/include/TritonToTritonGPUWarp/Passes.h" +#include "intel/lib/LLVMIR/LLVMPasses.h" #include "triton/Target/SPIRV/SPIRVTranslation.h" #include "triton/Tools/Sys/GetEnv.hpp" @@ -205,6 +206,7 @@ void init_triton_intel(py::module &&m) { fpm.addPass(BreakStructPhiNodesPass()); fpm.addPass(InstCombinePass()); }); +#if 1 pb.registerPeepholeEPCallback( [&](llvm::FunctionPassManager &fpm, llvm::OptimizationLevel level) { // The Triton masked load pattern can generate instances where the @@ -214,8 +216,12 @@ void init_triton_intel(py::module &&m) { // an incorrect result for the kernel. Adding `DivRemPairsPass` // introduces freeze instructions which prevent UB from leaking into // div/rem instructions. - fpm.addPass(DivRemPairsPass()); + // fpm.addPass(DivRemPairsPass()); + fpm.addPass(FreezeMaskedDivRemPass()); }); +#else + mpm.addPass(createModuleToFunctionPassAdaptor(FreezeMaskedDivRemPass())); +#endif mpm.addPass(pb.buildPerModuleDefaultPipeline(opt)); mpm.run(*mod, mam); }); From 6c6a0f0614adc9dc2e4fbed4a3862a09d00be60c Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Wed, 20 Nov 2024 10:11:45 -0700 Subject: [PATCH 07/18] LLVM freeze instruction between mask and div 3/? --- .../intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp index d224adb881..971aa49867 100644 --- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp +++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp @@ -16,6 +16,7 @@ static bool processPhiNode(PHINode *phiNode, BasicBlock& BB) { return false; }); + bool Changed = false; if (phiHasNullValue) { auto FindUse = llvm::find_if(phiNode->users(), [](auto *U) { auto *Use = cast(U); @@ -27,13 +28,15 @@ static bool processPhiNode(PHINode *phiNode, BasicBlock& BB) { return false; } auto *Use = cast(*FindUse); - assert() - llvm::errs() << "Got our user! " << *Use << "\n"; - - // insert freeze between phi and sdiv - // + if (Use->getOperand(1) == phiNode) { + llvm::errs() << "Got our user! " << *Use << "\n"; + llvm::errs() << "Operand 1: " << *Use->getOperand(1) << "\n"; + auto *freezePhi = new FreezeInst(phiNode, phiNode->getName() + ".frozen", Use->getIterator()); + Use->setOperand(1, freezePhi); + Changed = true; + } } - return false; + return Changed; } static bool runOnFunction(Function& F, const TargetTransformInfo &TTI, From 3ef26d06b2182079306099bd7e370777b3017f44 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Wed, 20 Nov 2024 10:45:30 -0700 Subject: [PATCH 08/18] LLVM freeze instruction between mask and div 4/? --- .../lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp index 971aa49867..72765167ae 100644 --- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp +++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp @@ -18,23 +18,37 @@ static bool processPhiNode(PHINode *phiNode, BasicBlock& BB) { bool Changed = false; if (phiHasNullValue) { + for (Instruction &I : BB) { + if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem) { + const size_t OpIdx = 1; // I.getOpcode() == Instruction::SRem ? 0 : 1; + if (I.getOperand(OpIdx) == phiNode) { + auto *freezePhi = new FreezeInst(phiNode, phiNode->getName() + ".frozen", I.getIterator()); + I.setOperand(OpIdx, freezePhi); + Changed = true; + } + } + } +#if 0 auto FindUse = llvm::find_if(phiNode->users(), [](auto *U) { auto *Use = cast(U); llvm::errs() << "User: " << *Use << "\n"; - return (Use->getOpcode() == Instruction::SDiv); + return (Use->getOpcode() == Instruction::SDiv || Use->getOpcode() == Instruction::SRem); }); if (FindUse == phiNode->user_end()) { llvm::errs() << "no div :(\n"; return false; } auto *Use = cast(*FindUse); - if (Use->getOperand(1) == phiNode) { + assert(Use->isIntDivRem()); + const size_t OpIdx = Use->getOpcode() == Instruction::SRem ? 0 : 1; + if (Use->getOperand(OpIdx) == phiNode) { llvm::errs() << "Got our user! " << *Use << "\n"; llvm::errs() << "Operand 1: " << *Use->getOperand(1) << "\n"; auto *freezePhi = new FreezeInst(phiNode, phiNode->getName() + ".frozen", Use->getIterator()); - Use->setOperand(1, freezePhi); + Use->setOperand(OpIdx, freezePhi); Changed = true; } +#endif } return Changed; } From a35b0edf0c899ead6eea2d425f06b7e566c925b0 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Wed, 20 Nov 2024 10:59:29 -0700 Subject: [PATCH 09/18] LLVM freeze instruction between mask and div 5/5 --- third_party/intel/lib/LLVMIR/CMakeLists.txt | 2 +- .../lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 91 +++++++------------ third_party/intel/lib/LLVMIR/LLVMPasses.h | 2 +- third_party/intel/triton_xpu.cc | 21 ++--- 4 files changed, 46 insertions(+), 70 deletions(-) diff --git a/third_party/intel/lib/LLVMIR/CMakeLists.txt b/third_party/intel/lib/LLVMIR/CMakeLists.txt index 408e229590..6da101e950 100644 --- a/third_party/intel/lib/LLVMIR/CMakeLists.txt +++ b/third_party/intel/lib/LLVMIR/CMakeLists.txt @@ -3,4 +3,4 @@ add_triton_library(TritonIntelLLVMIR DEPENDS LLVMIRIncGen - ) \ No newline at end of file + ) diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp index 72765167ae..630e791e54 100644 --- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp +++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp @@ -1,80 +1,57 @@ #include "LLVMPasses.h" -#include "llvm/IR/Instructions.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Dominators.h" +#include "llvm/IR/Instructions.h" using namespace llvm; -static bool processPhiNode(PHINode *phiNode, BasicBlock& BB) { - llvm::errs() << "YOLO: " << *phiNode << "\n"; - - const auto phiHasNullValue = any_of(phiNode->incoming_values(), [](Use& U) { - if (Constant *C = dyn_cast(&U)) { - return C->isNullValue(); - } - return false; - }); +static bool processPhiNode(PHINode *PhiNode, BasicBlock &BB) { + if (!any_of(PhiNode->incoming_values(), [](Use &U) { + if (Constant *C = dyn_cast(&U)) { + return C->isNullValue(); + } + return false; + })) { + return false; + } bool Changed = false; - if (phiHasNullValue) { - for (Instruction &I : BB) { - if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem) { - const size_t OpIdx = 1; // I.getOpcode() == Instruction::SRem ? 0 : 1; - if (I.getOperand(OpIdx) == phiNode) { - auto *freezePhi = new FreezeInst(phiNode, phiNode->getName() + ".frozen", I.getIterator()); - I.setOperand(OpIdx, freezePhi); - Changed = true; - } + for (Instruction &I : BB) { + if (I.getOpcode() == Instruction::SDiv || + I.getOpcode() == Instruction::SRem) { + const size_t OpIdx = 1; + if (I.getOperand(OpIdx) == PhiNode) { + auto *freezePhi = new FreezeInst( + PhiNode, PhiNode->getName() + ".frozen", I.getIterator()); + I.setOperand(OpIdx, freezePhi); + Changed = true; } } -#if 0 - auto FindUse = llvm::find_if(phiNode->users(), [](auto *U) { - auto *Use = cast(U); - llvm::errs() << "User: " << *Use << "\n"; - return (Use->getOpcode() == Instruction::SDiv || Use->getOpcode() == Instruction::SRem); - }); - if (FindUse == phiNode->user_end()) { - llvm::errs() << "no div :(\n"; - return false; - } - auto *Use = cast(*FindUse); - assert(Use->isIntDivRem()); - const size_t OpIdx = Use->getOpcode() == Instruction::SRem ? 0 : 1; - if (Use->getOperand(OpIdx) == phiNode) { - llvm::errs() << "Got our user! " << *Use << "\n"; - llvm::errs() << "Operand 1: " << *Use->getOperand(1) << "\n"; - auto *freezePhi = new FreezeInst(phiNode, phiNode->getName() + ".frozen", Use->getIterator()); - Use->setOperand(OpIdx, freezePhi); - Changed = true; - } -#endif } - return Changed; + return Changed; } -static bool runOnFunction(Function& F, const TargetTransformInfo &TTI, - const DominatorTree &DT) { - bool Changed = false; +static bool runOnFunction(Function &F) { + bool Changed = false; - SmallVector PhiNodes; - for (BasicBlock &BB : F) { - for (Instruction &inst : BB) { - if (PHINode *phiNode = dyn_cast(&inst)) { - Changed |= processPhiNode(phiNode, BB); + SmallVector PhiNodes; + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + if (PHINode *PhiNode = dyn_cast(&I)) { + Changed |= processPhiNode(PhiNode, BB); continue; } break; } } - return Changed; + return Changed; } - PreservedAnalyses FreezeMaskedDivRemPass::run(Function &F, FunctionAnalysisManager &FAM) { - TargetTransformInfo &TTI = FAM.getResult(F); - DominatorTree &DT = FAM.getResult(F); - const auto b = runOnFunction(F, TTI, DT); +PreservedAnalyses FreezeMaskedDivRemPass::run(Function &F, + FunctionAnalysisManager &FAM) { + const auto b = runOnFunction(F); - return b ? PreservedAnalyses::none() : PreservedAnalyses::all(); -} \ No newline at end of file + return b ? PreservedAnalyses::none() : PreservedAnalyses::all(); +} diff --git a/third_party/intel/lib/LLVMIR/LLVMPasses.h b/third_party/intel/lib/LLVMIR/LLVMPasses.h index 5e70c38e22..72f71dd983 100644 --- a/third_party/intel/lib/LLVMIR/LLVMPasses.h +++ b/third_party/intel/lib/LLVMIR/LLVMPasses.h @@ -8,4 +8,4 @@ struct FreezeMaskedDivRemPass : PassInfoMixin { static StringRef name() { return "FreezeMaskedDivRemPass"; } }; -} \ No newline at end of file +} // namespace llvm diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index 582d21a3ad..e6ad385924 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -206,22 +206,21 @@ void init_triton_intel(py::module &&m) { fpm.addPass(BreakStructPhiNodesPass()); fpm.addPass(InstCombinePass()); }); -#if 1 pb.registerPeepholeEPCallback( [&](llvm::FunctionPassManager &fpm, llvm::OptimizationLevel level) { // The Triton masked load pattern can generate instances where the - // mask false path appears to cause undefined behavior during - // computation. Even though the result of that behavior will never be - // used, LLVM can choose to optimize away the false path resulting in - // an incorrect result for the kernel. Adding `DivRemPairsPass` - // introduces freeze instructions which prevent UB from leaking into - // div/rem instructions. - // fpm.addPass(DivRemPairsPass()); + // mask value causes undefined behavior in sdiv/srem instructions. The + // language allows this UB as the result of those arithmetic + // instructions is never used, and control flow to avoid computation + // of these instructions would negatively affect performance. But, + // LLVM SimplifyCFG aggressively marks code paths with undefined + // behavior as dead. This can result in removal of the mask path and + // incorrect results from legal Triton kernels due to masked elements + // being used in computation. Run a pass to add a freeze instruction + // between masked loads and sdiv/srem to signal to LLVM we consider + // the sdiv/srem operands to be well defined. fpm.addPass(FreezeMaskedDivRemPass()); }); -#else - mpm.addPass(createModuleToFunctionPassAdaptor(FreezeMaskedDivRemPass())); -#endif mpm.addPass(pb.buildPerModuleDefaultPipeline(opt)); mpm.run(*mod, mam); }); From 302fd39bbff34492f047d76c2169a3a6f3f0e522 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Wed, 20 Nov 2024 11:02:16 -0700 Subject: [PATCH 10/18] fixup --- third_party/intel/triton_xpu.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc index e6ad385924..362e404c5f 100644 --- a/third_party/intel/triton_xpu.cc +++ b/third_party/intel/triton_xpu.cc @@ -6,7 +6,6 @@ #include "llvm/Passes/PassPlugin.h" #include "llvm/Passes/StandardInstrumentations.h" #include "llvm/Transforms/InstCombine/InstCombine.h" -#include "llvm/Transforms/Scalar/DivRemPairs.h" #include "intel/include/Dialect/TritonGEN/IR/TritonGENDialect.h" #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h" From 9f814ee41690705f046e0f029c64a27057a59134 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Thu, 21 Nov 2024 08:12:53 -0500 Subject: [PATCH 11/18] Remove unused variable Co-authored-by: Arun Jose <40291569+arunjose696@users.noreply.github.com> --- third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp index 630e791e54..cec40e6f1d 100644 --- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp +++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp @@ -35,7 +35,6 @@ static bool processPhiNode(PHINode *PhiNode, BasicBlock &BB) { static bool runOnFunction(Function &F) { bool Changed = false; - SmallVector PhiNodes; for (BasicBlock &BB : F) { for (Instruction &I : BB) { if (PHINode *PhiNode = dyn_cast(&I)) { From e9723dc64db2d1eafabafe23d3e9d8750b936b19 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Thu, 21 Nov 2024 07:10:50 -0700 Subject: [PATCH 12/18] rename processPhiNode -> processBasicBlock --- third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp index cec40e6f1d..0387f168b7 100644 --- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp +++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp @@ -6,7 +6,7 @@ using namespace llvm; -static bool processPhiNode(PHINode *PhiNode, BasicBlock &BB) { +static bool processBasicBlock(BasicBlock &BB, PHINode *PhiNode) { if (!any_of(PhiNode->incoming_values(), [](Use &U) { if (Constant *C = dyn_cast(&U)) { return C->isNullValue(); @@ -38,7 +38,7 @@ static bool runOnFunction(Function &F) { for (BasicBlock &BB : F) { for (Instruction &I : BB) { if (PHINode *PhiNode = dyn_cast(&I)) { - Changed |= processPhiNode(PhiNode, BB); + Changed |= processBasicBlock(BB, PhiNode); continue; } break; From 928afd194cdc27cd226e894e29ce712d385c501d Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Tue, 26 Nov 2024 19:46:12 +0000 Subject: [PATCH 13/18] simplify phi node incoming values constant check expression --- third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp index 0387f168b7..b5485e76be 100644 --- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp +++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp @@ -8,10 +8,8 @@ using namespace llvm; static bool processBasicBlock(BasicBlock &BB, PHINode *PhiNode) { if (!any_of(PhiNode->incoming_values(), [](Use &U) { - if (Constant *C = dyn_cast(&U)) { - return C->isNullValue(); - } - return false; + Constant *C = dyn_cast(&U); + return C && C->isNullValue(); })) { return false; } From 33491098658cd2c1f9dec518e01de7330277515a Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Tue, 26 Nov 2024 19:48:10 +0000 Subject: [PATCH 14/18] cleanup formatting in division test --- python/test/regression/test_divide.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/test/regression/test_divide.py b/python/test/regression/test_divide.py index a9c1da053c..282b7b5c50 100644 --- a/python/test/regression/test_divide.py +++ b/python/test/regression/test_divide.py @@ -40,9 +40,9 @@ def divide_kernel(a, b, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, xnumel torch.manual_seed(0) - outputs_float_div = "tl.store(out_ptr0 + (x0), tmp4, xmask)\n tl.store(out_ptr3 + (x0), tmp4, xmask)" if float_div is True else "" - outputs_floor = "\n tl.store(out_ptr1 + (x0), tmp5, xmask)\n tl.store(out_ptr4 + (x0), tmp5, xmask)" if floor is True else "" - outputs_trunc = "\n tl.store(out_ptr2 + (x0), tmp6, xmask)" if trunc is True else "" + outputs_float_div = "tl.store(out_ptr0 + (x0), tmp4, xmask)\n tl.store(out_ptr3 + (x0), tmp4, xmask)" if float_div else "" + outputs_floor = " tl.store(out_ptr1 + (x0), tmp5, xmask)\n tl.store(out_ptr4 + (x0), tmp5, xmask)" if floor else "" + outputs_trunc = " tl.store(out_ptr2 + (x0), tmp6, xmask)" if trunc else "" divide_kernel = patch_kernel(divide_kernel, {"GENERATE_OUTPUTS_HERE": f"{outputs_float_div}\n{outputs_floor}\n{outputs_trunc}"}) From f1a6029145f5258aa80c1e60bbacc6abc2662eb5 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Thu, 28 Nov 2024 17:15:00 +0000 Subject: [PATCH 15/18] add lit test --- bin/CMakeLists.txt | 4 ++ bin/triton-llvm-opt.cpp | 8 ++++ test/LLVMIR/freeze-masked-div-rem.ll | 57 ++++++++++++++++++++++++++++ 3 files changed, 69 insertions(+) create mode 100644 test/LLVMIR/freeze-masked-div-rem.ll diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt index b66ef71193..99c311536e 100644 --- a/bin/CMakeLists.txt +++ b/bin/CMakeLists.txt @@ -13,6 +13,9 @@ target_link_libraries(triton-opt PRIVATE TritonTransforms TritonGPUTransforms TritonNvidiaGPUTransforms + TritonIntelLLVMIR + TritonIntelGPUIR + TritonIntelGPUTransforms MLIRGPUToROCDLTransforms ${dialect_libs} ${conversion_libs} @@ -88,6 +91,7 @@ target_link_libraries(triton-llvm-opt PRIVATE LLVMSupport LLVMOption LLVMCodeGen + TritonIntelLLVMIR TritonIntelGPUIR ) export_executable_symbols_for_plugins(triton-llvm-opt) diff --git a/bin/triton-llvm-opt.cpp b/bin/triton-llvm-opt.cpp index 1ec804cb50..f521394f47 100644 --- a/bin/triton-llvm-opt.cpp +++ b/bin/triton-llvm-opt.cpp @@ -1,6 +1,7 @@ /// Trimmed down clone of llvm opt to be able to test triton custom llvm ir /// passes. #include "lib/Target/LLVMIR/LLVMPasses.h" +#include "third_party/intel/lib/LLVMIR/LLVMPasses.h" #include "llvm/CodeGen/CommandFlags.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -42,6 +43,11 @@ static cl::opt llvm::cl::desc("run pass to break phi struct"), cl::init(false)); +static cl::opt FreezeMaskedDivRem( + "freeze-masked-div-rem", + llvm::cl::desc("run pass to insert freeze between masked load and div/rem"), + cl::init(false)); + namespace { static std::function makeOptimizingPipeline() { return [](Module *m) -> Error { @@ -62,6 +68,8 @@ static std::function makeOptimizingPipeline() { llvm::FunctionPassManager fpm; if (BreakStructPhiNodes) fpm.addPass(BreakStructPhiNodesPass()); + if (FreezeMaskedDivRem) + fpm.addPass(FreezeMaskedDivRemPass()); mpm.addPass(createModuleToFunctionPassAdaptor(std::move(fpm))); mpm.run(*m, mam); return Error::success(); diff --git a/test/LLVMIR/freeze-masked-div-rem.ll b/test/LLVMIR/freeze-masked-div-rem.ll new file mode 100644 index 0000000000..6d29a2d0f6 --- /dev/null +++ b/test/LLVMIR/freeze-masked-div-rem.ll @@ -0,0 +1,57 @@ +; RUN: triton-llvm-opt -freeze-masked-div-rem %s | FileCheck %s + +define void @phi_div_of_zero_okay(i8 noundef %x, i8 %i, ptr %v) { +; CHECK-LABEL: @phi_div_of_zero_okay( +entry: + %cmp = icmp ult i8 %i, 9 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %y = load i8, ptr %v, align 8 + br label %if.end + +if.end: + %yy = phi i8 [ %y, %if.then ], [ 0, %entry ] + ; CHECK: [[F0:%.*]] = freeze i8 %yy + ; CHECK-NEXT: %z = sdiv i8 %x, [[F0:%.*]] + %z = sdiv i8 %x, %yy + br i1 %cmp, label %if2.then, label %if2.end + +if2.then: + store i8 %z, ptr %v, align 8 + br label %if2.end + +if2.end: + ret void +} + +define void @two_phi_div_of_zero_okay(i8 noundef %x, i8 %i, ptr %v) { +; CHECK-LABEL: @two_phi_div_of_zero_okay( +entry: + %cmp = icmp ult i8 %i, 9 + br i1 %cmp, label %if.then, label %if.end + +if.then: + %y = load i8, ptr %v, align 8 + %vv = getelementptr inbounds i64, ptr %v, i64 1 + %b = load i8, ptr %vv, align 8 + br label %if.end + +if.end: + %bb = phi i8 [ %b, %if.then ], [ undef, %entry ] + %yy = phi i8 [ %y, %if.then ], [ 0, %entry ] + ; CHECK: [[F0:%.*]] = freeze i8 %yy + ; CHECK-NEXT: %z = sdiv i8 %x, [[F0:%.*]] + %z = sdiv i8 %x, %yy + ; CHECK: [[F1:%.*]] = freeze i8 %bb + ; CHECK-NEXT: %zz = sdiv i8 %x, [[F1:%.*]] + %zz = sdiv i8 %x, %bb + br i1 %cmp, label %if2.then, label %if2.end + +if2.then: + store i8 %z, ptr %v, align 8 + br label %if2.end + +if2.end: + ret void +} \ No newline at end of file From 7ff052bea1d97978ec4170f06595a16deef93bd8 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Thu, 28 Nov 2024 17:56:52 +0000 Subject: [PATCH 16/18] support multiple phis and undef --- test/LLVMIR/freeze-masked-div-rem.ll | 6 +++--- .../intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 12 ++++-------- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/test/LLVMIR/freeze-masked-div-rem.ll b/test/LLVMIR/freeze-masked-div-rem.ll index 6d29a2d0f6..0909a0b994 100644 --- a/test/LLVMIR/freeze-masked-div-rem.ll +++ b/test/LLVMIR/freeze-masked-div-rem.ll @@ -15,7 +15,7 @@ if.end: ; CHECK: [[F0:%.*]] = freeze i8 %yy ; CHECK-NEXT: %z = sdiv i8 %x, [[F0:%.*]] %z = sdiv i8 %x, %yy - br i1 %cmp, label %if2.then, label %if2.end + br i1 %cmp, label %if2.then, label %if2.end if2.then: store i8 %z, ptr %v, align 8 @@ -46,7 +46,7 @@ if.end: ; CHECK: [[F1:%.*]] = freeze i8 %bb ; CHECK-NEXT: %zz = sdiv i8 %x, [[F1:%.*]] %zz = sdiv i8 %x, %bb - br i1 %cmp, label %if2.then, label %if2.end + br i1 %cmp, label %if2.then, label %if2.end if2.then: store i8 %z, ptr %v, align 8 @@ -54,4 +54,4 @@ if2.then: if2.end: ret void -} \ No newline at end of file +} diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp index b5485e76be..3d860c045b 100644 --- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp +++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp @@ -6,10 +6,10 @@ using namespace llvm; -static bool processBasicBlock(BasicBlock &BB, PHINode *PhiNode) { +static bool processPhiNode(BasicBlock &BB, PHINode *PhiNode) { if (!any_of(PhiNode->incoming_values(), [](Use &U) { Constant *C = dyn_cast(&U); - return C && C->isNullValue(); + return isa(U) || C && C->isNullValue(); })) { return false; } @@ -34,12 +34,8 @@ static bool runOnFunction(Function &F) { bool Changed = false; for (BasicBlock &BB : F) { - for (Instruction &I : BB) { - if (PHINode *PhiNode = dyn_cast(&I)) { - Changed |= processBasicBlock(BB, PhiNode); - continue; - } - break; + for (PHINode &PhiNode : BB.phis()) { + Changed |= processPhiNode(BB, &PhiNode); } } From ed6df2382df2bf5621377876333b8835ea0995cc Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Mon, 2 Dec 2024 22:43:38 +0000 Subject: [PATCH 17/18] remove unused libs --- bin/CMakeLists.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt index 99c311536e..c0398fb60b 100644 --- a/bin/CMakeLists.txt +++ b/bin/CMakeLists.txt @@ -14,8 +14,6 @@ target_link_libraries(triton-opt PRIVATE TritonGPUTransforms TritonNvidiaGPUTransforms TritonIntelLLVMIR - TritonIntelGPUIR - TritonIntelGPUTransforms MLIRGPUToROCDLTransforms ${dialect_libs} ${conversion_libs} From dc9c16ec7ca1151fcc0e04677a2c06f32132af84 Mon Sep 17 00:00:00 2001 From: Alex Baden Date: Tue, 3 Dec 2024 01:37:43 +0000 Subject: [PATCH 18/18] address review comments --- .../intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp index 3d860c045b..5344f92d70 100644 --- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp +++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp @@ -6,8 +6,8 @@ using namespace llvm; -static bool processPhiNode(BasicBlock &BB, PHINode *PhiNode) { - if (!any_of(PhiNode->incoming_values(), [](Use &U) { +static bool processPhiNode(PHINode *PhiNode) { + if (none_of(PhiNode->incoming_values(), [](Use &U) { Constant *C = dyn_cast(&U); return isa(U) || C && C->isNullValue(); })) { @@ -15,7 +15,8 @@ static bool processPhiNode(BasicBlock &BB, PHINode *PhiNode) { } bool Changed = false; - for (Instruction &I : BB) { + BasicBlock *BB = const_cast(PhiNode->getParent()); + for (Instruction &I : *BB) { if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem) { const size_t OpIdx = 1; @@ -35,7 +36,7 @@ static bool runOnFunction(Function &F) { for (BasicBlock &BB : F) { for (PHINode &PhiNode : BB.phis()) { - Changed |= processPhiNode(BB, &PhiNode); + Changed |= processPhiNode(&PhiNode); } }