From 7f50dcfa231e65fd6ff47a828f5c56f48a88142b Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Mon, 18 Nov 2024 13:47:27 -0700
Subject: [PATCH 01/18] Prevent UB in div/rem instructions during optimization

---
 third_party/intel/triton_xpu.cc | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
index e6d13915ee..bf65bc1a50 100644
--- a/third_party/intel/triton_xpu.cc
+++ b/third_party/intel/triton_xpu.cc
@@ -6,6 +6,7 @@
 #include "llvm/Passes/PassPlugin.h"
 #include "llvm/Passes/StandardInstrumentations.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/Scalar/DivRemPairs.h"
 
 #include "intel/include/Dialect/TritonGEN/IR/TritonGENDialect.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
@@ -204,6 +205,17 @@ void init_triton_intel(py::module &&m) {
           fpm.addPass(BreakStructPhiNodesPass());
           fpm.addPass(InstCombinePass());
         });
+    pb.registerPeepholeEPCallback(
+        [&](llvm::FunctionPassManager &fpm, llvm::OptimizationLevel level) {
+          // The Triton masked load pattern can generate instances where the
+          // mask false path appears to cause undefined behavior during
+          // computation. Even though the result of that behavior will never be
+          // used, LLVM can choose to optimize away the false path resulting in
+          // an incorrect result for the kernel. Adding `DivRemPairsPass`
+          // introduces freeze instructions which prevent UB from leaking into
+          // div/rem instructions.
+          fpm.addPass(DivRemPairsPass());
+        });
     mpm.addPass(pb.buildPerModuleDefaultPipeline(opt));
     mpm.run(*mod, mam);
   });

From 1dd55a48ad7a0c5fc4ca8df05abc831faaf4197a Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Mon, 18 Nov 2024 19:32:07 -0700
Subject: [PATCH 02/18] Add regression test 1/?

---
 python/test/regression/test_divide.py | 75 +++++++++++++++++++++++++++
 1 file changed, 75 insertions(+)
 create mode 100644 python/test/regression/test_divide.py

diff --git a/python/test/regression/test_divide.py b/python/test/regression/test_divide.py
new file mode 100644
index 0000000000..57f0a8ea84
--- /dev/null
+++ b/python/test/regression/test_divide.py
@@ -0,0 +1,75 @@
+import torch
+aten = torch.ops.aten
+
+import pytest 
+
+import triton
+import triton.language as tl 
+
+
+def patch_kernel(template, to_replace):
+    kernel = triton.JITFunction(template.fn)
+    for key, value in to_replace.items():
+        kernel.src = kernel.src.replace(key, value)
+    return kernel
+    
+def test_divide(device):
+    # regression test for various division cases 
+
+    @triton.jit 
+    def divide_kernel(a, b, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, xnumel, XBLOCK: tl.constexpr):
+        xoffset = tl.program_id(0) * XBLOCK
+        xindex = xoffset + tl.arange(0, XBLOCK)[:]
+        xmask = xindex < xnumel
+        x0 = xindex
+        tmp0 = tl.load(a + (x0), xmask)
+        tmp2 = tl.load(b + (x0), xmask)
+        # custom bits 
+        tmp1 = tmp0.to(tl.float32)
+        tmp3 = tmp2.to(tl.float32)
+        tmp4 = tmp1 / tmp3
+        tmp5 = tl.where((tmp0 < 0) != (tmp2 < 0), tl.where(tmp0 % tmp2 != 0, tmp0 // tmp2 - 1, tmp0 // tmp2), tmp0 // tmp2)
+        tmp6 = tmp0 // tmp2
+        tl.store(out_ptr0 + (x0), tmp4, xmask)
+        tl.store(out_ptr1 + (x0), tmp5, xmask)
+        tl.store(out_ptr2 + (x0), tmp6, xmask)
+        tl.store(out_ptr3 + (x0), tmp4, xmask)
+        tl.store(out_ptr4 + (x0), tmp5, xmask)
+
+    torch.manual_seed(0)
+
+    def launch_triton(a, b):
+        output0 = torch.empty_like(a)
+        output1 = torch.empty_like(a)
+        output2 = torch.empty_like(a)
+        output3 = torch.empty_like(a)
+        output4 = torch.empty_like(a)
+
+        n_elements = output0.numel()
+
+        grid = lambda meta: (triton.cdiv(n_elements, meta['XBLOCK']), )
+ 
+        divide_kernel[grid](a, b, output0, output1, output2, output3, output4, n_elements, XBLOCK=128)
+
+        return (output0, output1, output2, output3, output4)
+    
+    def launch_torch(a, b):
+            return (
+                aten.div(a, b, rounding_mode=None),
+                aten.div(a, b, rounding_mode="floor"),
+                aten.div(a, b, rounding_mode="trunc"),
+                a / b,
+                a // b,
+            )
+
+    a = torch.randint(2**32, 2**40, [100, 100], device=device)
+    b = torch.randint(-10, -1, [100, 100], device=device)
+
+    for iter in range(100):
+        triton_result = launch_triton(a, b)
+        torch_result = launch_torch(a, b)
+
+        for i in range(5):
+            torch.testing.assert_close(triton_result[i], torch_result[i], check_dtype=False, msg=lambda msg: f"Iteration {iter}, {i} failed\n{msg}")
+
+    

From c728635a1366e2b5684e0de0c5e7daf2e8d8bf44 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Mon, 18 Nov 2024 20:12:42 -0700
Subject: [PATCH 03/18] Parametrize test_divide (2/?)

---
 python/test/regression/test_divide.py | 43 +++++++++++++++------------
 1 file changed, 24 insertions(+), 19 deletions(-)

diff --git a/python/test/regression/test_divide.py b/python/test/regression/test_divide.py
index 57f0a8ea84..44a46765db 100644
--- a/python/test/regression/test_divide.py
+++ b/python/test/regression/test_divide.py
@@ -12,8 +12,11 @@ def patch_kernel(template, to_replace):
     for key, value in to_replace.items():
         kernel.src = kernel.src.replace(key, value)
     return kernel
-    
-def test_divide(device):
+
+@pytest.mark.parametrize("float_div", [True, False])
+@pytest.mark.parametrize("floor", [True, False])
+@pytest.mark.parametrize("trunc", [True, False])
+def test_divide(float_div, floor, trunc, device):
     # regression test for various division cases 
 
     @triton.jit 
@@ -30,36 +33,38 @@ def divide_kernel(a, b, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, xnumel
         tmp4 = tmp1 / tmp3
         tmp5 = tl.where((tmp0 < 0) != (tmp2 < 0), tl.where(tmp0 % tmp2 != 0, tmp0 // tmp2 - 1, tmp0 // tmp2), tmp0 // tmp2)
         tmp6 = tmp0 // tmp2
-        tl.store(out_ptr0 + (x0), tmp4, xmask)
-        tl.store(out_ptr1 + (x0), tmp5, xmask)
-        tl.store(out_ptr2 + (x0), tmp6, xmask)
-        tl.store(out_ptr3 + (x0), tmp4, xmask)
-        tl.store(out_ptr4 + (x0), tmp5, xmask)
+        GENERATE_OUTPUTS_HERE
 
     torch.manual_seed(0)
 
+    outputs_float_div = "tl.store(out_ptr0 + (x0), tmp4, xmask)\n    tl.store(out_ptr3 + (x0), tmp4, xmask)" if float_div is True else ""
+    outputs_floor = "\n    tl.store(out_ptr1 + (x0), tmp5, xmask)\n    tl.store(out_ptr4 + (x0), tmp5, xmask)" if floor is True else ""
+    outputs_trunc = "\n    tl.store(out_ptr2 + (x0), tmp6, xmask)" if trunc is True else ""
+
+    divide_kernel = patch_kernel(divide_kernel, {"GENERATE_OUTPUTS_HERE": f"{outputs_float_div}\n{outputs_floor}\n{outputs_trunc}"})
+
     def launch_triton(a, b):
-        output0 = torch.empty_like(a)
-        output1 = torch.empty_like(a)
-        output2 = torch.empty_like(a)
-        output3 = torch.empty_like(a)
-        output4 = torch.empty_like(a)
+        output0 = torch.zeros_like(a)
+        output1 = torch.zeros_like(a)
+        output2 = torch.zeros_like(a)
+        output3 = torch.zeros_like(a)
+        output4 = torch.zeros_like(a)
 
         n_elements = output0.numel()
 
         grid = lambda meta: (triton.cdiv(n_elements, meta['XBLOCK']), )
- 
+
         divide_kernel[grid](a, b, output0, output1, output2, output3, output4, n_elements, XBLOCK=128)
 
         return (output0, output1, output2, output3, output4)
     
     def launch_torch(a, b):
             return (
-                aten.div(a, b, rounding_mode=None),
-                aten.div(a, b, rounding_mode="floor"),
-                aten.div(a, b, rounding_mode="trunc"),
-                a / b,
-                a // b,
+                aten.div(a, b, rounding_mode=None) if float_div is True else torch.zeros_like(a),
+                aten.div(a, b, rounding_mode="floor") if floor is True else torch.zeros_like(a),
+                aten.div(a, b, rounding_mode="trunc") if trunc is True else torch.zeros_like(a),
+                a / b if float_div is True else torch.zeros_like(a),
+                a // b if floor is True else torch.zeros_like(a),
             )
 
     a = torch.randint(2**32, 2**40, [100, 100], device=device)
@@ -70,6 +75,6 @@ def launch_torch(a, b):
         torch_result = launch_torch(a, b)
 
         for i in range(5):
-            torch.testing.assert_close(triton_result[i], torch_result[i], check_dtype=False, msg=lambda msg: f"Iteration {iter}, {i} failed\n{msg}")
+            torch.testing.assert_close(triton_result[i], torch_result[i], check_dtype=False, msg=lambda msg: f"Float: {float_div}, Floor: {floor}, Trunc: {trunc}\nIteration {iter}, {i} failed\n{msg}")
 
     

From e36f345301ba0a4b955554794014c3e32184b2fa Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Tue, 19 Nov 2024 14:57:58 -0700
Subject: [PATCH 04/18] fixup format in test_divide

---
 python/test/regression/test_divide.py | 44 +++++++++++++++------------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/python/test/regression/test_divide.py b/python/test/regression/test_divide.py
index 44a46765db..a9c1da053c 100644
--- a/python/test/regression/test_divide.py
+++ b/python/test/regression/test_divide.py
@@ -1,10 +1,11 @@
+# flake8: noqa: F821, F841
 import torch
-aten = torch.ops.aten
-
-import pytest 
+import pytest
 
 import triton
-import triton.language as tl 
+import triton.language as tl
+
+aten = torch.ops.aten
 
 
 def patch_kernel(template, to_replace):
@@ -13,13 +14,14 @@ def patch_kernel(template, to_replace):
         kernel.src = kernel.src.replace(key, value)
     return kernel
 
+
 @pytest.mark.parametrize("float_div", [True, False])
 @pytest.mark.parametrize("floor", [True, False])
 @pytest.mark.parametrize("trunc", [True, False])
 def test_divide(float_div, floor, trunc, device):
-    # regression test for various division cases 
+    # regression test for various division cases
 
-    @triton.jit 
+    @triton.jit
     def divide_kernel(a, b, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, xnumel, XBLOCK: tl.constexpr):
         xoffset = tl.program_id(0) * XBLOCK
         xindex = xoffset + tl.arange(0, XBLOCK)[:]
@@ -27,11 +29,12 @@ def divide_kernel(a, b, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, xnumel
         x0 = xindex
         tmp0 = tl.load(a + (x0), xmask)
         tmp2 = tl.load(b + (x0), xmask)
-        # custom bits 
+        # custom bits
         tmp1 = tmp0.to(tl.float32)
         tmp3 = tmp2.to(tl.float32)
         tmp4 = tmp1 / tmp3
-        tmp5 = tl.where((tmp0 < 0) != (tmp2 < 0), tl.where(tmp0 % tmp2 != 0, tmp0 // tmp2 - 1, tmp0 // tmp2), tmp0 // tmp2)
+        tmp5 = tl.where((tmp0 < 0) != (tmp2 < 0), tl.where(tmp0 % tmp2 != 0, tmp0 // tmp2 - 1, tmp0 // tmp2),
+                        tmp0 // tmp2)
         tmp6 = tmp0 // tmp2
         GENERATE_OUTPUTS_HERE
 
@@ -41,7 +44,8 @@ def divide_kernel(a, b, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, xnumel
     outputs_floor = "\n    tl.store(out_ptr1 + (x0), tmp5, xmask)\n    tl.store(out_ptr4 + (x0), tmp5, xmask)" if floor is True else ""
     outputs_trunc = "\n    tl.store(out_ptr2 + (x0), tmp6, xmask)" if trunc is True else ""
 
-    divide_kernel = patch_kernel(divide_kernel, {"GENERATE_OUTPUTS_HERE": f"{outputs_float_div}\n{outputs_floor}\n{outputs_trunc}"})
+    divide_kernel = patch_kernel(divide_kernel,
+                                 {"GENERATE_OUTPUTS_HERE": f"{outputs_float_div}\n{outputs_floor}\n{outputs_trunc}"})
 
     def launch_triton(a, b):
         output0 = torch.zeros_like(a)
@@ -57,15 +61,15 @@ def launch_triton(a, b):
         divide_kernel[grid](a, b, output0, output1, output2, output3, output4, n_elements, XBLOCK=128)
 
         return (output0, output1, output2, output3, output4)
-    
+
     def launch_torch(a, b):
-            return (
-                aten.div(a, b, rounding_mode=None) if float_div is True else torch.zeros_like(a),
-                aten.div(a, b, rounding_mode="floor") if floor is True else torch.zeros_like(a),
-                aten.div(a, b, rounding_mode="trunc") if trunc is True else torch.zeros_like(a),
-                a / b if float_div is True else torch.zeros_like(a),
-                a // b if floor is True else torch.zeros_like(a),
-            )
+        return (
+            aten.div(a, b, rounding_mode=None) if float_div is True else torch.zeros_like(a),
+            aten.div(a, b, rounding_mode="floor") if floor is True else torch.zeros_like(a),
+            aten.div(a, b, rounding_mode="trunc") if trunc is True else torch.zeros_like(a),
+            a / b if float_div is True else torch.zeros_like(a),
+            a // b if floor is True else torch.zeros_like(a),
+        )
 
     a = torch.randint(2**32, 2**40, [100, 100], device=device)
     b = torch.randint(-10, -1, [100, 100], device=device)
@@ -75,6 +79,6 @@ def launch_torch(a, b):
         torch_result = launch_torch(a, b)
 
         for i in range(5):
-            torch.testing.assert_close(triton_result[i], torch_result[i], check_dtype=False, msg=lambda msg: f"Float: {float_div}, Floor: {floor}, Trunc: {trunc}\nIteration {iter}, {i} failed\n{msg}")
-
-    
+            torch.testing.assert_close(
+                triton_result[i], torch_result[i], check_dtype=False, msg=lambda msg:
+                f"Float: {float_div}, Floor: {floor}, Trunc: {trunc}\nIteration {iter}, {i} failed\n{msg}")

From e0310dd1fb5b3cee439e34e2e8947739b5073fa2 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Tue, 19 Nov 2024 20:33:54 -0700
Subject: [PATCH 05/18] LLVM freeze instruction between mask and div 1/?

---
 third_party/intel/lib/LLVMIR/CMakeLists.txt   |  6 ++
 .../lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp   | 63 +++++++++++++++++++
 third_party/intel/lib/LLVMIR/LLVMPasses.h     | 11 ++++
 3 files changed, 80 insertions(+)
 create mode 100644 third_party/intel/lib/LLVMIR/CMakeLists.txt
 create mode 100644 third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
 create mode 100644 third_party/intel/lib/LLVMIR/LLVMPasses.h

diff --git a/third_party/intel/lib/LLVMIR/CMakeLists.txt b/third_party/intel/lib/LLVMIR/CMakeLists.txt
new file mode 100644
index 0000000000..408e229590
--- /dev/null
+++ b/third_party/intel/lib/LLVMIR/CMakeLists.txt
@@ -0,0 +1,6 @@
+add_triton_library(TritonIntelLLVMIR
+        LLVMIRFreezeMaskedDivRem.cpp
+
+        DEPENDS
+        LLVMIRIncGen
+        )
\ No newline at end of file
diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
new file mode 100644
index 0000000000..d224adb881
--- /dev/null
+++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
@@ -0,0 +1,63 @@
+#include "LLVMPasses.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+
+using namespace llvm;
+
+static bool processPhiNode(PHINode *phiNode, BasicBlock& BB) {
+      llvm::errs() << "YOLO: " << *phiNode << "\n";
+
+  const auto phiHasNullValue = any_of(phiNode->incoming_values(), [](Use& U) {
+    if (Constant *C = dyn_cast<Constant>(&U)) {
+      return C->isNullValue();
+    }
+    return false; 
+  });
+
+  if (phiHasNullValue) {
+        auto FindUse = llvm::find_if(phiNode->users(), [](auto *U) {
+          auto *Use = cast<Instruction>(U);
+          llvm::errs() << "User: " << *Use << "\n";
+          return (Use->getOpcode() == Instruction::SDiv);
+        });
+        if (FindUse == phiNode->user_end()) {
+          llvm::errs() << "no div :(\n";
+          return false; 
+        }
+        auto *Use = cast<Instruction>(*FindUse);
+        assert()
+        llvm::errs() << "Got our user! " << *Use << "\n";
+
+        // insert freeze between phi and sdiv 
+        // 
+  }
+  return false; 
+}
+
+static bool runOnFunction(Function& F, const TargetTransformInfo &TTI,
+                           const DominatorTree &DT) {
+    bool Changed = false;
+
+    SmallVector<PHINode *> PhiNodes;
+    for (BasicBlock &BB : F) {
+    for (Instruction &inst : BB) {
+      if (PHINode *phiNode = dyn_cast<PHINode>(&inst)) {
+        Changed |= processPhiNode(phiNode, BB);
+        continue;
+      }
+      break;
+    }
+  }
+
+    return Changed;
+}
+
+ PreservedAnalyses FreezeMaskedDivRemPass::run(Function &F, FunctionAnalysisManager &FAM) {
+    TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+    DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+    const auto b = runOnFunction(F, TTI, DT);
+
+    return b ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
\ No newline at end of file
diff --git a/third_party/intel/lib/LLVMIR/LLVMPasses.h b/third_party/intel/lib/LLVMIR/LLVMPasses.h
new file mode 100644
index 0000000000..5e70c38e22
--- /dev/null
+++ b/third_party/intel/lib/LLVMIR/LLVMPasses.h
@@ -0,0 +1,11 @@
+#include "llvm/IR/PassManager.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+struct FreezeMaskedDivRemPass : PassInfoMixin<FreezeMaskedDivRemPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+  static StringRef name() { return "FreezeMaskedDivRemPass"; }
+};
+
+}
\ No newline at end of file

From 2243b25ee39a240fe8ebf4481834eb3b717d8b76 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Tue, 19 Nov 2024 20:34:53 -0700
Subject: [PATCH 06/18] LLVM freeze instruction between mask and div 2/?

---
 third_party/intel/lib/CMakeLists.txt | 1 +
 third_party/intel/triton_xpu.cc      | 8 +++++++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/third_party/intel/lib/CMakeLists.txt b/third_party/intel/lib/CMakeLists.txt
index b2d8e610d0..2b58d7f122 100644
--- a/third_party/intel/lib/CMakeLists.txt
+++ b/third_party/intel/lib/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_subdirectory(Analysis)
 add_subdirectory(Dialect)
 add_subdirectory(GPUToTritonGEN)
+add_subdirectory(LLVMIR)
 add_subdirectory(Target)
 add_subdirectory(TritonAnnotateModule)
 add_subdirectory(TritonGENToLLVM)
diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
index bf65bc1a50..582d21a3ad 100644
--- a/third_party/intel/triton_xpu.cc
+++ b/third_party/intel/triton_xpu.cc
@@ -17,6 +17,7 @@
 #include "intel/include/TritonAnnotateModule/Passes.h"
 #include "intel/include/TritonIntelGPUToLLVM/Passes.h"
 #include "intel/include/TritonToTritonGPUWarp/Passes.h"
+#include "intel/lib/LLVMIR/LLVMPasses.h"
 
 #include "triton/Target/SPIRV/SPIRVTranslation.h"
 #include "triton/Tools/Sys/GetEnv.hpp"
@@ -205,6 +206,7 @@ void init_triton_intel(py::module &&m) {
           fpm.addPass(BreakStructPhiNodesPass());
           fpm.addPass(InstCombinePass());
         });
+#if 1
     pb.registerPeepholeEPCallback(
         [&](llvm::FunctionPassManager &fpm, llvm::OptimizationLevel level) {
           // The Triton masked load pattern can generate instances where the
@@ -214,8 +216,12 @@ void init_triton_intel(py::module &&m) {
           // an incorrect result for the kernel. Adding `DivRemPairsPass`
           // introduces freeze instructions which prevent UB from leaking into
           // div/rem instructions.
-          fpm.addPass(DivRemPairsPass());
+          // fpm.addPass(DivRemPairsPass());
+          fpm.addPass(FreezeMaskedDivRemPass());
         });
+#else
+    mpm.addPass(createModuleToFunctionPassAdaptor(FreezeMaskedDivRemPass()));
+#endif
     mpm.addPass(pb.buildPerModuleDefaultPipeline(opt));
     mpm.run(*mod, mam);
   });

From 6c6a0f0614adc9dc2e4fbed4a3862a09d00be60c Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Wed, 20 Nov 2024 10:11:45 -0700
Subject: [PATCH 07/18] LLVM freeze instruction between mask and div 3/?

---
 .../intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
index d224adb881..971aa49867 100644
--- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
+++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
@@ -16,6 +16,7 @@ static bool processPhiNode(PHINode *phiNode, BasicBlock& BB) {
     return false; 
   });
 
+  bool Changed = false;
   if (phiHasNullValue) {
         auto FindUse = llvm::find_if(phiNode->users(), [](auto *U) {
           auto *Use = cast<Instruction>(U);
@@ -27,13 +28,15 @@ static bool processPhiNode(PHINode *phiNode, BasicBlock& BB) {
           return false; 
         }
         auto *Use = cast<Instruction>(*FindUse);
-        assert()
-        llvm::errs() << "Got our user! " << *Use << "\n";
-
-        // insert freeze between phi and sdiv 
-        // 
+        if (Use->getOperand(1) == phiNode) {
+          llvm::errs() << "Got our user! " << *Use << "\n";
+          llvm::errs() << "Operand 1: " << *Use->getOperand(1) << "\n";
+          auto *freezePhi = new FreezeInst(phiNode, phiNode->getName() + ".frozen", Use->getIterator());
+          Use->setOperand(1, freezePhi);
+          Changed = true;
+        }
   }
-  return false; 
+  return Changed; 
 }
 
 static bool runOnFunction(Function& F, const TargetTransformInfo &TTI,

From 3ef26d06b2182079306099bd7e370777b3017f44 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Wed, 20 Nov 2024 10:45:30 -0700
Subject: [PATCH 08/18] LLVM freeze instruction between mask and div 4/?

---
 .../lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp   | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
index 971aa49867..72765167ae 100644
--- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
+++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
@@ -18,23 +18,37 @@ static bool processPhiNode(PHINode *phiNode, BasicBlock& BB) {
 
   bool Changed = false;
   if (phiHasNullValue) {
+    for (Instruction &I : BB) {
+      if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem) {
+        const size_t OpIdx = 1; // I.getOpcode() == Instruction::SRem ? 0 : 1;
+        if (I.getOperand(OpIdx) == phiNode) {
+          auto *freezePhi = new FreezeInst(phiNode, phiNode->getName() + ".frozen", I.getIterator());
+          I.setOperand(OpIdx, freezePhi);
+          Changed = true;
+        }
+      }
+    }
+#if 0
         auto FindUse = llvm::find_if(phiNode->users(), [](auto *U) {
           auto *Use = cast<Instruction>(U);
           llvm::errs() << "User: " << *Use << "\n";
-          return (Use->getOpcode() == Instruction::SDiv);
+          return (Use->getOpcode() == Instruction::SDiv || Use->getOpcode() == Instruction::SRem);
         });
         if (FindUse == phiNode->user_end()) {
           llvm::errs() << "no div :(\n";
           return false; 
         }
         auto *Use = cast<Instruction>(*FindUse);
-        if (Use->getOperand(1) == phiNode) {
+        assert(Use->isIntDivRem());
+        const size_t OpIdx = Use->getOpcode() == Instruction::SRem ? 0 : 1;
+        if (Use->getOperand(OpIdx) == phiNode) {
           llvm::errs() << "Got our user! " << *Use << "\n";
           llvm::errs() << "Operand 1: " << *Use->getOperand(1) << "\n";
           auto *freezePhi = new FreezeInst(phiNode, phiNode->getName() + ".frozen", Use->getIterator());
-          Use->setOperand(1, freezePhi);
+          Use->setOperand(OpIdx, freezePhi);
           Changed = true;
         }
+#endif
   }
   return Changed; 
 }

From a35b0edf0c899ead6eea2d425f06b7e566c925b0 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Wed, 20 Nov 2024 10:59:29 -0700
Subject: [PATCH 09/18] LLVM freeze instruction between mask and div 5/5

---
 third_party/intel/lib/LLVMIR/CMakeLists.txt   |  2 +-
 .../lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp   | 91 +++++++------------
 third_party/intel/lib/LLVMIR/LLVMPasses.h     |  2 +-
 third_party/intel/triton_xpu.cc               | 21 ++---
 4 files changed, 46 insertions(+), 70 deletions(-)

diff --git a/third_party/intel/lib/LLVMIR/CMakeLists.txt b/third_party/intel/lib/LLVMIR/CMakeLists.txt
index 408e229590..6da101e950 100644
--- a/third_party/intel/lib/LLVMIR/CMakeLists.txt
+++ b/third_party/intel/lib/LLVMIR/CMakeLists.txt
@@ -3,4 +3,4 @@ add_triton_library(TritonIntelLLVMIR
 
         DEPENDS
         LLVMIRIncGen
-        )
\ No newline at end of file
+        )
diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
index 72765167ae..630e791e54 100644
--- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
+++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
@@ -1,80 +1,57 @@
 #include "LLVMPasses.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
 
 using namespace llvm;
 
-static bool processPhiNode(PHINode *phiNode, BasicBlock& BB) {
-      llvm::errs() << "YOLO: " << *phiNode << "\n";
-
-  const auto phiHasNullValue = any_of(phiNode->incoming_values(), [](Use& U) {
-    if (Constant *C = dyn_cast<Constant>(&U)) {
-      return C->isNullValue();
-    }
-    return false; 
-  });
+static bool processPhiNode(PHINode *PhiNode, BasicBlock &BB) {
+  if (!any_of(PhiNode->incoming_values(), [](Use &U) {
+        if (Constant *C = dyn_cast<Constant>(&U)) {
+          return C->isNullValue();
+        }
+        return false;
+      })) {
+    return false;
+  }
 
   bool Changed = false;
-  if (phiHasNullValue) {
-    for (Instruction &I : BB) {
-      if (I.getOpcode() == Instruction::SDiv || I.getOpcode() == Instruction::SRem) {
-        const size_t OpIdx = 1; // I.getOpcode() == Instruction::SRem ? 0 : 1;
-        if (I.getOperand(OpIdx) == phiNode) {
-          auto *freezePhi = new FreezeInst(phiNode, phiNode->getName() + ".frozen", I.getIterator());
-          I.setOperand(OpIdx, freezePhi);
-          Changed = true;
-        }
+  for (Instruction &I : BB) {
+    if (I.getOpcode() == Instruction::SDiv ||
+        I.getOpcode() == Instruction::SRem) {
+      const size_t OpIdx = 1;
+      if (I.getOperand(OpIdx) == PhiNode) {
+        auto *freezePhi = new FreezeInst(
+            PhiNode, PhiNode->getName() + ".frozen", I.getIterator());
+        I.setOperand(OpIdx, freezePhi);
+        Changed = true;
       }
     }
-#if 0
-        auto FindUse = llvm::find_if(phiNode->users(), [](auto *U) {
-          auto *Use = cast<Instruction>(U);
-          llvm::errs() << "User: " << *Use << "\n";
-          return (Use->getOpcode() == Instruction::SDiv || Use->getOpcode() == Instruction::SRem);
-        });
-        if (FindUse == phiNode->user_end()) {
-          llvm::errs() << "no div :(\n";
-          return false; 
-        }
-        auto *Use = cast<Instruction>(*FindUse);
-        assert(Use->isIntDivRem());
-        const size_t OpIdx = Use->getOpcode() == Instruction::SRem ? 0 : 1;
-        if (Use->getOperand(OpIdx) == phiNode) {
-          llvm::errs() << "Got our user! " << *Use << "\n";
-          llvm::errs() << "Operand 1: " << *Use->getOperand(1) << "\n";
-          auto *freezePhi = new FreezeInst(phiNode, phiNode->getName() + ".frozen", Use->getIterator());
-          Use->setOperand(OpIdx, freezePhi);
-          Changed = true;
-        }
-#endif
   }
-  return Changed; 
+  return Changed;
 }
 
-static bool runOnFunction(Function& F, const TargetTransformInfo &TTI,
-                           const DominatorTree &DT) {
-    bool Changed = false;
+static bool runOnFunction(Function &F) {
+  bool Changed = false;
 
-    SmallVector<PHINode *> PhiNodes;
-    for (BasicBlock &BB : F) {
-    for (Instruction &inst : BB) {
-      if (PHINode *phiNode = dyn_cast<PHINode>(&inst)) {
-        Changed |= processPhiNode(phiNode, BB);
+  SmallVector<PHINode *> PhiNodes;
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      if (PHINode *PhiNode = dyn_cast<PHINode>(&I)) {
+        Changed |= processPhiNode(PhiNode, BB);
         continue;
       }
       break;
     }
   }
 
-    return Changed;
+  return Changed;
 }
 
- PreservedAnalyses FreezeMaskedDivRemPass::run(Function &F, FunctionAnalysisManager &FAM) {
-    TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
-    DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
-    const auto b = runOnFunction(F, TTI, DT);
+PreservedAnalyses FreezeMaskedDivRemPass::run(Function &F,
+                                              FunctionAnalysisManager &FAM) {
+  const auto b = runOnFunction(F);
 
-    return b ? PreservedAnalyses::none() : PreservedAnalyses::all();
-}
\ No newline at end of file
+  return b ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
diff --git a/third_party/intel/lib/LLVMIR/LLVMPasses.h b/third_party/intel/lib/LLVMIR/LLVMPasses.h
index 5e70c38e22..72f71dd983 100644
--- a/third_party/intel/lib/LLVMIR/LLVMPasses.h
+++ b/third_party/intel/lib/LLVMIR/LLVMPasses.h
@@ -8,4 +8,4 @@ struct FreezeMaskedDivRemPass : PassInfoMixin<FreezeMaskedDivRemPass> {
   static StringRef name() { return "FreezeMaskedDivRemPass"; }
 };
 
-}
\ No newline at end of file
+} // namespace llvm
diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
index 582d21a3ad..e6ad385924 100644
--- a/third_party/intel/triton_xpu.cc
+++ b/third_party/intel/triton_xpu.cc
@@ -206,22 +206,21 @@ void init_triton_intel(py::module &&m) {
           fpm.addPass(BreakStructPhiNodesPass());
           fpm.addPass(InstCombinePass());
         });
-#if 1
     pb.registerPeepholeEPCallback(
         [&](llvm::FunctionPassManager &fpm, llvm::OptimizationLevel level) {
           // The Triton masked load pattern can generate instances where the
-          // mask false path appears to cause undefined behavior during
-          // computation. Even though the result of that behavior will never be
-          // used, LLVM can choose to optimize away the false path resulting in
-          // an incorrect result for the kernel. Adding `DivRemPairsPass`
-          // introduces freeze instructions which prevent UB from leaking into
-          // div/rem instructions.
-          // fpm.addPass(DivRemPairsPass());
+          // mask value causes undefined behavior in sdiv/srem instructions. The
+          // language allows this UB as the result of those arithmetic
+          // instructions is never used, and control flow to avoid computation
+          // of these instructions would negatively affect performance. But,
+          // LLVM SimplifyCFG aggressively marks code paths with undefined
+          // behavior as dead. This can result in removal of the mask path and
+          // incorrect results from legal Triton kernels due to masked elements
+          // being used in computation. Run a pass to add a freeze instruction
+          // between masked loads and sdiv/srem to signal to LLVM we consider
+          // the sdiv/srem operands to be well defined.
           fpm.addPass(FreezeMaskedDivRemPass());
         });
-#else
-    mpm.addPass(createModuleToFunctionPassAdaptor(FreezeMaskedDivRemPass()));
-#endif
     mpm.addPass(pb.buildPerModuleDefaultPipeline(opt));
     mpm.run(*mod, mam);
   });

From 302fd39bbff34492f047d76c2169a3a6f3f0e522 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Wed, 20 Nov 2024 11:02:16 -0700
Subject: [PATCH 10/18] fixup

---
 third_party/intel/triton_xpu.cc | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
index e6ad385924..362e404c5f 100644
--- a/third_party/intel/triton_xpu.cc
+++ b/third_party/intel/triton_xpu.cc
@@ -6,7 +6,6 @@
 #include "llvm/Passes/PassPlugin.h"
 #include "llvm/Passes/StandardInstrumentations.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
-#include "llvm/Transforms/Scalar/DivRemPairs.h"
 
 #include "intel/include/Dialect/TritonGEN/IR/TritonGENDialect.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"

From 9f814ee41690705f046e0f029c64a27057a59134 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexbaden@users.noreply.github.com>
Date: Thu, 21 Nov 2024 08:12:53 -0500
Subject: [PATCH 11/18] Remove unused variable

Co-authored-by: Arun Jose <40291569+arunjose696@users.noreply.github.com>
---
 third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
index 630e791e54..cec40e6f1d 100644
--- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
+++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
@@ -35,7 +35,6 @@ static bool processPhiNode(PHINode *PhiNode, BasicBlock &BB) {
 static bool runOnFunction(Function &F) {
   bool Changed = false;
 
-  SmallVector<PHINode *> PhiNodes;
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
       if (PHINode *PhiNode = dyn_cast<PHINode>(&I)) {

From e9723dc64db2d1eafabafe23d3e9d8750b936b19 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Thu, 21 Nov 2024 07:10:50 -0700
Subject: [PATCH 12/18] rename processPhiNode -> processBasicBlock

---
 third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
index cec40e6f1d..0387f168b7 100644
--- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
+++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
@@ -6,7 +6,7 @@
 
 using namespace llvm;
 
-static bool processPhiNode(PHINode *PhiNode, BasicBlock &BB) {
+static bool processBasicBlock(BasicBlock &BB, PHINode *PhiNode) {
   if (!any_of(PhiNode->incoming_values(), [](Use &U) {
         if (Constant *C = dyn_cast<Constant>(&U)) {
           return C->isNullValue();
@@ -38,7 +38,7 @@ static bool runOnFunction(Function &F) {
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
       if (PHINode *PhiNode = dyn_cast<PHINode>(&I)) {
-        Changed |= processPhiNode(PhiNode, BB);
+        Changed |= processBasicBlock(BB, PhiNode);
         continue;
       }
       break;

From 928afd194cdc27cd226e894e29ce712d385c501d Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Tue, 26 Nov 2024 19:46:12 +0000
Subject: [PATCH 13/18] simplify phi node incoming values constant check
 expression

---
 third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
index 0387f168b7..b5485e76be 100644
--- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
+++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
@@ -8,10 +8,8 @@ using namespace llvm;
 
 static bool processBasicBlock(BasicBlock &BB, PHINode *PhiNode) {
   if (!any_of(PhiNode->incoming_values(), [](Use &U) {
-        if (Constant *C = dyn_cast<Constant>(&U)) {
-          return C->isNullValue();
-        }
-        return false;
+        Constant *C = dyn_cast<Constant>(&U);
+        return C && C->isNullValue();
       })) {
     return false;
   }

From 33491098658cd2c1f9dec518e01de7330277515a Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Tue, 26 Nov 2024 19:48:10 +0000
Subject: [PATCH 14/18] cleanup formatting in division test

---
 python/test/regression/test_divide.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/test/regression/test_divide.py b/python/test/regression/test_divide.py
index a9c1da053c..282b7b5c50 100644
--- a/python/test/regression/test_divide.py
+++ b/python/test/regression/test_divide.py
@@ -40,9 +40,9 @@ def divide_kernel(a, b, out_ptr0, out_ptr1, out_ptr2, out_ptr3, out_ptr4, xnumel
 
     torch.manual_seed(0)
 
-    outputs_float_div = "tl.store(out_ptr0 + (x0), tmp4, xmask)\n    tl.store(out_ptr3 + (x0), tmp4, xmask)" if float_div is True else ""
-    outputs_floor = "\n    tl.store(out_ptr1 + (x0), tmp5, xmask)\n    tl.store(out_ptr4 + (x0), tmp5, xmask)" if floor is True else ""
-    outputs_trunc = "\n    tl.store(out_ptr2 + (x0), tmp6, xmask)" if trunc is True else ""
+    outputs_float_div = "tl.store(out_ptr0 + (x0), tmp4, xmask)\n    tl.store(out_ptr3 + (x0), tmp4, xmask)" if float_div else ""
+    outputs_floor = "    tl.store(out_ptr1 + (x0), tmp5, xmask)\n    tl.store(out_ptr4 + (x0), tmp5, xmask)" if floor else ""
+    outputs_trunc = "    tl.store(out_ptr2 + (x0), tmp6, xmask)" if trunc else ""
 
     divide_kernel = patch_kernel(divide_kernel,
                                  {"GENERATE_OUTPUTS_HERE": f"{outputs_float_div}\n{outputs_floor}\n{outputs_trunc}"})

From f1a6029145f5258aa80c1e60bbacc6abc2662eb5 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Thu, 28 Nov 2024 17:15:00 +0000
Subject: [PATCH 15/18] add lit test

---
 bin/CMakeLists.txt                   |  4 ++
 bin/triton-llvm-opt.cpp              |  8 ++++
 test/LLVMIR/freeze-masked-div-rem.ll | 57 ++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)
 create mode 100644 test/LLVMIR/freeze-masked-div-rem.ll

diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt
index b66ef71193..99c311536e 100644
--- a/bin/CMakeLists.txt
+++ b/bin/CMakeLists.txt
@@ -13,6 +13,9 @@ target_link_libraries(triton-opt PRIVATE
   TritonTransforms
   TritonGPUTransforms
   TritonNvidiaGPUTransforms
+  TritonIntelLLVMIR
+  TritonIntelGPUIR
+  TritonIntelGPUTransforms
   MLIRGPUToROCDLTransforms
   ${dialect_libs}
   ${conversion_libs}
@@ -88,6 +91,7 @@ target_link_libraries(triton-llvm-opt PRIVATE
   LLVMSupport
   LLVMOption
   LLVMCodeGen
+  TritonIntelLLVMIR
   TritonIntelGPUIR
   )
 export_executable_symbols_for_plugins(triton-llvm-opt)
diff --git a/bin/triton-llvm-opt.cpp b/bin/triton-llvm-opt.cpp
index 1ec804cb50..f521394f47 100644
--- a/bin/triton-llvm-opt.cpp
+++ b/bin/triton-llvm-opt.cpp
@@ -1,6 +1,7 @@
 /// Trimmed down clone of llvm opt to be able to test triton custom llvm ir
 /// passes.
 #include "lib/Target/LLVMIR/LLVMPasses.h"
+#include "third_party/intel/lib/LLVMIR/LLVMPasses.h"
 #include "llvm/CodeGen/CommandFlags.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -42,6 +43,11 @@ static cl::opt<bool>
                         llvm::cl::desc("run pass to break phi struct"),
                         cl::init(false));
 
+static cl::opt<bool> FreezeMaskedDivRem(
+    "freeze-masked-div-rem",
+    llvm::cl::desc("run pass to insert freeze between masked load and div/rem"),
+    cl::init(false));
+
 namespace {
 static std::function<Error(Module *)> makeOptimizingPipeline() {
   return [](Module *m) -> Error {
@@ -62,6 +68,8 @@ static std::function<Error(Module *)> makeOptimizingPipeline() {
     llvm::FunctionPassManager fpm;
     if (BreakStructPhiNodes)
       fpm.addPass(BreakStructPhiNodesPass());
+    if (FreezeMaskedDivRem)
+      fpm.addPass(FreezeMaskedDivRemPass());
     mpm.addPass(createModuleToFunctionPassAdaptor(std::move(fpm)));
     mpm.run(*m, mam);
     return Error::success();
diff --git a/test/LLVMIR/freeze-masked-div-rem.ll b/test/LLVMIR/freeze-masked-div-rem.ll
new file mode 100644
index 0000000000..6d29a2d0f6
--- /dev/null
+++ b/test/LLVMIR/freeze-masked-div-rem.ll
@@ -0,0 +1,57 @@
+; RUN: triton-llvm-opt -freeze-masked-div-rem %s | FileCheck %s
+
+define void @phi_div_of_zero_okay(i8 noundef %x, i8 %i, ptr %v) {
+; CHECK-LABEL: @phi_div_of_zero_okay(
+entry:
+  %cmp = icmp ult i8 %i, 9
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %y = load i8, ptr %v, align 8
+  br label %if.end
+
+if.end:
+  %yy = phi i8 [ %y, %if.then ], [ 0, %entry ]
+  ; CHECK: [[F0:%.*]] = freeze i8 %yy
+  ; CHECK-NEXT: %z = sdiv i8 %x, [[F0:%.*]]
+  %z = sdiv i8 %x, %yy
+  br i1 %cmp, label %if2.then, label %if2.end 
+
+if2.then:
+  store i8 %z, ptr %v, align 8
+  br label %if2.end
+
+if2.end:
+  ret void
+}
+
+define void @two_phi_div_of_zero_okay(i8 noundef %x, i8 %i, ptr %v) {
+; CHECK-LABEL: @two_phi_div_of_zero_okay(
+entry:
+  %cmp = icmp ult i8 %i, 9
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  %y = load i8, ptr %v, align 8
+  %vv = getelementptr inbounds i64, ptr %v, i64 1
+  %b = load i8, ptr %vv, align 8
+  br label %if.end
+
+if.end:
+  %bb = phi i8 [ %b, %if.then ], [ undef, %entry ]
+  %yy = phi i8 [ %y, %if.then ], [ 0, %entry ]
+  ; CHECK: [[F0:%.*]] = freeze i8 %yy
+  ; CHECK-NEXT: %z = sdiv i8 %x, [[F0:%.*]]
+  %z = sdiv i8 %x, %yy
+  ; CHECK: [[F1:%.*]] = freeze i8 %bb
+  ; CHECK-NEXT: %zz = sdiv i8 %x, [[F1:%.*]]
+  %zz = sdiv i8 %x, %bb
+  br i1 %cmp, label %if2.then, label %if2.end 
+
+if2.then:
+  store i8 %z, ptr %v, align 8
+  br label %if2.end
+
+if2.end:
+  ret void
+}
\ No newline at end of file

From 7ff052bea1d97978ec4170f06595a16deef93bd8 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Thu, 28 Nov 2024 17:56:52 +0000
Subject: [PATCH 16/18] support multiple phis and undef

---
 test/LLVMIR/freeze-masked-div-rem.ll                 |  6 +++---
 .../intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp    | 12 ++++--------
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/test/LLVMIR/freeze-masked-div-rem.ll b/test/LLVMIR/freeze-masked-div-rem.ll
index 6d29a2d0f6..0909a0b994 100644
--- a/test/LLVMIR/freeze-masked-div-rem.ll
+++ b/test/LLVMIR/freeze-masked-div-rem.ll
@@ -15,7 +15,7 @@ if.end:
   ; CHECK: [[F0:%.*]] = freeze i8 %yy
   ; CHECK-NEXT: %z = sdiv i8 %x, [[F0:%.*]]
   %z = sdiv i8 %x, %yy
-  br i1 %cmp, label %if2.then, label %if2.end 
+  br i1 %cmp, label %if2.then, label %if2.end
 
 if2.then:
   store i8 %z, ptr %v, align 8
@@ -46,7 +46,7 @@ if.end:
   ; CHECK: [[F1:%.*]] = freeze i8 %bb
   ; CHECK-NEXT: %zz = sdiv i8 %x, [[F1:%.*]]
   %zz = sdiv i8 %x, %bb
-  br i1 %cmp, label %if2.then, label %if2.end 
+  br i1 %cmp, label %if2.then, label %if2.end
 
 if2.then:
   store i8 %z, ptr %v, align 8
@@ -54,4 +54,4 @@ if2.then:
 
 if2.end:
   ret void
-}
\ No newline at end of file
+}
diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
index b5485e76be..3d860c045b 100644
--- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
+++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
@@ -6,10 +6,10 @@
 
 using namespace llvm;
 
-static bool processBasicBlock(BasicBlock &BB, PHINode *PhiNode) {
+static bool processPhiNode(BasicBlock &BB, PHINode *PhiNode) {
   if (!any_of(PhiNode->incoming_values(), [](Use &U) {
         Constant *C = dyn_cast<Constant>(&U);
-        return C && C->isNullValue();
+        return isa<UndefValue>(U) || C && C->isNullValue();
       })) {
     return false;
   }
@@ -34,12 +34,8 @@ static bool runOnFunction(Function &F) {
   bool Changed = false;
 
   for (BasicBlock &BB : F) {
-    for (Instruction &I : BB) {
-      if (PHINode *PhiNode = dyn_cast<PHINode>(&I)) {
-        Changed |= processBasicBlock(BB, PhiNode);
-        continue;
-      }
-      break;
+    for (PHINode &PhiNode : BB.phis()) {
+      Changed |= processPhiNode(BB, &PhiNode);
     }
   }
 

From ed6df2382df2bf5621377876333b8835ea0995cc Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Mon, 2 Dec 2024 22:43:38 +0000
Subject: [PATCH 17/18] remove unused libs

---
 bin/CMakeLists.txt | 2 --
 1 file changed, 2 deletions(-)

diff --git a/bin/CMakeLists.txt b/bin/CMakeLists.txt
index 99c311536e..c0398fb60b 100644
--- a/bin/CMakeLists.txt
+++ b/bin/CMakeLists.txt
@@ -14,8 +14,6 @@ target_link_libraries(triton-opt PRIVATE
   TritonGPUTransforms
   TritonNvidiaGPUTransforms
   TritonIntelLLVMIR
-  TritonIntelGPUIR
-  TritonIntelGPUTransforms
   MLIRGPUToROCDLTransforms
   ${dialect_libs}
   ${conversion_libs}

From dc9c16ec7ca1151fcc0e04677a2c06f32132af84 Mon Sep 17 00:00:00 2001
From: Alex Baden <alexander.baden@intel.com>
Date: Tue, 3 Dec 2024 01:37:43 +0000
Subject: [PATCH 18/18] address review comments

---
 .../intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp        | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
index 3d860c045b..5344f92d70 100644
--- a/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
+++ b/third_party/intel/lib/LLVMIR/LLVMIRFreezeMaskedDivRem.cpp
@@ -6,8 +6,8 @@
 
 using namespace llvm;
 
-static bool processPhiNode(BasicBlock &BB, PHINode *PhiNode) {
-  if (!any_of(PhiNode->incoming_values(), [](Use &U) {
+static bool processPhiNode(PHINode *PhiNode) {
+  if (none_of(PhiNode->incoming_values(), [](Use &U) {
         Constant *C = dyn_cast<Constant>(&U);
         return isa<UndefValue>(U) || C && C->isNullValue();
       })) {
@@ -15,7 +15,8 @@ static bool processPhiNode(BasicBlock &BB, PHINode *PhiNode) {
   }
 
   bool Changed = false;
-  for (Instruction &I : BB) {
+  BasicBlock *BB = const_cast<BasicBlock *>(PhiNode->getParent());
+  for (Instruction &I : *BB) {
     if (I.getOpcode() == Instruction::SDiv ||
         I.getOpcode() == Instruction::SRem) {
       const size_t OpIdx = 1;
@@ -35,7 +36,7 @@ static bool runOnFunction(Function &F) {
 
   for (BasicBlock &BB : F) {
     for (PHINode &PhiNode : BB.phis()) {
-      Changed |= processPhiNode(BB, &PhiNode);
+      Changed |= processPhiNode(&PhiNode);
     }
   }