Prevent UB in div/rem instructions during optimization

alexbaden · alexbaden · commit 7f50dcfa231e · 2024-12-03T01:20:57.000Z
diff --git a/third_party/intel/triton_xpu.cc b/third_party/intel/triton_xpu.cc
@@ -6,6 +6,7 @@
 #include "llvm/Passes/PassPlugin.h"
 #include "llvm/Passes/StandardInstrumentations.h"
 #include "llvm/Transforms/InstCombine/InstCombine.h"
+#include "llvm/Transforms/Scalar/DivRemPairs.h"
 
 #include "intel/include/Dialect/TritonGEN/IR/TritonGENDialect.h"
 #include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
@@ -204,6 +205,17 @@ void init_triton_intel(py::module &&m) {
           fpm.addPass(BreakStructPhiNodesPass());
           fpm.addPass(InstCombinePass());
         });
+    pb.registerPeepholeEPCallback(
+        [&](llvm::FunctionPassManager &fpm, llvm::OptimizationLevel level) {
+          // The Triton masked load pattern can generate instances where the
+          // mask-false path appears to cause undefined behavior during
+          // computation. Even though the result of that behavior will never be
+          // used, LLVM can choose to optimize away the false path, resulting
+          // in an incorrect result for the kernel. Adding `DivRemPairsPass`
+          // introduces freeze instructions, which prevent UB from leaking into
+          // div/rem instructions.
+          fpm.addPass(DivRemPairsPass());
+        });
     mpm.addPass(pb.buildPerModuleDefaultPipeline(opt));
     mpm.run(*mod, mam);
   });