
Commit c788bfd

Add attribute to control 16-bit atomics lowering. (#4149)
Currently, we always emulate 16-bit atomic operations. The new attribute lets targets disable the emulation.

As the number of feature attributes has grown, I changed how we pass them to the annotate-module pass: passing all features in a single structure simplifies adding a new attribute, allows the use of default values, and reduces merge conflicts.

Signed-off-by: Ilya Enkovich <[email protected]>
1 parent: 0ca38b7
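For orientation, a minimal sketch of the new struct-based calling convention on the C++ side, not taken verbatim from the commit: the names below come from the bindings added in triton_xpu.cc, and any field left unset keeps its Passes.td default (e.g. support16BitAtomics = false, threadsPerWarp = 32); the concrete values are illustrative assumptions.

// Hedged sketch: populate the generated options struct, then create the pass.
gpu::intel::TritonAnnotateModuleOptions opts;
opts.minSGSize = 16;              // assumed value: smallest supported sub-group size
opts.support16BitAtomics = true;  // the new flag: native 16-bit atomics available
pm.addPass(gpu::intel::createTritonAnnotateModule(opts));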

6 files changed: 48 additions, 14 deletions

third_party/intel/backend/compiler.py

Lines changed: 8 additions & 5 deletions
@@ -218,11 +218,14 @@ def annotate_module(mod, properties, opt, target_arch):
         # Annotate module with information required by subsequent transformations.
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
-        intel.passes.ttgpuir.add_triton_annotate_module(pm, min(properties["sub_group_sizes"]),
-                                                        properties["has_subgroup_2d_block_io"],
-                                                        properties["has_subgroup_matrix_multiply_accumulate"],
-                                                        properties["has_bfloat16_conversions"], opt.threads_per_warp,
-                                                        target_arch)
+        module_opts = intel.passes.ttgpuir.AnnotateModuleOptions()
+        module_opts.min_sg_size = min(properties["sub_group_sizes"])
+        module_opts.support_sg_2d_block = properties["has_subgroup_2d_block_io"]
+        module_opts.support_dpas = properties["has_subgroup_matrix_multiply_accumulate"]
+        module_opts.support_bf16_conversion = properties["has_bfloat16_conversions"]
+        module_opts.threads_per_warp = opt.threads_per_warp
+        module_opts.target_arch = target_arch
+        intel.passes.ttgpuir.add_triton_annotate_module(pm, module_opts)
         pm.run(mod)

     @staticmethod

third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td

Lines changed: 6 additions & 0 deletions
@@ -55,6 +55,12 @@ def TritonIntelGPU_Dialect : Dialect {
     static constexpr llvm::StringRef getTargetArchAttrName() {
       return "triton_intel_gpu.target_arch";
     }
+
+    /// Get the name of the attribute used to indicate whether the native 16bit
+    /// atomic operations are available.
+    static constexpr llvm::StringRef getSupport16BitAtomicsAttrName() {
+      return "triton_intel_gpu.support_16bit_atomics";
+    }
   }];

   let useDefaultAttributePrinterParser = 1;

third_party/intel/include/TritonAnnotateModule/Passes.td

Lines changed: 2 additions & 0 deletions
@@ -33,6 +33,8 @@ def TritonAnnotateModule: Pass<"triton-annotate-module", "mlir::ModuleOp"> {
            "whether DPAS instruction is available">,
     Option<"supportBF16Conversion", "support-bf16-conversion", "bool", /*default*/"false",
            "whether BF16 conversion instruction is available">,
+    Option<"support16BitAtomics", "support-16bit-atomics", "bool", /*default*/"false",
+           "whether 16bit atomic operations are available">,
     Option<"threadsPerWarp", "threads-per-warp",
            "unsigned", /*default*/"32",
            "number of threads per warp (aka subgroup size)">,

third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp

Lines changed: 5 additions & 0 deletions
@@ -40,6 +40,11 @@ struct TritonAnnotateModule
     mod->setAttr(intel::TritonIntelGPUDialect::getTargetArchAttrName(),
                  builder.getStringAttr(targetArch));

+    if (support16BitAtomics)
+      mod->setAttr(
+          intel::TritonIntelGPUDialect::getSupport16BitAtomicsAttrName(),
+          builder.getUnitAttr());
+
     DPASAnalysis &dpasAnalysis = getAnalysis<DPASAnalysis>();
     setThreadsPerWarp(mod, dpasAnalysis);
   }
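The flag is stored as a presence-only mlir::UnitAttr: it carries no value, so downstream code only tests whether the attribute exists, and its absence preserves the old emulation path. A minimal sketch of the consumer side, mirroring the check added in LoadStoreOpToLLVM.cpp below:

// Hedged sketch: with a UnitAttr, presence alone means "true".
bool native16BitAtomics = moduleOp->hasAttr(
    TritonIntelGPUDialect::getSupport16BitAtomicsAttrName());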

third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 3 additions & 1 deletion
@@ -2910,7 +2910,9 @@ struct AtomicRMWOpConversion
     // TODO: check device capabilities to avoid unnecessary emulation or
     // emit unsupported feature error.
     Value ret;
-    if (valueElemNBits == 16) {
+    bool support16BitAtomics = moduleOp->hasAttr(
+        TritonIntelGPUDialect::getSupport16BitAtomicsAttrName());
+    if (valueElemNBits == 16 && !support16BitAtomics) {
       op.emitWarning(
           "'tt.atomic_rmw' op fp16 datatype is not supported in the target "
           "HW, software emulation is an experimental feature (use at own "

third_party/intel/triton_xpu.cc

Lines changed: 24 additions & 8 deletions
@@ -52,11 +52,6 @@ using ret = py::return_value_policy;
   m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2) {       \
     pm.addPass(builder({val0, val1, val2}));                                  \
   })
-#define ADD_PASS_WRAPPER_OPT_6(name, builder, ty0, ty1, ty2, ty3, ty4, ty5)   \
-  m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2,         \
-                 ty3 val3, ty4 val4, ty5 val5) {                              \
-    pm.addPass(builder({val0, val1, val2, val3, val4, val5}));                \
-  })

 static uint32_t findKernels(llvm::Module &M,
                             std::set<llvm::Function *> &functions) {

@@ -103,9 +98,30 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) {
                      gpu::intel::createTritonIntelGPUMatchTargetSize);
   ADD_PASS_WRAPPER_0("add_schedule_load",
                      gpu::intel::createTritonIntelGPUScheduleLoad);
-  ADD_PASS_WRAPPER_OPT_6("add_triton_annotate_module",
-                         gpu::intel::createTritonAnnotateModule, unsigned, bool,
-                         bool, bool, unsigned, const std::string &);
+
+  py::class_<gpu::intel::TritonAnnotateModuleOptions>(m,
+                                                      "AnnotateModuleOptions")
+      .def(py::init<>())
+      .def_readwrite("min_sg_size",
+                     &gpu::intel::TritonAnnotateModuleOptions::minSGSize)
+      .def_readwrite("support_sg_2d_block",
+                     &gpu::intel::TritonAnnotateModuleOptions::supportSG2DBlock)
+      .def_readwrite("support_dpas",
+                     &gpu::intel::TritonAnnotateModuleOptions::supportDPAS)
+      .def_readwrite(
+          "support_bf16_conversion",
+          &gpu::intel::TritonAnnotateModuleOptions::supportBF16Conversion)
+      .def_readwrite(
+          "support_16bit_atomics",
+          &gpu::intel::TritonAnnotateModuleOptions::support16BitAtomics)
+      .def_readwrite("threads_per_warp",
+                     &gpu::intel::TritonAnnotateModuleOptions::threadsPerWarp)
+      .def_readwrite("target_arch",
+                     &gpu::intel::TritonAnnotateModuleOptions::targetArch);
+  ADD_PASS_WRAPPER_OPT_1("add_triton_annotate_module",
+                         gpu::intel::createTritonAnnotateModule,
+                         gpu::intel::TritonAnnotateModuleOptions);
+
   ADD_PASS_WRAPPER_0("add_reduce_data_duplication",
                      gpu::intel::createTritonIntelGPUReduceDataDuplication);
   ADD_PASS_WRAPPER_0("add_materialize_block_pointer",
