
Commit c788bfd

Add attribute to control 16-bit atomics lowering. (#4149)
Currently, we always emulate 16-bit atomic operations. The new attribute lets targets disable the emulation.

As the number of feature attributes has grown, I changed how we pass them to the annotate-module pass: passing all features in a single structure simplifies adding a new attribute, allows the use of default values, and reduces merge conflicts.

Signed-off-by: Ilya Enkovich <[email protected]>
1 parent: 0ca38b7
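For orientation, a minimal sketch of the new struct-based calling convention on the C++ side, not taken verbatim from the commit: the names below come from the bindings added in triton_xpu.cc, and any field left unset keeps its Passes.td default (e.g. support16BitAtomics = false, threadsPerWarp = 32); the concrete values are illustrative assumptions.

// Hedged sketch: populate the generated options struct, then create the pass.
gpu::intel::TritonAnnotateModuleOptions opts;
opts.minSGSize = 16;              // assumed value: smallest supported sub-group size
opts.support16BitAtomics = true;  // the new flag: native 16-bit atomics available
pm.addPass(gpu::intel::createTritonAnnotateModule(opts));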

6 files changed: 48 additions, 14 deletions

third_party/intel/backend/compiler.py

Lines changed: 8 additions & 5 deletions
@@ -218,11 +218,14 @@ def annotate_module(mod, properties, opt, target_arch):
         # Annotate module with information required by subsequent transformations.
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
-        intel.passes.ttgpuir.add_triton_annotate_module(pm, min(properties["sub_group_sizes"]),
-                                                        properties["has_subgroup_2d_block_io"],
-                                                        properties["has_subgroup_matrix_multiply_accumulate"],
-                                                        properties["has_bfloat16_conversions"], opt.threads_per_warp,
-                                                        target_arch)
+        module_opts = intel.passes.ttgpuir.AnnotateModuleOptions()
+        module_opts.min_sg_size = min(properties["sub_group_sizes"])
+        module_opts.support_sg_2d_block = properties["has_subgroup_2d_block_io"]
+        module_opts.support_dpas = properties["has_subgroup_matrix_multiply_accumulate"]
+        module_opts.support_bf16_conversion = properties["has_bfloat16_conversions"]
+        module_opts.threads_per_warp = opt.threads_per_warp
+        module_opts.target_arch = target_arch
+        intel.passes.ttgpuir.add_triton_annotate_module(pm, module_opts)
         pm.run(mod)

     @staticmethod

third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td

Lines changed: 6 additions & 0 deletions
@@ -55,6 +55,12 @@ def TritonIntelGPU_Dialect : Dialect {
     static constexpr llvm::StringRef getTargetArchAttrName() {
       return "triton_intel_gpu.target_arch";
     }
+
+    /// Get the name of the attribute used to indicate whether the native 16bit
+    /// atomic operations are available.
+    static constexpr llvm::StringRef getSupport16BitAtomicsAttrName() {
+      return "triton_intel_gpu.support_16bit_atomics";
+    }
   }];

   let useDefaultAttributePrinterParser = 1;

third_party/intel/include/TritonAnnotateModule/Passes.td

Lines changed: 2 additions & 0 deletions
@@ -33,6 +33,8 @@ def TritonAnnotateModule: Pass<"triton-annotate-module", "mlir::ModuleOp"> {
            "whether DPAS instruction is available">,
     Option<"supportBF16Conversion", "support-bf16-conversion", "bool", /*default*/"false",
            "whether BF16 conversion instruction is available">,
+    Option<"support16BitAtomics", "support-16bit-atomics", "bool", /*default*/"false",
+           "whether 16bit atomic operations are available">,
     Option<"threadsPerWarp", "threads-per-warp",
            "unsigned", /*default*/"32",
            "number of threads per warp (aka subgroup size)">,

third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp

Lines changed: 5 additions & 0 deletions
@@ -40,6 +40,11 @@ struct TritonAnnotateModule
     mod->setAttr(intel::TritonIntelGPUDialect::getTargetArchAttrName(),
                  builder.getStringAttr(targetArch));

+    if (support16BitAtomics)
+      mod->setAttr(
+          intel::TritonIntelGPUDialect::getSupport16BitAtomicsAttrName(),
+          builder.getUnitAttr());
+
     DPASAnalysis &dpasAnalysis = getAnalysis<DPASAnalysis>();
     setThreadsPerWarp(mod, dpasAnalysis);
   }
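The flag is stored as a presence-only mlir::UnitAttr: it carries no value, so downstream code only tests whether the attribute exists, and its absence preserves the old emulation path. A minimal sketch of the consumer side, mirroring the check added in LoadStoreOpToLLVM.cpp below:

// Hedged sketch: with a UnitAttr, presence alone means "true".
bool native16BitAtomics = moduleOp->hasAttr(
    TritonIntelGPUDialect::getSupport16BitAtomicsAttrName());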

third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 3 additions & 1 deletion
@@ -2910,7 +2910,9 @@ struct AtomicRMWOpConversion
     // TODO: check device capabilities to avoid unnecessary emulation or
     // emit unsupported feature error.
     Value ret;
-    if (valueElemNBits == 16) {
+    bool support16BitAtomics = moduleOp->hasAttr(
+        TritonIntelGPUDialect::getSupport16BitAtomicsAttrName());
+    if (valueElemNBits == 16 && !support16BitAtomics) {
       op.emitWarning(
           "'tt.atomic_rmw' op fp16 datatype is not supported in the target "
           "HW, software emulation is an experimental feature (use at own "

third_party/intel/triton_xpu.cc

Lines changed: 24 additions & 8 deletions
@@ -52,11 +52,6 @@ using ret = py::return_value_policy;
   m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2) {       \
     pm.addPass(builder({val0, val1, val2}));                                  \
   })
-#define ADD_PASS_WRAPPER_OPT_6(name, builder, ty0, ty1, ty2, ty3, ty4, ty5)   \
-  m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2,         \
-                 ty3 val3, ty4 val4, ty5 val5) {                              \
-    pm.addPass(builder({val0, val1, val2, val3, val4, val5}));                \
-  })

 static uint32_t findKernels(llvm::Module &M,
                             std::set<llvm::Function *> &functions) {

@@ -103,9 +98,30 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) {
                      gpu::intel::createTritonIntelGPUMatchTargetSize);
   ADD_PASS_WRAPPER_0("add_schedule_load",
                      gpu::intel::createTritonIntelGPUScheduleLoad);
-  ADD_PASS_WRAPPER_OPT_6("add_triton_annotate_module",
-                         gpu::intel::createTritonAnnotateModule, unsigned, bool,
-                         bool, bool, unsigned, const std::string &);
+
+  py::class_<gpu::intel::TritonAnnotateModuleOptions>(m,
+                                                      "AnnotateModuleOptions")
+      .def(py::init<>())
+      .def_readwrite("min_sg_size",
+                     &gpu::intel::TritonAnnotateModuleOptions::minSGSize)
+      .def_readwrite("support_sg_2d_block",
+                     &gpu::intel::TritonAnnotateModuleOptions::supportSG2DBlock)
+      .def_readwrite("support_dpas",
+                     &gpu::intel::TritonAnnotateModuleOptions::supportDPAS)
+      .def_readwrite(
+          "support_bf16_conversion",
+          &gpu::intel::TritonAnnotateModuleOptions::supportBF16Conversion)
+      .def_readwrite(
+          "support_16bit_atomics",
+          &gpu::intel::TritonAnnotateModuleOptions::support16BitAtomics)
+      .def_readwrite("threads_per_warp",
+                     &gpu::intel::TritonAnnotateModuleOptions::threadsPerWarp)
+      .def_readwrite("target_arch",
+                     &gpu::intel::TritonAnnotateModuleOptions::targetArch);
+  ADD_PASS_WRAPPER_OPT_1("add_triton_annotate_module",
+                         gpu::intel::createTritonAnnotateModule,
+                         gpu::intel::TritonAnnotateModuleOptions);
+
   ADD_PASS_WRAPPER_0("add_reduce_data_duplication",
                      gpu::intel::createTritonIntelGPUReduceDataDuplication);
   ADD_PASS_WRAPPER_0("add_materialize_block_pointer",
