Skip to content

Commit fff08ef

Browse files
authored
Use module attribute to specify target arch. (#3387)
This variant makes the target arch attribute mandatory for conversion to the LLVM dialect. We can fall back to the optional attribute if it looks too intrusive. Signed-off-by: Ilya Enkovich <[email protected]>
1 parent 8b1aee9 commit fff08ef

File tree

9 files changed

+41
-12
lines changed

9 files changed

+41
-12
lines changed

test/TritonIntelGPU/triton_annotate_module.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
module {
44
// COM: Ensure that the 'threads-per-warp' attribute is set according to the option.
5-
// CHECK: module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block, "ttg.threads-per-warp" = 32 : i32}
5+
// CHECK: module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.threads-per-warp" = 32 : i32}
66
tt.func @kernel() {
77
tt.return
88
}
@@ -13,7 +13,7 @@ module {
1313
module {
1414
// COM: Ensure that the 'threads-per-warp' attribute is overwritten when the kernel contains a 'tt.dot'
1515
// operation that can be lowered to DPAS instructions.
16-
// CHECK: module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block, "ttg.threads-per-warp" = 16 : i32}
16+
// CHECK: module attributes {triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block, triton_intel_gpu.target_arch = "spir64", "ttg.threads-per-warp" = 16 : i32}
1717
tt.func @kernel() {
1818
%a = arith.constant dense<1.00e+00> : tensor<128x32xf16>
1919
%b = arith.constant dense<2.00e+00> : tensor<32x128xf16>

third_party/intel/backend/compiler.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -252,10 +252,12 @@ def make_ttgir(mod, metadata, opt, properties):
252252
# Annotate module with information required by subsequent transformations.
253253
pm = ir.pass_manager(mod.context)
254254
pm.enable_debug()
255+
target_arch = "spir64"
255256
intel.passes.ttgpuir.add_triton_annotate_module(pm, min(properties["sub_group_sizes"]),
256257
properties["has_subgroup_2d_block_io"],
257258
properties["has_subgroup_matrix_multiply_accumulate"],
258-
properties["has_bfloat16_conversions"], opt.threads_per_warp)
259+
properties["has_bfloat16_conversions"], opt.threads_per_warp,
260+
target_arch)
259261
pm.run(mod)
260262

261263
# Overwrite the threads_per_warp option with the module annotation.

third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUDialect.td

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,13 @@ def TritonIntelGPU_Dialect : Dialect {
4848
static constexpr llvm::StringRef getBlockIOAttrName() {
4949
return "triton_intel_gpu.block_io";
5050
}
51+
52+
/// Get the name of the attribute used to specify the target architecture. This
53+
/// attribute matches the architecture in a target triple used for the resulting LLVM
54+
/// IR module.
55+
static constexpr llvm::StringRef getTargetArchAttrName() {
56+
return "triton_intel_gpu.target_arch";
57+
}
5158
}];
5259

5360
let useDefaultAttributePrinterParser = 1;

third_party/intel/include/Dialect/TritonIntelGPU/IR/Utils.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define TRITON_DIALECT_TRITON_INTEL_GPU_IR_UTILS_H
1111

1212
#include "intel/include/Analysis/AxisInfo.h"
13+
#include "intel/include/Dialect/TritonIntelGPU/IR/Dialect.h"
1314
#include "mlir/IR/Operation.h"
1415
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
1516
#include <triton/Tools/Sys/GetEnv.hpp>
@@ -46,6 +47,16 @@ inline unsigned getNumElementsPerThread(
4647
inline bool applyTransposedReduction() {
4748
return tools::getBoolEnv("TRITON_INTEL_REDUCE_TRANSPOSE");
4849
}
50+
51+
// Check if the module's target arch is SPIR-V. If there is no target arch
52+
// attribute, then we assume a SPIR-V target by default.
53+
inline bool hasSpirvTargetArch(Operation *op) {
54+
if (!isa<ModuleOp>(op))
55+
op = op->getParentOfType<ModuleOp>();
56+
auto arch = op->getAttrOfType<StringAttr>(
57+
triton::gpu::intel::TritonIntelGPUDialect::getTargetArchAttrName());
58+
return !arch || arch.str().substr(0, 4) == "spir";
59+
}
4960
} // namespace mlir::triton::gpu::intel
5061

5162
#endif // TRITON_DIALECT_TRITON_INTEL_GPU_IR_UTILS_H

third_party/intel/include/TritonAnnotateModule/Passes.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ def TritonAnnotateModule: Pass<"triton-annotate-module", "mlir::ModuleOp"> {
3636
Option<"threadsPerWarp", "threads-per-warp",
3737
"unsigned", /*default*/"32",
3838
"number of threads per warp (aka subgroup size)">,
39+
Option<"targetArch", "target-arch", "std::string", /*default*/"\"spir64\"",
40+
"target architecture name">
3941
];
4042
}
4143

third_party/intel/lib/TritonAnnotateModule/TritonAnnotateModule.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,9 @@ struct TritonAnnotateModule
3737
intel::TritonIntelGPUDialect::getSupportBF16ConversionAttrName(),
3838
builder.getUnitAttr());
3939

40+
mod->setAttr(intel::TritonIntelGPUDialect::getTargetArchAttrName(),
41+
builder.getStringAttr(targetArch));
42+
4043
DPASAnalysis &dpasAnalysis = getAnalysis<DPASAnalysis>();
4144
setThreadsPerWarp(mod, dpasAnalysis);
4245
}

third_party/intel/lib/TritonIntelGPUToLLVM/PipelineManager.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "mlir/Dialect/SPIRV/IR/TargetAndABI.h"
2424
#include "mlir/IR/PatternMatch.h"
2525

26+
#include "intel/include/Dialect/TritonIntelGPU/IR/Utils.h"
2627
#include "intel/include/GPUToTritonGEN/GPUToTritonGENPass.h"
2728
#include "intel/include/TritonGENToLLVM/TritonGENToLLVMPass.h"
2829
#include "triton/Analysis/AxisInfo.h"
@@ -143,7 +144,7 @@ struct AddSPIRVEnvPattern : public mlir::OpRewritePattern<ModuleOp> {
143144

144145
LogicalResult matchAndRewrite(ModuleOp op,
145146
PatternRewriter &rewriter) const override {
146-
if (spirv::lookupTargetEnv(op)) {
147+
if (!gpu::intel::hasSpirvTargetArch(op) || spirv::lookupTargetEnv(op)) {
147148
return failure();
148149
}
149150

third_party/intel/lib/TritonIntelGPUToLLVM/TritonGPUToLLVM.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,10 @@ class TritonLLVMConversionTarget : public ConversionTarget {
5959
addIllegalDialect<triton::gpu::intel::TritonIntelGPUDialect>();
6060
addIllegalDialect<mlir::gpu::GPUDialect>();
6161
addLegalOp<mlir::UnrealizedConversionCastOp>();
62-
addDynamicallyLegalOp<ModuleOp>(
63-
[](ModuleOp op) { return spirv::lookupTargetEnv(op) != nullptr; });
62+
addDynamicallyLegalOp<ModuleOp>([](ModuleOp op) {
63+
return !triton::gpu::intel::hasSpirvTargetArch(op) ||
64+
spirv::lookupTargetEnv(op) != nullptr;
65+
});
6466
}
6567
};
6668

third_party/intel/triton_xpu.cc

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -47,10 +47,11 @@ using ret = py::return_value_policy;
4747
m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1) { \
4848
pm.addPass(builder({val0, val1})); \
4949
})
50-
#define ADD_PASS_WRAPPER_OPT_5(name, builder, ty0, ty1, ty2, ty3, ty4) \
51-
m.def(name, \
52-
[](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, ty3 val3, \
53-
ty4 val4) { pm.addPass(builder({val0, val1, val2, val3, val4})); })
50+
#define ADD_PASS_WRAPPER_OPT_6(name, builder, ty0, ty1, ty2, ty3, ty4, ty5) \
51+
m.def(name, [](mlir::PassManager &pm, ty0 val0, ty1 val1, ty2 val2, \
52+
ty3 val3, ty4 val4, ty5 val5) { \
53+
pm.addPass(builder({val0, val1, val2, val3, val4, val5})); \
54+
})
5455

5556
static uint32_t findKernels(llvm::Module &M,
5657
std::set<llvm::Function *> &functions) {
@@ -97,9 +98,9 @@ void init_triton_intel_passes_ttgpuir(py::module &&m) {
9798
gpu::intel::createTritonIntelGPUMatchTargetSize);
9899
ADD_PASS_WRAPPER_0("add_schedule_load",
99100
gpu::intel::createTritonIntelGPUScheduleLoad);
100-
ADD_PASS_WRAPPER_OPT_5("add_triton_annotate_module",
101+
ADD_PASS_WRAPPER_OPT_6("add_triton_annotate_module",
101102
gpu::intel::createTritonAnnotateModule, unsigned, bool,
102-
bool, bool, unsigned);
103+
bool, bool, unsigned, const std::string &);
103104
ADD_PASS_WRAPPER_0("add_reduce_data_duplication",
104105
gpu::intel::createTritonIntelGPUReduceDataDuplication);
105106
ADD_PASS_WRAPPER_0("add_materialize_block_pointer",

0 commit comments

Comments
 (0)