Add TRITONGEN_FORCE_GENISA env var to force GenISA lowering

chengjunlu · chengjunlu · commit d1dd73698dea · 2025-07-02T15:28:34.000Z
Signed-off-by: Lu,Chengjun <chengjun.lu@intel.com>
diff --git a/include/triton/Tools/Sys/GetEnv.hpp b/include/triton/Tools/Sys/GetEnv.hpp
@@ -50,6 +50,10 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "TRITON_INTEL_ENABLE_INSTR_SCHED",
     "TRITON_INTEL_FAST_MATH",
     "TRITON_INTEL_REDUCE_TRANSPOSE",
+    "TRITON_INTEL_ENABLE_SIMD_REDUCE",
+    "TRITON_INTEL_ENHANCED_ACCELERATION_MATMUL",
+    "TRITON_INTEL_ENABLE_DPAS_WARP_SIZE_32",
+    "TRITONGEN_FORCE_GENISA",
     // clang-format on
 };
 
diff --git a/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENAttrDefs.td b/third_party/intel/include/Dialect/TritonGEN/IR/TritonGENAttrDefs.td
@@ -55,9 +55,9 @@ def TritonGEN_PrecisionTypeAttr : I32EnumAttr<"PrecisionType",
     I32EnumAttrCase<"S4",     5,  "i4">,
     I32EnumAttrCase<"S2",     6,  "i2">,
     I32EnumAttrCase<"BF8",    7,  "bf8">,
-    I32EnumAttrCase<"TF32",   8,  "tf32">,
-    I32EnumAttrCase<"BF16",   9,  "bf16">,
-    I32EnumAttrCase<"FP16",   10, "f16">
+    I32EnumAttrCase<"TF32",   10,  "tf32">,
+    I32EnumAttrCase<"BF16",   11,  "bf16">,
+    I32EnumAttrCase<"FP16",   12, "f16">
   ]> {
   let cppNamespace = "::mlir::triton::TritonGEN";
 }
diff --git a/third_party/intel/lib/Dialect/TritonGEN/IR/TritonGENOps.cpp b/third_party/intel/lib/Dialect/TritonGEN/IR/TritonGENOps.cpp
@@ -85,12 +85,15 @@ LogicalResult TritonGEN::MatrixDPASOp::verify() {
     return this->emitOpError(
         "1st operand (C) and result (D) should have the same type");
 
-  if (CTy.getNumElements() != getRc() || DTy.getNumElements() != getRc())
+  auto useGenISA = tools::getBoolEnv("TRITONGEN_FORCE_GENISA");
+
+  if (!useGenISA &&
+      (CTy.getNumElements() != getRc() || DTy.getNumElements() != getRc()))
     return this->emitOpError("the dimension for 1st operand (C) and "
                              "result (D) should match repeat count");
 
   constexpr unsigned SD = 8;
-  if (BTy.getNumElements() != SD)
+  if (!useGenISA && BTy.getNumElements() != SD)
     return this->emitOpError("the dimension for the 3rd operand (B) should "
                              "match the systolic depth of 8");
 
@@ -141,7 +144,7 @@ LogicalResult TritonGEN::MatrixDPASOp::verify() {
   case TritonGEN::PrecisionType::FP16:
   case TritonGEN::PrecisionType::U8:
   case TritonGEN::PrecisionType::S8:
-    if (ATy.getNumElements() != getRc())
+    if (!useGenISA && ATy.getNumElements() != getRc())
       return this->emitOpError("2nd operand (A) should have the same number of "
                                "elements as repeat count");
     if (!AElemTy.isInteger(16))
@@ -303,6 +306,9 @@ LogicalResult TritonGEN::Matrix2DBlockLoadOp::verify() {
   if (verify2DBlockLoadHWRestriction(*this).failed())
     return failure();
 
+  if (tools::getBoolEnv("TRITONGEN_FORCE_GENISA"))
+    return success();
+
   if (verifyMatrixInput(*this).failed())
     return failure();
 
@@ -367,6 +373,9 @@ LogicalResult TritonGEN::Matrix2DBlockStoreOp::verify() {
   if (verify2DBlockStoreHWRestriction(*this).failed())
     return failure();
 
+  if (tools::getBoolEnv("TRITONGEN_FORCE_GENISA"))
+    return success();
+
   if (verifyMatrixInput(*this).failed())
     return failure();
 
diff --git a/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp b/third_party/intel/lib/TritonGENToLLVM/TritonGENToLLVMPass.cpp
@@ -40,6 +40,10 @@
 #include "intel/include/TritonGENToLLVM/TritonGENToLLVMPass.h"
 #include "intel/include/TritonGENToSPIRV/TritonGENToSPIRVPass.h"
 
+#include <triton/Tools/Sys/GetEnv.hpp>
+
+#include "GenIntrinsicHelper.h"
+
 namespace mlir::triton {
 #define GEN_PASS_DEF_CONVERTTRITONGENTOLLVM
 #include "intel/include/TritonGENToLLVM/Passes.h.inc"
@@ -431,27 +435,48 @@ struct TritonMatrixDPASLowering
     if (cOrigTy != cTy)
       c = rewriter.create<LLVM::BitcastOp>(loc, cTy, c);
 
-    std::string fnName = "__spirv_SubgroupMatrixMultiplyAccumulateINTEL";
-    SmallVector<Type> argTypes{int32Ty, aTy, bTy, cTy, int32Ty};
-    fnName = intel::mangle(fnName, argTypes);
-
-    TritonLLVMOpBuilder builder(loc, rewriter);
-    Value kDim = builder.i32_val(8 /*systolic depth*/ *
-                                 getNumOperandsPerDword(precisionA));
-    SmallVector<Value> args{
-        kDim, a, b, c,
-        builder.i32_val(getMatrixMultiplyAccumulateOperandsVal(
-            cOrigTy.getElementType(), precisionA))};
-    auto memAttr = rewriter.getAttr<LLVM::MemoryEffectsAttr>(
-        /*other=*/LLVM::ModRefInfo::NoModRef,
-        /*argMem=*/LLVM::ModRefInfo::NoModRef,
-        /*inaccessibleMem=*/LLVM::ModRefInfo::NoModRef);
-    auto funcAttrs = intel::convergentNoUnwindWillReturnAttrs;
-    funcAttrs.memEffectsAttr = memAttr;
+    Value result;
+    if (tools::getBoolEnv("TRITONGEN_FORCE_GENISA")) {
+      MLIRContext *ctx = rewriter.getContext();
+      auto builder = TritonLLVMOpBuilder(loc, rewriter);
+      mlir::triton::gpu::intel::GenISA_Dpas dpasOp(rewriter, cTy, cTy, aTy,
+                                                   bTy);
+
+      // Refer to the GenISA call signature for the argument order.
+      result =
+          dpasOp(rewriter, loc, c, a, b,
+                 builder.i32_val(
+                     static_cast<unsigned>(precisionA)), /*src0's precision*/
+                 builder.i32_val(
+                     static_cast<unsigned>(op.getPb())), /*src1's precision*/
+                 builder.i32_val(8),                     /*systolic depth*/
+                 builder.i32_val(8),                     /*repeat count*/
+                 builder.int_val(1, 0) /*is double = false*/)
+              ->getResult(0);
+    } else {
+      std::string fnName = "__spirv_SubgroupMatrixMultiplyAccumulateINTEL";
+      SmallVector<Type> argTypes{int32Ty, aTy, bTy, cTy, int32Ty};
+      fnName = intel::mangle(fnName, argTypes);
+
+      TritonLLVMOpBuilder builder(loc, rewriter);
+      Value kDim = builder.i32_val(8 /*systolic depth*/ *
+                                   getNumOperandsPerDword(precisionA));
+      SmallVector<Value> args{
+          kDim, a, b, c,
+          builder.i32_val(getMatrixMultiplyAccumulateOperandsVal(
+              cOrigTy.getElementType(), precisionA))};
+      auto memAttr = rewriter.getAttr<LLVM::MemoryEffectsAttr>(
+          /*other=*/LLVM::ModRefInfo::NoModRef,
+          /*argMem=*/LLVM::ModRefInfo::NoModRef,
+          /*inaccessibleMem=*/LLVM::ModRefInfo::NoModRef);
+      auto funcAttrs = intel::convergentNoUnwindWillReturnAttrs;
+      funcAttrs.memEffectsAttr = memAttr;
+
+      result = intel::createDeviceFunctionCall(rewriter, fnName, cTy, argTypes,
+                                               args, {}, funcAttrs)
+                   ->getResult(0);
+    }
 
-    Value result = intel::createDeviceFunctionCall(
-                       rewriter, fnName, cTy, argTypes, args, {}, funcAttrs)
-                       ->getResult(0);
     if (cOrigTy != cTy)
       result = rewriter.create<LLVM::BitcastOp>(loc, cOrigTy, result);
 
@@ -508,7 +533,8 @@ struct TritonMatrix2DBlockLoadLowering
   LogicalResult
   matchAndRewrite(TritonGEN::Matrix2DBlockLoadOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    if (!isSPVBuiltinAvailable(op)) {
+    if (tools::getBoolEnv("TRITONGEN_FORCE_GENISA") ||
+        !isSPVBuiltinAvailable(op)) {
       // Fallback to GenISA interface.
       rewriter.replaceOp(op, createGenISA2DBlockRead(op, rewriter));
       return success();
@@ -583,6 +609,12 @@ struct TritonMatrix2DBlockStoreLowering
   LogicalResult
   matchAndRewrite(TritonGEN::Matrix2DBlockStoreOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    // TODO: Remove GenISA lowering after PoC productization is completed.
+    if (tools::getBoolEnv("TRITONGEN_FORCE_GENISA")) {
+      rewriter.replaceOp(op, createGenISA2DBlockWrite(op, rewriter));
+      return success();
+    }
+
     MLIRContext *ctx = rewriter.getContext();
     Location loc = op->getLoc();
     auto b = TritonLLVMOpBuilder(loc, rewriter);
@@ -651,6 +683,13 @@ struct TritonMatrix2DBlockPrefetchLowering
   LogicalResult
   matchAndRewrite(TritonGEN::Matrix2DBlockPrefetchOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    // TODO: Remove GenISA lowering after PoC productization is completed.
+    bool useGenISA = tools::getBoolEnv("TRITONGEN_FORCE_GENISA");
+    if (useGenISA) {
+      rewriter.replaceOp(op, createGenISA2DBlockPrefetch(op, rewriter));
+      return success();
+    }
+
     MLIRContext *ctx = rewriter.getContext();
     Location loc = op->getLoc();
     auto b = TritonLLVMOpBuilder(loc, rewriter);