[XPU][TritonIntelGPUToLLVM] Handle arithmetic reductions of i1 values (#3113)

victor-eds · web-flow · commit 66391a3a0647 · 2025-01-09T22:35:53.000-05:00
Use logical operations to represent arithmetic reductions of `i1` values. Only the SPIR-V builtins need to change, as LLVM already handles the scalar arithmetic operations involved. Closes #3109 --------- Signed-off-by: victor-eds <victor.perez@codeplay.com>
diff --git a/test/Conversion/intel/tritongpu_to_gen.mlir b/test/Conversion/intel/tritongpu_to_gen.mlir
@@ -1508,7 +1508,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "triton_
 #blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [32, 1], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 #slice = #ttg.slice<{dim = 0, parent = #blocked}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32} {
-  tt.func public @reduce_all(%arg: tensor<256x1xi32, #blocked>, %arg_0: tensor<256x1xf32, #blocked>) {
+  tt.func public @reduce_all(%arg: tensor<256x1xi32, #blocked>, %arg_0: tensor<256x1xf32, #blocked>, %arg_1: tensor<256x1xi1, #blocked>) {
 
     // CHECK: @_Z27__spirv_GroupNonUniformFAddiif
     %0 = "tt.reduce"(%arg_0) <{axis = 0 : i32}> ({
@@ -1573,6 +1573,48 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
       tt.reduce.return %48 : i32
     }) : (tensor<256x1xi32, #blocked>) -> tensor<1xi32, #slice>
 
+    // CHECK: @_Z33__spirv_GroupNonUniformLogicalXoriib
+    %10 = "tt.reduce"(%arg_1) <{axis = 0 : i32}> ({
+    ^bb0(%arg4: i1, %arg5: i1):
+      %48 = arith.addi %arg4, %arg5 : i1
+      tt.reduce.return %48 : i1
+    }) : (tensor<256x1xi1, #blocked>) -> tensor<1xi1, #slice>
+
+    // CHECK: @_Z33__spirv_GroupNonUniformLogicalAndiib
+    %11 = "tt.reduce"(%arg_1) <{axis = 0 : i32}> ({
+    ^bb0(%arg4: i1, %arg5: i1):
+      %48 = arith.muli %arg4, %arg5 : i1
+      tt.reduce.return %48 : i1
+    }) : (tensor<256x1xi1, #blocked>) -> tensor<1xi1, #slice>
+
+    // CHECK: @_Z33__spirv_GroupNonUniformLogicalAndiib
+    %12 = "tt.reduce"(%arg_1) <{axis = 0 : i32}> ({
+    ^bb0(%arg4: i1, %arg5: i1):
+      %48 = arith.maxsi %arg4, %arg5 : i1
+      tt.reduce.return %48 : i1
+    }) : (tensor<256x1xi1, #blocked>) -> tensor<1xi1, #slice>
+
+    // CHECK: @_Z32__spirv_GroupNonUniformLogicalOriib
+    %13 = "tt.reduce"(%arg_1) <{axis = 0 : i32}> ({
+    ^bb0(%arg4: i1, %arg5: i1):
+      %48 = arith.maxui %arg4, %arg5 : i1
+      tt.reduce.return %48 : i1
+    }) : (tensor<256x1xi1, #blocked>) -> tensor<1xi1, #slice>
+
+    // CHECK: @_Z32__spirv_GroupNonUniformLogicalOriib
+    %14 = "tt.reduce"(%arg_1) <{axis = 0 : i32}> ({
+    ^bb0(%arg4: i1, %arg5: i1):
+      %48 = arith.minsi %arg4, %arg5 : i1
+      tt.reduce.return %48 : i1
+    }) : (tensor<256x1xi1, #blocked>) -> tensor<1xi1, #slice>
+
+    // CHECK: @_Z33__spirv_GroupNonUniformLogicalAndiib
+    %15 = "tt.reduce"(%arg_1) <{axis = 0 : i32}> ({
+    ^bb0(%arg4: i1, %arg5: i1):
+      %48 = arith.minui %arg4, %arg5 : i1
+      tt.reduce.return %48 : i1
+    }) : (tensor<256x1xi1, #blocked>) -> tensor<1xi1, #slice>
+
     tt.return
   }
 }
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/SPIRVSubgroupOps.h b/third_party/intel/lib/TritonIntelGPUToLLVM/SPIRVSubgroupOps.h
@@ -21,56 +21,50 @@ using namespace mlir;
 
 namespace mlir::triton::intel {
 
-template <typename OpTy> struct SPIRVArithmeticGroupOp {};
+template <typename OpTy> struct SPIRVGroupOp {};
 
-template <> struct SPIRVArithmeticGroupOp<arith::AddFOp> {
+template <> struct SPIRVGroupOp<arith::AddFOp> {
   using type = spirv::GroupNonUniformFAddOp;
 };
-template <> struct SPIRVArithmeticGroupOp<arith::AddIOp> {
+template <> struct SPIRVGroupOp<arith::AddIOp> {
   using type = spirv::GroupNonUniformIAddOp;
 };
-template <> struct SPIRVArithmeticGroupOp<arith::MulFOp> {
+template <> struct SPIRVGroupOp<arith::MulFOp> {
   using type = spirv::GroupNonUniformFMulOp;
 };
-template <> struct SPIRVArithmeticGroupOp<arith::MulIOp> {
+template <> struct SPIRVGroupOp<arith::MulIOp> {
   using type = spirv::GroupNonUniformIMulOp;
 };
-template <> struct SPIRVArithmeticGroupOp<arith::MaxSIOp> {
+template <> struct SPIRVGroupOp<arith::MaxSIOp> {
   using type = spirv::GroupNonUniformSMaxOp;
 };
-template <> struct SPIRVArithmeticGroupOp<arith::MaxUIOp> {
+template <> struct SPIRVGroupOp<arith::MaxUIOp> {
   using type = spirv::GroupNonUniformUMaxOp;
 };
-template <> struct SPIRVArithmeticGroupOp<arith::MinSIOp> {
+template <> struct SPIRVGroupOp<arith::MinSIOp> {
   using type = spirv::GroupNonUniformSMinOp;
 };
-template <> struct SPIRVArithmeticGroupOp<arith::MinUIOp> {
+template <> struct SPIRVGroupOp<arith::MinUIOp> {
   using type = spirv::GroupNonUniformUMinOp;
 };
-template <> struct SPIRVArithmeticGroupOp<arith::MaxNumFOp> {
+template <> struct SPIRVGroupOp<arith::MaxNumFOp> {
   using type = spirv::GroupNonUniformFMaxOp;
 };
-template <> struct SPIRVArithmeticGroupOp<arith::MinNumFOp> {
+template <> struct SPIRVGroupOp<arith::MinNumFOp> {
   using type = spirv::GroupNonUniformFMinOp;
 };
-
-template <typename OpTy>
-using SPIRVArithmeticGroupOpTy = typename SPIRVArithmeticGroupOp<OpTy>::type;
-
-template <typename OpTy> struct SPIRVBitwiseGroupOp {};
-
-template <> struct SPIRVBitwiseGroupOp<arith::AndIOp> {
+template <> struct SPIRVGroupOp<arith::AndIOp> {
   using type = spirv::GroupNonUniformBitwiseAndOp;
 };
-template <> struct SPIRVBitwiseGroupOp<arith::OrIOp> {
+template <> struct SPIRVGroupOp<arith::OrIOp> {
   using type = spirv::GroupNonUniformBitwiseOrOp;
 };
-template <> struct SPIRVBitwiseGroupOp<arith::XOrIOp> {
+template <> struct SPIRVGroupOp<arith::XOrIOp> {
   using type = spirv::GroupNonUniformBitwiseXorOp;
 };
 
 template <typename OpTy>
-using SPIRVBitwiseGroupOpTy = typename SPIRVBitwiseGroupOp<OpTy>::type;
+using SPIRVGroupOpTy = typename SPIRVGroupOp<OpTy>::type;
 
 template <typename OpTy> struct SPIRVLogicalGroupOp {};
 
@@ -83,6 +77,28 @@ template <> struct SPIRVLogicalGroupOp<arith::OrIOp> {
 template <> struct SPIRVLogicalGroupOp<arith::XOrIOp> {
   using type = spirv::GroupNonUniformLogicalXorOp;
 };
+// Arithmetic reductions of `i1` values, mapped to the logical operation with
+// the same truth table: `addi` wraps modulo 2 (XOR), `muli` is AND, unsigned
+// max/min are OR/AND, and signed max/min are AND/OR because a signed `i1`
+// holds 0 or -1.
+template <> struct SPIRVLogicalGroupOp<arith::AddIOp> {
+  using type = spirv::GroupNonUniformLogicalXorOp;
+};
+template <> struct SPIRVLogicalGroupOp<arith::MulIOp> {
+  using type = spirv::GroupNonUniformLogicalAndOp;
+};
+template <> struct SPIRVLogicalGroupOp<arith::MaxUIOp> {
+  using type = spirv::GroupNonUniformLogicalOrOp;
+};
+template <> struct SPIRVLogicalGroupOp<arith::MaxSIOp> {
+  using type = spirv::GroupNonUniformLogicalAndOp;
+};
+template <> struct SPIRVLogicalGroupOp<arith::MinUIOp> {
+  using type = spirv::GroupNonUniformLogicalAndOp;
+};
+template <> struct SPIRVLogicalGroupOp<arith::MinSIOp> {
+  using type = spirv::GroupNonUniformLogicalOrOp;
+};
 
 template <typename OpTy>
 using SPIRVLogicalGroupOpTy = typename SPIRVLogicalGroupOp<OpTy>::type;
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TargetInfo.cpp
@@ -133,25 +133,23 @@ Value warpReduceHelper(RewriterBase &rewriter, Location loc, Value acc,
                        Operation *reduceOp, unsigned numLanesToReduce,
                        unsigned warpSize) {
   auto resultType = reduceOp->getResult(0).getType();
-  Value warpReduce =
-      TypeSwitch<mlir::Operation *, Value>(reduceOp)
-          .Case<arith::AddFOp, arith::AddIOp, arith::MulFOp, arith::MulIOp,
-                arith::MaxSIOp, arith::MaxUIOp, arith::MinSIOp, arith::MinUIOp,
-                arith::MaxNumFOp, arith::MinNumFOp>([&](auto groupOp) {
-            return createSPIRVGroupOp<
-                SPIRVArithmeticGroupOpTy<decltype(groupOp)>>(
-                rewriter, loc, resultType, acc, numLanesToReduce, warpSize);
-          })
-          .Case<arith::AndIOp, arith::OrIOp, arith::XOrIOp>([&](auto groupOp) {
-            if (resultType.isInteger(1)) {
-              return createSPIRVGroupOp<
-                  SPIRVLogicalGroupOpTy<decltype(groupOp)>>(
-                  rewriter, loc, resultType, acc, numLanesToReduce, warpSize);
-            }
-            return createSPIRVGroupOp<SPIRVBitwiseGroupOpTy<decltype(groupOp)>>(
-                rewriter, loc, resultType, acc, numLanesToReduce, warpSize);
-          });
-  return warpReduce;
+  // Use bit-equivalent logical operation for Boolean values.
+  if (resultType.isInteger(1))
+    return TypeSwitch<mlir::Operation *, Value>(reduceOp)
+        .Case<arith::AddIOp, arith::MulIOp, arith::MaxSIOp, arith::MaxUIOp,
+              arith::MinSIOp, arith::MinUIOp, arith::AndIOp, arith::OrIOp,
+              arith::XOrIOp>([&](auto groupOp) {
+          return createSPIRVGroupOp<SPIRVLogicalGroupOpTy<decltype(groupOp)>>(
+              rewriter, loc, resultType, acc, numLanesToReduce, warpSize);
+        });
+  return TypeSwitch<mlir::Operation *, Value>(reduceOp)
+      .Case<arith::AddFOp, arith::AddIOp, arith::MulFOp, arith::MulIOp,
+            arith::MaxSIOp, arith::MaxUIOp, arith::MinSIOp, arith::MinUIOp,
+            arith::MaxNumFOp, arith::MinNumFOp, arith::AndIOp, arith::OrIOp,
+            arith::XOrIOp>([&](auto groupOp) {
+        return createSPIRVGroupOp<SPIRVGroupOpTy<decltype(groupOp)>>(
+            rewriter, loc, resultType, acc, numLanesToReduce, warpSize);
+      });
 }
 
 } // namespace
diff --git a/third_party/intel/lib/TritonIntelGPUToLLVM/TritonOpsToLLVM.cpp b/third_party/intel/lib/TritonIntelGPUToLLVM/TritonOpsToLLVM.cpp
@@ -598,8 +598,7 @@ class ReduceOpConversion : public ConvertTritonGPUOpToLLVMPattern<ReduceOp> {
     // FIXME: support all possible reduction modes
     TypeSwitch<Operation *>(combine).Case<arith::AddFOp, arith::MaxNumFOp>(
         [&](auto reduce) {
-          rewriter.replaceOpWithNewOp<
-              intel::SPIRVArithmeticGroupOpTy<decltype(reduce)>>(
+          rewriter.replaceOpWithNewOp<intel::SPIRVGroupOpTy<decltype(reduce)>>(
               op, typeConverter->convertType(op.getType(0)),
               spirv::Scope::Subgroup, spirv::GroupOperation::Reduce,
               adaptor.getSrcs()[0], Value());