
Commit e15146b

update transpose injection when converting tt.reduce to linalg.reduce (#312)
When converting tt.reduce to linalg.reduce, we insert a transpose so that the reduction axis becomes 0. Once the reduce axis is 0, the remaining dimensions can be collapsed, which makes the reduction easier and more efficient to vectorize. The pseudocode is as follows.

Before:

```
... = tt.reduce axis=[x]
```

After, if x != 0:

```
%v = linalg.transpose permutation = [x, ...]
... = linalg.reduce %v dimensions = [0]
```

The test case is `python/examples/test_reduce.py::test_reduce_max`.
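For intuition, here is a standalone check of the rewrite (a sketch, not code from this commit: it uses `torch`, which the tests below already depend on, and max as the combine op). Building the permutation `[axis, <remaining dims in order>]` and reducing the transposed tensor over dimension 0 reproduces the original reduction along `axis`:

```python
import torch

def reduce_via_transpose(x: torch.Tensor, axis: int) -> torch.Tensor:
    # Same permutation the conversion pattern builds: the reduce axis
    # first, then the remaining dimensions in their original order.
    order = [axis] + [i for i in range(x.dim()) if i != axis]
    # After the transpose, the reduction always runs over dimension 0.
    return x.permute(order).amax(dim=0)

x = torch.arange(32 * 4 * 16, dtype=torch.float32).reshape(32, 4, 16)
for axis in range(x.dim()):
    assert torch.equal(reduce_via_transpose(x, axis), x.amax(dim=axis))
```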
1 parent 859ede9 commit e15146b

File tree: 6 files changed, +90 -37 lines

.clang-format

Lines changed: 1 addition & 0 deletions
```diff
@@ -0,0 +1 @@
+triton/.clang-format
```

include/triton-shared/Conversion/TritonArithToLinalg/ConversionPatterns.hpp

Lines changed: 18 additions & 7 deletions
```diff
@@ -11,9 +11,9 @@
 #include "triton-shared/Analysis/MaskAnalysis.h"
 #include "triton-shared/Analysis/OpFoldResultUtils.h"
 #include "triton-shared/Analysis/PtrAnalysis.h"
+#include "triton-shared/Conversion/TritonArithToLinalg/ConversionTools.h"
 #include "triton-shared/Dialect/TritonTilingExt/IR/TritonTilingExtDialect.h"
 #include "triton-shared/Utils/Utils.h"
-#include "triton-shared/Conversion/TritonArithToLinalg/ConversionTools.h"

 #include "triton/Dialect/Triton/IR/Dialect.h"

@@ -1284,11 +1284,22 @@ struct ReduceConverter : public OpConversionPattern<triton::ReduceOp> {

     auto rop = reductionOps.front();
     auto axis = op.getAxis();
-    auto isVectorReduce = sourceType.getRank() == 1;
-
-    if (axis == sourceType.getRank() - 1 && !isVectorReduce) {
-      source = getTransposedValue(source, op.getLoc(), rewriter);
-      axis = sourceType.getRank() - 2;
+    auto rank = sourceType.getRank();
+    auto isVectorReduce = (rank == 1);
+
+    // if it is not a vector reduce, we can transpose the source
+    // so that the reduction axis is the first dimension.
+    if (!isVectorReduce && axis != 0) {
+      SmallVector<int32_t> order;
+      order.reserve(rank);
+      order.push_back(axis);
+      for (int i = 0; i < rank; ++i) {
+        if (i != axis) {
+          order.push_back(i);
+        }
+      }
+      source = getTransposedValue(source, op.getLoc(), rewriter, order);
+      axis = 0;
     }

     bool convertToF32Precision = requiresF32Conversion(resType, rop);

@@ -1334,7 +1345,7 @@ struct ReduceConverter : public OpConversionPattern<triton::ReduceOp> {
             })
            .getResult(0);

-    if (sourceType.getRank() == 1) {
+    if (isVectorReduce) {
      finalResult =
          rewriter.create<tensor::ExtractOp>(loc, constantType, finalResult);
     }
```
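The comment in the new code points at why axis 0 is the preferred position. A rough sketch of the vectorization argument (illustrative Python with assumed shapes, not code from this repository): once the reduce axis is first, the remaining dimensions can be flattened into one contiguous trailing dimension, so each accumulation step becomes a single contiguous vector operation.

```python
import torch

def axis0_reduce_collapsed(x: torch.Tensor) -> torch.Tensor:
    # With the reduce axis at dimension 0, dims 1..rank-1 collapse into one
    # contiguous column; each loop step below is then one vector-wide max
    # over all remaining elements, which is straightforward to vectorize.
    rest = x.shape[1:]
    flat = x.reshape(x.shape[0], -1)
    acc = flat[0].clone()
    for row in flat[1:]:
        acc = torch.maximum(acc, row)
    return acc.reshape(rest)

x = torch.rand(8, 32, 16)
assert torch.equal(axis0_reduce_collapsed(x), x.amax(dim=0))
```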

python/examples/test_reduce.py

Lines changed: 49 additions & 14 deletions
```diff
@@ -1,5 +1,6 @@
 import torch
-
+import pytest
+import math
 import triton
 from triton.backends.compiler import GPUTarget
 import triton.language as tl
@@ -35,27 +36,61 @@ def test(device):
     x = torch.rand([n_cols, n_rows], device=device, dtype=torch.float32)
     output = torch.empty([n_cols], device=device, dtype=x.dtype)
     BLOCK_SIZE = n_rows
-    grid = lambda meta: (n_cols,)
+    grid = lambda meta: (n_cols, )

     reduce_kernel_2d[grid](x, output, x.stride(0), n_rows, BLOCK_SIZE=BLOCK_SIZE)
     ans = torch.sum(x, dim=1)
     torch.testing.assert_close(output, ans, rtol=0.001, atol=1e-5)

     # TODO: need to check some conditions otherwise the code below does not make any difference for the test
     src = triton.compiler.ASTSource(
-        fn=reduce_kernel_2d,
-        signature={"x_ptr": "*fp32",
-                   "output_ptr": "*fp32",
-                   "stride": "i32",
-                   "n_elements": "i32",
-                   "BLOCK_SIZE": "constexpr"},
-        constexprs={"BLOCK_SIZE": 32}
-    )
-    ret = triton.compile(
-        src,
-        target=GPUTarget(device, 0, 0)
-    )
+        fn=reduce_kernel_2d, signature={
+            "x_ptr": "*fp32", "output_ptr": "*fp32", "stride": "i32", "n_elements": "i32", "BLOCK_SIZE": "constexpr"
+        }, constexprs={"BLOCK_SIZE": 32})
+    ret = triton.compile(src, target=GPUTarget(device, 0, 0))
     print(ret.asm["ttir"])
     print(ret.asm["ttsharedir"])
     print(ret.asm["llir"])
     print(ret.asm["obj"])
+
+
+@pytest.mark.interpreter
+@pytest.mark.parametrize("dtype_str", ["int32", "float32"])
+@pytest.mark.parametrize("shape", [(128, 2, 4), (64, 2, 4), (32, 2, 4), (2, 4, 32), (2, 4, 2)])
+@pytest.mark.parametrize("axis", [0, 1, 2])
+def test_reduce_max(dtype_str, shape, axis, device):
+
+    @triton.jit
+    def kernel(
+        In,
+        Out,
+        in_shape1: tl.constexpr,
+        in_shape2: tl.constexpr,
+        in_shape3: tl.constexpr,
+        ou_shape1: tl.constexpr,
+        ou_shape2: tl.constexpr,
+        axis: tl.constexpr,
+    ):
+        in_desc = tl.make_tensor_descriptor(
+            base=In,
+            shape=[in_shape1 * in_shape2 * in_shape3],
+            strides=[1],
+            block_shape=[in_shape1 * in_shape2 * in_shape3],
+        )
+        out_desc = tl.make_tensor_descriptor(
+            base=Out,
+            shape=[ou_shape1 * ou_shape2],
+            strides=[1],
+            block_shape=[ou_shape1 * ou_shape2],
+        )
+        val = in_desc.load([0]).reshape(in_shape1, in_shape2, in_shape3)
+        output = tl.max(val, axis=axis)
+        out_desc.store([0], output.reshape(out_desc.block_shape))
+
+    input = torch.arange(math.prod(shape), dtype=getattr(torch, dtype_str),
+                         device="cpu").reshape(shape).to(device=device)
+    expected, indices = torch.max(input, dim=axis)
+    actual = torch.zeros(expected.shape, dtype=getattr(torch, dtype_str), device=device)
+    kernel[(1, )](input, actual, *shape, *expected.shape, axis=axis)
+
+    assert torch.equal(expected, actual)
```
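The three lit-test updates below all exercise the same case: a 32x256x16 sum over the middle axis, which now lowers to a `linalg.transpose` with `permutation = [1, 0, 2]` feeding a `linalg.reduce` over dimension 0. A quick equivalence check of that specific permutation (a sketch in float32; the tests themselves use bf16):

```python
import torch

x = torch.rand(32, 256, 16)
# Permutation [1, 0, 2] moves the reduce axis (dim 1) to the front, giving a
# 256x32x16 tensor; summing it over dimension 0 matches the original
# dimension-1 sum and yields the expected 32x16 result.
assert torch.allclose(x.permute(1, 0, 2).sum(dim=0), x.sum(dim=1))
```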

test/Conversion/StructuredToMemref/reducesum_middle_dim.mlir

Lines changed: 3 additions & 1 deletion
```diff
@@ -53,9 +53,11 @@ module {
 // CHECK-DAG: [[RES_:%.+]] = memref.alloc() : memref<32x256x16xbf16>
 // CHECK: memref.copy [[VAR_reinterpret_cast_]], [[RES_]] : memref<32x256x16xbf16, strided<[256, 1, 1]>> to memref<32x256x16xbf16>
 // CHECK-DAG: [[VAR_0_:%.+]] = bufferization.to_tensor [[RES_]] restrict writable : memref<32x256x16xbf16>
+// CHECK: [[VAL_7:%.+]] = tensor.empty() : tensor<256x32x16xbf16>
+// CHECK: [[VAL_8:%.+]] = linalg.transpose ins([[VAR_0_]] : tensor<32x256x16xbf16>) outs([[VAL_7]] : tensor<256x32x16xbf16>) permutation = [1, 0, 2]
 // CHECK-DAG: [[VAR_1_:%.+]] = tensor.empty() : tensor<32x16xbf16>
 // CHECK: [[VAR_2_:%.+]] = linalg.fill ins([[CST_0_dot_000000_]] : bf16) outs([[VAR_1_]] : tensor<32x16xbf16>) -> tensor<32x16xbf16>
-// CHECK: [[VAR_reduced_:%.+]] = linalg.reduce ins([[VAR_0_]] : tensor<32x256x16xbf16>) outs([[VAR_2_]] : tensor<32x16xbf16>) dimensions = [1]
+// CHECK: [[VAR_reduced_:%.+]] = linalg.reduce ins([[VAL_8]] : tensor<256x32x16xbf16>) outs([[VAR_2_]] : tensor<32x16xbf16>) dimensions = [0]
 // CHECK: ([[in_:.+]]: bf16, [[init_:.+]]: bf16) {
 // CHECK: [[VAR_3_:%.+]] = arith.addf [[in_]], [[init_]] : bf16
 // CHECK: linalg.yield [[VAR_3_]] : bf16
```

test/Conversion/TritonArithToLinalg/reducesum_middle_dim.mlir

Lines changed: 3 additions & 1 deletion
```diff
@@ -132,9 +132,11 @@
 // CHECK: linalg.yield [[VAR_29_6_]] : !tt.ptr<bf16>
 // CHECK: } -> tensor<32x256x16x!tt.ptr<bf16>>
 // CHECK-DAG: [[LOAD_VAR_25_MEM_:%.+]] = tt.load [[VAR_25_]] : tensor<32x256x16x!tt.ptr<bf16>>
+// CHECK: [[VAL_72:%.+]] = tensor.empty() : tensor<256x32x16xbf16>
+// CHECK: [[VAL_73:%.+]] = linalg.transpose ins([[LOAD_VAR_25_MEM_]] : tensor<32x256x16xbf16>) outs([[VAL_72]] : tensor<256x32x16xbf16>) permutation = [1, 0, 2]
 // CHECK-DAG: [[VAR_27_:%.+]] = tensor.empty() : tensor<32x16xbf16>
 // CHECK: [[VAR_28_:%.+]] = linalg.fill ins([[CST_0_dot_000000_]] : bf16) outs([[VAR_27_]] : tensor<32x16xbf16>) -> tensor<32x16xbf16>
-// CHECK: [[VAR_reduced_:%.+]] = linalg.reduce ins([[LOAD_VAR_25_MEM_]] : tensor<32x256x16xbf16>) outs([[VAR_28_]] : tensor<32x16xbf16>) dimensions = [1]
+// CHECK: [[VAR_reduced_:%.+]] = linalg.reduce ins([[VAL_73]] : tensor<256x32x16xbf16>) outs([[VAR_28_]] : tensor<32x16xbf16>) dimensions = [0]
 // CHECK: ([[in_]]: bf16, [[in_]]it: bf16) {
 // CHECK: [[VAR_29_7_:%.+]] = arith.addf [[in_]], [[in_]]it : bf16
 // CHECK: linalg.yield [[VAR_29_7_]] : bf16
```

test/Conversion/TritonToLinalg/reducesum_middle_dim.mlir

Lines changed: 16 additions & 14 deletions
```diff
@@ -39,20 +39,22 @@ module {
   }
 }
 // CHECK-LABEL: func.func @kernel(
-// CHECK-SAME: %[[VAL_0:.*]]: memref<*xbf16>, %[[VAL_1:.*]]: memref<*xbf16>, %[[VAL_2:.*]]: memref<32x16xbf16>, %[[VAL_3:.*]]: i32, %[[VAL_4:.*]]: i32, %[[VAL_5:.*]]: i32) {
-// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 256 : index
-// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 0.000000e+00 : bf16
-// CHECK: %[[VAL_8:.*]] = memref.reinterpret_cast %[[VAL_0]] to offset: [0], sizes: [32, 256, 16], strides: {{\[}}%[[VAL_6]], 1, 1] : memref<*xbf16> to memref<32x256x16xbf16, strided<[?, 1, 1]>>
-// CHECK: %[[VAL_9:.*]] = memref.alloc() : memref<32x256x16xbf16>
-// CHECK: memref.copy %[[VAL_8]], %[[VAL_9]] : memref<32x256x16xbf16, strided<[?, 1, 1]>> to memref<32x256x16xbf16>
-// CHECK: %[[VAL_10:.*]] = bufferization.to_tensor %[[VAL_9]] restrict writable : memref<32x256x16xbf16>
-// CHECK: %[[VAL_11:.*]] = tensor.empty() : tensor<32x16xbf16>
-// CHECK: %[[VAL_12:.*]] = linalg.fill ins(%[[VAL_7]] : bf16) outs(%[[VAL_11]] : tensor<32x16xbf16>) -> tensor<32x16xbf16>
-// CHECK: %[[VAL_13:.*]] = linalg.reduce ins(%[[VAL_10]] : tensor<32x256x16xbf16>) outs(%[[VAL_12]] : tensor<32x16xbf16>) dimensions = [1]
-// CHECK: (%[[VAL_14:.*]]: bf16, %[[VAL_15:.*]]: bf16) {
-// CHECK: %[[VAL_16:.*]] = arith.addf %[[VAL_14]], %[[VAL_15]] : bf16
-// CHECK: linalg.yield %[[VAL_16]] : bf16
+// CHECK-SAME: %[[ARG0:.*]]: memref<*xbf16>, %[[ARG1:.*]]: memref<*xbf16>, %[[ARG2:.*]]: memref<32x16xbf16>, %[[ARG3:.*]]: i32, %[[ARG4:.*]]: i32, %[[ARG5:.*]]: i32, %[[ARG6:.*]]: i32, %[[ARG7:.*]]: i32, %[[ARG8:.*]]: i32) {
+// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 0.000000e+00 : bf16
+// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 256 : index
+// CHECK: %[[VAL_2:.*]] = memref.reinterpret_cast %[[ARG0]] to offset: [0], sizes: [32, 256, 16], strides: {{\[}}%[[VAL_1]], 1, 1] : memref<*xbf16> to memref<32x256x16xbf16, strided<[?, 1, 1]>>
+// CHECK: %[[VAL_3:.*]] = memref.alloc() : memref<32x256x16xbf16>
+// CHECK: memref.copy %[[VAL_2]], %[[VAL_3]] : memref<32x256x16xbf16, strided<[?, 1, 1]>> to memref<32x256x16xbf16>
+// CHECK: %[[VAL_4:.*]] = bufferization.to_tensor %[[VAL_3]] restrict writable : memref<32x256x16xbf16> to tensor<32x256x16xbf16>
+// CHECK: %[[VAL_5:.*]] = tensor.empty() : tensor<256x32x16xbf16>
+// CHECK: %[[VAL_6:.*]] = linalg.transpose ins(%[[VAL_4]] : tensor<32x256x16xbf16>) outs(%[[VAL_5]] : tensor<256x32x16xbf16>) permutation = [1, 0, 2]
+// CHECK: %[[VAL_7:.*]] = tensor.empty() : tensor<32x16xbf16>
+// CHECK: %[[VAL_8:.*]] = linalg.fill ins(%[[VAL_0]] : bf16) outs(%[[VAL_7]] : tensor<32x16xbf16>) -> tensor<32x16xbf16>
+// CHECK: %[[VAL_9:.*]] = linalg.reduce ins(%[[VAL_6]] : tensor<256x32x16xbf16>) outs(%[[VAL_8]] : tensor<32x16xbf16>) dimensions = [0]
+// CHECK: (%[[VAL_10:.*]]: bf16, %[[VAL_11:.*]]: bf16) {
+// CHECK: %[[VAL_12:.*]] = arith.addf %[[VAL_10]], %[[VAL_11]] : bf16
+// CHECK: linalg.yield %[[VAL_12]] : bf16
 // CHECK: }
-// CHECK: bufferization.materialize_in_destination %[[VAL_13]] in writable %[[VAL_2]]
+// CHECK: bufferization.materialize_in_destination %[[VAL_9]] in writable %[[ARG2]] : (tensor<32x16xbf16>, memref<32x16xbf16>) -> ()
 // CHECK: return
 // CHECK: }
```
