
Commit e7a375a

Implements unsplat support for single-element reduction. (#305)

This enables extracting an element from a single-value tensor using `.item()`.
1 parent 2483659 commit e7a375a
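
For orientation, the Python-level pattern this enables might look like the sketch below. This is a hypothetical kernel written to mirror the lit test added in this commit; it assumes a Triton frontend where `.item()` on a single-element tensor (referenced in the commit message) lowers to the rank-1, size-1 `tt.reduce` handled here:

```python
# Hedged sketch: hypothetical kernel matching the unsplat lit test below.
# Assumes .item() on a single-element tensor lowers to a rank-1, size-1
# tt.reduce whose body yields its first argument.
import triton
import triton.language as tl

@triton.jit
def unsplat_kernel(ptr):
    offs = tl.arange(0, 1)       # size-1 offset range
    x = tl.load(ptr + offs)      # tensor<1xi32>
    cond = x > 42                # tensor<1xi1>
    if cond.item():              # unsplat: tensor<1xi1> -> i1
        tl.store(ptr, 42)
```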

3 files changed: 78 additions, 0 deletions

include/triton-shared/Conversion/TritonArithToLinalg/ConversionPatterns.hpp

Lines changed: 26 additions & 0 deletions
```diff
@@ -1393,6 +1393,24 @@ struct ReduceConverter : public OpConversionPattern<triton::ReduceOp> {
     return success();
   }
 
+  LogicalResult
+  convertToTensorExtract(triton::ReduceOp op,
+                         typename triton::ReduceOp::Adaptor adaptor,
+                         ConversionPatternRewriter &rewriter) const {
+    assert(llvm::hasSingleElement(op.getSrcs()));
+
+    auto returnOp = cast<triton::ReduceReturnOp>(*op.getOps().begin());
+    assert(llvm::hasSingleElement(returnOp.getResult()));
+    assert(cast<BlockArgument>(returnOp.getResult().front()).getArgNumber() ==
+           0);
+
+    auto source = op.getSrcs().front();
+    auto zeroIdx =
+        rewriter.createOrFold<arith::ConstantIndexOp>(op.getLoc(), 0);
+    rewriter.replaceOpWithNewOp<tensor::ExtractOp>(op, source, zeroIdx);
+    return success();
+  }
+
 public:
   LogicalResult
   matchAndRewrite(triton::ReduceOp op,
@@ -1409,6 +1427,14 @@ struct ReduceConverter : public OpConversionPattern<triton::ReduceOp> {
                   "axis is within "
                   "operand's rank");
 
+    // Unsplat is implemented as a single-element, rank-1 reduction where
+    // the single element is yielded immediately. This can be simplified
+    // into a single-element extract.
+    if (llvm::hasSingleElement(op.getOps()) && sourceType.getRank() == 1 &&
+        sourceType.getShape()[0] == 1) {
+      return convertToTensorExtract(op, adaptor, rewriter);
+    }
+
     return convertToLinalgReduce(op, adaptor, rewriter);
   }
 };
```
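
The rewrite is sound because reducing a single element never runs the combine region, and the pattern additionally requires that the reduce body simply yields its first block argument (the `getArgNumber() == 0` assert). A plain-Python analogy, illustrative only and not part of the commit:

```python
# Illustrative analogy: reducing a one-element sequence never calls the
# combiner, so the result is just the first element, which is exactly
# what tensor.extract %src[0] produces after the rewrite.
from functools import reduce

def rank1_reduce(src, combine):
    # models tt.reduce with axis = 0 over a rank-1 tensor
    return reduce(combine, src)

src = [7]                        # models tensor<1xi32>
combine = lambda a, b: a         # body yields its first argument
assert rank1_reduce(src, combine) == src[0]  # == extract at index 0
```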

python/examples/conftest.py

Lines changed: 1 addition & 0 deletions
```diff
@@ -73,6 +73,7 @@ def with_allocator():
     "test_addptr",
     "test_transpose",
     "test_trans_4d",
+    "test_unsplat",
     "test_arange",
 }
```

New file (lit test)

Lines changed: 51 additions & 0 deletions
```mlir
// RUN: triton-shared-opt --triton-to-linalg-experimental %s | FileCheck %s
module {
  tt.func public @unsplat_kernel(%arg0: !tt.ptr<i32> {maia.rank = 1 : i32, tt.divisibility = 16 : i32}) attributes {noinline = false} {
    %cst = arith.constant dense<42> : tensor<1xi32>
    %c42_i32 = arith.constant 42 : i32
    %0 = tt.splat %arg0 : !tt.ptr<i32> -> tensor<1x!tt.ptr<i32>>
    %1 = tt.load %0 : tensor<1x!tt.ptr<i32>>
    %2 = arith.cmpi sgt, %1, %cst : tensor<1xi32>
    %3 = "tt.reduce"(%2) <{axis = 0 : i32}> ({
    ^bb0(%arg1: i1, %arg2: i1):
      tt.reduce.return %arg1 : i1
    }) : (tensor<1xi1>) -> i1
    scf.if %3 {
      tt.store %arg0, %c42_i32 : !tt.ptr<i32>
    }
    tt.return
  }
}

// CHECK-DAG:   [[MAP_0_:#.+]] = affine_map<(d0) -> (d0)>
// CHECK-LABEL: func.func @unsplat_kernel
// CHECK-SAME:  ([[PARAM_0_:%.+]]: memref<*xi32> {maia.rank = 1 : i32, tt.divisibility = 16 : i32}, [[PARAM_1_:%.+]]: i32, [[PARAM_2_:%.+]]: i32, [[PARAM_3_:%.+]]: i32, [[PARAM_4_:%.+]]: i32, [[PARAM_5_:%.+]]: i32, [[PARAM_6_:%.+]]: i32) {
// CHECK-DAG:   [[CST_42_:%.+]] = arith.constant 42 : i32
// CHECK-DAG:   [[CST_0_:%.+]] = arith.constant 0 : i32
// CHECK-DAG:   [[CST_0_1_:%.+]] = arith.constant 0 : index
// CHECK-DAG:   [[VAR_0_:%.+]] = tensor.empty() : tensor<1xi32>
// CHECK-NOT:   separator of consecutive DAGs
// CHECK-DAG:   [[VAR_1_:%.+]] = linalg.fill ins([[CST_42_]] : i32) outs([[VAR_0_]] : tensor<1xi32>) -> tensor<1xi32>
// CHECK-DAG:   [[VAR_2_:%.+]] = linalg.fill ins([[CST_0_]] : i32) outs([[VAR_0_]] : tensor<1xi32>) -> tensor<1xi32>
// CHECK-DAG:   [[VAR_cast_:%.+]] = memref.cast [[PARAM_0_]] : memref<*xi32> to memref<?xi32>
// CHECK-NOT:   separator of consecutive DAGs
// CHECK-DAG:   [[VAR_3_:%.+]] = bufferization.to_tensor [[VAR_cast_]] restrict : memref<?xi32> to tensor<?xi32>
// CHECK-DAG:   [[VAR_4_:%.+]] = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]} ins([[VAR_2_]] : tensor<1xi32>) outs([[VAR_0_]] : tensor<1xi32>) {
// CHECK:       ^bb0([[IN_0_:%.+]]: i32, [[IN_1_:%.+]]: i32):
// CHECK:         [[VAR_7_:%.+]] = arith.index_cast [[IN_0_]] : i32 to index
// CHECK:         [[VAR_extracted_0_:%.+]] = tensor.extract [[VAR_3_]]{{.}}[[VAR_7_]]{{.}} : tensor<?xi32>
// CHECK:         linalg.yield [[VAR_extracted_0_]] : i32
// CHECK:       } -> tensor<1xi32>
// CHECK:       [[VAR_5_:%.+]] = tensor.empty() : tensor<1xi1>
// CHECK:       [[VAR_6_:%.+]] = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel"]} ins([[VAR_4_]], [[VAR_1_]] : tensor<1xi32>, tensor<1xi32>) outs([[VAR_5_]] : tensor<1xi1>) {
// CHECK:       ^bb0([[IN_2_:%.+]]: i32, [[IN_3_:%.+]]: i32, [[IN_4_:%.+]]: i1):
// CHECK:         [[VAR_7_1_:%.+]] = arith.cmpi sgt, [[IN_2_]], [[IN_3_]] : i32
// CHECK:         linalg.yield [[VAR_7_1_]] : i1
// CHECK:       } -> tensor<1xi1>
// CHECK:       [[VAR_extracted_:%.+]] = tensor.extract [[VAR_6_]]{{.}}[[CST_0_1_]]{{.}} : tensor<1xi1>
// CHECK:       scf.if [[VAR_extracted_]] {
// CHECK:         [[VAR_reinterpret_cast_:%.+]] = memref.reinterpret_cast [[PARAM_0_]] to offset: {{.}}[[CST_0_1_]]{{.}}, sizes: [1], strides: [1] : memref<*xi32> to memref<1xi32, strided<[1], offset: ?>>
// CHECK:         affine.store [[CST_42_]], [[VAR_reinterpret_cast_]][0] : memref<1xi32, strided<[1], offset: ?>>
// CHECK:       }
// CHECK:       return
// CHECK:     }
```
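
The RUN line above gives the exact pipeline. As a quick sanity check outside of lit, one could drive it from Python; this is a sketch, assuming `triton-shared-opt` is on `PATH` and the test file is saved locally as `unsplat.mlir` (the actual file path is not shown in this commit view):

```python
# Sketch: drive the RUN line above from Python and check the outcome.
import subprocess

out = subprocess.run(
    ["triton-shared-opt", "--triton-to-linalg-experimental", "unsplat.mlir"],
    capture_output=True, text=True, check=True,
).stdout

# After this commit, the single-element tt.reduce should lower to a
# tensor.extract (per the CHECK lines above) rather than a linalg reduction.
assert "tensor.extract" in out
assert "linalg.reduce" not in out
```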
