
Commit cb466ef

MakeRangeOp needs to consider the start of the range (#304)

The conversion for MakeRangeOp needs to consider the start of the range. It currently assumes all ranges are constructed from 0 to the size.

Authored-by: Daniel Donenfeld <[email protected]>
1 parent a0fa823 commit cb466ef
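
For context, a hedged sketch of the Triton pattern this commit fixes: a kernel whose tl.arange starts at a nonzero value, which lowers to tt.make_range with a nonzero start attribute. The kernel name, launch grid, and buffer setup below are illustrative, not taken from the commit.

```python
import torch
import triton
import triton.language as tl


@triton.jit
def arange_kernel(out_ptr):
    # tl.arange(512, 768) lowers to tt.make_range {start = 512, end = 768}.
    # Before this fix, the linalg lowering ignored `start` and yielded 0..255.
    offs = tl.arange(512, 768)
    tl.store(out_ptr + (offs - 512), offs)


# Hypothetical launch: out should end up holding 512, 513, ..., 767.
out = torch.empty(256, dtype=torch.int32, device="cuda")
arange_kernel[(1,)](out)
```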

6 files changed (+25 −6 lines)
include/triton-shared/Conversion/TritonArithToLinalg/ConversionPatterns.hpp

Lines changed: 6 additions & 0 deletions
@@ -817,6 +817,12 @@ struct MakeRangeConverter : public OpConversionPattern<triton::MakeRangeOp> {
           Value index = nestedBuilder.create<linalg::IndexOp>(loc, 0);
           Value res = nestedBuilder.create<arith::IndexCastOp>(
               loc, type.getElementType(), index);
+          if (op.getStart()) {
+            auto start = rewriter.create<mlir::arith::ConstantIntOp>(
+                op.getLoc(), op.getStart(),
+                type.getElementType().getIntOrFloatBitWidth());
+            res = nestedBuilder.create<arith::AddIOp>(loc, res, start);
+          }
           nestedBuilder.create<linalg::YieldOp>(loc, res);
         });

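In short, the lowered linalg.generic body used to yield only the index_cast of linalg.index; the new guard materializes the range's start as a constant and adds it. A small Python model of the before/after semantics (the helper is illustrative; the values mirror the updated tests below):

```python
def make_range(start: int, size: int, fixed: bool = True) -> list[int]:
    """Model the lowered body: linalg.index -> index_cast (-> addi start)."""
    if fixed:
        return [i + start for i in range(size)]  # new: start is added in
    return list(range(size))                     # old: start was ignored


assert make_range(512, 256)[:3] == [512, 513, 514]         # tt.make_range {512, 768}
assert make_range(512, 256, fixed=False)[:3] == [0, 1, 2]  # pre-fix behavior
```
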
python/examples/conftest.py

Lines changed: 1 addition & 0 deletions
@@ -73,6 +73,7 @@ def with_allocator():
         "test_addptr",
         "test_transpose",
         "test_trans_4d",
+        "test_arange",
     }

test/Conversion/StructuredToMemref/use_end_chain.mlir

Lines changed: 6 additions & 2 deletions
@@ -41,6 +41,8 @@ module {
 // CHECK-LABEL: func.func @kernel
 // CHECK-SAME: ([[PARAM_0_:%.+]]: memref<*xbf16>, [[PARAM_1_:%.+]]: memref<*xbf16>, [[PARAM_2_:%.+]]: i32, [[PARAM_3_:%.+]]: i32, [[PARAM_4_:%.+]]: i32, [[PARAM_5_:%.+]]: i32, [[PARAM_6_:%.+]]: i32, [[PARAM_7_:%.+]]: i32) {
 // CHECK-DAG: [[CST_6_:%.+]] = arith.constant 6 : i32
+// CHECK-DAG: [[CST_512_:%.+]] = arith.constant 512 : i32
+// CHECK-DAG: [[CST_1024_:%.+]] = arith.constant 1024 : i32
 // CHECK-DAG: [[VAR_0_:%.+]] = tensor.empty() : tensor<256x128xi32>
 // CHECK-NOT: separator of consecutive DAGs
 // CHECK-DAG: [[VAR_1_:%.+]] = linalg.fill ins([[CST_6_]] : i32) outs([[VAR_0_]] : tensor<256x128xi32>) -> tensor<256x128xi32>
@@ -49,7 +51,8 @@ module {
 // CHECK: ^bb0([[IN_0_:%.+]]: i32):
 // CHECK: [[VAR_13_:%.+]] = linalg.index 0 : index
 // CHECK: [[VAR_14_:%.+]] = arith.index_cast [[VAR_13_]] : index to i32
-// CHECK: linalg.yield [[VAR_14_]] : i32
+// CHECK: [[VAL_24:%.+]] = arith.addi [[VAR_14_]], [[CST_512_]] : i32
+// CHECK: linalg.yield [[VAL_24]] : i32
 // CHECK: } -> tensor<256xi32>
 // CHECK: [[VAR_expanded_:%.+]] = tensor.expand_shape [[VAR_3_]] {{.}}[0, 1]{{.}} output_shape [256, 1] : tensor<256xi32> into tensor<256x1xi32>
 // CHECK: [[VAR_4_:%.+]] = linalg.generic {indexing_maps = [#map1, #map2], iterator_types = ["parallel", "parallel"]} ins([[VAR_expanded_]] : tensor<256x1xi32>) outs([[VAR_0_]] : tensor<256x128xi32>) attrs = {broadcastDims = array<i64: 1>} {
@@ -61,7 +64,8 @@ module {
 // CHECK: ^bb0([[IN_3_:%.+]]: i32):
 // CHECK: [[VAR_13_1_:%.+]] = linalg.index 0 : index
 // CHECK: [[VAR_14_1_:%.+]] = arith.index_cast [[VAR_13_1_]] : index to i32
-// CHECK: linalg.yield [[VAR_14_1_]] : i32
+// CHECK: [[VAL_25:%.+]] = arith.addi [[VAR_14_1_]], [[CST_1024_]] : i32
+// CHECK: linalg.yield [[VAL_25]] : i32
 // CHECK: } -> tensor<128xi32>
 // CHECK: [[VAR_expanded_0_:%.+]] = tensor.expand_shape [[VAR_6_]] {{.}}[0, 1]{{.}} output_shape [1, 128] : tensor<128xi32> into tensor<1x128xi32>
 // CHECK: [[VAR_7_:%.+]] = linalg.generic {indexing_maps = [#map3, #map2], iterator_types = ["parallel", "parallel"]} ins([[VAR_expanded_0_]] : tensor<1x128xi32>) outs([[VAR_0_]] : tensor<256x128xi32>) attrs = {broadcastDims = array<i64: 0>} {

test/Conversion/StructuredToMemref/use_mid_chain.mlir

Lines changed: 3 additions & 1 deletion
@@ -41,12 +41,14 @@ module {
 // CHECK-DAG: [[MAP_2_:#.+]] = affine_map<(d0, d1) -> (d0, d1)>
 // CHECK-LABEL: func.func @kernel
 // CHECK-SAME: ([[PARAM_0_:%.+]]: memref<*xbf16>, [[PARAM_1_:%.+]]: memref<*xbf16>, [[PARAM_2_:%.+]]: memref<*xi32>, [[PARAM_3_:%.+]]: i32, [[PARAM_4_:%.+]]: i32, [[PARAM_5_:%.+]]: i32, [[PARAM_6_:%.+]]: i32, [[PARAM_7_:%.+]]: i32, [[PARAM_8_:%.+]]: i32) {
+// CHECK-DAG: [[VAL_25:%.+]] = arith.constant 512 : i32
 // CHECK-DAG: [[VAR_0_:%.+]] = tensor.empty() : tensor<256xi32>
 // CHECK: [[VAR_1_:%.+]] = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs([[VAR_0_]] : tensor<256xi32>) {
 // CHECK: ^bb0([[IN_0_:%.+]]: i32):
 // CHECK: [[VAR_5_:%.+]] = linalg.index 0 : index
 // CHECK: [[VAR_6_:%.+]] = arith.index_cast [[VAR_5_]] : index to i32
-// CHECK: linalg.yield [[VAR_6_]] : i32
+// CHECK: [[VAL_24:%.+]] = arith.addi [[VAR_6_]], [[VAL_25]] : i32
+// CHECK: linalg.yield [[VAL_24]] : i32
 // CHECK: } -> tensor<256xi32>
 // CHECK-DAG: [[VAR_expanded_:%.+]] = tensor.expand_shape [[VAR_1_]] {{.}}[0, 1]{{.}} output_shape [256, 1] : tensor<256xi32> into tensor<256x1xi32>
 // CHECK-DAG: [[VAR_2_:%.+]] = tensor.empty() : tensor<256x128xi32>

test/Conversion/TritonToLinalg/use_end_chain.mlir

Lines changed: 6 additions & 2 deletions
@@ -37,14 +37,17 @@ module {
 // CHECK-SAME: %[[VAL_0:.*]]: memref<*xbf16>, %[[VAL_1:.*]]: memref<*xbf16>, %[[VAL_2:.*]]: i32, %[[VAL_3:.*]]: i32, %[[VAL_4:.*]]: i32) {
 // CHECK-DAG: %[[VAL_6:.*]] = arith.constant 6 : index
 // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 6 : i32
+// CHECK-DAG: %[[CST_512_:.*]] = arith.constant 512 : i32
+// CHECK-DAG: %[[CST_1024_:.*]] = arith.constant 1024 : i32
 // CHECK: %[[VAL_30:.*]] = tensor.empty() : tensor<256x128xi32>
 // CHECK: %[[VAL_31:.*]] = linalg.fill ins(%[[VAL_7]] : i32) outs(%[[VAL_30]] : tensor<256x128xi32>) -> tensor<256x128xi32>
 // CHECK: %[[VAL_8:.*]] = tensor.empty() : tensor<256xi32>
 // CHECK: %[[VAL_9:.*]] = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%[[VAL_8]] : tensor<256xi32>) {
 // CHECK: ^bb0(%[[VAL_10:.*]]: i32):
 // CHECK: %[[VAL_11:.*]] = linalg.index 0 : index
 // CHECK: %[[VAL_12:.*]] = arith.index_cast %[[VAL_11]] : index to i32
-// CHECK: linalg.yield %[[VAL_12]] : i32
+// CHECK: %[[VAL_55:.*]] = arith.addi %[[VAL_12]], %[[CST_512_]] : i32
+// CHECK: linalg.yield %[[VAL_55]] : i32
 // CHECK: } -> tensor<256xi32>
 // CHECK: %[[VAL_13:.*]] = tensor.expand_shape %[[VAL_14:.*]] {{\[\[}}0, 1]] output_shape [256, 1] : tensor<256xi32> into tensor<256x1xi32>
 // CHECK: %[[VAL_15:.*]] = tensor.empty() : tensor<256x128xi32>
@@ -57,7 +60,8 @@ module {
 // CHECK: ^bb0(%[[VAL_21:.*]]: i32):
 // CHECK: %[[VAL_22:.*]] = linalg.index 0 : index
 // CHECK: %[[VAL_23:.*]] = arith.index_cast %[[VAL_22]] : index to i32
-// CHECK: linalg.yield %[[VAL_23]] : i32
+// CHECK: %[[VAL_56:.*]] = arith.addi %[[VAL_23]], %[[CST_1024_]] : i32
+// CHECK: linalg.yield %[[VAL_56]] : i32
 // CHECK: } -> tensor<128xi32>
 // CHECK: %[[VAL_24:.*]] = tensor.expand_shape %[[VAL_25:.*]] {{\[\[}}0, 1]] output_shape [1, 128] : tensor<128xi32> into tensor<1x128xi32>
 // CHECK: %[[VAL_26:.*]] = tensor.empty() : tensor<256x128xi32>

test/Conversion/TritonToLinalg/use_mid_chain.mlir

Lines changed: 3 additions & 1 deletion
@@ -38,12 +38,14 @@ module {
 // CHECK-LABEL: func.func @kernel(
 // CHECK-SAME: %[[VAL_0:.*]]: memref<*xbf16>, %[[VAL_1:.*]]: memref<*xbf16>, %[[VAL_2:.*]]: memref<*xi32>, %[[VAL_3:.*]]: i32, %[[VAL_4:.*]]: i32, %[[VAL_5:.*]]: i32) {
 // CHECK-DAG: %[[VAL_7:.*]] = arith.constant 6 : index
+// CHECK-DAG: %[[VAL_25:.*]] = arith.constant 512 : i32
 // CHECK: %[[VAL_8:.*]] = tensor.empty() : tensor<256xi32>
 // CHECK: %[[VAL_9:.*]] = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%[[VAL_8]] : tensor<256xi32>) {
 // CHECK: ^bb0(%[[VAL_10:.*]]: i32):
 // CHECK: %[[VAL_11:.*]] = linalg.index 0 : index
 // CHECK: %[[VAL_12:.*]] = arith.index_cast %[[VAL_11]] : index to i32
-// CHECK: linalg.yield %[[VAL_12]] : i32
+// CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_12]], %[[VAL_25]] : i32
+// CHECK: linalg.yield %[[VAL_24]] : i32
 // CHECK: } -> tensor<256xi32>
 // CHECK: %[[VAL_13:.*]] = tensor.expand_shape %[[VAL_14:.*]] {{\[\[}}0, 1]] output_shape [256, 1] : tensor<256xi32> into tensor<256x1xi32>
 // CHECK: %[[VAL_15:.*]] = tensor.empty() : tensor<256x128xi32>
