// RUN: triton-opt %s -split-input-file -triton-intel-fuse-reshape | FileCheck %s

-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32} {
-  // COM: tt.load -> tt.reshape -> tt.dot chain, not in a loop.
-  tt.func public @fuseLoadWithReshape1(%arg0: !tt.ptr<tensor<256x32xbf16>>, %arg1: !tt.ptr<bf16>) {
-    %c0_i32 = arith.constant 0 : i32
-    %c1_i32 = arith.constant 1 : i32
-    %c2_i32 = arith.constant 2 : i32
-    %c1_i64 = arith.constant 1 : i64
-    %c2_i64 = arith.constant 2 : i64
-    %c3_i64 = arith.constant 3 : i64
-    %c1024_i64 = arith.constant 1024 : i64
-    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32>
-    %0 = tt.make_tensor_ptr %arg1, [%c2_i64, %c1_i64, %c1024_i64], [%c3_i64, %c1024_i64, %c1_i64], [%c2_i32, %c1_i32, %c0_i32] {order = array<i32: 2, 1, 0>} : <tensor<1x32x256xbf16>>
-    %1 = tt.load %arg0 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xbf16>>
-    %3 = tt.load %0 {boundaryCheck = array<i32: 1, 2>} : !tt.ptr<tensor<1x32x256xbf16>>
-    %4 = tt.reshape %3 : tensor<1x32x256xbf16> -> tensor<32x256xbf16>
-    %5 = tt.dot %1, %4, %cst, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
-    tt.return
+// COM: tt.load -> tt.reshape -> tt.dot chain, not in a loop.
+tt.func public @fuseLoadWithReshape1(%arg0: !tt.ptr<tensor<256x32xbf16>>, %arg1: !tt.ptr<bf16>) {
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c2_i32 = arith.constant 2 : i32
+  %c1_i64 = arith.constant 1 : i64
+  %c2_i64 = arith.constant 2 : i64
+  %c3_i64 = arith.constant 3 : i64
+  %c1024_i64 = arith.constant 1024 : i64
+  %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32>
+  %0 = tt.make_tensor_ptr %arg1, [%c2_i64, %c1_i64, %c1024_i64], [%c3_i64, %c1024_i64, %c1_i64], [%c2_i32, %c1_i32, %c0_i32] {order = array<i32: 2, 1, 0>} : <tensor<1x32x256xbf16>>
+  %1 = tt.load %arg0 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xbf16>>
+  %3 = tt.load %0 {boundaryCheck = array<i32: 1, 2>} : !tt.ptr<tensor<1x32x256xbf16>>
+  %4 = tt.reshape %3 : tensor<1x32x256xbf16> -> tensor<32x256xbf16>
+  %5 = tt.dot %1, %4, %cst, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
+  tt.return
+}
+// CHECK-LABEL: fuseLoadWithReshape1
+// CHECK-NOT: tt.reshape
+// CHECK: [[MUL1:%.*]] = arith.muli %c3_i64, %c2_i64 : i64
+// CHECK: [[ADD1:%.*]] = arith.addi [[MUL1]], %c1024_i64 : i64
+// CHECK: [[TRUNC:%.*]] = arith.trunci %c3_i64 : i64 to i32
+// CHECK: [[MUL2:%.*]] = arith.muli [[TRUNC]], %c2_i32 : i32
+// CHECK: [[ADD2:%.*]] = arith.addi [[MUL2]], %c0_i32 : i32
+// CHECK: [[PTR:%.*]] = tt.make_tensor_ptr %arg1, [%c1_i64, [[ADD1]]], [%c1024_i64, %c1_i64], [%c1_i32, [[ADD2]]] {order = array<i32: 1, 0>} : <tensor<32x256xbf16>>
+// CHECK: [[LOAD_B:%.*]] = tt.load [[PTR]] {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xbf16>>
+// CHECK: tt.dot {{.*}}, [[LOAD_B]], {{.*}}, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
+
+// -----
+
+// COM: tt.load -> tt.reshape -> tt.dot chain, in a loop.
+// COM: where the 'make_tensor_ptr' result is not loop carried.
+tt.func public @fuseLoadWithReshape2(%arg0: !tt.ptr<tensor<32x256xbf16>>, %arg1: !tt.ptr<bf16>) {
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c1_i64 = arith.constant 1 : i64
+  %c32_i32 = arith.constant 32 : i32
+  %c1024_i32 = arith.constant 1024 : i32
+  %c512_i64 = arith.constant 512 : i64
+  %c1024_i64 = arith.constant 1024 : i64
+  %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32>
+  %0 = tt.make_tensor_ptr %arg1, [%c512_i64, %c1024_i64, %c1_i64], [%c512_i64, %c1_i64, %c1024_i64], [%c1_i32, %c32_i32, %c0_i32] {order = array<i32: 2, 0, 1>} : <tensor<1x256x32xbf16>>
+  %res:2 = scf.for %arg3 = %c0_i32 to %c1024_i32 step %c32_i32 iter_args(%arg4 = %cst, %arg5 = %c0_i32) -> (tensor<256x256xf32>, i32) : i32 {
+    %1 = tt.load %arg0 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xbf16>>
+    %3 = tt.load %0 {boundaryCheck = array<i32: 2, 1>} : !tt.ptr<tensor<1x256x32xbf16>>
+    %2 = tt.reshape %3 : tensor<1x256x32xbf16> -> tensor<256x32xbf16>
+    %4 = tt.dot %2, %1, %arg4, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
+    %5 = arith.addi %arg5, %c32_i32 : i32
+    scf.yield %4, %5 : tensor<256x256xf32>, i32
  }
-  // CHECK-LABEL: fuseLoadWithReshape1
-  // CHECK-NOT: tt.reshape
-  // CHECK: [[TRUNC:%.*]] = arith.trunci %c3_i64 : i64 to i32
-  // CHECK: [[MUL:%.*]] = arith.muli [[TRUNC]], %c2_i32 : i32
-  // CHECK: [[ADD:%.*]] = arith.addi [[MUL]], %c0_i32 : i32
-  // CHECK: [[PTR:%.*]] = tt.make_tensor_ptr %arg1, [%c1_i64, %c1024_i64], [%c1024_i64, %c1_i64], [%c1_i32, [[ADD]]] {order = array<i32: 1, 0>} : <tensor<32x256xbf16>>
-  // CHECK: [[LOAD_B:%.*]] = tt.load [[PTR]] {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xbf16>>
-  // CHECK: tt.dot {{.*}}, [[LOAD_B]], {{.*}}, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
+  tt.return
}
+// CHECK-LABEL: fuseLoadWithReshape2
+// CHECK-NOT: tt.reshape
+// CHECK: [[MUL1:%.*]] = arith.muli %c512_i64, %c512_i64 : i64
+// CHECK: [[ADD1:%.*]] = arith.addi [[MUL1]], %c1024_i64 : i64
+// CHECK: [[TRUNC:%.*]] = arith.trunci %c512_i64 : i64 to i32
+// CHECK: [[MUL2:%.*]] = arith.muli [[TRUNC]], %c1_i32 : i32
+// CHECK: [[ADD2:%.*]] = arith.addi [[MUL2]], %c32_i32 : i32
+// CHECK: [[PTR:%.*]] = tt.make_tensor_ptr %arg1, [[[ADD1]], %c1_i64], [%c1_i64, %c1024_i64], [[[ADD2]], %c0_i32] {order = array<i32: 0, 1>} : <tensor<256x32xbf16>>
+// CHECK: scf.for
+// CHECK: [[LOAD_A:%.*]] = tt.load [[PTR]] {boundaryCheck = array<i32: 1, 0>} : !tt.ptr<tensor<256x32xbf16>>
+// CHECK: tt.dot [[LOAD_A]], {{.*}}, {{.*}}, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>

// -----

-module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 32 : i32, "ttg.threads-per-warp" = 16 : i32} {
-  // COM: tt.load -> tt.reshape -> tt.dot chain, in a loop.
-  // COM: where the 'make_tensor_ptr' result is not loop carried.
-  tt.func public @fuseLoadWithReshape2(%arg0: !tt.ptr<tensor<32x256xbf16>>, %arg1: !tt.ptr<bf16>) {
-    %c0_i32 = arith.constant 0 : i32
-    %c1_i32 = arith.constant 1 : i32
-    %c1_i64 = arith.constant 1 : i64
-    %c32_i32 = arith.constant 32 : i32
-    %c1024_i32 = arith.constant 1024 : i32
-    %c512_i64 = arith.constant 512 : i64
-    %c1024_i64 = arith.constant 1024 : i64
-    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32>
-    %0 = tt.make_tensor_ptr %arg1, [%c512_i64, %c1024_i64, %c1_i64], [%c512_i64, %c1_i64, %c1024_i64], [%c1_i32, %c32_i32, %c0_i32] {order = array<i32: 2, 0, 1>} : <tensor<1x256x32xbf16>>
-    %res:2 = scf.for %arg3 = %c0_i32 to %c1024_i32 step %c32_i32 iter_args(%arg4 = %cst, %arg5 = %c0_i32) -> (tensor<256x256xf32>, i32) : i32 {
-      %1 = tt.load %arg0 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x256xbf16>>
-      %3 = tt.load %0 {boundaryCheck = array<i32: 2, 1>} : !tt.ptr<tensor<1x256x32xbf16>>
-      %2 = tt.reshape %3 : tensor<1x256x32xbf16> -> tensor<256x32xbf16>
-      %4 = tt.dot %2, %1, %arg4, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
-      %5 = arith.addi %arg5, %c32_i32 : i32
-      scf.yield %4, %5 : tensor<256x256xf32>, i32
-    }
-    tt.return
+// COM: tt.load -> tt.reshape -> tt.dot chain, in a loop.
+// COM: where the 'make_tensor_ptr' result is loop carried.
+tt.func public @test_matmul(%a_ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %b_ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %c_ptr: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %M: i32 {tt.divisibility = 16 : i32}, %N: i32 {tt.divisibility = 16 : i32}, %K: i32 {tt.divisibility = 16 : i32}, %stride_am: i32 {tt.divisibility = 16 : i32}, %stride_bk: i32 {tt.divisibility = 16 : i32}, %stride_cm: i32 {tt.divisibility = 16 : i32}) {
+  %c127_i32 = arith.constant 127 : i32
+  %c255_i32 = arith.constant 255 : i32
+  %cst = arith.constant dense<0.000000e+00> : tensor<256x128xf32>
+  %c32_i32 = arith.constant 32 : i32
+  %c128_i32 = arith.constant 128 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i64 = arith.constant 1 : i64
+  %c256_i32 = arith.constant 256 : i32
+  %c4_i32 = arith.constant 4 : i32
+  %0 = tt.get_program_id x : i32
+  %1 = arith.addi %M, %c255_i32 : i32
+  %2 = arith.divsi %1, %c256_i32 : i32
+  %3 = arith.addi %N, %c127_i32 : i32
+  %4 = arith.divsi %3, %c128_i32 : i32
+  %5 = arith.muli %4, %c4_i32 : i32
+  %6 = arith.divsi %0, %5 : i32
+  %7 = arith.muli %6, %c4_i32 : i32
+  %8 = arith.subi %2, %7 : i32
+  %9 = arith.minsi %8, %c4_i32 : i32
+  %10 = arith.remsi %0, %5 : i32
+  %11 = arith.remsi %10, %9 : i32
+  %12 = arith.addi %7, %11 : i32
+  %13 = arith.divsi %10, %9 : i32
+  %14 = arith.muli %12, %c256_i32 : i32
+  %15 = arith.extsi %M : i32 to i64
+  %16 = arith.extsi %K : i32 to i64
+  %17 = arith.extsi %stride_am : i32 to i64
+  %18 = tt.make_tensor_ptr %a_ptr, [%c1_i64, %15, %16], [%c1_i64, %17, %c1_i64], [%c0_i32, %14, %c0_i32] {order = array<i32: 2, 1, 0>} : <tensor<1x256x32xf32>>
+  %19 = arith.muli %13, %c128_i32 : i32
+  %20 = arith.extsi %N : i32 to i64
+  %21 = arith.extsi %stride_bk : i32 to i64
+  %22 = tt.make_tensor_ptr %b_ptr, [%16, %20], [%21, %c1_i64], [%c0_i32, %19] {order = array<i32: 1, 0>} : <tensor<32x128xf32>>
+  %accumulator:3 = scf.for %k = %c0_i32 to %K step %c32_i32 iter_args(%a_block_ptr = %18, %b_block_ptr = %22, %accumulator_0 = %cst) -> (!tt.ptr<tensor<1x256x32xf32>>, !tt.ptr<tensor<32x128xf32>>, tensor<256x128xf32>) : i32 {
+    %25 = tt.load %a_block_ptr {boundaryCheck = array<i32: 1, 2>} : !tt.ptr<tensor<1x256x32xf32>>
+    %26 = tt.reshape %25 : tensor<1x256x32xf32> -> tensor<256x32xf32>
+    %27 = tt.load %b_block_ptr {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<32x128xf32>>
+    %28 = tt.dot %26, %27, %cst, inputPrecision = tf32 : tensor<256x32xf32> * tensor<32x128xf32> -> tensor<256x128xf32>
+    %29 = arith.addf %accumulator_0, %28 : tensor<256x128xf32>
+    %30 = tt.advance %a_block_ptr, [%c0_i32, %c0_i32, %c32_i32] : <tensor<1x256x32xf32>>
+    %31 = tt.advance %b_block_ptr, [%c32_i32, %c0_i32] : <tensor<32x128xf32>>
+    scf.yield %30, %31, %29 : !tt.ptr<tensor<1x256x32xf32>>, !tt.ptr<tensor<32x128xf32>>, tensor<256x128xf32>
  }
-  // CHECK-LABEL: fuseLoadWithReshape2
-  // CHECK-NOT: tt.reshape
-  // CHECK: [[TRUNC:%.*]] = arith.trunci %c512_i64 : i64 to i32
-  // CHECK: [[MUL:%.*]] = arith.muli [[TRUNC]], %c1_i32 : i32
-  // CHECK: [[ADD:%.*]] = arith.addi [[MUL]], %c0_i32 : i32
-  // CHECK: [[PTR:%.*]] = tt.make_tensor_ptr %arg1, [%c1024_i64, %c1_i64], [%c1_i64, %c1024_i64], [%c32_i32, [[ADD]]] {order = array<i32: 0, 1>} : <tensor<256x32xbf16>>
-  // CHECK: scf.for
-  // CHECK: [[LOAD_A:%.*]] = tt.load [[PTR]] {boundaryCheck = array<i32: 1, 0>} : !tt.ptr<tensor<256x32xbf16>>
-  // CHECK: tt.dot [[LOAD_A]], {{.*}}, {{.*}}, inputPrecision = tf32 : tensor<256x32xbf16> * tensor<32x256xbf16> -> tensor<256x256xf32>
+  %23 = arith.extsi %stride_cm : i32 to i64
+  %24 = tt.make_tensor_ptr %c_ptr, [%15, %20], [%23, %c1_i64], [%14, %19] {order = array<i32: 1, 0>} : <tensor<256x128xf32>>
+  tt.store %24, %accumulator#2 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x128xf32>>
+  tt.return
}
+// CHECK-LABEL: test_matmul
+// CHECK-NOT: tt.reshape
+// CHECK: [[MUL1:%.*]] = arith.muli %c1_i64, %c1_i64 : i64
+// CHECK: [[ADD1:%.*]] = arith.addi [[MUL1]], %16 : i64
+// CHECK: [[TRUNC:%.*]] = arith.trunci %c1_i64 : i64 to i32
+// CHECK: [[MUL2:%.*]] = arith.muli [[TRUNC]], %c0_i32 : i32
+// CHECK: [[ADD2:%.*]] = arith.addi [[MUL2]], %c0_i32 : i32
+// CHECK: [[PTR:%.*]] = tt.make_tensor_ptr %arg0, [%15, [[ADD1]]], [%17, %c1_i64], [%14, [[ADD2]]] {order = array<i32: 1, 0>} : <tensor<256x32xf32>>
+// CHECK: scf.for {{.*}} = %c0_i32 to {{.*}} step %c32_i32 iter_args([[ARG:%.*]] = [[PTR]]
+// CHECK: [[LOAD_A:%.*]] = tt.load [[ARG]] {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x32xf32>>
+// CHECK: tt.dot [[LOAD_A]], {{.*}}, {{.*}}, inputPrecision = tf32 : tensor<256x32xf32> * tensor<32x128xf32> -> tensor<256x128xf32>
+// CHECK: tt.advance [[ARG]], [%c0_i32, %c32_i32] : <tensor<256x32xf32>>