@@ -46,21 +46,23 @@ def TritonIntelRemoveMasks
4646
4747def TritonIntelFuseReshape
4848 : Pass<"triton-intel-fuse-reshape", "mlir::ModuleOp"> {
49- let summary = "Fuse a tt.reshape operation with a tt.load operation";
49+ let summary = "Fuse a tt.reshape operation with a tt.load operation (block ptrs only)";
5050
5151 let description = [{
52- This pass attempts to fuse a tt.reshape operation with a tt.load operation using a block pointer .
52+ This pass attempts to fuse a tt.reshape operation with a tt.load operation that uses a block pointer.
5353 For example, given:
54- %q_27 = arith.constant 1 : i64
55- %ptr = tt.make_tensor_ptr %q_view, [%q, %q_23, %q_24], [%q_25, %q_26, %q_27], [%offset_5, %offset_1_13, %q_28]
54+ %ptr = tt.make_tensor_ptr %base_ptr, [%s0, %s1, %s2], [%a, %b, %c], [%x, %y, %z]
5655 {order = array<i32: 2, 1, 0>} : <tensor<1x512x64xf16>>
5756 %load = tt.load %ptr {boundaryCheck = array<i32: 1, 2>} : !tt.ptr<tensor<1x512x64xf16>>
58- %reshape = tt.reshape %load : tensor<1x512x64xf16> -> tensor<512x64xf16>
59- The transformation would drops the reshape operation and adjust the make_tensor_ptr operation as follows:
60- %q_27 = arith.constant 1 : i64
61- %ptr = tt.make_tensor_ptr %q_view, [%q_23, %q_24], [%q_26, %q_27], [%offset_1_13, %offset_5*%q_25+%q_28]
57+ %A = tt.reshape %load : tensor<1x512x64xf16> -> tensor<512x64xf16>
58+ %D = tt.dot %A, ... : tensor<512x64xf16> x tensor<64x32xf16> -> tensor<512x32xf16>
59+
60+ The transformation drops the reshape operation, and generates:
61+ %div = %a / %b
62+ %ptr = tt.make_tensor_ptr %base_ptr, [%s0 * %div + %s1, %s2], [%b, %c], [%x * %div + %y, %z]
6263 {order = array<i32: 1, 0>} : <tensor<512x64xf16>>
63- %load = tt.load %ptr {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<512x64xf16>>
64+ %A = tt.load %ptr {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<512x64xf16>>
65+ %D = tt.dot %A, ... : tensor<512x64xf16> x tensor<64x32xf16> -> tensor<512x32xf16>
6466 }];
6567
6668 let dependentDialects = [
0 commit comments