@@ -335,3 +335,56 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32
     tt.return
   }
 }
+
+// -----
+
+// COM: Case 5:
+// COM: Check that a tt.make_tensor_ptr op whose result is never loaded from is handled properly.
+// CHECK: #[[DPAS:.+]] = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
+#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [8, 4], repCluster = [4, 2], A = [32, 16], B = [16, 32], C = [32, 32]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 32 : i32, triton_gpu.target = "xpu", "triton_gpu.threads-per-warp" = 16 : i32, triton_intel_gpu.min_sg_size = 16 : i32, triton_intel_gpu.support_bf16_conversion, triton_intel_gpu.support_dpas, triton_intel_gpu.support_sg_2d_block} {
+  tt.func public @matmul_kernel_with_block_pointers(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+    // CHECK: @matmul_kernel_with_block_pointers
+    %c4_i32 = arith.constant 4 : i32
+    %c256_i32 = arith.constant 256 : i32
+    %c1024_i64 = arith.constant 1024 : i64
+    %c5120_i64 = arith.constant 5120 : i64
+    %c1_i64 = arith.constant 1 : i64
+    %c0_i32 = arith.constant 0 : i32
+    %c4096_i64 = arith.constant 4096 : i64
+    %c32_i32 = arith.constant 32 : i32
+    %c64_i32 = arith.constant 64 : i32
+    %c5120_i32 = arith.constant 5120 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #dpas>
+    %0 = tt.get_program_id x : i32
+    %1 = arith.divsi %0, %c64_i32 : i32
+    %2 = arith.muli %1, %c4_i32 : i32
+    %3 = arith.subi %c4_i32, %2 : i32
+    %4 = arith.minsi %3, %c4_i32 : i32
+    %5 = arith.remsi %0, %4 : i32
+    %6 = arith.addi %2, %5 : i32
+    %7 = arith.remsi %0, %c64_i32 : i32
+    %8 = arith.divsi %7, %4 : i32
+    %9 = arith.muli %6, %c256_i32 : i32
+    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
+    %10 = tt.make_tensor_ptr %arg0, [%c1024_i64, %c5120_i64], [%c5120_i64, %c1_i64], [%9, %c0_i32] {order = array<i32: 1, 0>} : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>
+    %11 = arith.muli %8, %c256_i32 : i32
+    // CHECK: tt.make_tensor_ptr {{.*}}, {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}], {{\[}}{{.*}}, {{.*}}] {order = array<i32: 0, 1>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+    %12 = tt.make_tensor_ptr %arg1, [%c5120_i64, %c4096_i64], [%c1_i64, %c5120_i64], [%c0_i32, %11] {order = array<i32: 0, 1>} : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
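+    // COM: Neither of the tensor pointers above is ever loaded from; the loop
+    // COM: below only advances them and carries the accumulator through unchanged.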
+    %13:3 = scf.for %arg3 = %c0_i32 to %c5120_i32 step %c32_i32 iter_args(%arg4 = %cst, %arg5 = %10, %arg6 = %12) -> (tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>, !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>) : i32 {
+      // CHECK: tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #[[DPAS]], kWidth = 2}>>>
+      // CHECK: tt.advance {{.*}}, {{\[}}{{.*}}, {{.*}}] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #[[DPAS]], kWidth = 2}>>>
+      %19 = tt.advance %arg5, [%c0_i32, %c32_i32] : <tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>
+      %20 = tt.advance %arg6, [%c32_i32, %c0_i32] : <tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
+      scf.yield %arg4, %19, %20 : tensor<256x256xf32, #dpas>, !tt.ptr<tensor<256x32xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #dpas, kWidth = 2}>>>, !tt.ptr<tensor<32x256xf16, #triton_gpu.dot_op<{opIdx = 1, parent = #dpas, kWidth = 2}>>>
+    }
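+    // COM: %13#0 is still the zero-filled accumulator, since the loop performed no loads or dot ops.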
+    %14 = tt.make_tensor_ptr %arg2, [%c1024_i64, %c4096_i64], [%c4096_i64, %c1_i64], [%9, %11] {order = array<i32: 1, 0>} : <tensor<256x256xf16, #dpas>>
+    %15 = arith.truncf %13#0 : tensor<256x256xf32, #dpas> to tensor<256x256xf16, #dpas>
+    // CHECK: tt.store {{.*}}, {{.*}}, {{.*}} : !tt.ptr<tensor<256x256xf16, #[[DPAS]]>
+    tt.store %14, %15 {boundaryCheck = array<i32: 0, 1>} : !tt.ptr<tensor<256x256xf16, #dpas>>
+    tt.return
+  }
+}