
Commit 79ace62

[Pipeliner] Fix epilogue peeling for num_stages=3+ (#4890)
- The epilogue ramp-down index must start at zero or greater, i.e. at max(0, total_iterations - max_stage), so that it stays aligned with the prologue ramp-up stages.
- If total_iterations < max_stage, the trailing stages are masked off.
- This commit mirrors upstream llvm/llvm-project#112418 and adds a functional test for correctness with num_stages=1,2,3,4.
1 parent 53c2965 commit 79ace62
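
The gist of the fix, as a minimal Python sketch (hypothetical helper, not part of the commit; the actual change builds MLIR arith ops, see the PipelineExpander.cpp diff below):

```python
# Sketch of the epilogue peeling math after this fix. The real code emits
# arith ops; this just models the integer logic.
def epilogue_stage_indices(total_iterations: int, max_stage: int):
    # Start at max(0, total_iterations - max_stage) so the ramp-down lines up
    # with the prologue ramp-up even for short loops.
    iter_i = max(0, total_iterations - max_stage)
    stages = []
    for i in range(1, max_stage + 1):
        # Stage i is masked off when the loop has fewer than i iterations.
        enabled = total_iterations >= i
        stages.append((i, iter_i, enabled))
        iter_i += 1
    return stages

# max_stage=3 (num_stages=4) with only 2 iterations: the last stage is masked
# instead of indexing a negative iteration.
print(epilogue_stage_indices(2, 3))
# [(1, 0, True), (2, 1, True), (3, 2, False)]
```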

File tree

3 files changed: +130 -104 lines

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipelineExpander.cpp

Lines changed: 35 additions & 35 deletions
```diff
@@ -285,19 +285,6 @@ LogicalResult LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) {
   Location loc = forOp.getLoc();
   SmallVector<Value> predicates(maxStage);
   for (int64_t i = 0; i < maxStage; i++) {
-    if (dynamicLoop) {
-      Type t = ub.getType();
-      // pred = ub > lb + (i * step)
-      Value iv = rewriter.create<arith::AddIOp>(
-          loc, lb,
-          rewriter.create<arith::MulIOp>(
-              loc, step,
-              rewriter.create<arith::ConstantOp>(
-                  loc, rewriter.getIntegerAttr(t, i))));
-      predicates[i] = rewriter.create<arith::CmpIOp>(
-          loc, arith::CmpIPredicate::slt, iv, ub);
-    }
-
     // special handling for induction variable as the increment is implicit.
     // iv = lb + i * step
     Type t = lb.getType();
@@ -308,6 +295,13 @@ LogicalResult LoopPipelinerInternal::emitPrologue(RewriterBase &rewriter) {
         rewriter.create<arith::ConstantOp>(loc,
                                            rewriter.getIntegerAttr(t, i))));
     setValueMapping(forOp.getInductionVar(), iv, i);
+
+    if (dynamicLoop) {
+      // pred = ub > lb + (i * step)
+      predicates[i] = rewriter.create<arith::CmpIOp>(
+          loc, arith::CmpIPredicate::slt, iv, ub);
+    }
+
     for (Operation *op : opOrder) {
       if (stages[op] > i)
         continue;
@@ -655,50 +649,56 @@ LogicalResult
 LoopPipelinerInternal::emitEpilogue(RewriterBase &rewriter,
                                     llvm::SmallVector<Value> &returnValues) {
   Location loc = forOp.getLoc();
+  Type t = lb.getType();
   // Emit different versions of the induction variable. They will be
   // removed by dead code if not used.
 
-  // range_diff = ub - lb
-  // total_iterations = (range_diff + step + (step < 0 ? 1 : -1)) / step
-  Type t = lb.getType();
-  Value zero =
-      rewriter.create<arith::ConstantOp>(loc, rewriter.getIntegerAttr(t, 0));
-  Value one =
-      rewriter.create<arith::ConstantOp>(loc, rewriter.getIntegerAttr(t, 1));
-  Value minusOne =
-      rewriter.create<arith::ConstantOp>(loc, rewriter.getIntegerAttr(t, -1));
+  auto createConst = [&](int v) {
+    return rewriter.create<arith::ConstantOp>(loc,
+                                              rewriter.getIntegerAttr(t, v));
+  };
+
+  // total_iterations = cdiv(range_diff, step);
+  // - range_diff = ub - lb
+  // - total_iterations = (range_diff + step + (step < 0 ? 1 : -1)) / step
+  Value zero = createConst(0);
+  Value one = createConst(1);
   Value stepLessZero = rewriter.create<arith::CmpIOp>(
       loc, arith::CmpIPredicate::slt, step, zero);
   Value stepDecr =
-      rewriter.create<arith::SelectOp>(loc, stepLessZero, one, minusOne);
+      rewriter.create<arith::SelectOp>(loc, stepLessZero, one, createConst(-1));
 
   Value rangeDiff = rewriter.create<arith::SubIOp>(loc, ub, lb);
   Value rangeIncrStep = rewriter.create<arith::AddIOp>(loc, rangeDiff, step);
   Value rangeDecr =
       rewriter.create<arith::AddIOp>(loc, rangeIncrStep, stepDecr);
   Value totalIterations = rewriter.create<arith::DivSIOp>(loc, rangeDecr, step);
 
+  // If total_iters < max_stage, start the epilogue at zero to match the
+  // ramp-up in the prologue.
+  // start_iter = max(0, total_iters - max_stage)
+  Value iterI = rewriter.create<arith::SubIOp>(loc, totalIterations,
+                                               createConst(maxStage));
+  iterI = rewriter.create<arith::MaxSIOp>(loc, zero, iterI);
+
   // Capture predicates for dynamic loops.
   SmallVector<Value> predicates(maxStage + 1);
 
-  for (int64_t i = 0; i < maxStage; i++) {
-    // iterI = total_iters - 1 - i
-    // May go negative...
-    Value minusI =
-        rewriter.create<arith::ConstantOp>(loc, rewriter.getIntegerAttr(t, -i));
-    Value iterI = rewriter.create<arith::AddIOp>(
-        loc, rewriter.create<arith::AddIOp>(loc, totalIterations, minusOne),
-        minusI);
+  for (int64_t i = 1; i <= maxStage; i++) {
     // newLastIter = lb + step * iterI
     Value newlastIter = rewriter.create<arith::AddIOp>(
        loc, lb, rewriter.create<arith::MulIOp>(loc, step, iterI));
 
-    setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i);
+    setValueMapping(forOp.getInductionVar(), newlastIter, i);
+
+    // increment to next iterI
+    iterI = rewriter.create<arith::AddIOp>(loc, iterI, one);
 
     if (dynamicLoop) {
-      // pred = iterI >= 0
-      predicates[i + 1] = rewriter.create<arith::CmpIOp>(
-          loc, arith::CmpIPredicate::sge, iterI, zero);
+      // Disable stages when `i` is greater than total_iters.
+      // pred = total_iters >= i
+      predicates[i] = rewriter.create<arith::CmpIOp>(
+          loc, arith::CmpIPredicate::sge, totalIterations, createConst(i));
     }
   }
```
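
A small Python model (an illustrative sketch, not part of the commit) of the signed ceiling division emitted above; `arith.divsi` truncates toward zero, which Python's `int()` mirrors:

```python
# total_iterations = (ub - lb + step + (step < 0 ? 1 : -1)) / step,
# i.e. a signed ceiling division that also handles negative steps.
def total_iterations(lb: int, ub: int, step: int) -> int:
    step_decr = 1 if step < 0 else -1
    range_decr = (ub - lb) + step + step_decr
    return int(range_decr / step)  # truncate toward zero, like arith.divsi

assert total_iterations(0, 10, 3) == 4   # iterations 0, 3, 6, 9
assert total_iterations(0, 10, 5) == 2   # iterations 0, 5
assert total_iterations(10, 0, -3) == 4  # iterations 10, 7, 4, 1
```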

python/test/unit/language/test_pipeliner.py

Lines changed: 29 additions & 0 deletions
```diff
@@ -180,3 +180,32 @@ def test_pipeline_vecadd(device):
     assert ttgir.count(f"num = {NUM_STAGES} : i32") != 0, "num_stages not match"
     # 3. check alloc
     assert ttgir.count("triton_gpu.local_alloc") == 2, "alloc number not match"
+
+
+@pytest.mark.parametrize("ROW_COUNT", [0, 1, 2, 3])
+@pytest.mark.parametrize("NUM_STAGES", [1, 2, 3, 4, 5])
+def test_pipeline_epilogue(ROW_COUNT, NUM_STAGES, device):
+
+    @triton.jit
+    def kernel_up(output_ptr, input_ptr, input_row_stride, output_row_stride, n_rows, n_cols, BLOCK_SIZE: tl.constexpr,
+                  NUM_STAGES: tl.constexpr):
+        row_step = tl.num_programs(0)
+        col_offsets = tl.arange(0, BLOCK_SIZE)
+        mask = col_offsets < n_cols
+        for row_idx in tl.range(0, n_rows, row_step, num_stages=NUM_STAGES):
+            row_start_ptr = input_ptr + row_idx * input_row_stride
+            input_ptrs = row_start_ptr + col_offsets
+            val = tl.load(input_ptrs, mask=mask, other=-float('inf'))
+            val += 1.0
+            output_row_start_ptr = output_ptr + row_idx * output_row_stride
+            output_ptrs = output_row_start_ptr + col_offsets
+            tl.store(output_ptrs, val, mask=mask)
+
+    width = ROW_COUNT
+    depth = 78
+    x = torch.zeros(width, depth, device=device)
+    y0 = torch.rand_like(x)
+    n_rows, n_cols = x.shape
+    BLOCK_SIZE = triton.next_power_of_2(n_cols)
+    kernel_up[(1, )](y0, x, x.stride(0), y0.stride(0), n_rows, n_cols, BLOCK_SIZE, NUM_STAGES)
+    assert (y0 == torch.ones_like(x)).all()
```
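
With a one-program grid, the loop runs ROW_COUNT iterations, so the grid above deliberately includes cases where the loop is shorter than the pipeline depth. Assuming the pipeliner's max_stage = NUM_STAGES - 1 convention, a quick sketch enumerates which configurations exercise the new masked-epilogue path:

```python
# Configurations where total_iterations < max_stage, i.e. where the trailing
# epilogue stages must be masked (max_stage = NUM_STAGES - 1 is assumed).
for row_count in [0, 1, 2, 3]:
    for num_stages in [1, 2, 3, 4, 5]:
        total_iterations = row_count  # one row per loop iteration
        max_stage = num_stages - 1
        if total_iterations < max_stage:
            print(f"masked epilogue: ROW_COUNT={row_count}, NUM_STAGES={num_stages}")
```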

test/TritonGPU/loop-pipeline.mlir

Lines changed: 66 additions & 69 deletions
```diff
@@ -84,19 +84,18 @@
 // AMD: %[[SUBI_23:.*]] = arith.subi %[[UB]], %[[LB]]
 // AMD: %[[ADDI_24:.*]] = arith.addi %[[SUBI_23]], %[[STEP]]
 // AMD: %[[ADDI_25:.*]] = arith.addi %[[ADDI_24]], %[[SELECT_22]]
-// AMD: %[[DIVUI_26:.*]] = arith.divsi %[[ADDI_25]], %[[STEP]]
-// AMD: %[[ADDI_27:.*]] = arith.addi %[[DIVUI_26]], %[[CM1]]
-// AMD: %[[CMPI_28:.*]] = arith.cmpi sge, %[[ADDI_27]], %[[C0]]
-// AMD: %[[LOCAL_LOAD_27:.*]] = triton_gpu.local_load %[[FOR]]#4
-// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %[[FOR]]#5
-// AMD: %[[MULF_29:.*]] = arith.mulf %[[LOCAL_LOAD_28]], %{{.*}}
-// AMD: %[[IF_30:.*]] = scf.if %[[CMPI_28]]
-// AMD: %[[DOT_32:.*]] = tt.dot %[[LOCAL_LOAD_27]], %[[MULF_29]], %[[FOR]]#2
-// AMD: scf.yield %[[DOT_32]]
+// AMD: %[[DIVSI_26:.*]] = arith.divsi %[[ADDI_25]], %[[STEP]]
+// AMD: %[[CMPI_27:.*]] = arith.cmpi sge, %[[DIVSI_26]], %{{.*}}
+// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %{{.*}}#4
+// AMD: %[[LOCAL_LOAD_29:.*]] = triton_gpu.local_load %{{.*}}#5
+// AMD: %[[MULF_30:.*]] = arith.mulf %[[LOCAL_LOAD_29]], %{{.*}}
+// AMD: %[[IF_31:.*]] = scf.if %[[CMPI_27]]
+// AMD: %[[DOT_33:.*]] = tt.dot %[[LOCAL_LOAD_28]], %[[MULF_30]], %{{.*}}#2
+// AMD: scf.yield %[[DOT_33]]
 // AMD: } else {
-// AMD: scf.yield %[[FOR]]#2
+// AMD: scf.yield %{{.*}}#2
 // AMD: }
-// AMD: %[[SELECT_31:.*]] = arith.select %[[CMPI_28]], %[[IF_30]], %[[FOR]]#2
+// AMD: %[[SELECT_32:.*]] = arith.select %[[CMPI_27]], %[[IF_31]], %{{.*}}#2
 // AMD: triton_gpu.local_dealloc %{{.*}}
 // AMD: triton_gpu.local_dealloc %{{.*}}
 
@@ -414,35 +413,33 @@ tt.func @matmul_loop_single_pipeline(%lb : index, %ub : index, %step : index,
 // AMD: triton_gpu.local_store %[[ARG14]], %[[MEMDESC_SUBVIEW_58]]
 // AMD: scf.yield %[[DOT_45]], %[[ADDPTR_46]], %[[ADDPTR_47]], %[[SELECT_56]], %[[MEMDESC_SUBVIEW_57]], %[[MEMDESC_SUBVIEW_58]], %[[LOAD_48]], %[[LOAD_53]]
 // AMD: }
-// AMD: %[[ADDI_26:.*]] = arith.addi %{{.*}}, %{{.*}}-1
-// AMD: %[[CMPI_27:.*]] = arith.cmpi sge, %[[ADDI_26]], %{{.*}}
-// AMD: %[[ADDI_28:.*]] = arith.addi %{{.*}}, %{{.*}}-2
-// AMD: %[[CMPI_29:.*]] = arith.cmpi sge, %[[ADDI_28]], %{{.*}}
-// AMD: %[[LOCAL_LOAD_30:.*]] = triton_gpu.local_load %{{.*}}#4
-// AMD: %[[LOCAL_LOAD_31:.*]] = triton_gpu.local_load %{{.*}}#5
-// AMD: %[[IF_32:.*]] = scf.if %[[CMPI_27]]
-// AMD: %[[DOT_43:.*]] = tt.dot %[[LOCAL_LOAD_30]], %[[LOCAL_LOAD_31]], %{{.*}}#0
-// AMD: scf.yield %[[DOT_43]]
+// AMD: %[[CMPI_26:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
+// AMD: %[[CMPI_27:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
+// AMD: %[[LOCAL_LOAD_28:.*]] = triton_gpu.local_load %{{.*}}#4
+// AMD: %[[LOCAL_LOAD_29:.*]] = triton_gpu.local_load %{{.*}}#5
+// AMD: %[[IF_30:.*]] = scf.if %[[CMPI_26]]
+// AMD: %[[DOT_41:.*]] = tt.dot %[[LOCAL_LOAD_28]], %[[LOCAL_LOAD_29]], %{{.*}}#0
+// AMD: scf.yield %[[DOT_41]]
 // AMD: } else {
-// AMD: scf.yield %{{.*}}#0
+// AMD: scf.yield %{{.*}}#0
 // AMD: }
-// AMD: %[[ADDI_33:.*]] = arith.addi %{{.*}}#3, %{{.*}}
-// AMD: %[[CMPI_34:.*]] = arith.cmpi slt, %[[ADDI_33]], %{{.*}}
-// AMD: %[[SELECT_35:.*]] = arith.select %[[CMPI_34]], %[[ADDI_33]], %{{.*}}
-// AMD: %[[MEMDESC_SUBVIEW_36:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_35]], %{{.*}}, %{{.*}}]
-// AMD: triton_gpu.local_store %{{.*}}#6, %[[MEMDESC_SUBVIEW_36]]
-// AMD: %[[MEMDESC_SUBVIEW_37:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_35]], %{{.*}}, %{{.*}}]
-// AMD: triton_gpu.local_store %{{.*}}#7, %[[MEMDESC_SUBVIEW_37]]
-// AMD: %[[SELECT_38:.*]] = arith.select %[[CMPI_27]], %[[IF_32]], %{{.*}}#0
-// AMD: %[[LOCAL_LOAD_39:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_36]]
-// AMD: %[[LOCAL_LOAD_40:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_37]]
-// AMD: %[[IF_41:.*]] = scf.if %[[CMPI_29]]
-// AMD: %[[DOT_43:.*]] = tt.dot %[[LOCAL_LOAD_39]], %[[LOCAL_LOAD_40]], %[[SELECT_38]]
-// AMD: scf.yield %[[DOT_43]]
+// AMD: %[[ADDI_31:.*]] = arith.addi %{{.*}}#3, %{{.*}}
+// AMD: %[[CMPI_32:.*]] = arith.cmpi slt, %[[ADDI_31]], %{{.*}}
+// AMD: %[[SELECT_33:.*]] = arith.select %[[CMPI_32]], %[[ADDI_31]], %{{.*}}
+// AMD: %[[MEMDESC_SUBVIEW_34:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_33]], %{{.*}}, %{{.*}}]
+// AMD: triton_gpu.local_store %{{.*}}#6, %[[MEMDESC_SUBVIEW_34]]
+// AMD: %[[MEMDESC_SUBVIEW_35:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_33]], %{{.*}}, %{{.*}}]
+// AMD: triton_gpu.local_store %{{.*}}#7, %[[MEMDESC_SUBVIEW_35]]
+// AMD: %[[SELECT_36:.*]] = arith.select %[[CMPI_26]], %[[IF_30]], %{{.*}}#0
+// AMD: %[[LOCAL_LOAD_37:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_34]]
+// AMD: %[[LOCAL_LOAD_38:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_35]]
+// AMD: %[[IF_39:.*]] = scf.if %[[CMPI_27]]
+// AMD: %[[DOT_41:.*]] = tt.dot %[[LOCAL_LOAD_37]], %[[LOCAL_LOAD_38]], %[[SELECT_36]]
+// AMD: scf.yield %[[DOT_41]]
 // AMD: } else {
-// AMD: scf.yield %[[SELECT_38]]
+// AMD: scf.yield %[[SELECT_36]]
 // AMD: }
-// AMD: %[[SELECT_42:.*]] = arith.select %[[CMPI_29]], %[[IF_41]], %[[SELECT_38]]
+// AMD: %[[SELECT_40:.*]] = arith.select %[[CMPI_27]], %[[IF_39]], %[[SELECT_36]]
 // AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_0]]
 // AMD: triton_gpu.local_dealloc %[[LOCAL_ALLOC_1]]
 
@@ -976,6 +973,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
 
 // AMD-DIS: #[[$SHARED_LAYOUT:shared.*]] = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0], hasLeadingOffset = false}>
 // AMD-LABEL: tt.func @indirect_load_shared_layout
+// AMD: %[[LOCAL_ALLOC_0:.*]] = triton_gpu.local_alloc
+// AMD: %[[LOCAL_ALLOC_1:.*]] = triton_gpu.local_alloc
 // AMD: %{{.*}}:7 = scf.for %[[ARG6:.*]] = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[ARG7:.*]] = %{{.*}}, %[[ARG8:.*]] = %{{.*}}, %[[ARG9:.*]] = %{{.*}}, %[[ARG10:.*]] = %{{.*}}, %[[ARG11:.*]] = %{{.*}}, %[[ARG12:.*]] = %{{.*}}, %[[ARG13:.*]] = %{{.*}})
 // AMD: %[[LOCAL_LOAD_47:.*]] = triton_gpu.local_load %[[ARG11]]
 // AMD: %[[LOCAL_LOAD_48:.*]] = triton_gpu.local_load %[[ARG12]]
@@ -998,44 +997,42 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
 // AMD: triton_gpu.local_store %[[LOAD_57]], %[[MEMDESC_SUBVIEW_63]]
 // AMD: scf.yield %[[DOT_49]], %[[ADDPTR_50]], %[[ADDPTR_51]], %[[SELECT_61]], %[[MEMDESC_SUBVIEW_62]], %[[MEMDESC_SUBVIEW_63]], %[[LOAD_58]]
 // AMD: }
-// AMD: %[[ADDI_21:.*]] = arith.addi %{{.*}}, %{{.*}}-1
-// AMD: %[[CMPI_22:.*]] = arith.cmpi sge, %[[ADDI_21]], %{{.*}}
-// AMD: %[[ADDI_23:.*]] = arith.addi %{{.*}}, %{{.*}}-2
-// AMD: %[[CMPI_24:.*]] = arith.cmpi sge, %[[ADDI_23]], %{{.*}}
-// AMD: %[[LOCAL_LOAD_25:.*]] = triton_gpu.local_load %{{.*}}#4
-// AMD: %[[LOCAL_LOAD_26:.*]] = triton_gpu.local_load %{{.*}}#5
-// AMD: %[[IF_27:.*]] = scf.if %[[CMPI_22]]
-// AMD: %[[DOT_47:.*]] = tt.dot %[[LOCAL_LOAD_25]], %[[LOCAL_LOAD_26]], %{{.*}}#0
-// AMD: scf.yield %[[DOT_47]]
+// AMD: %[[CMPI_21:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
+// AMD: %[[CMPI_22:.*]] = arith.cmpi sge, %{{.*}}, %{{.*}}
+// AMD: %[[LOCAL_LOAD_23:.*]] = triton_gpu.local_load %{{.*}}#4
+// AMD: %[[LOCAL_LOAD_24:.*]] = triton_gpu.local_load %{{.*}}#5
+// AMD: %[[IF_25:.*]] = scf.if %[[CMPI_21]]
+// AMD: %[[DOT_45:.*]] = tt.dot %[[LOCAL_LOAD_23]], %[[LOCAL_LOAD_24]], %{{.*}}#0
+// AMD: scf.yield %[[DOT_45]]
 // AMD: } else {
 // AMD: scf.yield %{{.*}}#0
 // AMD: }
-// AMD: %[[ADDPTR_28:.*]] = tt.addptr %{{.*}}#1, %{{.*}}
-// AMD: %[[SPLAT_29:.*]] = tt.splat %[[CMPI_24]]
-// AMD: %[[LOAD_30:.*]] = tt.load %[[ADDPTR_28]], %[[SPLAT_29]]
-// AMD: %[[EXPAND_DIMS_31:.*]] = tt.expand_dims %{{.*}}#6 {axis = 1 : i32}
-// AMD: %[[BROADCAST_32:.*]] = tt.broadcast %[[EXPAND_DIMS_31]]
-// AMD: %[[MULI_33:.*]] = arith.muli %{{.*}}, %[[BROADCAST_32]]
-// AMD: %[[ADDPTR_34:.*]] = tt.addptr %{{.*}}, %[[MULI_33]]
-// AMD: %[[SPLAT_35:.*]] = tt.splat %[[CMPI_24]]
-// AMD: %[[LOAD_36:.*]] = tt.load %[[ADDPTR_34]], %[[SPLAT_35]]
-// AMD: %[[ADDI_37:.*]] = arith.addi %{{.*}}#3, %{{.*}}
-// AMD: %[[CMPI_38:.*]] = arith.cmpi slt, %[[ADDI_37]], %{{.*}}
-// AMD: %[[SELECT_39:.*]] = arith.select %[[CMPI_38]], %[[ADDI_37]], %{{.*}}
-// AMD: %[[MEMDESC_SUBVIEW_40:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_39]], %{{.*}}, %{{.*}}]
-// AMD: triton_gpu.local_store %[[LOAD_30]], %[[MEMDESC_SUBVIEW_40]]
-// AMD: %[[MEMDESC_SUBVIEW_41:.*]] = triton_gpu.memdesc_subview %{{.*}}[%[[SELECT_39]], %{{.*}}, %{{.*}}]
-// AMD: triton_gpu.local_store %[[LOAD_36]], %[[MEMDESC_SUBVIEW_41]]
-// AMD: %[[SELECT_42:.*]] = arith.select %[[CMPI_22]], %[[IF_27]], %{{.*}}#0
-// AMD: %[[LOCAL_LOAD_43:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_40]]
-// AMD: %[[LOCAL_LOAD_44:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_41]]
-// AMD: %[[IF_45:.*]] = scf.if %[[CMPI_24]]
-// AMD: %[[DOT_47:.*]] = tt.dot %[[LOCAL_LOAD_43]], %[[LOCAL_LOAD_44]], %[[SELECT_42]]
-// AMD: scf.yield %[[DOT_47]]
+// AMD: %[[ADDPTR_26:.*]] = tt.addptr %{{.*}}#1, %{{.*}}
+// AMD: %[[SPLAT_27:.*]] = tt.splat %[[CMPI_22]]
+// AMD: %[[LOAD_28:.*]] = tt.load %[[ADDPTR_26]], %[[SPLAT_27]]
+// AMD: %[[EXPAND_DIMS_29:.*]] = tt.expand_dims %{{.*}}#6 {axis = 1 : i32}
+// AMD: %[[BROADCAST_30:.*]] = tt.broadcast %[[EXPAND_DIMS_29]]
+// AMD: %[[MULI_31:.*]] = arith.muli %{{.*}}, %[[BROADCAST_30]]
+// AMD: %[[ADDPTR_32:.*]] = tt.addptr %{{.*}}, %[[MULI_31]]
+// AMD: %[[SPLAT_33:.*]] = tt.splat %[[CMPI_22]]
+// AMD: %[[LOAD_34:.*]] = tt.load %[[ADDPTR_32]], %[[SPLAT_33]]
+// AMD: %[[ADDI_35:.*]] = arith.addi %{{.*}}#3, %{{.*}}
+// AMD: %[[CMPI_36:.*]] = arith.cmpi slt, %[[ADDI_35]], %{{.*}}
+// AMD: %[[SELECT_37:.*]] = arith.select %[[CMPI_36]], %[[ADDI_35]], %{{.*}}
+// AMD: %[[MEMDESC_SUBVIEW_38:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_0]][%[[SELECT_37]], %{{.*}}, %{{.*}}]
+// AMD: triton_gpu.local_store %[[LOAD_28]], %[[MEMDESC_SUBVIEW_38]]
+// AMD: %[[MEMDESC_SUBVIEW_39:.*]] = triton_gpu.memdesc_subview %[[LOCAL_ALLOC_1]][%[[SELECT_37]], %{{.*}}, %{{.*}}]
+// AMD: triton_gpu.local_store %[[LOAD_34]], %[[MEMDESC_SUBVIEW_39]]
+// AMD: %[[SELECT_40:.*]] = arith.select %[[CMPI_21]], %[[IF_25]], %{{.*}}#0
+// AMD: %[[LOCAL_LOAD_41:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_38]]
+// AMD: %[[LOCAL_LOAD_42:.*]] = triton_gpu.local_load %[[MEMDESC_SUBVIEW_39]]
+// AMD: %[[IF_43:.*]] = scf.if %[[CMPI_22]]
+// AMD: %[[DOT_45:.*]] = tt.dot %[[LOCAL_LOAD_41]], %[[LOCAL_LOAD_42]], %[[SELECT_40]]
+// AMD: scf.yield %[[DOT_45]]
 // AMD: } else {
-// AMD: scf.yield %[[SELECT_42]]
+// AMD: scf.yield %[[SELECT_40]]
 // AMD: }
-// AMD: %[[SELECT_46:.*]] = arith.select %[[CMPI_24]], %[[IF_45]], %[[SELECT_42]]
+// AMD: %[[SELECT_44:.*]] = arith.select %[[CMPI_22]], %[[IF_43]], %[[SELECT_40]]
 // AMD: triton_gpu.local_dealloc %{{.*}}
 // AMD: triton_gpu.local_dealloc %{{.*}}
```
