Skip to content

Commit c00f747

Browse files
authored
Revert "[Warp Specialization] Don't pipeline loops where latency ops are in the same stage" (#6983)
Reverts triton-lang/triton#6969
1 parent 8ed512f commit c00f747

File tree

2 files changed: +0 additions, −31 deletions

lib/Dialect/TritonGPU/Transforms/Pipeliner/ScheduleLoops.cpp

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -167,32 +167,6 @@ CoarseSchedule getInitialSchedule(scf::ForOp forOp,
   CoarseSchedule schedule;
   if (forOp->hasAttr(kWarpSpecializeAttrName) &&
       succeeded(schedule.deSerialize(forOp))) {
-    // The loop was partitioned from a warp-specialized loop, meaning it can
-    // have a partial view of the original loop stages. Re-schedule the loop
-    // root at the stages of the latency ops to prune unnecessary stages.
-    auto isLatencyOp = [&](Operation &op) {
-      return opLatency.count(&op) ||
-             isa<LocalStoreOp, LocalLoadOp, ttng::TMEMLoadOp, ttng::TMEMStoreOp,
-                 AsyncCopyGlobalToLocalOp, ttng::AsyncTMACopyGlobalToLocalOp,
-                 ttng::AsyncTMAGatherOp, ttng::MMAv5OpInterface,
-                 ttng::WaitBarrierOp, ttng::ArriveBarrierOp>(op);
-    };
-
-    // If there are no latency ops or all latency ops are in the same stage, we
-    // don't need to pipeline the loop. Return a new schedule with everything
-    // assigned to the same stage.
-    DenseSet<int> latencyStages;
-    auto ops = forOp.getBody()->without_terminator();
-    for (Operation &op : llvm::make_filter_range(ops, isLatencyOp))
-      latencyStages.insert(schedule[&op].first);
-    if (latencyStages.size() <= 1) {
-      CoarseSchedule normalized(/*numStages=*/1);
-      auto cluster = normalized.clusters.newAtFront();
-      for (Operation &op : ops)
-        normalized.insert(&op, 0, cluster);
-      return normalized;
-    }
-
     schedule.shrinkToFit();
     return schedule;
   }

test/TritonGPU/automatic-warp-specialization.mlir

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,11 @@ tt.func @matmul_change_desc_in_prologue(
 // CHECK-SAME: num_warps(1)
 // BASE-NOT: tt.make_tensor_descriptor
 // PIPELINE-NOT: tt.experimental_tensormap_create
-// PIPELINE-COUNT-1: tc_gen5_mma
-// PIPELINE-NOT: tc_gen5_mma
 // CHECK-LABEL: partition1
 // CHECK-SAME: num_warps(2)
 // BASE-COUNT-2: tt.make_tensor_descriptor
 // PIPELINE-COUNT-2: ttg.global_scratch_alloc {alignment = 128 : i32, nbytes = 512 : i32}
 // PIPELINE-COUNT-2: tt.experimental_tensormap_create
-// PIPELINE-NOT: tt.experimental_tensormap_create
-// PIPELINE-COUNT-2: async_tma_copy_global_to_local
-// PIPELINE-NOT: async_tma_copy_global_to_local
 // CHECK-NOT: partition2
 scf.for %k = %c0_i32 to %k_tiles step %c1_i32 iter_args(%acc = %zero, %flag = %true, %a_desc = %a_desc_undef, %b_desc = %b_desc_undef) -> (tensor<128x128xf32, #acc_layout>, i1, !tt.tensordesc<tensor<128x64xf16, #shared>>, !tt.tensordesc<tensor<64x128xf16, #shared>>) : i32 {
 %do_prologue = "prologue_cond"(%k) : (i32) -> i1

0 commit comments

Comments
 (0)