@@ -167,32 +167,6 @@ CoarseSchedule getInitialSchedule(scf::ForOp forOp,
   CoarseSchedule schedule;
   if (forOp->hasAttr(kWarpSpecializeAttrName) &&
       succeeded(schedule.deSerialize(forOp))) {
-    // The loop was partitioned from a warp-specialized loop, meaning it can
-    // have a partial view of the original loop stages. Re-schedule the loop
-    // root at the stages of the latency ops to prune unnecessary stages.
-    auto isLatencyOp = [&](Operation &op) {
-      return opLatency.count(&op) ||
-             isa<LocalStoreOp, LocalLoadOp, ttng::TMEMLoadOp, ttng::TMEMStoreOp,
-                 AsyncCopyGlobalToLocalOp, ttng::AsyncTMACopyGlobalToLocalOp,
-                 ttng::AsyncTMAGatherOp, ttng::MMAv5OpInterface,
-                 ttng::WaitBarrierOp, ttng::ArriveBarrierOp>(op);
-    };
-
-    // If there are no latency ops or all latency ops are in the same stage, we
-    // don't need to pipeline the loop. Return a new schedule with everything
-    // assigned to the same stage.
-    DenseSet<int> latencyStages;
-    auto ops = forOp.getBody()->without_terminator();
-    for (Operation &op : llvm::make_filter_range(ops, isLatencyOp))
-      latencyStages.insert(schedule[&op].first);
-    if (latencyStages.size() <= 1) {
-      CoarseSchedule normalized(/*numStages=*/1);
-      auto cluster = normalized.clusters.newAtFront();
-      for (Operation &op : ops)
-        normalized.insert(&op, 0, cluster);
-      return normalized;
-    }
-
     schedule.shrinkToFit();
     return schedule;
   }
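The deleted block implements a small, self-contained check: collect the distinct pipeline stages occupied by latency-bound ops, and if they all share one stage (or there are none), collapse the whole loop body into a single stage, since multi-stage pipelining would buy nothing. Below is a minimal standalone sketch of that idea, under simplified assumptions: Op and Schedule are hypothetical stand-ins for mlir::Operation and CoarseSchedule, and the isLatencyOp flag abstracts the opLatency/isa<...> test from the diff. This is an illustration of the technique, not the Triton implementation.

#include <cassert>
#include <set>
#include <unordered_map>
#include <vector>

// Stand-in for an operation in the loop body (hypothetical type).
struct Op {
  bool isLatencyOp = false; // e.g. async copies, MMAs, barrier waits/arrives
};

// Stand-in for CoarseSchedule: maps each op to its pipeline stage.
using Schedule = std::unordered_map<const Op *, int>;

Schedule pruneStages(const std::vector<Op> &loopBody, const Schedule &sched) {
  // Collect the distinct stages occupied by latency ops.
  std::set<int> latencyStages;
  for (const Op &op : loopBody)
    if (op.isLatencyOp)
      latencyStages.insert(sched.at(&op));

  // Zero or one distinct stage: pipelining is unnecessary, so assign
  // every op to stage 0 instead of keeping the multi-stage schedule.
  if (latencyStages.size() <= 1) {
    Schedule normalized;
    for (const Op &op : loopBody)
      normalized[&op] = 0;
    return normalized;
  }
  return sched; // Latency ops span stages: keep the deserialized schedule.
}

int main() {
  std::vector<Op> body(4);
  body[1].isLatencyOp = body[3].isLatencyOp = true;

  Schedule sched;
  for (int i = 0; i < 4; ++i)
    sched[&body[i]] = i % 2; // both latency ops end up in stage 1

  // All latency ops share stage 1, so everything collapses to stage 0.
  Schedule pruned = pruneStages(body, sched);
  assert(pruned.at(&body[3]) == 0);
  return 0;
}

The real code also preserves op ordering through CoarseSchedule clusters (the newAtFront() call in the diff); the sketch drops that bookkeeping to keep the stage-collapse logic in focus.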