3131#include " mlir/Transforms/GreedyPatternRewriteDriver.h"
3232#include " mlir/Transforms/Passes.h"
3333#include " llvm/ADT/SetOperations.h"
34+ #include " llvm/Support/MathExtras.h"
3435
3536#include < algorithm>
3637#include < map>
@@ -102,11 +103,11 @@ AddressSpace getAddressSpace(MemrefTypedValue val) {
102103MemoryAccessType getOperandAccessType (Operation *op, Value operand) {
103104 if (hasEffect<MemoryEffects::Write>(op, operand)) {
104105 return MemoryAccessType::WRITE;
105- } else if (hasEffect<MemoryEffects::Read>(op, operand)) {
106+ }
107+ if (hasEffect<MemoryEffects::Read>(op, operand)) {
106108 return MemoryAccessType::READ;
107- } else {
108- return MemoryAccessType::UNKNOWN;
109109 }
110+ return MemoryAccessType::UNKNOWN;
110111}
111112
112113// Simple rewrite pass to remove the stages and backward barriers in the
@@ -253,11 +254,14 @@ void createSchedule(SmallVector<rock::StageOp> &stages,
253254 // Create the dependency graph
254255 DagType dag = createDependencyGraph (stages, resources);
255256
256- // Start building the schedules
257- //
258- // Since we accept the stages from the user, we don't need to do any
259- // analysis to determine what goes in each stage. We only have to group things
260- // in set of stages of length II.
257+ // Definition of initiation interval (II):
258+ // The initiation interval is the number of cycles in each iteration of a
259+ // loop. Stages that run in parallel count as a single cycle, and each stage
260+ // is assumed to execute in one cycle.
261+ // Start building the schedules. Since we accept the stages from the user, we
262+ // don't need to do any analysis to determine what goes in each stage. Each
263+ // group of `II` consecutive stages executes in sequence, and all groups of
264+ // `II` stages execute in parallel.
261265 // For instance, consider the following unpipelined schedule. The column `t`
262266 // represents
263267 // the time slot, and the subsequent columns represent the iterations.
@@ -282,7 +286,7 @@ void createSchedule(SmallVector<rock::StageOp> &stages,
282286 // +===+=========+
283287 // In this case, we reduced the time slots to 3, and we have 2 sets of stages
284288 // running in parallel. Please note that conflicts can only happen between S0
285- // and S3. If we increase II, we generate the following pipeline:
289+ // and S3. If we decrease II, we generate the following pipeline:
286290 // +t\i+=== 0 ===++=== 1 ===+
287291 // + 0 +== S0 ==++== S2 ==+
288292 // +===+=========++=========+
@@ -363,7 +367,7 @@ void createSchedule(SmallVector<rock::StageOp> &stages,
363367
364368 // Whatever resource is shared, we need to select among multiple buffers.
365369 for (size_t i = 0 ; i < parallelStages.size (); i++) {
366- // The only resource that can conflict btween different stages is memory
370+ // The only resource that can conflict between different stages is memory
367371 // If there are memory conflicts we can sort them via multibuffers. I.e.,
368372 // we can (logically) provide a different buffer for different cycles
369373 for (size_t j = i + 1 ; j < parallelStages.size (); j++) {
@@ -385,31 +389,31 @@ void createSchedule(SmallVector<rock::StageOp> &stages,
385389
386390 // Add the parallel stages
387391 for (auto stage : parallelStages) {
388- schedule.push_back ({ stage, stageIter[stage]} );
392+ schedule.emplace_back ( stage, stageIter[stage]);
389393 }
390394 }
391395}
392396
393397// Prune a dependency graph taking into account multi-buffers. Since
394398// multi-buffers are logically different for each iteration, if the dependency
395399 // on a multi-buffer spans multiple iterations then it can be pruned
396- DagType pruneGraph (DagType dag) {
400+ DagType pruneGraph (const DagType & dag) {
397401 DagType prunedGraph;
398402 // Multibuffers have the logical property of being unique for each iteration
399403 // of the loop. Hence, a dependency on a multi-buffer that spans two different
400404 // iterations can be pruned. In other words, if stageA accesses LDS in
401405 // iteration i and stageB accesses LDS in iteration j, stageA and stageB
402406 // have no dependencies as long as i != j.
403- for (auto [sink , edges] : dag) {
404- for (auto [source , deps] : edges) {
407+ for (auto [source , edges] : dag) {
408+ for (auto [sink , deps] : edges) {
405409 DenseSet<std::pair<rock::GpuAllocOp, DependencyType>> newDeps;
406410 for (auto [alloc, type] : deps) {
407411 if (getAddressSpace (alloc) != gpu::AddressSpace::Workgroup)
408412 continue ;
409413 newDeps.insert ({alloc, type});
410414 }
411415 if (!newDeps.empty ())
412- prunedGraph[sink][source ] = newDeps;
416+ prunedGraph[source][sink ] = newDeps;
413417 }
414418 }
415419 return prunedGraph;
@@ -441,18 +445,13 @@ void placeBarriers(IRRewriter &rewriter, Location loc, scf::ForOp forOp,
441445 ArrayRef<rock::StageOp> stages,
442446 SetVector<rock::GpuAllocOp> &allocs,
443447 SmallVector<rock::StageOp> &extendedStages,
444- int64_t &initiationInterval) {
448+ int64_t &initiationInterval, int64_t numIterations ) {
445449 DagType dag = createDependencyGraph (stages, allocs);
446450 dag = pruneGraph (dag);
447451
448- auto maybeNumIterations =
449- rock::computeConstDiff (forOp.getLowerBound (), forOp.getUpperBound ());
450-
451452 // If there is a loop, we probably need a backward barrier, i.e.,
452453 // an LDS barrier that takes the loop dependency into account
453- const bool addBackwardBarrier =
454- (!maybeNumIterations.has_value () ||
455- (maybeNumIterations.has_value () && maybeNumIterations.value () > 1 ));
454+ const bool addBackwardBarrier = numIterations > 1 ;
456455
457456 DenseMap<rock::StageOp, int > timeSlotMap;
458457 int timeSlot = 0 ;
@@ -461,17 +460,17 @@ void placeBarriers(IRRewriter &rewriter, Location loc, scf::ForOp forOp,
461460 timeSlot++;
462461 }
463462
464- // Algorithm for barrier placment :
463+ // Algorithm for barrier placement :
465464 // a. Add forward barriers to address the dependency in the basic block
466465 // b. Add backward barriers to account for loop carried dependency
467466 // c. Add empty stages to make the pipeline balanced, so that we can double up
468- // the initiation interval and let the pipeline transformation automaticall
469- // do the work for us
467+ // the initiation interval and let the pipeline transformation
468+ // automatically do the work for us
470469 DenseSet<rock::StageOp> forwardStages;
471470
472471 // a. Place forward barriers
473- for (auto [source, edges] : dag) {
474- for (auto [sink, deps] : edges) {
472+ for (const auto & [source, edges] : dag) {
473+ for (const auto & [sink, deps] : edges) {
475474 if (!forwardStages.contains (sink)) {
476475 forwardStages.insert (sink);
477476 }
@@ -558,6 +557,28 @@ SmallVector<scf::ForOp> collectLoopLevels(mlir::func::FuncOp func) {
558557 return loops;
559558}
560559
560+ void adjustInitiationInterval (int64_t numIterations, size_t numStages,
561+ int64_t &ii) {
562+ int64_t numParallelStages = llvm::divideCeil (numStages, ii);
563+ // calculate number of prologue executions
564+ int64_t numPrologues = numParallelStages - 1 ;
565+ // if number of iterations are less than number of prologues that are going
566+ // to be emitted, it will not result in correct output therefore increase II
567+ // until that condition becomes false. This can help achieve maximum loop
568+ // pipelining
569+ while (numIterations < numPrologues) {
570+ ii++;
571+ LLVM_DEBUG (DBGS () << " Adjusted II to " << ii << " \n " );
572+ numParallelStages = llvm::divideCeil (numStages, ii);
573+ numPrologues = numParallelStages - 1 ;
574+ }
575+ LLVM_DEBUG (DBGS () << " Number of parallel stages: " << numParallelStages
576+ << " \n " );
577+ LLVM_DEBUG (DBGS () << " Number of Prologues: " << numPrologues << " \n " );
578+ // num of prologues == number of epilogues
579+ LLVM_DEBUG (DBGS () << " Number of Epilogues: " << numPrologues << " \n " );
580+ }
581+
561582struct RockPipeline : public rock ::impl::RockPipelinePassBase<RockPipeline> {
562583 using rock::impl::RockPipelinePassBase<RockPipeline>::RockPipelinePassBase;
563584 void runOnOperation () override ;
@@ -577,19 +598,25 @@ void RockPipeline::runOnOperation() {
577598
578599 // Always (try to) multi-buffer by one and store the new
579600 // allocs in a set
601+ // Store multibuffers in "multiAllocs" and store all buffers
602+ // including private and global in "resources"
580603 llvm::SetVector<rock::GpuAllocOp> multiAllocs;
604+ llvm::SetVector<rock::GpuAllocOp> resources;
581605 for (auto alloc : singleAllocs) {
582606 SmallVector<rock::GpuAllocOp> newAllocs;
583- if (succeeded (rock::multiBuffer (rewriter, alloc, newAllocs, 1 , true )))
607+ if (succeeded (rock::multiBuffer (rewriter, alloc, newAllocs, 1 , true ))) {
584608 multiAllocs.insert (newAllocs.back ());
609+ resources.insert (newAllocs.back ());
610+ } else {
611+ resources.insert (alloc);
612+ }
585613 }
586614
587615 // Collect the global resources (i.e., the memory allocations)
588616 // Note: we can only have two kinds of memory:
589617 // - Registers
590618 // - LDS
591619 DenseMap<rock::GpuAllocOp, int > multiBufferFactors;
592- llvm::MapVector<scf::ForOp, ScheduleType> scheduleMap;
593620 for (auto res : multiAllocs)
594621 multiBufferFactors[res] = 1 ;
595622
@@ -632,19 +659,32 @@ void RockPipeline::runOnOperation() {
632659 });
633660
634661 if (stages.empty ())
635- WalkResult::advance () ;
662+ continue ;
636663
637664 LLVM_DEBUG (DBGS () << " Number of stages: " << stages.size () << " \n " );
638665 LLVM_DEBUG (DBGS () << " Initiation Interval: " << ii << " \n " );
666+ size_t numStages = stages.size ();
667+ auto maybeNumIterations =
668+ rock::computeConstDiff (forOp.getLowerBound (), forOp.getUpperBound ());
669+ assert (isConstantIntValue (forOp.getStep (), 1 ) &&
670+ " Step size other one is not permitted in rock-pipeline" );
671+ if (!maybeNumIterations.has_value ()) {
672+ emitError (loc,
673+ " Number of iterations are unknown while doing rock-pipeline\n " );
674+ return signalPassFailure ();
675+ }
676+ adjustInitiationInterval (maybeNumIterations.value (), numStages, ii);
639677
640678 // Insert the barriers as new stages
641679 SmallVector<rock::StageOp> extendedStages;
642- placeBarriers (rewriter, loc, forOp, stages, multiAllocs, extendedStages,
643- ii);
680+ // use "multiAllocs" to place LDS barriers, no need to explicitly place
681+ // barriers for registers or globals
682+ placeBarriers (rewriter, loc, forOp, stages, multiAllocs, extendedStages, ii,
683+ maybeNumIterations.value ());
644684
645685 ScheduleType schedule;
646- createSchedule (extendedStages, multiAllocs, ii, schedule,
647- multiBufferFactors);
686+ // use all "resources" to generate dependency graph and generate schedule
687+ createSchedule (extendedStages, resources, ii, schedule, multiBufferFactors);
648688
649689 RewritePatternSet patterns (&getContext ());
650690 mlir::scf::PipeliningOption options;
@@ -669,8 +709,8 @@ void RockPipeline::runOnOperation() {
669709 {
670710 if (removeStages) {
671711 RewritePatternSet patterns (&getContext ());
672- patterns.add <RemoveStagesRewritePattern, PushBarrierDownRewritePattern>(
673- &getContext ());
712+ patterns.add <RemoveStagesRewritePattern, PushBarrierDownRewritePattern,
713+ RemoveBackToBackBarriersRewritePattern>( &getContext ());
674714 (void )applyPatternsGreedily (getOperation (), std::move (patterns));
675715 }
676716 }
0 commit comments