Skip to content

Commit 9fdebc4

Browse files
authored
Fix dependency graph creation in RockPipeline and avoid generating loops with negative iteration counts (#1760)
This fixes two bugs in RockPipeline. First, it was not creating the correct dependency graph while generating the schedule: only the "MultiBuffers" were used when building the dependency graph, which produced incorrect IR. Second, in certain cases it generated loops with a negative number of iterations. This PR adds checks to avoid pipelining in such cases.
1 parent 3403d50 commit 9fdebc4

File tree

2 files changed

+494
-46
lines changed

2 files changed

+494
-46
lines changed

mlir/lib/Dialect/Rock/Transforms/RockPipeline.cpp

Lines changed: 76 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
3232
#include "mlir/Transforms/Passes.h"
3333
#include "llvm/ADT/SetOperations.h"
34+
#include "llvm/Support/MathExtras.h"
3435

3536
#include <algorithm>
3637
#include <map>
@@ -102,11 +103,11 @@ AddressSpace getAddressSpace(MemrefTypedValue val) {
102103
MemoryAccessType getOperandAccessType(Operation *op, Value operand) {
103104
if (hasEffect<MemoryEffects::Write>(op, operand)) {
104105
return MemoryAccessType::WRITE;
105-
} else if (hasEffect<MemoryEffects::Read>(op, operand)) {
106+
}
107+
if (hasEffect<MemoryEffects::Read>(op, operand)) {
106108
return MemoryAccessType::READ;
107-
} else {
108-
return MemoryAccessType::UNKNOWN;
109109
}
110+
return MemoryAccessType::UNKNOWN;
110111
}
111112

112113
// Simple rewrite pass to remove the stages and backward barriers in the
@@ -253,11 +254,14 @@ void createSchedule(SmallVector<rock::StageOp> &stages,
253254
// Create the dependency graph
254255
DagType dag = createDependencyGraph(stages, resources);
255256

256-
// Start building the schedules
257-
//
258-
// Since we accept the stages from the user, we don't need to do any
259-
// analysis to determine what goes in each stage. We only have to group things
260-
// in set of stages of length II.
257+
// Definition of initiation interval (II)
258+
// Initiation interval is defined by number of cycles in each iteration of a
259+
// loop. Only one cycle is counted for parallel stages. Assume each stage
260+
// executes in one cycle.
261+
// Start building the schedules. Since we accept the stages from the user, we
262+
// don't need to do any analysis to determine what goes in each stage. Each
263+
// `II` number of stages will execute in sequence. All groups of `II`
264+
// stages execute in parallel.
261265
// For instance, consider the following unpipelined schedule. The column `t`
262266
// represents
263267
// the time slot, and the subsequent columns represent the iterations.
@@ -282,7 +286,7 @@ void createSchedule(SmallVector<rock::StageOp> &stages,
282286
// +===+=========+
283287
// In this case, we reduced the time slots to 3, and we have 2 set of stages
284288
// running in parallel. Please note that conflicts can only happen between S0
285-
// and S3. If we increase II, we generate the following pipeline:
289+
// and S3. If we decrease II, we generate the following pipeline:
286290
// +t\i+=== 0 ===++=== 1 ===+
287291
// + 0 +== S0 ==++== S2 ==+
288292
// +===+=========++=========+
@@ -363,7 +367,7 @@ void createSchedule(SmallVector<rock::StageOp> &stages,
363367

364368
// Whatever resource is shared, we need to select among multiple buffers.
365369
for (size_t i = 0; i < parallelStages.size(); i++) {
366-
// The only resource that can conflict btween different stages is memory
370+
// The only resource that can conflict between different stages is memory
367371
// If there are memory conflicts we can sort them via multibuffers. I.e.,
368372
// we can (logically) provide a different buffer for different cycles
369373
for (size_t j = i + 1; j < parallelStages.size(); j++) {
@@ -385,31 +389,31 @@ void createSchedule(SmallVector<rock::StageOp> &stages,
385389

386390
// Add the parallel stages
387391
for (auto stage : parallelStages) {
388-
schedule.push_back({stage, stageIter[stage]});
392+
schedule.emplace_back(stage, stageIter[stage]);
389393
}
390394
}
391395
}
392396

393397
// Prune a dependency graph taking into account multi-buffers. Since
394398
// multi-buffers are logically different for each iteration, if the dependency
395399
// on a multi-buffer spans multiple iteration then it can be pruned
396-
DagType pruneGraph(DagType dag) {
400+
DagType pruneGraph(const DagType &dag) {
397401
DagType prunedGraph;
398402
// Multibuffers have the logical property of being unique for each iteration
399403
// of the loop Hence, if we know we are dealing with a multi-buffer and the
400404
// dependency concerns two different iteration. In other words, if stageA
401405
// accesses LDS in iteration i and stageB accesses LDS in iteration j stageA
402406
// and stageB have no dependencies as long as i!=j
403-
for (auto [sink, edges] : dag) {
404-
for (auto [source, deps] : edges) {
407+
for (auto [source, edges] : dag) {
408+
for (auto [sink, deps] : edges) {
405409
DenseSet<std::pair<rock::GpuAllocOp, DependencyType>> newDeps;
406410
for (auto [alloc, type] : deps) {
407411
if (getAddressSpace(alloc) != gpu::AddressSpace::Workgroup)
408412
continue;
409413
newDeps.insert({alloc, type});
410414
}
411415
if (!newDeps.empty())
412-
prunedGraph[sink][source] = newDeps;
416+
prunedGraph[source][sink] = newDeps;
413417
}
414418
}
415419
return prunedGraph;
@@ -441,18 +445,13 @@ void placeBarriers(IRRewriter &rewriter, Location loc, scf::ForOp forOp,
441445
ArrayRef<rock::StageOp> stages,
442446
SetVector<rock::GpuAllocOp> &allocs,
443447
SmallVector<rock::StageOp> &extendedStages,
444-
int64_t &initiationInterval) {
448+
int64_t &initiationInterval, int64_t numIterations) {
445449
DagType dag = createDependencyGraph(stages, allocs);
446450
dag = pruneGraph(dag);
447451

448-
auto maybeNumIterations =
449-
rock::computeConstDiff(forOp.getLowerBound(), forOp.getUpperBound());
450-
451452
// If there is a loop, we probably need a backward barrier, i.e.,
452453
// an LDS barrier that takes the loop dependency into account
453-
const bool addBackwardBarrier =
454-
(!maybeNumIterations.has_value() ||
455-
(maybeNumIterations.has_value() && maybeNumIterations.value() > 1));
454+
const bool addBackwardBarrier = numIterations > 1;
456455

457456
DenseMap<rock::StageOp, int> timeSlotMap;
458457
int timeSlot = 0;
@@ -461,17 +460,17 @@ void placeBarriers(IRRewriter &rewriter, Location loc, scf::ForOp forOp,
461460
timeSlot++;
462461
}
463462

464-
// Algorithm for barrier placment:
463+
// Algorithm for barrier placement:
465464
// a. Add forward barriers to address the dependency in the basic block
466465
// b. Add backward barriers to account for loop carried dependency
467466
// c. Add empty stages to make the pipeline balanced, so that we can double up
468-
// the initiation interval and let the pipeline transformation automaticall
469-
// do the work for us
467+
// the initiation interval and let the pipeline transformation
468+
// automatically do the work for us
470469
DenseSet<rock::StageOp> forwardStages;
471470

472471
// a. Place forward barriers
473-
for (auto [source, edges] : dag) {
474-
for (auto [sink, deps] : edges) {
472+
for (const auto &[source, edges] : dag) {
473+
for (const auto &[sink, deps] : edges) {
475474
if (!forwardStages.contains(sink)) {
476475
forwardStages.insert(sink);
477476
}
@@ -558,6 +557,28 @@ SmallVector<scf::ForOp> collectLoopLevels(mlir::func::FuncOp func) {
558557
return loops;
559558
}
560559

560+
void adjustInitiationInterval(int64_t numIterations, size_t numStages,
561+
int64_t &ii) {
562+
int64_t numParallelStages = llvm::divideCeil(numStages, ii);
563+
// calculate number of prologue executions
564+
int64_t numPrologues = numParallelStages - 1;
565+
// if number of iterations are less than number of prologues that are going
566+
// to be emitted, it will not result in correct output therefore increase II
567+
// until that condition becomes false. This can help achieve maximum loop
568+
// pipelining
569+
while (numIterations < numPrologues) {
570+
ii++;
571+
LLVM_DEBUG(DBGS() << "Adjusted II to " << ii << "\n");
572+
numParallelStages = llvm::divideCeil(numStages, ii);
573+
numPrologues = numParallelStages - 1;
574+
}
575+
LLVM_DEBUG(DBGS() << "Number of parallel stages: " << numParallelStages
576+
<< "\n");
577+
LLVM_DEBUG(DBGS() << "Number of Prologues: " << numPrologues << "\n");
578+
// num of prologues == number of epilogues
579+
LLVM_DEBUG(DBGS() << "Number of Epilogues: " << numPrologues << "\n");
580+
}
581+
561582
struct RockPipeline : public rock::impl::RockPipelinePassBase<RockPipeline> {
562583
using rock::impl::RockPipelinePassBase<RockPipeline>::RockPipelinePassBase;
563584
void runOnOperation() override;
@@ -577,19 +598,25 @@ void RockPipeline::runOnOperation() {
577598

578599
// Always (try to) multi-buffer by one and store the new
579600
// allocs in a set
601+
// Store multibuffers in "multiAllocs" and store all buffers
602+
// including private and global in "resources"
580603
llvm::SetVector<rock::GpuAllocOp> multiAllocs;
604+
llvm::SetVector<rock::GpuAllocOp> resources;
581605
for (auto alloc : singleAllocs) {
582606
SmallVector<rock::GpuAllocOp> newAllocs;
583-
if (succeeded(rock::multiBuffer(rewriter, alloc, newAllocs, 1, true)))
607+
if (succeeded(rock::multiBuffer(rewriter, alloc, newAllocs, 1, true))) {
584608
multiAllocs.insert(newAllocs.back());
609+
resources.insert(newAllocs.back());
610+
} else {
611+
resources.insert(alloc);
612+
}
585613
}
586614

587615
// Collect the global resources (i.e., the memory allocations)
588616
// Note: we can only have two kind of memory:
589617
// - Registers
590618
// - LDS
591619
DenseMap<rock::GpuAllocOp, int> multiBufferFactors;
592-
llvm::MapVector<scf::ForOp, ScheduleType> scheduleMap;
593620
for (auto res : multiAllocs)
594621
multiBufferFactors[res] = 1;
595622

@@ -632,19 +659,32 @@ void RockPipeline::runOnOperation() {
632659
});
633660

634661
if (stages.empty())
635-
WalkResult::advance();
662+
continue;
636663

637664
LLVM_DEBUG(DBGS() << "Number of stages: " << stages.size() << "\n");
638665
LLVM_DEBUG(DBGS() << "Initiation Interval: " << ii << "\n");
666+
size_t numStages = stages.size();
667+
auto maybeNumIterations =
668+
rock::computeConstDiff(forOp.getLowerBound(), forOp.getUpperBound());
669+
assert(isConstantIntValue(forOp.getStep(), 1) &&
670+
"Step size other one is not permitted in rock-pipeline");
671+
if (!maybeNumIterations.has_value()) {
672+
emitError(loc,
673+
"Number of iterations are unknown while doing rock-pipeline\n");
674+
return signalPassFailure();
675+
}
676+
adjustInitiationInterval(maybeNumIterations.value(), numStages, ii);
639677

640678
// Insert the barriers as new stages
641679
SmallVector<rock::StageOp> extendedStages;
642-
placeBarriers(rewriter, loc, forOp, stages, multiAllocs, extendedStages,
643-
ii);
680+
// use "multiAllocs" to place LDS barriers, no need to explicitly place
681+
// barriers for registers or globals
682+
placeBarriers(rewriter, loc, forOp, stages, multiAllocs, extendedStages, ii,
683+
maybeNumIterations.value());
644684

645685
ScheduleType schedule;
646-
createSchedule(extendedStages, multiAllocs, ii, schedule,
647-
multiBufferFactors);
686+
// use all "resources" to generate dependency graph and generate schedule
687+
createSchedule(extendedStages, resources, ii, schedule, multiBufferFactors);
648688

649689
RewritePatternSet patterns(&getContext());
650690
mlir::scf::PipeliningOption options;
@@ -669,8 +709,8 @@ void RockPipeline::runOnOperation() {
669709
{
670710
if (removeStages) {
671711
RewritePatternSet patterns(&getContext());
672-
patterns.add<RemoveStagesRewritePattern, PushBarrierDownRewritePattern>(
673-
&getContext());
712+
patterns.add<RemoveStagesRewritePattern, PushBarrierDownRewritePattern,
713+
RemoveBackToBackBarriersRewritePattern>(&getContext());
674714
(void)applyPatternsGreedily(getOperation(), std::move(patterns));
675715
}
676716
}

0 commit comments

Comments
 (0)