From f56c4321413f3fd567b4044f1cc9521da65e6797 Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Fri, 1 Nov 2024 18:03:14 +0100 Subject: [PATCH 1/5] Give cond. loop threshold bonus to outer loop in loop nests --- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 68 ++++++++++++++- .../LoopUnroll/AMDGPU/unroll-dependent-sub.ll | 84 +++++++++++++++++++ 2 files changed, 151 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 5160851f8c442..79250ad1f8306 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -47,6 +47,13 @@ static cl::opt UnrollThresholdIf( cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden); +static cl::opt UnrollThresholdNestedStatic( + "amdgpu-unroll-threshold-nested-static", + cl::desc("Unroll threshold increment for AMDGPU for each nested loop whose " + "trip count will be made runtime-independent when fully-unrolling " + "the outer loop"), + cl::init(200), cl::Hidden); + static cl::opt UnrollRuntimeLocal( "amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), @@ -148,8 +155,67 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, } } } - unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal); + + if (llvm::PHINode *IV = L->getInductionVariable(SE)) { + // Look for subloops whose trip count would go from runtime-dependent to + // runtime-independent if we were to unroll the loop. Give a bonus to the + // current loop's unrolling threshold for each of these, as fully unrolling + // it would likely expose additional optimization opportunities. + for (const Loop *SubLoop : L->getSubLoops()) { + std::optional Bounds = SubLoop->getBounds(SE); + if (!Bounds) + continue; + Value *InitIV = &Bounds->getInitialIVValue(); + Value *FinalIV = &Bounds->getFinalIVValue(); + Value *StepVal = Bounds->getStepValue(); + if (!StepVal) + continue; + + // Determines whether SubIV's derivation depends exclusively on constants + // and/or IV; if it does, SubIVDependsOnIV is set to true if IV is + // involved in the derivation. + bool SubIVDependsOnIV = false; + std::function FromConstsOrLoopIV = + [&](const Value *SubIV, unsigned Depth) -> bool { + if (SubIV == IV) { + SubIVDependsOnIV = true; + return true; + } + if (isa(SubIV)) + return true; + if (Depth >= 10) + return false; + + const Instruction *I = dyn_cast(SubIV); + // No point in checking outside the loop since IV is necessarily inside + // it; also stop searching when encountering an instruction that will + // likely not allow SubIV's value to be statically computed. + if (!I || !L->contains(I) || !isa(I)) + return false; + + // SubIV depends on constants or IV if all of the instruction's + // operands involved in its derivation also depend on constants or IV. 
+ return llvm::all_of(I->operand_values(), [&](const Value *V) { + return FromConstsOrLoopIV(V, Depth + 1); + }); + }; + + if (FromConstsOrLoopIV(InitIV, 0) && FromConstsOrLoopIV(FinalIV, 0) && + FromConstsOrLoopIV(StepVal, 0) && SubIVDependsOnIV) { + UP.Threshold += UnrollThresholdNestedStatic; + LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold + << " for loop:\n" + << *L + << " due to subloop's trip count becoming " + "runtime-independent after unrolling:\n " + << *SubLoop); + if (UP.Threshold >= MaxBoost) + return; + } + } + } + for (const BasicBlock *BB : L->getBlocks()) { const DataLayout &DL = BB->getDataLayout(); unsigned LocalGEPsSeen = 0; diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll new file mode 100644 index 0000000000000..36101c50db98a --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll @@ -0,0 +1,84 @@ +; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug-only=AMDGPUtti < %s 2>&1 | FileCheck %s + +; For @dependent_sub_fullunroll, the threshold bonus should apply +; CHECK: due to subloop's trip count becoming runtime-independent after unrolling + +; For @dependent_sub_no_fullunroll, the threshold bonus should not apply +; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling + +; Check that the outer loop of a double-nested loop where the inner loop's trip +; count depends exclusively on constants and the outer IV is fully unrolled +; thanks to receiving a threshold bonus in AMDGPU's TTI. + +; CHECK-LABEL: @dependent_sub_fullunroll +; CHECK: inner.header_latch_exiting.7 +; CHECK: outer.latch_exiting.7 + +define void @dependent_sub_fullunroll(ptr noundef %mem) { +entry: + br label %outer.header + +outer.header: ; preds = %entry, %outer.latch_exiting + %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ] + br label %inner.header_latch_exiting + +inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting + %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ] + %inner.iv_next = add nuw nsw i32 %inner.iv, 1 + %outer.iv.ext = zext nneg i32 %outer.iv to i64 + %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 + %inner.iv.ext = zext nneg i32 %inner.iv to i64 + %idx = add nuw nsw i64 %idx_part, %inner.iv.ext + %addr = getelementptr inbounds i8, ptr %mem, i64 %idx + store i32 0, ptr %addr + %inner.cond = icmp ult i32 %inner.iv_next, 8 + br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1 + +outer.latch_exiting: ; preds = %inner.header_latch_exiting + %outer.iv_next = add nuw nsw i32 %outer.iv, 1 + %outer.cond = icmp ult i32 %outer.iv_next, 8 + br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1 + +end: ; preds = %outer.latch_exiting + ret void +} + +; Check that the outer loop of the same loop nest as dependent_sub_fullunroll +; is not fully unrolled when the inner loop's final IV value depends on a +; function argument instead of a combination of the outer IV and constants. 
+ +; CHECK-LABEL: @dependent_sub_no_fullunroll +; CHECK-NOT: outer.latch_exiting.7 +; CHECK-NOT: outer.latch_exiting.7 + +define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) { +entry: + br label %outer.header + +outer.header: ; preds = %entry, %outer.latch_exiting + %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ] + br label %inner.header_latch_exiting + +inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting + %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ] + %inner.iv_next = add nuw nsw i32 %inner.iv, 1 + %outer.iv.ext = zext nneg i32 %outer.iv to i64 + %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 + %inner.iv.ext = zext nneg i32 %inner.iv to i64 + %idx = add nuw nsw i64 %idx_part, %inner.iv.ext + %addr = getelementptr inbounds i8, ptr %mem, i64 %idx + store i32 0, ptr %addr + %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub + br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1 + +outer.latch_exiting: ; preds = %inner.header_latch_exiting + %outer.iv_next = add nuw nsw i32 %outer.iv, 1 + %outer.cond = icmp ult i32 %outer.iv_next, 8 + br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1 + +end: ; preds = %outer.latch_exiting + ret void +} + +!1 = !{!1, !2} +!2 = !{!"amdgpu.loop.unroll.threshold", i32 100} From 0a72dca4142ab896b39bd89f3c5fdab4e1ed6bd8 Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Mon, 4 Nov 2024 11:53:43 +0100 Subject: [PATCH 2/5] Address reviewers' comments --- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 2 +- .../LoopUnroll/AMDGPU/unroll-dependent-sub.ll | 183 ++++++++++++++++-- 2 files changed, 167 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 79250ad1f8306..8d6eb94af4a10 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -157,7 +157,7 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, } unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal); - if (llvm::PHINode *IV = L->getInductionVariable(SE)) { + if (PHINode *IV = L->getInductionVariable(SE)) { // Look for subloops whose trip count would go from runtime-dependent to // runtime-independent if we were to unroll the loop. 
Give a bonus to the // current loop's unrolling threshold for each of these, as fully unrolling diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll index 36101c50db98a..97de4cbf0936c 100644 --- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll +++ b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll @@ -1,4 +1,6 @@ -; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug-only=AMDGPUtti < %s 2>&1 | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; REQUIRES: asserts +; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug < %s 2>&1 | FileCheck %s ; For @dependent_sub_fullunroll, the threshold bonus should apply ; CHECK: due to subloop's trip count becoming runtime-independent after unrolling @@ -6,15 +8,63 @@ ; For @dependent_sub_no_fullunroll, the threshold bonus should not apply ; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling +; For @dont_unroll_illegal_convergent_op, the threshold bonus should apply even if there is no unrolling +; CHECK: due to subloop's trip count becoming runtime-independent after unrolling + ; Check that the outer loop of a double-nested loop where the inner loop's trip ; count depends exclusively on constants and the outer IV is fully unrolled ; thanks to receiving a threshold bonus in AMDGPU's TTI. -; CHECK-LABEL: @dependent_sub_fullunroll -; CHECK: inner.header_latch_exiting.7 -; CHECK: outer.latch_exiting.7 - define void @dependent_sub_fullunroll(ptr noundef %mem) { +; CHECK-LABEL: @dependent_sub_fullunroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] +; CHECK: outer.header: +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]] +; CHECK: inner.header_latch_exiting: +; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ] +; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1 +; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]] +; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4 +; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 8 +; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: outer.latch_exiting: +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]] +; CHECK: inner.header_latch_exiting.1: +; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ] +; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1 +; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64 +; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]] +; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]] +; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4 +; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], 8 +; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_LATCH_EXITING_1:%.*]], !llvm.loop [[LOOP0]] +; CHECK: outer.latch_exiting.1: +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_2:%.*]] +; CHECK: inner.header_latch_exiting.2: +; CHECK-NEXT: [[INNER_IV_2:%.*]] = phi i32 [ 2, [[OUTER_LATCH_EXITING_1]] ], [ [[INNER_IV_NEXT_2:%.*]], 
[[INNER_HEADER_LATCH_EXITING_2]] ] +; CHECK-NEXT: [[INNER_IV_NEXT_2]] = add nuw nsw i32 [[INNER_IV_2]], 1 +; CHECK-NEXT: [[INNER_IV_EXT_2:%.*]] = zext nneg i32 [[INNER_IV_2]] to i64 +; CHECK-NEXT: [[IDX_2:%.*]] = add nuw nsw i64 32, [[INNER_IV_EXT_2]] +; CHECK-NEXT: [[ADDR_2:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_2]] +; CHECK-NEXT: store i32 0, ptr [[ADDR_2]], align 4 +; CHECK-NEXT: [[INNER_COND_2:%.*]] = icmp ult i32 [[INNER_IV_NEXT_2]], 8 +; CHECK-NEXT: br i1 [[INNER_COND_2]], label [[INNER_HEADER_LATCH_EXITING_2]], label [[OUTER_LATCH_EXITING_2:%.*]], !llvm.loop [[LOOP0]] +; CHECK: outer.latch_exiting.2: +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_3:%.*]] +; CHECK: inner.header_latch_exiting.3: +; CHECK-NEXT: [[INNER_IV_3:%.*]] = phi i32 [ 3, [[OUTER_LATCH_EXITING_2]] ], [ [[INNER_IV_NEXT_3:%.*]], [[INNER_HEADER_LATCH_EXITING_3]] ] +; CHECK-NEXT: [[INNER_IV_NEXT_3]] = add nuw nsw i32 [[INNER_IV_3]], 1 +; CHECK-NEXT: [[INNER_IV_EXT_3:%.*]] = zext nneg i32 [[INNER_IV_3]] to i64 +; CHECK-NEXT: [[IDX_3:%.*]] = add nuw nsw i64 48, [[INNER_IV_EXT_3]] +; CHECK-NEXT: [[ADDR_3:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_3]] +; CHECK-NEXT: store i32 0, ptr [[ADDR_3]], align 4 +; CHECK-NEXT: [[INNER_COND_3:%.*]] = icmp ult i32 [[INNER_IV_NEXT_3]], 8 +; CHECK-NEXT: br i1 [[INNER_COND_3]], label [[INNER_HEADER_LATCH_EXITING_3]], label [[OUTER_LATCH_EXITING_3:%.*]], !llvm.loop [[LOOP0]] +; CHECK: outer.latch_exiting.3: +; CHECK-NEXT: ret void +; entry: br label %outer.header @@ -26,9 +76,9 @@ inner.header_latch_exiting: ; preds = %outer.h %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ] %inner.iv_next = add nuw nsw i32 %inner.iv, 1 %outer.iv.ext = zext nneg i32 %outer.iv to i64 - %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 + %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 %inner.iv.ext = zext nneg i32 %inner.iv to i64 - %idx = add nuw nsw i64 %idx_part, %inner.iv.ext + %idx = add nuw nsw i64 %idx_part, %inner.iv.ext %addr = getelementptr inbounds i8, ptr %mem, i64 %idx store i32 0, ptr %addr %inner.cond = icmp ult i32 %inner.iv_next, 8 @@ -36,9 +86,9 @@ inner.header_latch_exiting: ; preds = %outer.h outer.latch_exiting: ; preds = %inner.header_latch_exiting %outer.iv_next = add nuw nsw i32 %outer.iv, 1 - %outer.cond = icmp ult i32 %outer.iv_next, 8 + %outer.cond = icmp ult i32 %outer.iv_next, 4 br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1 - + end: ; preds = %outer.latch_exiting ret void } @@ -47,11 +97,45 @@ end: ; preds = %outer.l ; is not fully unrolled when the inner loop's final IV value depends on a ; function argument instead of a combination of the outer IV and constants. 
-; CHECK-LABEL: @dependent_sub_no_fullunroll -; CHECK-NOT: outer.latch_exiting.7 -; CHECK-NOT: outer.latch_exiting.7 - define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) { +; CHECK-LABEL: @dependent_sub_no_fullunroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] +; CHECK: outer.header: +; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT_1:%.*]], [[OUTER_LATCH_EXITING_1:%.*]] ] +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]] +; CHECK: inner.header_latch_exiting: +; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ] +; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1 +; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64 +; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16 +; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64 +; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]] +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]] +; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4 +; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]] +; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING:%.*]], !llvm.loop [[LOOP0]] +; CHECK: outer.latch_exiting: +; CHECK-NEXT: [[OUTER_IV_NEXT:%.*]] = add nuw nsw i32 [[OUTER_IV]], 1 +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]] +; CHECK: inner.header_latch_exiting.1: +; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ [[OUTER_IV_NEXT]], [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ] +; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1 +; CHECK-NEXT: [[OUTER_IV_EXT_1:%.*]] = zext nneg i32 [[OUTER_IV_NEXT]] to i64 +; CHECK-NEXT: [[IDX_PART_1:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT_1]], 16 +; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64 +; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 [[IDX_PART_1]], [[INNER_IV_EXT_1]] +; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]] +; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4 +; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], [[INNER_UB]] +; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_LATCH_EXITING_1]], !llvm.loop [[LOOP0]] +; CHECK: outer.latch_exiting.1: +; CHECK-NEXT: [[OUTER_IV_NEXT_1]] = add nuw nsw i32 [[OUTER_IV]], 2 +; CHECK-NEXT: [[OUTER_COND_1:%.*]] = icmp ult i32 [[OUTER_IV_NEXT_1]], 4 +; CHECK-NEXT: br i1 [[OUTER_COND_1]], label [[OUTER_HEADER]], label [[END:%.*]], !llvm.loop [[LOOP0]] +; CHECK: end: +; CHECK-NEXT: ret void +; entry: br label %outer.header @@ -63,9 +147,9 @@ inner.header_latch_exiting: ; preds = %outer.h %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ] %inner.iv_next = add nuw nsw i32 %inner.iv, 1 %outer.iv.ext = zext nneg i32 %outer.iv to i64 - %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 + %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 %inner.iv.ext = zext nneg i32 %inner.iv to i64 - %idx = add nuw nsw i64 %idx_part, %inner.iv.ext + %idx = add nuw nsw i64 %idx_part, %inner.iv.ext %addr = getelementptr inbounds i8, ptr %mem, i64 %idx store i32 0, ptr %addr %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub @@ -73,9 +157,74 @@ inner.header_latch_exiting: ; preds = %outer.h 
outer.latch_exiting: ; preds = %inner.header_latch_exiting %outer.iv_next = add nuw nsw i32 %outer.iv, 1 - %outer.cond = icmp ult i32 %outer.iv_next, 8 + %outer.cond = icmp ult i32 %outer.iv_next, 4 br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1 - + +end: ; preds = %outer.latch_exiting + ret void +} + +; Make sure that the threshold bonus does not override a correctness check and +; unrolling when a convergent operation that is illegal to unroll is present. +; The loop nest is the same as before except for the fact that the outer +; loop's upper bound is now 11 (instead of 4) and there is an uncontrolled +; convergent call in the outer loop's header. Were the call non-convergent, +; the outer loop would be partially unrolled by a factor of 2, with a breakout +; of 1. + +declare void @convergent_operation() convergent + +define void @dont_unroll_illegal_convergent_op(ptr noundef %mem) { +; CHECK-LABEL: @dont_unroll_illegal_convergent_op( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] +; CHECK: outer.header: +; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH_EXITING:%.*]] ] +; CHECK-NEXT: call void @convergent_operation() +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]] +; CHECK: inner.header_latch_exiting: +; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ] +; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1 +; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64 +; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16 +; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64 +; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]] +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]] +; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4 +; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 8 +; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING]], !llvm.loop [[LOOP0]] +; CHECK: outer.latch_exiting: +; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i32 [[OUTER_IV]], 1 +; CHECK-NEXT: [[OUTER_COND:%.*]] = icmp ult i32 [[OUTER_IV_NEXT]], 11 +; CHECK-NEXT: br i1 [[OUTER_COND]], label [[OUTER_HEADER]], label [[END:%.*]], !llvm.loop [[LOOP0]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + br label %outer.header + +outer.header: ; preds = %entry, %outer.latch_exiting + %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ] + call void @convergent_operation() + br label %inner.header_latch_exiting + +inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting + %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ] + %inner.iv_next = add nuw nsw i32 %inner.iv, 1 + %outer.iv.ext = zext nneg i32 %outer.iv to i64 + %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 + %inner.iv.ext = zext nneg i32 %inner.iv to i64 + %idx = add nuw nsw i64 %idx_part, %inner.iv.ext + %addr = getelementptr inbounds i8, ptr %mem, i64 %idx + store i32 0, ptr %addr + %inner.cond = icmp ult i32 %inner.iv_next, 8 + br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1 + +outer.latch_exiting: ; preds = %inner.header_latch_exiting + %outer.iv_next = add nuw nsw i32 %outer.iv, 1 + %outer.cond = icmp ult i32 %outer.iv_next, 11 + br i1 %outer.cond, label 
%outer.header, label %end, !llvm.loop !1 + end: ; preds = %outer.latch_exiting ret void } From 30a482719ea3156263e66f1aefe62afa8489abd7 Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Thu, 19 Dec 2024 15:18:11 +0100 Subject: [PATCH 3/5] Moved logic to target-independent analysis and improved it --- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 68 +--- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 303 +++++++++++++-- .../LoopUnroll/AMDGPU/unroll-dependent-sub.ll | 233 ------------ ...mplete_unroll_profitability_with_assume.ll | 46 ++- .../LoopUnroll/full-unroll-cost-savings.ll | 354 ++++++++++++++++++ 5 files changed, 665 insertions(+), 339 deletions(-) delete mode 100644 llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll create mode 100644 llvm/test/Transforms/LoopUnroll/full-unroll-cost-savings.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 8d6eb94af4a10..5160851f8c442 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -47,13 +47,6 @@ static cl::opt UnrollThresholdIf( cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden); -static cl::opt UnrollThresholdNestedStatic( - "amdgpu-unroll-threshold-nested-static", - cl::desc("Unroll threshold increment for AMDGPU for each nested loop whose " - "trip count will be made runtime-independent when fully-unrolling " - "the outer loop"), - cl::init(200), cl::Hidden); - static cl::opt UnrollRuntimeLocal( "amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), @@ -155,67 +148,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, } } } - unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal); - - if (PHINode *IV = L->getInductionVariable(SE)) { - // Look for subloops whose trip count would go from runtime-dependent to - // runtime-independent if we were to unroll the loop. Give a bonus to the - // current loop's unrolling threshold for each of these, as fully unrolling - // it would likely expose additional optimization opportunities. - for (const Loop *SubLoop : L->getSubLoops()) { - std::optional Bounds = SubLoop->getBounds(SE); - if (!Bounds) - continue; - Value *InitIV = &Bounds->getInitialIVValue(); - Value *FinalIV = &Bounds->getFinalIVValue(); - Value *StepVal = Bounds->getStepValue(); - if (!StepVal) - continue; - - // Determines whether SubIV's derivation depends exclusively on constants - // and/or IV; if it does, SubIVDependsOnIV is set to true if IV is - // involved in the derivation. - bool SubIVDependsOnIV = false; - std::function FromConstsOrLoopIV = - [&](const Value *SubIV, unsigned Depth) -> bool { - if (SubIV == IV) { - SubIVDependsOnIV = true; - return true; - } - if (isa(SubIV)) - return true; - if (Depth >= 10) - return false; - - const Instruction *I = dyn_cast(SubIV); - // No point in checking outside the loop since IV is necessarily inside - // it; also stop searching when encountering an instruction that will - // likely not allow SubIV's value to be statically computed. - if (!I || !L->contains(I) || !isa(I)) - return false; - - // SubIV depends on constants or IV if all of the instruction's - // operands involved in its derivation also depend on constants or IV. 
- return llvm::all_of(I->operand_values(), [&](const Value *V) { - return FromConstsOrLoopIV(V, Depth + 1); - }); - }; - - if (FromConstsOrLoopIV(InitIV, 0) && FromConstsOrLoopIV(FinalIV, 0) && - FromConstsOrLoopIV(StepVal, 0) && SubIVDependsOnIV) { - UP.Threshold += UnrollThresholdNestedStatic; - LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold - << " for loop:\n" - << *L - << " due to subloop's trip count becoming " - "runtime-independent after unrolling:\n " - << *SubLoop); - if (UP.Threshold >= MaxBoost) - return; - } - } - } + unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal); for (const BasicBlock *BB : L->getBlocks()) { const DataLayout &DL = BB->getDataLayout(); unsigned LocalGEPsSeen = 0; diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index cbc35b6dd4292..a4bcc2d9e7efa 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -85,9 +85,9 @@ static cl::opt static cl::opt UnrollOptSizeThreshold( - "unroll-optsize-threshold", cl::init(0), cl::Hidden, - cl::desc("The cost threshold for loop unrolling when optimizing for " - "size")); + "unroll-optsize-threshold", cl::init(0), cl::Hidden, + cl::desc("The cost threshold for loop unrolling when optimizing for " + "size")); static cl::opt UnrollPartialThreshold( "unroll-partial-threshold", cl::Hidden, @@ -154,7 +154,7 @@ static cl::opt FlatLoopTripCountThreshold( static cl::opt UnrollUnrollRemainder( "unroll-remainder", cl::Hidden, - cl::desc("Allow the loop remainder to be unrolled.")); + cl::desc("Allow the loop remainder to be unrolled.")); // This option isn't ever intended to be enabled, it serves to allow // experiments to check the assumptions about when this kind of revisit is @@ -337,8 +337,239 @@ struct PragmaInfo { const bool PragmaEnableUnroll; }; +/// Helper type to estimate per-iteration cost savings coming from fully +/// unrolling a loop. +/// +/// The analysis maintains a set of "known instructions" inside the loop (i.e., +/// instructions whose result will be statically known after loop unrolling) +/// that we assume will be entirely removable if the loop is fully unrolled. +/// These instructions' cost can be deducted from the unrolled cost when +/// comparing against a threshold. +struct FullUnrollCostSavings { + FullUnrollCostSavings(const Loop *L) : L(L) {} + + /// Returns whether the instruction is known. + inline bool isKnown(const Instruction *I) const { + return KnownVals.contains(I); + } + + /// If the value is an instruction, returns whether that instruction is known, + /// false otherwise. + bool isKnown(const Value *V) const { + if (const Instruction *I = dyn_cast(V)) + return isKnown(I); + return false; + } + + /// Adds an instruction to the known set and re-evaluates unknown instructions + /// in the loop to determine whether their result can now be known. + void addToKnown(const Instruction *I) { + if (!KnownVals.insert(I).second) + return; + + // Every time we assume knowledge of an additional instruction result, we + // potentially need to revisit instructions that were previously seen as + // unoptimizable. + Evaluated.clear(); + + addUsersToExploreSet(I); + while (ToEvaluate.size()) { + const Instruction *I = ToEvaluate.back(); + ToEvaluate.pop_back(); + evalInstruction(I); + } + } + + /// Returns savings incurred by all known instructions, according to the \p + /// TTI. 
+  InstructionCost computeSavings(const TargetTransformInfo &TTI) const {
+    TargetTransformInfo::TargetCostKind CostKind =
+        L->getHeader()->getParent()->hasMinSize()
+            ? TargetTransformInfo::TCK_CodeSize
+            : TargetTransformInfo::TCK_SizeAndLatency;
+
+    InstructionCost CostSavings;
+    for (const Value *Val : KnownVals)
+      CostSavings += TTI.getInstructionCost(cast<Instruction>(Val), CostKind);
+    return CostSavings;
+  }
+
+private:
+  /// The set of instructions inside the loop whose results are considered
+  /// known.
+  SmallPtrSet<const Instruction *, 8> KnownVals;
+  /// Caches the set of instructions we have already evaluated when adding a
+  /// new instruction to the known set.
+  SmallPtrSet<const Instruction *, 8> Evaluated;
+  /// Stack of instructions to evaluate when adding a new instruction to the
+  /// known set.
+  SmallVector<const Instruction *> ToEvaluate;
+  /// The loop under consideration.
+  const Loop *L;
+
+  /// Adds all of the value's users to the stack of instructions to evaluate,
+  /// if they have not been evaluated already.
+  void addUsersToExploreSet(const Value *Val) {
+    for (const User *U : Val->users()) {
+      if (const Instruction *I = dyn_cast<Instruction>(U))
+        if (!Evaluated.contains(I))
+          ToEvaluate.push_back(I);
+    }
+  }
+
+  /// Evaluates an instruction to determine whether its result is "known", and
+  /// returns true if that is the case. This may recurse on operands that are
+  /// the result of yet unevaluated instructions inside the loop.
+  bool evalInstruction(const Instruction *I) {
+    Evaluated.insert(I);
+    if (isKnown(I))
+      return true;
+    if (!isa(I))
+      return false;
+    bool Known = llvm::all_of(I->operand_values(), [&](const Value *Val) {
+      if (isa<Constant>(Val) || isKnown(Val))
+        return true;
+      const Instruction *ValInstr = dyn_cast<Instruction>(Val);
+      if (!ValInstr || Evaluated.contains(ValInstr) || !L->contains(ValInstr))
+        return false;
+      return evalInstruction(ValInstr);
+    });
+    if (Known) {
+      KnownVals.insert(I);
+      addUsersToExploreSet(I);
+    }
+    return Known;
+  }
+};
+
 } // end anonymous namespace
 
+/// Runs a fast analysis on the loop to determine whether it is worth it to
+/// fully unroll it. As opposed to analyzeLoopUnrollCost, this does not attempt
+/// to simulate execution of every loop iteration but instead tries to identify
+/// the set of instructions that can be optimized away if the loop is fully
+/// unrolled. Returns the estimated instruction cost savings per loop iteration
+/// if the loop were to be fully unrolled according to the trip count in
+/// UP.Count.
+static InstructionCost analyzeFullUnrollCostSavings(
+    const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
+    const TargetTransformInfo::UnrollingPreferences &UP) {
+  // Cost savings analysis is entirely based on unrolling making some values
+  // statically known; if we cannot identify the loop's IV then there is
+  // nothing we can do.
+  PHINode *IV = L->getInductionVariable(SE);
+  if (!IV)
+    return {};
+  FullUnrollCostSavings Savings(L);
+
+  // If we were to unroll the loop, everything that is only dependent on the IV
+  // and constants will get simplified away.
+  Savings.addToKnown(IV);
+
+  // Look for subloops whose trip count would go from runtime-dependent to
+  // runtime-independent if we were to unroll the loop. These subloops are
+  // likely to be fully unrollable in the future and to yield further cost
+  // savings.
+  unsigned NumUnrollableSubloops = 0;
+  for (const Loop *SubLoop : L->getSubLoops()) {
+    // We must be able to determine the subloop's IV, initial/final IV value,
+    // and step.
+    PHINode *SubIV = SubLoop->getInductionVariable(SE);
+    if (!SubIV)
+      continue;
+    std::optional<Loop::LoopBounds> Bounds = SubLoop->getBounds(SE);
+    if (!Bounds)
+      continue;
+    Value *StepVal = Bounds->getStepValue();
+    if (!StepVal)
+      continue;
+
+    bool SubBoundsDependsOnIV = false;
+    auto IsValKnown = [&](const Value *Val) -> bool {
+      if (isa<Constant>(Val))
+        return true;
+      if (Savings.isKnown(Val)) {
+        SubBoundsDependsOnIV = true;
+        return true;
+      }
+      return false;
+    };
+
+    // Determine whether the derivation of the subloop's bounds depends
+    // exclusively on constants and the outer loop's IV.
+    if (IsValKnown(&Bounds->getInitialIVValue()) &&
+        IsValKnown(&Bounds->getFinalIVValue()) && IsValKnown(StepVal) &&
+        SubBoundsDependsOnIV) {
+      // Optimistically assume that we will be able to unroll the subloop in
+      // the future, which means that its IV will also be known on all inner
+      // loop iterations, leading to more instructions being optimized away.
+      // Properly estimating the cost savings per outer loop iteration would
+      // require us to estimate the average subloop trip count, but that is too
+      // complicated for this analysis. When determining cost savings, we will
+      // very conservatively assume that the inner loop will only execute once
+      // per outer loop iteration. This also reduces our cost savings
+      // estimation error in the case where the subloop does not end up being
+      // unrolled.
+      Savings.addToKnown(SubIV);
+      ++NumUnrollableSubloops;
+
+      LLVM_DEBUG(
+          dbgs() << "  Trip count of subloop %"
+                 << SubLoop->getHeader()->getName()
+                 << " will become runtime-independent by fully unrolling loop %"
+                 << L->getHeader()->getName() << "\n");
+    }
+  }
+
+  // Look for conditional branches whose condition would be statically
+  // determined at each iteration of the loop if it were unrolled. In some
+  // cases, this means we will be able to remove the branch entirely.
+  for (const BasicBlock *BB : L->getBlocks()) {
+    const Instruction *TermInstr = BB->getTerminator();
+    if (const BranchInst *Br = dyn_cast<BranchInst>(TermInstr)) {
+      if (Br->isConditional() && Savings.isKnown(Br->getCondition())) {
+        // The branch condition will be statically determined at each iteration
+        // of the loop.
+        BasicBlock *FalseSucc = Br->getSuccessor(0),
+                   *TrueSucc = Br->getSuccessor(1);
+
+        // Checks whether one of the branch's successors has at most two
+        // predecessors, which are either the branch's block or the other
+        // branch successor.
+        auto IsIfThen = [&](auto Predecessors, BasicBlock *OtherSucc) -> bool {
+          unsigned NumPreds = 0;
+          for (const BasicBlock *Pred : Predecessors) {
+            if (Pred != BB && Pred != OtherSucc)
+              return false;
+            if (++NumPreds > 2)
+              return false;
+          }
+          return true;
+        };
+
+        if ((TrueSucc->getSinglePredecessor() ||
+             IsIfThen(predecessors(TrueSucc), FalseSucc)) &&
+            (FalseSucc->getSinglePredecessor() ||
+             IsIfThen(predecessors(FalseSucc), TrueSucc))) {
+          // The CFG corresponds to a simple if/then(/else) construct whose
+          // condition we will know, so we will be able to remove the branch
+          // and one of the two blocks at each iteration of the outer loop.
+          // Only the branch represents a cost saving, since one successor
+          // block will still be executed.
+          Savings.addToKnown(Br);
+          LLVM_DEBUG(dbgs() << "  Conditional branch will be removed by fully "
+                               "unrolling loop %"
+                            << L->getHeader()->getName() << "\n");
+        }
+      }
+    }
+  }
+
+  // Compute cost savings from instructions that will likely be optimized away
+  // by unrolling the loop.
+  InstructionCost CostSavings = Savings.computeSavings(TTI);
+  // Finally, for each subloop that we think will become unrollable, account
+  // for the backedge's branch being removed.
+  CostSavings += NumUnrollableSubloops;
+  return CostSavings;
+}
+
 /// Figure out if the loop is worth full unrolling.
 ///
 /// Complete loop unrolling can make some loads constant, and we need to know
@@ -833,34 +1064,54 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
   return std::nullopt;
 }
 
-static std::optional<unsigned> shouldFullUnroll(
-    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
-    ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
-    const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
-    const TargetTransformInfo::UnrollingPreferences &UP) {
-  assert(FullUnrollTripCount && "should be non-zero!");
+static bool
+shouldFullUnroll(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
+                 ScalarEvolution &SE,
+                 const SmallPtrSetImpl<const Value *> &EphValues,
+                 const UnrollCostEstimator UCE,
+                 const TargetTransformInfo::UnrollingPreferences &UP) {
+  assert(UP.Count && "should be non-zero!");
 
-  if (FullUnrollTripCount > UP.FullUnrollMaxCount)
-    return std::nullopt;
+  if (UP.Count > UP.FullUnrollMaxCount)
+    return false;
 
   // When computing the unrolled size, note that BEInsns are not replicated
   // like the rest of the loop body.
   if (UCE.getUnrolledLoopSize(UP) < UP.Threshold)
-    return FullUnrollTripCount;
+    return true;
 
   // The loop isn't that small, but we still can fully unroll it if that
-  // helps to remove a significant number of instructions.
-  // To check that, run additional analysis on the loop.
+  // helps to remove a significant number of instructions. To check that, run
+  // additional analyses on the loop. First try a full iteration-by-iteration
+  // analysis on the loop. If that fails, run a simpler structural analysis
+  // that estimates per-iteration cost savings in the unrolled loop.
   if (std::optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
-          L, FullUnrollTripCount, DT, SE, EphValues, TTI,
+          L, UP.Count, DT, SE, EphValues, TTI,
           UP.Threshold * UP.MaxPercentThresholdBoost / 100,
           UP.MaxIterationsCountToAnalyze)) {
     unsigned Boost =
-      getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
+        getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
     if (Cost->UnrolledCost < UP.Threshold * Boost / 100)
-      return FullUnrollTripCount;
+      return true;
+  } else {
+    InstructionCost Savings = analyzeFullUnrollCostSavings(L, SE, TTI, UP);
+    if (!(Savings.isValid() && *Savings.getValue()))
+      return false;
+    // Savings for one loop iteration are those estimated by the analysis plus
+    // the loop backedge's branch.
+    uint64_t ItSavings = *Savings.getValue() + 1;
+    // Compute estimated cost of one loop iteration in the unrolled form.
+ uint64_t ItUnrollCost = UCE.getRolledLoopSize(); + if (ItSavings < ItUnrollCost) + ItUnrollCost -= ItSavings; + else + ItUnrollCost = 1; + uint64_t FullUnrollCost = ItUnrollCost * UP.Count + 1; + assert(FullUnrollCost && "loop has no cost"); + if (FullUnrollCost < UP.Threshold) + return true; } - return std::nullopt; + return false; } static std::optional @@ -873,7 +1124,7 @@ shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount, if (!UP.Partial) { LLVM_DEBUG(dbgs() << " will not try to unroll partially because " - << "-unroll-allow-partial not given\n"); + << "-unroll-allow-partial not given\n"); return 0; } unsigned count = UP.Count; @@ -883,7 +1134,7 @@ shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount, // Reduce unroll count to be modulo of TripCount for partial unrolling. if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold) count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) / - (LoopSize - UP.BEInsns); + (LoopSize - UP.BEInsns); if (count > UP.MaxCount) count = UP.MaxCount; while (count != 0 && TripCount % count != 0) @@ -980,9 +1231,7 @@ bool llvm::computeUnrollCount( UP.Count = 0; if (TripCount) { UP.Count = TripCount; - if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues, - TripCount, UCE, UP)) { - UP.Count = *UnrollFactor; + if (shouldFullUnroll(L, TTI, DT, SE, EphValues, UCE, UP)) { UseUpperBound = false; return ExplicitUnroll; } @@ -1003,9 +1252,7 @@ bool llvm::computeUnrollCount( if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) && MaxTripCount <= UP.MaxUpperBound) { UP.Count = MaxTripCount; - if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues, - MaxTripCount, UCE, UP)) { - UP.Count = *UnrollFactor; + if (shouldFullUnroll(L, TTI, DT, SE, EphValues, UCE, UP)) { UseUpperBound = true; return ExplicitUnroll; } @@ -1533,7 +1780,7 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, if (!Changed) return PreservedAnalyses::all(); - // The parent must not be damaged by unrolling! + // The parent must not be damaged by unrolling! #ifndef NDEBUG if (ParentL) ParentL->verifyLoop(); diff --git a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll b/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll deleted file mode 100644 index 97de4cbf0936c..0000000000000 --- a/llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-dependent-sub.ll +++ /dev/null @@ -1,233 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; REQUIRES: asserts -; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug < %s 2>&1 | FileCheck %s - -; For @dependent_sub_fullunroll, the threshold bonus should apply -; CHECK: due to subloop's trip count becoming runtime-independent after unrolling - -; For @dependent_sub_no_fullunroll, the threshold bonus should not apply -; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling - -; For @dont_unroll_illegal_convergent_op, the threshold bonus should apply even if there is no unrolling -; CHECK: due to subloop's trip count becoming runtime-independent after unrolling - -; Check that the outer loop of a double-nested loop where the inner loop's trip -; count depends exclusively on constants and the outer IV is fully unrolled -; thanks to receiving a threshold bonus in AMDGPU's TTI. 
- -define void @dependent_sub_fullunroll(ptr noundef %mem) { -; CHECK-LABEL: @dependent_sub_fullunroll( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] -; CHECK: outer.header: -; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]] -; CHECK: inner.header_latch_exiting: -; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ] -; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1 -; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64 -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]] -; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4 -; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 8 -; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING:%.*]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: outer.latch_exiting: -; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]] -; CHECK: inner.header_latch_exiting.1: -; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ] -; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1 -; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64 -; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]] -; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]] -; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4 -; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], 8 -; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_LATCH_EXITING_1:%.*]], !llvm.loop [[LOOP0]] -; CHECK: outer.latch_exiting.1: -; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_2:%.*]] -; CHECK: inner.header_latch_exiting.2: -; CHECK-NEXT: [[INNER_IV_2:%.*]] = phi i32 [ 2, [[OUTER_LATCH_EXITING_1]] ], [ [[INNER_IV_NEXT_2:%.*]], [[INNER_HEADER_LATCH_EXITING_2]] ] -; CHECK-NEXT: [[INNER_IV_NEXT_2]] = add nuw nsw i32 [[INNER_IV_2]], 1 -; CHECK-NEXT: [[INNER_IV_EXT_2:%.*]] = zext nneg i32 [[INNER_IV_2]] to i64 -; CHECK-NEXT: [[IDX_2:%.*]] = add nuw nsw i64 32, [[INNER_IV_EXT_2]] -; CHECK-NEXT: [[ADDR_2:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_2]] -; CHECK-NEXT: store i32 0, ptr [[ADDR_2]], align 4 -; CHECK-NEXT: [[INNER_COND_2:%.*]] = icmp ult i32 [[INNER_IV_NEXT_2]], 8 -; CHECK-NEXT: br i1 [[INNER_COND_2]], label [[INNER_HEADER_LATCH_EXITING_2]], label [[OUTER_LATCH_EXITING_2:%.*]], !llvm.loop [[LOOP0]] -; CHECK: outer.latch_exiting.2: -; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_3:%.*]] -; CHECK: inner.header_latch_exiting.3: -; CHECK-NEXT: [[INNER_IV_3:%.*]] = phi i32 [ 3, [[OUTER_LATCH_EXITING_2]] ], [ [[INNER_IV_NEXT_3:%.*]], [[INNER_HEADER_LATCH_EXITING_3]] ] -; CHECK-NEXT: [[INNER_IV_NEXT_3]] = add nuw nsw i32 [[INNER_IV_3]], 1 -; CHECK-NEXT: [[INNER_IV_EXT_3:%.*]] = zext nneg i32 [[INNER_IV_3]] to i64 -; CHECK-NEXT: [[IDX_3:%.*]] = add nuw nsw i64 48, [[INNER_IV_EXT_3]] -; CHECK-NEXT: [[ADDR_3:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_3]] -; CHECK-NEXT: store i32 0, ptr [[ADDR_3]], align 4 -; CHECK-NEXT: [[INNER_COND_3:%.*]] = icmp ult i32 [[INNER_IV_NEXT_3]], 8 -; CHECK-NEXT: br i1 [[INNER_COND_3]], label [[INNER_HEADER_LATCH_EXITING_3]], label [[OUTER_LATCH_EXITING_3:%.*]], !llvm.loop [[LOOP0]] -; CHECK: outer.latch_exiting.3: -; CHECK-NEXT: ret void -; -entry: - br label %outer.header - -outer.header: ; preds = %entry, 
%outer.latch_exiting - %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ] - br label %inner.header_latch_exiting - -inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting - %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ] - %inner.iv_next = add nuw nsw i32 %inner.iv, 1 - %outer.iv.ext = zext nneg i32 %outer.iv to i64 - %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 - %inner.iv.ext = zext nneg i32 %inner.iv to i64 - %idx = add nuw nsw i64 %idx_part, %inner.iv.ext - %addr = getelementptr inbounds i8, ptr %mem, i64 %idx - store i32 0, ptr %addr - %inner.cond = icmp ult i32 %inner.iv_next, 8 - br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1 - -outer.latch_exiting: ; preds = %inner.header_latch_exiting - %outer.iv_next = add nuw nsw i32 %outer.iv, 1 - %outer.cond = icmp ult i32 %outer.iv_next, 4 - br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1 - -end: ; preds = %outer.latch_exiting - ret void -} - -; Check that the outer loop of the same loop nest as dependent_sub_fullunroll -; is not fully unrolled when the inner loop's final IV value depends on a -; function argument instead of a combination of the outer IV and constants. - -define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) { -; CHECK-LABEL: @dependent_sub_no_fullunroll( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] -; CHECK: outer.header: -; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT_1:%.*]], [[OUTER_LATCH_EXITING_1:%.*]] ] -; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]] -; CHECK: inner.header_latch_exiting: -; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ] -; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1 -; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64 -; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16 -; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64 -; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]] -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]] -; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4 -; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]] -; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING:%.*]], !llvm.loop [[LOOP0]] -; CHECK: outer.latch_exiting: -; CHECK-NEXT: [[OUTER_IV_NEXT:%.*]] = add nuw nsw i32 [[OUTER_IV]], 1 -; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]] -; CHECK: inner.header_latch_exiting.1: -; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ [[OUTER_IV_NEXT]], [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ] -; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1 -; CHECK-NEXT: [[OUTER_IV_EXT_1:%.*]] = zext nneg i32 [[OUTER_IV_NEXT]] to i64 -; CHECK-NEXT: [[IDX_PART_1:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT_1]], 16 -; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64 -; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 [[IDX_PART_1]], [[INNER_IV_EXT_1]] -; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]] -; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4 -; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], 
[[INNER_UB]] -; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_LATCH_EXITING_1]], !llvm.loop [[LOOP0]] -; CHECK: outer.latch_exiting.1: -; CHECK-NEXT: [[OUTER_IV_NEXT_1]] = add nuw nsw i32 [[OUTER_IV]], 2 -; CHECK-NEXT: [[OUTER_COND_1:%.*]] = icmp ult i32 [[OUTER_IV_NEXT_1]], 4 -; CHECK-NEXT: br i1 [[OUTER_COND_1]], label [[OUTER_HEADER]], label [[END:%.*]], !llvm.loop [[LOOP0]] -; CHECK: end: -; CHECK-NEXT: ret void -; -entry: - br label %outer.header - -outer.header: ; preds = %entry, %outer.latch_exiting - %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ] - br label %inner.header_latch_exiting - -inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting - %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ] - %inner.iv_next = add nuw nsw i32 %inner.iv, 1 - %outer.iv.ext = zext nneg i32 %outer.iv to i64 - %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 - %inner.iv.ext = zext nneg i32 %inner.iv to i64 - %idx = add nuw nsw i64 %idx_part, %inner.iv.ext - %addr = getelementptr inbounds i8, ptr %mem, i64 %idx - store i32 0, ptr %addr - %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub - br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1 - -outer.latch_exiting: ; preds = %inner.header_latch_exiting - %outer.iv_next = add nuw nsw i32 %outer.iv, 1 - %outer.cond = icmp ult i32 %outer.iv_next, 4 - br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1 - -end: ; preds = %outer.latch_exiting - ret void -} - -; Make sure that the threshold bonus does not override a correctness check and -; unrolling when a convergent operation that is illegal to unroll is present. -; The loop nest is the same as before except for the fact that the outer -; loop's upper bound is now 11 (instead of 4) and there is an uncontrolled -; convergent call in the outer loop's header. Were the call non-convergent, -; the outer loop would be partially unrolled by a factor of 2, with a breakout -; of 1. 
- -declare void @convergent_operation() convergent - -define void @dont_unroll_illegal_convergent_op(ptr noundef %mem) { -; CHECK-LABEL: @dont_unroll_illegal_convergent_op( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] -; CHECK: outer.header: -; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH_EXITING:%.*]] ] -; CHECK-NEXT: call void @convergent_operation() -; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]] -; CHECK: inner.header_latch_exiting: -; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ] -; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1 -; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64 -; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16 -; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64 -; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]] -; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]] -; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4 -; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 8 -; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING]], !llvm.loop [[LOOP0]] -; CHECK: outer.latch_exiting: -; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i32 [[OUTER_IV]], 1 -; CHECK-NEXT: [[OUTER_COND:%.*]] = icmp ult i32 [[OUTER_IV_NEXT]], 11 -; CHECK-NEXT: br i1 [[OUTER_COND]], label [[OUTER_HEADER]], label [[END:%.*]], !llvm.loop [[LOOP0]] -; CHECK: end: -; CHECK-NEXT: ret void -; -entry: - br label %outer.header - -outer.header: ; preds = %entry, %outer.latch_exiting - %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ] - call void @convergent_operation() - br label %inner.header_latch_exiting - -inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting - %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ] - %inner.iv_next = add nuw nsw i32 %inner.iv, 1 - %outer.iv.ext = zext nneg i32 %outer.iv to i64 - %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 - %inner.iv.ext = zext nneg i32 %inner.iv to i64 - %idx = add nuw nsw i64 %idx_part, %inner.iv.ext - %addr = getelementptr inbounds i8, ptr %mem, i64 %idx - store i32 0, ptr %addr - %inner.cond = icmp ult i32 %inner.iv_next, 8 - br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1 - -outer.latch_exiting: ; preds = %inner.header_latch_exiting - %outer.iv_next = add nuw nsw i32 %outer.iv, 1 - %outer.cond = icmp ult i32 %outer.iv_next, 11 - br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1 - -end: ; preds = %outer.latch_exiting - ret void -} - -!1 = !{!1, !2} -!2 = !{!"amdgpu.loop.unroll.threshold", i32 100} diff --git a/llvm/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll b/llvm/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll index 556a4032b58e4..8f4f71abf37a9 100644 --- a/llvm/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll +++ b/llvm/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll @@ -22,55 +22,73 @@ define i32 @foo(ptr %a) { ; ANALYZE-FULL: for.body: ; ANALYZE-FULL-NEXT: br i1 true, label [[DO_STORE:%.*]], label [[FOR_NEXT:%.*]] ; ANALYZE-FULL: do_store: -; ANALYZE-FULL-NEXT: store i32 0, ptr [[A:%.*]], align 4 +; ANALYZE-FULL-NEXT: 
[[DATA:%.*]] = load i32, ptr [[A:%.*]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_MUL:%.*]] = mul i32 [[DATA]], 2 +; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL]], ptr [[A]], align 4 ; ANALYZE-FULL-NEXT: br label [[FOR_NEXT]] ; ANALYZE-FULL: for.next: ; ANALYZE-FULL-NEXT: br i1 true, label [[DO_STORE_1:%.*]], label [[FOR_NEXT_1:%.*]] ; ANALYZE-FULL: do_store.1: ; ANALYZE-FULL-NEXT: [[GEP_1:%.*]] = getelementptr i32, ptr [[A]], i32 1 -; ANALYZE-FULL-NEXT: store i32 1, ptr [[GEP_1]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_1:%.*]] = load i32, ptr [[GEP_1]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_MUL_1:%.*]] = mul i32 [[DATA_1]], 2 +; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_1]], ptr [[GEP_1]], align 4 ; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_1]] ; ANALYZE-FULL: for.next.1: ; ANALYZE-FULL-NEXT: br i1 true, label [[DO_STORE_2:%.*]], label [[FOR_NEXT_2:%.*]] ; ANALYZE-FULL: do_store.2: ; ANALYZE-FULL-NEXT: [[GEP_2:%.*]] = getelementptr i32, ptr [[A]], i32 2 -; ANALYZE-FULL-NEXT: store i32 2, ptr [[GEP_2]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_2:%.*]] = load i32, ptr [[GEP_2]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_MUL_2:%.*]] = mul i32 [[DATA_2]], 2 +; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_2]], ptr [[GEP_2]], align 4 ; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_2]] ; ANALYZE-FULL: for.next.2: ; ANALYZE-FULL-NEXT: br i1 true, label [[DO_STORE_3:%.*]], label [[FOR_NEXT_3:%.*]] ; ANALYZE-FULL: do_store.3: ; ANALYZE-FULL-NEXT: [[GEP_3:%.*]] = getelementptr i32, ptr [[A]], i32 3 -; ANALYZE-FULL-NEXT: store i32 3, ptr [[GEP_3]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_3:%.*]] = load i32, ptr [[GEP_3]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_MUL_3:%.*]] = mul i32 [[DATA_3]], 2 +; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_3]], ptr [[GEP_3]], align 4 ; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_3]] ; ANALYZE-FULL: for.next.3: ; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_4:%.*]], label [[FOR_NEXT_4:%.*]] ; ANALYZE-FULL: do_store.4: ; ANALYZE-FULL-NEXT: [[GEP_4:%.*]] = getelementptr i32, ptr [[A]], i32 4 -; ANALYZE-FULL-NEXT: store i32 4, ptr [[GEP_4]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_4:%.*]] = load i32, ptr [[GEP_4]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_MUL_4:%.*]] = mul i32 [[DATA_4]], 2 +; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_4]], ptr [[GEP_4]], align 4 ; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_4]] ; ANALYZE-FULL: for.next.4: ; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_5:%.*]], label [[FOR_NEXT_5:%.*]] ; ANALYZE-FULL: do_store.5: ; ANALYZE-FULL-NEXT: [[GEP_5:%.*]] = getelementptr i32, ptr [[A]], i32 5 -; ANALYZE-FULL-NEXT: store i32 5, ptr [[GEP_5]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_5:%.*]] = load i32, ptr [[GEP_5]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_MUL_5:%.*]] = mul i32 [[DATA_5]], 2 +; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_5]], ptr [[GEP_5]], align 4 ; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_5]] ; ANALYZE-FULL: for.next.5: ; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_6:%.*]], label [[FOR_NEXT_6:%.*]] ; ANALYZE-FULL: do_store.6: ; ANALYZE-FULL-NEXT: [[GEP_6:%.*]] = getelementptr i32, ptr [[A]], i32 6 -; ANALYZE-FULL-NEXT: store i32 6, ptr [[GEP_6]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_6:%.*]] = load i32, ptr [[GEP_6]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_MUL_6:%.*]] = mul i32 [[DATA_6]], 2 +; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_6]], ptr [[GEP_6]], align 4 ; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_6]] ; ANALYZE-FULL: for.next.6: ; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_7:%.*]], label [[FOR_NEXT_7:%.*]] ; ANALYZE-FULL: do_store.7: ; ANALYZE-FULL-NEXT: [[GEP_7:%.*]] = getelementptr 
i32, ptr [[A]], i32 7 -; ANALYZE-FULL-NEXT: store i32 7, ptr [[GEP_7]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_7:%.*]] = load i32, ptr [[GEP_7]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_MUL_7:%.*]] = mul i32 [[DATA_7]], 2 +; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_7]], ptr [[GEP_7]], align 4 ; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_7]] ; ANALYZE-FULL: for.next.7: ; ANALYZE-FULL-NEXT: br i1 false, label [[DO_STORE_8:%.*]], label [[FOR_NEXT_8:%.*]] ; ANALYZE-FULL: do_store.8: ; ANALYZE-FULL-NEXT: [[GEP_8:%.*]] = getelementptr i32, ptr [[A]], i32 8 -; ANALYZE-FULL-NEXT: store i32 8, ptr [[GEP_8]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_8:%.*]] = load i32, ptr [[GEP_8]], align 4 +; ANALYZE-FULL-NEXT: [[DATA_MUL_8:%.*]] = mul i32 [[DATA_8]], 2 +; ANALYZE-FULL-NEXT: store i32 [[DATA_MUL_8]], ptr [[GEP_8]], align 4 ; ANALYZE-FULL-NEXT: br label [[FOR_NEXT_8]] ; ANALYZE-FULL: for.next.8: ; ANALYZE-FULL-NEXT: ret i32 9 @@ -87,7 +105,10 @@ define i32 @foo(ptr %a) { ; DONT-ANALYZE-FULL-NEXT: br i1 [[CMP2]], label [[DO_STORE:%.*]], label [[FOR_NEXT]] ; DONT-ANALYZE-FULL: do_store: ; DONT-ANALYZE-FULL-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[A:%.*]], i32 [[INDVAR]] -; DONT-ANALYZE-FULL-NEXT: store i32 [[INDVAR]], ptr [[GEP]], align 4 +; DONT-ANALYZE-FULL-NEXT: [[DATA:%.*]] = load i32, ptr [[GEP]], align 4 +; DONT-ANALYZE-FULL-NEXT: [[DATA_MUL:%.*]] = mul i32 [[DATA]], 2 +; DONT-ANALYZE-FULL-NEXT: [[DATA_ADD:%.*]] = add i32 [[DATA_MUL]], 1 +; DONT-ANALYZE-FULL-NEXT: store i32 [[DATA_MUL]], ptr [[GEP]], align 4 ; DONT-ANALYZE-FULL-NEXT: br label [[FOR_NEXT]] ; DONT-ANALYZE-FULL: for.next: ; DONT-ANALYZE-FULL-NEXT: [[EXITCOND:%.*]] = icmp ne i32 [[INDVAR_NEXT]], 9 @@ -108,7 +129,10 @@ for.body: do_store: %gep = getelementptr i32, ptr %a, i32 %indvar - store i32 %indvar, ptr %gep + %data = load i32, ptr %gep + %data_mul = mul i32 %data, 2 + %data_add = add i32 %data_mul, 1 + store i32 %data_mul, ptr %gep br label %for.next for.next: diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-cost-savings.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-cost-savings.ll new file mode 100644 index 0000000000000..1658af6dd55b9 --- /dev/null +++ b/llvm/test/Transforms/LoopUnroll/full-unroll-cost-savings.ll @@ -0,0 +1,354 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=loop-unroll -unroll-threshold=25 < %s | FileCheck %s + +; All functions are simple variations of the same double nested loop with an +; if/then/else-like CFG structure in the outer loop. The unrolling threshold is +; set manually so that it is just slightly higher than the estimated unrolled +; cost of the outer loop in the baseline, even after unroll cost savings +; analysis. + +; Baseline. Inner loop's bounds and if/then/else's condition depend on function +; arguments. No unrolling happens. 
+ +define void @no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub, i32 noundef %ifcond) { +; CHECK-LABEL: @no_fullunroll( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] +; CHECK: outer.header: +; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH_EXITING:%.*]] ] +; CHECK-NEXT: [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64 +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]] +; CHECK: inner.header_latch_exiting: +; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ] +; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1 +; CHECK-NEXT: [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16 +; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64 +; CHECK-NEXT: [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]] +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]] +; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4 +; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]] +; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_IF:%.*]] +; CHECK: outer.if: +; CHECK-NEXT: [[IF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[OUTER_IV_EXT]] +; CHECK-NEXT: [[MOD2:%.*]] = and i32 [[IFCOND:%.*]], 1 +; CHECK-NEXT: [[IF_COND:%.*]] = icmp ult i32 [[MOD2]], 0 +; CHECK-NEXT: br i1 [[IF_COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store i32 1, ptr [[IF_ADDR]], align 4 +; CHECK-NEXT: br label [[OUTER_LATCH_EXITING]] +; CHECK: if.else: +; CHECK-NEXT: store i32 2, ptr [[IF_ADDR]], align 4 +; CHECK-NEXT: br label [[OUTER_LATCH_EXITING]] +; CHECK: outer.latch_exiting: +; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i32 [[OUTER_IV]], 1 +; CHECK-NEXT: [[OUTER_COND:%.*]] = icmp ult i32 [[OUTER_IV_NEXT]], 2 +; CHECK-NEXT: br i1 [[OUTER_COND]], label [[OUTER_HEADER]], label [[END:%.*]] +; CHECK: end: +; CHECK-NEXT: ret void +; +entry: + br label %outer.header + +outer.header: ; preds = %entry, %outer.latch_exiting + %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ] + %outer.iv.ext = zext nneg i32 %outer.iv to i64 + br label %inner.header_latch_exiting + +inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting + %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ] + %inner.iv_next = add nuw nsw i32 %inner.iv, 1 + %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 + %inner.iv.ext = zext nneg i32 %inner.iv to i64 + %idx = add nuw nsw i64 %idx_part, %inner.iv.ext + %addr = getelementptr inbounds i8, ptr %mem, i64 %idx + store i32 0, ptr %addr + %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub + br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.if + +outer.if: ; preds = %inner.header_latch_exiting + %if.addr = getelementptr inbounds i8, ptr %mem, i64 %outer.iv.ext + %mod2 = and i32 %ifcond, 1 + %if.cond = icmp ult i32 %mod2, 0 + br i1 %if.cond, label %if.then, label %if.else + +if.then: ; preds = %outer.if + store i32 1, ptr %if.addr + br label %outer.latch_exiting + +if.else: ; preds = %outer.if + store i32 2, ptr %if.addr + br label %outer.latch_exiting + +outer.latch_exiting: ; preds = %if.then, %if.else + %outer.iv_next = add nuw nsw i32 %outer.iv, 1 + %outer.cond = icmp ult i32 %outer.iv_next, 2 + br i1 %outer.cond, label %outer.header, label %end + +end: ; 
preds = %outer.latch_exiting + ret void +} + +; Inner loop's bounds depend on constants and outer IV, yielding extra cost +; savings. These are enough to fully unroll the outer loop. + +define void @save_subloop(ptr noundef %mem, i32 noundef %ifcond) { +; CHECK-LABEL: @save_subloop( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] +; CHECK: outer.header: +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]] +; CHECK: inner.header_latch_exiting: +; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ] +; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1 +; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]] +; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4 +; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 2 +; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING_1:%.*]] +; CHECK: outer.if: +; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store i32 1, ptr [[MEM]], align 4 +; CHECK-NEXT: br label [[OUTER_LATCH_EXITING:%.*]] +; CHECK: if.else: +; CHECK-NEXT: store i32 2, ptr [[MEM]], align 4 +; CHECK-NEXT: br label [[OUTER_LATCH_EXITING]] +; CHECK: outer.latch_exiting: +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]] +; CHECK: inner.header_latch_exiting.1: +; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[OUTER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ] +; CHECK-NEXT: [[OUTER_IV_NEXT_1]] = add nuw nsw i32 [[OUTER_IV]], 1 +; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[OUTER_IV]] to i64 +; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]] +; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]] +; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4 +; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[OUTER_IV_NEXT_1]], 2 +; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_IF_1:%.*]] +; CHECK: outer.if.1: +; CHECK-NEXT: [[IF_ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 1 +; CHECK-NEXT: br i1 false, label [[IF_THEN_1:%.*]], label [[IF_ELSE_1:%.*]] +; CHECK: if.else.1: +; CHECK-NEXT: store i32 2, ptr [[IF_ADDR_1]], align 4 +; CHECK-NEXT: br label [[OUTER_LATCH_EXITING_2:%.*]] +; CHECK: if.then.1: +; CHECK-NEXT: store i32 1, ptr [[IF_ADDR_1]], align 4 +; CHECK-NEXT: br label [[OUTER_LATCH_EXITING_2]] +; CHECK: outer.latch_exiting.1: +; CHECK-NEXT: ret void +; +entry: + br label %outer.header + +outer.header: ; preds = %entry, %outer.latch_exiting + %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ] + %outer.iv.ext = zext nneg i32 %outer.iv to i64 + br label %inner.header_latch_exiting + +inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting + %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ] + %inner.iv_next = add nuw nsw i32 %inner.iv, 1 + %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 + %inner.iv.ext = zext nneg i32 %inner.iv to i64 + %idx = add nuw nsw i64 %idx_part, %inner.iv.ext + %addr = getelementptr inbounds i8, ptr %mem, i64 %idx + store i32 0, ptr %addr + %inner.cond = icmp ult i32 %inner.iv_next, 2 + br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.if + +outer.if: ; preds = 
%inner.header_latch_exiting + %if.addr = getelementptr inbounds i8, ptr %mem, i64 %outer.iv.ext + %mod2 = and i32 %ifcond, 1 + %if.cond = icmp ult i32 %mod2, 0 + br i1 %if.cond, label %if.then, label %if.else + +if.then: ; preds = %outer.if + store i32 1, ptr %if.addr + br label %outer.latch_exiting + +if.else: ; preds = %outer.if + store i32 2, ptr %if.addr + br label %outer.latch_exiting + +outer.latch_exiting: ; preds = %if.then, %if.else + %outer.iv_next = add nuw nsw i32 %outer.iv, 1 + %outer.cond = icmp ult i32 %outer.iv_next, 2 + br i1 %outer.cond, label %outer.header, label %end + +end: ; preds = %outer.latch_exiting + ret void +} + +; If/then/else's condition depends on constants and outer IV, yielding extra +; cost savings. These are enough to fully unroll the outer loop. + +define void @save_ifthenelse(ptr noundef %mem, i32 noundef %inner.ub) { +; CHECK-LABEL: @save_ifthenelse( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] +; CHECK: outer.header: +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]] +; CHECK: inner.header_latch_exiting: +; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ] +; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1 +; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]] +; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4 +; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]] +; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING_1:%.*]] +; CHECK: outer.if: +; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store i32 1, ptr [[MEM]], align 4 +; CHECK-NEXT: br label [[OUTER_LATCH_EXITING:%.*]] +; CHECK: if.else: +; CHECK-NEXT: store i32 2, ptr [[MEM]], align 4 +; CHECK-NEXT: br label [[OUTER_LATCH_EXITING]] +; CHECK: outer.latch_exiting: +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]] +; CHECK: inner.header_latch_exiting.1: +; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[OUTER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ] +; CHECK-NEXT: [[OUTER_IV_NEXT_1]] = add nuw nsw i32 [[OUTER_IV]], 1 +; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[OUTER_IV]] to i64 +; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]] +; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]] +; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4 +; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[OUTER_IV_NEXT_1]], [[INNER_UB]] +; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_IF_1:%.*]] +; CHECK: outer.if.1: +; CHECK-NEXT: [[IF_ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 1 +; CHECK-NEXT: br i1 false, label [[IF_THEN_1:%.*]], label [[IF_ELSE_1:%.*]] +; CHECK: if.else.1: +; CHECK-NEXT: store i32 2, ptr [[IF_ADDR_1]], align 4 +; CHECK-NEXT: br label [[OUTER_LATCH_EXITING_2:%.*]] +; CHECK: if.then.1: +; CHECK-NEXT: store i32 1, ptr [[IF_ADDR_1]], align 4 +; CHECK-NEXT: br label [[OUTER_LATCH_EXITING_2]] +; CHECK: outer.latch_exiting.1: +; CHECK-NEXT: ret void +; +entry: + br label %outer.header + +outer.header: ; preds = %entry, %outer.latch_exiting + %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ] + %outer.iv.ext = zext nneg i32 %outer.iv to i64 + br label 
%inner.header_latch_exiting + +inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting + %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ] + %inner.iv_next = add nuw nsw i32 %inner.iv, 1 + %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 + %inner.iv.ext = zext nneg i32 %inner.iv to i64 + %idx = add nuw nsw i64 %idx_part, %inner.iv.ext + %addr = getelementptr inbounds i8, ptr %mem, i64 %idx + store i32 0, ptr %addr + %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub + br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.if + +outer.if: ; preds = %inner.header_latch_exiting + %if.addr = getelementptr inbounds i8, ptr %mem, i64 %outer.iv.ext + %mod2 = and i32 %outer.iv, 1 + %if.cond = icmp ult i32 %mod2, 0 + br i1 %if.cond, label %if.then, label %if.else + +if.then: ; preds = %outer.if + store i32 1, ptr %if.addr + br label %outer.latch_exiting + +if.else: ; preds = %outer.if + store i32 2, ptr %if.addr + br label %outer.latch_exiting + +outer.latch_exiting: ; preds = %if.then, %if.else + %outer.iv_next = add nuw nsw i32 %outer.iv, 1 + %outer.cond = icmp ult i32 %outer.iv_next, 2 + br i1 %outer.cond, label %outer.header, label %end + +end: ; preds = %outer.latch_exiting + ret void +} + + +; Tests whether an if/then-like CFG structure is also recognized as a cost +; saving opportunity. Same double nested loop as before, but the if's else +; branch is removed and two extra instructions are added to the then branch to +; maintain the same loop size. + +define void @save_ifthen(ptr noundef %mem, i32 noundef %inner.ub) { +; CHECK-LABEL: @save_ifthen( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[OUTER_HEADER:%.*]] +; CHECK: outer.header: +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING:%.*]] +; CHECK: inner.header_latch_exiting: +; CHECK-NEXT: [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ] +; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1 +; CHECK-NEXT: [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64 +; CHECK-NEXT: [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]] +; CHECK-NEXT: store i32 0, ptr [[ADDR]], align 4 +; CHECK-NEXT: [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]] +; CHECK-NEXT: br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_IF:%.*]] +; CHECK: outer.if: +; CHECK-NEXT: br i1 false, label [[IF_THEN:%.*]], label [[OUTER_LATCH_EXITING:%.*]] +; CHECK: if.then: +; CHECK-NEXT: store i32 0, ptr [[MEM]], align 4 +; CHECK-NEXT: br label [[OUTER_LATCH_EXITING]] +; CHECK: outer.latch_exiting: +; CHECK-NEXT: br label [[INNER_HEADER_LATCH_EXITING_1:%.*]] +; CHECK: inner.header_latch_exiting.1: +; CHECK-NEXT: [[INNER_IV_1:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ] +; CHECK-NEXT: [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1 +; CHECK-NEXT: [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64 +; CHECK-NEXT: [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]] +; CHECK-NEXT: [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]] +; CHECK-NEXT: store i32 0, ptr [[ADDR_1]], align 4 +; CHECK-NEXT: [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], [[INNER_UB]] +; CHECK-NEXT: br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_IF_1:%.*]] +; CHECK: outer.if.1: +; CHECK-NEXT: [[IF_ADDR_1:%.*]] = getelementptr inbounds i8, 
ptr [[MEM]], i64 1 +; CHECK-NEXT: br i1 false, label [[IF_THEN_1:%.*]], label [[OUTER_LATCH_EXITING_1:%.*]] +; CHECK: if.then.1: +; CHECK-NEXT: store i32 4, ptr [[IF_ADDR_1]], align 4 +; CHECK-NEXT: br label [[OUTER_LATCH_EXITING_1]] +; CHECK: outer.latch_exiting.1: +; CHECK-NEXT: ret void +; +entry: + br label %outer.header + +outer.header: ; preds = %entry, %outer.latch_exiting + %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ] + %outer.iv.ext = zext nneg i32 %outer.iv to i64 + br label %inner.header_latch_exiting + +inner.header_latch_exiting: ; preds = %outer.header, %inner.header_latch_exiting + %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ] + %inner.iv_next = add nuw nsw i32 %inner.iv, 1 + %idx_part = mul nuw nsw i64 %outer.iv.ext, 16 + %inner.iv.ext = zext nneg i32 %inner.iv to i64 + %idx = add nuw nsw i64 %idx_part, %inner.iv.ext + %addr = getelementptr inbounds i8, ptr %mem, i64 %idx + store i32 0, ptr %addr + %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub + br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.if + +outer.if: ; preds = %inner.header_latch_exiting + %if.addr = getelementptr inbounds i8, ptr %mem, i64 %outer.iv.ext + %mod2 = and i32 %outer.iv, 1 + %if.cond = icmp ult i32 %mod2, 0 + br i1 %if.cond, label %if.then, label %outer.latch_exiting + +if.then: ; preds = %outer.if + %mod2x2 = mul i32 %mod2, 2 + %mod2x2x2 = mul i32 %mod2x2, 2 + store i32 %mod2x2x2, ptr %if.addr + br label %outer.latch_exiting + +outer.latch_exiting: ; preds = %if.then, %outer.if + %outer.iv_next = add nuw nsw i32 %outer.iv, 1 + %outer.cond = icmp ult i32 %outer.iv_next, 2 + br i1 %outer.cond, label %outer.header, label %end + +end: ; preds = %outer.latch_exiting + ret void +} From 271cb46aa98d532253d319c7a6698553b4cc73eb Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Thu, 19 Dec 2024 16:08:22 +0100 Subject: [PATCH 4/5] Fix clang-format --- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index a4bcc2d9e7efa..df2deafe2346d 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -83,8 +83,7 @@ static cl::opt UnrollThreshold("unroll-threshold", cl::Hidden, cl::desc("The cost threshold for loop unrolling")); -static cl::opt - UnrollOptSizeThreshold( +static cl::opt UnrollOptSizeThreshold( "unroll-optsize-threshold", cl::init(0), cl::Hidden, cl::desc("The cost threshold for loop unrolling when optimizing for " "size")); @@ -152,8 +151,8 @@ static cl::opt FlatLoopTripCountThreshold( "threshold, the loop is considered as flat and will be less " "aggressively unrolled.")); -static cl::opt UnrollUnrollRemainder( - "unroll-remainder", cl::Hidden, +static cl::opt + UnrollUnrollRemainder("unroll-remainder", cl::Hidden, cl::desc("Allow the loop remainder to be unrolled.")); // This option isn't ever intended to be enabled, it serves to allow From 046a9a3c4e1d411e3b3d43a4cb5b2f163755a5d0 Mon Sep 17 00:00:00 2001 From: Lucas Ramirez Date: Thu, 2 Jan 2025 14:36:57 +0100 Subject: [PATCH 5/5] Address feedback, almost NFC The only functional change concerns the selection of potentially foldable instructions inside loops ("foldable instructions" replaces "known instructions" since the former seems like a more canonical term).
Instead of filtering in instruction types that I think are foldable, I now filter out instructions with side effects, PHIs, and terminators. SelectInst is special-cased because, unlike other instructions, it is foldable as soon as its single "select operand" is known. --- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp | 138 ++++++++++-------- 1 file changed, 79 insertions(+), 59 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp index edacdab8f7994..a8bb2dd0e621d 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -338,7 +338,7 @@ struct PragmaInfo { /// Helper type to estimate per-iteration cost savings coming from fully /// unrolling a loop. /// -/// The analysis maintains a set of "known instructions" inside the loop (i.e., +/// The analysis maintains a set of foldable instructions inside the loop (i.e., /// instructions whose result will be statically known after loop unrolling) /// that we assume will be entirely removable if the loop is fully unrolled. /// These instructions' cost can be deducted from the unrolled cost when @@ -346,39 +346,36 @@ struct PragmaInfo { struct FullUnrollCostSavings { FullUnrollCostSavings(const Loop *L) : L(L) {} - /// Returns whether the instruction is known. - inline bool isKnown(const Instruction *I) const { - return KnownVals.contains(I); + /// Returns whether the instruction is foldable. + inline bool isFoldable(const Instruction *I) const { + return Foldable.contains(I); } - /// If the value is an instruction, returns whether that instruction is known, - false otherwise. - bool isKnown(const Value *V) const { if (const Instruction *I = dyn_cast(V)) - return isKnown(I); + return isFoldable(I); return false; } - /// Adds an instruction to the known set and re-evaluates unknown instructions - in the loop to determine whether their result can now be known. - void addToKnown(const Instruction *I) { - if (!KnownVals.insert(I).second) + /// If the value is an instruction, returns whether that instruction is + /// foldable, false otherwise. + bool isFoldable(const Value *V) const { if (const Instruction *I = dyn_cast(V)) + return isFoldable(I); return false; } + /// Adds an instruction to the foldable set and re-evaluates instructions in + /// the loop to determine whether they are now foldable. + void addFoldable(const Instruction *I) { + if (!Foldable.insert(I).second) return; - // Every time we assume knowledge of an additional instruction result, we + // Every time we assume foldability of an additional instruction, we // potentially need to revisit instructions that were previously seen as - // unoptimizable. + // unfoldable. Evaluated.clear(); addUsersToExploreSet(I); - while (ToEvaluate.size()) { - const Instruction *I = ToEvaluate.back(); - ToEvaluate.pop_back(); - evalInstruction(I); - } + while (ToEvaluate.size()) + evalInstruction(ToEvaluate.pop_back_val()); } - /// Returns savings incurred by all known instructions, according to the \p + /// Returns savings incurred by all foldable instructions, according to the \p /// TTI.
InstructionCost computeSavings(const TargetTransformInfo &TTI) const { TargetTransformInfo::TargetCostKind CostKind = @@ -387,19 +384,19 @@ struct FullUnrollCostSavings { : TargetTransformInfo::TCK_SizeAndLatency; InstructionCost CostSavings; - for (const Value *Val : KnownVals) + for (const Value *Val : Foldable) CostSavings += TTI.getInstructionCost(cast(Val), CostKind); return CostSavings; } private: - /// The set of instruction inside the loop whose results are considered known. - SmallPtrSet KnownVals; + /// The set of instructions inside the loop which we consider foldable. + SmallPtrSet Foldable; /// Caches the set of instructions we have already evaluated when adding a new - /// instruction to the known set. + /// instruction to the foldable set. SmallPtrSet Evaluated; /// Stack of instructions to evaluate when adding a new instruction to the - /// known set. + /// foldable set. SmallVector ToEvaluate; /// The loop under consideration. const Loop *L; @@ -414,28 +411,46 @@ struct FullUnrollCostSavings { } } - /// Evaluates an instruction to determine whether its result is "known", and - /// returns if that is the case. This may recurse on operands that are the - /// resul of yet unevaluated instructions inside the loop. + /// Evaluates an instruction to determine whether it is foldable, and returns + /// if that is the case. This may recurse on operands that are the result of + /// yet unevaluated instructions inside the loop. bool evalInstruction(const Instruction *I) { Evaluated.insert(I); - if (isKnown(I)) + if (isFoldable(I)) return true; - if (!isa(I)) + if (I->mayHaveSideEffects() || I->isTerminator() || isa(I)) return false; - bool Known = llvm::all_of(I->operand_values(), [&](const Value *Val) { - if (isa(Val) || isKnown(Val)) - return true; - const Instruction *ValInstr = dyn_cast(Val); - if (!ValInstr || Evaluated.contains(ValInstr) || !L->contains(ValInstr)) - return false; - return evalInstruction(ValInstr); - }); - if (Known) { - KnownVals.insert(I); + bool IsFoldable; + if (isa(I)) { + // Special case a select instruction; if the select operand is constant + // the result equals one of the other operands so the instruction is + // foldable. + IsFoldable = valWillBeConstant(I->getOperand(0)); + } else { + IsFoldable = true; + // All instruction operands must end up as constants for the instruction + // to be foldable. + for (const Value *Val : I->operand_values()) { + if (!valWillBeConstant(Val)) { + IsFoldable = false; + break; + } + } + } + if (IsFoldable) { + Foldable.insert(I); addUsersToExploreSet(I); } - return Known; + return IsFoldable; + } + + bool valWillBeConstant(const Value *Val) { + if (isa(Val) || isFoldable(Val)) + return true; + const Instruction *ValInstr = dyn_cast(Val); + if (!ValInstr || Evaluated.contains(ValInstr) || !L->contains(ValInstr)) + return false; + return evalInstruction(ValInstr); } }; @@ -450,9 +465,9 @@ struct FullUnrollCostSavings { static InstructionCost analyzeFullUnrollCostSavings( const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI, const TargetTransformInfo::UnrollingPreferences &UP) { - // Cost savings analysis is all based on unrolling making some values - // statically known; if we cannot identify the loop's IV then there is nothing - // we can do. + // Cost savings analysis is all based on unrolling making some instructions + // foldable; if we cannot identify the loop's IV then there is nothing we can + // do.
PHINode *IV = L->getInductionVariable(SE); if (!IV) return {}; @@ -460,7 +475,7 @@ static InstructionCost analyzeFullUnrollCostSavings( // If we were to unroll the loop, everything that is only dependent on the IV // and constants will get simplified away. - Savings.addToKnown(IV); + Savings.addFoldable(IV); // Look for subloops whose trip count would go from runtime-dependent to // runtime-independent if we were to unroll the loop. These subloops are @@ -483,7 +498,7 @@ static InstructionCost analyzeFullUnrollCostSavings( auto IsValKnown = [&](const Value *Val) -> bool { if (isa(Val)) return true; - if (Savings.isKnown(Val)) { + if (Savings.isFoldable(Val)) { SubBoundsDependsOnIV = true; return true; } @@ -504,14 +519,16 @@ static InstructionCost analyzeFullUnrollCostSavings( // conservatively assume that the inner loop will only execute once per // outer loop iteration. This also reduces our cost savings estimation // mistake in the case where the subloop does not end up being unrolled. - Savings.addToKnown(SubIV); + Savings.addFoldable(SubIV); ++NumUnrollableSubloops; - LLVM_DEBUG( - dbgs() << " Trip count of subloop %" - << SubLoop->getHeader()->getName() - << " will become runtime-independent by fully unrolling loop %" - << L->getHeader()->getName() << "\n"); + LLVM_DEBUG({ + dbgs() << " Trip count of subloop "; + SubLoop->getHeader()->printAsOperand(dbgs(), false); + dbgs() << " will become runtime-independent by fully unrolling loop "; + L->getHeader()->printAsOperand(dbgs(), false); + dbgs() << '\n'; + }); } } @@ -521,7 +538,7 @@ static InstructionCost analyzeFullUnrollCostSavings( for (const BasicBlock *BB : L->getBlocks()) { const Instruction *TermInstr = BB->getTerminator(); if (const BranchInst *Br = dyn_cast(TermInstr)) { - if (Br->isConditional() && Savings.isKnown(Br->getCondition())) { + if (Br->isConditional() && Savings.isFoldable(Br->getCondition())) { // The branch condition will be statically determined at each iteration // of the loop. BasicBlock *FalseSucc = Br->getSuccessor(0), @@ -550,10 +567,13 @@ static InstructionCost analyzeFullUnrollCostSavings( // one of the two blocks at each iteration of the outer loop. Only the // branch represents a cost saving, since one successor block will // still be executed. - Savings.addToKnown(Br); - LLVM_DEBUG(dbgs() << " Conditional branch will be removed by fully " "unrolling loop %" - << L->getHeader()->getName() << "\n"); + Savings.addFoldable(Br); + LLVM_DEBUG({ + dbgs() << " Conditional branch will be removed by fully " "unrolling loop "; + L->getHeader()->printAsOperand(dbgs(), false); + dbgs() << '\n'; + }); } } } @@ -1093,7 +1113,7 @@ shouldFullUnroll(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, return true; } else { InstructionCost Savings = analyzeFullUnrollCostSavings(L, SE, TTI, UP); - if (!(Savings.isValid() && *Savings.getValue())) + if (!Savings.isValid() || !*Savings.getValue()) return false; // Savings for one loop iteration are those estimated by the analysis plus // the loop backedge's branch. @@ -1778,7 +1798,7 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM, if (!Changed) return PreservedAnalyses::all(); - // The parent must not be damaged by unrolling! + // The parent must not be damaged by unrolling! #ifndef NDEBUG if (ParentL) ParentL->verifyLoop();