From af4d8f1575e66af384946e221ae3da7e6f3a5362 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 5 Mar 2025 14:37:21 +0800 Subject: [PATCH 1/3] [VPlan] Fix crash when unrolling in-loop reduction chains If an in-loop reduction is chained e.g. WIDEN-REDUCTION-PHI ir<%rdx> = phi ir<0>, ir<%add2> REDUCE ir<%add1> = ir<%rdx> + reduce.add (ir<%x>) REDUCE ir<%add2> = ir<%add1> + reduce.add (ir<%y>) When we try to unroll the second add reduction, we crash because we currently expect the first operand to be a VPReductionPHIRecipe, when in fact it's the previous chain. This relaxes the cast to a dyn_cast, so we end up unrolling to: WIDEN-REDUCTION-PHI ir<%rdx> = phi ir<0>, ir<%add2> WIDEN-REDUCTION-PHI ir<%rdx>.1 = phi ir<0>, ir<%add2>.1, ir<1> WIDEN-REDUCTION-PHI ir<%rdx>.2 = phi ir<0>, ir<%add2>.2, ir<2> WIDEN-REDUCTION-PHI ir<%rdx>.3 = phi ir<0>, ir<%add2>.3, ir<3> REDUCE ir<%add1> = ir<%rdx> + reduce.add (ir<%x>) REDUCE ir<%add1>.1 = ir<%rdx>.1 + reduce.add (ir<%x>.1) REDUCE ir<%add1>.2 = ir<%rdx>.2 + reduce.add (ir<%x>.2) REDUCE ir<%add1>.3 = ir<%rdx>.3 + reduce.add (ir<%x>.3) REDUCE ir<%add2> = ir<%add1> + reduce.add (ir<%y>) REDUCE ir<%add2>.1 = ir<%add1>.1 + reduce.add (ir<%y>.1) REDUCE ir<%add2>.2 = ir<%add1>.2 + reduce.add (ir<%y>.2) REDUCE ir<%add2>.3 = ir<%add1>.3 + reduce.add (ir<%y>.3) This fixes a crash when building 525.x264_r from SPEC CPU 2017 on AArch64 with -mllvm -prefer-inloop-reductions --- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 4 +- .../LoopVectorize/reduction-inloop-uf4.ll | 90 ++++++++++++++++++- 2 files changed, 88 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index 89e372d6b46cf..dd2e666e8d150 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -295,8 +295,8 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { continue; } if (auto *Red = dyn_cast(&R)) { - auto *Phi = cast(R.getOperand(0)); - if (Phi->isOrdered()) { + if (auto *Phi = dyn_cast(R.getOperand(0)); + Phi && Phi->isOrdered()) { auto &Parts = VPV2Parts[Phi]; if (Part == 1) { Parts.clear(); diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll index 81163c45abe95..3393ea81992da 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll @@ -66,6 +66,88 @@ entry: ret i32 %sum.0.lcssa } +define i64 @reduction_sum_double(ptr noalias %p, ptr noalias %q) { +; CHECK-LABEL: @reduction_sum_double( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP0]], i64 64 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP0]], i64 96 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i64>, ptr [[TMP3]], align 8 +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 32 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP1]], i64 64 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP1]], i64 96 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i64>, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x i64>, ptr [[TMP7]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD]]) +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD4]]) +; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP10]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD5]]) +; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], [[VEC_PHI2]] +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD6]]) +; CHECK-NEXT: [[TMP15:%.*]] = add i64 [[TMP14]], [[VEC_PHI3]] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD7]]) +; CHECK-NEXT: [[TMP17]] = add i64 [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD8]]) +; CHECK-NEXT: [[TMP19]] = add i64 [[TMP18]], [[TMP11]] +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD9]]) +; CHECK-NEXT: [[TMP21]] = add i64 [[TMP20]], [[TMP13]] +; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[WIDE_LOAD10]]) +; CHECK-NEXT: [[TMP23]] = add i64 [[TMP22]], [[TMP15]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[BIN_RDX:%.*]] = add i64 [[TMP19]], [[TMP17]] +; CHECK-NEXT: [[BIN_RDX11:%.*]] = add i64 [[TMP21]], [[BIN_RDX]] +; CHECK-NEXT: [[BIN_RDX12:%.*]] = add i64 [[TMP23]], [[BIN_RDX11]] +; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[ADD2_LCSSA:%.*]] = phi i64 [ poison, [[LOOP]] ], [ [[BIN_RDX12]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: ret i64 [[ADD2_LCSSA]] +; +entry: + br label %loop +loop: + %iv = phi i64 [0, %entry], [%iv.next, %loop] + %rdx = phi i64 [0, %entry], [%add2, %loop] + + %p.gep = getelementptr i64, ptr %p, i64 %iv + %q.gep = getelementptr i64, ptr %q, i64 %iv + + %x = load i64, ptr %p.gep + %y = load i64, ptr %q.gep + + %add1 = add i64 %rdx, %x + %add2 = add i64 %add1, %y + + %iv.next = add i64 %iv, 1 + %done = icmp eq i64 %iv.next, 256 + br i1 %done, label %exit, label %loop +exit: + ret i64 %add2 +} + define i32 @predicated(ptr noalias nocapture %A) { ; CHECK-LABEL: @predicated( ; CHECK-NEXT: entry: @@ -260,7 +342,7 @@ define i32 @predicated(ptr noalias nocapture %A) { ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-NEXT: [[TMP111:%.*]] = icmp eq i64 [[INDEX_NEXT]], 272 -; CHECK-NEXT: br i1 [[TMP111]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP111]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP104]], [[TMP101]] ; CHECK-NEXT: [[BIN_RDX37:%.*]] = add i32 [[TMP107]], [[BIN_RDX]] @@ -269,7 +351,7 @@ define i32 @predicated(ptr noalias nocapture %A) { ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[DOTLR_PH:%.*]] ; CHECK: .lr.ph: -; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 poison, label [[DOT_CRIT_EDGE]], label [[DOTLR_PH]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: ._crit_edge: ; CHECK-NEXT: [[SUM_0_LCSSA:%.*]] = phi i32 [ poison, [[DOTLR_PH]] ], [ [[BIN_RDX38]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[SUM_0_LCSSA]] @@ -499,7 +581,7 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) { ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-NEXT: [[TMP119:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP119]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP119]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[BIN_RDX:%.*]] = mul i32 [[TMP112]], [[TMP109]] ; CHECK-NEXT: [[BIN_RDX39:%.*]] = mul i32 [[TMP115]], [[BIN_RDX]] @@ -512,7 +594,7 @@ define i32 @cond_rdx_pred(i32 %cond, ptr noalias %a, i64 %N) { ; CHECK: if.then: ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: -; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: [[RES_LCSSA:%.*]] = phi i32 [ poison, [[FOR_INC]] ], [ [[BIN_RDX40]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret i32 [[RES_LCSSA]] From 78b358ed780a538621649c8cef14251a72d599bb Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 5 Mar 2025 17:50:17 +0800 Subject: [PATCH 2/3] Declare Phi out of the if statement --- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index dd2e666e8d150..3fb6a5be79ff0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -295,8 +295,8 @@ void UnrollState::unrollRecipeByUF(VPRecipeBase &R) { continue; } if (auto *Red = dyn_cast(&R)) { - if (auto *Phi = dyn_cast(R.getOperand(0)); - Phi && Phi->isOrdered()) { + auto *Phi = dyn_cast(R.getOperand(0)); + if (Phi && Phi->isOrdered()) { auto &Parts = VPV2Parts[Phi]; if (Part == 1) { Parts.clear(); From 0007657bbc045fffe9b142089cb1fd9e5648eaec Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 5 Mar 2025 18:38:42 +0800 Subject: [PATCH 3/3] Add comment to test and adjust spacing --- llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll index 3393ea81992da..279e0be3c8312 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll @@ -66,8 +66,9 @@ entry: ret i32 %sum.0.lcssa } -define i64 @reduction_sum_double(ptr noalias %p, ptr noalias %q) { -; CHECK-LABEL: @reduction_sum_double( +; Check that we correctly unroll two reductions chained together. +define i64 @reduction_sum_chain(ptr noalias %p, ptr noalias %q) { +; CHECK-LABEL: @reduction_sum_chain( ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -128,6 +129,7 @@ define i64 @reduction_sum_double(ptr noalias %p, ptr noalias %q) { ; entry: br label %loop + loop: %iv = phi i64 [0, %entry], [%iv.next, %loop] %rdx = phi i64 [0, %entry], [%add2, %loop] @@ -144,6 +146,7 @@ loop: %iv.next = add i64 %iv, 1 %done = icmp eq i64 %iv.next, 256 br i1 %done, label %exit, label %loop + exit: ret i64 %add2 }