diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 75cace77ec534..7ca43efb47c6e 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -198,6 +198,16 @@ static cl::opt<unsigned> MaxProfitableLoadStride(
     "slp-max-stride", cl::init(8), cl::Hidden,
     cl::desc("The maximum stride, considered to be profitable."));
 
+static cl::opt<bool>
+    DisableTreeReorder("slp-disable-tree-reorder", cl::init(false), cl::Hidden,
+                       cl::desc("Disable tree reordering even if it is "
+                                "profitable. Used for testing only."));
+
+static cl::opt<bool>
+    ForceStridedLoads("slp-force-strided-loads", cl::init(false), cl::Hidden,
+                      cl::desc("Generate strided loads even if they are not "
+                               "profitable. Used for testing only."));
+
 static cl::opt<bool>
     ViewSLPTree("view-slp-tree", cl::Hidden,
                 cl::desc("Display the SLP trees with Graphviz"));
@@ -7770,6 +7780,9 @@ static void combineOrders(MutableArrayRef<unsigned> Order,
 }
 
 bool BoUpSLP::isProfitableToReorder() const {
+  if (DisableTreeReorder)
+    return false;
+
   constexpr unsigned TinyVF = 2;
   constexpr unsigned TinyTree = 10;
   constexpr unsigned PhiOpsLimit = 12;
@@ -13027,7 +13040,7 @@ void BoUpSLP::transformNodes() {
         InstructionCost StridedCost = TTI->getStridedMemoryOpCost(
             Instruction::Load, VecTy, BaseLI->getPointerOperand(),
             /*VariableMask=*/false, CommonAlignment, CostKind, BaseLI);
-        if (StridedCost < OriginalVecCost)
+        if (StridedCost < OriginalVecCost || ForceStridedLoads)
           // Strided load is more profitable than consecutive load + reverse -
           // transform the node to strided load.
           E.State = TreeEntry::StridedVectorize;
diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-load.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-load.ll
new file mode 100644
index 0000000000000..77d3ac1fb2322
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-load.ll
@@ -0,0 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=riscv64 -mattr=+m,+v \
+; RUN:   -passes=slp-vectorizer \
+; RUN:   -slp-disable-tree-reorder=true -slp-force-strided-loads=true \
+; RUN:   -S < %s | FileCheck %s
+
+define void @const_stride_reversed(ptr %pl, ptr %ps) {
+; CHECK-LABEL: define void @const_stride_reversed(
+; CHECK-SAME: ptr [[PL:%.*]], ptr [[PS:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[GEP_L15:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 15
+; CHECK-NEXT:    [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.experimental.vp.strided.load.v16i8.p0.i64(ptr align 16 [[GEP_L15]], i64 -1, <16 x i1> splat (i1 true), i32 16)
+; CHECK-NEXT:    store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 16
+; CHECK-NEXT:    ret void
+;
+  %gep_l0 = getelementptr inbounds i8, ptr %pl, i64 0
+  %gep_l1 = getelementptr inbounds i8, ptr %pl, i64 1
+  %gep_l2 = getelementptr inbounds i8, ptr %pl, i64 2
+  %gep_l3 = getelementptr inbounds i8, ptr %pl, i64 3
+  %gep_l4 = getelementptr inbounds i8, ptr %pl, i64 4
+  %gep_l5 = getelementptr inbounds i8, ptr %pl, i64 5
+  %gep_l6 = getelementptr inbounds i8, ptr %pl, i64 6
+  %gep_l7 = getelementptr inbounds i8, ptr %pl, i64 7
+  %gep_l8 = getelementptr inbounds i8, ptr %pl, i64 8
+  %gep_l9 = getelementptr inbounds i8, ptr %pl, i64 9
+  %gep_l10 = getelementptr inbounds i8, ptr %pl, i64 10
+  %gep_l11 = getelementptr inbounds i8, ptr %pl, i64 11
+  %gep_l12 = getelementptr inbounds i8, ptr %pl, i64 12
+  %gep_l13 = getelementptr inbounds i8, ptr %pl, i64 13
+  %gep_l14 = getelementptr inbounds i8, ptr %pl, i64 14
+  %gep_l15 = getelementptr inbounds i8, ptr %pl, i64 15
+
+  %load0 = load i8, ptr %gep_l0 , align 16
+  %load1 = load i8, ptr %gep_l1 , align 16
+  %load2 = load i8, ptr %gep_l2 , align 16
+  %load3 = load i8, ptr %gep_l3 , align 16
+  %load4 = load i8, ptr %gep_l4 , align 16
+  %load5 = load i8, ptr %gep_l5 , align 16
+  %load6 = load i8, ptr %gep_l6 , align 16
+  %load7 = load i8, ptr %gep_l7 , align 16
+  %load8 = load i8, ptr %gep_l8 , align 16
+  %load9 = load i8, ptr %gep_l9 , align 16
+  %load10 = load i8, ptr %gep_l10, align 16
+  %load11 = load i8, ptr %gep_l11, align 16
+  %load12 = load i8, ptr %gep_l12, align 16
+  %load13 = load i8, ptr %gep_l13, align 16
+  %load14 = load i8, ptr %gep_l14, align 16
+  %load15 = load i8, ptr %gep_l15, align 16
+
+  %gep_s0 = getelementptr inbounds i8, ptr %ps, i64 0
+  %gep_s1 = getelementptr inbounds i8, ptr %ps, i64 1
+  %gep_s2 = getelementptr inbounds i8, ptr %ps, i64 2
+  %gep_s3 = getelementptr inbounds i8, ptr %ps, i64 3
+  %gep_s4 = getelementptr inbounds i8, ptr %ps, i64 4
+  %gep_s5 = getelementptr inbounds i8, ptr %ps, i64 5
+  %gep_s6 = getelementptr inbounds i8, ptr %ps, i64 6
+  %gep_s7 = getelementptr inbounds i8, ptr %ps, i64 7
+  %gep_s8 = getelementptr inbounds i8, ptr %ps, i64 8
+  %gep_s9 = getelementptr inbounds i8, ptr %ps, i64 9
+  %gep_s10 = getelementptr inbounds i8, ptr %ps, i64 10
+  %gep_s11 = getelementptr inbounds i8, ptr %ps, i64 11
+  %gep_s12 = getelementptr inbounds i8, ptr %ps, i64 12
+  %gep_s13 = getelementptr inbounds i8, ptr %ps, i64 13
+  %gep_s14 = getelementptr inbounds i8, ptr %ps, i64 14
+  %gep_s15 = getelementptr inbounds i8, ptr %ps, i64 15
+
+  store i8 %load0, ptr %gep_s15, align 16
+  store i8 %load1, ptr %gep_s14, align 16
+  store i8 %load2, ptr %gep_s13, align 16
+  store i8 %load3, ptr %gep_s12, align 16
+  store i8 %load4, ptr %gep_s11, align 16
+  store i8 %load5, ptr %gep_s10, align 16
+  store i8 %load6, ptr %gep_s9, align 16
+  store i8 %load7, ptr %gep_s8, align 16
+  store i8 %load8, ptr %gep_s7, align 16
+  store i8 %load9, ptr %gep_s6, align 16
+  store i8 %load10, ptr %gep_s5, align 16
+  store i8 %load11, ptr %gep_s4, align 16
+  store i8 %load12, ptr %gep_s3, align 16
+  store i8 %load13, ptr %gep_s2, align 16
+  store i8 %load14, ptr %gep_s1, align 16
+  store i8 %load15, ptr %gep_s0, align 16
+
+  ret void
+}