diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index e8fea6851dae5..53c0805326d0c 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1400,14 +1400,46 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
-      if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))
+      if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe,
+               VPWidenStoreRecipe>(&R))
         continue;
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
       if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
         continue;
 
-      auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
-      if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
+      // Convert unmasked scatters with a uniform address into an
+      // extract-last-element + scalar store.
+      // TODO: Add a profitability check comparing the cost of a scatter vs.
+      // extract + scalar store.
+      auto *WidenStoreR = dyn_cast<VPWidenStoreRecipe>(&R);
+      if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
+          !WidenStoreR->isConsecutive()) {
+        assert(!WidenStoreR->isReverse() &&
+               "Non-consecutive memory recipes shouldn't be reversed");
+        VPValue *Mask = WidenStoreR->getMask();
+
+        // Only convert the scatter to a scalar store if it is unmasked.
+        // TODO: Support converting a scatter masked by the header mask to a
+        // scalar store.
+        if (Mask)
+          continue;
+
+        auto *Extract = new VPInstruction(VPInstruction::ExtractLastElement,
+                                          {WidenStoreR->getOperand(1)});
+        Extract->insertBefore(WidenStoreR);
+
+        // TODO: Sink the scalar store recipe to the middle block if possible.
+        auto *ScalarStore = new VPReplicateRecipe(
+            &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
+            true /*IsSingleScalar*/, nullptr /*Mask*/,
+            *WidenStoreR /*Metadata*/);
+        ScalarStore->insertBefore(WidenStoreR);
+        WidenStoreR->eraseFromParent();
+        continue;
+      }
+
+      auto *RepOrWidenR = dyn_cast<VPSingleDefRecipe>(&R);
+      if (RepR && RepOrWidenR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
           vputils::isSingleScalar(RepR->getOperand(1))) {
         auto *Clone = new VPReplicateRecipe(
             RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
@@ -1427,7 +1459,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
       // Skip recipes that aren't single scalars or don't have only their
       // scalar results used. In the latter case, we would introduce extra
       // broadcasts.
-      if (!vputils::isSingleScalar(RepOrWidenR) ||
+      if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR) ||
           !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
             if (auto *Store = dyn_cast<VPWidenStoreRecipe>(U)) {
               // VPWidenStore doesn't have users, and stores are always
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/narrow-scatter-to-scalar-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/narrow-scatter-to-scalar-store.ll
new file mode 100644
index 0000000000000..5c5674dbb3f20
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/narrow-scatter-to-scalar-store.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes=loop-vectorize -force-vector-width=2 -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
+define void @truncate_i16_to_i8_cse(ptr noalias %src, ptr noalias %dst) {
+; CHECK-LABEL: define void @truncate_i16_to_i8_cse(
+; CHECK-SAME: ptr noalias [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK:       [[VECTOR_PH]]:
+; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
+; CHECK:       [[VECTOR_BODY]]:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <2 x i16> [[BROADCAST_SPLAT]] to <2 x i8>
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
+; CHECK-NEXT:    store i8 [[TMP2]], ptr null, align 1
+; CHECK-NEXT:    store i8 [[TMP2]], ptr [[DST]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967296
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[MIDDLE_BLOCK]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[EXIT]]:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %count = phi i32 [ 0, %entry ], [ %count.next, %loop ]
+  %val = load i16, ptr %src, align 2
+  %val.zext = zext i16 %val to i64
+  %val.trunc.zext = trunc i64 %val.zext to i8
+  store i8 %val.trunc.zext, ptr null, align 1
+  %val.trunc = trunc i16 %val to i8
+  store i8 %val.trunc, ptr %dst, align 1
+  %count.next = add i32 %count, 1
+  %exitcond = icmp eq i32 %count.next, 0
+  %iv.next = add i64 %iv, 1
+  br i1 %exitcond, label %exit, label %loop
+
+exit:
+  ret void
+}
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
+;.
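
Side note (not part of the patch): the narrowing is legal because a scatter whose address is a single scalar makes every lane write the same location, so only the last lane's value can be observed afterwards. A minimal, hypothetical LLVM IR sketch of the equivalence the transform relies on; the function names and the use of the generic masked-scatter intrinsic are illustrative and not taken from the patch:

declare void @llvm.masked.scatter.v2i8.v2p0(<2 x i8>, <2 x ptr>, i32 immarg, <2 x i1>)

; Before narrowing: an unmasked scatter whose address vector is a splat of %p,
; so both lanes store to the same byte.
define void @scatter_uniform_addr(<2 x i8> %vals, ptr %p) {
  %addr.ins = insertelement <2 x ptr> poison, ptr %p, i64 0
  %addr = shufflevector <2 x ptr> %addr.ins, <2 x ptr> poison, <2 x i32> zeroinitializer
  call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> %vals, <2 x ptr> %addr, i32 1, <2 x i1> <i1 true, i1 true>)
  ret void
}

; After narrowing: extract the last element and perform one scalar store,
; which leaves the same final value at %p.
define void @uniform_addr_scalar_store(<2 x i8> %vals, ptr %p) {
  %last = extractelement <2 x i8> %vals, i32 1
  store i8 %last, ptr %p, align 1
  ret void
}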