
Commit 8f59bef

[LV] Convert uniform-address scatters to scalar store when unmasked.
This patch optimizes vector scatters that have a uniform (single-scalar) address by replacing them with an "extract-last-element + scalar store" when the scatter is unmasked. In that case, at least one lane is guaranteed to execute in each vector iteration, so storing the last element is sufficient.

Implementation:
- Add optimizeScatterWithUniformAddr(VPlan &) and invoke it from VPlanTransforms::optimize().
- Identify non-consecutive VPWidenStoreRecipe/VPWidenStoreEVLRecipe with uniform addresses.
- Replace the scatter with a VPInstruction::ExtractLastElement of the stored value and a VPReplicate (scalar) store.

Notes:
- The legacy cost model can scalarize a store if both the address and the value are uniform. In VPlan we materialize the stored value via ExtractLastElement, so only the address must be uniform.
- Some of the loops won't be vectorized anymore since no vector instructions will be generated. I plan a follow-up patch to convert uniform-address scatters to scalar stores when the mask is the header mask. That requires `extract-last-active-element` to get the correct value to store.
1 parent e57c745 commit 8f59bef
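For context, a minimal C++ sketch (illustrative only; the function and parameter names are not from the patch) of the kind of loop this targets: the store address is loop-invariant, so widening the store would otherwise produce a scatter whose address vector is just a broadcast of the same pointer.

#include <cstddef>
#include <cstdint>

// Hypothetical source loop: `dst` does not vary with `i`, so every lane of a
// widened store writes the same location. With no mask, keeping only a scalar
// store of the last lane's value is equivalent.
void store_to_uniform_address(const int16_t *src, int8_t *dst, size_t n) {
  for (size_t i = 0; i < n; ++i)
    *dst = static_cast<int8_t>(src[i]); // uniform address, varying value
}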

2 files changed: +40 −8 lines

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 37 additions & 4 deletions
@@ -1400,14 +1400,47 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
   for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
            vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
     for (VPRecipeBase &R : make_early_inc_range(reverse(*VPBB))) {
-      if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe>(&R))
+      if (!isa<VPWidenRecipe, VPWidenSelectRecipe, VPReplicateRecipe,
+               VPWidenMemoryRecipe>(&R))
         continue;
       auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
       if (RepR && (RepR->isSingleScalar() || RepR->isPredicated()))
         continue;
 
-      auto *RepOrWidenR = cast<VPSingleDefRecipe>(&R);
-      if (RepR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
+      // Convert scatters with a uniform address that is unmasked into an
+      // extract-last-element + scalar store.
+      // TODO: Add a profitability check comparing the cost of a scatter vs.
+      // extract + scalar store.
+      auto *WidenStoreR = dyn_cast<VPWidenMemoryRecipe>(&R);
+      if (WidenStoreR && vputils::isSingleScalar(WidenStoreR->getAddr()) &&
+          !WidenStoreR->isConsecutive() &&
+          isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe>(WidenStoreR)) {
+        assert(!WidenStoreR->isReverse() &&
+               "Not consecutive memory recipes shouldn't be reversed");
+        VPValue *Mask = WidenStoreR->getMask();
+
+        // Only convert the scatter to a scalar store if it is unmasked.
+        // TODO: Support converting scatter masked by the header mask to scalar
+        // store.
+        if (Mask)
+          continue;
+
+        auto *Extract = new VPInstruction(VPInstruction::ExtractLastElement,
+                                          {WidenStoreR->getOperand(1)});
+        Extract->insertBefore(WidenStoreR);
+
+        // TODO: Sink the scalar store recipe to middle block if possible.
+        auto *ScalarStore = new VPReplicateRecipe(
+            &WidenStoreR->getIngredient(), {Extract, WidenStoreR->getAddr()},
+            true /*IsSingleScalar*/, nullptr /*Mask*/,
+            *WidenStoreR /*Metadata*/);
+        ScalarStore->insertBefore(WidenStoreR);
+        WidenStoreR->eraseFromParent();
+        continue;
+      }
+
+      auto *RepOrWidenR = dyn_cast<VPSingleDefRecipe>(&R);
+      if (RepR && RepOrWidenR && isa<StoreInst>(RepR->getUnderlyingInstr()) &&
           vputils::isSingleScalar(RepR->getOperand(1))) {
         auto *Clone = new VPReplicateRecipe(
             RepOrWidenR->getUnderlyingInstr(), RepOrWidenR->operands(),
@@ -1427,7 +1460,7 @@ static void narrowToSingleScalarRecipes(VPlan &Plan) {
       // Skip recipes that aren't single scalars or don't have only their
       // scalar results used. In the latter case, we would introduce extra
       // broadcasts.
-      if (!vputils::isSingleScalar(RepOrWidenR) ||
+      if (!RepOrWidenR || !vputils::isSingleScalar(RepOrWidenR) ||
          !all_of(RepOrWidenR->users(), [RepOrWidenR](const VPUser *U) {
            if (auto *Store = dyn_cast<VPWidenStoreRecipe>(U)) {
              // VPWidenStore doesn't have users, and stores are always
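To see why extracting the last element is enough, here is a small standalone C++ simulation (illustrative only, not LLVM code), assuming that overlapping scatter lanes commit in element order, which is what makes taking the last element correct for an unmasked scatter to a uniform address.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  std::array<int8_t, 4> lanes = {10, 20, 30, 40}; // widened stored value
  int8_t after_scatter = 0;
  int8_t after_scalar_store = 0;

  // Unmasked scatter to a uniform address: lanes write in element order,
  // so the highest-indexed lane's value is what remains in memory.
  for (int8_t v : lanes)
    after_scatter = v;

  // The replacement: extract the last element, then one scalar store.
  after_scalar_store = lanes.back();

  assert(after_scatter == after_scalar_store);
  return 0;
}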

llvm/test/Transforms/LoopVectorize/RISCV/narrow-scatter-to-scalar-store.ll

Lines changed: 3 additions & 4 deletions
@@ -6,17 +6,16 @@ define void @truncate_i16_to_i8_cse(ptr noalias %src, ptr noalias %dst) {
 ; CHECK-NEXT: [[ENTRY:.*:]]
 ; CHECK-NEXT: br label %[[VECTOR_PH:.*]]
 ; CHECK: [[VECTOR_PH]]:
-; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x ptr> poison, ptr [[DST]], i64 0
-; CHECK-NEXT: [[BROADCAST_SPLAT1:%.*]] = shufflevector <2 x ptr> [[BROADCAST_SPLATINSERT1]], <2 x ptr> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[SRC]], align 2
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i16> poison, i16 [[TMP0]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[BROADCAST_SPLATINSERT]], <2 x i16> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT: [[TMP1:%.*]] = trunc <2 x i16> [[BROADCAST_SPLAT]] to <2 x i8>
-; CHECK-NEXT: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> [[TMP1]], <2 x ptr> align 1 zeroinitializer, <2 x i1> splat (i1 true))
-; CHECK-NEXT: call void @llvm.masked.scatter.v2i8.v2p0(<2 x i8> [[TMP1]], <2 x ptr> align 1 [[BROADCAST_SPLAT1]], <2 x i1> splat (i1 true))
+; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i32 1
+; CHECK-NEXT: store i8 [[TMP2]], ptr null, align 1
+; CHECK-NEXT: store i8 [[TMP2]], ptr [[DST]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 4294967296
 ; CHECK-NEXT: br i1 [[TMP3]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
