Conversation

preames (Collaborator) commented Dec 6, 2024

If we're performing a segment store and all but one of the segments are undefined, that's equivalent to performing a strided store of the one active segment.

This is the store side of a905203. As before, this only covers fixed vectors.
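For concreteness, the input this recognizes is a spread shuffle feeding a wide store, as in the updated tests below. With factor 4 and i32 elements, the four defined lanes sit 16 bytes apart; since every other lane is undef, nothing is lost by not writing them, so a single strided store of the <4 x i32> source is equivalent:

  %v0 = shufflevector <4 x i32> %v, <4 x i32> poison,
          <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef,
                      i32 1, i32 undef, i32 undef, i32 undef,
                      i32 2, i32 undef, i32 undef, i32 undef,
                      i32 3, i32 undef, i32 undef, i32 undef>
  store <16 x i32> %v0, ptr %ptr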

llvmbot (Member) commented Dec 6, 2024

@llvm/pr-subscribers-backend-risc-v

Author: Philip Reames (preames)

Changes


Full diff: https://github.com/llvm/llvm-project/pull/119027.diff

2 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVISelLowering.cpp (+26-1)
  • (modified) llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll (+45-7)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 83308682835394..743d68523bff76 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21906,6 +21906,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                 ShuffleVectorInst *SVI,
                                                 unsigned Factor) const {
   IRBuilder<> Builder(SI);
+  auto Mask = SVI->getShuffleMask();
   auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
   // Given SVI : <n*factor x ty>, then VTy : <n x ty>
   auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
@@ -21917,11 +21918,35 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
 
   auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
 
+  unsigned Index;
+  // If the segment store only has one active lane (i.e. the interleave is
+  // just a spread shuffle), we can use a strided store instead.  This will
+  // be equally fast, and create less vector register pressure.
+  if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) &&
+      isSpreadMask(Mask, Factor, Index)) {
+    unsigned ScalarSizeInBytes = ShuffleVTy->getScalarSizeInBits() / 8;
+    Value *Data = SVI->getOperand(0);
+    auto *DataVTy = cast<FixedVectorType>(Data->getType());
+    Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
+    Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
+    Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
+    Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
+    Value *VL = Builder.getInt32(VTy->getNumElements());
+
+    CallInst *CI = Builder.CreateIntrinsic(
+        Intrinsic::experimental_vp_strided_store,
+        {Data->getType(), BasePtr->getType(), Stride->getType()},
+        {Data, BasePtr, Stride, Mask, VL});
+    CI->addParamAttr(
+        1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign()));
+
+    return true;
+  }
+
   Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
       SI->getModule(), FixedVssegIntrIds[Factor - 2],
       {VTy, SI->getPointerOperandType(), XLenTy});
 
-  auto Mask = SVI->getShuffleMask();
   SmallVector<Value *, 10> Ops;
 
   for (unsigned i = 0; i < Factor; i++) {
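
The isSpreadMask predicate called above is not part of this diff; it came in with the load-side change (a905203). As a rough illustrative reconstruction (not the exact upstream code): a spread mask is one where, for some fixed Index < Factor, lane k*Factor + Index reads source element k and every other lane is undef:

  // Illustrative reconstruction of the spread-mask check; the real helper
  // lives in RISCVISelLowering.cpp and may differ in detail.
  #include <optional>
  #include "llvm/ADT/ArrayRef.h"

  static bool isSpreadMaskSketch(llvm::ArrayRef<int> Mask, unsigned Factor,
                                 unsigned &Index) {
    if (Mask.size() % Factor != 0)
      return false;
    std::optional<unsigned> Offset;
    for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
      if (Mask[I] == -1)
        continue; // Undef lanes may hold anything.
      // Each defined lane must read source element I / Factor...
      if (Mask[I] != (int)(I / Factor))
        return false;
      // ...and all defined lanes must share one offset within their group.
      if (Offset && *Offset != I % Factor)
        return false;
      Offset = I % Factor;
    }
    if (!Offset)
      return false;
    Index = *Offset;
    return true;
  }

Under this reading, the mask in store_factor4_one_active_slidedown below starts at source element 1, so it fails the element check and still lowers to vsseg4e32.v, matching the TODO left on that test.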
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index 5649ee20a47092..8833634be1a0ed 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1285,17 +1285,55 @@ define void @load_factor4_one_active_storeback_full(ptr %ptr) {
   ret void
 }
 
-; TODO: This should be a strided store
-define void @store_factor4_one_active_storeback(ptr %ptr, <4 x i32> %v) {
-; CHECK-LABEL: store_factor4_one_active_storeback:
+define void @store_factor4_one_active(ptr %ptr, <4 x i32> %v) {
+; CHECK-LABEL: store_factor4_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vslideup.vi v9, v8, 1
-; CHECK-NEXT:    vmv.v.v v10, v9
-; CHECK-NEXT:    vmv.v.v v11, v9
-; CHECK-NEXT:    vsseg4e32.v v8, (a0)
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3,  i32 undef, i32 undef, i32 undef>
   store <16 x i32> %v0, ptr %ptr
   ret void
 }
+
+define void @store_factor4_one_active_idx1(ptr %ptr, <4 x i32> %v) {
+; CHECK-LABEL: store_factor4_one_active_idx1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a0, a0, 4
+; CHECK-NEXT:    li a1, 16
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+  %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 undef, i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3,  i32 undef, i32 undef>
+  store <16 x i32> %v0, ptr %ptr
+  ret void
+}
+
+define void @store_factor4_one_active_fullwidth(ptr %ptr, <16 x i32> %v) {
+; CHECK-LABEL: store_factor4_one_active_fullwidth:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 16
+; CHECK-NEXT:    vsetivli zero, 4, e32, m4, ta, ma
+; CHECK-NEXT:    vsse32.v v8, (a0), a1
+; CHECK-NEXT:    ret
+  %v0 = shufflevector <16 x i32> %v, <16 x i32> poison, <16 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3,  i32 undef, i32 undef, i32 undef>
+  store <16 x i32> %v0, ptr %ptr
+  ret void
+}
+
+; TODO: This could be a vslidedown followed by a strided store
+define void @store_factor4_one_active_slidedown(ptr %ptr, <4 x i32> %v) {
+; CHECK-LABEL: store_factor4_one_active_slidedown:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vslidedown.vi v9, v8, 1
+; CHECK-NEXT:    vslideup.vi v10, v8, 1
+; CHECK-NEXT:    vmv.v.v v11, v10
+; CHECK-NEXT:    vmv.v.v v12, v10
+; CHECK-NEXT:    vsseg4e32.v v9, (a0)
+; CHECK-NEXT:    ret
+  %v0 = shufflevector <4 x i32> %v, <4 x i32> poison, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 2, i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 4,  i32 undef, i32 undef, i32 undef>
+  store <16 x i32> %v0, ptr %ptr
+  ret void
+}
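
Tying the new lowering to the checks above: with Factor = 4 and 4-byte i32 elements, the stride is 4 * 4 = 16 bytes (the li a1, 16 in each test), and for Index = 1 the base pointer is advanced by 1 * 4 = 4 bytes (the addi a0, a0, 4). For the first test, the IR the hook now emits looks roughly like this, assuming RV64 so the stride overload is i64 (the original store's alignment is also copied onto the pointer argument, elided here):

  call void @llvm.experimental.vp.strided.store.v4i32.p0.i64(
      <4 x i32> %v, ptr %ptr, i64 16,
      <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)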

github-actions bot commented Dec 6, 2024

✅ With the latest revision this PR passed the C/C++ code formatter.

topperc (Collaborator) left a comment

LGTM

preames merged commit 02ad623 into llvm:main on Dec 7, 2024
6 of 7 checks passed
preames deleted the pr-riscv-segment-store-as-strided branch on December 7, 2024 at 00:45