From 952f7cb07bb53a17af393ca693d0e06685313d8f Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Tue, 5 Nov 2024 13:18:43 -0800
Subject: [PATCH 1/2] [RISCV] Prefer strided load for interleave load with
 only one lane active

If only one of the elements is actually used, we can legally use a
strided load in place of the segment load. Doing so reduces vector
register pressure, so if both the segment load and the strided load are
expected to be performed element/segment at a time anyway, prefer the
strided load variant.
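For example, given a factor-3 interleave group where only lane 0 is
live (this is the load_factor3_one_active test updated below):

  %wide = load <12 x i32>, ptr %p
  %v0 = shufflevector <12 x i32> %wide, <12 x i32> poison,
                      <4 x i32> <i32 0, i32 3, i32 6, i32 9>

we now form a single strided load instead of a vlseg3. As a rough
sketch of the formed IR (assuming RV64, so the stride is an i64; the
original load's alignment is attached to the pointer argument):

  %v0 = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
            ptr %p, i64 12, <4 x i1> <i1 true, i1 true, i1 true, i1 true>,
            i32 4)

which selects to a vlse32.v with a 12 byte stride and occupies one
vector register group rather than three.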
Note that I've seen the vectorizer emit wide interleave loads to
represent a strided load, so this case does come up in practice. It
doesn't matter much at small LMUL*NF, but at large NF it can start
causing problems in register allocation.

Note that this patch only covers the fixed-vector formation cases. In
theory, we should do the same for scalable vectors, but we can
currently only represent NF2 in scalable IR, and NF2 is assumed to be
optimized to better than segment-at-a-time by default, so there's
currently nothing to do.
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 22 +++++++++++++++++
 llvm/lib/Target/RISCV/RISCVSubtarget.h        | 21 ++++++++++++++++
 .../Target/RISCV/RISCVTargetTransformInfo.cpp | 24 +------------------
 .../rvv/fixed-vectors-interleaved-access.ll   | 24 ++++++++++++-------
 4 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 43f08b453536c..046f1212717cf 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21585,6 +21585,8 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = {
 bool RISCVTargetLowering::lowerInterleavedLoad(
     LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
+  assert(Indices.size() == Shuffles.size());
+
   IRBuilder<> Builder(LI);
 
   auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
@@ -21595,6 +21597,26 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
 
   auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
 
+  // If the segment load is going to be performed segment at a time anyways
+  // and there's only one element used, use a strided load instead. This
+  // will be equally fast, and create less vector register pressure.
+  if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
+    unsigned ScalarSizeInBytes = VTy->getScalarSizeInBits() / 8;
+    Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
+    Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
+    Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
+    Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
+    Value *VL = Builder.getInt32(VTy->getNumElements());
+
+    CallInst *CI = Builder.CreateIntrinsic(
+        Intrinsic::experimental_vp_strided_load,
+        {VTy, BasePtr->getType(), Stride->getType()},
+        {BasePtr, Stride, Mask, VL});
+    CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
+    Shuffles[0]->replaceAllUsesWith(CI);
+    return true;
+  };
+
   Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
 
   CallInst *VlsegN = Builder.CreateIntrinsic(
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index bf9ed3f3d7165..f59a3737ae76f 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -238,6 +238,27 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
     return hasVInstructions() ? MaxInterleaveFactor : 1;
   }
 
+  bool hasOptimizedSegmentLoadStore(unsigned NF) const {
+    switch (NF) {
+    case 2:
+      return hasOptimizedNF2SegmentLoadStore();
+    case 3:
+      return hasOptimizedNF3SegmentLoadStore();
+    case 4:
+      return hasOptimizedNF4SegmentLoadStore();
+    case 5:
+      return hasOptimizedNF5SegmentLoadStore();
+    case 6:
+      return hasOptimizedNF6SegmentLoadStore();
+    case 7:
+      return hasOptimizedNF7SegmentLoadStore();
+    case 8:
+      return hasOptimizedNF8SegmentLoadStore();
+    default:
+      llvm_unreachable("Unexpected NF");
+    }
+  }
+
   // Returns VLEN divided by DLEN. Where DLEN is the datapath width of the
   // vector hardware implementation which may be less than VLEN.
   unsigned getDLenFactor() const {
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 046f63fe1617c..10b8a355e2fe3 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -716,28 +716,6 @@ RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
   return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
 }
 
-static bool hasOptimizedSegmentLoadStore(unsigned NF,
-                                         const RISCVSubtarget *ST) {
-  switch (NF) {
-  case 2:
-    return ST->hasOptimizedNF2SegmentLoadStore();
-  case 3:
-    return ST->hasOptimizedNF3SegmentLoadStore();
-  case 4:
-    return ST->hasOptimizedNF4SegmentLoadStore();
-  case 5:
-    return ST->hasOptimizedNF5SegmentLoadStore();
-  case 6:
-    return ST->hasOptimizedNF6SegmentLoadStore();
-  case 7:
-    return ST->hasOptimizedNF7SegmentLoadStore();
-  case 8:
-    return ST->hasOptimizedNF8SegmentLoadStore();
-  default:
-    llvm_unreachable("Unexpected NF");
-  }
-}
-
 InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
     unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
     Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
@@ -761,7 +739,7 @@ InstructionCost RISCVTTIImpl::getInterleavedMemoryOpCost(
 
       // Some processors optimize segment loads/stores as one wide memory op +
       // Factor * LMUL shuffle ops.
-      if (hasOptimizedSegmentLoadStore(Factor, ST)) {
+      if (ST->hasOptimizedSegmentLoadStore(Factor)) {
         InstructionCost Cost =
             getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
         MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
index a5419c7cd1c2d..25ef3050e266a 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll
@@ -1198,8 +1198,9 @@ define <4 x i32> @load_factor2_one_active(ptr %ptr) {
 define <4 x i32> @load_factor3_one_active(ptr %ptr) {
 ; CHECK-LABEL: load_factor3_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 12
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg3e32.v v8, (a0)
+; CHECK-NEXT:    vlse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <12 x i32>, ptr %ptr
   %v0 = shufflevector <12 x i32> %interleaved.vec, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
@@ -1209,8 +1210,9 @@ define <4 x i32> @load_factor3_one_active(ptr %ptr) {
 define <4 x i32> @load_factor4_one_active(ptr %ptr) {
 ; CHECK-LABEL: load_factor4_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 16
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg4e32.v v8, (a0)
+; CHECK-NEXT:    vlse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <16 x i32>, ptr %ptr
   %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -1220,8 +1222,9 @@ define <4 x i32> @load_factor4_one_active(ptr %ptr) {
 define <4 x i32> @load_factor5_one_active(ptr %ptr) {
 ; CHECK-LABEL: load_factor5_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 20
 ; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
-; CHECK-NEXT:    vlseg5e32.v v8, (a0)
+; CHECK-NEXT:    vlse32.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <20 x i32>, ptr %ptr
   %v0 = shufflevector <20 x i32> %interleaved.vec, <20 x i32> poison, <4 x i32> <i32 0, i32 5, i32 10, i32 15>
@@ -1231,30 +1234,35 @@ define <4 x i32> @load_factor5_one_active(ptr %ptr) {
 define <2 x i16> @load_factor6_one_active(ptr %ptr) {
 ; CHECK-LABEL: load_factor6_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a0, a0, 10
+; CHECK-NEXT:    li a1, 12
 ; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
-; CHECK-NEXT:    vlseg6e16.v v8, (a0)
+; CHECK-NEXT:    vlse16.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <12 x i16>, ptr %ptr
-  %v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 0, i32 6>
+  %v0 = shufflevector <12 x i16> %interleaved.vec, <12 x i16> poison, <2 x i32> <i32 5, i32 11>
   ret <2 x i16> %v0
 }
 
 define <4 x i8> @load_factor7_one_active(ptr %ptr) vscale_range(8,1024) {
 ; CHECK-LABEL: load_factor7_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    addi a0, a0, 1
+; CHECK-NEXT:    li a1, 7
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
-; CHECK-NEXT:    vlseg7e8.v v8, (a0)
+; CHECK-NEXT:    vlse8.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <32 x i8>, ptr %ptr
-  %v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 7, i32 14, i32 21>
+  %v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 1, i32 8, i32 15, i32 22>
   ret <4 x i8> %v0
 }
 
 define <4 x i8> @load_factor8_one_active(ptr %ptr) vscale_range(8,1024) {
 ; CHECK-LABEL: load_factor8_one_active:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 8
 ; CHECK-NEXT:    vsetivli zero, 4, e8, mf8, ta, ma
-; CHECK-NEXT:    vlseg8e8.v v8, (a0)
+; CHECK-NEXT:    vlse8.v v8, (a0), a1
 ; CHECK-NEXT:    ret
   %interleaved.vec = load <32 x i8>, ptr %ptr
   %v0 = shufflevector <32 x i8> %interleaved.vec, <32 x i8> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
From ebfa8d1f42f6a71ee81f4541211f4c4e03b4b64d Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Tue, 5 Nov 2024 14:19:21 -0800
Subject: [PATCH 2/2] clang-format

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 046f1212717cf..e35e9b1bb8119 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21608,11 +21608,12 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
     Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
     Value *VL = Builder.getInt32(VTy->getNumElements());
 
-    CallInst *CI = Builder.CreateIntrinsic(
-        Intrinsic::experimental_vp_strided_load,
-        {VTy, BasePtr->getType(), Stride->getType()},
-        {BasePtr, Stride, Mask, VL});
-    CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
+    CallInst *CI =
+        Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
+                                {VTy, BasePtr->getType(), Stride->getType()},
+                                {BasePtr, Stride, Mask, VL});
+    CI->addParamAttr(
+        0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
     Shuffles[0]->replaceAllUsesWith(CI);
     return true;
   };