From ee3a3d4b0986b81a90220cbd8fb34d12958ae6ee Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Thu, 10 Jul 2025 12:02:58 -0700
Subject: [PATCH 1/2] [RISCV] Split InterleavedAccess related TLI hooks into a
 separate file. NFC

---
 llvm/lib/Target/RISCV/CMakeLists.txt          |   1 +
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 587 +----------------
 llvm/lib/Target/RISCV/RISCVISelLowering.h     |   6 +
 .../Target/RISCV/RISCVInterleavedAccess.cpp   | 595 ++++++++++++++++++
 4 files changed, 610 insertions(+), 579 deletions(-)
 create mode 100644 llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp

diff --git a/llvm/lib/Target/RISCV/CMakeLists.txt b/llvm/lib/Target/RISCV/CMakeLists.txt
index e32d6eab3b977..47329b2c2f4d2 100644
--- a/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -45,6 +45,7 @@ add_llvm_target(RISCVCodeGen
   RISCVInsertVSETVLI.cpp
   RISCVInsertWriteVXRM.cpp
   RISCVInstrInfo.cpp
+  RISCVInterleavedAccess.cpp
   RISCVISelDAGToDAG.cpp
   RISCVISelLowering.cpp
   RISCVLandingPadSetup.cpp
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 7e83abe013063..cdc4a5d54caab 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -5098,12 +5098,13 @@ static SDValue lowerVECTOR_SHUFFLEAsVSlide1(const SDLoc &DL, MVT VT,
   return convertFromScalableVector(VT, Vec, DAG, Subtarget);
 }
 
-// Match a mask which "spreads" the leading elements of a vector evenly
-// across the result. Factor is the spread amount, and Index is the
-// offset applied. (on success, Index < Factor) This is the inverse
-// of a deinterleave with the same Factor and Index. This is analogous
-// to an interleave, except that all but one lane is undef.
-static bool isSpreadMask(ArrayRef<int> Mask, unsigned Factor, unsigned &Index) {
+/// Match a mask which "spreads" the leading elements of a vector evenly
+/// across the result. Factor is the spread amount, and Index is the
+/// offset applied. (on success, Index < Factor) This is the inverse
+/// of a deinterleave with the same Factor and Index. This is analogous
+/// to an interleave, except that all but one lane is undef.
+bool RISCVTargetLowering::isSpreadMask(ArrayRef<int> Mask, unsigned Factor,
+                                       unsigned &Index) {
   SmallVector<bool> LaneIsUndef(Factor, true);
   for (unsigned i = 0; i < Mask.size(); i++)
     LaneIsUndef[i % Factor] &= (Mask[i] == -1);
@@ -6082,7 +6083,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   assert(MaxFactor == 2 || MaxFactor == 4 || MaxFactor == 8);
   for (unsigned Factor = 4; Factor <= MaxFactor; Factor <<= 1) {
     unsigned Index;
-    if (isSpreadMask(Mask, Factor, Index)) {
+    if (RISCVTargetLowering::isSpreadMask(Mask, Factor, Index)) {
       MVT NarrowVT =
           MVT::getVectorVT(VT.getVectorElementType(), NumElts / Factor);
       SDValue Src = DAG.getExtractSubvector(DL, NarrowVT, V1, 0);
@@ -24080,39 +24081,6 @@ Value *RISCVTargetLowering::getIRStackGuard(IRBuilderBase &IRB) const {
   return TargetLowering::getIRStackGuard(IRB);
 }
 
-bool RISCVTargetLowering::isLegalInterleavedAccessType(
-    VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace,
-    const DataLayout &DL) const {
-  EVT VT = getValueType(DL, VTy);
-  // Don't lower vlseg/vsseg for vector types that can't be split.
-  if (!isTypeLegal(VT))
-    return false;
-
-  if (!isLegalElementTypeForRVV(VT.getScalarType()) ||
-      !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace,
-                                      Alignment))
-    return false;
-
-  MVT ContainerVT = VT.getSimpleVT();
-
-  if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
-    if (!Subtarget.useRVVForFixedLengthVectors())
-      return false;
-    // Sometimes the interleaved access pass picks up splats as interleaves of
-    // one element. Don't lower these.
-    if (FVTy->getNumElements() < 2)
-      return false;
-
-    ContainerVT = getContainerForFixedLengthVector(VT.getSimpleVT());
-  }
-
-  // Need to make sure that EMUL * NFIELDS ≤ 8
-  auto [LMUL, Fractional] = RISCVVType::decodeVLMUL(getLMUL(ContainerVT));
-  if (Fractional)
-    return true;
-  return Factor * LMUL <= 8;
-}
-
 bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
                                                   Align Alignment) const {
   if (!Subtarget.hasVInstructions())
@@ -24133,545 +24101,6 @@ bool RISCVTargetLowering::isLegalStridedLoadStore(EVT DataType,
   return true;
 }
 
-static const Intrinsic::ID FixedVlsegIntrIds[] = {
-    Intrinsic::riscv_seg2_load_mask, Intrinsic::riscv_seg3_load_mask,
-    Intrinsic::riscv_seg4_load_mask, Intrinsic::riscv_seg5_load_mask,
-    Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask,
-    Intrinsic::riscv_seg8_load_mask};
-
-static const Intrinsic::ID ScalableVlsegIntrIds[] = {
-    Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
-    Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
-    Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
-    Intrinsic::riscv_vlseg8_mask};
-
-/// Lower an interleaved load into a vlsegN intrinsic.
-///
-/// E.g. Lower an interleaved load (Factor = 2):
-///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
-///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
-///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
-///
-/// Into:
-///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.riscv.seg2.load.v4i32.p0.i64(
-///                                        %ptr, i64 4)
-///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
-///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
-bool RISCVTargetLowering::lowerInterleavedLoad(
-    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
-    ArrayRef<unsigned> Indices, unsigned Factor) const {
-  assert(Indices.size() == Shuffles.size());
-
-  IRBuilder<> Builder(LI);
-
-  const DataLayout &DL = LI->getDataLayout();
-
-  auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
-  if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(),
-                                    LI->getPointerAddressSpace(), DL))
-    return false;
-
-  auto *PtrTy = LI->getPointerOperandType();
-  auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
-
-  // If the segment load is going to be performed segment at a time anyways
-  // and there's only one element used, use a strided load instead. This
-  // will be equally fast, and create less vector register pressure.
-  if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
-    unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
-    Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
-    Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
-    Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
-    Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
-    Value *VL = Builder.getInt32(VTy->getNumElements());
-
-    CallInst *CI =
-        Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
-                                {VTy, BasePtr->getType(), Stride->getType()},
-                                {BasePtr, Stride, Mask, VL});
-    CI->addParamAttr(
-        0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
-    Shuffles[0]->replaceAllUsesWith(CI);
-    return true;
-  };
-
-  Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
-  Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
-  CallInst *VlsegN = Builder.CreateIntrinsic(
-      FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy},
-      {LI->getPointerOperand(), Mask, VL});
-
-  for (unsigned i = 0; i < Shuffles.size(); i++) {
-    Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
-    Shuffles[i]->replaceAllUsesWith(SubVec);
-  }
-
-  return true;
-}
-
-static const Intrinsic::ID FixedVssegIntrIds[] = {
-    Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask,
-    Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask,
-    Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
-    Intrinsic::riscv_seg8_store_mask};
-
-static const Intrinsic::ID ScalableVssegIntrIds[] = {
-    Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
-    Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
-    Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
-    Intrinsic::riscv_vsseg8_mask};
-
-/// Lower an interleaved store into a vssegN intrinsic.
-///
-/// E.g. Lower an interleaved store (Factor = 3):
-///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
-///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
-///        store <12 x i32> %i.vec, <12 x i32>* %ptr
-///
-/// Into:
-///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
-///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
-///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
-///        call void llvm.riscv.seg3.store.v4i32.p0.i64(%sub.v0, %sub.v1, %sub.v2,
-///                                                     %ptr, i32 4)
-///
-/// Note that the new shufflevectors will be removed and we'll only generate one
-/// vsseg3 instruction in CodeGen.
-bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
-                                                ShuffleVectorInst *SVI,
-                                                unsigned Factor) const {
-  IRBuilder<> Builder(SI);
-  const DataLayout &DL = SI->getDataLayout();
-  auto Mask = SVI->getShuffleMask();
-  auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
-  // Given SVI : <n*factor x ty>, then VTy : <n x ty>
-  auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
-                                   ShuffleVTy->getNumElements() / Factor);
-  if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(),
-                                    SI->getPointerAddressSpace(), DL))
-    return false;
-
-  auto *PtrTy = SI->getPointerOperandType();
-  auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
-
-  unsigned Index;
-  // If the segment store only has one active lane (i.e. the interleave is
-  // just a spread shuffle), we can use a strided store instead. This will
-  // be equally fast, and create less vector register pressure.
-  if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) &&
-      isSpreadMask(Mask, Factor, Index)) {
-    unsigned ScalarSizeInBytes =
-        DL.getTypeStoreSize(ShuffleVTy->getElementType());
-    Value *Data = SVI->getOperand(0);
-    auto *DataVTy = cast<FixedVectorType>(Data->getType());
-    Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
-    Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
-    Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
-    Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
-    Value *VL = Builder.getInt32(VTy->getNumElements());
-
-    CallInst *CI = Builder.CreateIntrinsic(
-        Intrinsic::experimental_vp_strided_store,
-        {Data->getType(), BasePtr->getType(), Stride->getType()},
-        {Data, BasePtr, Stride, Mask, VL});
-    CI->addParamAttr(
-        1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign()));
-
-    return true;
-  }
-
-  Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-      SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
-
-  SmallVector<Value *> Ops;
-  SmallVector<int> NewShuffleMask;
-
-  for (unsigned i = 0; i < Factor; i++) {
-    // Collect shuffle mask for this lane.
-    for (unsigned j = 0; j < VTy->getNumElements(); j++)
-      NewShuffleMask.push_back(Mask[i + Factor * j]);
-
-    Value *Shuffle = Builder.CreateShuffleVector(
-        SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask);
-    Ops.push_back(Shuffle);
-
-    NewShuffleMask.clear();
-  }
-  // This VL should be OK (should be executable in one vsseg instruction,
-  // potentially under larger LMULs) because we checked that the fixed vector
-  // type fits in isLegalInterleavedAccessType
-  Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
-  Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount());
-  Ops.append({SI->getPointerOperand(), StoreMask, VL});
-
-  Builder.CreateCall(VssegNFunc, Ops);
-
-  return true;
-}
-
-bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
-    LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const {
-  const unsigned Factor = DeinterleaveValues.size();
-  if (Factor > 8)
-    return false;
-
-  assert(LI->isSimple());
-  IRBuilder<> Builder(LI);
-
-  Value *FirstActive =
-      *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; });
-  VectorType *ResVTy = cast<VectorType>(FirstActive->getType());
-
-  const DataLayout &DL = LI->getDataLayout();
-
-  if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
-                                    LI->getPointerAddressSpace(), DL))
-    return false;
-
-  Value *Return;
-  Type *PtrTy = LI->getPointerOperandType();
-  Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
-
-  if (auto *FVTy = dyn_cast<FixedVectorType>(ResVTy)) {
-    Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements());
-    Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount());
-    Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
-                                     {ResVTy, PtrTy, XLenTy},
-                                     {LI->getPointerOperand(), Mask, VL});
-  } else {
-    static const Intrinsic::ID IntrIds[] = {
-        Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3,
-        Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5,
-        Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7,
-        Intrinsic::riscv_vlseg8};
-
-    unsigned SEW = DL.getTypeSizeInBits(ResVTy->getElementType());
-    unsigned NumElts = ResVTy->getElementCount().getKnownMinValue();
-    Type *VecTupTy = TargetExtType::get(
-        LI->getContext(), "riscv.vector.tuple",
-        ScalableVectorType::get(Type::getInt8Ty(LI->getContext()),
-                                NumElts * SEW / 8),
-        Factor);
-
-    Value *VL = Constant::getAllOnesValue(XLenTy);
-
-    Value *Vlseg = Builder.CreateIntrinsic(
-        IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy},
-        {PoisonValue::get(VecTupTy), LI->getPointerOperand(), VL,
-         ConstantInt::get(XLenTy, Log2_64(SEW))});
-
-    SmallVector<Type *> AggrTypes{Factor, ResVTy};
-    Return = PoisonValue::get(StructType::get(LI->getContext(), AggrTypes));
-    for (unsigned i = 0; i < Factor; ++i) {
-      Value *VecExtract = Builder.CreateIntrinsic(
-          Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy},
-          {Vlseg, Builder.getInt32(i)});
-      Return = Builder.CreateInsertValue(Return, VecExtract, i);
-    }
-  }
-
-  for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) {
-    if (!DIV)
-      continue;
-    // We have to create a brand new ExtractValue to replace each
-    // of these old ExtractValue instructions.
-    Value *NewEV =
-        Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
-    DIV->replaceAllUsesWith(NewEV);
-  }
-
-  return true;
-}
-
-bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
-    StoreInst *SI, ArrayRef<Value *> InterleaveValues) const {
-  unsigned Factor = InterleaveValues.size();
-  if (Factor > 8)
-    return false;
-
-  assert(SI->isSimple());
-  IRBuilder<> Builder(SI);
-
-  auto *InVTy = cast<VectorType>(InterleaveValues[0]->getType());
-  auto *PtrTy = SI->getPointerOperandType();
-  const DataLayout &DL = SI->getDataLayout();
-
-  if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
-                                    SI->getPointerAddressSpace(), DL))
-    return false;
-
-  Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
-
-  if (auto *FVTy = dyn_cast<FixedVectorType>(InVTy)) {
-    Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-        SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, PtrTy, XLenTy});
-
-    SmallVector<Value *> Ops(InterleaveValues);
-    Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements());
-    Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount());
-    Ops.append({SI->getPointerOperand(), Mask, VL});
-
-    Builder.CreateCall(VssegNFunc, Ops);
-  } else {
-    static const Intrinsic::ID IntrIds[] = {
-        Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3,
-        Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5,
-        Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7,
-        Intrinsic::riscv_vsseg8};
-
-    unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType());
-    unsigned NumElts = InVTy->getElementCount().getKnownMinValue();
-    Type *VecTupTy = TargetExtType::get(
-        SI->getContext(), "riscv.vector.tuple",
-        ScalableVectorType::get(Type::getInt8Ty(SI->getContext()),
-                                NumElts * SEW / 8),
-        Factor);
-
-    Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-        SI->getModule(), IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy});
-
-    Value *VL = Constant::getAllOnesValue(XLenTy);
-
-    Value *StoredVal = PoisonValue::get(VecTupTy);
-    for (unsigned i = 0; i < Factor; ++i)
-      StoredVal = Builder.CreateIntrinsic(
-          Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy},
-          {StoredVal, InterleaveValues[i], Builder.getInt32(i)});
-
-    Builder.CreateCall(VssegNFunc, {StoredVal, SI->getPointerOperand(), VL,
-                                    ConstantInt::get(XLenTy, Log2_64(SEW))});
-  }
-
-  return true;
-}
-
-static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
-  assert(N);
-  if (N == 1)
-    return true;
-
-  using namespace PatternMatch;
-  // Right now we're only recognizing the simplest pattern.
-  uint64_t C;
-  if (match(V, m_CombineOr(m_ConstantInt(C),
-                           m_c_Mul(m_Value(), m_ConstantInt(C)))) &&
-      C && C % N == 0)
-    return true;
-
-  if (isPowerOf2_32(N)) {
-    KnownBits KB = llvm::computeKnownBits(V, DL);
-    return KB.countMinTrailingZeros() >= Log2_32(N);
-  }
-
-  return false;
-}
-
-/// Lower an interleaved vp.load into a vlsegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.load (Factor = 2):
-///   %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr,
-///                                                         <vscale x 64 x i1> %mask,
-///                                                         i32 %wide.rvl)
-///   %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-///             @llvm.vector.deinterleave2.nxv64i8(
-///               <vscale x 64 x i8> %l)
-///   %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0
-///   %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1
-///
-/// Into:
-///   %rvl = udiv %wide.rvl, 2
-///   %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> }
-///             @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef,
-///                                                 <vscale x 32 x i8> undef,
-///                                                 ptr %ptr,
-///                                                 <vscale x 32 x i1> %mask,
-///                                                 i64 %rvl,
-///                                                 i64 1)
-///   %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0
-///   %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1
-///
-/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be
-/// removed by the caller
-/// TODO: We probably can loosen the dependency on matching extractvalue when
-/// dealing with factor of 2 (extractvalue is still required for most of other
-/// factors though).
-bool RISCVTargetLowering::lowerInterleavedVPLoad(
-    VPIntrinsic *Load, Value *Mask,
-    ArrayRef<Value *> DeinterleaveResults) const {
-  const unsigned Factor = DeinterleaveResults.size();
-  assert(Mask && "Expect a valid mask");
-  assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
-         "Unexpected intrinsic");
-
-  Value *FirstActive = *llvm::find_if(DeinterleaveResults,
-                                      [](Value *V) { return V != nullptr; });
-  VectorType *VTy = cast<VectorType>(FirstActive->getType());
-
-  auto &DL = Load->getModule()->getDataLayout();
-  Align Alignment = Load->getParamAlign(0).value_or(
-      DL.getABITypeAlign(VTy->getElementType()));
-  if (!isLegalInterleavedAccessType(
-          VTy, Factor, Alignment,
-          Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
-    return false;
-
-  IRBuilder<> Builder(Load);
-
-  Value *WideEVL = Load->getVectorLengthParam();
-  // Conservatively check if EVL is a multiple of factor, otherwise some
-  // (trailing) elements might be lost after the transformation.
-  if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
-    return false;
-
-  auto *PtrTy = Load->getArgOperand(0)->getType();
-  auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
-  Value *EVL = Builder.CreateZExt(
-      Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
-      XLenTy);
-
-  Value *Return = nullptr;
-  if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
-    Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
-                                     {FVTy, PtrTy, XLenTy},
-                                     {Load->getArgOperand(0), Mask, EVL});
-  } else {
-    unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
-    unsigned NumElts = VTy->getElementCount().getKnownMinValue();
-    Type *VecTupTy = TargetExtType::get(
-        Load->getContext(), "riscv.vector.tuple",
-        ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
-                                NumElts * SEW / 8),
-        Factor);
-
-    Value *PoisonVal = PoisonValue::get(VecTupTy);
-
-    Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
-        Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
-        {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
-    Value *Operands[] = {
-        PoisonVal,
-        Load->getArgOperand(0),
-        Mask,
-        EVL,
-        ConstantInt::get(XLenTy,
-                         RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
-        ConstantInt::get(XLenTy, Log2_64(SEW))};
-
-    CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);
-
-    SmallVector<Type *> AggrTypes{Factor, VTy};
-    Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
-    Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
-        Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
-    for (unsigned i = 0; i < Factor; ++i) {
-      Value *VecExtract =
-          Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
-      Return = Builder.CreateInsertValue(Return, VecExtract, i);
-    }
-  }
-
-  for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
-    if (!DIO)
-      continue;
-    // We have to create a brand new ExtractValue to replace each
-    // of these old ExtractValue instructions.
-    Value *NewEV =
-        Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
-    DIO->replaceAllUsesWith(NewEV);
-  }
-
-  return true;
-}
-
-/// Lower an interleaved vp.store into a vssegN intrinsic.
-///
-/// E.g. Lower an interleaved vp.store (Factor = 2):
-///
-///   %is = tail call <vscale x 64 x i8>
-///             @llvm.vector.interleave2.nxv64i8(
-///                               <vscale x 32 x i8> %load0,
-///                               <vscale x 32 x i8> %load1)
-///   %wide.rvl = shl nuw nsw i32 %rvl, 1
-///   tail call void @llvm.vp.store.nxv64i8.p0(
-///       <vscale x 64 x i8> %is, ptr %ptr,
-///       <vscale x 64 x i1> %mask,
-///       i32 %wide.rvl)
-///
-/// Into:
-///   call void @llvm.riscv.vsseg2.mask.nxv32i8.i64(
-///       <vscale x 32 x i8> %load1,
-///       <vscale x 32 x i8> %load2, ptr %ptr,
-///       <vscale x 32 x i1> %mask,
-///       i64 %rvl)
-bool RISCVTargetLowering::lowerInterleavedVPStore(
-    VPIntrinsic *Store, Value *Mask,
-    ArrayRef<Value *> InterleaveOperands) const {
-  assert(Mask && "Expect a valid mask");
-  assert(Store->getIntrinsicID() == Intrinsic::vp_store &&
-         "Unexpected intrinsic");
-
-  const unsigned Factor = InterleaveOperands.size();
-
-  auto *VTy = dyn_cast<VectorType>(InterleaveOperands[0]->getType());
-  if (!VTy)
-    return false;
-
-  const DataLayout &DL = Store->getDataLayout();
-  Align Alignment = Store->getParamAlign(1).value_or(
-      DL.getABITypeAlign(VTy->getElementType()));
-  if (!isLegalInterleavedAccessType(
-          VTy, Factor, Alignment,
-          Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL))
-    return false;
-
-  IRBuilder<> Builder(Store);
-  Value *WideEVL = Store->getArgOperand(3);
-  // Conservatively check if EVL is a multiple of factor, otherwise some
-  // (trailing) elements might be lost after the transformation.
-  if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor))
-    return false;
-
-  auto *PtrTy = Store->getArgOperand(1)->getType();
-  auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
-  Value *EVL = Builder.CreateZExt(
-      Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
-      XLenTy);
-
-  if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
-    SmallVector<Value *> Operands(InterleaveOperands);
-    Operands.append({Store->getArgOperand(1), Mask, EVL});
-    Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2],
-                            {FVTy, PtrTy, XLenTy}, Operands);
-    return true;
-  }
-
-  unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
-  unsigned NumElts = VTy->getElementCount().getKnownMinValue();
-  Type *VecTupTy = TargetExtType::get(
-      Store->getContext(), "riscv.vector.tuple",
-      ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
-                              NumElts * SEW / 8),
-      Factor);
-
-  Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration(
-      Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy});
-  Value *StoredVal = PoisonValue::get(VecTupTy);
-  for (unsigned i = 0; i < Factor; ++i)
-    StoredVal = Builder.CreateCall(
-        VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)});
-
-  Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-      Store->getModule(), ScalableVssegIntrIds[Factor - 2],
-      {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
-
-  Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL,
-                       ConstantInt::get(XLenTy, Log2_64(SEW))};
-
-  Builder.CreateCall(VssegNFunc, Operands);
-  return true;
-}
-
 MachineInstr *
 RISCVTargetLowering::EmitKCFICheck(MachineBasicBlock &MBB,
                                    MachineBasicBlock::instr_iterator &MBBI,
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index bcbda30342b80..00e969056df7d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -468,6 +468,12 @@ class RISCVTargetLowering : public TargetLowering {
 
   ArrayRef<MCPhysReg> getRoundingControlRegisters() const override;
 
+  /// Match a mask which "spreads" the leading elements of a vector evenly
+  /// across the result. Factor is the spread amount, and Index is the
+  /// offset applied.
+  static bool isSpreadMask(ArrayRef<int> Mask, unsigned Factor,
+                           unsigned &Index);
+
 private:
   void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
                         const SmallVectorImpl<ISD::InputArg> &Ins, bool IsRet,
diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
new file mode 100644
index 0000000000000..e5e79453ee53a
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -0,0 +1,595 @@
+//===-- RISCVInterleavedAccess.cpp - RISC-V Interleaved Access Transform --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVISelLowering.h"
+#include "RISCVSubtarget.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+
+using namespace llvm;
+
+bool RISCVTargetLowering::isLegalInterleavedAccessType(
+    VectorType *VTy, unsigned Factor, Align Alignment, unsigned AddrSpace,
+    const DataLayout &DL) const {
+  EVT VT = getValueType(DL, VTy);
+  // Don't lower vlseg/vsseg for vector types that can't be split.
+  if (!isTypeLegal(VT))
+    return false;
+
+  if (!isLegalElementTypeForRVV(VT.getScalarType()) ||
+      !allowsMemoryAccessForAlignment(VTy->getContext(), DL, VT, AddrSpace,
+                                      Alignment))
+    return false;
+
+  MVT ContainerVT = VT.getSimpleVT();
+
+  if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
+    if (!Subtarget.useRVVForFixedLengthVectors())
+      return false;
+    // Sometimes the interleaved access pass picks up splats as interleaves of
+    // one element. Don't lower these.
+    if (FVTy->getNumElements() < 2)
+      return false;
+
+    ContainerVT = getContainerForFixedLengthVector(VT.getSimpleVT());
+  }
+
+  // Need to make sure that EMUL * NFIELDS ≤ 8
+  auto [LMUL, Fractional] = RISCVVType::decodeVLMUL(getLMUL(ContainerVT));
+  if (Fractional)
+    return true;
+  return Factor * LMUL <= 8;
+}
+
+static const Intrinsic::ID FixedVlsegIntrIds[] = {
+    Intrinsic::riscv_seg2_load_mask, Intrinsic::riscv_seg3_load_mask,
+    Intrinsic::riscv_seg4_load_mask, Intrinsic::riscv_seg5_load_mask,
+    Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask,
+    Intrinsic::riscv_seg8_load_mask};
+
+static const Intrinsic::ID ScalableVlsegIntrIds[] = {
+    Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
+    Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
+    Intrinsic::riscv_vlseg6_mask, Intrinsic::riscv_vlseg7_mask,
+    Intrinsic::riscv_vlseg8_mask};
+
+/// Lower an interleaved load into a vlsegN intrinsic.
+///
+/// E.g. Lower an interleaved load (Factor = 2):
+///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+///        %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6>  ; Extract even elements
+///        %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7>  ; Extract odd elements
+///
+/// Into:
+///        %ld2 = { <4 x i32>, <4 x i32> } call llvm.riscv.seg2.load.v4i32.p0.i64(
+///                                        %ptr, i64 4)
+///        %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
+///        %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
+bool RISCVTargetLowering::lowerInterleavedLoad(
+    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
+    ArrayRef<unsigned> Indices, unsigned Factor) const {
+  assert(Indices.size() == Shuffles.size());
+
+  IRBuilder<> Builder(LI);
+
+  const DataLayout &DL = LI->getDataLayout();
+
+  auto *VTy = cast<FixedVectorType>(Shuffles[0]->getType());
+  if (!isLegalInterleavedAccessType(VTy, Factor, LI->getAlign(),
+                                    LI->getPointerAddressSpace(), DL))
+    return false;
+
+  auto *PtrTy = LI->getPointerOperandType();
+  auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
+
+  // If the segment load is going to be performed segment at a time anyways
+  // and there's only one element used, use a strided load instead. This
+  // will be equally fast, and create less vector register pressure.
+  if (Indices.size() == 1 && !Subtarget.hasOptimizedSegmentLoadStore(Factor)) {
+    unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
+    Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
+    Value *Offset = ConstantInt::get(XLenTy, Indices[0] * ScalarSizeInBytes);
+    Value *BasePtr = Builder.CreatePtrAdd(LI->getPointerOperand(), Offset);
+    Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
+    Value *VL = Builder.getInt32(VTy->getNumElements());
+
+    CallInst *CI =
+        Builder.CreateIntrinsic(Intrinsic::experimental_vp_strided_load,
+                                {VTy, BasePtr->getType(), Stride->getType()},
+                                {BasePtr, Stride, Mask, VL});
+    CI->addParamAttr(
+        0, Attribute::getWithAlignment(CI->getContext(), LI->getAlign()));
+    Shuffles[0]->replaceAllUsesWith(CI);
+    return true;
+  };
+
+  Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
+  Value *Mask = Builder.getAllOnesMask(VTy->getElementCount());
+  CallInst *VlsegN = Builder.CreateIntrinsic(
+      FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy},
+      {LI->getPointerOperand(), Mask, VL});
+
+  for (unsigned i = 0; i < Shuffles.size(); i++) {
+    Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
+    Shuffles[i]->replaceAllUsesWith(SubVec);
+  }
+
+  return true;
+}
+
+static const Intrinsic::ID FixedVssegIntrIds[] = {
+    Intrinsic::riscv_seg2_store_mask, Intrinsic::riscv_seg3_store_mask,
+    Intrinsic::riscv_seg4_store_mask, Intrinsic::riscv_seg5_store_mask,
+    Intrinsic::riscv_seg6_store_mask, Intrinsic::riscv_seg7_store_mask,
+    Intrinsic::riscv_seg8_store_mask};
+
+static const Intrinsic::ID ScalableVssegIntrIds[] = {
+    Intrinsic::riscv_vsseg2_mask, Intrinsic::riscv_vsseg3_mask,
+    Intrinsic::riscv_vsseg4_mask, Intrinsic::riscv_vsseg5_mask,
+    Intrinsic::riscv_vsseg6_mask, Intrinsic::riscv_vsseg7_mask,
+    Intrinsic::riscv_vsseg8_mask};
+
+/// Lower an interleaved store into a vssegN intrinsic.
+///
+/// E.g. Lower an interleaved store (Factor = 3):
+///        %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+///                 <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+///        store <12 x i32> %i.vec, <12 x i32>* %ptr
+///
+/// Into:
+///        %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> v1, <0, 1, 2, 3>
+///        %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> v1, <4, 5, 6, 7>
+///        %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> v1, <8, 9, 10, 11>
+///        call void llvm.riscv.seg3.store.v4i32.p0.i64(%sub.v0, %sub.v1, %sub.v2,
+///                                                     %ptr, i32 4)
+///
+/// Note that the new shufflevectors will be removed and we'll only generate one
+/// vsseg3 instruction in CodeGen.
+bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
+                                                ShuffleVectorInst *SVI,
+                                                unsigned Factor) const {
+  IRBuilder<> Builder(SI);
+  const DataLayout &DL = SI->getDataLayout();
+  auto Mask = SVI->getShuffleMask();
+  auto *ShuffleVTy = cast<FixedVectorType>(SVI->getType());
+  // Given SVI : <n*factor x ty>, then VTy : <n x ty>
+  auto *VTy = FixedVectorType::get(ShuffleVTy->getElementType(),
+                                   ShuffleVTy->getNumElements() / Factor);
+  if (!isLegalInterleavedAccessType(VTy, Factor, SI->getAlign(),
+                                    SI->getPointerAddressSpace(), DL))
+    return false;
+
+  auto *PtrTy = SI->getPointerOperandType();
+  auto *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
+
+  unsigned Index;
+  // If the segment store only has one active lane (i.e. the interleave is
+  // just a spread shuffle), we can use a strided store instead. This will
+  // be equally fast, and create less vector register pressure.
+  if (!Subtarget.hasOptimizedSegmentLoadStore(Factor) &&
+      isSpreadMask(Mask, Factor, Index)) {
+    unsigned ScalarSizeInBytes =
+        DL.getTypeStoreSize(ShuffleVTy->getElementType());
+    Value *Data = SVI->getOperand(0);
+    auto *DataVTy = cast<FixedVectorType>(Data->getType());
+    Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
+    Value *Offset = ConstantInt::get(XLenTy, Index * ScalarSizeInBytes);
+    Value *BasePtr = Builder.CreatePtrAdd(SI->getPointerOperand(), Offset);
+    Value *Mask = Builder.getAllOnesMask(DataVTy->getElementCount());
+    Value *VL = Builder.getInt32(VTy->getNumElements());
+
+    CallInst *CI = Builder.CreateIntrinsic(
+        Intrinsic::experimental_vp_strided_store,
+        {Data->getType(), BasePtr->getType(), Stride->getType()},
+        {Data, BasePtr, Stride, Mask, VL});
+    CI->addParamAttr(
+        1, Attribute::getWithAlignment(CI->getContext(), SI->getAlign()));
+
+    return true;
+  }
+
+  Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
+      SI->getModule(), FixedVssegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy});
+
+  SmallVector<Value *> Ops;
+  SmallVector<int> NewShuffleMask;
+
+  for (unsigned i = 0; i < Factor; i++) {
+    // Collect shuffle mask for this lane.
+    for (unsigned j = 0; j < VTy->getNumElements(); j++)
+      NewShuffleMask.push_back(Mask[i + Factor * j]);
+
+    Value *Shuffle = Builder.CreateShuffleVector(
+        SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask);
+    Ops.push_back(Shuffle);
+
+    NewShuffleMask.clear();
+  }
+  // This VL should be OK (should be executable in one vsseg instruction,
+  // potentially under larger LMULs) because we checked that the fixed vector
+  // type fits in isLegalInterleavedAccessType
+  Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
+  Value *StoreMask = Builder.getAllOnesMask(VTy->getElementCount());
+  Ops.append({SI->getPointerOperand(), StoreMask, VL});
+
+  Builder.CreateCall(VssegNFunc, Ops);
+
+  return true;
+}
+
+bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
+    LoadInst *LI, ArrayRef<Value *> DeinterleaveValues) const {
+  const unsigned Factor = DeinterleaveValues.size();
+  if (Factor > 8)
+    return false;
+
+  assert(LI->isSimple());
+  IRBuilder<> Builder(LI);
+
+  Value *FirstActive =
+      *llvm::find_if(DeinterleaveValues, [](Value *V) { return V != nullptr; });
+  VectorType *ResVTy = cast<VectorType>(FirstActive->getType());
+
+  const DataLayout &DL = LI->getDataLayout();
+
+  if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
+                                    LI->getPointerAddressSpace(), DL))
+    return false;
+
+  Value *Return;
+  Type *PtrTy = LI->getPointerOperandType();
+  Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
+
+  if (auto *FVTy = dyn_cast<FixedVectorType>(ResVTy)) {
+    Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements());
+    Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount());
+    Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
+                                     {ResVTy, PtrTy, XLenTy},
+                                     {LI->getPointerOperand(), Mask, VL});
+  } else {
+    static const Intrinsic::ID IntrIds[] = {
+        Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3,
+        Intrinsic::riscv_vlseg4, Intrinsic::riscv_vlseg5,
+        Intrinsic::riscv_vlseg6, Intrinsic::riscv_vlseg7,
+        Intrinsic::riscv_vlseg8};
+
+    unsigned SEW = DL.getTypeSizeInBits(ResVTy->getElementType());
+    unsigned NumElts = ResVTy->getElementCount().getKnownMinValue();
+    Type *VecTupTy = TargetExtType::get(
+        LI->getContext(), "riscv.vector.tuple",
+        ScalableVectorType::get(Type::getInt8Ty(LI->getContext()),
+                                NumElts * SEW / 8),
+        Factor);
+
+    Value *VL = Constant::getAllOnesValue(XLenTy);
+
+    Value *Vlseg = Builder.CreateIntrinsic(
+        IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy},
+        {PoisonValue::get(VecTupTy), LI->getPointerOperand(), VL,
+         ConstantInt::get(XLenTy, Log2_64(SEW))});
+
+    SmallVector<Type *> AggrTypes{Factor, ResVTy};
+    Return = PoisonValue::get(StructType::get(LI->getContext(), AggrTypes));
+    for (unsigned i = 0; i < Factor; ++i) {
+      Value *VecExtract = Builder.CreateIntrinsic(
+          Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy},
+          {Vlseg, Builder.getInt32(i)});
+      Return = Builder.CreateInsertValue(Return, VecExtract, i);
+    }
+  }
+
+  for (auto [Idx, DIV] : enumerate(DeinterleaveValues)) {
+    if (!DIV)
+      continue;
+    // We have to create a brand new ExtractValue to replace each
+    // of these old ExtractValue instructions.
+    Value *NewEV =
+        Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
+    DIV->replaceAllUsesWith(NewEV);
+  }
+
+  return true;
+}
+
+bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
+    StoreInst *SI, ArrayRef<Value *> InterleaveValues) const {
+  unsigned Factor = InterleaveValues.size();
+  if (Factor > 8)
+    return false;
+
+  assert(SI->isSimple());
+  IRBuilder<> Builder(SI);
+
+  auto *InVTy = cast<VectorType>(InterleaveValues[0]->getType());
+  auto *PtrTy = SI->getPointerOperandType();
+  const DataLayout &DL = SI->getDataLayout();
+
+  if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
+                                    SI->getPointerAddressSpace(), DL))
+    return false;
+
+  Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
+
+  if (auto *FVTy = dyn_cast<FixedVectorType>(InVTy)) {
+    Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
+        SI->getModule(), FixedVssegIntrIds[Factor - 2], {InVTy, PtrTy, XLenTy});
+
+    SmallVector<Value *> Ops(InterleaveValues);
+    Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements());
+    Value *Mask = Builder.getAllOnesMask(FVTy->getElementCount());
+    Ops.append({SI->getPointerOperand(), Mask, VL});
+
+    Builder.CreateCall(VssegNFunc, Ops);
+  } else {
+    static const Intrinsic::ID IntrIds[] = {
+        Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3,
+        Intrinsic::riscv_vsseg4, Intrinsic::riscv_vsseg5,
+        Intrinsic::riscv_vsseg6, Intrinsic::riscv_vsseg7,
+        Intrinsic::riscv_vsseg8};
+
+    unsigned SEW = DL.getTypeSizeInBits(InVTy->getElementType());
+    unsigned NumElts = InVTy->getElementCount().getKnownMinValue();
+    Type *VecTupTy = TargetExtType::get(
+        SI->getContext(), "riscv.vector.tuple",
+        ScalableVectorType::get(Type::getInt8Ty(SI->getContext()),
+                                NumElts * SEW / 8),
+        Factor);
+
+    Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
+        SI->getModule(), IntrIds[Factor - 2], {VecTupTy, PtrTy, XLenTy});
+
+    Value *VL = Constant::getAllOnesValue(XLenTy);
+
+    Value *StoredVal = PoisonValue::get(VecTupTy);
+    for (unsigned i = 0; i < Factor; ++i)
+      StoredVal = Builder.CreateIntrinsic(
+          Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy},
+          {StoredVal, InterleaveValues[i], Builder.getInt32(i)});
+
+    Builder.CreateCall(VssegNFunc, {StoredVal, SI->getPointerOperand(), VL,
+                                    ConstantInt::get(XLenTy, Log2_64(SEW))});
+  }
+
+  return true;
+}
+
+static bool isMultipleOfN(const Value *V, const DataLayout &DL, unsigned N) {
+  assert(N);
+  if (N == 1)
+    return true;
+
+  using namespace PatternMatch;
+  // Right now we're only recognizing the simplest pattern.
+  uint64_t C;
+  if (match(V, m_CombineOr(m_ConstantInt(C),
+                           m_c_Mul(m_Value(), m_ConstantInt(C)))) &&
+      C && C % N == 0)
+    return true;
+
+  if (isPowerOf2_32(N)) {
+    KnownBits KB = llvm::computeKnownBits(V, DL);
+    return KB.countMinTrailingZeros() >= Log2_32(N);
+  }
+
+  return false;
+}
+
+/// Lower an interleaved vp.load into a vlsegN intrinsic.
+///
+/// E.g. Lower an interleaved vp.load (Factor = 2):
+///   %l = call <vscale x 64 x i8> @llvm.vp.load.nxv64i8.p0(ptr %ptr,
+///                                                         <vscale x 64 x i1> %mask,
+///                                                         i32 %wide.rvl)
+///   %dl = tail call { <vscale x 32 x i8>, <vscale x 32 x i8> }
+///             @llvm.vector.deinterleave2.nxv64i8(
+///               <vscale x 64 x i8> %l)
+///   %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 0
+///   %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %dl, 1
+///
+/// Into:
+///   %rvl = udiv %wide.rvl, 2
+///   %sl = call { <vscale x 32 x i8>, <vscale x 32 x i8> }
+///             @llvm.riscv.vlseg2.mask.nxv32i8.i64(<vscale x 32 x i8> undef,
+///                                                 <vscale x 32 x i8> undef,
+///                                                 ptr %ptr,
+///                                                 <vscale x 32 x i1> %mask,
+///                                                 i64 %rvl,
+///                                                 i64 1)
+///   %r0 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 0
+///   %r1 = extractvalue { <vscale x 32 x i8>, <vscale x 32 x i8> } %sl, 1
+///
+/// NOTE: the deinterleave2 intrinsic won't be touched and is expected to be
+/// removed by the caller
+/// TODO: We probably can loosen the dependency on matching extractvalue when
+/// dealing with factor of 2 (extractvalue is still required for most of other
+/// factors though).
+bool RISCVTargetLowering::lowerInterleavedVPLoad(
+    VPIntrinsic *Load, Value *Mask,
+    ArrayRef<Value *> DeinterleaveResults) const {
+  const unsigned Factor = DeinterleaveResults.size();
+  assert(Mask && "Expect a valid mask");
+  assert(Load->getIntrinsicID() == Intrinsic::vp_load &&
+         "Unexpected intrinsic");
+
+  Value *FirstActive = *llvm::find_if(DeinterleaveResults,
+                                      [](Value *V) { return V != nullptr; });
+  VectorType *VTy = cast<VectorType>(FirstActive->getType());
+
+  auto &DL = Load->getModule()->getDataLayout();
+  Align Alignment = Load->getParamAlign(0).value_or(
+      DL.getABITypeAlign(VTy->getElementType()));
+  if (!isLegalInterleavedAccessType(
+          VTy, Factor, Alignment,
+          Load->getArgOperand(0)->getType()->getPointerAddressSpace(), DL))
+    return false;
+
+  IRBuilder<> Builder(Load);
+
+  Value *WideEVL = Load->getVectorLengthParam();
+  // Conservatively check if EVL is a multiple of factor, otherwise some
+  // (trailing) elements might be lost after the transformation.
+  if (!isMultipleOfN(WideEVL, Load->getDataLayout(), Factor))
+    return false;
+
+  auto *PtrTy = Load->getArgOperand(0)->getType();
+  auto *XLenTy = Type::getIntNTy(Load->getContext(), Subtarget.getXLen());
+  Value *EVL = Builder.CreateZExt(
+      Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
+      XLenTy);
+
+  Value *Return = nullptr;
+  if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
+    Return = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
+                                     {FVTy, PtrTy, XLenTy},
+                                     {Load->getArgOperand(0), Mask, EVL});
+  } else {
+    unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
+    unsigned NumElts = VTy->getElementCount().getKnownMinValue();
+    Type *VecTupTy = TargetExtType::get(
+        Load->getContext(), "riscv.vector.tuple",
+        ScalableVectorType::get(Type::getInt8Ty(Load->getContext()),
+                                NumElts * SEW / 8),
+        Factor);
+
+    Value *PoisonVal = PoisonValue::get(VecTupTy);
+
+    Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
+        Load->getModule(), ScalableVlsegIntrIds[Factor - 2],
+        {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
+
+    Value *Operands[] = {
+        PoisonVal,
+        Load->getArgOperand(0),
+        Mask,
+        EVL,
+        ConstantInt::get(XLenTy,
+                         RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC),
+        ConstantInt::get(XLenTy, Log2_64(SEW))};
+
+    CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands);
+
+    SmallVector<Type *> AggrTypes{Factor, VTy};
+    Return = PoisonValue::get(StructType::get(Load->getContext(), AggrTypes));
+    Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
+        Load->getModule(), Intrinsic::riscv_tuple_extract, {VTy, VecTupTy});
+    for (unsigned i = 0; i < Factor; ++i) {
+      Value *VecExtract =
+          Builder.CreateCall(VecExtractFunc, {VlsegN, Builder.getInt32(i)});
+      Return = Builder.CreateInsertValue(Return, VecExtract, i);
+    }
+  }
+
+  for (auto [Idx, DIO] : enumerate(DeinterleaveResults)) {
+    if (!DIO)
+      continue;
+    // We have to create a brand new ExtractValue to replace each
+    // of these old ExtractValue instructions.
+    Value *NewEV =
+        Builder.CreateExtractValue(Return, {static_cast<unsigned>(Idx)});
+    DIO->replaceAllUsesWith(NewEV);
+  }
+
+  return true;
+}
+
+/// Lower an interleaved vp.store into a vssegN intrinsic.
+///
+/// E.g. Lower an interleaved vp.store (Factor = 2):
+///
+///   %is = tail call <vscale x 64 x i8>
+///             @llvm.vector.interleave2.nxv64i8(
+///                               <vscale x 32 x i8> %load0,
+///                               <vscale x 32 x i8> %load1)
+///   %wide.rvl = shl nuw nsw i32 %rvl, 1
+///   tail call void @llvm.vp.store.nxv64i8.p0(
+///       <vscale x 64 x i8> %is, ptr %ptr,
+///       <vscale x 64 x i1> %mask,
+///       i32 %wide.rvl)
+///
+/// Into:
+///   call void @llvm.riscv.vsseg2.mask.nxv32i8.i64(
+///       <vscale x 32 x i8> %load1,
+///       <vscale x 32 x i8> %load2, ptr %ptr,
+///       <vscale x 32 x i1> %mask,
+///       i64 %rvl)
+bool RISCVTargetLowering::lowerInterleavedVPStore(
+    VPIntrinsic *Store, Value *Mask,
+    ArrayRef<Value *> InterleaveOperands) const {
+  assert(Mask && "Expect a valid mask");
+  assert(Store->getIntrinsicID() == Intrinsic::vp_store &&
+         "Unexpected intrinsic");
+
+  const unsigned Factor = InterleaveOperands.size();
+
+  auto *VTy = dyn_cast<VectorType>(InterleaveOperands[0]->getType());
+  if (!VTy)
+    return false;
+
+  const DataLayout &DL = Store->getDataLayout();
+  Align Alignment = Store->getParamAlign(1).value_or(
+      DL.getABITypeAlign(VTy->getElementType()));
+  if (!isLegalInterleavedAccessType(
+          VTy, Factor, Alignment,
+          Store->getArgOperand(1)->getType()->getPointerAddressSpace(), DL))
+    return false;
+
+  IRBuilder<> Builder(Store);
+  Value *WideEVL = Store->getArgOperand(3);
+  // Conservatively check if EVL is a multiple of factor, otherwise some
+  // (trailing) elements might be lost after the transformation.
+  if (!isMultipleOfN(WideEVL, Store->getDataLayout(), Factor))
+    return false;
+
+  auto *PtrTy = Store->getArgOperand(1)->getType();
+  auto *XLenTy = Type::getIntNTy(Store->getContext(), Subtarget.getXLen());
+  Value *EVL = Builder.CreateZExt(
+      Builder.CreateUDiv(WideEVL, ConstantInt::get(WideEVL->getType(), Factor)),
+      XLenTy);
+
+  if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
+    SmallVector<Value *> Operands(InterleaveOperands);
+    Operands.append({Store->getArgOperand(1), Mask, EVL});
+    Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2],
+                            {FVTy, PtrTy, XLenTy}, Operands);
+    return true;
+  }
+
+  unsigned SEW = DL.getTypeSizeInBits(VTy->getElementType());
+  unsigned NumElts = VTy->getElementCount().getKnownMinValue();
+  Type *VecTupTy = TargetExtType::get(
+      Store->getContext(), "riscv.vector.tuple",
+      ScalableVectorType::get(Type::getInt8Ty(Store->getContext()),
+                              NumElts * SEW / 8),
+      Factor);
+
+  Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration(
+      Store->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, VTy});
+  Value *StoredVal = PoisonValue::get(VecTupTy);
+  for (unsigned i = 0; i < Factor; ++i)
+    StoredVal = Builder.CreateCall(
+        VecInsertFunc, {StoredVal, InterleaveOperands[i], Builder.getInt32(i)});
+
+  Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
+      Store->getModule(), ScalableVssegIntrIds[Factor - 2],
+      {VecTupTy, PtrTy, Mask->getType(), EVL->getType()});
+
+  Value *Operands[] = {StoredVal, Store->getArgOperand(1), Mask, EVL,
+                       ConstantInt::get(XLenTy, Log2_64(SEW))};
+
+  Builder.CreateCall(VssegNFunc, Operands);
+  return true;
+}

From 10341a7d003c5363643518347f15ee4d93c68cce Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Thu, 10 Jul 2025 12:22:34 -0700
Subject: [PATCH 2/2] fixup! [RISCV] Split InterleavedAccess related TLI hooks
 into a separate file. NFC

---
 llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
index e5e79453ee53a..a6ff22c4b391f 100644
--- a/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
+// Functions and callbacks related to the InterleavedAccessPass.
 //
 //===----------------------------------------------------------------------===//
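
Reviewer note (illustrative only, not part of the patch): a minimal IR sketch of
the single-active-lane strided-load fallback that lowerInterleavedLoad
implements, assuming a factor-4 interleave of i32 where only lane 1 is used and
a subtarget where hasOptimizedSegmentLoadStore(4) is false. The names %ptr and
%v1 and the vector widths are hypothetical.

  ; Before: what the InterleavedAccess pass hands to the hook.
  %wide.vec = load <16 x i32>, ptr %ptr
  %v1 = shufflevector <16 x i32> %wide.vec, <16 x i32> poison,
                      <4 x i32> <i32 1, i32 5, i32 9, i32 13>

  ; After: base = %ptr + Index * sizeof(i32) = %ptr + 4,
  ; stride = Factor * sizeof(i32) = 16 bytes, VL = 4 elements.
  %base = getelementptr i8, ptr %ptr, i64 4
  %v1 = call <4 x i32> @llvm.experimental.vp.strided.load.v4i32.p0.i64(
            ptr align 4 %base, i64 16,
            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 4)

This trades one segment load for a single strided access, which the comment in
the hook argues is equally fast on such subtargets while using fewer vector
registers.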