diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 639070c07897b..0cadbc5fede9b 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -154,8 +154,12 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
 /// FindAvailableLoadedValue() for the case where we are not interested in
 /// finding the closest clobbering instruction if no available load is found.
 /// This overload cannot be used to scan across multiple blocks.
+/// If \p IsVectorKindChange is not nullptr, it is an out parameter that is set
+/// to true if a value was found, but it is a scalable vector instead of the
+/// requested fixed-width one (or the other way round).
 Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
                                 bool *IsLoadCSE,
+                                bool *IsVectorKindChange = nullptr,
                                 unsigned MaxInstsToScan = DefMaxInstsToScan);
 
 /// Scan backwards to see if we have the value of the given pointer available
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 691d7e4a3edcf..e4bd59fbf2d30 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -538,7 +538,8 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr,
 
 static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
                                     Type *AccessTy, bool AtLeastAtomic,
-                                    const DataLayout &DL, bool *IsLoadCSE) {
+                                    const DataLayout &DL, bool *IsLoadCSE,
+                                    bool *IsVectorKindChange) {
   // If this is a load of Ptr, the loaded value is available.
   // (This is true even if the load is volatile or atomic, although
   // those cases are unlikely.)
@@ -584,6 +585,25 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
     if (TypeSize::isKnownLE(LoadSize, StoreSize))
       if (auto *C = dyn_cast<Constant>(Val))
         return ConstantFoldLoadFromConst(C, AccessTy, DL);
+
+    if (IsVectorKindChange && Val->getType()->isVectorTy() &&
+        AccessTy->isVectorTy()) {
+      auto Attrs = Inst->getFunction()->getAttributes().getFnAttrs();
+      unsigned VScale = Attrs.getVScaleRangeMin();
+      if (Attrs.getVScaleRangeMax() != VScale)
+        return nullptr;
+
+      unsigned FixedStoreSize =
+          (StoreSize.isFixed() ? StoreSize : StoreSize * VScale)
+              .getKnownMinValue();
+      unsigned FixedLoadSize =
+          (LoadSize.isFixed() ? LoadSize : LoadSize * VScale)
+              .getKnownMinValue();
+      if (FixedStoreSize == FixedLoadSize) {
+        *IsVectorKindChange = true;
+        return Val;
+      }
+    }
   }
 
   if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -655,8 +675,8 @@ Value *llvm::findAvailablePtrLoadStore(
 
     --ScanFrom;
 
-    if (Value *Available = getAvailableLoadStore(Inst, StrippedPtr, AccessTy,
-                                                 AtLeastAtomic, DL, IsLoadCSE))
+    if (Value *Available = getAvailableLoadStore(
+            Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL, IsLoadCSE, nullptr))
       return Available;
 
     // Try to get the store size for the type.
@@ -711,7 +731,7 @@ Value *llvm::findAvailablePtrLoadStore(
 }
 
 Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
-                                      bool *IsLoadCSE,
+                                      bool *IsLoadCSE, bool *IsVectorKindChange,
                                       unsigned MaxInstsToScan) {
   const DataLayout &DL = Load->getDataLayout();
   Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
@@ -734,8 +754,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
     if (MaxInstsToScan-- == 0)
      return nullptr;
 
-    Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
-                                      AtLeastAtomic, DL, IsLoadCSE);
+    Available =
+        getAvailableLoadStore(&Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL,
+                              IsLoadCSE, IsVectorKindChange);
 
     if (Available)
       break;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index f748f78524e0d..f463fe3e7d504 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3389,17 +3389,34 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     Value *Vec = II->getArgOperand(0);
     Value *SubVec = II->getArgOperand(1);
     Value *Idx = II->getArgOperand(2);
-    auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
-    auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
-    auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
+    auto *DstTy = cast<VectorType>(II->getType());
+    auto *VecTy = cast<VectorType>(Vec->getType());
+    auto *SubVecTy = cast<VectorType>(SubVec->getType());
+    unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
+
+    // Try store-to-load forwarding where the stored value has the same
+    // type as this intrinsic's result, and the loaded value is the inserted
+    // vector. This has to be done here because a temporary insert of
+    // a scalable vector (the available value) into a fixed-width one
+    // (the second operand of this intrinsic) cannot be created.
+    if (auto *LI = dyn_cast<LoadInst>(SubVec);
+        LI && IdxN == 0 && DstTy->isScalableTy() && !SubVecTy->isScalableTy()) {
+      bool IsVectorKindChange = false;
+      BatchAAResults BatchAA(*AA);
+      if (Value *AvailVal = FindAvailableLoadedValue(LI, BatchAA, nullptr,
+                                                     &IsVectorKindChange);
+          AvailVal && IsVectorKindChange && AvailVal->getType() == DstTy) {
+        return replaceInstUsesWith(CI, AvailVal);
+      }
+    }
 
     // Only canonicalize if the destination vector, Vec, and SubVec are all
     // fixed vectors.
-    if (DstTy && VecTy && SubVecTy) {
-      unsigned DstNumElts = DstTy->getNumElements();
-      unsigned VecNumElts = VecTy->getNumElements();
-      unsigned SubVecNumElts = SubVecTy->getNumElements();
-      unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
+    if (!DstTy->isScalableTy() && !VecTy->isScalableTy() &&
+        !SubVecTy->isScalableTy()) {
+      unsigned DstNumElts = DstTy->getElementCount().getFixedValue();
+      unsigned VecNumElts = VecTy->getElementCount().getFixedValue();
+      unsigned SubVecNumElts = SubVecTy->getElementCount().getFixedValue();
 
       // An insert that entirely overwrites Vec with SubVec is a nop.
       if (VecNumElts == SubVecNumElts)
diff --git a/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
new file mode 100644
index 0000000000000..73685fe8c3762
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+%struct.svfloat32_wrapped_t = type { <16 x float> }
+
+define <vscale x 4 x float> @store_to_vector_load_different_type(<vscale x 4 x float> %.coerce) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @store_to_vector_load_different_type(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+;
+entry:
+  %retval = alloca %struct.svfloat32_wrapped_t
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @vscale_not_fixed(<vscale x 4 x float> %.coerce) #1 {
+; CHECK-LABEL: define <vscale x 4 x float> @vscale_not_fixed(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca %struct.svfloat32_wrapped_t
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @sizes_do_not_match(<vscale x 4 x float> %.coerce) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @sizes_do_not_match(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[RETVAL]], align 32
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> poison, <8 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca %struct.svfloat32_wrapped_t
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <8 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> poison, <8 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float>, <16 x float>, i64 immarg)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float>, <8 x float>, i64 immarg)
+
+attributes #0 = { vscale_range(4,4) }
+attributes #1 = { vscale_range(1,16) }
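
A quick way to sanity-check the size rule added to getAvailableLoadStore is to model it outside of LLVM. The sketch below is illustrative only: the Size struct and the canForward helper are not part of this patch or of the LLVM API; they just reproduce the arithmetic that compares the effective store and load sizes once vscale is pinned by an exact vscale_range(N,N), using the vector types from the tests above.

// Standalone model of the new size check (plain C++, no LLVM headers).
// "MinBits" is the known minimum size in bits; "Scalable" marks
// <vscale x N x T> types whose real size is MinBits * vscale.
#include <cstdio>
#include <optional>

struct Size {
  unsigned MinBits;
  bool Scalable;
};

// Mirrors the bail-out and comparison in getAvailableLoadStore: forwarding is
// only considered when vscale_range pins vscale to a single value and the
// effective store and load sizes are equal.
static bool canForward(Size Store, Size Load, unsigned VScaleMin,
                       std::optional<unsigned> VScaleMax) {
  if (!VScaleMax || *VScaleMax != VScaleMin)
    return false;
  unsigned StoreBits = Store.Scalable ? Store.MinBits * VScaleMin : Store.MinBits;
  unsigned LoadBits = Load.Scalable ? Load.MinBits * VScaleMin : Load.MinBits;
  return StoreBits == LoadBits;
}

int main() {
  Size NxV4F32 = {128, true};  // <vscale x 4 x float>
  Size V16F32 = {512, false};  // <16 x float>
  Size V8F32 = {256, false};   // <8 x float>

  // vscale_range(4,4): 128 * 4 == 512, so forwarding is possible
  // (@store_to_vector_load_different_type).
  std::printf("%d\n", canForward(NxV4F32, V16F32, 4, 4));  // 1
  // vscale_range(1,16): vscale is not a single known value (@vscale_not_fixed).
  std::printf("%d\n", canForward(NxV4F32, V16F32, 1, 16)); // 0
  // vscale_range(4,4) but 512 != 256 (@sizes_do_not_match).
  std::printf("%d\n", canForward(NxV4F32, V8F32, 4, 4));   // 0
  return 0;
}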