diff --git a/llvm/include/llvm/Transforms/Utils/VNCoercion.h b/llvm/include/llvm/Transforms/Utils/VNCoercion.h
index f1ea94bf60fcc..ed4dbad50ee85 100644
--- a/llvm/include/llvm/Transforms/Utils/VNCoercion.h
+++ b/llvm/include/llvm/Transforms/Utils/VNCoercion.h
@@ -23,6 +23,7 @@
 
 namespace llvm {
 class Constant;
+class Function;
 class StoreInst;
 class LoadInst;
 class MemIntrinsic;
@@ -35,7 +36,7 @@ namespace VNCoercion {
 /// Return true if CoerceAvailableValueToLoadType would succeed if it was
 /// called.
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
-                                     const DataLayout &DL);
+                                     Function *F);
 
 /// If we saw a store of a value to memory, and then a load from a must-aliased
 /// pointer of a different type, try to coerce the stored value to the loaded
@@ -44,7 +45,7 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
 ///
 /// If we can't do it, return null.
 Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
-                                      IRBuilderBase &IRB, const DataLayout &DL);
+                                      IRBuilderBase &IRB, Function *F);
 
 /// This function determines whether a value for the pointer LoadPtr can be
 /// extracted from the store at DepSI.
@@ -75,7 +76,7 @@ int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
 /// It inserts instructions to do so at InsertPt, and returns the extracted
 /// value.
 Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
-                       Instruction *InsertPt, const DataLayout &DL);
+                       Instruction *InsertPt, Function *F);
 // This is the same as getValueForLoad, except it performs no insertion.
 // It only allows constant inputs.
 Constant *getConstantValueForLoad(Constant *SrcVal, unsigned Offset,
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 21eb7f741d7c8..3f306bb52c12a 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -1096,7 +1096,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load,
   if (isSimpleValue()) {
     Res = getSimpleValue();
     if (Res->getType() != LoadTy) {
-      Res = getValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+      Res = getValueForLoad(Res, Offset, LoadTy, InsertPt, Load->getFunction());
 
       LLVM_DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset
                         << "  " << *getSimpleValue() << '\n'
@@ -1109,7 +1109,8 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *Load,
       Res = CoercedLoad;
       combineMetadataForCSE(CoercedLoad, Load, false);
     } else {
-      Res = getValueForLoad(CoercedLoad, Offset, LoadTy, InsertPt, DL);
+      Res = getValueForLoad(CoercedLoad, Offset, LoadTy, InsertPt,
+                            Load->getFunction());
       // We are adding a new user for this load, for which the original
       // metadata may not hold. Additionally, the new load may have a different
       // size and type, so their metadata cannot be combined in any
@@ -1291,7 +1292,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
 
       // If MD reported clobber, check it was nested.
       if (DepInfo.isClobber() &&
-          canCoerceMustAliasedValueToLoad(DepLoad, LoadType, DL)) {
+          canCoerceMustAliasedValueToLoad(DepLoad, LoadType,
+                                          DepLoad->getFunction())) {
         const auto ClobberOff = MD->getClobberOffset(DepLoad);
         // GVN has no deal with a negative offset.
         Offset = (ClobberOff == std::nullopt || *ClobberOff < 0)
@@ -1343,7 +1345,7 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
     // different types if we have to. If the stored value is convertable to
     // the loaded value, we can reuse it.
     if (!canCoerceMustAliasedValueToLoad(S->getValueOperand(), Load->getType(),
-                                         DL))
+                                         S->getFunction()))
       return std::nullopt;
 
     // Can't forward from non-atomic to atomic without violating memory model.
@@ -1357,7 +1359,8 @@ GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
     // If the types mismatch and we can't handle it, reject reuse of the load.
     // If the stored value is larger or equal to the loaded value, we can reuse
     // it.
-    if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(), DL))
+    if (!canCoerceMustAliasedValueToLoad(LD, Load->getType(),
+                                         LD->getFunction()))
       return std::nullopt;
 
     // Can't forward from non-atomic to atomic without violating memory model.
diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
index 7a61ab7416638..c1bce01239dcc 100644
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -15,30 +15,42 @@ static bool isFirstClassAggregateOrScalableType(Type *Ty) {
 
 /// Return true if coerceAvailableValueToLoadType will succeed.
 bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
-                                     const DataLayout &DL) {
+                                     Function *F) {
   Type *StoredTy = StoredVal->getType();
-
   if (StoredTy == LoadTy)
     return true;
 
+  const DataLayout &DL = F->getDataLayout();
+  TypeSize MinStoreSize = DL.getTypeSizeInBits(StoredTy);
+  TypeSize LoadSize = DL.getTypeSizeInBits(LoadTy);
   if (isa<ScalableVectorType>(StoredTy) && isa<ScalableVectorType>(LoadTy) &&
-      DL.getTypeSizeInBits(StoredTy) == DL.getTypeSizeInBits(LoadTy))
+      MinStoreSize == LoadSize)
     return true;
 
-  // If the loaded/stored value is a first class array/struct, or scalable type,
-  // don't try to transform them. We need to be able to bitcast to integer.
-  if (isFirstClassAggregateOrScalableType(LoadTy) ||
-      isFirstClassAggregateOrScalableType(StoredTy))
+  // If the loaded/stored value is a first class array/struct, don't try to
+  // transform them. We need to be able to bitcast to integer. For scalable
+  // vectors forwarded to fixed-sized vectors @llvm.vector.extract is used.
+  if (isa<ScalableVectorType>(StoredTy) && isa<FixedVectorType>(LoadTy)) {
+    if (StoredTy->getScalarType() != LoadTy->getScalarType())
+      return false;
+
+    // If it is known at compile-time that the VScale is larger than one,
+    // use that information to allow for wider loads.
+    const auto &Attrs = F->getAttributes().getFnAttrs();
+    unsigned MinVScale = Attrs.getVScaleRangeMin();
+    MinStoreSize =
+        TypeSize::getFixed(MinStoreSize.getKnownMinValue() * MinVScale);
+  } else if (isFirstClassAggregateOrScalableType(LoadTy) ||
+             isFirstClassAggregateOrScalableType(StoredTy)) {
    return false;
-
-  uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedValue();
+  }
 
   // The store size must be byte-aligned to support future type casts.
-  if (llvm::alignTo(StoreSize, 8) != StoreSize)
+  if (llvm::alignTo(MinStoreSize, 8) != MinStoreSize)
     return false;
 
   // The store has to be at least as big as the load.
-  if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedValue())
+  if (!TypeSize::isKnownGE(MinStoreSize, LoadSize))
     return false;
 
   bool StoredNI = DL.isNonIntegralPointerType(StoredTy->getScalarType());
@@ -57,11 +69,10 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
     return false;
   }
-
   // The implementation below uses inttoptr for vectors of unequal size; we
   // can't allow this for non integral pointers. We could teach it to extract
   // exact subvectors if desired.
-  if (StoredNI && StoreSize != DL.getTypeSizeInBits(LoadTy).getFixedValue())
+  if (StoredNI && (StoredTy->isScalableTy() || MinStoreSize != LoadSize))
     return false;
 
   if (StoredTy->isTargetExtTy() || LoadTy->isTargetExtTy())
     return false;
@@ -77,16 +88,24 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
 ///
 /// If we can't do it, return null.
 Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
-                                      IRBuilderBase &Helper,
-                                      const DataLayout &DL) {
-  assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+                                      IRBuilderBase &Helper, Function *F) {
+  assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, F) &&
          "precondition violation - materialization can't fail");
+  const DataLayout &DL = F->getDataLayout();
   if (auto *C = dyn_cast<Constant>(StoredVal))
     StoredVal = ConstantFoldConstant(C, DL);
 
   // If this is already the right type, just return it.
   Type *StoredValTy = StoredVal->getType();
 
+  // If this is a scalable vector forwarded to a fixed vector load, create
+  // a @llvm.vector.extract instead of bitcasts.
+  if (isa<ScalableVectorType>(StoredVal->getType()) &&
+      isa<FixedVectorType>(LoadedTy)) {
+    return Helper.CreateIntrinsic(LoadedTy, Intrinsic::vector_extract,
+                                  {StoredVal, Helper.getInt64(0)});
+  }
+
   TypeSize StoredValSize = DL.getTypeSizeInBits(StoredValTy);
   TypeSize LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
@@ -220,7 +239,7 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
   if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
     return -1;
 
-  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DL))
+  if (!canCoerceMustAliasedValueToLoad(StoredVal, LoadTy, DepSI->getFunction()))
     return -1;
 
   Value *StorePtr = DepSI->getPointerOperand();
@@ -235,11 +254,11 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
 /// the other load can feed into the second load.
 int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
                                   const DataLayout &DL) {
-  // Cannot handle reading from store of first-class aggregate yet.
-  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+  // Cannot handle reading from store of first-class aggregate or scalable type.
+  if (isFirstClassAggregateOrScalableType(DepLI->getType()))
     return -1;
 
-  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DL))
+  if (!canCoerceMustAliasedValueToLoad(DepLI, LoadTy, DepLI->getFunction()))
     return -1;
 
   Value *DepPtr = DepLI->getPointerOperand();
@@ -315,6 +334,16 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
     return SrcVal;
   }
 
+  // For the case of a scalable vector being forwarded to a fixed-sized load,
+  // only equal element types are allowed and a @llvm.vector.extract will be
+  // used instead of bitcasts.
+  if (isa<ScalableVectorType>(SrcVal->getType()) &&
+      isa<FixedVectorType>(LoadTy)) {
+    assert(Offset == 0 &&
+           SrcVal->getType()->getScalarType() == LoadTy->getScalarType());
+    return SrcVal;
+  }
+
   uint64_t StoreSize =
       (DL.getTypeSizeInBits(SrcVal->getType()).getFixedValue() + 7) / 8;
   uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedValue() + 7) / 8;
@@ -344,20 +373,24 @@ static Value *getStoreValueForLoadHelper(Value *SrcVal, unsigned Offset,
 }
 
 Value *getValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
-                       Instruction *InsertPt, const DataLayout &DL) {
+                       Instruction *InsertPt, Function *F) {
+  const DataLayout &DL = F->getDataLayout();
 #ifndef NDEBUG
-  TypeSize SrcValSize = DL.getTypeStoreSize(SrcVal->getType());
+  TypeSize MinSrcValSize = DL.getTypeStoreSize(SrcVal->getType());
   TypeSize LoadSize = DL.getTypeStoreSize(LoadTy);
-  assert(SrcValSize.isScalable() == LoadSize.isScalable());
-  assert((SrcValSize.isScalable() || Offset + LoadSize <= SrcValSize) &&
+  if (MinSrcValSize.isScalable() && !LoadSize.isScalable())
+    MinSrcValSize =
+        TypeSize::getFixed(MinSrcValSize.getKnownMinValue() *
+                           F->getAttributes().getFnAttrs().getVScaleRangeMin());
+  assert((MinSrcValSize.isScalable() || Offset + LoadSize <= MinSrcValSize) &&
          "Expected Offset + LoadSize <= SrcValSize");
-  assert(
-      (!SrcValSize.isScalable() || (Offset == 0 && LoadSize == SrcValSize)) &&
-      "Expected scalable type sizes to match");
+  assert((!MinSrcValSize.isScalable() ||
+          (Offset == 0 && TypeSize::isKnownLE(LoadSize, MinSrcValSize))) &&
+         "Expected offset of zero and LoadSize <= SrcValSize");
 #endif
   IRBuilder<> Builder(InsertPt);
   SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
-  return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL);
+  return coerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, F);
 }
 
 Constant *getConstantValueForLoad(Constant *SrcVal, unsigned Offset,
@@ -408,7 +441,8 @@ Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
       ++NumBytesSet;
     }
 
-    return coerceAvailableValueToLoadType(Val, LoadTy, Builder, DL);
+    return coerceAvailableValueToLoadType(Val, LoadTy, Builder,
+                                          InsertPt->getFunction());
   }
 
   // Otherwise, this is a memcpy/memmove from a constant global.
diff --git a/llvm/test/Transforms/GVN/vscale.ll b/llvm/test/Transforms/GVN/vscale.ll
index 67cbfc2f05ef8..d7b07b9891c41 100644
--- a/llvm/test/Transforms/GVN/vscale.ll
+++ b/llvm/test/Transforms/GVN/vscale.ll
@@ -641,3 +641,127 @@ entry:
   call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %ref.tmp)
   ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %15
 }
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+;
+entry:
+  %retval = alloca { <16 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+; Here, only the lower bound for the vscale is known, but this is enough to allow forwarding to a load of 16 elements.
+define <vscale x 4 x float> @scalable_store_to_fixed_load_only_lower_bound(<vscale x 4 x float> %a) vscale_range(4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_only_lower_bound(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 4 x float> }, align 16
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    ret <vscale x 4 x float> [[A]]
+;
+entry:
+  %retval = alloca { <vscale x 4 x float> }
+  store <vscale x 4 x float> %a, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_with_offset(<vscale x 4 x float> %a) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_with_offset(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca { <32 x float> }, align 128
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[GEP]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP0]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %ptr = alloca { <32 x float> }
+  store <vscale x 4 x float> %a, ptr %ptr
+  %gep = getelementptr inbounds i8, ptr %ptr, i64 8
+  %1 = load <16 x float>, ptr %gep
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_unknown_vscale(<vscale x 4 x float> %.coerce) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_unknown_vscale(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <16 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_size_missmatch(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_size_missmatch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <32 x float> }, align 128
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x float>, ptr [[RETVAL]], align 128
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <32 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <32 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x i32> @scalable_store_to_fixed_load_different_types(<vscale x 4 x float> %a) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_different_types(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[PTR]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TMP0]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
+;
+entry:
+  %ptr = alloca { <16 x float> }
+  store <vscale x 4 x float> %a, ptr %ptr
+  %1 = load <16 x i32>, ptr %ptr
+  %cast.scalable = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> %1, i64 0)
+  ret <vscale x 4 x i32> %cast.scalable
+}
+
+; This function does not have a fixed vscale, but the loaded vector is still known
+; to be smaller or equal in size compared to the stored vector.
+define <4 x float> @scalable_store_to_small_fixed_load(<vscale x 4 x float> %a) {
+; CHECK-LABEL: @scalable_store_to_small_fixed_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca <vscale x 4 x float>, align 16
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> [[A]], i64 0)
+; CHECK-NEXT:    ret <4 x float> [[TMP0]]
+;
+entry:
+  %ptr = alloca <vscale x 4 x float>
+  store <vscale x 4 x float> %a, ptr %ptr
+  %1 = load <4 x float>, ptr %ptr
+  ret <4 x float> %1
+}
diff --git a/llvm/test/Transforms/NewGVN/vscale.ll b/llvm/test/Transforms/NewGVN/vscale.ll
index 500d58baed1a2..702117213f6eb 100644
--- a/llvm/test/Transforms/NewGVN/vscale.ll
+++ b/llvm/test/Transforms/NewGVN/vscale.ll
@@ -646,3 +646,131 @@ entry:
   call void @llvm.lifetime.end.p0(i64 -1, ptr nonnull %ref.tmp)
   ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %15
 }
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <16 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+; Here, only the lower bound for the vscale is known, but this is enough to allow forwarding to a load of 16 elements.
+define <vscale x 4 x float> @scalable_store_to_fixed_load_only_lower_bound(<vscale x 4 x float> %a) vscale_range(4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_only_lower_bound(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <vscale x 4 x float> }, align 16
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP0]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <vscale x 4 x float> }
+  store <vscale x 4 x float> %a, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_with_offset(<vscale x 4 x float> %a) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_with_offset(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca { <32 x float> }, align 128
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x float>, ptr [[GEP]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP0]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %ptr = alloca { <32 x float> }
+  store <vscale x 4 x float> %a, ptr %ptr
+  %gep = getelementptr inbounds i8, ptr %ptr, i64 8
+  %1 = load <16 x float>, ptr %gep
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_unknown_vscale(<vscale x 4 x float> %.coerce) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_unknown_vscale(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <16 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @scalable_store_to_fixed_load_size_missmatch(<vscale x 4 x float> %.coerce) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_size_missmatch(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca { <32 x float> }, align 128
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE:%.*]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x float>, ptr [[RETVAL]], align 128
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca { <32 x float> }
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <32 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v32f32(<vscale x 4 x float> poison, <32 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x i32> @scalable_store_to_fixed_load_different_types(<vscale x 4 x float> %a) vscale_range(4,4) {
+; CHECK-LABEL: @scalable_store_to_fixed_load_different_types(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca { <16 x float> }, align 64
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[PTR]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> [[TMP0]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[CAST_SCALABLE]]
+;
+entry:
+  %ptr = alloca { <16 x float> }
+  store <vscale x 4 x float> %a, ptr %ptr
+  %1 = load <16 x i32>, ptr %ptr
+  %cast.scalable = tail call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v16i32(<vscale x 4 x i32> poison, <16 x i32> %1, i64 0)
+  ret <vscale x 4 x i32> %cast.scalable
+}
+
+; This function does not have a fixed vscale, but the loaded vector is still known
+; to be smaller or equal in size compared to the stored vector.
+define <4 x float> @scalable_store_to_small_fixed_load(<vscale x 4 x float> %a) {
+; CHECK-LABEL: @scalable_store_to_small_fixed_load(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca <vscale x 4 x float>, align 16
+; CHECK-NEXT:    store <vscale x 4 x float> [[A:%.*]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[PTR]], align 16
+; CHECK-NEXT:    ret <4 x float> [[TMP0]]
+;
+entry:
+  %ptr = alloca <vscale x 4 x float>
+  store <vscale x 4 x float> %a, ptr %ptr
+  %1 = load <4 x float>, ptr %ptr
+  ret <4 x float> %1
+}
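
Illustrative example (not part of the patch): a minimal sketch of the forwarding this change enables, adapted from the @scalable_store_to_fixed_load_only_lower_bound test above; the names @sketch and %buf are made up for the example. With vscale_range(4), the <vscale x 4 x float> store is known to span at least 512 bits, so canCoerceMustAliasedValueToLoad now accepts the scalable-store/fixed-load pair and the load is materialized with @llvm.vector.extract instead of bitcasts.

; Input to `opt -passes=gvn`:
define <vscale x 4 x float> @sketch(<vscale x 4 x float> %v) vscale_range(4) {
entry:
  %buf = alloca { <vscale x 4 x float> }
  store <vscale x 4 x float> %v, ptr %buf
  ; <16 x float> is 512 bits, which is no larger than the minimum store size of
  ; vscale_range(4) x 128 bits, so this load can be satisfied from the store.
  %fixed = load <16 x float>, ptr %buf
  %widened = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %fixed, i64 0)
  ret <vscale x 4 x float> %widened
}

declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float>, <16 x float>, i64)

; Expected output (mirroring the CHECK lines of that test): GVN forwards the
; stored value via @llvm.vector.extract, the extract/insert round-trip folds
; away, and the function simply returns %v, leaving only the now-dead alloca
; and store behind.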