diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 639070c07897b..0cadbc5fede9b 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -154,8 +154,12 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
 /// FindAvailableLoadedValue() for the case where we are not interested in
 /// finding the closest clobbering instruction if no available load is found.
 /// This overload cannot be used to scan across multiple blocks.
+/// If \p IsVectorKindChange is not nullptr, it is an out parameter that is set
+/// to true if a value was found, but it is a scalable vector instead of the
+/// requested fixed-width one (or the other way round).
 Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
                                 bool *IsLoadCSE,
+                                bool *IsVectorKindChange = nullptr,
                                 unsigned MaxInstsToScan = DefMaxInstsToScan);
 
 /// Scan backwards to see if we have the value of the given pointer available
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 691d7e4a3edcf..e4bd59fbf2d30 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -538,7 +538,8 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr,
 
 static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
                                     Type *AccessTy, bool AtLeastAtomic,
-                                    const DataLayout &DL, bool *IsLoadCSE) {
+                                    const DataLayout &DL, bool *IsLoadCSE,
+                                    bool *IsVectorKindChange) {
   // If this is a load of Ptr, the loaded value is available.
   // (This is true even if the load is volatile or atomic, although
   // those cases are unlikely.)
@@ -584,6 +585,25 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
     if (TypeSize::isKnownLE(LoadSize, StoreSize))
       if (auto *C = dyn_cast<Constant>(Val))
         return ConstantFoldLoadFromConst(C, AccessTy, DL);
+
+    if (IsVectorKindChange && Val->getType()->isVectorTy() &&
+        AccessTy->isVectorTy()) {
+      auto Attrs = Inst->getFunction()->getAttributes().getFnAttrs();
+      unsigned VScale = Attrs.getVScaleRangeMin();
+      if (Attrs.getVScaleRangeMax() != VScale)
+        return nullptr;
+
+      unsigned FixedStoreSize =
+          (StoreSize.isFixed() ? StoreSize : StoreSize * VScale)
+              .getKnownMinValue();
+      unsigned FixedLoadSize =
+          (LoadSize.isFixed() ? LoadSize : LoadSize * VScale)
+              .getKnownMinValue();
+      if (FixedStoreSize == FixedLoadSize) {
+        *IsVectorKindChange = true;
+        return Val;
+      }
+    }
   }
 
   if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -655,8 +675,8 @@ Value *llvm::findAvailablePtrLoadStore(
 
     --ScanFrom;
 
-    if (Value *Available = getAvailableLoadStore(Inst, StrippedPtr, AccessTy,
-                                                 AtLeastAtomic, DL, IsLoadCSE))
+    if (Value *Available = getAvailableLoadStore(
+            Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL, IsLoadCSE, nullptr))
       return Available;
 
     // Try to get the store size for the type.
@@ -711,7 +731,7 @@ Value *llvm::findAvailablePtrLoadStore(
 }
 
 Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
-                                      bool *IsLoadCSE,
+                                      bool *IsLoadCSE, bool *IsVectorKindChange,
                                       unsigned MaxInstsToScan) {
   const DataLayout &DL = Load->getDataLayout();
   Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
@@ -734,8 +754,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
     if (MaxInstsToScan-- == 0)
      return nullptr;
 
-    Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
-                                      AtLeastAtomic, DL, IsLoadCSE);
+    Available =
+        getAvailableLoadStore(&Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL,
+                              IsLoadCSE, IsVectorKindChange);
 
     if (Available)
       break;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index f748f78524e0d..f463fe3e7d504 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3389,17 +3389,34 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     Value *Vec = II->getArgOperand(0);
     Value *SubVec = II->getArgOperand(1);
     Value *Idx = II->getArgOperand(2);
-    auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
-    auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
-    auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
+    auto *DstTy = cast<VectorType>(II->getType());
+    auto *VecTy = cast<VectorType>(Vec->getType());
+    auto *SubVecTy = cast<VectorType>(SubVec->getType());
+    unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
+
+    // Try store-to-load forwarding where the stored value has the same
+    // type as this intrinsic's result, and the loaded value is the inserted
+    // vector. This has to be done here because a temporary insert of
+    // a scalable vector (the available value) into a fixed-width one
+    // (the second operand of this intrinsic) cannot be created.
+    if (auto *LI = dyn_cast<LoadInst>(SubVec);
+        LI && IdxN == 0 && DstTy->isScalableTy() && !SubVecTy->isScalableTy()) {
+      bool IsVectorKindChange = false;
+      BatchAAResults BatchAA(*AA);
+      if (Value *AvailVal = FindAvailableLoadedValue(LI, BatchAA, nullptr,
+                                                     &IsVectorKindChange);
+          AvailVal && IsVectorKindChange && AvailVal->getType() == DstTy) {
+        return replaceInstUsesWith(CI, AvailVal);
+      }
+    }
 
     // Only canonicalize if the destination vector, Vec, and SubVec are all
     // fixed vectors.
-    if (DstTy && VecTy && SubVecTy) {
-      unsigned DstNumElts = DstTy->getNumElements();
-      unsigned VecNumElts = VecTy->getNumElements();
-      unsigned SubVecNumElts = SubVecTy->getNumElements();
-      unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
+    if (!DstTy->isScalableTy() && !VecTy->isScalableTy() &&
+        !SubVecTy->isScalableTy()) {
+      unsigned DstNumElts = DstTy->getElementCount().getFixedValue();
+      unsigned VecNumElts = VecTy->getElementCount().getFixedValue();
+      unsigned SubVecNumElts = SubVecTy->getElementCount().getFixedValue();
 
       // An insert that entirely overwrites Vec with SubVec is a nop.
       if (VecNumElts == SubVecNumElts)
diff --git a/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
new file mode 100644
index 0000000000000..73685fe8c3762
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+%struct.svfloat32_wrapped_t = type { <16 x float> }
+
+define <vscale x 4 x float> @store_to_vector_load_different_type(<vscale x 4 x float> %.coerce) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @store_to_vector_load_different_type(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
+;
+entry:
+  %retval = alloca %struct.svfloat32_wrapped_t
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @vscale_not_fixed(<vscale x 4 x float> %.coerce) #1 {
+; CHECK-LABEL: define <vscale x 4 x float> @vscale_not_fixed(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca %struct.svfloat32_wrapped_t
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @sizes_do_not_match(<vscale x 4 x float> %.coerce) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @sizes_do_not_match(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[RETVAL]], align 32
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> poison, <8 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca %struct.svfloat32_wrapped_t
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <8 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> poison, <8 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float>, <16 x float>, i64 immarg)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float>, <8 x float>, i64 immarg)
+
+attributes #0 = { vscale_range(4,4) }
+attributes #1 = { vscale_range(1,16) }
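
A quick way to sanity-check the size rule added to getAvailableLoadStore is to model it outside of LLVM. The sketch below is illustrative only: the Size struct and the canForward helper are not part of this patch or of the LLVM API; they just reproduce the arithmetic that compares the effective store and load sizes once vscale is pinned by an exact vscale_range(N,N), using the vector types from the tests above.

// Standalone model of the new size check (plain C++, no LLVM headers).
// "MinBits" is the known minimum size in bits; "Scalable" marks
// <vscale x N x T> types whose real size is MinBits * vscale.
#include <cstdio>
#include <optional>

struct Size {
  unsigned MinBits;
  bool Scalable;
};

// Mirrors the bail-out and comparison in getAvailableLoadStore: forwarding is
// only considered when vscale_range pins vscale to a single value and the
// effective store and load sizes are equal.
static bool canForward(Size Store, Size Load, unsigned VScaleMin,
                       std::optional<unsigned> VScaleMax) {
  if (!VScaleMax || *VScaleMax != VScaleMin)
    return false;
  unsigned StoreBits = Store.Scalable ? Store.MinBits * VScaleMin : Store.MinBits;
  unsigned LoadBits = Load.Scalable ? Load.MinBits * VScaleMin : Load.MinBits;
  return StoreBits == LoadBits;
}

int main() {
  Size NxV4F32 = {128, true};  // <vscale x 4 x float>
  Size V16F32 = {512, false};  // <16 x float>
  Size V8F32 = {256, false};   // <8 x float>

  // vscale_range(4,4): 128 * 4 == 512, so forwarding is possible
  // (@store_to_vector_load_different_type).
  std::printf("%d\n", canForward(NxV4F32, V16F32, 4, 4));  // 1
  // vscale_range(1,16): vscale is not a single known value (@vscale_not_fixed).
  std::printf("%d\n", canForward(NxV4F32, V16F32, 1, 16)); // 0
  // vscale_range(4,4) but 512 != 256 (@sizes_do_not_match).
  std::printf("%d\n", canForward(NxV4F32, V8F32, 4, 4));   // 0
  return 0;
}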