From 091c45d903128e20bd087498f5518f198d2a656e Mon Sep 17 00:00:00 2001
From: Lou Knauer
Date: Mon, 27 Jan 2025 18:53:29 +0100
Subject: [PATCH 1/2] [InstCombine] tests for simple store-to-load forwarding between fixed/scalable vectors

---
 .../InstCombine/store-load-vector-insert.ll   | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/store-load-vector-insert.ll

diff --git a/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
new file mode 100644
index 0000000000000..1457b8c4391e2
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+%struct.svfloat32_wrapped_t = type { <16 x float> }
+
+define <vscale x 4 x float> @store_to_vector_load_different_type(<vscale x 4 x float> %.coerce) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @store_to_vector_load_different_type(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca %struct.svfloat32_wrapped_t
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @vscale_not_fixed(<vscale x 4 x float> %.coerce) #1 {
+; CHECK-LABEL: define <vscale x 4 x float> @vscale_not_fixed(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca %struct.svfloat32_wrapped_t
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @sizes_do_not_match(<vscale x 4 x float> %.coerce) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @sizes_do_not_match(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[RETVAL]], align 32
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> poison, <8 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca %struct.svfloat32_wrapped_t
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <8 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> poison, <8 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float>, <16 x float>, i64 immarg)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float>, <8 x float>, i64 immarg)
+
+attributes #0 = { vscale_range(4,4) }
+attributes #1 = { vscale_range(1,16) }

From 28e62bf79e12a57d32e9523ff17326f698552bd0 Mon Sep 17 00:00:00 2001
From: Lou Knauer
Date: Mon, 27 Jan 2025 18:55:36 +0100
Subject: [PATCH 2/2] [InstCombine] Simple store-to-load forwarding between fixed/scalable vectors

When storing a scalable vector and the vscale is a known compile-time
constant, do basic store-to-load forwarding through @llvm.vector.insert
calls, even if the loaded vector is fixed-sized instead of scalable.

The @llvm.vector.insert is matched instead of the load itself because it
is invalid to create a temporary insert of a scalable vector (the stored
value) into a fixed-sized vector (the load type).

The use case is shown in this [godbolt link](https://godbolt.org/z/KT3sMrMbd),
which shows that clang generates IR matching this pattern when the
"arm_sve_vector_bits" attribute is used:

```
typedef svfloat32_t svfloat32_fixed_t
    __attribute__((arm_sve_vector_bits(512)));

struct svfloat32_wrapped_t {
  svfloat32_fixed_t v;
};

static inline svfloat32_wrapped_t add(svfloat32_wrapped_t a,
                                      svfloat32_wrapped_t b) {
  return {svadd_f32_x(svptrue_b32(), a.v, b.v)};
}

svfloat32_wrapped_t foo(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  // The IR pattern this patch matches is generated for this return:
  return add(a, b);
}
```
---
 llvm/include/llvm/Analysis/Loads.h            |  4 +++
 llvm/lib/Analysis/Loads.cpp                   | 33 +++++++++++++++----
 .../InstCombine/InstCombineCalls.cpp          | 33 ++++++++++++++-----
 .../InstCombine/store-load-vector-insert.ll   |  6 +---
 4 files changed, 57 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 639070c07897b..0cadbc5fede9b 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -154,8 +154,12 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
 /// FindAvailableLoadedValue() for the case where we are not interested in
 /// finding the closest clobbering instruction if no available load is found.
 /// This overload cannot be used to scan across multiple blocks.
+/// If \p IsVectorKindChange is not nullptr, it is an out parameter that is
+/// set to true if a value was found but it is a scalable vector instead of
+/// the requested fixed-sized one (or the other way around).
 Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
                                 bool *IsLoadCSE,
+                                bool *IsVectorKindChange = nullptr,
                                 unsigned MaxInstsToScan = DefMaxInstsToScan);
 
 /// Scan backwards to see if we have the value of the given pointer available
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 691d7e4a3edcf..e4bd59fbf2d30 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -538,7 +538,8 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr,
 
 static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
                                     Type *AccessTy, bool AtLeastAtomic,
-                                    const DataLayout &DL, bool *IsLoadCSE) {
+                                    const DataLayout &DL, bool *IsLoadCSE,
+                                    bool *IsVectorKindChange) {
   // If this is a load of Ptr, the loaded value is available.
   // (This is true even if the load is volatile or atomic, although
   // those cases are unlikely.)
@@ -584,6 +585,25 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
     if (TypeSize::isKnownLE(LoadSize, StoreSize))
       if (auto *C = dyn_cast<Constant>(Val))
         return ConstantFoldLoadFromConst(C, AccessTy, DL);
+
+    if (IsVectorKindChange && Val->getType()->isVectorTy() &&
+        AccessTy->isVectorTy()) {
+      auto Attrs = Inst->getFunction()->getAttributes().getFnAttrs();
+      unsigned VScale = Attrs.getVScaleRangeMin();
+      if (Attrs.getVScaleRangeMax() != VScale)
+        return nullptr;
+
+      unsigned FixedStoreSize =
+          (StoreSize.isFixed() ? StoreSize : StoreSize * VScale)
+              .getKnownMinValue();
+      unsigned FixedLoadSize =
+          (LoadSize.isFixed() ? LoadSize : LoadSize * VScale)
+              .getKnownMinValue();
+      if (FixedStoreSize == FixedLoadSize) {
+        *IsVectorKindChange = true;
+        return Val;
+      }
+    }
   }
 
   if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -655,8 +675,8 @@ Value *llvm::findAvailablePtrLoadStore(
 
     --ScanFrom;
 
-    if (Value *Available = getAvailableLoadStore(Inst, StrippedPtr, AccessTy,
-                                                 AtLeastAtomic, DL, IsLoadCSE))
+    if (Value *Available = getAvailableLoadStore(
+            Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL, IsLoadCSE, nullptr))
       return Available;
 
     // Try to get the store size for the type.
@@ -711,7 +731,7 @@ Value *llvm::findAvailablePtrLoadStore(
 }
 
 Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
-                                      bool *IsLoadCSE,
+                                      bool *IsLoadCSE, bool *IsVectorKindChange,
                                       unsigned MaxInstsToScan) {
   const DataLayout &DL = Load->getDataLayout();
   Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
@@ -734,8 +754,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
     if (MaxInstsToScan-- == 0)
      return nullptr;
 
-    Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
-                                      AtLeastAtomic, DL, IsLoadCSE);
+    Available =
+        getAvailableLoadStore(&Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL,
+                              IsLoadCSE, IsVectorKindChange);
     if (Available)
       break;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index f748f78524e0d..f463fe3e7d504 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3389,17 +3389,34 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     Value *Vec = II->getArgOperand(0);
     Value *SubVec = II->getArgOperand(1);
     Value *Idx = II->getArgOperand(2);
-    auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
-    auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
-    auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
+    auto *DstTy = cast<VectorType>(II->getType());
+    auto *VecTy = cast<VectorType>(Vec->getType());
+    auto *SubVecTy = cast<VectorType>(SubVec->getType());
+    unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
+
+    // Try store-to-load forwarding where the stored value has the same
+    // type as this intrinsic's result and the loaded value is the inserted
+    // vector. This has to be done here because a temporary insert of a
+    // scalable vector (the available value) into a fixed-sized one (the
+    // second operand of this intrinsic) cannot be created.
+    if (auto *LI = dyn_cast<LoadInst>(SubVec);
+        LI && IdxN == 0 && DstTy->isScalableTy() && !SubVecTy->isScalableTy()) {
+      bool IsVectorKindChange = false;
+      BatchAAResults BatchAA(*AA);
+      if (Value *AvailVal = FindAvailableLoadedValue(LI, BatchAA, nullptr,
+                                                     &IsVectorKindChange);
+          AvailVal && IsVectorKindChange && AvailVal->getType() == DstTy) {
+        return replaceInstUsesWith(CI, AvailVal);
+      }
+    }
 
     // Only canonicalize if the destination vector, Vec, and SubVec are all
     // fixed vectors.
-    if (DstTy && VecTy && SubVecTy) {
-      unsigned DstNumElts = DstTy->getNumElements();
-      unsigned VecNumElts = VecTy->getNumElements();
-      unsigned SubVecNumElts = SubVecTy->getNumElements();
-      unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
+    if (!DstTy->isScalableTy() && !VecTy->isScalableTy() &&
+        !SubVecTy->isScalableTy()) {
+      unsigned DstNumElts = DstTy->getElementCount().getFixedValue();
+      unsigned VecNumElts = VecTy->getElementCount().getFixedValue();
+      unsigned SubVecNumElts = SubVecTy->getElementCount().getFixedValue();
 
       // An insert that entirely overwrites Vec with SubVec is a nop.
       if (VecNumElts == SubVecNumElts)
diff --git a/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
index 1457b8c4391e2..73685fe8c3762 100644
--- a/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
+++ b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
@@ -7,12 +7,8 @@ define <vscale x 4 x float> @store_to_vector_load_different_type(<vscale x 4 x float> %.coerce) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @store_to_vector_load_different_type(
 ; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
 ; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
-; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
-; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
-; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 ;
 entry:
   %retval = alloca %struct.svfloat32_wrapped_t
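
For reference, a minimal standalone IR sketch of the pattern this series targets, distilled from the first test case above. The function name @example is illustrative, and the expected output assumes the fixed vscale_range(4,4) attribute (512-bit vectors, matching attribute #0 in the tests), so the scalable store covers exactly the fixed-width load:

```
; Before -passes=instcombine: the scalable store reaches the fixed-width
; load only through the @llvm.vector.insert call.
define <vscale x 4 x float> @example(<vscale x 4 x float> %v) vscale_range(4,4) {
  %tmp = alloca <16 x float>, align 64
  store <vscale x 4 x float> %v, ptr %tmp, align 16
  %fixed = load <16 x float>, ptr %tmp, align 64
  %res = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %fixed, i64 0)
  ret <vscale x 4 x float> %res
}

declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float>, <16 x float>, i64 immarg)

; Expected after `opt -passes=instcombine` with this patch applied, as in the
; updated CHECK lines above: the insert is replaced by the stored value and
; the now-dead alloca, store, and load are removed.
;   define <vscale x 4 x float> @example(<vscale x 4 x float> %v) vscale_range(4,4) {
;     ret <vscale x 4 x float> %v
;   }
```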