Skip to content

Commit 28e62bf

Browse files
committed
[InstCombine] Simple store-to-load forwarding between fixed/scalable vectors
When storing a scalable vector and the VScale is a compile-time known constant, do basic store-to-load forwarding through @llvm.vector.insert calls, even if the loaded vector is fixed-sized instead of scalable. The @llvm.vector.insert is matched instead of the load itself because it is invalid to create a temporary insert of a scalable vector (the stored value) into a fixed-sized vector (the load type). The use case is shown in this [godbolt link](https://godbolt.org/z/KT3sMrMbd), which shows that clang generates IR that matches this pattern when the "arm_sve_vector_bits" attribute is used: ``` typedef svfloat32_t svfloat32_fixed_t __attribute__((arm_sve_vector_bits(512))); struct svfloat32_wrapped_t { svfloat32_fixed_t v; }; static inline svfloat32_wrapped_t add(svfloat32_wrapped_t a, svfloat32_wrapped_t b) { return {svadd_f32_x(svptrue_b32(), a.v, b.v)}; } svfloat32_wrapped_t foo(svfloat32_wrapped_t a, svfloat32_wrapped_t b) { // The IR pattern this patch matches is generated for this return: return add(a, b); } ```
1 parent 091c45d commit 28e62bf

File tree

4 files changed

+57
-19
lines changed

4 files changed

+57
-19
lines changed

llvm/include/llvm/Analysis/Loads.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,12 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
154154
/// FindAvailableLoadedValue() for the case where we are not interested in
155155
/// finding the closest clobbering instruction if no available load is found.
156156
/// This overload cannot be used to scan across multiple blocks.
157+
/// If \p IsVectorKindChange is not nullptr, this is an out parameter that is true
158+
/// if a value was found, but it is a scalable vector instead of a requested
159+
/// fixed-sized one (or the other way round).
157160
Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
158161
bool *IsLoadCSE,
162+
bool *IsVectorKindChange = nullptr,
159163
unsigned MaxInstsToScan = DefMaxInstsToScan);
160164

161165
/// Scan backwards to see if we have the value of the given pointer available

llvm/lib/Analysis/Loads.cpp

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,8 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr,
538538

539539
static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
540540
Type *AccessTy, bool AtLeastAtomic,
541-
const DataLayout &DL, bool *IsLoadCSE) {
541+
const DataLayout &DL, bool *IsLoadCSE,
542+
bool *IsVectorKindChange) {
542543
// If this is a load of Ptr, the loaded value is available.
543544
// (This is true even if the load is volatile or atomic, although
544545
// those cases are unlikely.)
@@ -584,6 +585,25 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
584585
if (TypeSize::isKnownLE(LoadSize, StoreSize))
585586
if (auto *C = dyn_cast<Constant>(Val))
586587
return ConstantFoldLoadFromConst(C, AccessTy, DL);
588+
589+
if (IsVectorKindChange && Val->getType()->isVectorTy() &&
590+
AccessTy->isVectorTy()) {
591+
auto Attrs = Inst->getFunction()->getAttributes().getFnAttrs();
592+
unsigned VScale = Attrs.getVScaleRangeMin();
593+
if (Attrs.getVScaleRangeMax() != VScale)
594+
return nullptr;
595+
596+
unsigned FixedStoreSize =
597+
(StoreSize.isFixed() ? StoreSize : StoreSize * VScale)
598+
.getKnownMinValue();
599+
unsigned FixedLoadSize =
600+
(LoadSize.isFixed() ? LoadSize : LoadSize * VScale)
601+
.getKnownMinValue();
602+
if (FixedStoreSize == FixedLoadSize) {
603+
*IsVectorKindChange = true;
604+
return Val;
605+
}
606+
}
587607
}
588608

589609
if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -655,8 +675,8 @@ Value *llvm::findAvailablePtrLoadStore(
655675

656676
--ScanFrom;
657677

658-
if (Value *Available = getAvailableLoadStore(Inst, StrippedPtr, AccessTy,
659-
AtLeastAtomic, DL, IsLoadCSE))
678+
if (Value *Available = getAvailableLoadStore(
679+
Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL, IsLoadCSE, nullptr))
660680
return Available;
661681

662682
// Try to get the store size for the type.
@@ -711,7 +731,7 @@ Value *llvm::findAvailablePtrLoadStore(
711731
}
712732

713733
Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
714-
bool *IsLoadCSE,
734+
bool *IsLoadCSE, bool *IsVectorKindChange,
715735
unsigned MaxInstsToScan) {
716736
const DataLayout &DL = Load->getDataLayout();
717737
Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
@@ -734,8 +754,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
734754
if (MaxInstsToScan-- == 0)
735755
return nullptr;
736756

737-
Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
738-
AtLeastAtomic, DL, IsLoadCSE);
757+
Available =
758+
getAvailableLoadStore(&Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL,
759+
IsLoadCSE, IsVectorKindChange);
739760
if (Available)
740761
break;
741762

llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3389,17 +3389,34 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
33893389
Value *Vec = II->getArgOperand(0);
33903390
Value *SubVec = II->getArgOperand(1);
33913391
Value *Idx = II->getArgOperand(2);
3392-
auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
3393-
auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
3394-
auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
3392+
auto *DstTy = cast<VectorType>(II->getType());
3393+
auto *VecTy = cast<VectorType>(Vec->getType());
3394+
auto *SubVecTy = cast<VectorType>(SubVec->getType());
3395+
unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
3396+
3397+
// Try store-to-load forwarding where the stored value has the same
3398+
// type as this intrinsic, and the loaded value is the inserted
3399+
// vector. This has to be done here because a temporary insert of
3400+
// a scalable vector (the available value) into a fixed-sized one
3401+
// (the second operand of this intrinsic) cannot be created.
3402+
if (auto *LI = dyn_cast<LoadInst>(SubVec);
3403+
LI && IdxN == 0 && DstTy->isScalableTy() && !SubVecTy->isScalableTy()) {
3404+
bool IsVectorKindChange = false;
3405+
BatchAAResults BatchAA(*AA);
3406+
if (Value *AvilVal = FindAvailableLoadedValue(LI, BatchAA, nullptr,
3407+
&IsVectorKindChange);
3408+
AvilVal && IsVectorKindChange && AvilVal->getType() == DstTy) {
3409+
return replaceInstUsesWith(CI, AvilVal);
3410+
}
3411+
}
33953412

33963413
// Only canonicalize if the destination vector, Vec, and SubVec are all
33973414
// fixed vectors.
3398-
if (DstTy && VecTy && SubVecTy) {
3399-
unsigned DstNumElts = DstTy->getNumElements();
3400-
unsigned VecNumElts = VecTy->getNumElements();
3401-
unsigned SubVecNumElts = SubVecTy->getNumElements();
3402-
unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
3415+
if (!DstTy->isScalableTy() && !VecTy->isScalableTy() &&
3416+
!SubVecTy->isScalableTy()) {
3417+
unsigned DstNumElts = DstTy->getElementCount().getFixedValue();
3418+
unsigned VecNumElts = VecTy->getElementCount().getFixedValue();
3419+
unsigned SubVecNumElts = SubVecTy->getElementCount().getFixedValue();
34033420

34043421
// An insert that entirely overwrites Vec with SubVec is a nop.
34053422
if (VecNumElts == SubVecNumElts)

llvm/test/Transforms/InstCombine/store-load-vector-insert.ll

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,8 @@ define <vscale x 4 x float> @store_to_vector_load_different_type(<vscale x 4 x f
77
; CHECK-LABEL: define <vscale x 4 x float> @store_to_vector_load_different_type(
88
; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0:[0-9]+]] {
99
; CHECK-NEXT: [[ENTRY:.*:]]
10-
; CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
1110
; CHECK-NEXT: [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
12-
; CHECK-NEXT: store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
13-
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
14-
; CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
15-
; CHECK-NEXT: ret <vscale x 4 x float> [[CAST_SCALABLE]]
11+
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
1612
;
1713
entry:
1814
%retval = alloca %struct.svfloat32_wrapped_t

0 commit comments

Comments
 (0)