From 091c45d903128e20bd087498f5518f198d2a656e Mon Sep 17 00:00:00 2001
From: Lou Knauer
Date: Mon, 27 Jan 2025 18:53:29 +0100
Subject: [PATCH 1/2] [InstCombine] tests for simple store-to-load forwarding between fixed/scalable vectors

---
 .../InstCombine/store-load-vector-insert.ll   | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 llvm/test/Transforms/InstCombine/store-load-vector-insert.ll

diff --git a/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
new file mode 100644
index 0000000000000..1457b8c4391e2
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=instcombine < %s | FileCheck %s
+
+%struct.svfloat32_wrapped_t = type { <16 x float> }
+
+define <vscale x 4 x float> @store_to_vector_load_different_type(<vscale x 4 x float> %.coerce) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @store_to_vector_load_different_type(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca %struct.svfloat32_wrapped_t
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @vscale_not_fixed(<vscale x 4 x float> %.coerce) #1 {
+; CHECK-LABEL: define <vscale x 4 x float> @vscale_not_fixed(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca %struct.svfloat32_wrapped_t
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <16 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+define <vscale x 4 x float> @sizes_do_not_match(<vscale x 4 x float> %.coerce) #0 {
+; CHECK-LABEL: define <vscale x 4 x float> @sizes_do_not_match(
+; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
+; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, ptr [[RETVAL]], align 32
+; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> poison, <8 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+;
+entry:
+  %retval = alloca %struct.svfloat32_wrapped_t
+  %0 = fadd <vscale x 4 x float> %.coerce, %.coerce
+  store <vscale x 4 x float> %0, ptr %retval
+  %1 = load <8 x float>, ptr %retval
+  %cast.scalable = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float> poison, <8 x float> %1, i64 0)
+  ret <vscale x 4 x float> %cast.scalable
+}
+
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float>, <16 x float>, i64 immarg)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v8f32(<vscale x 4 x float>, <8 x float>, i64 immarg)
+
+attributes #0 = { vscale_range(4,4) }
+attributes #1 = { vscale_range(1,16) }

From 28e62bf79e12a57d32e9523ff17326f698552bd0 Mon Sep 17 00:00:00 2001
From: Lou Knauer
Date: Mon, 27 Jan 2025 18:55:36 +0100
Subject: [PATCH 2/2] [InstCombine] Simple store-to-load forwarding between fixed/scalable vectors

When storing a scalable vector and the vscale is a known compile-time
constant, do basic store-to-load forwarding through @llvm.vector.insert
calls, even if the loaded vector is fixed-sized instead of scalable.

The @llvm.vector.insert is matched instead of the load itself because it
is invalid to create a temporary insert of a scalable vector (the stored
value) into a fixed-sized vector (the load type).

The use case is shown in this [godbolt link](https://godbolt.org/z/KT3sMrMbd),
which shows that clang generates IR matching this pattern when the
"arm_sve_vector_bits" attribute is used:

```
typedef svfloat32_t svfloat32_fixed_t
    __attribute__((arm_sve_vector_bits(512)));

struct svfloat32_wrapped_t {
  svfloat32_fixed_t v;
};

static inline svfloat32_wrapped_t add(svfloat32_wrapped_t a,
                                      svfloat32_wrapped_t b) {
  return {svadd_f32_x(svptrue_b32(), a.v, b.v)};
}

svfloat32_wrapped_t foo(svfloat32_wrapped_t a, svfloat32_wrapped_t b) {
  // The IR pattern this patch matches is generated for this return:
  return add(a, b);
}
```
---
 llvm/include/llvm/Analysis/Loads.h            |  4 +++
 llvm/lib/Analysis/Loads.cpp                   | 33 +++++++++++++++----
 .../InstCombine/InstCombineCalls.cpp          | 33 ++++++++++++++-----
 .../InstCombine/store-load-vector-insert.ll   |  6 +---
 4 files changed, 57 insertions(+), 19 deletions(-)

diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h
index 639070c07897b..0cadbc5fede9b 100644
--- a/llvm/include/llvm/Analysis/Loads.h
+++ b/llvm/include/llvm/Analysis/Loads.h
@@ -154,8 +154,12 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
 /// FindAvailableLoadedValue() for the case where we are not interested in
 /// finding the closest clobbering instruction if no available load is found.
 /// This overload cannot be used to scan across multiple blocks.
+/// If \p IsVectorKindChange is not nullptr, it is an out parameter that is
+/// set to true if a value was found but it is a scalable vector instead of
+/// the requested fixed-sized one (or the other way around).
 Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
                                 bool *IsLoadCSE,
+                                bool *IsVectorKindChange = nullptr,
                                 unsigned MaxInstsToScan = DefMaxInstsToScan);
 
 /// Scan backwards to see if we have the value of the given pointer available
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index 691d7e4a3edcf..e4bd59fbf2d30 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -538,7 +538,8 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr,
 
 static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
                                     Type *AccessTy, bool AtLeastAtomic,
-                                    const DataLayout &DL, bool *IsLoadCSE) {
+                                    const DataLayout &DL, bool *IsLoadCSE,
+                                    bool *IsVectorKindChange) {
   // If this is a load of Ptr, the loaded value is available.
   // (This is true even if the load is volatile or atomic, although
   // those cases are unlikely.)
@@ -584,6 +585,25 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
     if (TypeSize::isKnownLE(LoadSize, StoreSize))
       if (auto *C = dyn_cast<Constant>(Val))
         return ConstantFoldLoadFromConst(C, AccessTy, DL);
+
+    if (IsVectorKindChange && Val->getType()->isVectorTy() &&
+        AccessTy->isVectorTy()) {
+      auto Attrs = Inst->getFunction()->getAttributes().getFnAttrs();
+      unsigned VScale = Attrs.getVScaleRangeMin();
+      if (Attrs.getVScaleRangeMax() != VScale)
+        return nullptr;
+
+      unsigned FixedStoreSize =
+          (StoreSize.isFixed() ? StoreSize : StoreSize * VScale)
+              .getKnownMinValue();
+      unsigned FixedLoadSize =
+          (LoadSize.isFixed() ? LoadSize : LoadSize * VScale)
+              .getKnownMinValue();
+      if (FixedStoreSize == FixedLoadSize) {
+        *IsVectorKindChange = true;
+        return Val;
+      }
+    }
   }
 
   if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -655,8 +675,8 @@ Value *llvm::findAvailablePtrLoadStore(
 
     --ScanFrom;
 
-    if (Value *Available = getAvailableLoadStore(Inst, StrippedPtr, AccessTy,
-                                                 AtLeastAtomic, DL, IsLoadCSE))
+    if (Value *Available = getAvailableLoadStore(
+            Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL, IsLoadCSE, nullptr))
       return Available;
 
     // Try to get the store size for the type.
@@ -711,7 +731,7 @@ Value *llvm::findAvailablePtrLoadStore(
 }
 
 Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
-                                      bool *IsLoadCSE,
+                                      bool *IsLoadCSE, bool *IsVectorKindChange,
                                       unsigned MaxInstsToScan) {
   const DataLayout &DL = Load->getDataLayout();
   Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
@@ -734,8 +754,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
     if (MaxInstsToScan-- == 0)
      return nullptr;
 
-    Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
-                                      AtLeastAtomic, DL, IsLoadCSE);
+    Available =
+        getAvailableLoadStore(&Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL,
+                              IsLoadCSE, IsVectorKindChange);
     if (Available)
       break;
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index f748f78524e0d..f463fe3e7d504 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3389,17 +3389,34 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     Value *Vec = II->getArgOperand(0);
     Value *SubVec = II->getArgOperand(1);
     Value *Idx = II->getArgOperand(2);
-    auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
-    auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
-    auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
+    auto *DstTy = cast<VectorType>(II->getType());
+    auto *VecTy = cast<VectorType>(Vec->getType());
+    auto *SubVecTy = cast<VectorType>(SubVec->getType());
+    unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
+
+    // Try store-to-load forwarding where the stored value has the same
+    // type as this intrinsic's result and the loaded value is the inserted
+    // vector. This has to be done here because a temporary insert of a
+    // scalable vector (the available value) into a fixed-sized one (the
+    // second operand of this intrinsic) cannot be created.
+    if (auto *LI = dyn_cast<LoadInst>(SubVec);
+        LI && IdxN == 0 && DstTy->isScalableTy() && !SubVecTy->isScalableTy()) {
+      bool IsVectorKindChange = false;
+      BatchAAResults BatchAA(*AA);
+      if (Value *AvailVal = FindAvailableLoadedValue(LI, BatchAA, nullptr,
+                                                     &IsVectorKindChange);
+          AvailVal && IsVectorKindChange && AvailVal->getType() == DstTy) {
+        return replaceInstUsesWith(CI, AvailVal);
+      }
+    }
 
     // Only canonicalize if the destination vector, Vec, and SubVec are all
     // fixed vectors.
-    if (DstTy && VecTy && SubVecTy) {
-      unsigned DstNumElts = DstTy->getNumElements();
-      unsigned VecNumElts = VecTy->getNumElements();
-      unsigned SubVecNumElts = SubVecTy->getNumElements();
-      unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
+    if (!DstTy->isScalableTy() && !VecTy->isScalableTy() &&
+        !SubVecTy->isScalableTy()) {
+      unsigned DstNumElts = DstTy->getElementCount().getFixedValue();
+      unsigned VecNumElts = VecTy->getElementCount().getFixedValue();
+      unsigned SubVecNumElts = SubVecTy->getElementCount().getFixedValue();
 
       // An insert that entirely overwrites Vec with SubVec is a nop.
       if (VecNumElts == SubVecNumElts)
diff --git a/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
index 1457b8c4391e2..73685fe8c3762 100644
--- a/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
+++ b/llvm/test/Transforms/InstCombine/store-load-vector-insert.ll
@@ -7,12 +7,8 @@ define <vscale x 4 x float> @store_to_vector_load_different_type(<vscale x 4 x float> %.coerce) #0 {
 ; CHECK-LABEL: define <vscale x 4 x float> @store_to_vector_load_different_type(
 ; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
 ; CHECK-NEXT:    [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
-; CHECK-NEXT:    store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
-; CHECK-NEXT:    [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
-; CHECK-NEXT:    ret <vscale x 4 x float> [[CAST_SCALABLE]]
+; CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 ;
 entry:
   %retval = alloca %struct.svfloat32_wrapped_t
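
For reference, a minimal standalone IR sketch of the pattern this series targets, distilled from the first test case above. The function name @example is illustrative, and the expected output assumes the fixed vscale_range(4,4) attribute (512-bit vectors, matching attribute #0 in the tests), so the scalable store covers exactly the fixed-width load:

```
; Before -passes=instcombine: the scalable store reaches the fixed-width
; load only through the @llvm.vector.insert call.
define <vscale x 4 x float> @example(<vscale x 4 x float> %v) vscale_range(4,4) {
  %tmp = alloca <16 x float>, align 64
  store <vscale x 4 x float> %v, ptr %tmp, align 16
  %fixed = load <16 x float>, ptr %tmp, align 64
  %res = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> %fixed, i64 0)
  ret <vscale x 4 x float> %res
}

declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float>, <16 x float>, i64 immarg)

; Expected after `opt -passes=instcombine` with this patch applied, as in the
; updated CHECK lines above: the insert is replaced by the stored value and
; the now-dead alloca, store, and load are removed.
;   define <vscale x 4 x float> @example(<vscale x 4 x float> %v) vscale_range(4,4) {
;     ret <vscale x 4 x float> %v
;   }
```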