Skip to content

Commit 28e62bf

Browse files
committed
[InstCombine] Simple store-to-load forwarding between fixed/scalable vectors
When storing a scalable vector and the VScale is a compile-time known constant, do basic store-to-load forwarding through @llvm.vector.insert calls, even if the loaded vector is fixed-sized instead of scalable. The @llvm.vector.insert is matched instead of the load itself because it is invalid to create a temporary insert of a scalable vector (the stored value) into a fixed-sized vector (the load type). The use case is shown in this [godbolt link](https://godbolt.org/z/KT3sMrMbd), which shows that clang generates IR that matches this pattern when the "arm_sve_vector_bits" attribute is used: ``` typedef svfloat32_t svfloat32_fixed_t __attribute__((arm_sve_vector_bits(512))); struct svfloat32_wrapped_t { svfloat32_fixed_t v; }; static inline svfloat32_wrapped_t add(svfloat32_wrapped_t a, svfloat32_wrapped_t b) { return {svadd_f32_x(svptrue_b32(), a.v, b.v)}; } svfloat32_wrapped_t foo(svfloat32_wrapped_t a, svfloat32_wrapped_t b) { // The IR pattern this patch matches is generated for this return: return add(a, b); } ```
1 parent 091c45d commit 28e62bf

File tree

4 files changed

+57
-19
lines changed

4 files changed

+57
-19
lines changed

llvm/include/llvm/Analysis/Loads.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,8 +154,12 @@ Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB,
154154
/// FindAvailableLoadedValue() for the case where we are not interested in
155155
/// finding the closest clobbering instruction if no available load is found.
156156
/// This overload cannot be used to scan across multiple blocks.
157+
/// If \p IsVectorKindChange is not nullptr, this is an out parameter that is true
158+
/// if a value was found, but it is a scalable vector instead of a requested
159+
/// fixed-sized one (or the other way round).
157160
Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
158161
bool *IsLoadCSE,
162+
bool *IsVectorKindChange = nullptr,
159163
unsigned MaxInstsToScan = DefMaxInstsToScan);
160164

161165
/// Scan backwards to see if we have the value of the given pointer available

llvm/lib/Analysis/Loads.cpp

Lines changed: 27 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -538,7 +538,8 @@ static bool areNonOverlapSameBaseLoadAndStore(const Value *LoadPtr,
538538

539539
static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
540540
Type *AccessTy, bool AtLeastAtomic,
541-
const DataLayout &DL, bool *IsLoadCSE) {
541+
const DataLayout &DL, bool *IsLoadCSE,
542+
bool *IsVectorKindChange) {
542543
// If this is a load of Ptr, the loaded value is available.
543544
// (This is true even if the load is volatile or atomic, although
544545
// those cases are unlikely.)
@@ -584,6 +585,25 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr,
584585
if (TypeSize::isKnownLE(LoadSize, StoreSize))
585586
if (auto *C = dyn_cast<Constant>(Val))
586587
return ConstantFoldLoadFromConst(C, AccessTy, DL);
588+
589+
if (IsVectorKindChange && Val->getType()->isVectorTy() &&
590+
AccessTy->isVectorTy()) {
591+
auto Attrs = Inst->getFunction()->getAttributes().getFnAttrs();
592+
unsigned VScale = Attrs.getVScaleRangeMin();
593+
if (Attrs.getVScaleRangeMax() != VScale)
594+
return nullptr;
595+
596+
unsigned FixedStoreSize =
597+
(StoreSize.isFixed() ? StoreSize : StoreSize * VScale)
598+
.getKnownMinValue();
599+
unsigned FixedLoadSize =
600+
(LoadSize.isFixed() ? LoadSize : LoadSize * VScale)
601+
.getKnownMinValue();
602+
if (FixedStoreSize == FixedLoadSize) {
603+
*IsVectorKindChange = true;
604+
return Val;
605+
}
606+
}
587607
}
588608

589609
if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -655,8 +675,8 @@ Value *llvm::findAvailablePtrLoadStore(
655675

656676
--ScanFrom;
657677

658-
if (Value *Available = getAvailableLoadStore(Inst, StrippedPtr, AccessTy,
659-
AtLeastAtomic, DL, IsLoadCSE))
678+
if (Value *Available = getAvailableLoadStore(
679+
Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL, IsLoadCSE, nullptr))
660680
return Available;
661681

662682
// Try to get the store size for the type.
@@ -711,7 +731,7 @@ Value *llvm::findAvailablePtrLoadStore(
711731
}
712732

713733
Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
714-
bool *IsLoadCSE,
734+
bool *IsLoadCSE, bool *IsVectorKindChange,
715735
unsigned MaxInstsToScan) {
716736
const DataLayout &DL = Load->getDataLayout();
717737
Value *StrippedPtr = Load->getPointerOperand()->stripPointerCasts();
@@ -734,8 +754,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA,
734754
if (MaxInstsToScan-- == 0)
735755
return nullptr;
736756

737-
Available = getAvailableLoadStore(&Inst, StrippedPtr, AccessTy,
738-
AtLeastAtomic, DL, IsLoadCSE);
757+
Available =
758+
getAvailableLoadStore(&Inst, StrippedPtr, AccessTy, AtLeastAtomic, DL,
759+
IsLoadCSE, IsVectorKindChange);
739760
if (Available)
740761
break;
741762

llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp

Lines changed: 25 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3389,17 +3389,34 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
33893389
Value *Vec = II->getArgOperand(0);
33903390
Value *SubVec = II->getArgOperand(1);
33913391
Value *Idx = II->getArgOperand(2);
3392-
auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
3393-
auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
3394-
auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
3392+
auto *DstTy = cast<VectorType>(II->getType());
3393+
auto *VecTy = cast<VectorType>(Vec->getType());
3394+
auto *SubVecTy = cast<VectorType>(SubVec->getType());
3395+
unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
3396+
3397+
// Try store-to-load forwarding where the stored value has the same
3398+
// type as this intrinsic, and the loaded value is the inserted
3399+
// vector. This has to be done here because a temporary insert of
3400+
// a scalable vector (the available value) into a fixed-sized one
3401+
// (the second operand of this intrinsic) cannot be created.
3402+
if (auto *LI = dyn_cast<LoadInst>(SubVec);
3403+
LI && IdxN == 0 && DstTy->isScalableTy() && !SubVecTy->isScalableTy()) {
3404+
bool IsVectorKindChange = false;
3405+
BatchAAResults BatchAA(*AA);
3406+
if (Value *AvilVal = FindAvailableLoadedValue(LI, BatchAA, nullptr,
3407+
&IsVectorKindChange);
3408+
AvilVal && IsVectorKindChange && AvilVal->getType() == DstTy) {
3409+
return replaceInstUsesWith(CI, AvilVal);
3410+
}
3411+
}
33953412

33963413
// Only canonicalize if the destination vector, Vec, and SubVec are all
33973414
// fixed vectors.
3398-
if (DstTy && VecTy && SubVecTy) {
3399-
unsigned DstNumElts = DstTy->getNumElements();
3400-
unsigned VecNumElts = VecTy->getNumElements();
3401-
unsigned SubVecNumElts = SubVecTy->getNumElements();
3402-
unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
3415+
if (!DstTy->isScalableTy() && !VecTy->isScalableTy() &&
3416+
!SubVecTy->isScalableTy()) {
3417+
unsigned DstNumElts = DstTy->getElementCount().getFixedValue();
3418+
unsigned VecNumElts = VecTy->getElementCount().getFixedValue();
3419+
unsigned SubVecNumElts = SubVecTy->getElementCount().getFixedValue();
34033420

34043421
// An insert that entirely overwrites Vec with SubVec is a nop.
34053422
if (VecNumElts == SubVecNumElts)

llvm/test/Transforms/InstCombine/store-load-vector-insert.ll

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,8 @@ define <vscale x 4 x float> @store_to_vector_load_different_type(<vscale x 4 x f
77
; CHECK-LABEL: define <vscale x 4 x float> @store_to_vector_load_different_type(
88
; CHECK-SAME: <vscale x 4 x float> [[DOTCOERCE:%.*]]) #[[ATTR0:[0-9]+]] {
99
; CHECK-NEXT: [[ENTRY:.*:]]
10-
; CHECK-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_SVFLOAT32_WRAPPED_T:%.*]], align 64
1110
; CHECK-NEXT: [[TMP0:%.*]] = fadd <vscale x 4 x float> [[DOTCOERCE]], [[DOTCOERCE]]
12-
; CHECK-NEXT: store <vscale x 4 x float> [[TMP0]], ptr [[RETVAL]], align 16
13-
; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[RETVAL]], align 64
14-
; CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v16f32(<vscale x 4 x float> poison, <16 x float> [[TMP1]], i64 0)
15-
; CHECK-NEXT: ret <vscale x 4 x float> [[CAST_SCALABLE]]
11+
; CHECK-NEXT: ret <vscale x 4 x float> [[TMP0]]
1612
;
1713
entry:
1814
%retval = alloca %struct.svfloat32_wrapped_t

0 commit comments

Comments
 (0)