6 changes: 4 additions & 2 deletions llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3209,10 +3209,12 @@ class LLVM_ABI TargetLoweringBase {
/// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
/// \p Indices is the corresponding indices for each shufflevector.
/// \p Factor is the interleave factor.
/// \p GapMask is a mask with a zero bit for each field (component) that is
/// not accessed.
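/// For example, with \p Factor = 3 a \p GapMask value of 0b011 indicates that
/// the third field is never accessed (bit I corresponds to field I).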
virtual bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const {
ArrayRef<unsigned> Indices, unsigned Factor,
const APInt &GapMask) const {
return false;
}

101 changes: 78 additions & 23 deletions llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -268,13 +268,16 @@ static Value *getMaskOperand(IntrinsicInst *II) {
}
}

// Return the corresponded deinterleaved mask, or nullptr if there is no valid
// mask.
static Value *getMask(Value *WideMask, unsigned Factor,
ElementCount LeafValueEC);

static Value *getMask(Value *WideMask, unsigned Factor,
VectorType *LeafValueTy) {
Collaborator commented:

We talked about this offline, but I'm more and more coming to the view we should have made these a set of utility routines (usable by each target), and simply passed the mask operand through (or maybe not even that.) More of an aside for longer term consideration than a comment on this review.

// Return a pair of
// (1) The corresponding deinterleaved mask, or nullptr if there is no valid
// mask.
// (2) A gap mask: some wide masks effectively skip entire fields, and in this
// element a cleared bit marks a field that is skipped (i.e. a "gap").
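// For example, with Factor = 3 the wide mask <1, 1, 0, 1, 1, 0> yields the
// leaf mask <1, 1> and a gap mask of 0b011.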
static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
ElementCount LeafValueEC);

static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
VectorType *LeafValueTy) {
return getMask(WideMask, Factor, LeafValueTy->getElementCount());
}

@@ -379,22 +382,25 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, Load);

Value *Mask = nullptr;
auto GapMask = APInt::getAllOnes(Factor);
if (LI) {
LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *Load << "\n");
} else {
// Check mask operand. Handle both all-true/false and interleaved mask.
Mask = getMask(getMaskOperand(II), Factor, VecTy);
std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor, VecTy);
if (!Mask)
return false;

LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.load or masked.load: "
<< *Load << "\n");
LLVM_DEBUG(dbgs() << "IA: With nominal factor " << Factor
<< " and actual factor " << GapMask.popcount() << "\n");
}

// Try to create target specific intrinsics to replace the load and
// shuffles.
if (!TLI->lowerInterleavedLoad(cast<Instruction>(Load), Mask, Shuffles,
Indices, Factor))
Indices, Factor, GapMask))
// If Extracts is not empty, tryReplaceExtracts made changes earlier.
return !Extracts.empty() || BinOpShuffleChanged;

@@ -536,10 +542,15 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
} else {
// Check mask operand. Handle both all-true/false and interleaved mask.
unsigned LaneMaskLen = NumStoredElements / Factor;
Mask = getMask(getMaskOperand(II), Factor,
ElementCount::getFixed(LaneMaskLen));
APInt GapMask(Factor, 0);
std::tie(Mask, GapMask) = getMask(getMaskOperand(II), Factor,
ElementCount::getFixed(LaneMaskLen));
if (!Mask)
return false;
// Gap masks are not yet supported for stores. However, the IR may already
// have been changed at this point, so return true.
if (GapMask.popcount() != Factor)
return true;

LLVM_DEBUG(dbgs() << "IA: Found an interleaved vp.store or masked.store: "
<< *Store << "\n");
@@ -556,34 +567,64 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
return true;
}

static Value *getMask(Value *WideMask, unsigned Factor,
ElementCount LeafValueEC) {
// A wide mask <1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0> could be used to skip the
// last field in a factor-of-three interleaved store or deinterleaved load (in
// which case LeafMaskLen is 4). Such a (wide) mask is also known as a gap
// mask. This helper detects that pattern and clears the GapMask bit of every
// skipped field; the remaining set bits give the actual factor being accessed,
// which is 2 in this example.
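// In the example above the resulting GapMask is 0b011: bit 2 is cleared
// because the third field is never read.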
static void getGapMask(const Constant &MaskConst, unsigned Factor,
unsigned LeafMaskLen, APInt &GapMask) {
assert(GapMask.getBitWidth() == Factor);
for (unsigned F = 0U; F < Factor; ++F) {
bool AllZero = true;
for (unsigned Idx = 0U; Idx < LeafMaskLen; ++Idx) {
Constant *C = MaskConst.getAggregateElement(F + Idx * Factor);
if (!C->isZeroValue()) {
AllZero = false;
break;
}
}
// All mask bits on this field are zero, skipping it.
if (AllZero)
GapMask.clearBit(F);
}
}

static std::pair<Value *, APInt> getMask(Value *WideMask, unsigned Factor,
ElementCount LeafValueEC) {
auto GapMask = APInt::getAllOnes(Factor);

if (auto *IMI = dyn_cast<IntrinsicInst>(WideMask)) {
if (unsigned F = getInterleaveIntrinsicFactor(IMI->getIntrinsicID());
F && F == Factor && llvm::all_equal(IMI->args())) {
return IMI->getArgOperand(0);
return {IMI->getArgOperand(0), GapMask};
}
}

if (auto *ConstMask = dyn_cast<Constant>(WideMask)) {
if (auto *Splat = ConstMask->getSplatValue())
// All-ones or all-zeros mask.
return ConstantVector::getSplat(LeafValueEC, Splat);
return {ConstantVector::getSplat(LeafValueEC, Splat), GapMask};

if (LeafValueEC.isFixed()) {
unsigned LeafMaskLen = LeafValueEC.getFixedValue();
// First, check if we use a gap mask to skip some of the factors / fields.
getGapMask(*ConstMask, Factor, LeafMaskLen, GapMask);

SmallVector<Constant *, 8> LeafMask(LeafMaskLen, nullptr);
// For a fixed-length constant mask, every leaf lane has to use the same mask
// value across all active fields. Check this by verifying that each group of
// Factor consecutive elements in the interleaved mask has homogeneous values
// (ignoring gap fields).
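// For example, with Factor = 2 the wide mask <1, 0, 0, 1> has no all-zero
// field, but leaf lane 0 would need to be both 1 (field 0) and 0 (field 1),
// so nullptr is returned.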
for (unsigned Idx = 0U; Idx < LeafMaskLen * Factor; ++Idx) {
if (!GapMask[Idx % Factor])
continue;
Constant *C = ConstMask->getAggregateElement(Idx);
if (LeafMask[Idx / Factor] && LeafMask[Idx / Factor] != C)
return nullptr;
return {nullptr, GapMask};
LeafMask[Idx / Factor] = C;
}

return ConstantVector::get(LeafMask);
return {ConstantVector::get(LeafMask), GapMask};
}
}

@@ -603,12 +644,13 @@ static Value *getMask(Value *WideMask, unsigned Factor,
auto *LeafMaskTy =
VectorType::get(Type::getInt1Ty(SVI->getContext()), LeafValueEC);
IRBuilder<> Builder(SVI);
return Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0),
uint64_t(0));
return {Builder.CreateExtractVector(LeafMaskTy, SVI->getOperand(0),
uint64_t(0)),
GapMask};
}
}

return nullptr;
return {nullptr, GapMask};
}

bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
@@ -639,9 +681,16 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
return false;

// Check mask operand. Handle both all-true/false and interleaved mask.
Mask = getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
APInt GapMask(Factor, 0);
std::tie(Mask, GapMask) =
getMask(getMaskOperand(II), Factor, getDeinterleavedVectorType(DI));
if (!Mask)
return false;
// Gap masks are not yet supported when deinterleaving via intrinsics.
// However, the IR may already have been changed at this point, so return
// true.
if (GapMask.popcount() != Factor)
return true;

LLVM_DEBUG(dbgs() << "IA: Found a vp.load or masked.load with deinterleave"
<< " intrinsic " << *DI << " and factor = "
@@ -680,10 +729,16 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
II->getIntrinsicID() != Intrinsic::vp_store)
return false;
// Check mask operand. Handle both all-true/false and interleaved mask.
Mask = getMask(getMaskOperand(II), Factor,
cast<VectorType>(InterleaveValues[0]->getType()));
APInt GapMask(Factor, 0);
std::tie(Mask, GapMask) =
getMask(getMaskOperand(II), Factor,
cast<VectorType>(InterleaveValues[0]->getType()));
if (!Mask)
return false;
// Gap masks are not yet supported when interleaving via intrinsics. However,
// the IR may already have been changed at this point, so return true.
if (GapMask.popcount() != Factor)
return true;

LLVM_DEBUG(dbgs() << "IA: Found a vp.store or masked.store with interleave"
<< " intrinsic " << *IntII << " and factor = "
4 changes: 2 additions & 2 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -17254,7 +17254,7 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool AArch64TargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
@@ -17264,7 +17264,7 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
auto *LI = dyn_cast<LoadInst>(Load);
if (!LI)
return false;
assert(!Mask && "Unexpected mask on a load");
assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");

const DataLayout &DL = LI->getDataLayout();

Collaborator commented:
This should be an assert (same for most targets), since LoadInst isn't masked by definition.

Member Author replied:
Fixed.


4 changes: 2 additions & 2 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -220,8 +220,8 @@ class AArch64TargetLowering : public TargetLowering {

bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
ArrayRef<unsigned> Indices, unsigned Factor,
const APInt &GapMask) const override;
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
unsigned Factor) const override;
4 changes: 2 additions & 2 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21599,7 +21599,7 @@ unsigned ARMTargetLowering::getMaxSupportedInterleaveFactor() const {
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %vld2, i32 1
bool ARMTargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
@@ -21609,7 +21609,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
auto *LI = dyn_cast<LoadInst>(Load);
if (!LI)
return false;
assert(!Mask && "Unexpected mask on a load");
assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");

auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
Type *EltTy = VecTy->getElementType();
4 changes: 2 additions & 2 deletions llvm/lib/Target/ARM/ARMISelLowering.h
@@ -683,8 +683,8 @@ class VectorType;

bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
ArrayRef<unsigned> Indices, unsigned Factor,
const APInt &GapMask) const override;
bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
unsigned Factor) const override;
4 changes: 2 additions & 2 deletions llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -431,8 +431,8 @@ class RISCVTargetLowering : public TargetLowering {

bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
ArrayRef<unsigned> Indices, unsigned Factor,
const APInt &GapMask) const override;

bool lowerInterleavedStore(Instruction *Store, Value *Mask,
ShuffleVectorInst *SVI,
43 changes: 36 additions & 7 deletions llvm/lib/Target/RISCV/RISCVInterleavedAccess.cpp
@@ -63,6 +63,12 @@ static const Intrinsic::ID FixedVlsegIntrIds[] = {
Intrinsic::riscv_seg6_load_mask, Intrinsic::riscv_seg7_load_mask,
Intrinsic::riscv_seg8_load_mask};

static const Intrinsic::ID FixedVlssegIntrIds[] = {
Intrinsic::riscv_sseg2_load_mask, Intrinsic::riscv_sseg3_load_mask,
Intrinsic::riscv_sseg4_load_mask, Intrinsic::riscv_sseg5_load_mask,
Intrinsic::riscv_sseg6_load_mask, Intrinsic::riscv_sseg7_load_mask,
Intrinsic::riscv_sseg8_load_mask};

static const Intrinsic::ID ScalableVlsegIntrIds[] = {
Intrinsic::riscv_vlseg2_mask, Intrinsic::riscv_vlseg3_mask,
Intrinsic::riscv_vlseg4_mask, Intrinsic::riscv_vlseg5_mask,
@@ -197,9 +203,15 @@ static bool getMemOperands(unsigned Factor, VectorType *VTy, Type *XLenTy,
/// %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
bool RISCVTargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
assert(Indices.size() == Shuffles.size());
assert(GapMask.getBitWidth() == Factor);

// We only support cases where the skipped fields are the trailing ones.
// TODO: Lower to strided load if there is only a single active field.
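// For example, with Factor = 3 a GapMask of 0b011 (third field skipped) is
// supported, whereas 0b101 (middle field skipped) is not.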
unsigned MaskFactor = GapMask.popcount();
if (MaskFactor < 2 || !GapMask.isMask())
return false;
IRBuilder<> Builder(Load);

const DataLayout &DL = Load->getDataLayout();
@@ -208,20 +220,37 @@ bool RISCVTargetLowering::lowerInterleavedLoad(

Value *Ptr, *VL;
Align Alignment;
if (!getMemOperands(Factor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
if (!getMemOperands(MaskFactor, VTy, XLenTy, Load, Ptr, Mask, VL, Alignment))
return false;

Type *PtrTy = Ptr->getType();
unsigned AS = PtrTy->getPointerAddressSpace();
if (!isLegalInterleavedAccessType(VTy, Factor, Alignment, AS, DL))
if (!isLegalInterleavedAccessType(VTy, MaskFactor, Alignment, AS, DL))
return false;

CallInst *VlsegN = Builder.CreateIntrinsic(
FixedVlsegIntrIds[Factor - 2], {VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
CallInst *SegLoad = nullptr;
if (MaskFactor < Factor) {
// Lower to strided segmented load.
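// The byte stride spans all Factor fields of the original layout; e.g. with
// i32 elements and Factor = 3 the stride is 12 bytes, so the unread trailing
// field(s) are simply skipped over.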
unsigned ScalarSizeInBytes = DL.getTypeStoreSize(VTy->getElementType());
Value *Stride = ConstantInt::get(XLenTy, Factor * ScalarSizeInBytes);
SegLoad = Builder.CreateIntrinsic(FixedVlssegIntrIds[MaskFactor - 2],
{VTy, PtrTy, XLenTy, XLenTy},
{Ptr, Stride, Mask, VL});
} else {
// Lower to normal segmented load.
SegLoad = Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
{VTy, PtrTy, XLenTy}, {Ptr, Mask, VL});
}

for (unsigned i = 0; i < Shuffles.size(); i++) {
Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
Shuffles[i]->replaceAllUsesWith(SubVec);
unsigned FactorIdx = Indices[i];
if (FactorIdx >= MaskFactor) {
// Replace masked-off factors (that are still extracted) with poison.
Shuffles[i]->replaceAllUsesWith(PoisonValue::get(VTy));
} else {
Value *SubVec = Builder.CreateExtractValue(SegLoad, FactorIdx);
Shuffles[i]->replaceAllUsesWith(SubVec);
}
}

return true;
4 changes: 2 additions & 2 deletions llvm/lib/Target/X86/X86ISelLowering.h
@@ -1663,8 +1663,8 @@ namespace llvm {
/// instructions/intrinsics.
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices,
unsigned Factor) const override;
ArrayRef<unsigned> Indices, unsigned Factor,
const APInt &GapMask) const override;

/// Lower interleaved store(s) into target specific
/// instructions/intrinsics.
4 changes: 2 additions & 2 deletions llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -802,7 +802,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// Currently, lowering is supported for 4x64 bits with Factor = 4 on AVX.
bool X86TargetLowering::lowerInterleavedLoad(
Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
ArrayRef<unsigned> Indices, unsigned Factor) const {
ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
assert(!Shuffles.empty() && "Empty shufflevector input");
@@ -812,7 +812,7 @@ bool X86TargetLowering::lowerInterleavedLoad(
auto *LI = dyn_cast<LoadInst>(Load);
if (!LI)
return false;
assert(!Mask && "Unexpected mask on a load");
assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");

// Create an interleaved access group.
IRBuilder<> Builder(LI);