Skip to content

Commit 45eb570

Browse files
committed
[InterleavedAccess] Construct interleaved access store with shuffles
- [AArch64]: Interleaved access store can handle more elements than target supported maximum interleaved factor with shuffles.
1 parent 40d4ea6 commit 45eb570

File tree

9 files changed

+423
-18
lines changed

9 files changed

+423
-18
lines changed

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3206,6 +3206,11 @@ class LLVM_ABI TargetLoweringBase {
32063206
/// Default to be the minimum interleave factor: 2.
32073207
virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
32083208

3209+
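/// Return true if the target can lower an interleaved store whose factor
/// exceeds getMaxSupportedInterleaveFactor() by interleaving the data with
/// additional shuffles.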
3210+
virtual bool isProfitableToInterleaveWithGatherScatter() const {
3211+
return false;
3212+
}
3213+
32093214
/// Lower an interleaved load to target specific intrinsics. Return
32103215
/// true on success.
32113216
///

llvm/lib/CodeGen/InterleavedAccessPass.cpp

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
239239
/// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
240240
/// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
241241
static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
242-
unsigned MaxFactor) {
242+
unsigned MaxFactor,
243+
bool InterleaveWithShuffles) {
243244
unsigned NumElts = SVI->getShuffleMask().size();
244245
if (NumElts < 4)
245246
return false;
@@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
250251
return true;
251252
}
252253

254+
if (InterleaveWithShuffles) {
255+
for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
256+
Factor = i * MaxFactor;
257+
if (SVI->isInterleave(Factor))
258+
return true;
259+
}
260+
}
253261
return false;
254262
}
255263

@@ -530,7 +538,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
530538
cast<FixedVectorType>(SVI->getType())->getNumElements();
531539
// Check if the shufflevector is RE-interleave shuffle.
532540
unsigned Factor;
533-
if (!isReInterleaveMask(SVI, Factor, MaxFactor))
541+
if (!isReInterleaveMask(SVI, Factor, MaxFactor,
542+
TLI->isProfitableToInterleaveWithGatherScatter()))
534543
return false;
535544
assert(NumStoredElements % Factor == 0 &&
536545
"number of stored element should be a multiple of Factor");

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 138 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@
9696
#include <cctype>
9797
#include <cstdint>
9898
#include <cstdlib>
99+
#include <deque>
99100
#include <iterator>
100101
#include <limits>
101102
#include <optional>
@@ -18023,11 +18024,17 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
1802318024
unsigned Factor,
1802418025
const APInt &GapMask) const {
1802518026

18026-
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18027-
"Invalid interleave factor");
1802818027
auto *SI = dyn_cast<StoreInst>(Store);
1802918028
if (!SI)
1803018029
return false;
18030+
18031+
if (isProfitableToInterleaveWithGatherScatter() &&
18032+
Factor > getMaxSupportedInterleaveFactor())
18033+
return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
18034+
18035+
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
18036+
"Invalid interleave factor");
18037+
1803118038
assert(!LaneMask && GapMask.popcount() == Factor &&
1803218039
"Unexpected mask on store");
1803318040

@@ -18173,6 +18180,135 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
1817318180
return true;
1817418181
}
1817518182

18183+
/// If the interleave factor is greater than the supported MaxFactor, the
18184+
/// data can be interleaved with additional shuffles so that it can still be
18185+
/// stored with the supported stN instructions.
18186+
/// The diagram below shows how data with an interleave factor of 8 is shuffled
18187+
/// for stN stores. The data must be stored in the order v0,v1,v2,v3,v4,v5,v6,v7.
18188+
/// v0 v4 v2 v6 v1 v5 v3 v7
18189+
/// | | | | | | | |
18190+
/// \ / \ / \ / \ /
18191+
/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7]==> stN = 4
18192+
/// | | | |
18193+
/// \ / \ /
18194+
/// \ / \ /
18195+
/// \ / \ /
18196+
/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2
18197+
///
18198+
/// At the stN = 4 level, the upper half of the interleaved data (V0,V1,V2,V3)
18199+
/// is stored with one st4 instruction and the lower half (V4,V5,V6,V7) with another st4.
18200+
///
18201+
/// At the stN = 2 level, the first pair of interleaved data (V0,V1) is stored
18202+
/// with one st2 instruction, the second pair (V2,V3) with another st2, and so
18203+
/// on; a total of 4 st2 instructions are required.
18204+
bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
18205+
StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
18206+
unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
18207+
18208+
auto *VecTy = cast<FixedVectorType>(SVI->getType());
18209+
assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
18210+
18211+
unsigned LaneLen = VecTy->getNumElements() / Factor;
18212+
Type *EltTy = VecTy->getElementType();
18213+
auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
18214+
18215+
const DataLayout &DL = SI->getModule()->getDataLayout();
18216+
bool UseScalable;
18217+
18218+
// Skip if we do not have NEON and skip illegal vector types. We can
18219+
// "legalize" wide vector types into multiple interleaved accesses as long as
18220+
// the vector types are divisible by 128.
18221+
if (!Subtarget->hasNEON() ||
18222+
!isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
18223+
return false;
18224+
18225+
if (UseScalable)
18226+
return false;
18227+
18228+
std::deque<Value *> Shuffles;
18229+
Shuffles.push_back(SVI);
18230+
unsigned ConcatLevel = Factor;
18231+
while (ConcatLevel > 1) {
18232+
std::deque<Value *> ShufflesIntermediate;
18233+
ShufflesIntermediate = Shuffles;
18234+
Shuffles.clear();
18235+
while (!ShufflesIntermediate.empty()) {
18236+
ShuffleVectorInst *SFL =
18237+
dyn_cast<ShuffleVectorInst>(ShufflesIntermediate.front());
18238+
if (!SFL)
18239+
break;
18240+
ShufflesIntermediate.pop_front();
18241+
18242+
Value *Op0 = SFL->getOperand(0);
18243+
Value *Op1 = SFL->getOperand(1);
18244+
18245+
Shuffles.push_back(dyn_cast<Value>(Op0));
18246+
Shuffles.push_back(dyn_cast<Value>(Op1));
18247+
}
18248+
if (!ShufflesIntermediate.empty()) {
18249+
Shuffles = ShufflesIntermediate;
18250+
break;
18251+
}
18252+
ConcatLevel = ConcatLevel >> 1;
18253+
}
18254+
18255+
if (Shuffles.size() != Factor)
18256+
return false;
18257+
18258+
IRBuilder<> Builder(SI);
18259+
auto Mask = createInterleaveMask(LaneLen, 2);
18260+
SmallVector<int, 16> UpperHalfMask, LowerHalfMask;
18261+
for (unsigned i = 0; i < (2 * LaneLen); i++) {
18262+
if (i < LaneLen)
18263+
LowerHalfMask.push_back(Mask[i]);
18264+
else
18265+
UpperHalfMask.push_back(Mask[i]);
18266+
}
18267+
18268+
unsigned InterleaveFactor = Factor >> 1;
18269+
while (InterleaveFactor >= MaxSupportedFactor) {
18270+
std::deque<Value *> ShufflesIntermediate;
18271+
for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) {
18272+
for (unsigned i = 0; i < InterleaveFactor; i++) {
18273+
auto *Shuffle = Builder.CreateShuffleVector(
18274+
Shuffles[i + j], Shuffles[i + j + InterleaveFactor], LowerHalfMask);
18275+
ShufflesIntermediate.push_back(Shuffle);
18276+
}
18277+
for (unsigned i = 0; i < InterleaveFactor; i++) {
18278+
auto *Shuffle = Builder.CreateShuffleVector(
18279+
Shuffles[i + j], Shuffles[i + j + InterleaveFactor], UpperHalfMask);
18280+
ShufflesIntermediate.push_back(Shuffle);
18281+
}
18282+
}
18283+
18284+
Shuffles = ShufflesIntermediate;
18285+
InterleaveFactor = InterleaveFactor >> 1;
18286+
}
18287+
18288+
Type *PtrTy = SI->getPointerOperandType();
18289+
auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
18290+
18291+
Value *BaseAddr = SI->getPointerOperand();
18292+
Function *StNFunc = getStructuredStoreFunction(
18293+
SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
18294+
for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) {
18295+
SmallVector<Value *, 5> Ops;
18296+
for (unsigned j = 0; j < MaxSupportedFactor; j++)
18297+
Ops.push_back(Shuffles[i * MaxSupportedFactor + j]);
18298+
18299+
if (i > 0) {
18300+
// Compute the pointer operand of this store with a GEP over the scalar
18301+
// element type: advance the previous store's address by the
18302+
// LaneLen * MaxSupportedFactor elements that store wrote.
18303+
BaseAddr = Builder.CreateConstGEP1_32(
18304+
SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
18305+
}
18306+
Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
18307+
Builder.CreateCall(StNFunc, Ops);
18308+
}
18309+
return true;
18310+
}
18311+
1817618312
bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
1817718313
Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
1817818314
const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,10 @@ class AArch64TargetLowering : public TargetLowering {
229229

230230
bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
231231

232+
bool isProfitableToInterleaveWithGatherScatter() const override {
233+
return true;
234+
}
235+
232236
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
233237

234238
bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -239,6 +243,9 @@ class AArch64TargetLowering : public TargetLowering {
239243
ShuffleVectorInst *SVI, unsigned Factor,
240244
const APInt &GapMask) const override;
241245

246+
bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
247+
unsigned Factor) const;
248+
242249
bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
243250
IntrinsicInst *DI) const override;
244251

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4801,19 +4801,47 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
48014801
if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
48024802
return InstructionCost::getInvalid();
48034803

4804-
if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
4804+
unsigned NumLoadStores = 1;
4805+
InstructionCost ShuffleCost = 0;
4806+
bool isInterleaveWithShuffle = false;
4807+
unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
4808+
4809+
auto *SubVecTy =
4810+
VectorType::get(VecVTy->getElementType(),
4811+
VecVTy->getElementCount().divideCoefficientBy(Factor));
4812+
4813+
if (TLI->isProfitableToInterleaveWithGatherScatter() &&
4814+
Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) &&
4815+
Factor > MaxSupportedFactor) {
4816+
isInterleaveWithShuffle = true;
4817+
SmallVector<int, 16> Mask;
4818+
// Prepare the interleave mask.
4819+
for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2;
4820+
i++) {
4821+
for (unsigned j = 0; j < 2; j++)
4822+
Mask.push_back(j * Factor + i);
4823+
}
4824+
4825+
NumLoadStores = Factor / MaxSupportedFactor;
4826+
ShuffleCost =
4827+
(Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
4828+
Mask, CostKind, 0, SubVecTy));
4829+
}
4830+
4831+
if (!UseMaskForGaps &&
4832+
(Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
48054833
unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
4806-
auto *SubVecTy =
4807-
VectorType::get(VecVTy->getElementType(),
4808-
VecVTy->getElementCount().divideCoefficientBy(Factor));
48094834

48104835
// ldN/stN only support legal vector types of size 64 or 128 in bits.
48114836
// Accesses having vector types that are a multiple of 128 bits can be
48124837
// matched to more than one ldN/stN instruction.
48134838
bool UseScalable;
48144839
if (MinElts % Factor == 0 &&
48154840
TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
4816-
return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
4841+
return (Factor *
4842+
TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) *
4843+
NumLoadStores) +
4844+
ShuffleCost;
48174845
}
48184846

48194847
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,

llvm/test/CodeGen/AArch64/vldn_shuffle.ll

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -730,6 +730,109 @@ entry:
730730
ret void
731731
}
732732

733+
define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
734+
<4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) {
735+
; CHECK-LABEL: store_factor8:
736+
; CHECK: .Lfunc_begin17:
737+
; CHECK-NEXT: .cfi_startproc
738+
; CHECK-NEXT: // %bb.0:
739+
; CHECK: zip1 [[V1:.*s]], [[I1:.*s]], [[I5:.*s]]
740+
; CHECK-NEXT: zip2 [[V5:.*s]], [[I1]], [[I5]]
741+
; CHECK-NEXT: zip1 [[V2:.*s]], [[I2:.*s]], [[I6:.*s]]
742+
; CHECK-NEXT: zip2 [[V6:.*s]], [[I2]], [[I6]]
743+
; CHECK-NEXT: zip1 [[V3:.*s]], [[I3:.*s]], [[I7:.*s]]
744+
; CHECK-NEXT: zip2 [[V7:.*s]], [[I3]], [[I7]]
745+
; CHECK-NEXT: zip1 [[V4:.*s]], [[I4:.*s]], [[I8:.*s]]
746+
; CHECK-NEXT: zip2 [[V8:.*s]], [[I4]], [[I8]]
747+
; CHECK-NEXT: st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64
748+
; CHECK-NEXT: st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0]
749+
; CHECK-NEXT: ret
750+
751+
%v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
752+
%v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
753+
%v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
754+
%v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
755+
756+
%s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
757+
%s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
758+
759+
%interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
760+
store <32 x i32> %interleaved.vec, ptr %ptr, align 4
761+
ret void
762+
}
763+
764+
define void @store_factor16(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3,
765+
<4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7,
766+
<4 x i32> %a8, <4 x i32> %a9, <4 x i32> %a10, <4 x i32> %a11,
767+
<4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) {
768+
; CHECK-LABEL: store_factor16:
769+
; CHECK: .Lfunc_begin18:
770+
; CHECK-NEXT: .cfi_startproc
771+
; CHECK-NEXT: // %bb.0:
772+
; CHECK: zip1 [[V05:.*s]], [[I05:.*s]], [[I13:.*s]]
773+
; CHECK-NEXT: zip1 [[V01:.*s]], [[I01:.*s]], [[I09:.*s]]
774+
; CHECK-NEXT: zip1 [[V02:.*s]], [[I02:.*s]], [[I10:.*s]]
775+
; CHECK-NEXT: zip1 [[V06:.*s]], [[I06:.*s]], [[I14:.*s]]
776+
; CHECK-NEXT: zip1 [[V07:.*s]], [[I07:.*s]], [[I15:.*s]]
777+
; CHECK-NEXT: zip1 [[V08:.*s]], [[I08:.*s]], [[I16:.*s]]
778+
; CHECK-NEXT: zip2 [[V09:.*s]], [[I01]], [[I09]]
779+
; CHECK-NEXT: zip1 [[V03:.*s]], [[I03:.*s]], [[I11:.*s]]
780+
; CHECK-NEXT: zip1 [[V04:.*s]], [[I04:.*s]], [[I12:.*s]]
781+
; CHECK-NEXT: zip2 [[V11:.*s]], [[I03]], [[I11]]
782+
; CHECK-NEXT: zip2 [[V12:.*s]], [[I04]], [[I12]]
783+
; CHECK-NEXT: zip2 [[V13:.*s]], [[I05]], [[I13]]
784+
; CHECK-NEXT: zip2 [[V10:.*s]], [[I02]], [[I10]]
785+
; CHECK-NEXT: zip1 [[V17:.*s]], [[V01]], [[V05]]
786+
; CHECK-NEXT: zip2 [[V21:.*s]], [[V01]], [[V05]]
787+
; CHECK-NEXT: zip2 [[V14:.*s]], [[I06]], [[I14]]
788+
; CHECK-NEXT: zip1 [[V18:.*s]], [[V02]], [[V06]]
789+
; CHECK-NEXT: zip2 [[V22:.*s]], [[V02]], [[V06]]
790+
; CHECK-NEXT: zip2 [[V15:.*s]], [[I07]], [[I15]]
791+
; CHECK-NEXT: zip1 [[V19:.*s]], [[V03]], [[V07]]
792+
; CHECK-NEXT: zip2 [[V23:.*s]], [[V03]], [[V07]]
793+
; CHECK-NEXT: zip2 [[V16:.*s]], [[I08]], [[I16]]
794+
; CHECK-NEXT: zip1 [[V20:.*s]], [[V04]], [[V08]]
795+
; CHECK-NEXT: zip2 [[V24:.*s]], [[V04]], [[V08]]
796+
; CHECK-NEXT: zip1 [[V25:.*s]], [[V09]], [[V13]]
797+
; CHECK-NEXT: zip1 [[V26:.*s]], [[V10]], [[V14]]
798+
; CHECK-NEXT: zip1 [[V27:.*s]], [[V11]], [[V15]]
799+
; CHECK-NEXT: zip1 [[V28:.*s]], [[V12]], [[V16]]
800+
; CHECK-NEXT: st4 { [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64
801+
; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload
802+
; CHECK-NEXT: st4 { [[V21]], [[V22]], [[V23]], [[V24]] }, [x8]
803+
; CHECK-NEXT: zip2 [[V29:.*s]], [[V09]], [[V13]]
804+
; CHECK-NEXT: add x8, x0, #128
805+
; CHECK-NEXT: zip2 [[V30:.*s]], [[V10]], [[V14]]
806+
; CHECK-NEXT: zip2 [[V31:.*s]], [[V11]], [[V15]]
807+
; CHECK-NEXT: zip2 [[V32:.*s]], [[V12]], [[V16]]
808+
; CHECK-NEXT: st4 { [[V25]], [[V26]], [[V27]], [[V28]] }, [x8]
809+
; CHECK-NEXT: add x8, x0, #192
810+
; CHECK-NEXT: st4 { [[V29]], [[V30]], [[V31]], [[V32]] }, [x8]
811+
; CHECK-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload
812+
; CHECK-NEXT: ret
813+
814+
%v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
815+
%v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
816+
%v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
817+
%v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
818+
%v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
819+
%v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
820+
%v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
821+
%v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
822+
823+
%s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
824+
%s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
825+
%s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
826+
%s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
827+
828+
%d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
829+
%d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
830+
831+
%interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 34, i32 38, i32 42, i32 46, i32 50, i32 54, i32 58, i32 62, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31, i32 35, i32 39, i32 43, i32 47, i32 51, i32 55, i32 59, i32 63>
832+
store <64 x i32> %interleaved.vec, ptr %ptr, align 4
833+
ret void
834+
}
835+
733836
declare void @llvm.dbg.value(metadata, metadata, metadata)
734837

735838
!llvm.dbg.cu = !{!0}

0 commit comments

Comments
 (0)