diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index b2132562ac3f..5d8bc0ae452b 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -27,6 +27,7 @@
 #include "llvm/CodeGenTypes/LowLevelType.h"
 #include "llvm/IR/InstrTypes.h"
 #include <functional>
+#include <optional>
 
 namespace llvm {
 
@@ -245,19 +246,33 @@ class CombinerHelper {
   /// or an implicit_def if \p Ops is empty.
   void applyCombineShuffleConcat(MachineInstr &MI, SmallVector<Register> &Ops);
 
-  /// Try to combine G_SHUFFLE_VECTOR into G_CONCAT_VECTORS.
+  /// A function type that returns either the next value of a shuffle mask
+  /// or an empty value. Each call yields exactly one value, like a Python
+  /// iterator or a Lisp stream.
+  using GeneratorType = std::function<std::optional<int32_t>()>;
+
+  /// Try to combine G_SHUFFLE_VECTOR into more efficient opcodes.
   /// Returns true if MI changed.
   ///
   /// \pre MI.getOpcode() == G_SHUFFLE_VECTOR.
   bool tryCombineShuffleVector(MachineInstr &MI);
 
-  /// Check if the G_SHUFFLE_VECTOR \p MI can be replaced by a
-  /// concat_vectors.
-  /// \p Ops will contain the operands needed to produce the flattened
-  /// concat_vectors.
+  /// Check if the G_SHUFFLE_VECTOR \p MI can be replaced, by testing
+  /// whether its shuffle mask matches the sequence produced by
+  /// \p Generator.
   ///
   /// \pre MI.getOpcode() == G_SHUFFLE_VECTOR.
-  bool matchCombineShuffleVector(MachineInstr &MI,
-                                 SmallVectorImpl<Register> &Ops);
+  bool matchCombineShuffleVector(MachineInstr &MI, GeneratorType Generator,
+                                 const size_t TargetDstSize);
+
+  /// Create G_UNMERGE_VALUES instructions until the source has been
+  /// narrowed down to the target vector size.
+  ///
+  /// Requires that the destination fits evenly in the source register.
+  /// \p DestinationIndex selects which of the destination-sized slices
+  /// of the source is extracted.
+  Register createUnmergeValue(MachineInstr &MI, const Register SrcReg,
+                              const Register DstReg, uint8_t DestinationIndex,
+                              const uint32_t Start, const uint32_t End);
+
   /// Replace \p MI with a concat_vectors with \p Ops.
 void applyCombineShuffleVector(MachineInstr &MI, const ArrayRef<Register> Ops);
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index ec7ca5dc8e2b..dcf1c601601f 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -42,6 +42,8 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetMachine.h"
 #include <cmath>
+#include <cstdint>
+#include <functional>
 #include <optional>
 #include <tuple>
 
@@ -384,17 +386,221 @@ void CombinerHelper::applyCombineShuffleConcat(MachineInstr &MI,
   MI.eraseFromParent();
 }
 
+// Create a stream of values from From to To (inclusive), advancing by
+// StepSize on each call.
+CombinerHelper::GeneratorType
+adderGenerator(const int32_t From, const int32_t To, const int32_t StepSize) {
+  int32_t Counter = From;
+  return [Counter, To, StepSize]() mutable {
+    std::optional<int32_t> OldCount = Counter;
+    Counter += StepSize;
+    if (OldCount == (To + StepSize))
+      OldCount = {};
+    return OldCount;
+  };
+}
+
+// Move to the next generator once the current one is exhausted, allowing
+// several generators to be chained.
+CombinerHelper::GeneratorType
+concatGenerators(SmallVector<CombinerHelper::GeneratorType> &Generators) {
+  auto *GeneratorIterator = Generators.begin();
+
+  return [GeneratorIterator, Generators]() mutable {
+    std::optional<int32_t> GenValue = (*GeneratorIterator)();
+    if (!GenValue.has_value() && GeneratorIterator != Generators.end()) {
+      GeneratorIterator++;
+      GenValue = (*GeneratorIterator)();
+    }
+    return GenValue;
+  };
+}
+
+Register CombinerHelper::createUnmergeValue(
+    MachineInstr &MI, const Register SrcReg, const Register DstReg,
+    const uint8_t DestinationIndex, const uint32_t Start, const uint32_t End) {
+  Builder.setInsertPt(*MI.getParent(), MI);
+  const LLT DstTy = MRI.getType(DstReg);
+  const LLT SrcTy = MRI.getType(SrcReg);
+  assert((DstTy.isScalar() ||
+          (SrcTy.getNumElements() % DstTy.getNumElements()) == 0) &&
+         "destination vector must divide source cleanly");
+
+  const unsigned HalfElements = SrcTy.getNumElements() / 2;
+  const LLT ScalarTy = SrcTy.getScalarType();
+  const LLT HalfSizeTy = (HalfElements == 1)
+                             ? ScalarTy
+                             : LLT::fixed_vector(HalfElements, ScalarTy);
+  const Register TmpReg = MRI.createGenericVirtualRegister(HalfSizeTy);
+  Register TargetReg = DstReg;
+  if (DstTy != HalfSizeTy) {
+    TargetReg = MRI.createGenericVirtualRegister(HalfSizeTy);
+  }
+
+  // Each destination fits n times into the source, and each iteration halves
+  // the source exactly. Therefore we need to pick which half to keep
+  // recursing into.
+  const uint32_t DstNumElements = DstTy.isVector() ? DstTy.getNumElements() : 1;
+  const uint32_t HalfWay = Start + ((End - Start) / 2);
+  const uint32_t Position = DestinationIndex * DstNumElements;
+
+  uint32_t NextStart, NextEnd;
+  if (Position < HalfWay) {
+    Builder.buildInstr(TargetOpcode::G_UNMERGE_VALUES, {TargetReg, TmpReg},
+                       {SrcReg});
+    NextStart = Start;
+    NextEnd = HalfWay;
+  } else {
+    Builder.buildInstr(TargetOpcode::G_UNMERGE_VALUES, {TmpReg, TargetReg},
+                       {SrcReg});
+    NextStart = HalfWay;
+    NextEnd = End;
+  }
+
+  if (HalfSizeTy.isVector() && DstTy != HalfSizeTy)
+    return createUnmergeValue(MI, TargetReg, DstReg, DestinationIndex,
+                              NextStart, NextEnd);
+
+  return DstReg;
+}
+
 bool CombinerHelper::tryCombineShuffleVector(MachineInstr &MI) {
+  const Register DstReg = MI.getOperand(0).getReg();
+  const Register SrcReg1 = MI.getOperand(1).getReg();
+  const Register SrcReg2 = MI.getOperand(2).getReg();
+
+  const LLT DstTy = MRI.getType(DstReg);
+  const LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+
+  const unsigned DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1;
+  const unsigned SrcNumElts = SrcTy.isVector() ? SrcTy.getNumElements() : 1;
+
+  // This test is a bit silly, but it is required because some tests rely on
+  // the legalizer changing the type of the shufflevector.
+  if (DstTy.getScalarSizeInBits() == 1)
+    return false;
+
+  // {0, 1, ..., n-1} -> G_CONCAT_VECTORS
+  // Turn a shuffle vector whose mask simply counts upwards into a
+  // concat_vectors instruction.
+  GeneratorType CountUp = adderGenerator(0, DstNumElts - 1, 1);
   SmallVector<Register> Ops;
-  if (matchCombineShuffleVector(MI, Ops)) {
+
+  if (matchCombineShuffleVector(MI, CountUp, 2 * SrcNumElts)) {
+    // The shuffle is concatenating multiple vectors together.
+    // Collect the different operands for that.
+    Register UndefReg;
+    const Register Src1 = MI.getOperand(1).getReg();
+    const Register Src2 = MI.getOperand(2).getReg();
+
+    const ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+
+    // The destination can be longer than the source, so we separate the mask
+    // into equal-sized blocks and check each block separately to see whether
+    // it can be copied whole.
+    unsigned NumConcat = DstNumElts / SrcNumElts;
+    unsigned Index = 0;
+    for (unsigned Concat = 0; Concat < NumConcat; Concat++) {
+      unsigned Target = (Concat + 1) * SrcNumElts;
+      while (Index < Target) {
+        int MaskElt = Mask[Index];
+        if (MaskElt >= 0) {
+          Ops.push_back((MaskElt < (int)SrcNumElts) ? Src1 : Src2);
+          break;
+        }
+        Index++;
+      }
+
+      if (Index == Target) {
+        if (!UndefReg) {
+          Builder.setInsertPt(*MI.getParent(), MI);
+          UndefReg = Builder.buildUndef(SrcTy).getReg(0);
+        }
+        Ops.push_back(UndefReg);
+      }
+
+      Index = Target;
+    }
+
     applyCombineShuffleVector(MI, Ops);
     return true;
   }
+
+  // {0, 1, ..., |DstVector|-1} -> G_UNMERGE_VALUES
+  // Extract from the source the first chunk with the same size as the
+  // destination vector.
+  GeneratorType FirstQuarter = adderGenerator(0, DstNumElts - 1, 1);
+  if (matchCombineShuffleVector(MI, FirstQuarter, DstNumElts - 1)) {
+    // This optimization does not work if the target type is not a multiple of
+    // two, which can happen in some backends that support uneven vector
+    // types. We also need to make sure that the vector can be split into two.
+    if (SrcTy == DstTy || ((SrcNumElts / 2) % 2) != 0 ||
+        SrcNumElts % DstNumElts != 0)
+      return false;
+    ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+    const Register TargetReg = Mask[0] < (int)SrcNumElts ? SrcReg1 : SrcReg2;
+    createUnmergeValue(MI, TargetReg, DstReg, 0, 0, SrcNumElts);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // {|DstVector|, |DstVector|+1, ..., 2*|DstVector|-1} -> G_UNMERGE_VALUES
+  // Extract from the source the second chunk with the same size as the
+  // destination vector.
+  GeneratorType SecondQuarter =
+      adderGenerator(DstNumElts, (DstNumElts * 2) - 1, 1);
+  if (matchCombineShuffleVector(MI, SecondQuarter, DstNumElts - 1)) {
+    if (((SrcNumElts / 2) % 2) != 0 || SrcNumElts % DstNumElts != 0)
+      return false;
+    ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+    const Register TargetReg = Mask[0] < (int)SrcNumElts ? SrcReg1 : SrcReg2;
+    createUnmergeValue(MI, TargetReg, DstReg, 1, 0, SrcNumElts);
+    MI.eraseFromParent();
+    return true;
+  }
+
+  // After this point, it is assumed our shufflevectors work on vectors that
+  // can be split in two.
+  if ((DstNumElts % 2) != 0)
+    return false;
+
+  // {0, 1, ..., n/2-1, n, n+1, ..., 3n/2-1} -> G_UNMERGE_VALUES
+  // Take the first halves of the two source vectors and concatenate them
+  // into one vector.
+  GeneratorType FirstEightA = adderGenerator(0, (DstNumElts / 2) - 1, 1);
+  GeneratorType FirstEightB =
+      adderGenerator(DstNumElts, DstNumElts + (DstNumElts / 2) - 1, 1);
+
+  auto UnmergeMatcher = SmallVector<GeneratorType>{FirstEightA, FirstEightB};
+  GeneratorType FirstAndThird = concatGenerators(UnmergeMatcher);
+  if (matchCombineShuffleVector(MI, FirstAndThird, (DstNumElts / 2) - 1)) {
+    if (DstNumElts <= 2)
+      return false;
+    const Register DstReg = MI.getOperand(0).getReg();
+    const LLT HalfSrcTy =
+        LLT::fixed_vector(SrcNumElts / 2, SrcTy.getScalarType());
+    const Register HalfOfA = createUnmergeValue(
+        MI, MI.getOperand(1).getReg(),
+        MRI.createGenericVirtualRegister(HalfSrcTy), 0, 0, SrcNumElts);
+    const Register HalfOfB = createUnmergeValue(
+        MI, MI.getOperand(2).getReg(),
+        MRI.createGenericVirtualRegister(HalfSrcTy), 0, 0, SrcNumElts);
+
+    const ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+    if (Mask[0] <= 0) {
+      Builder.buildMergeLikeInstr(DstReg, {HalfOfA, HalfOfB});
+    } else {
+      Builder.buildMergeLikeInstr(DstReg, {HalfOfB, HalfOfA});
+    }
+
+    MI.eraseFromParent();
+    return true;
+  }
+
   return false;
 }
 
 bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
-                                               SmallVectorImpl<Register> &Ops) {
+                                               GeneratorType Generator,
+                                               const size_t TargetDstSize) {
   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
          "Invalid instruction kind");
   LLT DstType = MRI.getType(MI.getOperand(0).getReg());
@@ -421,51 +627,24 @@ bool CombinerHelper::matchCombineShuffleVector(MachineInstr &MI,
   //
   // TODO: If the size between the source and destination don't match
   // we could still emit an extract vector element in that case.
-  if (DstNumElts < 2 * SrcNumElts && DstNumElts != 1)
-    return false;
-
-  // Check that the shuffle mask can be broken evenly between the
-  // different sources.
-  if (DstNumElts % SrcNumElts != 0)
+  if ((DstNumElts < TargetDstSize) && DstNumElts != 1)
     return false;
 
-  // Mask length is a multiple of the source vector length.
-  // Check if the shuffle is some kind of concatenation of the input
-  // vectors.
-  unsigned NumConcat = DstNumElts / SrcNumElts;
-  SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
   ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
   for (unsigned i = 0; i != DstNumElts; ++i) {
     int Idx = Mask[i];
+    const int32_t ShiftIndex = Generator().value_or(-1);
+
     // Undef value.
-    if (Idx < 0)
+    if (Idx < 0 || ShiftIndex < 0)
       continue;
+
     // Ensure the indices in each SrcType sized piece are sequential and that
     // the same source is used for the whole piece.
-    if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
-        (ConcatSrcs[i / SrcNumElts] >= 0 &&
-         ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts)))
+    if ((Idx % SrcNumElts != (ShiftIndex % SrcNumElts)))
       return false;
-    // Remember which source this index came from.
-    ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
   }
 
-  // The shuffle is concatenating multiple vectors together.
-  // Collect the different operands for that.
-  Register UndefReg;
-  Register Src2 = MI.getOperand(2).getReg();
-  for (auto Src : ConcatSrcs) {
-    if (Src < 0) {
-      if (!UndefReg) {
-        Builder.setInsertPt(*MI.getParent(), MI);
-        UndefReg = Builder.buildUndef(SrcType).getReg(0);
-      }
-      Ops.push_back(UndefReg);
-    } else if (Src == 0)
-      Ops.push_back(Src1);
-    else
-      Ops.push_back(Src2);
-  }
   return true;
 }
 
diff --git a/llvm/lib/Target/AIE/AIE2InstrPatterns.td b/llvm/lib/Target/AIE/AIE2InstrPatterns.td
index d363f40bab53..835bc7a86962 100644
--- a/llvm/lib/Target/AIE/AIE2InstrPatterns.td
+++ b/llvm/lib/Target/AIE/AIE2InstrPatterns.td
@@ -597,6 +597,18 @@ def : Pat<(int_aie2_vshuffle VEC512:$s1, VEC512:$s2, eR:$mod),
 def : Pat<(int_aie2_vshuffle_bf16 VEC512:$s1, VEC512:$s2, eR:$mod),
           (VSHUFFLE VEC512:$s1, VEC512:$s2, eR:$mod)>;
 
+// VSHUFFLE generic opcode translation
+def vshuffle_node : SDNode<"AIE2::G_AIE_VSHUFFLE",
+                           SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>]>>;
+def : GINodeEquiv<G_AIE_VSHUFFLE, vshuffle_node>;
+
+def : Pat<(v16i32 (vshuffle_node (v16i32 VEC512:$v0), (v16i32 VEC512:$v1), (i32 eR:$mode))),
+          (VSHUFFLE VEC512:$v0, VEC512:$v1, i32:$mode)>;
+def : Pat<(v32i16 (vshuffle_node (v32i16 VEC512:$v0), (v32i16 VEC512:$v1), (i32 eR:$mode))),
+          (VSHUFFLE VEC512:$v0, VEC512:$v1, i32:$mode)>;
+def : Pat<(v64i8 (vshuffle_node (v64i8 VEC512:$v0), (v64i8 VEC512:$v1), (i32 eR:$mode))),
+          (VSHUFFLE VEC512:$v0, VEC512:$v1, i32:$mode)>;
+
 // VSHIFT Intrinsic (shift/shiftx/shift_bytes)
 def : Pat<(int_aie2_vshift_I512_I512 VEC512:$s1, VEC512:$s2, 0x0, eR:$shift),
           (VSHIFT VEC512:$s1, VEC512:$s2, eR:$shift)>;
diff --git a/llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp b/llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp
index 37865902ad13..b514e8e84a02 100644
--- a/llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AIE/AIE2PreLegalizerCombiner.cpp
@@ -15,6 +15,7 @@
 #include "AIE2TargetMachine.h"
 #include "AIECombinerHelper.h"
+#include "MCTargetDesc/AIE2MCTargetDesc.h"
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
 #include "llvm/CodeGen/GlobalISel/Combiner.h"
 #include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
@@ -22,6 +23,7 @@
 #include "llvm/CodeGen/GlobalISel/GIMatchTableExecutorImpl.h"
 #include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
 #include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
 #include "llvm/IR/IntrinsicsAIE2.h"
 #include "llvm/InitializePasses.h"
@@ -66,6 +68,7 @@ class AIE2PreLegalizerCombinerImpl : public Combiner {
   bool tryCombineAll(MachineInstr &I) const override;
 
   bool tryCombineAllImpl(MachineInstr &I) const;
+  bool tryCombineShuffleVector(MachineInstr &MI) const;
 
   bool tryToCombineVectorShiftsByZero(MachineInstr &MI) const;
 
@@ -149,6 +152,91 @@ bool AIE2PreLegalizerCombinerImpl::tryToCombineIntrinsic(
   return false;
 }
 
+bool createVShuffle(MachineInstr &MI, const LLT TargetTy, const uint8_t Mode) {
+  MachineIRBuilder MIB(MI);
+  MachineRegisterInfo &MRI = *MIB.getMRI();
+  const Register DstReg = MI.getOperand(0).getReg();
+  const LLT DstTy = MRI.getType(DstReg);
+
+  if (DstTy != TargetTy)
+    return false;
+
+  const Register Src1 = MI.getOperand(1).getReg();
+  const Register Src2 = MI.getOperand(2).getReg();
+  const Register ShuffleModeReg =
+      MRI.createGenericVirtualRegister(LLT::scalar(32));
+
+  // This combiner only cares about the lower bits, so we can pad the
+  // vector to cover the case where two separate vectors are shuffled
+  // together.
+  MIB.buildConstant(ShuffleModeReg, Mode);
+  if (MRI.getType(Src1) == TargetTy) {
+    MIB.buildInstr(AIE2::G_AIE_VSHUFFLE, {DstReg},
+                   {Src1, Src2, ShuffleModeReg});
+  } else {
+    // We reuse the same register since we ignore the high part of the vector.
+    const Register TmpRegister = MRI.createGenericVirtualRegister(TargetTy);
+    MIB.buildConcatVectors(TmpRegister, {Src1, Src2});
+    MIB.buildInstr(AIE2::G_AIE_VSHUFFLE, {DstReg},
+                   {TmpRegister, TmpRegister, ShuffleModeReg});
+  }
+
+  MI.eraseFromParent();
+  return true;
+}
+
+// Generate an interleaving mask: visit the Partitions equal sections of
+// [0, To) round-robin, taking Increment consecutive elements from each
+// section per round.
+CombinerHelper::GeneratorType sectionGenerator(const int32_t From,
+                                               const int32_t To,
+                                               const int32_t Partitions,
+                                               const int32_t Increment) {
+  int32_t RoundSize = To / Partitions;
+  int32_t Index = 0;
+  int32_t Round = 0;
+
+  return [=]() mutable {
+    int32_t CurrentGroup = (Index / Increment) % Partitions;
+    int32_t GroupFirstElement = CurrentGroup * RoundSize;
+    int32_t IndexInGroup = Index % Increment;
+    int32_t OffsetGroup = Round * Increment;
+    int32_t Next = GroupFirstElement + IndexInGroup + OffsetGroup;
+    if (++Index % (Partitions * Increment) == 0)
+      Round++;
+
+    std::optional<int32_t> Return = Next;
+    if (Index == To + 1)
+      Return = {};
+    return Return;
+  };
+}
+
+bool AIE2PreLegalizerCombinerImpl::tryCombineShuffleVector(
+    MachineInstr &MI) const {
+  const Register DstReg = MI.getOperand(0).getReg();
+  const LLT DstTy = MRI.getType(DstReg);
+  const LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+  const unsigned DstNumElts = DstTy.isVector() ? DstTy.getNumElements() : 1;
+  const unsigned SrcNumElts = SrcTy.isVector() ?
SrcTy.getNumElements() : 1; + MachineIRBuilder MIB(MI); + MachineRegisterInfo &MRI = *MIB.getMRI(); + + if (Helper.tryCombineShuffleVector(MI)) + return true; + + const LLT V64S8 = LLT::fixed_vector(64, 8); + CombinerHelper::GeneratorType FourPartitions = + sectionGenerator(0, DstNumElts, 4, 1); + if (Helper.matchCombineShuffleVector(MI, FourPartitions, DstNumElts)) + return createVShuffle(MI, V64S8, 35); + + const LLT V32S16 = LLT::fixed_vector(32, 16); + CombinerHelper::GeneratorType FourPartitionByTwo = + sectionGenerator(0, DstNumElts, 4, 2); + if (Helper.matchCombineShuffleVector(MI, FourPartitionByTwo, DstNumElts)) + return createVShuffle(MI, V32S16, 29); + + return false; +} + bool AIE2PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { if (tryCombineAllImpl(MI)) return true; @@ -167,6 +255,9 @@ bool AIE2PreLegalizerCombinerImpl::tryCombineAll(MachineInstr &MI) const { case TargetOpcode::G_INTRINSIC: { return tryToCombineIntrinsic(MI); } + case TargetOpcode::G_SHUFFLE_VECTOR: { + return tryCombineShuffleVector(MI); + } default: break; } diff --git a/llvm/lib/Target/AIE/AIEInstrGISel.td b/llvm/lib/Target/AIE/AIEInstrGISel.td index 69154fa83819..6eea5bf96782 100644 --- a/llvm/lib/Target/AIE/AIEInstrGISel.td +++ b/llvm/lib/Target/AIE/AIEInstrGISel.td @@ -96,6 +96,12 @@ def G_AIE_BROADCAST_VECTOR : AIEGenericInstruction { let hasSideEffects = false; } +def G_AIE_VSHUFFLE : AIEGenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2, type1:$mode); + let hasSideEffects = false; +} + // Create a larger vector by padding undefined values in the high bits def G_AIE_PAD_VECTOR_UNDEF : AIEGenericInstruction { let OutOperandList = (outs type0:$dst); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir index 0de989f8be75..b87fdf8bc552 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shufflevector.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -o - -mtriple=aarch64-unknown-unknown -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s | FileCheck %s +# Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates --- name: shuffle_concat_1 @@ -101,7 +102,9 @@ body: | ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %a(<4 x s8>), %b(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %c(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) - ; CHECK-NEXT: %z:_(<16 x s8>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<16 x s8>), [[CONCAT_VECTORS1]], shufflemask(0, undef, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, undef, undef, undef, undef) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<16 x s8>) + ; CHECK-NEXT: %z:_(<16 x s8>) = G_CONCAT_VECTORS [[UV]](<8 x s8>), [[UV2]](<8 x s8>) ; CHECK-NEXT: $q0 = COPY %z(<16 x s8>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %p1:_(p0) = COPY $x0 @@ -179,7 +182,9 @@ body: | ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<4 x s8>) = G_IMPLICIT_DEF ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %a(<4 x s8>), %b(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) ; CHECK-NEXT: [[CONCAT_VECTORS1:%[0-9]+]]:_(<16 x s8>) = G_CONCAT_VECTORS %c(<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>), [[DEF]](<4 x s8>) - ; CHECK-NEXT: %z:_(<16 x s8>) = G_SHUFFLE_VECTOR [[CONCAT_VECTORS]](<16 x s8>), [[CONCAT_VECTORS1]], shufflemask(undef, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, undef, undef, undef, undef) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s8>), [[UV1:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS]](<16 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s8>), [[UV3:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[CONCAT_VECTORS1]](<16 x s8>) + ; CHECK-NEXT: %z:_(<16 x s8>) = G_CONCAT_VECTORS [[UV]](<8 x s8>), [[UV2]](<8 x s8>) ; CHECK-NEXT: $q0 = COPY %z(<16 x s8>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 %p1:_(p0) = COPY $x0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir index 2c9ae5b06b62..1d4651fe70b5 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple aarch64-apple-ios -run-pass=aarch64-prelegalizer-combiner %s -o - | FileCheck %s +# Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates # Check that we canonicalize shuffle_vector(Src1, Src2, mask(0,1,2,3)) # into concat_vector(Src1, Src2). 
@@ -270,8 +271,10 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<4 x s32>) = COPY $q1 - ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<4 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<4 x s32>), [[COPY1]], shufflemask(4, 5, 0, 1) - ; CHECK-NEXT: RET_ReallyLR implicit [[SHUF]](<4 x s32>) + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<2 x s32>), [[UV3:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY1]](<4 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<4 x s32>) = G_CONCAT_VECTORS [[UV2]](<2 x s32>), [[UV]](<2 x s32>) + ; CHECK-NEXT: RET_ReallyLR implicit [[CONCAT_VECTORS]](<4 x s32>) %0:_(<4 x s32>) = COPY $q0 %1:_(<4 x s32>) = COPY $q1 %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<4 x s32>), %1(<4 x s32>), shufflemask(4,5,0,1) diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index 749d6071c98d..89002fc9de43 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates ; CHECK-GI: warning: Instruction selection used fallback path for test_bitcastv2f32tov1f64 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for test_bitcastv1f64tov2f32 @@ -1776,19 +1777,10 @@ entry: } define <16 x i8> @test_concat_v16i8_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) #0 { -; CHECK-SD-LABEL: test_concat_v16i8_v16i8_v16i8: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_concat_v16i8_v16i8_v16i8: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: adrp x8, .LCPI126_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI126_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_concat_v16i8_v16i8_v16i8: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit30 = shufflevector <16 x i8> %x, <16 x i8> %y, <16 x i32> ret <16 x i8> %vecinit30 @@ -1803,9 +1795,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v16i8: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: adrp x8, .LCPI127_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov b2, v0.b[1] ; CHECK-GI-NEXT: mov b3, v0.b[2] ; CHECK-GI-NEXT: mov b4, v0.b[3] @@ -1814,14 +1804,13 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; CHECK-GI-NEXT: mov b7, v0.b[6] ; CHECK-GI-NEXT: mov b16, v0.b[7] ; CHECK-GI-NEXT: mov v0.b[1], v2.b[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI127_0] ; CHECK-GI-NEXT: mov v0.b[2], v3.b[0] ; CHECK-GI-NEXT: mov v0.b[3], v4.b[0] ; CHECK-GI-NEXT: mov 
v0.b[4], v5.b[0] ; CHECK-GI-NEXT: mov v0.b[5], v6.b[0] ; CHECK-GI-NEXT: mov v0.b[6], v7.b[0] ; CHECK-GI-NEXT: mov v0.b[7], v16.b[0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <8 x i8> %x, i32 0 @@ -1999,19 +1988,10 @@ entry: } define <8 x i16> @test_concat_v8i16_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) #0 { -; CHECK-SD-LABEL: test_concat_v8i16_v8i16_v8i16: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_concat_v8i16_v8i16_v8i16: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: adrp x8, .LCPI130_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI130_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_concat_v8i16_v8i16_v8i16: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit14 = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> ret <8 x i16> %vecinit14 @@ -2026,17 +2006,14 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v8i16: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: adrp x8, .LCPI131_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov h2, v0.h[1] ; CHECK-GI-NEXT: mov h3, v0.h[2] ; CHECK-GI-NEXT: mov h4, v0.h[3] ; CHECK-GI-NEXT: mov v0.h[1], v2.h[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI131_0] ; CHECK-GI-NEXT: mov v0.h[2], v3.h[0] ; CHECK-GI-NEXT: mov v0.h[3], v4.h[0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: ret entry: %vecext = extractelement <4 x i16> %x, i32 0 @@ -2142,19 +2119,10 @@ entry: } define <4 x i32> @test_concat_v4i32_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) #0 { -; CHECK-SD-LABEL: test_concat_v4i32_v4i32_v4i32: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: test_concat_v4i32_v4i32_v4i32: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: adrp x8, .LCPI134_0 -; CHECK-GI-NEXT: // kill: def $q0 killed $q0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI134_0] -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b -; CHECK-GI-NEXT: ret +; CHECK-LABEL: test_concat_v4i32_v4i32_v4i32: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: ret entry: %vecinit6 = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> ret <4 x i32> %vecinit6 @@ -2169,13 +2137,10 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 { ; ; CHECK-GI-LABEL: test_concat_v4i32_v2i32_v4i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0_q1 def $q0_q1 -; CHECK-GI-NEXT: adrp x8, .LCPI135_0 -; CHECK-GI-NEXT: // kill: def $q1 killed $q1 killed $q0_q1 def $q0_q1 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov s2, v0.s[1] ; CHECK-GI-NEXT: mov v0.s[1], v2.s[0] -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI135_0] -; CHECK-GI-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v2.16b +; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; 
CHECK-GI-NEXT: ret entry: %vecext = extractelement <2 x i32> %x, i32 0 diff --git a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll index 2c5d33da93c8..6f095c59f2a6 100644 --- a/llvm/test/CodeGen/AArch64/ext-narrow-index.ll +++ b/llvm/test/CodeGen/AArch64/ext-narrow-index.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc < %s -global-isel -mtriple=aarch64 | FileCheck %s --check-prefixes=CHECK,CHECK-GISEL +; Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates ; Tests of shufflevector where the index operand is half the width of the vector ; operands. We should get one ext instruction and not two. @@ -42,8 +43,7 @@ define <8 x i8> @i8_off8(<16 x i8> %arg1, <16 x i8> %arg2) { ; ; CHECK-GISEL-LABEL: i8_off8: ; CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: ext v0.16b, v0.16b, v1.16b, #8 -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GISEL-NEXT: mov d0, v0.d[1] ; CHECK-GISEL-NEXT: ret entry: %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> %arg2, <8 x i32> @@ -254,9 +254,7 @@ define <8 x i8> @i8_zero_off8(<16 x i8> %arg1) { ; ; CHECK-GISEL-LABEL: i8_zero_off8: ; CHECK-GISEL: // %bb.0: // %entry -; CHECK-GISEL-NEXT: movi v1.2d, #0000000000000000 -; CHECK-GISEL-NEXT: ext v0.16b, v0.16b, v1.16b, #8 -; CHECK-GISEL-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GISEL-NEXT: mov d0, v0.d[1] ; CHECK-GISEL-NEXT: ret entry: %shuffle = shufflevector <16 x i8> %arg1, <16 x i8> zeroinitializer, <8 x i32> diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 3254c5ebe9c6..0ef0e6e22922 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -3,6 +3,7 @@ ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-DOT ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE ; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel -global-isel-abort=2 %s -o - -mattr=+dotprod 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT +; Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. 
or its affiliates define i32 @addv_v2i32(<2 x i32> %a) { ; CHECK-LABEL: addv_v2i32: @@ -3744,17 +3745,13 @@ define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i1 ; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ushll v4.4s, v0.4h, #0 -; CHECK-GI-NEXT: ushll2 v0.4s, v0.8h, #0 ; CHECK-GI-NEXT: ushll v5.4s, v1.4h, #0 -; CHECK-GI-NEXT: ushll2 v1.4s, v1.8h, #0 ; CHECK-GI-NEXT: ushll v6.4s, v2.4h, #0 -; CHECK-GI-NEXT: ushll2 v2.4s, v2.8h, #0 ; CHECK-GI-NEXT: ushll v7.4s, v3.4h, #0 -; CHECK-GI-NEXT: ushll2 v3.4s, v3.8h, #0 -; CHECK-GI-NEXT: add v0.4s, v4.4s, v0.4s -; CHECK-GI-NEXT: add v1.4s, v5.4s, v1.4s -; CHECK-GI-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-GI-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-GI-NEXT: uaddw2 v0.4s, v4.4s, v0.8h +; CHECK-GI-NEXT: uaddw2 v1.4s, v5.4s, v1.8h +; CHECK-GI-NEXT: uaddw2 v2.4s, v6.4s, v2.8h +; CHECK-GI-NEXT: uaddw2 v3.4s, v7.4s, v3.8h ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: add v1.4s, v2.4s, v3.4s ; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-aie-vshuffle.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-aie-vshuffle.mir new file mode 100644 index 000000000000..489db1e15e7c --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/inst-select-aie-vshuffle.mir @@ -0,0 +1,83 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# +# RUN: llc -mtriple aie2 -run-pass=instruction-select %s -verify-machineinstrs -o - | FileCheck %s + +--- +name: vshuffle_32_m35 +legalized: true +regBankSelected: true +tracksRegLiveness: true +stack: + - { id: 0, name: "", size: 128, alignment: 32 } +body: | + bb.0.entry: + liveins: $x2 + ; CHECK-LABEL: name: vshuffle_32_m35 + ; CHECK: liveins: $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x2 + ; CHECK-NEXT: [[MOV_RLC_imm10_pseudo:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 29 + ; CHECK-NEXT: [[VSHUFFLE:%[0-9]+]]:vec512 = VSHUFFLE [[COPY]], [[COPY]], [[MOV_RLC_imm10_pseudo]] + ; CHECK-NEXT: $x0 = COPY [[VSHUFFLE]] + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:vregbank(<16 x s32>) = COPY $x2 + %2:gprregbank(s32) = G_CONSTANT i32 29 + %0:vregbank(<16 x s32>) = G_AIE_VSHUFFLE %1:vregbank, %1:vregbank, %2:gprregbank(s32) + $x0 = COPY %0:vregbank(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: vshuffle_16_m35 +legalized: true +regBankSelected: true +tracksRegLiveness: true +stack: + - { id: 0, name: "", size: 128, alignment: 32 } +body: | + bb.0.entry: + liveins: $x2 + ; CHECK-LABEL: name: vshuffle_16_m35 + ; CHECK: liveins: $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x2 + ; CHECK-NEXT: [[MOV_RLC_imm10_pseudo:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 29 + ; CHECK-NEXT: [[VSHUFFLE:%[0-9]+]]:vec512 = VSHUFFLE [[COPY]], [[COPY]], [[MOV_RLC_imm10_pseudo]] + ; CHECK-NEXT: $x0 = COPY [[VSHUFFLE]] + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:vregbank(<32 x s16>) = COPY $x2 + %2:gprregbank(s32) = G_CONSTANT i32 29 + %0:vregbank(<32 x s16>) = G_AIE_VSHUFFLE %1:vregbank, %1:vregbank, %2:gprregbank(s32) + $x0 = COPY %0:vregbank(<32 x s16>) + PseudoRET implicit $lr, implicit $x0 +... 
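
The VSHUFFLE modes exercised by these instruction-selection tests are produced by the AIE2 pre-legalizer combiner above, whose sectionGenerator builds the interleaving masks it matches (createVShuffle then emits mode 35 for v64s8 and mode 29 for v32s16). A minimal standalone sketch of the mask sectionGenerator yields, assuming plain C++ in place of the in-tree CombinerHelper::GeneratorType closure:

#include <cstdint>
#include <iostream>

int main() {
  // sectionGenerator(0, 8, /*Partitions=*/4, /*Increment=*/1) walks the
  // four 2-element sections of [0, 8) round-robin: 0 2 4 6 1 3 5 7.
  const int32_t To = 8, Partitions = 4, Increment = 1;
  const int32_t RoundSize = To / Partitions;
  int32_t Round = 0;
  for (int32_t Index = 0; Index < To;) {
    int32_t Group = (Index / Increment) % Partitions;
    int32_t Next = Group * RoundSize + Index % Increment + Round * Increment;
    std::cout << Next << ' ';
    if (++Index % (Partitions * Increment) == 0)
      ++Round;
  }
}
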
+ +--- +name: vshuffle_8_m35 +legalized: true +regBankSelected: true +tracksRegLiveness: true +stack: + - { id: 0, name: "", size: 128, alignment: 32 } +body: | + bb.0.entry: + liveins: $x2 + ; CHECK-LABEL: name: vshuffle_8_m35 + ; CHECK: liveins: $x2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vec512 = COPY $x2 + ; CHECK-NEXT: [[MOV_RLC_imm10_pseudo:%[0-9]+]]:er = MOV_RLC_imm10_pseudo 29 + ; CHECK-NEXT: [[VSHUFFLE:%[0-9]+]]:vec512 = VSHUFFLE [[COPY]], [[COPY]], [[MOV_RLC_imm10_pseudo]] + ; CHECK-NEXT: $x0 = COPY [[VSHUFFLE]] + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:vregbank(<64 x s8>) = COPY $x2 + %2:gprregbank(s32) = G_CONSTANT i32 29 + %0:vregbank(<64 x s8>) = G_AIE_VSHUFFLE %1:vregbank, %1:vregbank, %2:gprregbank(s32) + $x0 = COPY %0:vregbank(<64 x s8>) + PseudoRET implicit $lr, implicit $x0 diff --git a/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir new file mode 100644 index 000000000000..6b8785691eb0 --- /dev/null +++ b/llvm/test/CodeGen/AIE/aie2/GlobalISel/prelegalizercombiner-shufflevector.mir @@ -0,0 +1,986 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023-2024 Advanced Micro Devices, Inc. or its affiliates +# RUN: llc -mtriple aie2 -run-pass=aie2-prelegalizer-combiner %s -verify-machineinstrs -o - | FileCheck %s + +--- +name: concat_vector_32_512 +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512 + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_1024 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: concat_vector_32_1024 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s32>) = G_CONCAT_VECTORS [[COPY]](<16 x s32>), [[COPY1]](<16 x s32>) + ; CHECK-NEXT: $y2 = COPY [[CONCAT_VECTORS]](<32 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $y2 + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %0:_(<32 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31) + $y2 = COPY %0:_(<32 x s32>) + PseudoRET implicit $lr, implicit $y2 +... 
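
The counting masks in the concat_vector tests above are recognized by matching them against adderGenerator. A minimal standalone sketch of the generator protocol, assuming only the C++ standard library (the in-tree version returns CombinerHelper::GeneratorType and is consumed by matchCombineShuffleVector rather than a loop):

#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>

using GeneratorType = std::function<std::optional<int32_t>()>;

// Yields From, From + StepSize, ..., To, then an empty optional.
GeneratorType adderGenerator(int32_t From, int32_t To, int32_t StepSize) {
  int32_t Counter = From;
  return [Counter, To, StepSize]() mutable {
    std::optional<int32_t> OldCount = Counter;
    Counter += StepSize;
    if (OldCount == To + StepSize)
      OldCount = std::nullopt;
    return OldCount;
  };
}

int main() {
  GeneratorType Gen = adderGenerator(0, 3, 1);
  while (std::optional<int32_t> V = Gen())
    std::cout << *V << ' '; // prints: 0 1 2 3
}
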
+ +--- +name: concat_vector_32_256 +legalized: false +body: | + bb.1.entry: + liveins: $wl0 + ; CHECK-LABEL: name: concat_vector_32_256 + ; CHECK: liveins: $wl0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[COPY]](<8 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[UV1]](<4 x s32>), [[UV]](<4 x s32>) + ; CHECK-NEXT: $wl0 = COPY [[CONCAT_VECTORS]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $wl0 + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<4 x s32>), %3:_(<4 x s32>) = G_UNMERGE_VALUES %1:_(<8 x s32>) + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %3:_(<4 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7) + $wl0 = COPY %0:_(<8 x s32>) + PseudoRET implicit $lr, implicit $wl0 +... + +--- +name: concat_vector_16_512 +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_16_512 + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s16>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s16>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[COPY]](<16 x s16>), [[COPY1]](<16 x s16>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<32 x s16>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<16 x s16>) = COPY $wl2 + %2:_(<16 x s16>) = COPY $wl4 + %0:_(<32 x s16>) = G_SHUFFLE_VECTOR %1:_(<16 x s16>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31) + $x0 = COPY %0:_(<32 x s16>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_8_512 +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_8_512 + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s8>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<32 x s8>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<64 x s8>) = G_CONCAT_VECTORS [[COPY]](<32 x s8>), [[COPY1]](<32 x s8>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<64 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<32 x s8>) = COPY $wl2 + %2:_(<32 x s8>) = COPY $wl4 + %0:_(<64 x s8>) = G_SHUFFLE_VECTOR %1:_(<32 x s8>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63) + $x0 = COPY %0:_(<64 x s8>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_second_end +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_second_end + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, -1, -1, -1, -1, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
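
concat_vector_32_512_second_end above shows that undef mask entries still match: the rewritten matchCombineShuffleVector compares each defined mask element with the generator's value modulo SrcNumElts and skips -1 on either side. A standalone sketch of that check, with masksMatch as a hypothetical stand-in for the in-tree loop:

#include <cstdint>
#include <functional>
#include <optional>
#include <vector>

using GeneratorType = std::function<std::optional<int32_t>()>;

// Every defined mask element must equal the generator's value modulo
// SrcNumElts; undef entries (-1) on either side match anything.
bool masksMatch(const std::vector<int> &Mask, GeneratorType Gen,
                int SrcNumElts) {
  for (int Idx : Mask) {
    const int32_t ShiftIndex = Gen().value_or(-1);
    if (Idx < 0 || ShiftIndex < 0)
      continue;
    if (Idx % SrcNumElts != ShiftIndex % SrcNumElts)
      return false;
  }
  return true;
}

int main() {
  // The mask of concat_vector_32_512_second_end: a counting mask with a
  // trailing undef run still matches the counting generator.
  std::vector<int> Mask = {0, 1, 2, 3, 4, 5, 6, 7,
                           8, 9, 10, -1, -1, -1, -1, -1};
  int32_t Counter = 0;
  GeneratorType CountUp = [&]() -> std::optional<int32_t> {
    return Counter < 16 ? std::optional<int32_t>(Counter++) : std::nullopt;
  };
  return masksMatch(Mask, CountUp, 8) ? 0 : 1; // matches -> exit code 0
}
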
+ +--- +name: extract_vector_1024_to_512 +legalized: false +body: | + bb.1.entry: + liveins: $y2 + ; CHECK-LABEL: name: extract_vector_1024_to_512 + ; CHECK: liveins: $y2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: $x0 = COPY [[UV]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<32 x s32>) = COPY $y2 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %1:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_first_start +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_first_start + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(-1, -1, -1, -1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: extract_vector_1024_to_256 +legalized: false +body: | + bb.1.entry: + liveins: $y2 + ; CHECK-LABEL: name: extract_vector_1024_to_256 + ; CHECK: liveins: $y2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[UV]](<16 x s32>) + ; CHECK-NEXT: $wl0 = COPY [[UV2]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<32 x s32>) = COPY $y2 + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %1:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7) + $wl0 = COPY %0:_(<8 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_first_end +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_first_end + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
+ +--- +name: concat_vector_32_512_second_start +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_second_start + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_first_block +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_first_block + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[DEF]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(-1, -1, -1, -1, -1, -1, -1, -1, 8, 9, 10, 11, 12, 13, 14, 15) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_second_block +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_second_block + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[DEF]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, -1, -1, -1, -1, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_32_512_random +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_32_512_random + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[COPY1]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(0, -1, 2, -1, 4, -1, -1, 7, 8, 9, -1, 11, 12, -1, 14, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
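
The first_block and second_block tests above show how the concat path of tryCombineShuffleVector maps each SrcNumElts-wide block of the mask to one operand: the first defined element in a block picks the source, and an all-undef block becomes a G_IMPLICIT_DEF. A small standalone sketch of that classification:

#include <cstdio>
#include <vector>

int main() {
  // Mask of concat_vector_32_512_first_block: DstNumElts = 16 and
  // SrcNumElts = 8, so the mask splits into two 8-wide blocks.
  const int SrcNumElts = 8;
  std::vector<int> Mask = {-1, -1, -1, -1, -1, -1, -1, -1,
                           8,  9,  10, 11, 12, 13, 14, 15};
  for (size_t Block = 0; Block < Mask.size() / SrcNumElts; ++Block) {
    const char *Op = "G_IMPLICIT_DEF";
    for (int I = 0; I < SrcNumElts; ++I) {
      int Elt = Mask[Block * SrcNumElts + I];
      if (Elt >= 0) {
        Op = Elt < SrcNumElts ? "Src1" : "Src2";
        break;
      }
    }
    std::printf("block %zu -> %s\n", Block, Op);
  }
  // prints: block 0 -> G_IMPLICIT_DEF, block 1 -> Src2
}
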
+ +--- +name: extract_vector_1024_to_128 +legalized: false +body: | + bb.1.entry: + liveins: $y2 + ; CHECK-LABEL: name: extract_vector_1024_to_128 + ; CHECK: liveins: $y2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[UV]](<16 x s32>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[UV2]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_UNPAD_VECTOR]](<4 x s32>) + %1:_(<32 x s32>) = COPY $y2 + %0:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %1:_, shufflemask(0, 1, 2, 3) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_1024_to_32 +legalized: false +body: | + bb.1.entry: + liveins: $y2 + ; CHECK-LABEL: name: extract_vector_1024_to_32 + ; CHECK: liveins: $y2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<32 x s8>), [[UV3:%[0-9]+]]:_(<32 x s8>) = G_UNMERGE_VALUES [[UV]](<64 x s8>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_AIE_UNPAD_VECTOR [[UV2]](<32 x s8>) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<8 x s8>), [[UV5:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[AIE_UNPAD_VECTOR]](<16 x s8>) + ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<4 x s8>), [[UV7:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[UV4]](<8 x s8>) + ; CHECK-NEXT: [[UV8:%[0-9]+]]:_(<2 x s8>), [[UV9:%[0-9]+]]:_(<2 x s8>) = G_UNMERGE_VALUES [[UV6]](<4 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV8]](<2 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %0:_(<2 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_, shufflemask(0, 1) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_second_half_512_to_256 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_second_half_512_to_256 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<8 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %1:_(<16 x s32>), shufflemask(8, 9, 10, 11, 12, 13, 14, 15) + PseudoRET implicit $lr, implicit %2 +... + +--- +name: extract_vector_second_half_512_to_128 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_second_half_512_to_128 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<4 x s32>), [[UV3:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[UV]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV3]](<4 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %1:_(<16 x s32>), shufflemask(4, 5, 6, 7) + PseudoRET implicit $lr, implicit %2 +... 
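
The chains of G_UNMERGE_VALUES in the extract tests above come from createUnmergeValue, which repeatedly halves the source until it reaches the destination width. A pure-arithmetic sketch of the recursion, using extract_vector_second_half_512_to_128 (elements 4..7 of a 16-element source, i.e. slice 1 of width 4) as the worked example:

#include <cstdint>
#include <iostream>

// Each step stands for one G_UNMERGE_VALUES: halve the live range
// [Start, End) and recurse into the half containing the requested slice.
void traceUnmerge(uint32_t DstElts, uint32_t DstIndex, uint32_t Start,
                  uint32_t End) {
  if (End - Start == DstElts)
    return; // the source now matches the destination size
  const uint32_t HalfWay = Start + (End - Start) / 2;
  const uint32_t Position = DstIndex * DstElts;
  if (Position < HalfWay) {
    std::cout << "unmerge, keep low half  [" << Start << ", " << HalfWay
              << ")\n";
    traceUnmerge(DstElts, DstIndex, Start, HalfWay);
  } else {
    std::cout << "unmerge, keep high half [" << HalfWay << ", " << End
              << ")\n";
    traceUnmerge(DstElts, DstIndex, HalfWay, End);
  }
}

int main() {
  traceUnmerge(/*DstElts=*/4, /*DstIndex=*/1, 0, 16);
  // prints: keep low half [0, 8), then keep high half [4, 8)
}
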
+ +--- +name: extract_vector_second_half_1024_to_512 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_second_half_1024_to_512 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<64 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<64 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_(<128 x s8>), shufflemask(64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127) + PseudoRET implicit $lr, implicit %2 +... + +--- +name: extract_vector_second_half_1024_to_32 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_second_half_1024_to_32 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<32 x s8>), [[UV3:%[0-9]+]]:_(<32 x s8>) = G_UNMERGE_VALUES [[UV]](<64 x s8>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<16 x s8>) = G_AIE_UNPAD_VECTOR [[UV2]](<32 x s8>) + ; CHECK-NEXT: [[UV4:%[0-9]+]]:_(<8 x s8>), [[UV5:%[0-9]+]]:_(<8 x s8>) = G_UNMERGE_VALUES [[AIE_UNPAD_VECTOR]](<16 x s8>) + ; CHECK-NEXT: [[UV6:%[0-9]+]]:_(<4 x s8>), [[UV7:%[0-9]+]]:_(<4 x s8>) = G_UNMERGE_VALUES [[UV4]](<8 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV7]](<4 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<4 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %1:_(<128 x s8>), shufflemask(4, 5, 6, 7) + PseudoRET implicit $lr, implicit %2 +... + +--- +name: extract_vector_third_half_1024 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_third_half_1024 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<16 x s32>) + %1:_(<32 x s32>) = COPY $y2 + %2:_(<32 x s32>) = COPY $y3 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %2:_, shufflemask(32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_third_half_512 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_third_half_512 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<8 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_, shufflemask(16, 17, 18, 19, 20, 21, 22, 23) + PseudoRET implicit $lr, implicit %0 +... 
+ +--- +name: extract_vector_third_half_256 +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: extract_vector_third_half_256 + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[AIE_UNPAD_VECTOR]](<4 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %0:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_third_half_128 +legalized: false +body: | + bb.1.entry: + liveins: $q0, $q1 + ; CHECK-LABEL: name: extract_vector_third_half_128 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV]](<2 x s32>) + %1:_(<4 x s32>) = COPY $q0 + %2:_(<4 x s32>) = COPY $q1 + %0:_(<2 x s32>) = G_SHUFFLE_VECTOR %1:_(<4 x s32>), %2:_, shufflemask(4, 5) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_1024 +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: extract_vector_fourth_half_1024 + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s32>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<16 x s32>), [[UV1:%[0-9]+]]:_(<16 x s32>) = G_UNMERGE_VALUES [[COPY]](<32 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<16 x s32>) + %1:_(<32 x s32>) = COPY $y2 + %2:_(<32 x s32>) = COPY $y3 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<32 x s32>), %2:_, shufflemask(48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_512 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: extract_vector_fourth_half_512 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<8 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %0:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_, shufflemask(24,25,26,27,28,29,30,31) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: extract_vector_fourth_half_256 +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: extract_vector_fourth_half_256 + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<4 x s32>), [[UV1:%[0-9]+]]:_(<4 x s32>) = G_UNMERGE_VALUES [[COPY]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<4 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %0:_(<4 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(12,13,14,15) + PseudoRET implicit $lr, implicit %0 +... 
+ +--- +name: extract_vector_fourth_half_128 +legalized: false +body: | + bb.1.entry: + liveins: $q0, $q1 + ; CHECK-LABEL: name: extract_vector_fourth_half_128 + ; CHECK: liveins: $q0, $q1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<2 x s32>), [[UV1:%[0-9]+]]:_(<2 x s32>) = G_UNMERGE_VALUES [[COPY]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[UV1]](<2 x s32>) + %1:_(<4 x s32>) = COPY $q0 + %2:_(<4 x s32>) = COPY $q1 + %0:_(<2 x s32>) = G_SHUFFLE_VECTOR %1:_(<4 x s32>), %2:_, shufflemask(6,7) + PseudoRET implicit $lr, implicit %0 +... + +--- +name: insert_vector_16_elements +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: insert_vector_16_elements + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[UV]](<8 x s32>), [[UV2]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<16 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %3:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_(<16 x s32>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_8_elements +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: insert_vector_8_elements + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY1]](<8 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[AIE_UNPAD_VECTOR]](<4 x s32>), [[AIE_UNPAD_VECTOR1]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<8 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %3:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_(<8 x s32>), shufflemask(0, 1, 2, 3, 8, 9, 10, 11) + PseudoRET implicit $lr, implicit %3 +... 
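The insert_vector_* group pairs one contiguous run from each source: shufflemask (0...7, 16...23) over two 16-lane operands reads the low half of each, so the expected MIR is two G_UNMERGE_VALUES (or G_AIE_UNPAD_VECTOR for 256-bit operands) feeding a single G_CONCAT_VECTORS, and the *_reverse variants below simply emit the two halves in swapped order. A rough sketch of that run decomposition, again over a plain mask and with illustrative names:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"

    // Illustrative only: split Mask into runs of HalfSize lanes and record
    // which aligned half of the concatenated sources each run copies, or -1
    // when a run is not one contiguous, aligned half.
    static void splitIntoHalves(llvm::ArrayRef<int> Mask, int HalfSize,
                                llvm::SmallVectorImpl<int> &Halves) {
      for (int I = 0, E = Mask.size(); I < E; I += HalfSize) {
        int First = Mask[I];
        bool IsAlignedRun = First >= 0 && First % HalfSize == 0;
        for (int J = 1; J < HalfSize && IsAlignedRun; ++J)
          IsAlignedRun = Mask[I + J] == First + J;
        Halves.push_back(IsAlignedRun ? First / HalfSize : -1);
      }
    }

For insert_vector_16_elements this yields halves {0, 2}, i.e. the low half of each operand, matching the G_CONCAT_VECTORS [[UV]], [[UV2]] in the checks.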
+ +--- +name: insert_vector_128_elements +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: insert_vector_128_elements + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<128 x s8>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<64 x s8>), [[UV3:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY1]](<128 x s8>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<128 x s8>) = G_CONCAT_VECTORS [[UV]](<64 x s8>), [[UV2]](<64 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<128 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<128 x s8>) = COPY $y3 + %3:_(<128 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %2:_(<128 x s8>), shufflemask(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_16_elements_reverse +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: insert_vector_16_elements_reverse + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<8 x s32>), [[UV1:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY]](<16 x s32>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<8 x s32>), [[UV3:%[0-9]+]]:_(<8 x s32>) = G_UNMERGE_VALUES [[COPY1]](<16 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[UV2]](<8 x s32>), [[UV]](<8 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<16 x s32>) + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %3:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_(<16 x s32>), shufflemask(16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: insert_vector_8_elements_reverse +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: insert_vector_8_elements_reverse + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl1 + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY]](<8 x s32>) + ; CHECK-NEXT: [[AIE_UNPAD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_AIE_UNPAD_VECTOR [[COPY1]](<8 x s32>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<8 x s32>) = G_CONCAT_VECTORS [[AIE_UNPAD_VECTOR1]](<4 x s32>), [[AIE_UNPAD_VECTOR]](<4 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<8 x s32>) + %1:_(<8 x s32>) = COPY $wl0 + %2:_(<8 x s32>) = COPY $wl1 + %3:_(<8 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_(<8 x s32>), shufflemask(8, 9, 10, 11, 0, 1, 2, 3) + PseudoRET implicit $lr, implicit %3 +... 
+ +--- +name: insert_vector_128_elements_reverse +legalized: false +body: | + bb.1.entry: + liveins: $y2, $y3 + ; CHECK-LABEL: name: insert_vector_128_elements_reverse + ; CHECK: liveins: $y2, $y3 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<128 x s8>) = COPY $y2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<128 x s8>) = COPY $y3 + ; CHECK-NEXT: [[UV:%[0-9]+]]:_(<64 x s8>), [[UV1:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY]](<128 x s8>) + ; CHECK-NEXT: [[UV2:%[0-9]+]]:_(<64 x s8>), [[UV3:%[0-9]+]]:_(<64 x s8>) = G_UNMERGE_VALUES [[COPY1]](<128 x s8>) + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<128 x s8>) = G_CONCAT_VECTORS [[UV2]](<64 x s8>), [[UV]](<64 x s8>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit [[CONCAT_VECTORS]](<128 x s8>) + %1:_(<128 x s8>) = COPY $y2 + %2:_(<128 x s8>) = COPY $y3 + %3:_(<128 x s8>) = G_SHUFFLE_VECTOR %1:_(<128 x s8>), %2:_(<128 x s8>), shufflemask(128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63) + PseudoRET implicit $lr, implicit %3 +... + +--- +name: concat_vector_reverse_32_512 +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512 + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_undef_start_first +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_undef_start_first + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(-1, -1, -1, -1, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
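The concat_vector_reverse_32_512 family, continued by the *_start_end, *_end_start, *_end_end, *_block, and *_random variants below, checks that undef lanes (-1) behave as wildcards: the reversed concat is still recognised whenever every defined lane agrees with the expected sequence, and a run that is entirely undef (the *_block cases) may be sourced from a G_IMPLICIT_DEF instead of a copy. A sketch of that comparison in the spirit of the patch's generator-based matching; maskMatchesGenerator is an illustrative name:

    #include <cstdint>
    #include <functional>
    #include <optional>

    #include "llvm/ADT/ArrayRef.h"

    // Illustrative only: walk Mask against the lane indices produced by
    // Next(); undef lanes (-1) accept whatever value the generator expects
    // at that position.
    static bool
    maskMatchesGenerator(llvm::ArrayRef<int> Mask,
                         std::function<std::optional<int32_t>()> Next) {
      for (int Lane : Mask) {
        std::optional<int32_t> Expected = Next();
        if (!Expected)
          return false; // Sequence ended before the mask did.
        if (Lane != -1 && Lane != *Expected)
          return false;
      }
      return true;
    }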
+ +--- +name: concat_vector_reverse_32_512_start_end +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_start_end + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_end_start +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_end_start + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_end_end +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_end_end + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, -1, -1, -1, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_first_block +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_first_block + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[DEF]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(-1, -1, -1, -1, -1, -1, -1, -1, 0, 1, 2, 3, 4, 5, 6, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... 
+ +--- +name: concat_vector_reverse_32_512_second_block +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_second_block + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<8 x s32>) = G_IMPLICIT_DEF + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY]](<8 x s32>), [[DEF]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: concat_vector_reverse_32_512_random +legalized: false +body: | + bb.1.entry: + liveins: $wl2, $wl4 + ; CHECK-LABEL: name: concat_vector_reverse_32_512_random + ; CHECK: liveins: $wl2, $wl4 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s32>) = COPY $wl2 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<8 x s32>) = COPY $wl4 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<16 x s32>) = G_CONCAT_VECTORS [[COPY1]](<8 x s32>), [[COPY]](<8 x s32>) + ; CHECK-NEXT: $x0 = COPY [[CONCAT_VECTORS]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x0 + %1:_(<8 x s32>) = COPY $wl2 + %2:_(<8 x s32>) = COPY $wl4 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<8 x s32>), %2:_, shufflemask(8, 9, -1, 11, 12, 13, -1, 15, 0, 1, -1, 3, 4, 5, -1, 7) + $x0 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x0 +... + +--- +name: shuffle_vector_32_4x4 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: shuffle_vector_32_4x4 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s32>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s32>) = COPY $x1 + ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<16 x s32>) = G_SHUFFLE_VECTOR [[COPY]](<16 x s32>), [[COPY1]], shufflemask(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15) + ; CHECK-NEXT: $x2 = COPY [[SHUF]](<16 x s32>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x2 + %1:_(<16 x s32>) = COPY $x0 + %2:_(<16 x s32>) = COPY $x1 + %0:_(<16 x s32>) = G_SHUFFLE_VECTOR %1:_(<16 x s32>), %2:_, shufflemask(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15) + $x2 = COPY %0:_(<16 x s32>) + PseudoRET implicit $lr, implicit $x2 +... + +--- +name: shuffle_vector_16_4x4 +legalized: false +body: | + bb.1.entry: + liveins: $x0, $x1 + ; CHECK-LABEL: name: shuffle_vector_16_4x4 + ; CHECK: liveins: $x0, $x1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s16>) = COPY $x0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<32 x s16>) = COPY $x1 + ; CHECK-NEXT: [[SHUF:%[0-9]+]]:_(<32 x s16>) = G_SHUFFLE_VECTOR [[COPY]](<32 x s16>), [[COPY1]], shufflemask(0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31) + ; CHECK-NEXT: $x2 = COPY [[SHUF]](<32 x s16>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x2 + %1:_(<32 x s16>) = COPY $x0 + %2:_(<32 x s16>) = COPY $x1 + %0:_(<32 x s16>) = G_SHUFFLE_VECTOR %1:_(<32 x s16>), %2:_, shufflemask(0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27, 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31) + $x2 = COPY %0:_(<32 x s16>) + PseudoRET implicit $lr, implicit $x2 +... 
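shuffle_vector_32_4x4 and shuffle_vector_16_4x4 use 4-way interleave (transpose-style) masks, and their checks keep the generic G_SHUFFLE_VECTOR, while the byte- and paired-halfword cases that follow are expected to select G_AIE_VSHUFFLE with an immediate mode (35 for the s8 4x4 pattern, 29 for the paired s16 pattern); the combine appears to fire only where the target has a matching vshuffle mode. The mask family itself is regular enough to generate; a small sketch with illustrative naming:

    #include "llvm/ADT/SmallVector.h"

    // Illustrative only: the K-way interleave mask these tests use, e.g.
    // NumElts = 16, K = 4 gives 0, 4, 8, 12, 1, 5, 9, 13, ...
    static void buildInterleaveMask(int NumElts, int K,
                                    llvm::SmallVectorImpl<int> &Mask) {
      int Stride = NumElts / K;
      for (int I = 0; I < Stride; ++I)
        for (int J = 0; J < K; ++J)
          Mask.push_back(J * Stride + I);
    }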
+
+---
+name: shuffle_vector_8_512
+legalized: false
+body: |
+  bb.1.entry:
+    liveins: $wl0, $wl1
+    ; CHECK-LABEL: name: shuffle_vector_8_512
+    ; CHECK: liveins: $wl0, $wl1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s8>) = COPY $wl0
+    ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(<32 x s8>) = G_IMPLICIT_DEF
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 35
+    ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<64 x s8>) = G_CONCAT_VECTORS [[COPY]](<32 x s8>), [[DEF]](<32 x s8>)
+    ; CHECK-NEXT: [[AIE_VSHUFFLE:%[0-9]+]]:_(<64 x s8>) = G_AIE_VSHUFFLE [[CONCAT_VECTORS]], [[CONCAT_VECTORS]], [[C]](s32)
+    ; CHECK-NEXT: $x2 = COPY [[AIE_VSHUFFLE]](<64 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x2
+    %1:_(<32 x s8>) = COPY $wl0
+    %2:_(<32 x s8>) = G_IMPLICIT_DEF
+    %0:_(<64 x s8>) = G_SHUFFLE_VECTOR %1:_(<32 x s8>), %2:_, shufflemask(0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63)
+    $x2 = COPY %0:_(<64 x s8>)
+    PseudoRET implicit $lr, implicit $x2
+...
+
+---
+name: shuffle_vector_8_1024
+legalized: false
+body: |
+  bb.1.entry:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: shuffle_vector_8_1024
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<64 x s8>) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<64 x s8>) = COPY $x1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 35
+    ; CHECK-NEXT: [[AIE_VSHUFFLE:%[0-9]+]]:_(<64 x s8>) = G_AIE_VSHUFFLE [[COPY]], [[COPY1]], [[C]](s32)
+    ; CHECK-NEXT: $x2 = COPY [[AIE_VSHUFFLE]](<64 x s8>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x2
+    %1:_(<64 x s8>) = COPY $x0
+    %2:_(<64 x s8>) = COPY $x1
+    %0:_(<64 x s8>) = G_SHUFFLE_VECTOR %1:_(<64 x s8>), %2:_, shufflemask(0, 16, 32, 48, 1, 17, 33, 49, 2, 18, 34, 50, 3, 19, 35, 51, 4, 20, 36, 52, 5, 21, 37, 53, 6, 22, 38, 54, 7, 23, 39, 55, 8, 24, 40, 56, 9, 25, 41, 57, 10, 26, 42, 58, 11, 27, 43, 59, 12, 28, 44, 60, 13, 29, 45, 61, 14, 30, 46, 62, 15, 31, 47, 63)
+    $x2 = COPY %0:_(<64 x s8>)
+    PseudoRET implicit $lr, implicit $x2
+...
+
+---
+name: shuffle_vector_1024_4x8
+legalized: false
+body: |
+  bb.1.entry:
+    liveins: $x0, $x1
+    ; CHECK-LABEL: name: shuffle_vector_1024_4x8
+    ; CHECK: liveins: $x0, $x1
+    ; CHECK-NEXT: {{ $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<32 x s16>) = COPY $x0
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<32 x s16>) = COPY $x1
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 29
+    ; CHECK-NEXT: [[AIE_VSHUFFLE:%[0-9]+]]:_(<32 x s16>) = G_AIE_VSHUFFLE [[COPY]], [[COPY1]], [[C]](s32)
+    ; CHECK-NEXT: $x2 = COPY [[AIE_VSHUFFLE]](<32 x s16>)
+    ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x2
+    %1:_(<32 x s16>) = COPY $x0
+    %2:_(<32 x s16>) = COPY $x1
+    %0:_(<32 x s16>) = G_SHUFFLE_VECTOR %1:_(<32 x s16>), %2:_, shufflemask(0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31)
+    $x2 = COPY %0:_(<32 x s16>)
+    PseudoRET implicit $lr, implicit $x2
+...
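When a source is narrower than the width G_AIE_VSHUFFLE operates on, the expected MIR widens it first: shuffle_vector_8_512 concatenates its <32 x s8> source with a G_IMPLICIT_DEF, and shuffle_vector_512_4x8 below concatenates its two <16 x s16> operands, before emitting the vshuffle. A sketch of the undef-padding step, assuming a MachineIRBuilder; widenWithUndef is an illustrative name:

    #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
    #include "llvm/CodeGen/MachineRegisterInfo.h"
    #include "llvm/CodeGen/Register.h"
    #include "llvm/CodeGenTypes/LowLevelType.h"

    // Illustrative only: widen a half-width vector register to WideTy by
    // concatenating it with an implicit_def of the same half type, as the
    // checks above expect.
    static llvm::Register widenWithUndef(llvm::MachineIRBuilder &B,
                                         llvm::MachineRegisterInfo &MRI,
                                         llvm::Register Src, llvm::LLT WideTy) {
      llvm::LLT HalfTy = MRI.getType(Src);
      llvm::Register Undef = B.buildUndef(HalfTy).getReg(0);
      return B.buildConcatVectors(WideTy, {Src, Undef}).getReg(0);
    }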
+ +--- +name: shuffle_vector_512_4x8 +legalized: false +body: | + bb.1.entry: + liveins: $wl0, $wl1 + ; CHECK-LABEL: name: shuffle_vector_512_4x8 + ; CHECK: liveins: $wl0, $wl1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<16 x s16>) = COPY $wl0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(<16 x s16>) = COPY $wl1 + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 29 + ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:_(<32 x s16>) = G_CONCAT_VECTORS [[COPY]](<16 x s16>), [[COPY1]](<16 x s16>) + ; CHECK-NEXT: [[AIE_VSHUFFLE:%[0-9]+]]:_(<32 x s16>) = G_AIE_VSHUFFLE [[CONCAT_VECTORS]], [[CONCAT_VECTORS]], [[C]](s32) + ; CHECK-NEXT: $x2 = COPY [[AIE_VSHUFFLE]](<32 x s16>) + ; CHECK-NEXT: PseudoRET implicit $lr, implicit $x2 + %1:_(<16 x s16>) = COPY $wl0 + %2:_(<16 x s16>) = COPY $wl1 + %0:_(<32 x s16>) = G_SHUFFLE_VECTOR %1:_(<16 x s16>), %2:_, shufflemask(0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31) + $x2 = COPY %0:_(<32 x s16>) + PseudoRET implicit $lr, implicit $x2 +... diff --git a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll index f33c6811ccd6..0284bbbe9d7f 100644 --- a/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll +++ b/llvm/test/CodeGen/AIE/aie2/intrinsics-shufflevec.ll @@ -15,69 +15,18 @@ define <8 x i32> @test_extract_vector(<16 x i32> noundef %a, i32 noundef %idx) { ; CHECK-NEXT: nopa ; nopx // Delay Slot 5 ; CHECK-NEXT: nop // Delay Slot 4 ; CHECK-NEXT: nop // Delay Slot 3 -; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: mov r8, r16 // Delay Slot 1 +; CHECK-NEXT: vmov x0, x2 // Delay Slot 2 +; CHECK-NEXT: nop // Delay Slot 1 ; CHECK-NEXT: // %bb.1: // %if.end -; CHECK-NEXT: mova r16, #8 -; CHECK-NEXT: vextract.s32 r0, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #9 -; CHECK-NEXT: vextract.s32 r1, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #10 -; CHECK-NEXT: vextract.s32 r2, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #11 -; CHECK-NEXT: vextract.s32 r3, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #12 -; CHECK-NEXT: vextract.s32 r4, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #13 -; CHECK-NEXT: vextract.s32 r5, x2, r16 -; CHECK-NEXT: j #.LBB0_3 +; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; vmov wl0, wh0; nopv +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_2: // %return +; CHECK-NEXT: nopa ; ret lr ; CHECK-NEXT: nop // Delay Slot 5 -; CHECK-NEXT: mova r16, #15 // Delay Slot 4 -; CHECK-NEXT: vextract.s32 r6, x2, r16 // Delay Slot 3 +; CHECK-NEXT: nop // Delay Slot 4 +; CHECK-NEXT: nop // Delay Slot 3 ; CHECK-NEXT: nop // Delay Slot 2 -; CHECK-NEXT: mova r16, #14 // Delay Slot 1 -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_2: // %if.then -; CHECK-NEXT: mova r16, #0; nopxm -; CHECK-NEXT: vextract.s32 r0, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #1 -; CHECK-NEXT: vextract.s32 r1, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #2 -; CHECK-NEXT: vextract.s32 r2, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #3 -; CHECK-NEXT: vextract.s32 r3, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #4 -; CHECK-NEXT: vextract.s32 r4, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #5 -; CHECK-NEXT: vextract.s32 r5, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #7 -; CHECK-NEXT: vextract.s32 r6, x2, r16 -; CHECK-NEXT: nop -; CHECK-NEXT: mova r16, #6 -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_3: // %return -; CHECK-NEXT: nopx ; vextract.s32 r7, x2, r16 -; CHECK-NEXT: vpush.lo.32 x0, r6, x0 
-; CHECK-NEXT: vpush.lo.32 x0, r7, x0
-; CHECK-NEXT: vpush.lo.32 x0, r5, x0
-; CHECK-NEXT: vpush.lo.32 x0, r4, x0
-; CHECK-NEXT: ret lr
-; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 5
-; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 4
-; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 3
-; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
-; CHECK-NEXT: mov r16, r8 // Delay Slot 1
+; CHECK-NEXT: nop // Delay Slot 1
 entry:
   %cmp = icmp eq i32 %idx, 0
   br i1 %cmp, label %if.then, label %if.end
@@ -99,117 +48,27 @@ define <16 x i32> @test_insert_vector(<16 x i32> noundef %a, i32 noundef %idx, <
 ; CHECK-LABEL: test_insert_vector:
 ; CHECK: .p2align 4
 ; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: nopx ; mov r25, r17
-; CHECK-NEXT: mov r26, r18
-; CHECK-NEXT: mov r27, r19
-; CHECK-NEXT: mova r19, #0
-; CHECK-NEXT: mova r18, #1
-; CHECK-NEXT: mov r24, r16
-; CHECK-NEXT: mova r16, #3
-; CHECK-NEXT: vextract.s32 r4, x4, r16
-; CHECK-NEXT: movx r17, #2
-; CHECK-NEXT: mova r16, #4
-; CHECK-NEXT: vextract.s32 r1, x4, r19
-; CHECK-NEXT: vextract.s32 r2, x4, r18
-; CHECK-NEXT: vextract.s32 r3, x4, r17
-; CHECK-NEXT: vextract.s32 r5, x4, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #5
-; CHECK-NEXT: vextract.s32 r6, x4, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #7
-; CHECK-NEXT: vextract.s32 r7, x4, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #6
-; CHECK-NEXT: vextract.s32 r8, x4, r16
-; CHECK-NEXT: vpush.lo.32 x0, r7, x0
-; CHECK-NEXT: vpush.lo.32 x0, r8, x0
-; CHECK-NEXT: vpush.lo.32 x0, r6, x0
-; CHECK-NEXT: jz r0, #.LBB1_2
-; CHECK-NEXT: vpush.lo.32 x0, r5, x0 // Delay Slot 5
-; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
-; CHECK-NEXT: vpush.lo.32 x0, r3, x0 // Delay Slot 3
-; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 2
-; CHECK-NEXT: vpush.lo.32 x0, r1, x0 // Delay Slot 1
+; CHECK-NEXT: nopb ; nopa ; nops ; jz r0, #.LBB1_2; nopv
+; CHECK-NEXT: nopa ; nopx // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: nop // Delay Slot 2
+; CHECK-NEXT: vmov wl0, wl4 // Delay Slot 1
 ; CHECK-NEXT: // %bb.1: // %if.end
-; CHECK-NEXT: nopb ; mova r16, #3; nops ; nopxm ; nopv
-; CHECK-NEXT: vextract.s32 r0, x2, r19
-; CHECK-NEXT: vextract.s32 r1, x0, r19
-; CHECK-NEXT: vextract.s32 r2, x2, r18
-; CHECK-NEXT: vextract.s32 r3, x0, r18
-; CHECK-NEXT: vextract.s32 r4, x2, r17
-; CHECK-NEXT: vextract.s32 r5, x0, r17
-; CHECK-NEXT: vextract.s32 r6, x2, r16
-; CHECK-NEXT: vextract.s32 r7, x0, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #4
-; CHECK-NEXT: vextract.s32 r8, x2, r16
-; CHECK-NEXT: vextract.s32 r9, x0, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #5
-; CHECK-NEXT: vextract.s32 r10, x2, r16
-; CHECK-NEXT: vextract.s32 r11, x0, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #7
-; CHECK-NEXT: vextract.s32 r12, x2, r16
-; CHECK-NEXT: vextract.s32 r13, x0, r16
-; CHECK-NEXT: j #.LBB1_3
-; CHECK-NEXT: nop // Delay Slot 5
-; CHECK-NEXT: mova r16, #6 // Delay Slot 4
-; CHECK-NEXT: vextract.s32 r14, x2, r16 // Delay Slot 3
-; CHECK-NEXT: vextract.s32 r15, x0, r16 // Delay Slot 2
+; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv
+; CHECK-NEXT: nopx // Delay Slot 5
+; CHECK-NEXT: vmov wh2, wl0 // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: vmov x0, x2 // Delay Slot 2
 ; CHECK-NEXT: nop // Delay Slot 1
 ; CHECK-NEXT: .p2align 4
 ; CHECK-NEXT: .LBB1_2: // %if.then
-; CHECK-NEXT: mova r16, #3; nopx
-; CHECK-NEXT: vextract.s32 r0, x0, r19
-; CHECK-NEXT: vextract.s32 r1, x2, r19
-; CHECK-NEXT: vextract.s32 r2, x0, r18
-; CHECK-NEXT: vextract.s32 r3, x2, r18
-; CHECK-NEXT: vextract.s32 r4, x0, r17
-; CHECK-NEXT: vextract.s32 r5, x2, r17
-; CHECK-NEXT: vextract.s32 r6, x0, r16
-; CHECK-NEXT: vextract.s32 r7, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #4
-; CHECK-NEXT: vextract.s32 r8, x0, r16
-; CHECK-NEXT: vextract.s32 r9, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #5
-; CHECK-NEXT: vextract.s32 r10, x0, r16
-; CHECK-NEXT: vextract.s32 r11, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #7
-; CHECK-NEXT: vextract.s32 r12, x0, r16
-; CHECK-NEXT: vextract.s32 r13, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #6
-; CHECK-NEXT: vextract.s32 r14, x0, r16
-; CHECK-NEXT: vextract.s32 r15, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: .LBB1_3: // %cleanup
-; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r19, r27; nopv
-; CHECK-NEXT: mov r18, r26
-; CHECK-NEXT: mov r17, r25
-; CHECK-NEXT: vpush.lo.32 x0, r13, x0
-; CHECK-NEXT: vpush.lo.32 x0, r15, x0
-; CHECK-NEXT: vpush.lo.32 x0, r11, x0
-; CHECK-NEXT: vpush.lo.32 x0, r9, x0
-; CHECK-NEXT: vpush.lo.32 x0, r7, x0
-; CHECK-NEXT: vpush.lo.32 x0, r5, x0
-; CHECK-NEXT: vpush.lo.32 x0, r3, x0
-; CHECK-NEXT: vpush.lo.32 x0, r1, x0
-; CHECK-NEXT: vpush.lo.32 x0, r12, x0
-; CHECK-NEXT: vpush.lo.32 x0, r14, x0
-; CHECK-NEXT: vpush.lo.32 x0, r10, x0
-; CHECK-NEXT: vpush.lo.32 x0, r8, x0
 ; CHECK-NEXT: ret lr
-; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5
-; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
-; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3
-; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
-; CHECK-NEXT: mov r16, r24 // Delay Slot 1
+; CHECK-NEXT: nop // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: nop // Delay Slot 3
+; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
 entry:
   %shuffle = shufflevector <8 x i32> %b, <8 x i32> undef, <16 x i32>
   %cmp = icmp eq i32 %idx, 0
@@ -232,56 +91,12 @@ define <16 x i32> @test_concat_vector(<8 x i32> noundef %a, <8 x i32> noundef %b
 ; CHECK-LABEL: test_concat_vector:
 ; CHECK: .p2align 4
 ; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: nopa ; nopx ; mov r24, r16
-; CHECK-NEXT: mova r16, #0
-; CHECK-NEXT: vextract.s32 r0, x2, r16
-; CHECK-NEXT: vextract.s32 r1, x4, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #1
-; CHECK-NEXT: vextract.s32 r2, x2, r16
-; CHECK-NEXT: vextract.s32 r3, x4, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #2
-; CHECK-NEXT: vextract.s32 r4, x2, r16
-; CHECK-NEXT: vextract.s32 r5, x4, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #3
-; CHECK-NEXT: vextract.s32 r6, x2, r16
-; CHECK-NEXT: vextract.s32 r7, x4, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #4
-; CHECK-NEXT: vextract.s32 r8, x2, r16
-; CHECK-NEXT: vextract.s32 r9, x4, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #5
-; CHECK-NEXT: vextract.s32 r10, x2, r16
-; CHECK-NEXT: vextract.s32 r11, x4, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #7
-; CHECK-NEXT: vextract.s32 r12, x2, r16
-; CHECK-NEXT: vextract.s32 r13, x4, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #6
-; CHECK-NEXT: vextract.s32 r14, x2, r16
-; CHECK-NEXT: vextract.s32 r15, x4, r16
-; CHECK-NEXT: vpush.lo.32 x0, r13, x0
-; CHECK-NEXT: vpush.lo.32 x0, r15, x0
-; CHECK-NEXT: vpush.lo.32 x0, r11, x0
-; CHECK-NEXT: vpush.lo.32 x0, r9, x0
-; CHECK-NEXT: vpush.lo.32 x0, r7, x0
-; CHECK-NEXT: vpush.lo.32 x0, r5, x0
-; CHECK-NEXT: vpush.lo.32 x0, r3, x0
-; CHECK-NEXT: vpush.lo.32 x0, r1, x0
-; CHECK-NEXT: vpush.lo.32 x0, r12, x0
-; CHECK-NEXT: vpush.lo.32 x0, r14, x0
-; CHECK-NEXT: vpush.lo.32 x0, r10, x0
-; CHECK-NEXT: vpush.lo.32 x0, r8, x0
-; CHECK-NEXT: ret lr
-; CHECK-NEXT: vpush.lo.32 x0, r6, x0 // Delay Slot 5
-; CHECK-NEXT: vpush.lo.32 x0, r4, x0 // Delay Slot 4
-; CHECK-NEXT: vpush.lo.32 x0, r2, x0 // Delay Slot 3
-; CHECK-NEXT: vpush.lo.32 x0, r0, x0 // Delay Slot 2
-; CHECK-NEXT: mov r16, r24 // Delay Slot 1
+; CHECK-NEXT: nopb ; nopa ; nops ; ret lr ; nopm ; nopv
+; CHECK-NEXT: nopx // Delay Slot 5
+; CHECK-NEXT: nop // Delay Slot 4
+; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 3
+; CHECK-NEXT: vmov wh0, wl4 // Delay Slot 2
+; CHECK-NEXT: nop // Delay Slot 1
 entry:
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32>
   ret <16 x i32> %shuffle
@@ -291,50 +106,14 @@ define <16 x i32> @test_set_vector(i32 noundef %idx, <8 x i32> noundef %a) {
 ; CHECK-LABEL: test_set_vector:
 ; CHECK: .p2align 4
 ; CHECK-NEXT: // %bb.0: // %entry
-; CHECK-NEXT: nopb ; nopa ; nops ; nopx ; mov r9, r16; nopv
-; CHECK-NEXT: mova r16, #0
-; CHECK-NEXT: vextract.s32 r1, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #1
-; CHECK-NEXT: vextract.s32 r2, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #2
-; CHECK-NEXT: vextract.s32 r3, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #3
-; CHECK-NEXT: vextract.s32 r4, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #4
-; CHECK-NEXT: vextract.s32 r5, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #5
-; CHECK-NEXT: vextract.s32 r6, x2, r16
-; CHECK-NEXT: nop
-; CHECK-NEXT: mova r16, #7
-; CHECK-NEXT: vextract.s32 r7, x2, r16
+; CHECK-NEXT: mov r1, r16
 ; CHECK-NEXT: eqz r0, r0
-; CHECK-NEXT: mova r16, #6
-; CHECK-NEXT: vextract.s32 r8, x2, r16
-; CHECK-NEXT: add r16, r0, #-1
-; CHECK-NEXT: vpush.lo.32 x0, r7, x0
-; CHECK-NEXT: vpush.lo.32 x0, r8, x0
-; CHECK-NEXT: vpush.lo.32 x0, r6, x0
-; CHECK-NEXT: vpush.lo.32 x0, r5, x0
-; CHECK-NEXT: vpush.lo.32 x0, r4, x0
-; CHECK-NEXT: vpush.lo.32 x0, r3, x0
-; CHECK-NEXT: vpush.lo.32 x0, r2, x0
-; CHECK-NEXT: vpush.lo.32 x0, r1, x0
-; CHECK-NEXT: vpush.lo.32 x2, r0, x0
-; CHECK-NEXT: vpush.lo.32 x2, r0, x2
-; CHECK-NEXT: vpush.lo.32 x2, r0, x2
-; CHECK-NEXT: vpush.lo.32 x2, r0, x2
-; CHECK-NEXT: vpush.lo.32 x2, r0, x2
 ; CHECK-NEXT: ret lr
-; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 5
-; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 4
-; CHECK-NEXT: vpush.lo.32 x2, r0, x2 // Delay Slot 3
-; CHECK-NEXT: vsel.32 x0, x0, x2, r16 // Delay Slot 2
-; CHECK-NEXT: mov r16, r9 // Delay Slot 1
+; CHECK-NEXT: vmov wh0, wl2 // Delay Slot 5
+; CHECK-NEXT: vmov wl0, wl2 // Delay Slot 4
+; CHECK-NEXT: add r16, r0, #-1 // Delay Slot 3
+; CHECK-NEXT: vsel.32 x0, x0, x0, r16 // Delay Slot 2
+; CHECK-NEXT: mov r16, r1 // Delay Slot 1
 entry:
   %cmp = icmp eq i32 %idx, 0
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <16 x i32>