Skip to content

Commit 024353a

Browse files
committed
[RISCV] Attempt to widen SEW before generic shuffle lowering
This takes inspiration from AArch64 which does the same thing to assist with zip/trn/etc. Doing this recursion unconditionally when the mask allows is slightly questionable, but seems to work out okay in practice. If reviewers would rather see a more narrow heuristic for widening, let me know - and please suggest a heuristic which makes sense to you. My actual motivation for this involves matching the proposed zip/unzip/zipeven/zipodd instructions being discussed on sig-vector, but this also applies to other shuffle masks as well. As a bit of context, it's helpful to realize that we have existing logic in both DAGCombine and InstCombine which mutates the element width in an analogous manner. However, that code has two restrictions which prevent it from handling the motivating cases here. First, it only triggers if there is a bitcast involving a different element type. Second, the matcher used considers a partially undef wide element to be a non-match. I considered trying to relax those assumptions, but the information loss for undef in mid-level opt seemed more likely to open a can of worms than I wanted.
1 parent cdbba15 commit 024353a

File tree

8 files changed

+390
-476
lines changed

8 files changed

+390
-476
lines changed

llvm/include/llvm/Analysis/VectorUtils.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,12 @@ void narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
235235
bool widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
236236
SmallVectorImpl<int> &ScaledMask);
237237

238+
/// A variant of the previous method which is specialized for Scale=2, and
239+
/// treats -1 as undef and allows widening when a wider element is partially
240+
/// undef in the narrow form of the mask. This transformation discards
241+
/// information about which bytes in the original shuffle were undef.
242+
bool widenShuffleMaskElts(ArrayRef<int> M, SmallVectorImpl<int> &NewMask);
243+
238244
/// Attempt to narrow/widen the \p Mask shuffle mask to the \p NumDstElts target
239245
/// width. Internally this will call narrowShuffleMaskElts/widenShuffleMaskElts.
240246
/// This will assert unless NumDstElts is a multiple of Mask.size (or vice-versa).

llvm/lib/Analysis/VectorUtils.cpp

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,42 @@ bool llvm::widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
479479
return true;
480480
}
481481

482+
/// Try to widen a shuffle mask by a factor of 2, treating -1 as undef.
/// Succeeds iff every adjacent pair of narrow elements either (a) is fully
/// undef, (b) selects two consecutive source elements starting at an even
/// index, or (c) is a partially-undef pair consistent with (b). On success
/// \p NewMask holds the widened mask; on failure it is left empty. Note that
/// widening a partially-undef pair discards the per-lane undef information.
bool llvm::widenShuffleMaskElts(ArrayRef<int> M,
                                SmallVectorImpl<int> &NewMask) {
  unsigned NumElts = M.size();
  // An odd-length mask cannot be regrouped into pairs.
  if (NumElts % 2 != 0)
    return false;

  NewMask.clear();
  for (unsigned i = 0; i < NumElts; i += 2) {
    int M0 = M[i];
    int M1 = M[i + 1];

    // If both elements are undef, new mask is undef too.
    if (M0 == -1 && M1 == -1) {
      NewMask.push_back(-1);
      continue;
    }

    // Low half undef: the high half must be the odd element of a wide lane.
    if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
      NewMask.push_back(M1 / 2);
      continue;
    }

    // Low half is the even element of a wide lane; high half must be the
    // following element or undef.
    if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
      NewMask.push_back(M0 / 2);
      continue;
    }

    // This pair cannot be expressed with a wider element; give up and leave
    // NewMask empty so callers don't observe a partial result.
    NewMask.clear();
    return false;
  }

  assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
  return true;
}
516+
517+
482518
bool llvm::scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef<int> Mask,
483519
SmallVectorImpl<int> &ScaledMask) {
484520
unsigned NumSrcElts = Mask.size();

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 1 addition & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -13721,44 +13721,6 @@ static SDValue constructDup(SDValue V, int Lane, SDLoc dl, EVT VT,
1372113721
return DAG.getNode(Opcode, dl, VT, V, DAG.getConstant(Lane, dl, MVT::i64));
1372213722
}
1372313723

13724-
// Return true if we can get a new shuffle mask by checking the parameter mask
13725-
// array to test whether every two adjacent mask values are continuous and
13726-
// starting from an even number.
13727-
static bool isWideTypeMask(ArrayRef<int> M, EVT VT,
13728-
SmallVectorImpl<int> &NewMask) {
13729-
unsigned NumElts = VT.getVectorNumElements();
13730-
if (NumElts % 2 != 0)
13731-
return false;
13732-
13733-
NewMask.clear();
13734-
for (unsigned i = 0; i < NumElts; i += 2) {
13735-
int M0 = M[i];
13736-
int M1 = M[i + 1];
13737-
13738-
// If both elements are undef, new mask is undef too.
13739-
if (M0 == -1 && M1 == -1) {
13740-
NewMask.push_back(-1);
13741-
continue;
13742-
}
13743-
13744-
if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
13745-
NewMask.push_back(M1 / 2);
13746-
continue;
13747-
}
13748-
13749-
if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
13750-
NewMask.push_back(M0 / 2);
13751-
continue;
13752-
}
13753-
13754-
NewMask.clear();
13755-
return false;
13756-
}
13757-
13758-
assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
13759-
return true;
13760-
}
13761-
1376213724
// Try to widen element type to get a new mask value for a better permutation
1376313725
// sequence, so that we can use NEON shuffle instructions, such as zip1/2,
1376413726
// UZP1/2, TRN1/2, REV, INS, etc.
@@ -13785,7 +13747,7 @@ static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
1378513747
return SDValue();
1378613748

1378713749
SmallVector<int, 8> NewMask;
13788-
if (isWideTypeMask(Mask, VT, NewMask)) {
13750+
if (widenShuffleMaskElts(Mask, NewMask)) {
1378913751
MVT NewEltVT = VT.isFloatingPoint()
1379013752
? MVT::getFloatingPointVT(ElementSize * 2)
1379113753
: MVT::getIntegerVT(ElementSize * 2);

llvm/lib/Target/RISCV/RISCVISelLowering.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5261,6 +5261,42 @@ static SDValue lowerDisjointIndicesShuffle(ShuffleVectorSDNode *SVN,
52615261
return DAG.getVectorShuffle(VT, DL, Select, DAG.getUNDEF(VT), NewMask);
52625262
}
52635263

5264+
/// Try to widen element type to get a new mask value for a better permutation
5265+
/// sequence. This doesn't try to inspect the widened mask for profitability;
5266+
/// we speculate the widened form is equal or better. This has the effect of
5267+
/// reducing mask constant sizes - allowing cheaper materialization sequences
5268+
/// - and index sequence sizes - reducing register pressure and materialization
5269+
/// cost, at the cost of (possibly) an extra VTYPE toggle.
5270+
/// Try to widen element type to get a new mask value for a better permutation
/// sequence. This doesn't try to inspect the widened mask for profitability;
/// we speculate the widened form is equal or better. This has the effect of
/// reducing mask constant sizes - allowing cheaper materialization sequences
/// - and index sequence sizes - reducing register pressure and materialization
/// cost, at the cost of (possibly) an extra VTYPE toggle.
static SDValue tryWidenMaskForShuffle(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  EVT VT = Op.getValueType();
  unsigned ElementSize = VT.getVectorElementType().getFixedSizeInBits();

  // Avoid wasted work leading to isTypeLegal check failing below
  if (ElementSize > 32)
    return SDValue();

  // The mask must be expressible in terms of elements twice as wide.
  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
  SmallVector<int, 8> NewMask;
  if (!widenShuffleMaskElts(Mask, NewMask))
    return SDValue();

  MVT NewEltVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(ElementSize * 2)
                                      : MVT::getIntegerVT(ElementSize * 2);
  MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
  if (!DAG.getTargetLoweringInfo().isTypeLegal(NewVT))
    return SDValue();

  // Bitcast the sources to the wider element type and shuffle there, then
  // cast the result back to the original type.
  SDValue V0 = DAG.getBitcast(NewVT, Op.getOperand(0));
  SDValue V1 = DAG.getBitcast(NewVT, Op.getOperand(1));
  return DAG.getBitcast(VT, DAG.getVectorShuffle(NewVT, DL, V0, V1, NewMask));
}
5299+
52645300
static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
52655301
const RISCVSubtarget &Subtarget) {
52665302
SDValue V1 = Op.getOperand(0);
@@ -5506,6 +5542,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
55065542
if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
55075543
return V;
55085544

5545+
// Before hitting generic lowering fallbacks, try to widen the mask
5546+
// to a wider SEW.
5547+
if (SDValue V = tryWidenMaskForShuffle(Op, DAG))
5548+
return V;
5549+
55095550
// Can we generate a vcompress instead of a vrgather? These scale better
55105551
// at high LMUL, at the cost of not being able to fold a following select
55115552
// into them. The mask constants are also smaller than the index vector
@@ -5615,6 +5656,12 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
56155656
if (SDValue V = lowerDisjointIndicesShuffle(SVN, DAG, Subtarget))
56165657
return V;
56175658

5659+
5660+
// Before hitting generic lowering fallbacks, try to widen the mask
5661+
// to a wider SEW.
5662+
if (SDValue V = tryWidenMaskForShuffle(Op, DAG))
5663+
return V;
5664+
56185665
// Try to pick a profitable operand order.
56195666
bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
56205667
SwapOps = SwapOps ^ ShuffleVectorInst::isIdentityMask(ShuffleMaskRHS, NumElts);

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll

Lines changed: 17 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -603,10 +603,8 @@ define <8 x i8> @concat_4xi8_start_undef(<8 x i8> %v, <8 x i8> %w) {
603603
define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) {
604604
; CHECK-LABEL: concat_4xi8_start_undef_at_start:
605605
; CHECK: # %bb.0:
606-
; CHECK-NEXT: li a0, -32
607-
; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu
608-
; CHECK-NEXT: vmv.s.x v0, a0
609-
; CHECK-NEXT: vslideup.vi v8, v9, 4, v0.t
606+
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
607+
; CHECK-NEXT: vslideup.vi v8, v9, 2
610608
; CHECK-NEXT: ret
611609
%res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 9, i32 10, i32 11>
612610
ret <8 x i8> %res
@@ -704,8 +702,8 @@ define <8 x i32> @shuffle_v8i32_2(<8 x i32> %x, <8 x i32> %y) {
704702
; CHECK-LABEL: shuffle_v8i32_2:
705703
; CHECK: # %bb.0:
706704
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
707-
; CHECK-NEXT: vmv.v.i v0, -13
708-
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
705+
; CHECK-NEXT: vmv.v.i v0, 13
706+
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
709707
; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
710708
; CHECK-NEXT: ret
711709
%s = shufflevector <8 x i32> %x, <8 x i32> %y, <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
@@ -756,9 +754,9 @@ define <8 x i16> @shuffle_compress_singlesrc_e16(<8 x i16> %v) {
756754
define <8 x i32> @shuffle_compress_singlesrc_e32(<8 x i32> %v) {
757755
; CHECK-LABEL: shuffle_compress_singlesrc_e32:
758756
; CHECK: # %bb.0:
759-
; CHECK-NEXT: li a0, 115
760-
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
761-
; CHECK-NEXT: vmv.s.x v12, a0
757+
; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma
758+
; CHECK-NEXT: vmv.v.i v12, 13
759+
; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma
762760
; CHECK-NEXT: vcompress.vm v10, v8, v12
763761
; CHECK-NEXT: vmv.v.v v8, v10
764762
; CHECK-NEXT: ret
@@ -832,26 +830,16 @@ define <8 x i32> @shuffle_spread2_singlesrc_e32_index2(<8 x i32> %v) {
832830
}
833831

834832
define <8 x i32> @shuffle_spread3_singlesrc_e32(<8 x i32> %v) {
835-
; RV32-LABEL: shuffle_spread3_singlesrc_e32:
836-
; RV32: # %bb.0:
837-
; RV32-NEXT: lui a0, %hi(.LCPI57_0)
838-
; RV32-NEXT: addi a0, a0, %lo(.LCPI57_0)
839-
; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma
840-
; RV32-NEXT: vle16.v v12, (a0)
841-
; RV32-NEXT: vrgatherei16.vv v10, v8, v12
842-
; RV32-NEXT: vmv.v.v v8, v10
843-
; RV32-NEXT: ret
844-
;
845-
; RV64-LABEL: shuffle_spread3_singlesrc_e32:
846-
; RV64: # %bb.0:
847-
; RV64-NEXT: lui a0, 32769
848-
; RV64-NEXT: slli a0, a0, 21
849-
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
850-
; RV64-NEXT: vmv.v.x v12, a0
851-
; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma
852-
; RV64-NEXT: vrgatherei16.vv v10, v8, v12
853-
; RV64-NEXT: vmv.v.v v8, v10
854-
; RV64-NEXT: ret
833+
; CHECK-LABEL: shuffle_spread3_singlesrc_e32:
834+
; CHECK: # %bb.0:
835+
; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma
836+
; CHECK-NEXT: vmv.v.i v10, 0
837+
; CHECK-NEXT: li a0, 1
838+
; CHECK-NEXT: vslide1down.vx v12, v10, a0
839+
; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma
840+
; CHECK-NEXT: vrgatherei16.vv v10, v8, v12
841+
; CHECK-NEXT: vmv.v.v v8, v10
842+
; CHECK-NEXT: ret
855843
%out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 2, i32 undef>
856844
ret <8 x i32> %out
857845
}

0 commit comments

Comments
 (0)