Skip to content

Commit b280cf5

Browse files
authored
[LoongArch] Split 256-bit build_vector to avoid using LASX element insertion (#154918)
Note: Splitting only produces worse code for `v8i32/v8f32/v4i64/v4f64` types when the high half has exactly one non-undef element, so splitting is skipped in that case.
1 parent 44e71c9 commit b280cf5

File tree

7 files changed

+471
-705
lines changed

7 files changed

+471
-705
lines changed

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

Lines changed: 76 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1978,9 +1978,8 @@ lowerVECTOR_SHUFFLE_XVREPLVEI(const SDLoc &DL, ArrayRef<int> Mask, MVT VT,
19781978
if (fitsRegularPattern<int>(Begin, 1, End - HalfSize, SplatIndex, 0) &&
19791979
fitsRegularPattern<int>(Begin + HalfSize, 1, End, SplatIndex + HalfSize,
19801980
0)) {
1981-
APInt Imm(64, SplatIndex);
19821981
return DAG.getNode(LoongArchISD::VREPLVEI, DL, VT, V1,
1983-
DAG.getConstant(Imm, DL, Subtarget.getGRLenVT()));
1982+
DAG.getConstant(SplatIndex, DL, Subtarget.getGRLenVT()));
19841983
}
19851984

19861985
return SDValue();
@@ -2647,6 +2646,43 @@ static SDValue lowerBUILD_VECTORAsBroadCastLoad(BuildVectorSDNode *BVOp,
26472646
return SDValue();
26482647
}
26492648

2649+
// Sequentially insert elements from Ops into Vector, from low to high indices.
2650+
// Note: Ops can have fewer elements than Vector.
2651+
static void fillVector(ArrayRef<SDValue> Ops, SelectionDAG &DAG, SDLoc DL,
2652+
const LoongArchSubtarget &Subtarget, SDValue &Vector,
2653+
EVT ResTy) {
2654+
assert(Ops.size() <= ResTy.getVectorNumElements());
2655+
2656+
SDValue Op0 = Ops[0];
2657+
if (!Op0.isUndef())
2658+
Vector = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Op0);
2659+
for (unsigned i = 1; i < Ops.size(); ++i) {
2660+
SDValue Opi = Ops[i];
2661+
if (Opi.isUndef())
2662+
continue;
2663+
Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, Opi,
2664+
DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
2665+
}
2666+
}
2667+
2668+
// Build a ResTy subvector from Node, taking NumElts elements starting at index
2669+
// 'first'.
2670+
static SDValue fillSubVectorFromBuildVector(BuildVectorSDNode *Node,
2671+
SelectionDAG &DAG, SDLoc DL,
2672+
const LoongArchSubtarget &Subtarget,
2673+
EVT ResTy, unsigned first) {
2674+
unsigned NumElts = ResTy.getVectorNumElements();
2675+
2676+
assert(first >= 0 &&
2677+
first + NumElts <= Node->getSimpleValueType(0).getVectorNumElements());
2678+
2679+
SmallVector<SDValue, 16> Ops(Node->op_begin() + first,
2680+
Node->op_begin() + first + NumElts);
2681+
SDValue Vector = DAG.getUNDEF(ResTy);
2682+
fillVector(Ops, DAG, DL, Subtarget, Vector, ResTy);
2683+
return Vector;
2684+
}
2685+
26502686
SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
26512687
SelectionDAG &DAG) const {
26522688
BuildVectorSDNode *Node = cast<BuildVectorSDNode>(Op);
@@ -2756,29 +2792,18 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
27562792
BitVector UndefElements;
27572793
if (Node->getRepeatedSequence(Sequence, &UndefElements) &&
27582794
UndefElements.count() == 0) {
2759-
SDValue Vector = DAG.getUNDEF(ResTy);
2760-
SDValue FillVec = Vector;
2761-
EVT FillTy = ResTy;
2762-
27632795
// Using LSX instructions to fill the sub-sequence of 256-bits vector,
27642796
// because the high part can be simply treated as undef.
2765-
if (Is256Vec) {
2766-
FillTy = ResTy.getHalfNumVectorElementsVT(*DAG.getContext());
2767-
FillVec = DAG.getExtractSubvector(DL, FillTy, Vector, 0);
2768-
}
2797+
SDValue Vector = DAG.getUNDEF(ResTy);
2798+
EVT FillTy = Is256Vec
2799+
? ResTy.getHalfNumVectorElementsVT(*DAG.getContext())
2800+
: ResTy;
2801+
SDValue FillVec =
2802+
Is256Vec ? DAG.getExtractSubvector(DL, FillTy, Vector, 0) : Vector;
27692803

2770-
SDValue Op0 = Sequence[0];
2771-
unsigned SeqLen = Sequence.size();
2772-
if (!Op0.isUndef())
2773-
FillVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, FillTy, Op0);
2774-
for (unsigned i = 1; i < SeqLen; ++i) {
2775-
SDValue Opi = Sequence[i];
2776-
if (Opi.isUndef())
2777-
continue;
2778-
FillVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, FillTy, FillVec, Opi,
2779-
DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
2780-
}
2804+
fillVector(Sequence, DAG, DL, Subtarget, FillVec, FillTy);
27812805

2806+
unsigned SeqLen = Sequence.size();
27822807
unsigned SplatLen = NumElts / SeqLen;
27832808
MVT SplatEltTy = MVT::getIntegerVT(VT.getScalarSizeInBits() * SeqLen);
27842809
MVT SplatTy = MVT::getVectorVT(SplatEltTy, SplatLen);
@@ -2805,24 +2830,38 @@ SDValue LoongArchTargetLowering::lowerBUILD_VECTOR(SDValue Op,
28052830
return DAG.getBitcast(ResTy, SplatVec);
28062831
}
28072832

2808-
// Use INSERT_VECTOR_ELT operations rather than expand to stores.
2809-
// The resulting code is the same length as the expansion, but it doesn't
2810-
// use memory operations.
2811-
assert(ResTy.isVector());
2833+
// Use INSERT_VECTOR_ELT operations rather than expand to stores, because
2834+
// using memory operations is much slower.
2835+
//
2836+
// For 256-bit vectors, normally split into two halves and concatenate.
2837+
// Special case: for v8i32/v8f32/v4i64/v4f64, if the upper half has only
2838+
// one non-undef element, skip splitting to avoid a worse result.
2839+
if (ResTy == MVT::v8i32 || ResTy == MVT::v8f32 || ResTy == MVT::v4i64 ||
2840+
ResTy == MVT::v4f64) {
2841+
unsigned NonUndefCount = 0;
2842+
for (unsigned i = NumElts / 2; i < NumElts; ++i) {
2843+
if (!Node->getOperand(i).isUndef()) {
2844+
++NonUndefCount;
2845+
if (NonUndefCount > 1)
2846+
break;
2847+
}
2848+
}
2849+
if (NonUndefCount == 1)
2850+
return fillSubVectorFromBuildVector(Node, DAG, DL, Subtarget, ResTy, 0);
2851+
}
2852+
2853+
EVT VecTy =
2854+
Is256Vec ? ResTy.getHalfNumVectorElementsVT(*DAG.getContext()) : ResTy;
2855+
SDValue Vector =
2856+
fillSubVectorFromBuildVector(Node, DAG, DL, Subtarget, VecTy, 0);
28122857

2813-
SDValue Op0 = Node->getOperand(0);
2814-
SDValue Vector = DAG.getUNDEF(ResTy);
2858+
if (Is128Vec)
2859+
return Vector;
28152860

2816-
if (!Op0.isUndef())
2817-
Vector = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ResTy, Op0);
2818-
for (unsigned i = 1; i < NumElts; ++i) {
2819-
SDValue Opi = Node->getOperand(i);
2820-
if (Opi.isUndef())
2821-
continue;
2822-
Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ResTy, Vector, Opi,
2823-
DAG.getConstant(i, DL, Subtarget.getGRLenVT()));
2824-
}
2825-
return Vector;
2861+
SDValue VectorHi = fillSubVectorFromBuildVector(Node, DAG, DL, Subtarget,
2862+
VecTy, NumElts / 2);
2863+
2864+
return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResTy, Vector, VectorHi);
28262865
}
28272866

28282867
return SDValue();

llvm/test/CodeGen/LoongArch/lasx/bitreverse.ll

Lines changed: 38 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -7,18 +7,19 @@ declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
77
define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
88
; CHECK-LABEL: test_bitreverse_v32i8:
99
; CHECK: # %bb.0:
10-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
11-
; CHECK-NEXT: bitrev.8b $a0, $a0
12-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
13-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
14-
; CHECK-NEXT: bitrev.8b $a0, $a0
15-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
1610
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
1711
; CHECK-NEXT: bitrev.8b $a0, $a0
18-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
12+
; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0
1913
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
2014
; CHECK-NEXT: bitrev.8b $a0, $a0
21-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
15+
; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1
16+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
17+
; CHECK-NEXT: bitrev.8b $a0, $a0
18+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
19+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
20+
; CHECK-NEXT: bitrev.8b $a0, $a0
21+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
22+
; CHECK-NEXT: xvpermi.q $xr1, $xr2, 2
2223
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
2324
; CHECK-NEXT: ret
2425
%b = call <32 x i8> @llvm.bitreverse.v32i8(<32 x i8> %a)
@@ -30,19 +31,20 @@ declare <16 x i16> @llvm.bitreverse.v16i16(<16 x i16>)
3031
define <16 x i16> @test_bitreverse_v16i16(<16 x i16> %a) nounwind {
3132
; CHECK-LABEL: test_bitreverse_v16i16:
3233
; CHECK: # %bb.0:
33-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
34-
; CHECK-NEXT: bitrev.d $a0, $a0
35-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
36-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
37-
; CHECK-NEXT: bitrev.d $a0, $a0
38-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
3934
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
4035
; CHECK-NEXT: bitrev.d $a0, $a0
41-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
36+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
4237
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
4338
; CHECK-NEXT: bitrev.d $a0, $a0
44-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
45-
; CHECK-NEXT: xvshuf4i.h $xr0, $xr1, 27
39+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
40+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
41+
; CHECK-NEXT: bitrev.d $a0, $a0
42+
; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0
43+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
44+
; CHECK-NEXT: bitrev.d $a0, $a0
45+
; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1
46+
; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2
47+
; CHECK-NEXT: xvshuf4i.h $xr0, $xr2, 27
4648
; CHECK-NEXT: ret
4749
%b = call <16 x i16> @llvm.bitreverse.v16i16(<16 x i16> %a)
4850
ret <16 x i16> %b
@@ -53,19 +55,20 @@ declare <8 x i32> @llvm.bitreverse.v8i32(<8 x i32>)
5355
define <8 x i32> @test_bitreverse_v8i32(<8 x i32> %a) nounwind {
5456
; CHECK-LABEL: test_bitreverse_v8i32:
5557
; CHECK: # %bb.0:
56-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
57-
; CHECK-NEXT: bitrev.d $a0, $a0
58-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
59-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
60-
; CHECK-NEXT: bitrev.d $a0, $a0
61-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
6258
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
6359
; CHECK-NEXT: bitrev.d $a0, $a0
64-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
60+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
6561
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
6662
; CHECK-NEXT: bitrev.d $a0, $a0
67-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
68-
; CHECK-NEXT: xvshuf4i.w $xr0, $xr1, 177
63+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
64+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
65+
; CHECK-NEXT: bitrev.d $a0, $a0
66+
; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0
67+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
68+
; CHECK-NEXT: bitrev.d $a0, $a0
69+
; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1
70+
; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2
71+
; CHECK-NEXT: xvshuf4i.w $xr0, $xr2, 177
6972
; CHECK-NEXT: ret
7073
%b = call <8 x i32> @llvm.bitreverse.v8i32(<8 x i32> %a)
7174
ret <8 x i32> %b
@@ -76,18 +79,19 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>)
7679
define <4 x i64> @test_bitreverse_v4i64(<4 x i64> %a) nounwind {
7780
; CHECK-LABEL: test_bitreverse_v4i64:
7881
; CHECK: # %bb.0:
79-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
80-
; CHECK-NEXT: bitrev.d $a0, $a0
81-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 0
82-
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
83-
; CHECK-NEXT: bitrev.d $a0, $a0
84-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 1
8582
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 2
8683
; CHECK-NEXT: bitrev.d $a0, $a0
87-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 2
84+
; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 0
8885
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 3
8986
; CHECK-NEXT: bitrev.d $a0, $a0
90-
; CHECK-NEXT: xvinsgr2vr.d $xr1, $a0, 3
87+
; CHECK-NEXT: vinsgr2vr.d $vr2, $a0, 1
88+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 0
89+
; CHECK-NEXT: bitrev.d $a0, $a0
90+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 0
91+
; CHECK-NEXT: xvpickve2gr.d $a0, $xr0, 1
92+
; CHECK-NEXT: bitrev.d $a0, $a0
93+
; CHECK-NEXT: vinsgr2vr.d $vr1, $a0, 1
94+
; CHECK-NEXT: xvpermi.q $xr1, $xr2, 2
9195
; CHECK-NEXT: xvori.b $xr0, $xr1, 0
9296
; CHECK-NEXT: ret
9397
%b = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> %a)

0 commit comments

Comments
 (0)