Skip to content

Commit 66ba9dc

Browse files
authored
[LoongArch] Custom lower vecreduce. (#155196)
1 parent 8fdae0c commit 66ba9dc

17 files changed

+552
-602
lines changed

llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,13 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
342342
MVT::v2i16, MVT::v4i32, MVT::v2i32, MVT::v2i64}) {
343343
setOperationAction(ISD::TRUNCATE, VT, Custom);
344344
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
345+
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
346+
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
347+
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
348+
setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
349+
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
350+
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
351+
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
345352
}
346353
}
347354

@@ -529,6 +536,14 @@ SDValue LoongArchTargetLowering::LowerOperation(SDValue Op,
529536
return lowerBF16_TO_FP(Op, DAG);
530537
case ISD::VECREDUCE_ADD:
531538
return lowerVECREDUCE_ADD(Op, DAG);
539+
case ISD::VECREDUCE_AND:
540+
case ISD::VECREDUCE_OR:
541+
case ISD::VECREDUCE_XOR:
542+
case ISD::VECREDUCE_SMAX:
543+
case ISD::VECREDUCE_SMIN:
544+
case ISD::VECREDUCE_UMAX:
545+
case ISD::VECREDUCE_UMIN:
546+
return lowerVECREDUCE(Op, DAG);
532547
}
533548
return SDValue();
534549
}
@@ -583,6 +598,45 @@ SDValue LoongArchTargetLowering::lowerVECREDUCE_ADD(SDValue Op,
583598
DAG.getConstant(0, DL, Subtarget.getGRLenVT()));
584599
}
585600

601+
// Lower vecreduce_and/or/xor/[s/u]max/[s/u]min with a shift-and-combine tree.
602+
// For example:
603+
// call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
604+
// can be lowered to:
605+
// VBSRL_V vr1, vr0, 8
606+
// VMAX_W vr0, vr1, vr0
607+
// VBSRL_V vr1, vr0, 4
608+
// VMAX_W vr0, vr1, vr0
609+
// VPICKVE2GR_W a0, vr0, 0
610+
// A 256-bit vector operand is illegal here and is split into
611+
// two 128-bit vectors by default, which are then processed by this function.
612+
SDValue LoongArchTargetLowering::lowerVECREDUCE(SDValue Op,
613+
SelectionDAG &DAG) const {
614+
SDLoc DL(Op);
615+
616+
// OpVT is the scalar result type of the reduction; Val is the vector operand.
MVT OpVT = Op.getSimpleValueType();
617+
SDValue Val = Op.getOperand(0);
618+
619+
// Element count and element width are read from the operand type *before*
// any widening below; the reduction loop is sized by these values.
unsigned NumEles = Val.getSimpleValueType().getVectorNumElements();
620+
unsigned EleBits = Val.getSimpleValueType().getScalarSizeInBits();
621+
622+
// Make sure the operand type is legal, widening the vector until it is.
623+
while (!isTypeLegal(Val.getSimpleValueType())) {
624+
Val = DAG.WidenVector(Val, DL);
625+
}
626+
627+
// Binary opcode corresponding to this VECREDUCE_* node, and the (possibly
// widened) vector type that every intermediate node is built with.
unsigned Opcode = ISD::getVecReduceBaseOpcode(Op.getOpcode());
628+
MVT VecTy = Val.getSimpleValueType();
629+
630+
// Each step shifts the vector right by i * EleBits / 16, which equals the
// byte size of i / 2 elements, and combines the shifted copy with the
// unshifted value, halving the number of elements still to be reduced.
// NOTE(review): i /= 2 assumes NumEles is a power of two - TODO confirm.
// NOTE(review): the shift amount is created as MVT::i64 while the extract
// index below uses Subtarget.getGRLenVT() - confirm this is intended.
for (int i = NumEles; i > 1; i /= 2) {
631+
SDValue ShiftAmt = DAG.getConstant(i * EleBits / 16, DL, MVT::i64);
632+
SDValue Tmp = DAG.getNode(LoongArchISD::VBSRL, DL, VecTy, Tmp, Val);
633+
Val = DAG.getNode(Opcode, DL, VecTy, Tmp, Val);
634+
}
635+
636+
// The reduced value is read out of element 0 as the scalar result.
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpVT, Val,
637+
DAG.getConstant(0, DL, Subtarget.getGRLenVT()));
638+
}
639+
586640
SDValue LoongArchTargetLowering::lowerPREFETCH(SDValue Op,
587641
SelectionDAG &DAG) const {
588642
unsigned IsData = Op.getConstantOperandVal(4);

llvm/lib/Target/LoongArch/LoongArchISelLowering.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,7 @@ class LoongArchTargetLowering : public TargetLowering {
395395
SDValue lowerFP_TO_BF16(SDValue Op, SelectionDAG &DAG) const;
396396
SDValue lowerBF16_TO_FP(SDValue Op, SelectionDAG &DAG) const;
397397
SDValue lowerVECREDUCE_ADD(SDValue Op, SelectionDAG &DAG) const;
398+
SDValue lowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
398399

399400
bool isFPImmLegal(const APFloat &Imm, EVT VT,
400401
bool ForCodeSize) const override;

llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,13 @@ bool LoongArchTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
100100
default:
101101
return true;
102102
case Intrinsic::vector_reduce_add:
103+
case Intrinsic::vector_reduce_and:
104+
case Intrinsic::vector_reduce_or:
105+
case Intrinsic::vector_reduce_smax:
106+
case Intrinsic::vector_reduce_smin:
107+
case Intrinsic::vector_reduce_umax:
108+
case Intrinsic::vector_reduce_umin:
109+
case Intrinsic::vector_reduce_xor:
103110
return false;
104111
}
105112
}

llvm/test/CodeGen/LoongArch/lasx/vec-reduce-and.ll

Lines changed: 32 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,17 @@ define void @vec_reduce_and_v32i8(ptr %src, ptr %dst) nounwind {
55
; CHECK-LABEL: vec_reduce_and_v32i8:
66
; CHECK: # %bb.0:
77
; CHECK-NEXT: xvld $xr0, $a0, 0
8-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
9-
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
10-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
11-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
12-
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
13-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
14-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
15-
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
16-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
17-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
18-
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
19-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
20-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
21-
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
22-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
23-
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
8+
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
9+
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
10+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
11+
; CHECK-NEXT: vand.v $vr0, $vr1, $vr0
12+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4
13+
; CHECK-NEXT: vand.v $vr0, $vr1, $vr0
14+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2
15+
; CHECK-NEXT: vand.v $vr0, $vr1, $vr0
16+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1
17+
; CHECK-NEXT: vand.v $vr0, $vr1, $vr0
18+
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
2419
; CHECK-NEXT: ret
2520
%v = load <32 x i8>, ptr %src
2621
%res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %v)
@@ -32,19 +27,15 @@ define void @vec_reduce_and_v16i16(ptr %src, ptr %dst) nounwind {
3227
; CHECK-LABEL: vec_reduce_and_v16i16:
3328
; CHECK: # %bb.0:
3429
; CHECK-NEXT: xvld $xr0, $a0, 0
35-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
36-
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
37-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
38-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
39-
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
40-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
41-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
42-
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
43-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
44-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
45-
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
46-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
47-
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
30+
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
31+
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
32+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
33+
; CHECK-NEXT: vand.v $vr0, $vr1, $vr0
34+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4
35+
; CHECK-NEXT: vand.v $vr0, $vr1, $vr0
36+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2
37+
; CHECK-NEXT: vand.v $vr0, $vr1, $vr0
38+
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
4839
; CHECK-NEXT: ret
4940
%v = load <16 x i16>, ptr %src
5041
%res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %v)
@@ -56,16 +47,13 @@ define void @vec_reduce_and_v8i32(ptr %src, ptr %dst) nounwind {
5647
; CHECK-LABEL: vec_reduce_and_v8i32:
5748
; CHECK: # %bb.0:
5849
; CHECK-NEXT: xvld $xr0, $a0, 0
59-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
60-
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
61-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
62-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
63-
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
64-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
65-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
66-
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
67-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
68-
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
50+
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
51+
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
52+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
53+
; CHECK-NEXT: vand.v $vr0, $vr1, $vr0
54+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4
55+
; CHECK-NEXT: vand.v $vr0, $vr1, $vr0
56+
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
6957
; CHECK-NEXT: ret
7058
%v = load <8 x i32>, ptr %src
7159
%res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %v)
@@ -77,15 +65,11 @@ define void @vec_reduce_and_v4i64(ptr %src, ptr %dst) nounwind {
7765
; CHECK-LABEL: vec_reduce_and_v4i64:
7866
; CHECK: # %bb.0:
7967
; CHECK-NEXT: xvld $xr0, $a0, 0
80-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
81-
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
82-
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
83-
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
84-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
85-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
86-
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
87-
; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
88-
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
68+
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
69+
; CHECK-NEXT: vand.v $vr0, $vr0, $vr1
70+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
71+
; CHECK-NEXT: vand.v $vr0, $vr1, $vr0
72+
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
8973
; CHECK-NEXT: ret
9074
%v = load <4 x i64>, ptr %src
9175
%res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v)

llvm/test/CodeGen/LoongArch/lasx/vec-reduce-or.ll

Lines changed: 32 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -5,22 +5,17 @@ define void @vec_reduce_or_v32i8(ptr %src, ptr %dst) nounwind {
55
; CHECK-LABEL: vec_reduce_or_v32i8:
66
; CHECK: # %bb.0:
77
; CHECK-NEXT: xvld $xr0, $a0, 0
8-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
9-
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 228
10-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
11-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
12-
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
13-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
14-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
15-
; CHECK-NEXT: xvsrli.d $xr1, $xr1, 32
16-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
17-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
18-
; CHECK-NEXT: xvshuf4i.b $xr1, $xr1, 14
19-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
20-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
21-
; CHECK-NEXT: xvrepl128vei.b $xr1, $xr1, 1
22-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
23-
; CHECK-NEXT: xvstelm.b $xr0, $a1, 0, 0
8+
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
9+
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
10+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
11+
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
12+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4
13+
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
14+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2
15+
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
16+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 1
17+
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
18+
; CHECK-NEXT: vstelm.b $vr0, $a1, 0, 0
2419
; CHECK-NEXT: ret
2520
%v = load <32 x i8>, ptr %src
2621
%res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %v)
@@ -32,19 +27,15 @@ define void @vec_reduce_or_v16i16(ptr %src, ptr %dst) nounwind {
3227
; CHECK-LABEL: vec_reduce_or_v16i16:
3328
; CHECK: # %bb.0:
3429
; CHECK-NEXT: xvld $xr0, $a0, 0
35-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
36-
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 228
37-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
38-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
39-
; CHECK-NEXT: xvbsrl.v $xr1, $xr1, 8
40-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
41-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
42-
; CHECK-NEXT: xvshuf4i.h $xr1, $xr1, 14
43-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
44-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
45-
; CHECK-NEXT: xvrepl128vei.h $xr1, $xr1, 1
46-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
47-
; CHECK-NEXT: xvstelm.h $xr0, $a1, 0, 0
30+
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
31+
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
32+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
33+
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
34+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4
35+
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
36+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 2
37+
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
38+
; CHECK-NEXT: vstelm.h $vr0, $a1, 0, 0
4839
; CHECK-NEXT: ret
4940
%v = load <16 x i16>, ptr %src
5041
%res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %v)
@@ -56,16 +47,13 @@ define void @vec_reduce_or_v8i32(ptr %src, ptr %dst) nounwind {
5647
; CHECK-LABEL: vec_reduce_or_v8i32:
5748
; CHECK: # %bb.0:
5849
; CHECK-NEXT: xvld $xr0, $a0, 0
59-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 78
60-
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 228
61-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
62-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
63-
; CHECK-NEXT: xvshuf4i.w $xr1, $xr1, 14
64-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
65-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
66-
; CHECK-NEXT: xvrepl128vei.w $xr1, $xr1, 1
67-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
68-
; CHECK-NEXT: xvstelm.w $xr0, $a1, 0, 0
50+
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
51+
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
52+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
53+
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
54+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 4
55+
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
56+
; CHECK-NEXT: vstelm.w $vr0, $a1, 0, 0
6957
; CHECK-NEXT: ret
7058
%v = load <8 x i32>, ptr %src
7159
%res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %v)
@@ -77,15 +65,11 @@ define void @vec_reduce_or_v4i64(ptr %src, ptr %dst) nounwind {
7765
; CHECK-LABEL: vec_reduce_or_v4i64:
7866
; CHECK: # %bb.0:
7967
; CHECK-NEXT: xvld $xr0, $a0, 0
80-
; CHECK-NEXT: pcalau12i $a0, %pc_hi20(.LCPI3_0)
81-
; CHECK-NEXT: xvld $xr1, $a0, %pc_lo12(.LCPI3_0)
82-
; CHECK-NEXT: xvpermi.d $xr2, $xr0, 78
83-
; CHECK-NEXT: xvshuf.d $xr1, $xr0, $xr2
84-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
85-
; CHECK-NEXT: xvpermi.d $xr1, $xr0, 68
86-
; CHECK-NEXT: xvrepl128vei.d $xr1, $xr1, 1
87-
; CHECK-NEXT: xvor.v $xr0, $xr0, $xr1
88-
; CHECK-NEXT: xvstelm.d $xr0, $a1, 0, 0
68+
; CHECK-NEXT: xvpermi.q $xr1, $xr0, 1
69+
; CHECK-NEXT: vor.v $vr0, $vr0, $vr1
70+
; CHECK-NEXT: vbsrl.v $vr1, $vr0, 8
71+
; CHECK-NEXT: vor.v $vr0, $vr1, $vr0
72+
; CHECK-NEXT: vstelm.d $vr0, $a1, 0, 0
8973
; CHECK-NEXT: ret
9074
%v = load <4 x i64>, ptr %src
9175
%res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v)

0 commit comments

Comments
 (0)