Skip to content

Commit 594d359

Browse files
committed
[AArch64] Split v8f32 fptosi_sat into two v4f32.
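If we produce illegal v8f32 types, the VectorLegalizer will unroll them, scalarizing the operations. In this patch, we instead pre-split the v8f32 value into two v4f32 halves during custom legalization, so that each half is converted using legal vector types, which produces better results.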
1 parent 25efb74 commit 594d359

File tree

4 files changed

+160
-842
lines changed

4 files changed

+160
-842
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4508,21 +4508,28 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
45084508
EVT SrcElementVT = SrcVT.getVectorElementType();
45094509

45104510
// In the absence of FP16 support, promote f16 to f32 and saturate the result.
4511+
SDLoc DL(Op);
4512+
SDValue SrcVal2;
45114513
if ((SrcElementVT == MVT::f16 &&
45124514
(!Subtarget->hasFullFP16() || DstElementWidth > 16)) ||
45134515
SrcElementVT == MVT::bf16) {
45144516
MVT F32VT = MVT::getVectorVT(MVT::f32, SrcVT.getVectorNumElements());
4515-
SrcVal = DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), F32VT, SrcVal);
4517+
SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F32VT, SrcVal);
4518+
// If we are extending to a v8f32, split into two v4f32 to produce legal
4519+
// types.
4520+
if (F32VT.getSizeInBits() > 128) {
4521+
std::tie(SrcVal, SrcVal2) = DAG.SplitVector(SrcVal, DL);
4522+
F32VT = F32VT.getHalfNumVectorElementsVT();
4523+
}
45164524
SrcVT = F32VT;
45174525
SrcElementVT = MVT::f32;
45184526
SrcElementWidth = 32;
45194527
} else if (SrcElementVT != MVT::f64 && SrcElementVT != MVT::f32 &&
45204528
SrcElementVT != MVT::f16 && SrcElementVT != MVT::bf16)
45214529
return SDValue();
45224530

4523-
SDLoc DL(Op);
4524-
// Expand to f64 if we are saturating to i64, to help produce keep the lanes
4525-
// the same width and produce a fcvtzu.
4531+
// Expand to f64 if we are saturating to i64, to help keep the lanes the same
4532+
// width and produce a fcvtzu.
45264533
if (SatWidth == 64 && SrcElementWidth < 64) {
45274534
MVT F64VT = MVT::getVectorVT(MVT::f64, SrcVT.getVectorNumElements());
45284535
SrcVal = DAG.getNode(ISD::FP_EXTEND, DL, F64VT, SrcVal);
@@ -4531,9 +4538,16 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
45314538
SrcElementWidth = 64;
45324539
}
45334540
// Cases that we can emit directly.
4534-
if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth)
4535-
return DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4536-
DAG.getValueType(DstVT.getScalarType()));
4541+
if (SrcElementWidth == DstElementWidth && SrcElementWidth == SatWidth) {
4542+
SDValue Res = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal,
4543+
DAG.getValueType(DstVT.getScalarType()));
4544+
if (SrcVal2) {
4545+
SDValue Res2 = DAG.getNode(Op.getOpcode(), DL, DstVT, SrcVal2,
4546+
DAG.getValueType(DstVT.getScalarType()));
4547+
return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Res, Res2);
4548+
}
4549+
return Res;
4550+
}
45374551

45384552
// Otherwise we emit a cvt that saturates to a higher BW, and saturate the
45394553
// result. This is only valid if the legal cvt is larger than the saturate
@@ -4545,20 +4559,32 @@ AArch64TargetLowering::LowerVectorFP_TO_INT_SAT(SDValue Op,
45454559
EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
45464560
SDValue NativeCvt = DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal,
45474561
DAG.getValueType(IntVT.getScalarType()));
4548-
SDValue Sat;
4562+
SDValue NativeCvt2 =
4563+
SrcVal2 ? DAG.getNode(Op.getOpcode(), DL, IntVT, SrcVal2,
4564+
DAG.getValueType(IntVT.getScalarType()))
4565+
: SDValue();
4566+
SDValue Sat, Sat2;
45494567
if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
45504568
SDValue MinC = DAG.getConstant(
45514569
APInt::getSignedMaxValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
45524570
SDValue Min = DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt, MinC);
4571+
SDValue Min2 = SrcVal2 ? DAG.getNode(ISD::SMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
45534572
SDValue MaxC = DAG.getConstant(
45544573
APInt::getSignedMinValue(SatWidth).sext(SrcElementWidth), DL, IntVT);
45554574
Sat = DAG.getNode(ISD::SMAX, DL, IntVT, Min, MaxC);
4575+
Sat2 = SrcVal2 ? DAG.getNode(ISD::SMAX, DL, IntVT, Min2, MaxC) : SDValue();
45564576
} else {
45574577
SDValue MinC = DAG.getConstant(
45584578
APInt::getAllOnes(SatWidth).zext(SrcElementWidth), DL, IntVT);
45594579
Sat = DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt, MinC);
4580+
Sat2 = SrcVal2 ? DAG.getNode(ISD::UMIN, DL, IntVT, NativeCvt2, MinC) : SDValue();
45604581
}
45614582

4583+
if (SrcVal2)
4584+
Sat = DAG.getNode(ISD::CONCAT_VECTORS, DL,
4585+
IntVT.getDoubleNumVectorElementsVT(*DAG.getContext()),
4586+
Sat, Sat2);
4587+
45624588
return DAG.getNode(ISD::TRUNCATE, DL, DstVT, Sat);
45634589
}
45644590

llvm/test/CodeGen/AArch64/fcvt_combine.ll

Lines changed: 6 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -466,72 +466,19 @@ define <8 x i16> @test_v8f16_sat(<8 x half> %in) {
466466
; CHECK-NO16: // %bb.0:
467467
; CHECK-NO16-NEXT: movi v1.8h, #68, lsl #8
468468
; CHECK-NO16-NEXT: fcvtl v2.4s, v0.4h
469-
; CHECK-NO16-NEXT: mov w8, #32767 // =0x7fff
470469
; CHECK-NO16-NEXT: fcvtl2 v0.4s, v0.8h
471-
; CHECK-NO16-NEXT: mov w11, #-32768 // =0xffff8000
472470
; CHECK-NO16-NEXT: fcvtl v3.4s, v1.4h
473471
; CHECK-NO16-NEXT: fcvtl2 v1.4s, v1.8h
474472
; CHECK-NO16-NEXT: fmul v2.4s, v2.4s, v3.4s
475473
; CHECK-NO16-NEXT: fmul v0.4s, v0.4s, v1.4s
476474
; CHECK-NO16-NEXT: fcvtn v1.4h, v2.4s
477475
; CHECK-NO16-NEXT: fcvtn2 v1.8h, v0.4s
478-
; CHECK-NO16-NEXT: fcvtl2 v0.4s, v1.8h
479-
; CHECK-NO16-NEXT: fcvtl v1.4s, v1.4h
480-
; CHECK-NO16-NEXT: mov s2, v0.s[1]
481-
; CHECK-NO16-NEXT: fcvtzs w10, s0
482-
; CHECK-NO16-NEXT: fcvtzs w15, s1
483-
; CHECK-NO16-NEXT: fcvtzs w9, s2
484-
; CHECK-NO16-NEXT: mov s2, v0.s[2]
485-
; CHECK-NO16-NEXT: mov s0, v0.s[3]
486-
; CHECK-NO16-NEXT: cmp w9, w8
487-
; CHECK-NO16-NEXT: fcvtzs w12, s2
488-
; CHECK-NO16-NEXT: mov s2, v1.s[1]
489-
; CHECK-NO16-NEXT: csel w9, w9, w8, lt
490-
; CHECK-NO16-NEXT: fcvtzs w13, s0
491-
; CHECK-NO16-NEXT: mov s0, v1.s[2]
492-
; CHECK-NO16-NEXT: cmn w9, #8, lsl #12 // =32768
493-
; CHECK-NO16-NEXT: csel w9, w9, w11, gt
494-
; CHECK-NO16-NEXT: cmp w10, w8
495-
; CHECK-NO16-NEXT: csel w10, w10, w8, lt
496-
; CHECK-NO16-NEXT: fcvtzs w14, s2
497-
; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768
498-
; CHECK-NO16-NEXT: fcvtzs w16, s0
499-
; CHECK-NO16-NEXT: mov s0, v1.s[3]
500-
; CHECK-NO16-NEXT: csel w10, w10, w11, gt
501-
; CHECK-NO16-NEXT: cmp w12, w8
502-
; CHECK-NO16-NEXT: csel w12, w12, w8, lt
503-
; CHECK-NO16-NEXT: fmov s1, w10
504-
; CHECK-NO16-NEXT: cmn w12, #8, lsl #12 // =32768
505-
; CHECK-NO16-NEXT: csel w12, w12, w11, gt
506-
; CHECK-NO16-NEXT: cmp w13, w8
507-
; CHECK-NO16-NEXT: csel w13, w13, w8, lt
508-
; CHECK-NO16-NEXT: mov v1.s[1], w9
509-
; CHECK-NO16-NEXT: fcvtzs w9, s0
510-
; CHECK-NO16-NEXT: cmn w13, #8, lsl #12 // =32768
511-
; CHECK-NO16-NEXT: csel w13, w13, w11, gt
512-
; CHECK-NO16-NEXT: cmp w14, w8
513-
; CHECK-NO16-NEXT: csel w14, w14, w8, lt
514-
; CHECK-NO16-NEXT: cmn w14, #8, lsl #12 // =32768
515-
; CHECK-NO16-NEXT: mov v1.s[2], w12
516-
; CHECK-NO16-NEXT: csel w14, w14, w11, gt
517-
; CHECK-NO16-NEXT: cmp w15, w8
518-
; CHECK-NO16-NEXT: csel w15, w15, w8, lt
519-
; CHECK-NO16-NEXT: cmn w15, #8, lsl #12 // =32768
520-
; CHECK-NO16-NEXT: csel w10, w15, w11, gt
521-
; CHECK-NO16-NEXT: cmp w16, w8
522-
; CHECK-NO16-NEXT: mov v1.s[3], w13
523-
; CHECK-NO16-NEXT: fmov s2, w10
524-
; CHECK-NO16-NEXT: csel w10, w16, w8, lt
525-
; CHECK-NO16-NEXT: cmn w10, #8, lsl #12 // =32768
526-
; CHECK-NO16-NEXT: csel w10, w10, w11, gt
527-
; CHECK-NO16-NEXT: cmp w9, w8
528-
; CHECK-NO16-NEXT: mov v2.s[1], w14
529-
; CHECK-NO16-NEXT: csel w8, w9, w8, lt
530-
; CHECK-NO16-NEXT: cmn w8, #8, lsl #12 // =32768
531-
; CHECK-NO16-NEXT: csel w8, w8, w11, gt
532-
; CHECK-NO16-NEXT: mov v2.s[2], w10
533-
; CHECK-NO16-NEXT: mov v2.s[3], w8
534-
; CHECK-NO16-NEXT: uzp1 v0.8h, v2.8h, v1.8h
476+
; CHECK-NO16-NEXT: fcvtl v0.4s, v1.4h
477+
; CHECK-NO16-NEXT: fcvtl2 v1.4s, v1.8h
478+
; CHECK-NO16-NEXT: fcvtzs v0.4s, v0.4s
479+
; CHECK-NO16-NEXT: fcvtzs v1.4s, v1.4s
480+
; CHECK-NO16-NEXT: sqxtn v0.4h, v0.4s
481+
; CHECK-NO16-NEXT: sqxtn2 v0.8h, v1.4s
535482
; CHECK-NO16-NEXT: ret
536483
;
537484
; CHECK-FP16-LABEL: test_v8f16_sat:

0 commit comments

Comments
 (0)