Skip to content

Commit a29ca01

Browse files
committed
[AArch64][GlobalISel] Add explicit bitcast when lowering saturating add/sub and shift intrinsics.
1 parent 0d9dd60 commit a29ca01

File tree

6 files changed

+380
-77
lines changed

6 files changed

+380
-77
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 88 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4488,6 +4488,25 @@ static SDValue lowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG,
44884488
return DAG.getMergeValues({Sum, OutFlag}, DL);
44894489
}
44904490

4491+
/// Lower a scalar integer NEON intrinsic node to the target node \p Opcode.
///
/// The result type of \p Op must be i32 or i64. Operand 0 of \p Op is skipped
/// (NOTE(review): presumably the intrinsic ID of an INTRINSIC_WO_CHAIN node --
/// confirm against the callers). Every remaining operand is bitcast to the
/// floating-point type of matching width (i32 -> f32, i64 -> f64), the
/// \p Opcode node is created with that floating-point type, and its result is
/// bitcast back to the original integer type.
static SDValue lowerIntNeonIntrinsic(SDValue Op, unsigned Opcode,
                                     SelectionDAG &DAG) {
  const EVT IntVT = Op.getValueType();
  assert((IntVT == MVT::i32 || IntVT == MVT::i64) &&
         "lowerIntNeonIntrinsic expects 32/64-bit scalar operation.");

  // Pick the floating-point type with the same bit width as the result.
  EVT FPVT = MVT::f64;
  if (IntVT == MVT::i32)
    FPVT = MVT::f32;

  // Re-type every operand after the first one.
  const unsigned NumOperands = Op.getNumOperands();
  SmallVector<SDValue, 2> CastOps;
  CastOps.reserve(NumOperands - 1);
  unsigned Idx = 1;
  while (Idx < NumOperands) {
    CastOps.push_back(DAG.getBitcast(FPVT, Op.getOperand(Idx)));
    ++Idx;
  }

  // Build the node on the floating-point type, then restore the integer type
  // the users of Op expect.
  SDLoc DL(Op);
  return DAG.getBitcast(IntVT, DAG.getNode(Opcode, DL, FPVT, CastOps));
}
4509+
44914510
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
44924511
// Let legalize expand this if it isn't a legal type yet.
44934512
if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
@@ -6359,26 +6378,45 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
63596378
Op.getOperand(1).getValueType(),
63606379
Op.getOperand(1), Op.getOperand(2)));
63616380
return SDValue();
6381+
case Intrinsic::aarch64_neon_sqrshl:
6382+
if (Op.getValueType().isVector())
6383+
return SDValue();
6384+
return lowerIntNeonIntrinsic(Op, AArch64ISD::SQRSHL, DAG);
6385+
case Intrinsic::aarch64_neon_sqshl:
6386+
if (Op.getValueType().isVector())
6387+
return SDValue();
6388+
return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSHL, DAG);
6389+
case Intrinsic::aarch64_neon_uqrshl:
6390+
if (Op.getValueType().isVector())
6391+
return SDValue();
6392+
return lowerIntNeonIntrinsic(Op, AArch64ISD::UQRSHL, DAG);
6393+
case Intrinsic::aarch64_neon_uqshl:
6394+
if (Op.getValueType().isVector())
6395+
return SDValue();
6396+
return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSHL, DAG);
63626397
case Intrinsic::aarch64_neon_sqadd:
63636398
if (Op.getValueType().isVector())
63646399
return DAG.getNode(ISD::SADDSAT, DL, Op.getValueType(), Op.getOperand(1),
63656400
Op.getOperand(2));
6366-
return SDValue();
6401+
return lowerIntNeonIntrinsic(Op, AArch64ISD::SQADD, DAG);
6402+
63676403
case Intrinsic::aarch64_neon_sqsub:
63686404
if (Op.getValueType().isVector())
63696405
return DAG.getNode(ISD::SSUBSAT, DL, Op.getValueType(), Op.getOperand(1),
63706406
Op.getOperand(2));
6371-
return SDValue();
6407+
return lowerIntNeonIntrinsic(Op, AArch64ISD::SQSUB, DAG);
6408+
63726409
case Intrinsic::aarch64_neon_uqadd:
63736410
if (Op.getValueType().isVector())
63746411
return DAG.getNode(ISD::UADDSAT, DL, Op.getValueType(), Op.getOperand(1),
63756412
Op.getOperand(2));
6376-
return SDValue();
6413+
return lowerIntNeonIntrinsic(Op, AArch64ISD::UQADD, DAG);
63776414
case Intrinsic::aarch64_neon_uqsub:
63786415
if (Op.getValueType().isVector())
63796416
return DAG.getNode(ISD::USUBSAT, DL, Op.getValueType(), Op.getOperand(1),
63806417
Op.getOperand(2));
6381-
return SDValue();
6418+
return lowerIntNeonIntrinsic(Op, AArch64ISD::UQSUB, DAG);
6419+
63826420
case Intrinsic::aarch64_sve_whilelt:
63836421
return optimizeIncrementingWhile(Op.getNode(), DAG, /*IsSigned=*/true,
63846422
/*IsEqual=*/false);
@@ -6713,6 +6751,52 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
67136751
case Intrinsic::experimental_vector_match: {
67146752
return LowerVectorMatch(Op, DAG);
67156753
}
6754+
// case Intrinsic::aarch64_neon_fcvtas:
6755+
// case Intrinsic::aarch64_neon_fcvtau:
6756+
// case Intrinsic::aarch64_neon_fcvtms:
6757+
// case Intrinsic::aarch64_neon_fcvtmu:
6758+
// case Intrinsic::aarch64_neon_fcvtns:
6759+
// case Intrinsic::aarch64_neon_fcvtnu:
6760+
// case Intrinsic::aarch64_neon_fcvtps:
6761+
// case Intrinsic::aarch64_neon_fcvtpu:
6762+
// case Intrinsic::aarch64_neon_fcvtzs:
6763+
// case Intrinsic::aarch64_neon_fcvtzu:
6764+
// case Intrinsic::aarch64_neon_sqabs:
6765+
// case Intrinsic::aarch64_neon_sqneg:
6766+
// case Intrinsic::aarch64_neon_scalar_sqxtn:
6767+
// case Intrinsic::aarch64_neon_scalar_sqxtun:
6768+
// case Intrinsic::aarch64_neon_scalar_uqxtn:
6769+
// case Intrinsic::aarch64_neon_sqadd:
6770+
// case Intrinsic::aarch64_neon_sqdmulh:
6771+
// case Intrinsic::aarch64_neon_sqrdmulh:
6772+
// case Intrinsic::aarch64_neon_sqrshl:
6773+
// case Intrinsic::aarch64_neon_sqshl:
6774+
// case Intrinsic::aarch64_neon_sqshlu:
6775+
// case Intrinsic::aarch64_neon_sqsub:
6776+
// case Intrinsic::aarch64_neon_srshl:
6777+
// case Intrinsic::aarch64_neon_sshl:
6778+
// case Intrinsic::aarch64_neon_suqadd:
6779+
// case Intrinsic::aarch64_neon_uqadd:
6780+
// case Intrinsic::aarch64_neon_uqrshl:
6781+
// case Intrinsic::aarch64_neon_uqshl:
6782+
// case Intrinsic::aarch64_neon_uqsub:
6783+
// case Intrinsic::aarch64_neon_urshl:
6784+
// case Intrinsic::aarch64_neon_ushl:
6785+
// case Intrinsic::aarch64_neon_usqadd:
6786+
// case Intrinsic::aarch64_neon_rshrn:
6787+
// case Intrinsic::aarch64_neon_sqrshrn:
6788+
// case Intrinsic::aarch64_neon_sqrshrun:
6789+
// case Intrinsic::aarch64_neon_sqshrn:
6790+
// case Intrinsic::aarch64_neon_sqshrun:
6791+
// case Intrinsic::aarch64_neon_uqrshrn:
6792+
// case Intrinsic::aarch64_neon_uqshrn:
6793+
// case Intrinsic::aarch64_neon_sqdmulh_lane:
6794+
// case Intrinsic::aarch64_neon_sqdmulh_laneq:
6795+
// case Intrinsic::aarch64_neon_sqrdmulh_lane:
6796+
// case Intrinsic::aarch64_neon_sqrdmulh_laneq:
6797+
// case Intrinsic::aarch64_neon_sqrdmlah:
6798+
// case Intrinsic::aarch64_neon_sqrdmlsh:
6799+
// case Intrinsic::aarch64_neon_abs:{
67166800
}
67176801
}
67186802

llvm/lib/Target/AArch64/AArch64InstrFormats.td

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7703,16 +7703,21 @@ multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
77037703
}
77047704

77057705
multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
7706-
SDPatternOperator OpNode, SDPatternOperator SatOp> {
7706+
SDPatternOperator OpNode, SDPatternOperator G_OpNode, SDPatternOperator SatOp> {
77077707
def v1i64 : BaseSIMDThreeScalar<U, 0b111, opc, FPR64, asm,
77087708
[(set (v1i64 FPR64:$Rd), (SatOp (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
77097709
def v1i32 : BaseSIMDThreeScalar<U, 0b101, opc, FPR32, asm, []>;
77107710
def v1i16 : BaseSIMDThreeScalar<U, 0b011, opc, FPR16, asm, []>;
77117711
def v1i8 : BaseSIMDThreeScalar<U, 0b001, opc, FPR8 , asm, []>;
77127712

7713-
def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
7713+
def : Pat<(i64 (G_OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
77147714
(!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
7715-
def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
7715+
def : Pat<(i32 (G_OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
7716+
(!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
7717+
7718+
def : Pat<(f64 (OpNode FPR64:$Rn, FPR64:$Rm)),
7719+
(!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
7720+
def : Pat<(f32 (OpNode FPR32:$Rn, FPR32:$Rm)),
77167721
(!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
77177722
}
77187723

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1000,6 +1000,25 @@ def AArch64fcvtnu_half : SDNode<"AArch64ISD::FCVTNU_HALF", SDTFPExtendOp>;
10001000
def AArch64fcvtps_half : SDNode<"AArch64ISD::FCVTPS_HALF", SDTFPExtendOp>;
10011001
def AArch64fcvtpu_half : SDNode<"AArch64ISD::FCVTPU_HALF", SDTFPExtendOp>;
10021002

1003+
// SDNode records for the AArch64ISD saturating add/sub opcodes (SQADD, SQSUB,
// UQADD, UQSUB) and saturating shift opcodes (SQSHL, SQRSHL, UQSHL, UQRSHL).
// All eight are declared with the SDTFPBinOp type profile. The signed add/sub
// records carry a "_node" suffix; the unsuffixed names AArch64sqadd and
// AArch64sqsub are defined separately as PatFrags.
def AArch64sqadd_node: SDNode<"AArch64ISD::SQADD", SDTFPBinOp>;
1004+
def AArch64sqrshl: SDNode<"AArch64ISD::SQRSHL", SDTFPBinOp>;
1005+
def AArch64sqshl: SDNode<"AArch64ISD::SQSHL", SDTFPBinOp>;
1006+
def AArch64sqsub_node: SDNode<"AArch64ISD::SQSUB", SDTFPBinOp>;
1007+
def AArch64uqadd: SDNode<"AArch64ISD::UQADD", SDTFPBinOp>;
1008+
def AArch64uqrshl: SDNode<"AArch64ISD::UQRSHL", SDTFPBinOp>;
1009+
def AArch64uqshl: SDNode<"AArch64ISD::UQSHL", SDTFPBinOp>;
1010+
def AArch64uqsub: SDNode<"AArch64ISD::UQSUB", SDTFPBinOp>;
1011+
1012+
// These PatFrags are a temporary hack to work around pattern-matching issues with intrinsics that have not been updated yet.
1013+
// AArch64sqadd and AArch64sqsub each accept three alternative fragments for
// the two-operand form ($lhs, $rhs):
//   1. the corresponding *_node on f32 operands, with bitconverts on both
//      operands and on the result;
//   2. the same shape on f64 operands;
//   3. the int_aarch64_neon_sqadd / int_aarch64_neon_sqsub intrinsic applied
//      directly to the operands.
def AArch64sqadd: PatFrags<(ops node:$lhs, node:$rhs),
1014+
[(bitconvert (AArch64sqadd_node (f32 (bitconvert node:$lhs)), (f32 (bitconvert node:$rhs)))),
1015+
(bitconvert (AArch64sqadd_node (f64 (bitconvert node:$lhs)), (f64 (bitconvert node:$rhs)))),
1016+
(int_aarch64_neon_sqadd node:$lhs, node:$rhs)]>;
1017+
def AArch64sqsub: PatFrags<(ops node:$lhs, node:$rhs),
1018+
[(bitconvert (AArch64sqsub_node (f32 (bitconvert node:$lhs)), (f32 (bitconvert node:$rhs)))),
1019+
(bitconvert (AArch64sqsub_node (f64 (bitconvert node:$lhs)), (f64 (bitconvert node:$rhs)))),
1020+
(int_aarch64_neon_sqsub node:$lhs, node:$rhs)]>;
1021+
10031022
//def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>;
10041023

10051024
// Vector immediate ops
@@ -6453,19 +6472,19 @@ defm FCMGT : SIMDThreeScalarFPCmp<1, 1, 0b100, "fcmgt", AArch64fcmgt>;
64536472
defm FMULX : SIMDFPThreeScalar<0, 0, 0b011, "fmulx", int_aarch64_neon_fmulx, HasNEONandIsStreamingSafe>;
64546473
defm FRECPS : SIMDFPThreeScalar<0, 0, 0b111, "frecps", int_aarch64_neon_frecps, HasNEONandIsStreamingSafe>;
64556474
defm FRSQRTS : SIMDFPThreeScalar<0, 1, 0b111, "frsqrts", int_aarch64_neon_frsqrts, HasNEONandIsStreamingSafe>;
6456-
defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd, saddsat>;
6475+
defm SQADD : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", AArch64sqadd_node, int_aarch64_neon_sqadd, saddsat>;
64576476
defm SQDMULH : SIMDThreeScalarHS< 0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
64586477
defm SQRDMULH : SIMDThreeScalarHS< 1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
6459-
defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl", int_aarch64_neon_sqrshl, int_aarch64_neon_sqrshl>;
6460-
defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl, int_aarch64_neon_sqshl>;
6461-
defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub, ssubsat>;
6478+
defm SQRSHL : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl", AArch64sqrshl, int_aarch64_neon_sqrshl, int_aarch64_neon_sqrshl>;
6479+
defm SQSHL : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", AArch64sqshl, int_aarch64_neon_sqshl, int_aarch64_neon_sqshl>;
6480+
defm SQSUB : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", AArch64sqsub_node, int_aarch64_neon_sqsub, ssubsat>;
64626481
defm SRSHL : SIMDThreeScalarD< 0, 0b01010, "srshl", int_aarch64_neon_srshl>;
64636482
defm SSHL : SIMDThreeScalarD< 0, 0b01000, "sshl", int_aarch64_neon_sshl>;
64646483
defm SUB : SIMDThreeScalarD< 1, 0b10000, "sub", sub>;
6465-
defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd, uaddsat>;
6466-
defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl", int_aarch64_neon_uqrshl, int_aarch64_neon_uqrshl>;
6467-
defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl, int_aarch64_neon_uqshl>;
6468-
defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub, usubsat>;
6484+
defm UQADD : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", AArch64uqadd, int_aarch64_neon_uqadd, uaddsat>;
6485+
defm UQRSHL : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl", AArch64uqrshl, int_aarch64_neon_uqrshl, int_aarch64_neon_uqrshl>;
6486+
defm UQSHL : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", AArch64uqshl, int_aarch64_neon_uqshl, int_aarch64_neon_uqshl>;
6487+
defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", AArch64uqsub, int_aarch64_neon_uqsub, usubsat>;
64696488
defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>;
64706489
defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>;
64716490
let Predicates = [HasRDM] in {
@@ -6520,11 +6539,11 @@ defm SQDMULL : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
65206539
defm SQDMLAL : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
65216540
defm SQDMLSL : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
65226541

6523-
def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
6542+
def : Pat<(i64 (AArch64sqadd (i64 FPR64:$Rd),
65246543
(i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
65256544
(i32 FPR32:$Rm))))),
65266545
(SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
6527-
def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
6546+
def : Pat<(i64 (AArch64sqsub (i64 FPR64:$Rd),
65286547
(i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
65296548
(i32 FPR32:$Rm))))),
65306549
(SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
@@ -8545,9 +8564,9 @@ defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
85458564
TriOpFrag<(sub node:$LHS, (AArch64smull node:$MHS, node:$RHS))>>;
85468565
defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull", AArch64smull>;
85478566
defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal", saddsat,
8548-
int_aarch64_neon_sqadd>;
8567+
AArch64sqadd>;
85498568
defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", ssubsat,
8550-
int_aarch64_neon_sqsub>;
8569+
AArch64sqsub>;
85518570
defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
85528571
int_aarch64_neon_sqrdmlah>;
85538572
defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",

0 commit comments

Comments
 (0)