diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index e35ad52488501..3ad2905ce5207 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -753,6 +753,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::v8bf16, Expand); } + // For bf16, fpextend is custom lowered to be optionally expanded into shifts. + setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Custom); + auto LegalizeNarrowFP = [this](MVT ScalarVT) { for (auto Op : { ISD::SETCC, @@ -893,10 +901,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::f16, Legal); } - // Strict conversion to a larger type is legal - for (auto VT : {MVT::f32, MVT::f64}) - setOperationAction(ISD::STRICT_FP_EXTEND, VT, Legal); - setOperationAction(ISD::PREFETCH, MVT::Other, Custom); setOperationAction(ISD::GET_ROUNDING, MVT::i32, Custom); @@ -4498,6 +4502,54 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable())) return LowerFixedLengthFPExtendToSVE(Op, DAG); + bool IsStrict = Op->isStrictFPOpcode(); + SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0); + EVT Op0VT = Op0.getValueType(); + if (VT == MVT::f64) { + // FP16->FP64 and FP32->FP64 extends are legal. + if (Op0VT == MVT::f32 || Op0VT == MVT::f16) + return Op; + // Split bf16->f64 extends into two fpextends. 
+ if (Op0VT == MVT::bf16 && IsStrict) { + SDValue Ext1 = + DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {MVT::f32, MVT::Other}, + {Op0, Op.getOperand(0)}); + return DAG.getNode(ISD::STRICT_FP_EXTEND, SDLoc(Op), {VT, MVT::Other}, + {Ext1, Ext1.getValue(1)}); + } + if (Op0VT == MVT::bf16) + return DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), VT, + DAG.getNode(ISD::FP_EXTEND, SDLoc(Op), MVT::f32, Op0)); + return SDValue(); + } + + if (VT.getScalarType() == MVT::f32) { + // FP16->FP32 extends are legal for f32 and v4f32. + if (Op0VT.getScalarType() == MVT::f16) + return Op; + if (Op0VT.getScalarType() == MVT::bf16) { + SDLoc DL(Op); + EVT IVT = VT.changeTypeToInteger(); + if (!Op0VT.isVector()) { + Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0); + IVT = MVT::v4i32; + } + + EVT Op0IVT = Op0.getValueType().changeTypeToInteger(); + SDValue Ext = + DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0)); + SDValue Shift = + DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT)); + if (!Op0VT.isVector()) + Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift, + DAG.getConstant(0, DL, MVT::i64)); + Shift = DAG.getBitcast(VT, Shift); + return IsStrict ? 
DAG.getMergeValues({Shift, Op.getOperand(0)}, DL) + : Shift; + } + return SDValue(); + } + assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); return SDValue(); } @@ -7345,6 +7397,7 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op, case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG); case ISD::FP_EXTEND: + case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ec891ea4bac85..c6f5cdcd1d5fe 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -5123,22 +5123,6 @@ let Predicates = [HasFullFP16] in { //===----------------------------------------------------------------------===// defm FCVT : FPConversion<"fcvt">; -// Helper to get bf16 into fp32. -def cvt_bf16_to_fp32 : - OutPatFrag<(ops node:$Rn), - (f32 (COPY_TO_REGCLASS - (i32 (UBFMWri - (i32 (COPY_TO_REGCLASS (INSERT_SUBREG (f32 (IMPLICIT_DEF)), - node:$Rn, hsub), GPR32)), - (i64 (i32shift_a (i64 16))), - (i64 (i32shift_b (i64 16))))), - FPR32))>; -// Pattern for bf16 -> fp32. -def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))), - (cvt_bf16_to_fp32 FPR16:$Rn)>; -// Pattern for bf16 -> fp64. -def : Pat<(f64 (any_fpextend (bf16 FPR16:$Rn))), - (FCVTDSr (f32 (cvt_bf16_to_fp32 FPR16:$Rn)))>; //===----------------------------------------------------------------------===// // Floating point single operand instructions. @@ -8333,8 +8317,6 @@ def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))> def : Pat<(v2i64 (sext (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>; def : Pat<(v2i64 (zext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>; -// Vector bf16 -> fp32 is implemented morally as a zext + shift. 
-def : Pat<(v4f32 (any_fpextend (v4bf16 V64:$Rn))), (SHLLv4i16 V64:$Rn)>; // Also match an extend from the upper half of a 128 bit source register. def : Pat<(v8i16 (anyext (v8i8 (extract_high_v16i8 (v16i8 V128:$Rn)) ))), (USHLLv16i8_shift V128:$Rn, (i32 0))>; diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll index 1aa28f5c2733d..9a1203f18243d 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll @@ -156,11 +156,10 @@ define i32 @fptosi_bf(bfloat %a) nounwind ssp { ; CHECK-LABEL: fptosi_bf: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov s1, s0 -; CHECK-NEXT: // implicit-def: $s0 +; CHECK-NEXT: // implicit-def: $d0 ; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: fcvtzs w0, s0 ; CHECK-NEXT: ret entry: @@ -173,11 +172,10 @@ define i32 @fptoui_sbf(bfloat %a) nounwind ssp { ; CHECK-LABEL: fptoui_sbf: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: fmov s1, s0 -; CHECK-NEXT: // implicit-def: $s0 +; CHECK-NEXT: // implicit-def: $d0 ; CHECK-NEXT: fmov s0, s1 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: fcvtzu w0, s0 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index ed9c1b037d0cc..fb40dfcbe101d 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -182,17 +182,14 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: 
test_atomicrmw_fadd_bf16_seq_cst_align2: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fadd s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -202,36 +199,34 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB2_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB2_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fadd s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, 
uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB2_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align2: @@ -281,17 +276,14 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align2(ptr %ptr, bfloat %value) define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fadd s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -301,36 +293,34 @@ define bfloat @test_atomicrmw_fadd_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB3_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB3_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov 
s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fadd s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB3_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fadd_bf16_seq_cst_align4: diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 888b795876f7d..818dcf3a0b487 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -184,17 +184,14 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fmaxnm s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -204,36 +201,34 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: 
stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB2_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB2_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fmaxnm s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB2_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align2: @@ -283,17 +278,14 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align2(ptr %ptr, bfloat %value) define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, 
w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fmaxnm s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -303,36 +295,34 @@ define bfloat @test_atomicrmw_fmax_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB3_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB3_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fmaxnm s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB3_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmax_bf16_seq_cst_align4: @@ 
-653,31 +643,23 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NOLSE-NEXT: mov h1, v0.h[1] -; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: dup v1.4h, v0.h[1] ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fmov w9, s1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s0, w9 +; NOLSE-NEXT: shll v0.4s, v0.4h, #16 +; NOLSE-NEXT: shll v1.4s, v1.4h, #16 ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w9, [x0] ; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: mov h3, v2.h[1] -; NOLSE-NEXT: fmov w11, s2 -; NOLSE-NEXT: lsl w11, w11, #16 -; NOLSE-NEXT: fmov w10, s3 -; NOLSE-NEXT: fmov s3, w11 -; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fmaxnm s3, s3, s1 -; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: dup v3.4h, v2.h[1] +; NOLSE-NEXT: shll v2.4s, v2.4h, #16 ; NOLSE-NEXT: fmaxnm s2, s2, s0 -; NOLSE-NEXT: fmov w11, s3 +; NOLSE-NEXT: shll v3.4s, v3.4h, #16 +; NOLSE-NEXT: fmaxnm s3, s3, s1 +; NOLSE-NEXT: fmov w11, s2 ; NOLSE-NEXT: ubfx w13, w11, #16, #1 ; NOLSE-NEXT: add w11, w11, w8 -; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: fmov w10, s3 ; NOLSE-NEXT: add w11, w13, w11 ; NOLSE-NEXT: lsr w11, w11, #16 ; NOLSE-NEXT: ubfx w12, w10, #16, #1 @@ -697,25 +679,17 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: ; LSE: // %bb.0: ; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 -; LSE-NEXT: mov h1, v0.h[1] -; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: dup v1.4h, v0.h[1] +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr s0, [x0] -; LSE-NEXT: lsl w10, w10, #16 -; LSE-NEXT: fmov w9, s1 -; LSE-NEXT: fmov s2, w10 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: shll v1.4s, 
v1.4h, #16 ; LSE-NEXT: .LBB7_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: mov h3, v0.h[1] -; LSE-NEXT: fmov w10, s0 -; LSE-NEXT: lsl w10, w10, #16 -; LSE-NEXT: fmov w9, s3 -; LSE-NEXT: fmov s4, w10 -; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: dup v3.4h, v0.h[1] +; LSE-NEXT: shll v4.4s, v0.4h, #16 ; LSE-NEXT: fmaxnm s4, s4, s2 -; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: shll v3.4s, v3.4h, #16 ; LSE-NEXT: fmaxnm s3, s3, s1 ; LSE-NEXT: fmov w10, s4 ; LSE-NEXT: ubfx w12, w10, #16, #1 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index a3665c6e42860..b969241e8bf90 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -184,17 +184,14 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, half %value) #0 { define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fminnm s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -204,36 +201,34 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB2_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: 
test_atomicrmw_fmin_bf16_seq_cst_align2: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB2_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fminnm s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB2_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align2: @@ -283,17 +278,14 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align2(ptr %ptr, bfloat %value) define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov 
s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fminnm s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -303,36 +295,34 @@ define bfloat @test_atomicrmw_fmin_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB3_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB3_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fminnm s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB3_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fmin_bf16_seq_cst_align4: @@ -653,31 +643,23 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 -; 
NOLSE-NEXT: mov h1, v0.h[1] -; NOLSE-NEXT: fmov w10, s0 +; NOLSE-NEXT: dup v1.4h, v0.h[1] ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fmov w9, s1 -; NOLSE-NEXT: fmov s1, w10 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s0, w9 +; NOLSE-NEXT: shll v0.4s, v0.4h, #16 +; NOLSE-NEXT: shll v1.4s, v1.4h, #16 ; NOLSE-NEXT: .LBB7_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w9, [x0] ; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: mov h3, v2.h[1] -; NOLSE-NEXT: fmov w11, s2 -; NOLSE-NEXT: lsl w11, w11, #16 -; NOLSE-NEXT: fmov w10, s3 -; NOLSE-NEXT: fmov s3, w11 -; NOLSE-NEXT: lsl w10, w10, #16 -; NOLSE-NEXT: fminnm s3, s3, s1 -; NOLSE-NEXT: fmov s2, w10 +; NOLSE-NEXT: dup v3.4h, v2.h[1] +; NOLSE-NEXT: shll v2.4s, v2.4h, #16 ; NOLSE-NEXT: fminnm s2, s2, s0 -; NOLSE-NEXT: fmov w11, s3 +; NOLSE-NEXT: shll v3.4s, v3.4h, #16 +; NOLSE-NEXT: fminnm s3, s3, s1 +; NOLSE-NEXT: fmov w11, s2 ; NOLSE-NEXT: ubfx w13, w11, #16, #1 ; NOLSE-NEXT: add w11, w11, w8 -; NOLSE-NEXT: fmov w10, s2 +; NOLSE-NEXT: fmov w10, s3 ; NOLSE-NEXT: add w11, w13, w11 ; NOLSE-NEXT: lsr w11, w11, #16 ; NOLSE-NEXT: ubfx w12, w10, #16, #1 @@ -697,25 +679,17 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: ; LSE: // %bb.0: ; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 -; LSE-NEXT: mov h1, v0.h[1] -; LSE-NEXT: fmov w10, s0 +; LSE-NEXT: dup v1.4h, v0.h[1] +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr s0, [x0] -; LSE-NEXT: lsl w10, w10, #16 -; LSE-NEXT: fmov w9, s1 -; LSE-NEXT: fmov s2, w10 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 +; LSE-NEXT: shll v1.4s, v1.4h, #16 ; LSE-NEXT: .LBB7_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: mov h3, v0.h[1] -; LSE-NEXT: fmov w10, s0 -; LSE-NEXT: lsl w10, w10, #16 -; LSE-NEXT: fmov w9, s3 -; LSE-NEXT: fmov 
s4, w10 -; LSE-NEXT: lsl w9, w9, #16 +; LSE-NEXT: dup v3.4h, v0.h[1] +; LSE-NEXT: shll v4.4s, v0.4h, #16 ; LSE-NEXT: fminnm s4, s4, s2 -; LSE-NEXT: fmov s3, w9 +; LSE-NEXT: shll v3.4s, v3.4h, #16 ; LSE-NEXT: fminnm s3, s3, s1 ; LSE-NEXT: fmov w10, s4 ; LSE-NEXT: ubfx w12, w10, #16, #1 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 7725ce0e73185..e603337e7a569 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -182,17 +182,14 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB2_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fsub s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -202,36 +199,34 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) ; NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB2_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // 
=0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB2_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fsub s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB2_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align2: @@ -281,17 +276,14 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align2(ptr %ptr, bfloat %value) define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) #0 { ; NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: ; NOLSE: // %bb.0: -; NOLSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; NOLSE-NEXT: fmov w9, s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; NOLSE-NEXT: shll v1.4s, v0.4h, #16 ; NOLSE-NEXT: mov w8, #32767 // =0x7fff -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s1, w9 ; NOLSE-NEXT: .LBB3_1: // %atomicrmw.start ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxrh w9, [x0] ; NOLSE-NEXT: fmov s0, w9 -; NOLSE-NEXT: lsl w9, w9, #16 -; NOLSE-NEXT: fmov s2, w9 +; NOLSE-NEXT: shll v2.4s, v0.4h, #16 ; NOLSE-NEXT: fsub s2, s2, s1 ; NOLSE-NEXT: fmov w9, s2 ; NOLSE-NEXT: ubfx w10, w9, #16, #1 @@ -301,36 +293,34 @@ define bfloat @test_atomicrmw_fsub_bf16_seq_cst_align4(ptr %ptr, bfloat %value) ; 
NOLSE-NEXT: stlxrh w10, w9, [x0] ; NOLSE-NEXT: cbnz w10, .LBB3_1 ; NOLSE-NEXT: // %bb.2: // %atomicrmw.end -; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; NOLSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; NOLSE-NEXT: ret ; ; LSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: ; LSE: // %bb.0: -; LSE-NEXT: // kill: def $h0 killed $h0 def $s0 -; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: // kill: def $h0 killed $h0 def $d0 +; LSE-NEXT: shll v1.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr h0, [x0] -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s1, w9 ; LSE-NEXT: .LBB3_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: fmov w9, s0 -; LSE-NEXT: lsl w9, w9, #16 -; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: fsub s2, s2, s1 ; LSE-NEXT: fmov w9, s2 ; LSE-NEXT: ubfx w10, w9, #16, #1 ; LSE-NEXT: add w9, w9, w8 ; LSE-NEXT: add w9, w10, w9 -; LSE-NEXT: fmov w10, s0 ; LSE-NEXT: lsr w9, w9, #16 -; LSE-NEXT: mov w11, w10 -; LSE-NEXT: casalh w11, w9, [x0] +; LSE-NEXT: fmov s2, w9 +; LSE-NEXT: fmov w9, s0 +; LSE-NEXT: fmov w10, s2 +; LSE-NEXT: mov w11, w9 +; LSE-NEXT: casalh w11, w10, [x0] ; LSE-NEXT: fmov s0, w11 -; LSE-NEXT: cmp w11, w10, uxth +; LSE-NEXT: cmp w11, w9, uxth ; LSE-NEXT: b.ne .LBB3_1 ; LSE-NEXT: // %bb.2: // %atomicrmw.end -; LSE-NEXT: // kill: def $h0 killed $h0 killed $s0 +; LSE-NEXT: // kill: def $h0 killed $h0 killed $d0 ; LSE-NEXT: ret ; ; SOFTFP-NOLSE-LABEL: test_atomicrmw_fsub_bf16_seq_cst_align4: diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll index 33997614598c3..bc06453e9c01f 100644 --- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll @@ -5,16 +5,12 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_fadd: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 
def $s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: fmov s1, w10 -; CHECK-CVT-NEXT: fadd s0, s1, s0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, w8 @@ -26,15 +22,11 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_fadd: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fadd s0, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fadd s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = fadd bfloat %a, %b @@ -44,16 +36,12 @@ define bfloat @test_fadd(bfloat %a, bfloat %b) #0 { define bfloat @test_fsub(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_fsub: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; 
CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: fmov s1, w10 -; CHECK-CVT-NEXT: fsub s0, s1, s0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fsub s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, w8 @@ -65,15 +53,11 @@ define bfloat @test_fsub(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_fsub: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fsub s0, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fsub s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = fsub bfloat %a, %b @@ -83,16 +67,12 @@ define bfloat @test_fsub(bfloat %a, bfloat %b) #0 { define bfloat @test_fmul(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_fmul: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: fmov s1, w10 -; CHECK-CVT-NEXT: fmul s0, s1, s0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fmul s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, 
w8 @@ -104,15 +84,11 @@ define bfloat @test_fmul(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_fmul: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fmul s0, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmul s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = fmul bfloat %a, %b @@ -122,27 +98,21 @@ define bfloat @test_fmul(bfloat %a, bfloat %b) #0 { define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-CVT-LABEL: test_fmadd: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s1 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff -; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: fmul s0, s1, s0 +; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fmul s0, s0, s1 +; CHECK-CVT-NEXT: shll v1.4s, v2.4h, #16 ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 ; CHECK-CVT-NEXT: add w8, w8, w10 ; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: fmov w9, s2 ; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; 
CHECK-CVT-NEXT: lsl w9, w9, #16 ; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 @@ -155,23 +125,15 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 { ; ; CHECK-BF16-LABEL: test_fmadd: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fmov w9, s2 -; CHECK-BF16-NEXT: fmul s0, s1, s0 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmul s0, s0, s1 +; CHECK-BF16-NEXT: shll v1.4s, v2.4h, #16 ; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fadd s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -183,16 +145,12 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 { define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_fdiv: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // 
=0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: fmov s1, w10 -; CHECK-CVT-NEXT: fdiv s0, s1, s0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fdiv s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, w8 @@ -204,15 +162,11 @@ define bfloat @test_fdiv(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_fdiv: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fdiv s0, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fdiv s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = fdiv bfloat %a, %b @@ -223,14 +177,12 @@ define bfloat @test_frem(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_frem: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-CVT-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-CVT-NEXT: bl fmodf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -246,14 +198,12 @@ define bfloat @test_frem(bfloat %a, bfloat %b) #0 { ; CHECK-BF16-LABEL: test_frem: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: fmov w9, s1 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-BF16-NEXT: bl fmodf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -334,17 +284,13 @@ define bfloat @test_select(bfloat %a, bfloat %b, i1 zeroext %c) #0 { define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 { ; CHECK-LABEL: test_select_cc: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h3 killed $h3 def $s3 -; CHECK-NEXT: // kill: def $h2 killed $h2 def 
$s2 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s2 +; CHECK-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-NEXT: // kill: def $h2 killed $h2 def $d2 ; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 ; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: fcmp s2, s3 ; CHECK-NEXT: fcsel s0, s0, s1, ne ; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0 ; CHECK-NEXT: ret @@ -356,15 +302,11 @@ define bfloat @test_select_cc(bfloat %a, bfloat %b, bfloat %c, bfloat %d) #0 { define float @test_select_cc_f32_f16(float %a, float %b, bfloat %c, bfloat %d) #0 { ; CHECK-LABEL: test_select_cc_f32_f16: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h3 killed $h3 def $s3 -; CHECK-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s2 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: // kill: def $h3 killed $h3 def $d3 +; CHECK-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: fcmp s2, s3 ; CHECK-NEXT: fcsel s0, s0, s1, ne ; CHECK-NEXT: ret %cc = fcmp une bfloat %c, %d @@ -389,15 +331,11 @@ define bfloat @test_select_cc_f16_f32(bfloat %a, bfloat %b, float %c, float %d) define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_une: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; 
CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, ne ; CHECK-NEXT: ret %r = fcmp une bfloat %a, %b @@ -407,15 +345,11 @@ define i1 @test_fcmp_une(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ueq: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w8, eq ; CHECK-NEXT: csinc w0, w8, wzr, vc ; CHECK-NEXT: ret @@ -426,15 +360,11 @@ define i1 @test_fcmp_ueq(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ugt: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, hi ; CHECK-NEXT: ret %r = fcmp ugt bfloat %a, %b @@ -444,15 +374,11 @@ define i1 @test_fcmp_ugt(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_uge: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed 
$h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, pl ; CHECK-NEXT: ret %r = fcmp uge bfloat %a, %b @@ -462,15 +388,11 @@ define i1 @test_fcmp_uge(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ult: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, lt ; CHECK-NEXT: ret %r = fcmp ult bfloat %a, %b @@ -480,15 +402,11 @@ define i1 @test_fcmp_ult(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ule: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, le ; CHECK-NEXT: ret %r = fcmp ule bfloat %a, %b @@ -498,15 
+416,11 @@ define i1 @test_fcmp_ule(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_uno: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, vs ; CHECK-NEXT: ret %r = fcmp uno bfloat %a, %b @@ -516,15 +430,11 @@ define i1 @test_fcmp_uno(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_one: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w8, mi ; CHECK-NEXT: csinc w0, w8, wzr, le ; CHECK-NEXT: ret @@ -535,15 +445,11 @@ define i1 @test_fcmp_one(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_oeq: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: 
def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, eq ; CHECK-NEXT: ret %r = fcmp oeq bfloat %a, %b @@ -553,15 +459,11 @@ define i1 @test_fcmp_oeq(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ogt: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, gt ; CHECK-NEXT: ret %r = fcmp ogt bfloat %a, %b @@ -571,15 +473,11 @@ define i1 @test_fcmp_ogt(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_oge: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, ge ; CHECK-NEXT: ret %r = fcmp oge bfloat %a, %b @@ -589,15 +487,11 @@ define i1 @test_fcmp_oge(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_olt: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // 
kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, mi ; CHECK-NEXT: ret %r = fcmp olt bfloat %a, %b @@ -607,15 +501,11 @@ define i1 @test_fcmp_olt(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ole: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, ls ; CHECK-NEXT: ret %r = fcmp ole bfloat %a, %b @@ -625,15 +515,11 @@ define i1 @test_fcmp_ole(bfloat %a, bfloat %b) #0 { define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 { ; CHECK-LABEL: test_fcmp_ord: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: cset w0, vc ; CHECK-NEXT: ret %r = fcmp ord 
bfloat %a, %b @@ -643,13 +529,11 @@ define i1 @test_fcmp_ord(bfloat %a, bfloat %b) #0 { define void @test_fccmp(bfloat %in, ptr %out) { ; CHECK-LABEL: test_fccmp: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 ; CHECK-NEXT: movi v1.2s, #69, lsl #24 -; CHECK-NEXT: movi v3.2s, #72, lsl #24 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s2, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v2.4s, v0.4h, #16 ; CHECK-NEXT: adrp x8, .LCPI29_0 +; CHECK-NEXT: movi v3.2s, #72, lsl #24 ; CHECK-NEXT: fcmp s2, s1 ; CHECK-NEXT: ldr h1, [x8, :lo12:.LCPI29_0] ; CHECK-NEXT: fccmp s2, s3, #4, mi @@ -667,15 +551,11 @@ define void @test_fccmp(bfloat %in, ptr %out) { define void @test_br_cc(bfloat %a, bfloat %b, ptr %p1, ptr %p2) #0 { ; CHECK-LABEL: test_br_cc: ; CHECK: // %bb.0: // %common.ret -; CHECK-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: csel x8, x0, x1, pl ; CHECK-NEXT: str wzr, [x8] ; CHECK-NEXT: ret @@ -725,10 +605,8 @@ declare i1 @test_dummy(ptr %p1) #0 define i32 @test_fptosi_i32(bfloat %a) #0 { ; CHECK-LABEL: test_fptosi_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs w0, s0 ; CHECK-NEXT: ret %r = fptosi bfloat %a to i32 @@ -738,10 +616,8 @@ define i32 @test_fptosi_i32(bfloat %a) #0 { define i64 @test_fptosi_i64(bfloat 
%a) #0 { ; CHECK-LABEL: test_fptosi_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs x0, s0 ; CHECK-NEXT: ret %r = fptosi bfloat %a to i64 @@ -751,10 +627,8 @@ define i64 @test_fptosi_i64(bfloat %a) #0 { define i32 @test_fptoui_i32(bfloat %a) #0 { ; CHECK-LABEL: test_fptoui_i32: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzu w0, s0 ; CHECK-NEXT: ret %r = fptoui bfloat %a to i32 @@ -764,10 +638,8 @@ define i32 @test_fptoui_i32(bfloat %a) #0 { define i64 @test_fptoui_i64(bfloat %a) #0 { ; CHECK-LABEL: test_fptoui_i64: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzu x0, s0 ; CHECK-NEXT: ret %r = fptoui bfloat %a to i64 @@ -927,7 +799,8 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 { ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: ucvtf d1, w0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: fcvtxn s1, d1 ; CHECK-CVT-NEXT: fmov w9, s1 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -935,12 +808,7 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 { ; CHECK-CVT-NEXT: add w9, w10, w9 ; CHECK-CVT-NEXT: lsr w9, w9, #16 ; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: fmov w10, s1 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; 
CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s1, w10 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -954,15 +822,11 @@ define bfloat @test_uitofp_i32_fadd(i32 %a, bfloat %b) #0 { ; CHECK-BF16-LABEL: test_uitofp_i32_fadd: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: ucvtf d1, w0 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fcvtxn s1, d1 -; CHECK-BF16-NEXT: fmov s0, w8 ; CHECK-BF16-NEXT: bfcvt h1, s1 -; CHECK-BF16-NEXT: fmov w9, s1 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-BF16-NEXT: fadd s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -976,7 +840,8 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 { ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: scvtf d1, w0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: fcvtxn s1, d1 ; CHECK-CVT-NEXT: fmov w9, s1 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -984,12 +849,7 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, bfloat %b) #0 { ; CHECK-CVT-NEXT: add w9, w10, w9 ; CHECK-CVT-NEXT: lsr w9, w9, #16 ; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: fmov w9, s0 -; CHECK-CVT-NEXT: fmov w10, s1 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s1, w10 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -1003,15 +863,11 @@ define bfloat @test_sitofp_i32_fadd(i32 %a, 
bfloat %b) #0 { ; CHECK-BF16-LABEL: test_sitofp_i32_fadd: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: scvtf d1, w0 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fcvtxn s1, d1 -; CHECK-BF16-NEXT: fmov s0, w8 ; CHECK-BF16-NEXT: bfcvt h1, s1 -; CHECK-BF16-NEXT: fmov w9, s1 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-BF16-NEXT: fadd s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -1070,10 +926,9 @@ define bfloat @test_fptrunc_double(double %a) #0 { define float @test_fpext_float(bfloat %a) #0 { ; CHECK-LABEL: test_fpext_float: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: ret %r = fpext bfloat %a to float ret float %r @@ -1082,10 +937,8 @@ define float @test_fpext_float(bfloat %a) #0 { define double @test_fpext_double(bfloat %a) #0 { ; CHECK-LABEL: test_fpext_double: ; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvt d0, s0 ; CHECK-NEXT: ret %r = fpext bfloat %a to double @@ -1148,11 +1001,9 @@ declare bfloat @llvm.fmuladd.f16(bfloat %a, bfloat %b, bfloat %c) #0 define bfloat @test_sqrt(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_sqrt: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov 
w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: fsqrt s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -1165,10 +1016,8 @@ define bfloat @test_sqrt(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_sqrt: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fsqrt s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -1180,10 +1029,9 @@ define bfloat @test_powi(bfloat %a, i32 %b) #0 { ; CHECK-CVT-LABEL: test_powi: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl __powisf2 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1199,10 +1047,9 @@ define bfloat @test_powi(bfloat %a, i32 %b) #0 { ; CHECK-BF16-LABEL: test_powi: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl __powisf2 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1216,10 +1063,9 @@ define bfloat @test_sin(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_sin: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl sinf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1235,10 +1081,9 @@ define bfloat @test_sin(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_sin: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl sinf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1251,10 +1096,9 @@ define bfloat @test_cos(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_cos: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl cosf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1270,10 +1114,9 @@ define bfloat @test_cos(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_cos: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl cosf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1286,10 +1129,9 @@ define bfloat @test_tan(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_tan: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl tanf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1305,10 +1147,9 @@ define bfloat @test_tan(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_tan: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl tanf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1321,10 +1162,9 @@ define bfloat @test_acos(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_acos: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl acosf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1340,10 +1180,9 @@ define bfloat @test_acos(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_acos: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl acosf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1356,10 +1195,9 @@ define bfloat @test_asin(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_asin: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl asinf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1375,10 +1213,9 @@ define bfloat @test_asin(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_asin: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl asinf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1391,10 +1228,9 @@ define bfloat @test_atan(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_atan: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl atanf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1410,10 +1246,9 @@ define bfloat @test_atan(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_atan: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl atanf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1426,14 +1261,12 @@ define bfloat @test_atan2(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_atan2: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-CVT-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-CVT-NEXT: bl atan2f ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1449,14 +1282,12 @@ define bfloat @test_atan2(bfloat %a, bfloat %b) #0 { ; CHECK-BF16-LABEL: test_atan2: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: fmov w9, s1 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-BF16-NEXT: bl atan2f ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1469,10 +1300,9 @@ define bfloat @test_cosh(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_cosh: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl coshf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1488,10 +1318,9 @@ define bfloat @test_cosh(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_cosh: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl coshf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1504,10 +1333,9 @@ define bfloat @test_sinh(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_sinh: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl sinhf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1523,10 +1351,9 @@ define bfloat @test_sinh(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_sinh: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl sinhf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1539,10 +1366,9 @@ define bfloat @test_tanh(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_tanh: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl tanhf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1558,10 +1384,9 @@ define bfloat @test_tanh(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_tanh: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl tanhf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1574,14 +1399,12 @@ define bfloat @test_pow(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_pow: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-CVT-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-CVT-NEXT: bl powf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1597,14 +1420,12 @@ define bfloat @test_pow(bfloat %a, bfloat %b) #0 { ; CHECK-BF16-LABEL: test_pow: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: fmov w9, s1 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 killed $q1 ; CHECK-BF16-NEXT: bl powf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1617,10 +1438,9 @@ define bfloat @test_exp(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_exp: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl expf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1636,10 +1456,9 @@ define bfloat @test_exp(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_exp: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl expf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1652,10 +1471,9 @@ define bfloat @test_exp2(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_exp2: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl exp2f ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1671,10 +1489,9 @@ define bfloat @test_exp2(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_exp2: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl exp2f ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1687,10 +1504,9 @@ define bfloat @test_log(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_log: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl logf ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1706,10 +1522,9 @@ define bfloat @test_log(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_log: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl logf ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1722,10 +1537,9 @@ define bfloat @test_log10(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_log10: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl log10f ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1741,10 +1555,9 @@ define bfloat @test_log10(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_log10: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl log10f ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1757,10 +1570,9 @@ define bfloat @test_log2(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_log2: ; CHECK-CVT: // %bb.0: ; CHECK-CVT-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: bl log2f ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff @@ -1776,10 +1588,9 @@ define bfloat @test_log2(bfloat %a) #0 { ; CHECK-BF16-LABEL: test_log2: ; CHECK-BF16: // %bb.0: ; CHECK-BF16-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: bl log2f ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload @@ -1791,20 +1602,14 @@ define bfloat @test_log2(bfloat %a) #0 { define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-CVT-LABEL: test_fma: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s2 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: fmov s2, w10 +; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff -; CHECK-CVT-NEXT: fmadd s0, s2, s1, s0 +; CHECK-CVT-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fmadd s0, s0, s1, s2 ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 ; CHECK-CVT-NEXT: add w8, w8, w10 @@ -1816,19 +1621,13 @@ define bfloat @test_fma(bfloat %a, bfloat %b, bfloat %c) #0 { ; ; CHECK-BF16-LABEL: test_fma: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s2 -; CHECK-BF16-NEXT: fmov w9, s1 -; 
CHECK-BF16-NEXT: fmov w10, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: lsl w10, w10, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fmov s2, w10 -; CHECK-BF16-NEXT: fmadd s0, s2, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmadd s0, s0, s1, s2 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.fma.f16(bfloat %a, bfloat %b, bfloat %c) @@ -1851,16 +1650,12 @@ define bfloat @test_fabs(bfloat %a) #0 { define bfloat @test_minnum(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_minnum: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: fmov s1, w10 -; CHECK-CVT-NEXT: fminnm s0, s1, s0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fminnm s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, w8 @@ -1872,15 +1667,11 @@ define bfloat @test_minnum(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_minnum: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; 
CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fminnm s0, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fminnm s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.minnum.f16(bfloat %a, bfloat %b) @@ -1890,16 +1681,12 @@ define bfloat @test_minnum(bfloat %a, bfloat %b) #0 { define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_maxnum: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s1 -; CHECK-CVT-NEXT: fmov w10, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: lsl w10, w10, #16 -; CHECK-CVT-NEXT: fmov s0, w9 -; CHECK-CVT-NEXT: fmov s1, w10 -; CHECK-CVT-NEXT: fmaxnm s0, s1, s0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fmaxnm s0, s0, s1 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 ; CHECK-CVT-NEXT: add w8, w9, w8 @@ -1911,15 +1698,11 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_maxnum: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fmaxnm s0, s1, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; 
CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmaxnm s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.maxnum.f16(bfloat %a, bfloat %b) @@ -1929,16 +1712,12 @@ define bfloat @test_maxnum(bfloat %a, bfloat %b) #0 { define bfloat @test_copysign(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_copysign: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s1 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: lsr w8, w8, #16 ; CHECK-CVT-NEXT: fmov s0, w8 @@ -1947,16 +1726,12 @@ define bfloat @test_copysign(bfloat %a, bfloat %b) #0 { ; ; CHECK-BF16-LABEL: test_copysign: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %r 
= call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b) @@ -1966,12 +1741,10 @@ define bfloat @test_copysign(bfloat %a, bfloat %b) #0 { define bfloat @test_copysign_f32(bfloat %a, float %b) #0 { ; CHECK-CVT-LABEL: test_copysign_f32: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24 ; CHECK-CVT-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: lsr w8, w8, #16 @@ -1981,12 +1754,10 @@ define bfloat @test_copysign_f32(bfloat %a, float %b) #0 { ; ; CHECK-BF16-LABEL: test_copysign_f32: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24 ; CHECK-BF16-NEXT: // kill: def $s1 killed $s1 def $q1 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -1998,12 +1769,10 @@ define bfloat @test_copysign_f32(bfloat %a, float %b) #0 { define bfloat @test_copysign_f64(bfloat %a, double %b) #0 { ; CHECK-CVT-LABEL: test_copysign_f64: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: fcvt s1, d1 ; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: lsr w8, w8, #16 @@ -2013,12 +1782,10 @@ define bfloat 
@test_copysign_f64(bfloat %a, double %b) #0 { ; ; CHECK-BF16-LABEL: test_copysign_f64: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-BF16-NEXT: fcvt s1, d1 ; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2032,34 +1799,33 @@ define bfloat @test_copysign_f64(bfloat %a, double %b) #0 { define float @test_copysign_extended(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_copysign_extended: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s1 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-CVT-NEXT: movi v2.4s, #16 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: ushl v0.4s, v0.4s, v2.4s ; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: lsl w8, w8, #16 ; CHECK-CVT-NEXT: fmov s0, w8 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-CVT-NEXT: ret ; ; CHECK-BF16-LABEL: test_copysign_extended: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 +; CHECK-BF16-NEXT: // kill: def 
$h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: movi v2.4s, #16 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: ushl v0.4s, v0.4s, v2.4s ; CHECK-BF16-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: bit v0.16b, v1.16b, v2.16b +; CHECK-BF16-NEXT: bif v0.16b, v1.16b, v2.16b +; CHECK-BF16-NEXT: bfcvt h0, s0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-BF16-NEXT: ret %r = call bfloat @llvm.copysign.f16(bfloat %a, bfloat %b) @@ -2070,11 +1836,9 @@ define float @test_copysign_extended(bfloat %a, bfloat %b) #0 { define bfloat @test_floor(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_floor: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frintm s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2087,10 +1851,8 @@ define bfloat @test_floor(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_floor: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frintm s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2101,11 +1863,9 @@ define bfloat @test_floor(bfloat %a) #0 { define bfloat @test_ceil(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_ceil: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; 
CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frintp s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2118,10 +1878,8 @@ define bfloat @test_ceil(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_ceil: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frintp s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2132,11 +1890,9 @@ define bfloat @test_ceil(bfloat %a) #0 { define bfloat @test_trunc(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_trunc: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frintz s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2149,10 +1905,8 @@ define bfloat @test_trunc(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_trunc: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frintz s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2163,11 +1917,9 @@ define bfloat @test_trunc(bfloat %a) #0 { define bfloat @test_rint(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_rint: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: 
// kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frintx s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2180,10 +1932,8 @@ define bfloat @test_rint(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_rint: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frintx s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2194,11 +1944,9 @@ define bfloat @test_rint(bfloat %a) #0 { define bfloat @test_nearbyint(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_nearbyint: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frinti s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2211,10 +1959,8 @@ define bfloat @test_nearbyint(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_nearbyint: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frinti s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2225,11 +1971,9 @@ define bfloat @test_nearbyint(bfloat %a) #0 { define bfloat @test_round(bfloat %a) #0 { ; 
CHECK-CVT-LABEL: test_round: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frinta s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2242,10 +1986,8 @@ define bfloat @test_round(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_round: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frinta s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2256,11 +1998,9 @@ define bfloat @test_round(bfloat %a) #0 { define bfloat @test_roundeven(bfloat %a) #0 { ; CHECK-CVT-LABEL: test_roundeven: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w8, #32767 // =0x7fff -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: frintn s0, s0 ; CHECK-CVT-NEXT: fmov w9, s0 ; CHECK-CVT-NEXT: ubfx w10, w9, #16, #1 @@ -2273,10 +2013,8 @@ define bfloat @test_roundeven(bfloat %a) #0 { ; ; CHECK-BF16-LABEL: test_roundeven: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: frintn s0, s0 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret @@ -2287,27 +2025,21 @@ define bfloat 
@test_roundeven(bfloat %a) #0 { define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 { ; CHECK-CVT-LABEL: test_fmuladd: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-CVT-NEXT: fmov w8, s1 -; CHECK-CVT-NEXT: fmov w9, s0 +; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mov w10, #32767 // =0x7fff -; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 -; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 -; CHECK-CVT-NEXT: fmul s0, s1, s0 +; CHECK-CVT-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-CVT-NEXT: fmul s0, s0, s1 +; CHECK-CVT-NEXT: shll v1.4s, v2.4h, #16 ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 ; CHECK-CVT-NEXT: add w8, w8, w10 ; CHECK-CVT-NEXT: add w8, w9, w8 -; CHECK-CVT-NEXT: fmov w9, s2 ; CHECK-CVT-NEXT: lsr w8, w8, #16 -; CHECK-CVT-NEXT: lsl w8, w8, #16 -; CHECK-CVT-NEXT: lsl w9, w9, #16 ; CHECK-CVT-NEXT: fmov s0, w8 -; CHECK-CVT-NEXT: fmov s1, w9 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: fadd s0, s0, s1 ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: ubfx w9, w8, #16, #1 @@ -2320,23 +2052,15 @@ define bfloat @test_fmuladd(bfloat %a, bfloat %b, bfloat %c) #0 { ; ; CHECK-BF16-LABEL: test_fmuladd: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $s1 -; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-BF16-NEXT: fmov w8, s1 -; CHECK-BF16-NEXT: fmov w9, s0 -; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $s2 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s0, w8 -; CHECK-BF16-NEXT: fmov s1, w9 -; CHECK-BF16-NEXT: fmov w9, s2 -; CHECK-BF16-NEXT: fmul s0, s1, s0 -; 
CHECK-BF16-NEXT: lsl w9, w9, #16 -; CHECK-BF16-NEXT: fmov s1, w9 +; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 +; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 +; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 +; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-BF16-NEXT: fmul s0, s0, s1 +; CHECK-BF16-NEXT: shll v1.4s, v2.4h, #16 ; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: fmov w8, s0 -; CHECK-BF16-NEXT: lsl w8, w8, #16 -; CHECK-BF16-NEXT: fmov s0, w8 +; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-BF16-NEXT: fadd s0, s0, s1 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll index c03e2e5321321..a609e33be935e 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll @@ -272,9 +272,8 @@ define <8 x bfloat> @d_to_h(<8 x double> %a) { define <8 x float> @h_to_s(<8 x bfloat> %a) { ; CHECK-LABEL: h_to_s: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: shll2 v1.4s, v0.8h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: ret %1 = fpext <8 x bfloat> %a to <8 x float> ret <8 x float> %1 @@ -283,13 +282,12 @@ define <8 x float> @h_to_s(<8 x bfloat> %a) { define <8 x double> @h_to_d(<8 x bfloat> %a) { ; CHECK-LABEL: h_to_d: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; CHECK-NEXT: shll v2.4s, v0.4h, #16 -; CHECK-NEXT: fcvtl v0.2d, v2.2s -; CHECK-NEXT: shll v4.4s, v1.4h, #16 -; CHECK-NEXT: fcvtl2 v1.2d, v2.4s -; CHECK-NEXT: fcvtl2 v3.2d, v4.4s -; CHECK-NEXT: fcvtl v2.2d, v4.2s +; CHECK-NEXT: shll v1.4s, v0.4h, #16 +; CHECK-NEXT: shll2 v2.4s, v0.8h, #16 +; CHECK-NEXT: fcvtl v0.2d, v1.2s +; CHECK-NEXT: fcvtl2 v3.2d, v2.4s +; CHECK-NEXT: fcvtl2 v1.2d, v1.4s +; CHECK-NEXT: fcvtl v2.2d, v2.2s ; CHECK-NEXT: ret %1 = fpext <8 x bfloat> 
%a to <8 x double> ret <8 x double> %1 @@ -788,11 +786,10 @@ define void @test_insert_at_zero(bfloat %a, ptr %b) #0 { define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 { ; CHECK-LABEL: fptosi_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: shll2 v1.4s, v0.8h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-NEXT: fcvtzs v0.4s, v0.4s ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret @@ -803,11 +800,10 @@ define <8 x i8> @fptosi_i8(<8 x bfloat> %a) #0 { define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 { ; CHECK-LABEL: fptosi_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: shll2 v1.4s, v0.8h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcvtzs v0.4s, v0.4s -; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: fcvtzs v1.4s, v1.4s +; CHECK-NEXT: fcvtzs v0.4s, v0.4s ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %1 = fptosi<8 x bfloat> %a to <8 x i16> @@ -817,11 +813,10 @@ define <8 x i16> @fptosi_i16(<8 x bfloat> %a) #0 { define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 { ; CHECK-LABEL: fptoui_i8: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: shll2 v1.4s, v0.8h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: fcvtzu v1.4s, v1.4s +; CHECK-NEXT: fcvtzu v0.4s, v0.4s ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: xtn v0.8b, v0.8h ; CHECK-NEXT: ret @@ -832,11 +827,10 @@ define <8 x i8> @fptoui_i8(<8 x bfloat> %a) #0 { define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 { ; CHECK-LABEL: fptoui_i16: ; CHECK: // %bb.0: -; CHECK-NEXT: ext v1.16b, v0.16b, v0.16b, #8 +; CHECK-NEXT: shll2 v1.4s, v0.8h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-NEXT: fcvtzu v0.4s, v0.4s -; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: fcvtzu 
v1.4s, v1.4s +; CHECK-NEXT: fcvtzu v0.4s, v0.4s ; CHECK-NEXT: uzp1 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret %1 = fptoui<8 x bfloat> %a to <8 x i16> @@ -846,90 +840,58 @@ define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 { define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_une: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, ne -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, ne -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 
+; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, ne +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, ne -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: 
fmov s1, w9 ; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, ne +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -941,96 +903,64 @@ define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ueq: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: lsl w9, w11, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s7, w9 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: csetm w10, eq -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csinv w10, w10, wzr, vc -; CHECK-NEXT: fcmp s7, s6 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w11, s4 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h4, v0.h[4] -; CHECK-NEXT: mov h7, v1.h[5] +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: csinv w8, w8, wzr, vc +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] ; 
CHECK-NEXT: csetm w9, eq ; CHECK-NEXT: csinv w9, w9, wzr, vc -; CHECK-NEXT: fcmp s3, s2 -; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: fmov s2, w9 -; CHECK-NEXT: lsl w11, w11, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s5, w11 -; CHECK-NEXT: fmov s6, w8 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, eq -; CHECK-NEXT: mov v2.h[1], w10 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: fmov w10, s4 ; CHECK-NEXT: csinv w8, w8, wzr, vc -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: lsl w10, w10, #16 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov s6, w10 ; CHECK-NEXT: csetm w8, eq -; CHECK-NEXT: fmov w9, s7 -; CHECK-NEXT: fmov w10, s5 ; CHECK-NEXT: csinv w8, w8, wzr, vc -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: fmov s6, w10 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: fmov w10, s4 ; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: csinv w8, w8, wzr, vc +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: lsl 
w8, w9, #16 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: csetm w10, eq -; CHECK-NEXT: csinv w10, w10, wzr, vc -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: csinv w8, w8, wzr, vc -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: csinv w8, w8, wzr, vc +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 ; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: csinv w8, w8, wzr, vc @@ -1044,90 +974,58 @@ define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ugt: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, hi -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup 
v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, hi -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, hi +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, hi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, hi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, hi +; CHECK-NEXT: 
fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, hi -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, hi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, hi +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, hi ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1139,90 +1037,58 @@ define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_uge: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, pl -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; 
CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, pl -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, pl +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, pl +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, pl +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: 
mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, pl +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, pl -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, pl +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, pl +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, pl ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1234,90 +1100,58 @@ define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ult: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, lt -; 
CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, lt -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, lt +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, lt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, lt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, 
v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, lt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, lt -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, lt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, lt +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, lt ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1329,90 +1163,58 @@ define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ule: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 
-; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, le -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, le -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, le +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, le +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: 
csetm w8, le +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, le +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, le -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, le +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, le +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, le ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1424,90 +1226,58 @@ define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_uno: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 
-; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, vs -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, vs -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, vs +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, vs +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; 
CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, vs +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, vs +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, vs -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, vs +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, vs +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, vs ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1519,96 +1289,64 @@ define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_one: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w11, s0 -; CHECK-NEXT: fmov w8, 
s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: lsl w9, w11, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s7, w9 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: csetm w10, mi -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csinv w10, w10, wzr, le -; CHECK-NEXT: fcmp s7, s6 -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w11, s4 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h4, v0.h[4] -; CHECK-NEXT: mov h7, v1.h[5] +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 +; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: csinv w8, w8, wzr, le +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] ; CHECK-NEXT: csetm w9, mi ; CHECK-NEXT: csinv w9, w9, wzr, le -; CHECK-NEXT: fcmp s3, s2 -; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: fmov s2, w9 -; CHECK-NEXT: lsl w11, w11, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov s5, w11 -; CHECK-NEXT: fmov s6, w8 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, mi -; CHECK-NEXT: mov v2.h[1], w10 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: fmov w10, s4 ; CHECK-NEXT: csinv w8, w8, wzr, le -; CHECK-NEXT: mov h3, v1.h[6] -; 
CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: lsl w10, w10, #16 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov s6, w10 ; CHECK-NEXT: csetm w8, mi -; CHECK-NEXT: fmov w9, s7 -; CHECK-NEXT: fmov w10, s5 ; CHECK-NEXT: csinv w8, w8, wzr, le -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: fmov s6, w10 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: fmov w10, s4 ; CHECK-NEXT: csetm w8, mi ; CHECK-NEXT: csinv w8, w8, wzr, le +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: lsl w8, w9, #16 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: csetm w10, mi -; CHECK-NEXT: csinv w10, w10, wzr, le -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, mi ; CHECK-NEXT: csinv w8, w8, wzr, le -; CHECK-NEXT: fcmp s1, s0 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: csinv w8, w8, wzr, le +; CHECK-NEXT: 
fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 ; CHECK-NEXT: csetm w8, mi ; CHECK-NEXT: csinv w8, w8, wzr, le @@ -1622,90 +1360,58 @@ define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_oeq: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, eq -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, eq -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: 
shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, eq +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, eq -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; 
CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1717,90 +1423,58 @@ define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ogt: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, gt -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, gt -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: 
fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, gt +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, gt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, gt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, gt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, gt -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; 
CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, gt +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, gt +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, gt ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1812,90 +1486,58 @@ define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_oge: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, ge -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, ge -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov 
w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, ge +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, ge +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, ge +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, ge +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, 
#16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, ge -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, ge +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, ge +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, ge ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -1907,90 +1549,58 @@ define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_olt: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, mi -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; 
CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, mi -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, mi +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; 
CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, mi -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, mi ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -2002,90 +1612,58 @@ define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ole: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, ls -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; 
CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, ls -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, ls +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, ls +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, ls +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov 
s6, w9 ; CHECK-NEXT: csetm w8, ls +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, ls -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, ls +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, ls +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, ls ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h @@ -2097,90 +1675,58 @@ define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ord: ; CHECK: // %bb.0: -; CHECK-NEXT: mov h2, v1.h[1] -; CHECK-NEXT: mov h3, v0.h[1] -; CHECK-NEXT: fmov w10, s1 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: fmov w9, s3 -; CHECK-NEXT: mov h2, v1.h[2] -; CHECK-NEXT: mov h3, v0.h[2] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: fmov w8, s0 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: mov h3, v1.h[4] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s5, s4 -; CHECK-NEXT: fmov s5, w9 -; CHECK-NEXT: mov h4, v1.h[3] -; CHECK-NEXT: lsl w10, w10, #16 -; CHECK-NEXT: fmov s6, w8 -; CHECK-NEXT: fmov w8, s2 -; CHECK-NEXT: csetm w9, vc -; CHECK-NEXT: fmov s16, w10 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: mov h5, v0.h[3] -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: 
fmov w10, s4 -; CHECK-NEXT: mov h6, v0.h[4] -; CHECK-NEXT: mov h4, v1.h[5] -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: dup v2.4h, v1.h[1] +; CHECK-NEXT: dup v3.4h, v0.h[1] +; CHECK-NEXT: dup v4.4h, v1.h[2] +; CHECK-NEXT: dup v5.4h, v0.h[2] +; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v2.4s, v1.4h, #16 +; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, vc -; CHECK-NEXT: fmov s2, w8 -; CHECK-NEXT: fmov w8, s5 -; CHECK-NEXT: mov h5, v0.h[5] -; CHECK-NEXT: fcmp s16, s7 -; CHECK-NEXT: mov v2.h[1], w9 -; CHECK-NEXT: lsl w9, w10, #16 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fmov w10, s3 -; CHECK-NEXT: fmov s3, w9 -; CHECK-NEXT: fmov w9, s6 -; CHECK-NEXT: fmov s7, w8 +; CHECK-NEXT: fcmp s3, s2 +; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: csetm w9, vc +; CHECK-NEXT: fmov s2, w9 +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[4] +; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, vc +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[5] +; CHECK-NEXT: dup v6.8h, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 -; CHECK-NEXT: lsl w8, w10, #16 -; CHECK-NEXT: fmov w10, s4 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fcmp s7, s3 -; CHECK-NEXT: mov h3, v1.h[6] -; CHECK-NEXT: fmov s4, w8 -; CHECK-NEXT: mov h1, v1.h[7] -; CHECK-NEXT: fmov s6, w9 -; CHECK-NEXT: fmov w9, s5 ; CHECK-NEXT: csetm w8, vc +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: dup v5.8h, v1.h[6] +; CHECK-NEXT: dup v6.8h, v0.h[6] +; CHECK-NEXT: dup v1.8h, v1.h[7] +; CHECK-NEXT: dup v0.8h, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 -; CHECK-NEXT: lsl w8, 
w10, #16 -; CHECK-NEXT: fcmp s6, s4 -; CHECK-NEXT: mov h4, v0.h[6] -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s5, w8 -; CHECK-NEXT: mov h0, v0.h[7] -; CHECK-NEXT: fmov s6, w9 ; CHECK-NEXT: csetm w8, vc +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 -; CHECK-NEXT: fmov w8, s3 -; CHECK-NEXT: fmov w9, s4 -; CHECK-NEXT: fcmp s6, s5 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: csetm w10, vc -; CHECK-NEXT: fmov s3, w8 -; CHECK-NEXT: fmov s4, w9 -; CHECK-NEXT: fmov w8, s1 -; CHECK-NEXT: fmov w9, s0 -; CHECK-NEXT: mov v2.h[5], w10 -; CHECK-NEXT: lsl w8, w8, #16 -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w8 -; CHECK-NEXT: fmov s1, w9 ; CHECK-NEXT: csetm w8, vc +; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: mov v2.h[5], w8 +; CHECK-NEXT: csetm w8, vc +; CHECK-NEXT: fcmp s0, s1 ; CHECK-NEXT: mov v2.h[6], w8 -; CHECK-NEXT: fcmp s1, s0 ; CHECK-NEXT: csetm w8, vc ; CHECK-NEXT: mov v2.h[7], w8 ; CHECK-NEXT: xtn v0.8b, v2.8h diff --git a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll index 40684b0f3a256..e3263252875f7 100644 --- a/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll +++ b/llvm/test/CodeGen/AArch64/cvt-fp-int-fp.ll @@ -76,11 +76,9 @@ entry: define bfloat @t7(bfloat %x) { ; CHECK-LABEL: t7: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs w9, s0 ; CHECK-NEXT: scvtf d0, w9 ; CHECK-NEXT: fcvtxn s0, d0 @@ -101,11 +99,9 @@ entry: define bfloat @t8(bfloat %x) { ; CHECK-LABEL: t8: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed 
$h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzu w9, s0 ; CHECK-NEXT: ucvtf d0, w9 ; CHECK-NEXT: fcvtxn s0, d0 @@ -198,11 +194,9 @@ entry: define bfloat @t7_strict(bfloat %x) #0 { ; CHECK-LABEL: t7_strict: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs w9, s0 ; CHECK-NEXT: scvtf d0, w9 ; CHECK-NEXT: fcvtxn s0, d0 @@ -223,11 +217,9 @@ entry: define bfloat @t8_strict(bfloat %x) #0 { ; CHECK-LABEL: t8_strict: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzu w9, s0 ; CHECK-NEXT: ucvtf d0, w9 ; CHECK-NEXT: fcvtxn s0, d0 diff --git a/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll b/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll index ec7548e1e6541..b7fae2bff6876 100644 --- a/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll +++ b/llvm/test/CodeGen/AArch64/round-fptosi-sat-scalar.ll @@ -7,19 +7,17 @@ define i32 @testmswbf(bfloat %a) { ; CHECK-LABEL: testmswbf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: frintm s0, s0 ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: ubfx w10, w9, #16, #1 ; 
CHECK-NEXT: add w8, w9, w8 ; CHECK-NEXT: add w8, w10, w8 ; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: lsl w8, w8, #16 ; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs w0, s0 ; CHECK-NEXT: ret entry: @@ -31,19 +29,17 @@ entry: define i64 @testmsxbf(bfloat %a) { ; CHECK-LABEL: testmsxbf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: frintm s0, s0 ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: ubfx w10, w9, #16, #1 ; CHECK-NEXT: add w8, w9, w8 ; CHECK-NEXT: add w8, w10, w8 ; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: lsl w8, w8, #16 ; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs x0, s0 ; CHECK-NEXT: ret entry: @@ -141,19 +137,17 @@ entry: define i32 @testpswbf(bfloat %a) { ; CHECK-LABEL: testpswbf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: frintp s0, s0 ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: ubfx w10, w9, #16, #1 ; CHECK-NEXT: add w8, w9, w8 ; CHECK-NEXT: add w8, w10, w8 ; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: lsl w8, w8, #16 ; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs w0, s0 ; CHECK-NEXT: ret entry: @@ -165,19 +159,17 @@ entry: define i64 @testpsxbf(bfloat %a) { ; CHECK-LABEL: testpsxbf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $h0 killed $h0 def $s0 -; CHECK-NEXT: fmov w9, s0 +; CHECK-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-NEXT: mov w8, #32767 // =0x7fff -; CHECK-NEXT: lsl w9, w9, #16 -; CHECK-NEXT: fmov s0, 
w9 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: frintp s0, s0 ; CHECK-NEXT: fmov w9, s0 ; CHECK-NEXT: ubfx w10, w9, #16, #1 ; CHECK-NEXT: add w8, w9, w8 ; CHECK-NEXT: add w8, w10, w8 ; CHECK-NEXT: lsr w8, w8, #16 -; CHECK-NEXT: lsl w8, w8, #16 ; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: fcvtzs x0, s0 ; CHECK-NEXT: ret entry: