From 1616698c6568802f1d451f89f6e5badd146e0b59 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Tue, 15 Apr 2025 16:51:50 +0100 Subject: [PATCH 1/3] [AArch64] Use pattern to select bf16 fpextend Currently bf16 fpextend is lowered to a vector shift. Instead leave it as fpextend and have an instruction selection pattern which selects to a shift later. Doing this means that DAGCombiner patterns for fpextend will be applied, leading to better codegen. It also means that in some situations we use a mov instruction where we previously had a dup instruction, but I don't think this makes any difference. --- .../Target/AArch64/AArch64ISelLowering.cpp | 38 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 18 + .../arm64-fast-isel-conversion-fallback.ll | 8 +- llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll | 8 +- llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll | 8 +- .../test/CodeGen/AArch64/bf16-instructions.ll | 18 +- .../CodeGen/AArch64/bf16-v8-instructions.ll | 628 +++++++++--------- ...e-streaming-mode-fixed-length-fcopysign.ll | 28 +- 8 files changed, 361 insertions(+), 393 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 63924dc1b30ea..17181cba0ceb2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -766,13 +766,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(Op, MVT::v8bf16, Expand); } - // For bf16, fpextend is custom lowered to be optionally expanded into shifts. 
- setOperationAction(ISD::FP_EXTEND, MVT::f32, Custom); + // fpextend from f16 or bf16 to f32 is legal + setOperationAction(ISD::FP_EXTEND, MVT::f32, Legal); + setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Legal); + setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Legal); + // fpextend from bf16 to f64 needs to be split into two fpextends setOperationAction(ISD::FP_EXTEND, MVT::f64, Custom); - setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); - setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f32, Custom); auto LegalizeNarrowFP = [this](MVT ScalarVT) { for (auto Op : { @@ -4548,33 +4549,6 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, return SDValue(); } - if (VT.getScalarType() == MVT::f32) { - // FP16->FP32 extends are legal for v32 and v4f32. - if (Op0VT.getScalarType() == MVT::f16) - return Op; - if (Op0VT.getScalarType() == MVT::bf16) { - SDLoc DL(Op); - EVT IVT = VT.changeTypeToInteger(); - if (!Op0VT.isVector()) { - Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4bf16, Op0); - IVT = MVT::v4i32; - } - - EVT Op0IVT = Op0.getValueType().changeTypeToInteger(); - SDValue Ext = - DAG.getNode(ISD::ANY_EXTEND, DL, IVT, DAG.getBitcast(Op0IVT, Op0)); - SDValue Shift = - DAG.getNode(ISD::SHL, DL, IVT, Ext, DAG.getConstant(16, DL, IVT)); - if (!Op0VT.isVector()) - Shift = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Shift, - DAG.getConstant(0, DL, MVT::i64)); - Shift = DAG.getBitcast(VT, Shift); - return IsStrict ? 
DAG.getMergeValues({Shift, Op.getOperand(0)}, DL) - : Shift; - } - return SDValue(); - } - assert(Op.getValueType() == MVT::f128 && "Unexpected lowering"); return SDValue(); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index f7b13092821d6..3562406738c93 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -8513,6 +8513,24 @@ def : InstAlias<"uxtl2 $dst.2d, $src1.4s", (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>; } +// fpextend from bf16 to f32 is just a shift left by 16 +let Predicates = [HasNEON] in { +def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))), + (f32 (EXTRACT_SUBREG + (v4i32 (SHLLv4i16 (v4i16 (SUBREG_TO_REG (i64 0), (bf16 FPR16:$Rn), hsub)))), + ssub))>; +def : Pat<(v4f32 (any_fpextend (v4bf16 V64:$Rn))), + (SHLLv4i16 V64:$Rn)>; +def : Pat<(v4f32 (any_fpextend (extract_high_v8bf16 (v8bf16 V128:$Rn)))), + (SHLLv8i16 V128:$Rn)>; +} +// Fallback pattern for when we don't have NEON +def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))), + (f32 (COPY_TO_REGCLASS + (i32 (UBFMWri (i32 (SUBREG_TO_REG (i32 0), (bf16 FPR16:$Rn), hsub)), + (i64 16), (i64 15))), + FPR32))>; + def abs_f16 : OutPatFrag<(ops node:$Rn), (EXTRACT_SUBREG (f32 (COPY_TO_REGCLASS diff --git a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll index 9a1203f18243d..1d33545cb171a 100644 --- a/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll +++ b/llvm/test/CodeGen/AArch64/arm64-fast-isel-conversion-fallback.ll @@ -155,9 +155,7 @@ entry: define i32 @fptosi_bf(bfloat %a) nounwind ssp { ; CHECK-LABEL: fptosi_bf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s1, s0 -; CHECK-NEXT: // implicit-def: $d0 -; CHECK-NEXT: fmov s0, s1 +; CHECK-NEXT: // kill: def $d0 killed $h0 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: fcvtzs w0, s0 @@ -171,9 
+169,7 @@ entry: define i32 @fptoui_sbf(bfloat %a) nounwind ssp { ; CHECK-LABEL: fptoui_sbf: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s1, s0 -; CHECK-NEXT: // implicit-def: $d0 -; CHECK-NEXT: fmov s0, s1 +; CHECK-NEXT: // kill: def $d0 killed $h0 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: fcvtzu w0, s0 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 9b5e48d2b4217..e3e18a1f91c6d 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -641,7 +641,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NOLSE-NEXT: dup v1.4h, v0.h[1] +; NOLSE-NEXT: mov h1, v0.h[1] ; NOLSE-NEXT: mov w8, #32767 // =0x7fff ; NOLSE-NEXT: shll v0.4s, v0.4h, #16 ; NOLSE-NEXT: shll v1.4s, v1.4h, #16 @@ -649,7 +649,7 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w9, [x0] ; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: dup v3.4h, v2.h[1] +; NOLSE-NEXT: mov h3, v2.h[1] ; NOLSE-NEXT: shll v2.4s, v2.4h, #16 ; NOLSE-NEXT: fmaxnm s2, s2, s0 ; NOLSE-NEXT: shll v3.4s, v3.4h, #16 @@ -677,14 +677,14 @@ define <2 x bfloat> @test_atomicrmw_fmax_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-LABEL: test_atomicrmw_fmax_v2bf16_seq_cst_align4: ; LSE: // %bb.0: ; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 -; LSE-NEXT: dup v1.4h, v0.h[1] +; LSE-NEXT: mov h1, v0.h[1] ; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr s0, [x0] ; LSE-NEXT: shll v1.4s, v1.4h, #16 ; LSE-NEXT: .LBB7_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: dup v3.4h, v0.h[1] +; LSE-NEXT: mov h3, v0.h[1] ; LSE-NEXT: shll v4.4s, 
v0.4h, #16 ; LSE-NEXT: fmaxnm s4, s4, s2 ; LSE-NEXT: shll v3.4s, v3.4h, #16 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index f6c542fe7d407..10de6777bd285 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -641,7 +641,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: ; NOLSE: // %bb.0: ; NOLSE-NEXT: // kill: def $d0 killed $d0 def $q0 -; NOLSE-NEXT: dup v1.4h, v0.h[1] +; NOLSE-NEXT: mov h1, v0.h[1] ; NOLSE-NEXT: mov w8, #32767 // =0x7fff ; NOLSE-NEXT: shll v0.4s, v0.4h, #16 ; NOLSE-NEXT: shll v1.4s, v1.4h, #16 @@ -649,7 +649,7 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; NOLSE-NEXT: // =>This Inner Loop Header: Depth=1 ; NOLSE-NEXT: ldaxr w9, [x0] ; NOLSE-NEXT: fmov s2, w9 -; NOLSE-NEXT: dup v3.4h, v2.h[1] +; NOLSE-NEXT: mov h3, v2.h[1] ; NOLSE-NEXT: shll v2.4s, v2.4h, #16 ; NOLSE-NEXT: fminnm s2, s2, s0 ; NOLSE-NEXT: shll v3.4s, v3.4h, #16 @@ -677,14 +677,14 @@ define <2 x bfloat> @test_atomicrmw_fmin_v2bf16_seq_cst_align4(ptr %ptr, <2 x bf ; LSE-LABEL: test_atomicrmw_fmin_v2bf16_seq_cst_align4: ; LSE: // %bb.0: ; LSE-NEXT: // kill: def $d0 killed $d0 def $q0 -; LSE-NEXT: dup v1.4h, v0.h[1] +; LSE-NEXT: mov h1, v0.h[1] ; LSE-NEXT: shll v2.4s, v0.4h, #16 ; LSE-NEXT: mov w8, #32767 // =0x7fff ; LSE-NEXT: ldr s0, [x0] ; LSE-NEXT: shll v1.4s, v1.4h, #16 ; LSE-NEXT: .LBB7_1: // %atomicrmw.start ; LSE-NEXT: // =>This Inner Loop Header: Depth=1 -; LSE-NEXT: dup v3.4h, v0.h[1] +; LSE-NEXT: mov h3, v0.h[1] ; LSE-NEXT: shll v4.4s, v0.4h, #16 ; LSE-NEXT: fminnm s4, s4, s2 ; LSE-NEXT: shll v3.4s, v3.4h, #16 diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll index 2fc9c53112ab6..1dd883580715e 100644 --- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll +++ 
b/llvm/test/CodeGen/AArch64/bf16-instructions.ll @@ -1996,13 +1996,11 @@ define bfloat @test_copysign_f64(bfloat %a, double %b) #0 { define float @test_copysign_extended(bfloat %a, bfloat %b) #0 { ; CHECK-CVT-LABEL: test_copysign_extended: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-CVT-NEXT: movi v2.4s, #16 ; CHECK-CVT-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-CVT-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-CVT-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-CVT-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-NEXT: mvni v2.4s, #128, lsl #24 +; CHECK-CVT-NEXT: shll v1.4s, v1.4h, #16 +; CHECK-CVT-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-CVT-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-CVT-NEXT: fmov w8, s0 ; CHECK-CVT-NEXT: lsr w8, w8, #16 @@ -2013,16 +2011,12 @@ define float @test_copysign_extended(bfloat %a, bfloat %b) #0 { ; ; CHECK-SD-LABEL: test_copysign_extended: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-SD-NEXT: movi v2.4s, #16 ; CHECK-SD-NEXT: // kill: def $h1 killed $h1 def $d1 -; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 -; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-SD-NEXT: mvni v2.4s, #128, lsl #24 -; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b -; CHECK-SD-NEXT: bfcvt h0, s0 +; CHECK-SD-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-SD-NEXT: shll v0.4s, v0.4h, #16 +; CHECK-SD-NEXT: bif v0.16b, v1.16b, v2.16b ; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll index 3a55b68f2d1a3..f4ab8ff581e23 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll @@ -882,11 +882,11 @@ define <8 x i16> @fptoui_i16(<8 x bfloat> %a) #0 { define <8 x i1> 
@test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_une: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: fcmp s3, s2 @@ -896,34 +896,34 @@ define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v3.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: csetm w9, ne ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[4] ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, ne -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: mov h5, v1.h[5] ; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, ne ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[6] +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; 
CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, ne -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 @@ -945,54 +945,54 @@ define <8 x i1> @test_fcmp_une(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ueq: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: shll v6.4s, v6.4h, #16 ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v2.4s, v1.4h, #16 ; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: csinv w8, w8, wzr, vc ; CHECK-NEXT: fcmp s3, s2 -; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v2.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h3, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[4] ; CHECK-NEXT: csetm w9, eq ; CHECK-NEXT: csinv w9, w9, wzr, vc -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: fcmp s4, s2 +; CHECK-NEXT: mov h4, v0.h[4] ; CHECK-NEXT: fmov s2, w9 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, eq +; CHECK-NEXT: shll v4.4s, v4.4h, #16 ; CHECK-NEXT: csinv w8, w8, wzr, vc -; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: fcmp s6, s3 ; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; 
CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h5, v1.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: csinv w8, w8, wzr, vc ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: mov h4, v0.h[6] +; CHECK-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: csinv w8, w8, wzr, vc -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: fcmp s6, s5 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: shll v4.4s, v4.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 @@ -1016,11 +1016,11 @@ define <8 x i1> @test_fcmp_ueq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ugt: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: fcmp s3, s2 @@ -1030,34 +1030,34 @@ define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v3.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: csetm w9, hi ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fcmp s4, 
s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[4] ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, hi -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: mov h5, v1.h[5] ; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, hi ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[6] +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, hi -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 @@ -1079,11 +1079,11 @@ define <8 x i1> @test_fcmp_ugt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_uge: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, 
v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: fcmp s3, s2 @@ -1093,34 +1093,34 @@ define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v3.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: csetm w9, pl ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[4] ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, pl -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: mov h5, v1.h[5] ; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, pl ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[6] +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, pl -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 @@ -1142,11 +1142,11 @@ define <8 x i1> @test_fcmp_uge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) 
#0 { ; CHECK-LABEL: test_fcmp_ult: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: fcmp s3, s2 @@ -1156,34 +1156,34 @@ define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v3.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: csetm w9, lt ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[4] ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, lt -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: mov h5, v1.h[5] ; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, lt ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[6] +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm 
w8, lt -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 @@ -1205,11 +1205,11 @@ define <8 x i1> @test_fcmp_ult(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ule: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: fcmp s3, s2 @@ -1219,34 +1219,34 @@ define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v3.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: csetm w9, le ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[4] ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, le -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: mov h5, v1.h[5] ; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, le ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: 
shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[6] +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, le -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 @@ -1268,11 +1268,11 @@ define <8 x i1> @test_fcmp_ule(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_uno: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: fcmp s3, s2 @@ -1282,34 +1282,34 @@ define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v3.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: csetm w9, vs ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[4] ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, vs -; CHECK-NEXT: fcmp s4, s3 
-; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: mov h5, v1.h[5] ; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, vs ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[6] +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, vs -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 @@ -1331,54 +1331,54 @@ define <8 x i1> @test_fcmp_uno(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_one: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: shll v6.4s, v6.4h, #16 ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v2.4s, v1.4h, #16 ; CHECK-NEXT: shll v3.4s, v0.4h, #16 ; CHECK-NEXT: csetm w8, mi ; CHECK-NEXT: csinv w8, w8, wzr, le ; CHECK-NEXT: fcmp s3, s2 -; CHECK-NEXT: shll v3.4s, v4.4h, #16 +; CHECK-NEXT: shll v2.4s, v4.4h, #16 ; CHECK-NEXT: shll 
v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h3, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[4] ; CHECK-NEXT: csetm w9, mi ; CHECK-NEXT: csinv w9, w9, wzr, le -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: fcmp s4, s2 +; CHECK-NEXT: mov h4, v0.h[4] ; CHECK-NEXT: fmov s2, w9 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, mi +; CHECK-NEXT: shll v4.4s, v4.4h, #16 ; CHECK-NEXT: csinv w8, w8, wzr, le -; CHECK-NEXT: fcmp s4, s3 +; CHECK-NEXT: fcmp s6, s3 ; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h5, v1.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, mi ; CHECK-NEXT: csinv w8, w8, wzr, le ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: mov h4, v0.h[6] +; CHECK-NEXT: shll v5.4s, v5.4h, #16 +; CHECK-NEXT: shll v6.4s, v6.4h, #16 +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, mi ; CHECK-NEXT: csinv w8, w8, wzr, le -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 +; CHECK-NEXT: fcmp s6, s5 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: shll v4.4s, v4.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 ; CHECK-NEXT: mov v2.h[4], w8 @@ -1402,11 +1402,11 @@ define <8 x i1> @test_fcmp_one(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_oeq: ; CHECK: // 
%bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: fcmp s3, s2 @@ -1416,34 +1416,34 @@ define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v3.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: csetm w9, eq ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[4] ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, eq -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: mov h5, v1.h[5] ; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, eq ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[6] +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, eq -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: 
shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 @@ -1465,11 +1465,11 @@ define <8 x i1> @test_fcmp_oeq(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ogt: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: fcmp s3, s2 @@ -1479,34 +1479,34 @@ define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v3.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: csetm w9, gt ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[4] ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, gt -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: mov h5, v1.h[5] ; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, gt ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, 
v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[6] +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, gt -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 @@ -1528,11 +1528,11 @@ define <8 x i1> @test_fcmp_ogt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_oge: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: fcmp s3, s2 @@ -1542,34 +1542,34 @@ define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v3.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: csetm w9, ge ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[4] ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, ge -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; 
CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: mov h5, v1.h[5] ; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, ge ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[6] +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, ge -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 @@ -1591,11 +1591,11 @@ define <8 x i1> @test_fcmp_oge(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_olt: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: fcmp s3, s2 @@ -1605,34 +1605,34 @@ define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v3.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: csetm w9, mi ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fcmp s4, s3 
-; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[4] ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, mi -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: mov h5, v1.h[5] ; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, mi ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[6] +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, mi -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 @@ -1654,11 +1654,11 @@ define <8 x i1> @test_fcmp_olt(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-LABEL: test_fcmp_ole: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, 
#16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: fcmp s3, s2 @@ -1668,34 +1668,34 @@ define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v3.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: csetm w9, ls ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[4] ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, ls -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: mov h5, v1.h[5] ; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, ls ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[6] +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, ls -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 @@ -1717,11 +1717,11 @@ define <8 x i1> @test_fcmp_ole(<8 x bfloat> %a, <8 x bfloat> %b) #0 { define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; 
CHECK-LABEL: test_fcmp_ord: ; CHECK: // %bb.0: -; CHECK-NEXT: dup v2.4h, v1.h[1] -; CHECK-NEXT: dup v3.4h, v0.h[1] -; CHECK-NEXT: dup v4.4h, v1.h[2] -; CHECK-NEXT: dup v5.4h, v0.h[2] -; CHECK-NEXT: dup v6.4h, v0.h[3] +; CHECK-NEXT: mov h2, v1.h[1] +; CHECK-NEXT: mov h3, v0.h[1] +; CHECK-NEXT: mov h4, v1.h[2] +; CHECK-NEXT: mov h5, v0.h[2] +; CHECK-NEXT: mov h6, v0.h[3] ; CHECK-NEXT: shll v2.4s, v2.4h, #16 ; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: fcmp s3, s2 @@ -1731,34 +1731,34 @@ define <8 x i1> @test_fcmp_ord(<8 x bfloat> %a, <8 x bfloat> %b) #0 { ; CHECK-NEXT: fcmp s3, s2 ; CHECK-NEXT: shll v3.4s, v4.4h, #16 ; CHECK-NEXT: shll v4.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.4h, v1.h[3] +; CHECK-NEXT: mov h5, v1.h[3] ; CHECK-NEXT: csetm w9, vc ; CHECK-NEXT: fmov s2, w9 ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[4] -; CHECK-NEXT: dup v6.8h, v0.h[4] +; CHECK-NEXT: mov h3, v1.h[4] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[4] ; CHECK-NEXT: mov v2.h[1], w8 ; CHECK-NEXT: csetm w8, vc -; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 +; CHECK-NEXT: mov h5, v1.h[5] ; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[5] -; CHECK-NEXT: dup v6.8h, v0.h[5] +; CHECK-NEXT: mov h6, v0.h[5] ; CHECK-NEXT: mov v2.h[2], w8 ; CHECK-NEXT: csetm w8, vc ; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 -; CHECK-NEXT: shll v4.4s, v6.4h, #16 -; CHECK-NEXT: dup v5.8h, v1.h[6] -; CHECK-NEXT: dup v6.8h, v0.h[6] -; CHECK-NEXT: dup v1.8h, v1.h[7] -; CHECK-NEXT: dup v0.8h, v0.h[7] +; CHECK-NEXT: mov h3, v1.h[6] +; CHECK-NEXT: shll v4.4s, v5.4h, #16 +; CHECK-NEXT: shll v5.4s, v6.4h, #16 +; CHECK-NEXT: mov h6, v0.h[6] +; CHECK-NEXT: mov h1, v1.h[7] +; CHECK-NEXT: mov h0, v0.h[7] ; CHECK-NEXT: mov v2.h[3], w8 ; CHECK-NEXT: csetm w8, vc 
-; CHECK-NEXT: fcmp s4, s3 -; CHECK-NEXT: shll v3.4s, v5.4h, #16 +; CHECK-NEXT: fcmp s5, s4 +; CHECK-NEXT: shll v3.4s, v3.4h, #16 ; CHECK-NEXT: shll v4.4s, v6.4h, #16 ; CHECK-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-NEXT: shll v0.4s, v0.4h, #16 diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll index 381c67c6d749e..da6b3bb99dbda 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-fcopysign.ll @@ -74,30 +74,16 @@ define void @test_copysign_bf16(ptr %ap, ptr %bp) { ; ; NONEON-NOSVE-LABEL: test_copysign_bf16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: sub sp, sp, #80 -; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 80 +; NONEON-NOSVE-NEXT: sub sp, sp, #16 +; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: ldr h0, [x0] ; NONEON-NOSVE-NEXT: ldr h1, [x1] -; NONEON-NOSVE-NEXT: str h0, [sp, #40] -; NONEON-NOSVE-NEXT: ldr d0, [sp, #40] -; NONEON-NOSVE-NEXT: str h1, [sp, #76] -; NONEON-NOSVE-NEXT: ushll v0.4s, v0.4h, #0 -; NONEON-NOSVE-NEXT: str q0, [sp] -; NONEON-NOSVE-NEXT: ldr w8, [sp, #12] -; NONEON-NOSVE-NEXT: lsl w9, w8, #16 -; NONEON-NOSVE-NEXT: ldr w8, [sp, #8] -; NONEON-NOSVE-NEXT: lsl w8, w8, #16 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #24] -; NONEON-NOSVE-NEXT: ldr w8, [sp, #4] -; NONEON-NOSVE-NEXT: lsl w9, w8, #16 -; NONEON-NOSVE-NEXT: ldr w8, [sp] +; NONEON-NOSVE-NEXT: fmov w8, s0 +; NONEON-NOSVE-NEXT: str h1, [sp, #12] ; NONEON-NOSVE-NEXT: lsl w8, w8, #16 -; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] -; NONEON-NOSVE-NEXT: ldrb w8, [sp, #77] -; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] +; NONEON-NOSVE-NEXT: fmov s0, w8 +; NONEON-NOSVE-NEXT: ldrb w8, [sp, #13] ; NONEON-NOSVE-NEXT: tst w8, #0x80 -; NONEON-NOSVE-NEXT: str q0, [sp, #48] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] ; NONEON-NOSVE-NEXT: fabs s0, s0 ; NONEON-NOSVE-NEXT: fneg s1, s0 ; NONEON-NOSVE-NEXT: fcsel 
s0, s1, s0, ne @@ -105,7 +91,7 @@ define void @test_copysign_bf16(ptr %ap, ptr %bp) { ; NONEON-NOSVE-NEXT: lsr w8, w8, #16 ; NONEON-NOSVE-NEXT: fmov s0, w8 ; NONEON-NOSVE-NEXT: str h0, [x0] -; NONEON-NOSVE-NEXT: add sp, sp, #80 +; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret %a = load bfloat, ptr %ap %b = load bfloat, ptr %bp From 49c750112f3b30bc269fed7814950e067eb82032 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Mon, 28 Apr 2025 12:32:16 +0100 Subject: [PATCH 2/3] Update tests now that PR#131345 has been merged. --- .../test/CodeGen/AArch64/bf16-instructions.ll | 9 +- llvm/test/CodeGen/AArch64/bf16_fast_math.ll | 190 +++++++----------- 2 files changed, 76 insertions(+), 123 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/bf16-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-instructions.ll index 1dd883580715e..9f002b1e0da55 100644 --- a/llvm/test/CodeGen/AArch64/bf16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-instructions.ll @@ -202,16 +202,13 @@ define bfloat @test_fmadd(bfloat %a, bfloat %b, bfloat %c) #0 { ; ; CHECK-BF16-LABEL: test_fmadd: ; CHECK-BF16: // %bb.0: +; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 ; CHECK-BF16-NEXT: // kill: def $h1 killed $h1 def $d1 ; CHECK-BF16-NEXT: // kill: def $h0 killed $h0 def $d0 -; CHECK-BF16-NEXT: // kill: def $h2 killed $h2 def $d2 ; CHECK-BF16-NEXT: shll v1.4s, v1.4h, #16 ; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fmul s0, s0, s1 -; CHECK-BF16-NEXT: shll v1.4s, v2.4h, #16 -; CHECK-BF16-NEXT: bfcvt h0, s0 -; CHECK-BF16-NEXT: shll v0.4s, v0.4h, #16 -; CHECK-BF16-NEXT: fadd s0, s0, s1 +; CHECK-BF16-NEXT: shll v2.4s, v2.4h, #16 +; CHECK-BF16-NEXT: fmadd s0, s0, s1, s2 ; CHECK-BF16-NEXT: bfcvt h0, s0 ; CHECK-BF16-NEXT: ret %mul = fmul fast bfloat %a, %b diff --git a/llvm/test/CodeGen/AArch64/bf16_fast_math.ll b/llvm/test/CodeGen/AArch64/bf16_fast_math.ll index 7d7fb67ca2f77..871ca12c9de77 100644 --- a/llvm/test/CodeGen/AArch64/bf16_fast_math.ll +++ 
b/llvm/test/CodeGen/AArch64/bf16_fast_math.ll @@ -4,8 +4,6 @@ ; Check that the output instructions have the same fast math flags as the input ; fadd, even when bf16 is legalized to f32. -; FIXME: Conversion from float to bf16 is done via a vector type for some -; reason, when we should just be using scalar instructions. define bfloat @normal_fadd(bfloat %x, bfloat %y) { ; CHECK-NOBF16-LABEL: name: normal_fadd @@ -14,13 +12,11 @@ define bfloat @normal_fadd(bfloat %x, bfloat %y) { ; CHECK-NOBF16-NEXT: {{ $}} ; CHECK-NOBF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h1 ; CHECK-NOBF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK-NOBF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]] ; CHECK-NOBF16-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub - ; CHECK-NOBF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]] ; CHECK-NOBF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub ; CHECK-NOBF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[COPY3]], killed [[COPY2]], implicit $fpcr ; CHECK-NOBF16-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[FADDSrr]] @@ -40,13 +36,11 @@ define bfloat @normal_fadd(bfloat %x, bfloat %y) { ; CHECK-BF16-NEXT: {{ $}} ; CHECK-BF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h1 ; 
CHECK-BF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK-BF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]] ; CHECK-BF16-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub - ; CHECK-BF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]] ; CHECK-BF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub ; CHECK-BF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[COPY3]], killed [[COPY2]], implicit $fpcr ; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = nofpexcept BFCVT killed [[FADDSrr]], implicit $fpcr @@ -64,13 +58,11 @@ define bfloat @fast_fadd(bfloat %x, bfloat %y) { ; CHECK-NOBF16-NEXT: {{ $}} ; CHECK-NOBF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h1 ; CHECK-NOBF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK-NOBF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]] ; CHECK-NOBF16-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub - 
; CHECK-NOBF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]] ; CHECK-NOBF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub ; CHECK-NOBF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept FADDSrr killed [[COPY3]], killed [[COPY2]], implicit $fpcr ; CHECK-NOBF16-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[FADDSrr]] @@ -90,13 +82,11 @@ define bfloat @fast_fadd(bfloat %x, bfloat %y) { ; CHECK-BF16-NEXT: {{ $}} ; CHECK-BF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h1 ; CHECK-BF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK-BF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]] ; CHECK-BF16-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub - ; CHECK-BF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]] ; CHECK-BF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub ; CHECK-BF16-NEXT: 
[[FADDSrr:%[0-9]+]]:fpr32 = nnan ninf nsz arcp contract afn reassoc nofpexcept FADDSrr killed [[COPY3]], killed [[COPY2]], implicit $fpcr ; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = nnan ninf nsz arcp contract afn reassoc nofpexcept BFCVT killed [[FADDSrr]], implicit $fpcr @@ -114,13 +104,11 @@ define bfloat @ninf_fadd(bfloat %x, bfloat %y) { ; CHECK-NOBF16-NEXT: {{ $}} ; CHECK-NOBF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h1 ; CHECK-NOBF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK-NOBF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]] ; CHECK-NOBF16-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub - ; CHECK-NOBF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]] ; CHECK-NOBF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub ; CHECK-NOBF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[COPY3]], killed [[COPY2]], implicit $fpcr ; CHECK-NOBF16-NEXT: [[COPY4:%[0-9]+]]:gpr32 = COPY [[FADDSrr]] @@ -140,13 +128,11 @@ define bfloat @ninf_fadd(bfloat %x, bfloat %y) { ; CHECK-BF16-NEXT: {{ $}} ; CHECK-BF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h1 ; CHECK-BF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK-BF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: 
[[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]] ; CHECK-BF16-NEXT: [[COPY2:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub - ; CHECK-BF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY1]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]] ; CHECK-BF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub ; CHECK-BF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[COPY3]], killed [[COPY2]], implicit $fpcr ; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = ninf nofpexcept BFCVT killed [[FADDSrr]], implicit $fpcr @@ -159,8 +145,6 @@ entry: ; Check that when we have the right fast math flags the converts in between the ; two fadds are removed. -; FIXME: The convert from float to bf16 being done by a shift prevents this from -; happening. 
define bfloat @normal_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) { ; CHECK-NOBF16-LABEL: name: normal_fadd_sequence @@ -170,13 +154,11 @@ define bfloat @normal_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) { ; CHECK-NOBF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2 ; CHECK-NOBF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1 ; CHECK-NOBF16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK-NOBF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]] ; CHECK-NOBF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub - ; CHECK-NOBF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY2]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY2]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]] ; CHECK-NOBF16-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub ; CHECK-NOBF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[COPY4]], killed [[COPY3]], implicit $fpcr ; CHECK-NOBF16-NEXT: [[COPY5:%[0-9]+]]:gpr32 = COPY [[FADDSrr]] @@ -187,13 +169,11 @@ define bfloat @normal_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) { ; CHECK-NOBF16-NEXT: [[UBFMWri1:%[0-9]+]]:gpr32 = UBFMWri killed [[ADDWrr1]], 16, 31 ; CHECK-NOBF16-NEXT: [[COPY6:%[0-9]+]]:fpr32 = COPY [[UBFMWri1]] ; CHECK-NOBF16-NEXT: [[COPY7:%[0-9]+]]:fpr16 = COPY [[COPY6]].hsub - ; CHECK-NOBF16-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: 
[[INSERT_SUBREG2:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF2]], killed [[COPY7]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG2]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, killed [[COPY7]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG2]] ; CHECK-NOBF16-NEXT: [[COPY8:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_2]].ssub - ; CHECK-NOBF16-NEXT: [[DEF3:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG3]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG3]] ; CHECK-NOBF16-NEXT: [[COPY9:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_3]].ssub ; CHECK-NOBF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[COPY8]], killed [[COPY9]], implicit $fpcr ; CHECK-NOBF16-NEXT: [[COPY10:%[0-9]+]]:gpr32 = COPY [[FADDSrr1]] @@ -213,23 +193,19 @@ define bfloat @normal_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) { ; CHECK-BF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2 ; CHECK-BF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1 ; CHECK-BF16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK-BF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]] ; CHECK-BF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub - ; CHECK-BF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: 
[[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY2]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY2]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]] ; CHECK-BF16-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub ; CHECK-BF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[COPY4]], killed [[COPY3]], implicit $fpcr ; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = nofpexcept BFCVT killed [[FADDSrr]], implicit $fpcr - ; CHECK-BF16-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF2]], killed [[BFCVT]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG2]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, killed [[BFCVT]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG2]] ; CHECK-BF16-NEXT: [[COPY5:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_2]].ssub - ; CHECK-BF16-NEXT: [[DEF3:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG3]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG3]] ; CHECK-BF16-NEXT: [[COPY6:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_3]].ssub ; CHECK-BF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nofpexcept FADDSrr killed [[COPY5]], killed [[COPY6]], implicit $fpcr ; CHECK-BF16-NEXT: [[BFCVT1:%[0-9]+]]:fpr16 = nofpexcept BFCVT killed [[FADDSrr1]], implicit $fpcr @@ -249,13 +225,11 @@ define bfloat @nnan_ninf_contract_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) 
; CHECK-NOBF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2 ; CHECK-NOBF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1 ; CHECK-NOBF16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK-NOBF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]] ; CHECK-NOBF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub - ; CHECK-NOBF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY2]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY2]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]] ; CHECK-NOBF16-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub ; CHECK-NOBF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FADDSrr killed [[COPY4]], killed [[COPY3]], implicit $fpcr ; CHECK-NOBF16-NEXT: [[COPY5:%[0-9]+]]:gpr32 = COPY [[FADDSrr]] @@ -266,13 +240,11 @@ define bfloat @nnan_ninf_contract_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) ; CHECK-NOBF16-NEXT: [[UBFMWri1:%[0-9]+]]:gpr32 = UBFMWri killed [[ADDWrr1]], 16, 31 ; CHECK-NOBF16-NEXT: [[COPY6:%[0-9]+]]:fpr32 = COPY [[UBFMWri1]] ; CHECK-NOBF16-NEXT: [[COPY7:%[0-9]+]]:fpr16 = COPY [[COPY6]].hsub - ; CHECK-NOBF16-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF2]], killed [[COPY7]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG2]] + ; 
CHECK-NOBF16-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, killed [[COPY7]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG2]] ; CHECK-NOBF16-NEXT: [[COPY8:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_2]].ssub - ; CHECK-NOBF16-NEXT: [[DEF3:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG3]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG3]] ; CHECK-NOBF16-NEXT: [[COPY9:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_3]].ssub ; CHECK-NOBF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FADDSrr killed [[COPY8]], killed [[COPY9]], implicit $fpcr ; CHECK-NOBF16-NEXT: [[COPY10:%[0-9]+]]:gpr32 = COPY [[FADDSrr1]] @@ -292,27 +264,19 @@ define bfloat @nnan_ninf_contract_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) ; CHECK-BF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2 ; CHECK-BF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1 ; CHECK-BF16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK-BF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]] ; CHECK-BF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub - ; CHECK-BF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY2]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed 
[[INSERT_SUBREG1]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY2]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]] ; CHECK-BF16-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub ; CHECK-BF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FADDSrr killed [[COPY4]], killed [[COPY3]], implicit $fpcr - ; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = nnan ninf contract nofpexcept BFCVT killed [[FADDSrr]], implicit $fpcr - ; CHECK-BF16-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF2]], killed [[BFCVT]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG2]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG2]] ; CHECK-BF16-NEXT: [[COPY5:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_2]].ssub - ; CHECK-BF16-NEXT: [[DEF3:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG3]] - ; CHECK-BF16-NEXT: [[COPY6:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_3]].ssub - ; CHECK-BF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FADDSrr killed [[COPY5]], killed [[COPY6]], implicit $fpcr - ; CHECK-BF16-NEXT: [[BFCVT1:%[0-9]+]]:fpr16 = nnan ninf contract nofpexcept BFCVT killed [[FADDSrr1]], implicit $fpcr - ; CHECK-BF16-NEXT: $h0 = COPY [[BFCVT1]] + ; CHECK-BF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = nnan ninf contract nofpexcept FADDSrr killed [[FADDSrr]], killed [[COPY5]], implicit $fpcr + ; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = nnan ninf contract nofpexcept BFCVT killed [[FADDSrr1]], implicit $fpcr + ; CHECK-BF16-NEXT: $h0 = COPY [[BFCVT]] ; CHECK-BF16-NEXT: 
RET_ReallyLR implicit $h0 entry: %add1 = fadd nnan ninf contract bfloat %x, %y @@ -328,13 +292,11 @@ define bfloat @ninf_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) { ; CHECK-NOBF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2 ; CHECK-NOBF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1 ; CHECK-NOBF16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK-NOBF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]] ; CHECK-NOBF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub - ; CHECK-NOBF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF1]], [[COPY2]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY2]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]] ; CHECK-NOBF16-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub ; CHECK-NOBF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[COPY4]], killed [[COPY3]], implicit $fpcr ; CHECK-NOBF16-NEXT: [[COPY5:%[0-9]+]]:gpr32 = COPY [[FADDSrr]] @@ -345,13 +307,11 @@ define bfloat @ninf_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) { ; CHECK-NOBF16-NEXT: [[UBFMWri1:%[0-9]+]]:gpr32 = UBFMWri killed [[ADDWrr1]], 16, 31 ; CHECK-NOBF16-NEXT: [[COPY6:%[0-9]+]]:fpr32 = COPY [[UBFMWri1]] ; CHECK-NOBF16-NEXT: [[COPY7:%[0-9]+]]:fpr16 = COPY [[COPY6]].hsub - ; CHECK-NOBF16-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr64 = 
INSERT_SUBREG [[DEF2]], killed [[COPY7]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG2]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, killed [[COPY7]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG2]] ; CHECK-NOBF16-NEXT: [[COPY8:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_2]].ssub - ; CHECK-NOBF16-NEXT: [[DEF3:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-NOBF16-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.hsub - ; CHECK-NOBF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG3]] + ; CHECK-NOBF16-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK-NOBF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG3]] ; CHECK-NOBF16-NEXT: [[COPY9:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_3]].ssub ; CHECK-NOBF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[COPY8]], killed [[COPY9]], implicit $fpcr ; CHECK-NOBF16-NEXT: [[COPY10:%[0-9]+]]:gpr32 = COPY [[FADDSrr1]] @@ -371,23 +331,19 @@ define bfloat @ninf_fadd_sequence(bfloat %x, bfloat %y, bfloat %z) { ; CHECK-BF16-NEXT: [[COPY:%[0-9]+]]:fpr16 = COPY $h2 ; CHECK-BF16-NEXT: [[COPY1:%[0-9]+]]:fpr16 = COPY $h1 ; CHECK-BF16-NEXT: [[COPY2:%[0-9]+]]:fpr16 = COPY $h0 - ; CHECK-BF16-NEXT: [[DEF:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF]], [[COPY1]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY1]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG]] ; CHECK-BF16-NEXT: [[COPY3:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_]].ssub - ; CHECK-BF16-NEXT: [[DEF1:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG1:%[0-9]+]]:fpr64 = 
INSERT_SUBREG [[DEF1]], [[COPY2]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG1]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG1:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY2]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_1:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG1]] ; CHECK-BF16-NEXT: [[COPY4:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_1]].ssub ; CHECK-BF16-NEXT: [[FADDSrr:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[COPY4]], killed [[COPY3]], implicit $fpcr ; CHECK-BF16-NEXT: [[BFCVT:%[0-9]+]]:fpr16 = ninf nofpexcept BFCVT killed [[FADDSrr]], implicit $fpcr - ; CHECK-BF16-NEXT: [[DEF2:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG2:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF2]], killed [[BFCVT]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG2]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG2:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, killed [[BFCVT]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_2:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG2]] ; CHECK-BF16-NEXT: [[COPY5:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_2]].ssub - ; CHECK-BF16-NEXT: [[DEF3:%[0-9]+]]:fpr64 = IMPLICIT_DEF - ; CHECK-BF16-NEXT: [[INSERT_SUBREG3:%[0-9]+]]:fpr64 = INSERT_SUBREG [[DEF3]], [[COPY]], %subreg.hsub - ; CHECK-BF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[INSERT_SUBREG3]] + ; CHECK-BF16-NEXT: [[SUBREG_TO_REG3:%[0-9]+]]:fpr64 = SUBREG_TO_REG 0, [[COPY]], %subreg.hsub + ; CHECK-BF16-NEXT: [[SHLLv4i16_3:%[0-9]+]]:fpr128 = SHLLv4i16 killed [[SUBREG_TO_REG3]] ; CHECK-BF16-NEXT: [[COPY6:%[0-9]+]]:fpr32 = COPY [[SHLLv4i16_3]].ssub ; CHECK-BF16-NEXT: [[FADDSrr1:%[0-9]+]]:fpr32 = ninf nofpexcept FADDSrr killed [[COPY5]], killed [[COPY6]], implicit $fpcr ; CHECK-BF16-NEXT: [[BFCVT1:%[0-9]+]]:fpr16 = ninf nofpexcept BFCVT killed [[FADDSrr1]], implicit $fpcr From 302aa53f0425bc07f9c9d6b508127a42b135ae75 Mon Sep 17 00:00:00 2001 From: John Brawn Date: Mon, 28 Apr 2025 
14:28:39 +0100 Subject: [PATCH 3/3] Add an explicit COPY_TO_REGCLASS in the no-NEON pattern --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 3562406738c93..9e210f6161f09 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -8527,7 +8527,9 @@ def : Pat<(v4f32 (any_fpextend (extract_high_v8bf16 (v8bf16 V128:$Rn)))), // Fallback pattern for when we don't have NEON def : Pat<(f32 (any_fpextend (bf16 FPR16:$Rn))), (f32 (COPY_TO_REGCLASS - (i32 (UBFMWri (i32 (SUBREG_TO_REG (i32 0), (bf16 FPR16:$Rn), hsub)), + (i32 (UBFMWri (COPY_TO_REGCLASS + (f32 (SUBREG_TO_REG (i32 0), (bf16 FPR16:$Rn), hsub)), + GPR32), (i64 16), (i64 15))), FPR32))>;