diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 83bb1dfe86c6a..d8933826ebb05 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -3313,7 +3313,6 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FP_ROUND: R = SoftPromoteHalfRes_FP_ROUND(N); break; // Unary FP Operations - case ISD::FABS: case ISD::FACOS: case ISD::FASIN: case ISD::FATAN: @@ -3329,7 +3328,6 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FLOG2: case ISD::FLOG10: case ISD::FNEARBYINT: - case ISD::FNEG: case ISD::FREEZE: case ISD::FRINT: case ISD::FROUND: @@ -3341,6 +3339,12 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FTAN: case ISD::FTANH: case ISD::FCANONICALIZE: R = SoftPromoteHalfRes_UnaryOp(N); break; + case ISD::FABS: + R = SoftPromoteHalfRes_FABS(N); + break; + case ISD::FNEG: + R = SoftPromoteHalfRes_FNEG(N); + break; case ISD::AssertNoFPClass: R = SoftPromoteHalfRes_AssertNoFPClass(N); break; @@ -3670,6 +3674,24 @@ SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UnaryOp(SDNode *N) { return DAG.getNode(GetPromotionOpcode(NVT, OVT), dl, MVT::i16, Res); } +SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FABS(SDNode *N) { + SDValue Op = GetSoftPromotedHalf(N->getOperand(0)); + SDLoc dl(N); + + // Clear the sign bit. + return DAG.getNode(ISD::AND, dl, MVT::i16, Op, + DAG.getConstant(0x7fff, dl, MVT::i16)); +} + +SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FNEG(SDNode *N) { + SDValue Op = GetSoftPromotedHalf(N->getOperand(0)); + SDLoc dl(N); + + // Invert the sign bit. + return DAG.getNode(ISD::XOR, dl, MVT::i16, Op, + DAG.getConstant(0x8000, dl, MVT::i16)); +} + SDValue DAGTypeLegalizer::SoftPromoteHalfRes_AssertNoFPClass(SDNode *N) { return GetSoftPromotedHalf(N->getOperand(0)); } diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 586c3411791f9..ed70ae63feb12 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -832,6 +832,8 @@ class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer { SDValue SoftPromoteHalfRes_SELECT(SDNode *N); SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N); SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N); + SDValue SoftPromoteHalfRes_FABS(SDNode *N); + SDValue SoftPromoteHalfRes_FNEG(SDNode *N); SDValue SoftPromoteHalfRes_AssertNoFPClass(SDNode *N); SDValue SoftPromoteHalfRes_XINT_TO_FP(SDNode *N); SDValue SoftPromoteHalfRes_UNDEF(SDNode *N); diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 49fe1eed9c514..9766cdd901bcf 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -19752,18 +19752,14 @@ define bfloat @v_fabs_bf16(bfloat %a) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fabs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_and_b32_e32 v0, 
0x7fff0000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fabs_bf16: @@ -19946,10 +19942,7 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: v_fneg_fabs_bf16: @@ -19957,10 +19950,7 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) { ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 +; GFX7-NEXT: v_or_b32_e32 v0, 0x80000000, v0 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_fneg_fabs_bf16: @@ -20002,23 +19992,17 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) { ; GCN-LABEL: s_fneg_fabs_bf16: ; GCN: ; %bb.0: ; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GCN-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GCN-NEXT: s_bitset0_b32 s0, 31 -; GCN-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GCN-NEXT: s_xor_b32 s0, s0, 0x80000000 -; GCN-NEXT: s_lshr_b32 s0, s0, 16 ; GCN-NEXT: ; return to shader part epilog ; ; GFX7-LABEL: s_fneg_fabs_bf16: ; GFX7: ; %bb.0: ; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0 +; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_or_b32_e32 v0, 0x8000, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX7-NEXT: s_bitset0_b32 s0, 31 -; GFX7-NEXT: s_and_b32 s0, s0, 0xffff0000 -; GFX7-NEXT: s_xor_b32 s0, s0, 0x80000000 -; GFX7-NEXT: s_lshr_b32 s0, s0, 16 ; GFX7-NEXT: ; return to shader part epilog ; ; GFX8-LABEL: s_fneg_fabs_bf16: diff --git a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll index 5d184b17f32e6..c46fcde739b1c 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.bf16.ll @@ -218,19 +218,11 @@ define amdgpu_kernel void @s_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> %in ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s4, s3, 0xffff0000 -; CI-NEXT: s_lshl_b32 s3, s3, 16 -; CI-NEXT: s_and_b32 s5, s2, 0xffff0000 -; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4| -; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s3| -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s5| -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_alignbit_b32 v1, v0, v1, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v2 -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2| -; CI-NEXT: v_alignbit_b32 v0, v0, v2, 16 +; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff +; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -537,16 +529,15 @@ define amdgpu_kernel void @v_fabs_fold_self_v2bf16(ptr addrspace(1) %out, ptr ad ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; 
CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e64 v4, 1.0, |v3| -; CI-NEXT: v_mul_f32_e64 v5, 1.0, |v2| -; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; CI-NEXT: v_mul_f32_e32 v3, v4, v3 -; CI-NEXT: v_mul_f32_e32 v2, v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_and_b32_e32 v3, 0x7fff, v2 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v2 +; CI-NEXT: v_and_b32_e32 v2, 0x7fff0000, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mul_f32_e32 v2, v2, v5 +; CI-NEXT: v_mul_f32_e32 v3, v3, v4 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -898,16 +889,13 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2bf16(ptr addrspace(1) %in) #0 { ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_mul_f32_e64 v1, 1.0, |v1| -; CI-NEXT: v_mul_f32_e64 v0, 1.0, |v0| -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_mul_f32_e32 v1, 4.0, v1 +; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; CI-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0 +; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; CI-NEXT: v_add_f32_e32 v0, 2.0, v0 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_mul_f32_e32 v1, 4.0, v1 ; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; CI-NEXT: flat_store_short v[0:1], v1 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_short v[0:1], v0 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll index 64a9727330cfd..76da0aaf251b2 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.bf16.ll @@ -107,12 +107,10 @@ define amdgpu_kernel void @fneg_fabs_fmul_bf16(ptr addrspace(1) %out, bfloat %x, ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s2, 0x7fff -; CI-NEXT: s_lshl_b32 s3, s3, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 +; CI-NEXT: s_lshl_b32 s3, s2, 16 ; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_mul_f32_e32 v0, s2, v0 +; CI-NEXT: v_mov_b32_e32 v0, s3 +; CI-NEXT: v_mul_f32_e64 v0, s2, -|v0| ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 @@ -204,12 +202,10 @@ define amdgpu_kernel void @fneg_fabs_free_bf16(ptr addrspace(1) %out, i16 %in) { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s2, s2, 0x7fff -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -279,12 +275,10 @@ define amdgpu_kernel void @fneg_fabs_bf16(ptr addrspace(1) %out, bfloat %in) { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s2, s2, 0x7fff -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: 
v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: s_bitset1_b32 s2, 15 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -345,43 +339,22 @@ define amdgpu_kernel void @fneg_fabs_bf16(ptr addrspace(1) %out, bfloat %in) { } define amdgpu_kernel void @v_fneg_fabs_bf16(ptr addrspace(1) %out, ptr addrspace(1) %in) { -; CI-LABEL: v_fneg_fabs_bf16: -; CI: ; %bb.0: -; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; CI-NEXT: s_add_i32 s12, s12, s17 -; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v0, s2 -; CI-NEXT: v_mov_b32_e32 v1, s3 -; CI-NEXT: flat_load_ushort v2, v[0:1] -; CI-NEXT: v_mov_b32_e32 v0, s0 -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |v2| -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; CI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: flat_store_short v[0:1], v2 -; CI-NEXT: s_endpgm -; -; VI-LABEL: v_fneg_fabs_bf16: -; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_add_i32 s12, s12, s17 -; VI-NEXT: s_mov_b32 flat_scratch_lo, s13 -; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: flat_load_ushort v2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_or_b32_e32 v2, 0x8000, v2 -; VI-NEXT: flat_store_short v[0:1], v2 -; VI-NEXT: s_endpgm +; CIVI-LABEL: v_fneg_fabs_bf16: +; CIVI: ; %bb.0: +; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; CIVI-NEXT: s_add_i32 s12, s12, s17 +; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13 +; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; CIVI-NEXT: s_waitcnt lgkmcnt(0) +; CIVI-NEXT: v_mov_b32_e32 v0, s2 +; CIVI-NEXT: v_mov_b32_e32 v1, s3 +; CIVI-NEXT: flat_load_ushort v2, v[0:1] +; CIVI-NEXT: v_mov_b32_e32 v0, s0 +; CIVI-NEXT: v_mov_b32_e32 v1, s1 +; CIVI-NEXT: s_waitcnt vmcnt(0) +; CIVI-NEXT: v_or_b32_e32 v2, 0x8000, v2 +; CIVI-NEXT: flat_store_short v[0:1], v2 +; CIVI-NEXT: s_endpgm ; ; GFX9-LABEL: v_fneg_fabs_bf16: ; GFX9: ; %bb.0: @@ -431,21 +404,13 @@ define amdgpu_kernel void @s_fneg_fabs_v2bf16_non_bc_src(ptr addrspace(1) %out, ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_add_f32_e64 v0, s3, 2.0 -; CI-NEXT: v_add_f32_e64 v1, s2, 1.0 -; CI-NEXT: v_readfirstlane_b32 s2, v0 +; CI-NEXT: s_lshl_b32 s3, s2, 16 ; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: s_bitset0_b32 s2, 31 -; CI-NEXT: v_and_b32_e32 v0, 0x7fffffff, v1 -; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: s_xor_b32 s2, s2, 0x80000000 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: s_lshr_b32 s2, s2, 16 -; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; CI-NEXT: v_alignbit_b32 v2, s2, v0, 16 +; CI-NEXT: v_add_f32_e64 v1, s2, 2.0 +; CI-NEXT: v_add_f32_e64 v0, s3, 1.0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; CI-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; CI-NEXT: v_or_b32_e32 v2, 0x80008000, v0 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_store_dword v[0:1], v2 @@ -566,15 +531,10 @@ define amdgpu_kernel void 
@s_fneg_fabs_v2bf16_bc_src(ptr addrspace(1) %out, <2 x ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_and_b32 s3, s2, 0x7fff -; CI-NEXT: s_and_b32 s2, s2, 0x7fff0000 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: s_lshl_b32 s2, s3, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -629,27 +589,11 @@ define amdgpu_kernel void @fneg_fabs_v4bf16(ptr addrspace(1) %out, <4 x bfloat> ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s4, s2, 16 -; CI-NEXT: s_and_b32 s2, s2, 0xffff0000 -; CI-NEXT: v_mul_f32_e64 v2, 1.0, |s2| -; CI-NEXT: s_and_b32 s2, s3, 0xffff0000 -; CI-NEXT: s_lshl_b32 s5, s3, 16 -; CI-NEXT: v_mul_f32_e64 v3, 1.0, |s2| -; CI-NEXT: v_mul_f32_e64 v0, 1.0, |s4| -; CI-NEXT: v_mul_f32_e64 v1, 1.0, |s5| -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 -; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 -; CI-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 -; CI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; CI-NEXT: v_alignbit_b32 v1, v3, v1, 16 -; CI-NEXT: v_alignbit_b32 v0, v2, v0, 16 +; CI-NEXT: s_or_b32 s3, s3, 0x80008000 +; CI-NEXT: s_or_b32 s2, s2, 0x80008000 ; CI-NEXT: v_mov_b32_e32 v3, s1 +; CI-NEXT: v_mov_b32_e32 v0, s2 +; CI-NEXT: v_mov_b32_e32 v1, s3 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; CI-NEXT: s_endpgm @@ -860,21 +804,20 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2bf16(ptr addrspace(1) %out0, ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: v_mov_b32_e32 v1, s1 -; CI-NEXT: v_mov_b32_e32 v2, s2 -; CI-NEXT: s_and_b32 s1, s4, 0x7fff -; CI-NEXT: s_and_b32 s2, s4, 0x7fff0000 -; CI-NEXT: v_mul_f32_e64 v4, -1.0, s2 -; CI-NEXT: s_lshl_b32 s1, s1, 16 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff -; CI-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; CI-NEXT: v_mul_f32_e64 v5, -1.0, s1 -; CI-NEXT: v_alignbit_b32 v4, v4, v5, 16 -; CI-NEXT: v_mov_b32_e32 v5, s0 +; CI-NEXT: v_mov_b32_e32 v2, s2 +; CI-NEXT: s_or_b32 s2, s0, 0x8000 +; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: s_and_b32 s1, s4, 0x7fff0000 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s1, s1, s2 +; CI-NEXT: s_bitset1_b32 s1, 31 +; CI-NEXT: v_mov_b32_e32 v4, s0 ; CI-NEXT: v_mov_b32_e32 v3, s3 -; CI-NEXT: flat_store_dword v[0:1], v5 -; CI-NEXT: flat_store_dword v[2:3], v4 +; CI-NEXT: flat_store_dword v[0:1], v4 +; CI-NEXT: v_mov_b32_e32 v0, s1 +; CI-NEXT: flat_store_dword v[2:3], v0 ; CI-NEXT: s_endpgm ; ; VI-LABEL: s_fneg_multi_use_fabs_v2bf16: @@ -1086,5 +1029,3 @@ declare <4 x bfloat> @llvm.fabs.v4bf16(<4 x bfloat>) #1 attributes #0 = { nounwind } attributes #1 = { nounwind readnone } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CIVI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll index d232693b46ad9..98044a72870fb 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg.bf16.ll @@ -14,11 +14,10 @@ define amdgpu_kernel void @s_fneg_bf16(ptr addrspace(1) %out, bfloat %in) #0 { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -93,9 +92,7 @@ define amdgpu_kernel void @v_fneg_bf16(ptr addrspace(1) %out, ptr addrspace(1) % ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_ushort v2, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_xor_b32_e32 v2, 0x8000, v2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -170,11 +167,10 @@ define amdgpu_kernel void @s_fneg_free_bf16(ptr addrspace(1) %out, i16 %in) #0 { ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_short v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -248,9 +244,9 @@ define amdgpu_kernel void @v_fneg_fold_bf16(ptr addrspace(1) %out, ptr addrspace ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) +; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2 ; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v3, -1.0, v2 -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; CI-NEXT: v_mul_f32_e32 v2, v3, v2 ; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; CI-NEXT: flat_store_short v[0:1], v2 @@ -365,13 +361,13 @@ define amdgpu_kernel void @s_fneg_v2bf16(ptr addrspace(1) %out, <2 x bfloat> %in ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 -; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s2, s2, s3 +; CI-NEXT: s_add_i32 s2, s2, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -426,16 +422,16 @@ define amdgpu_kernel void @s_fneg_v2bf16_nonload(ptr addrspace(1) %out) #0 { ; CI-NEXT: ; def s2 ; CI-NEXT: ;;#ASMEND ; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s2, s2, s3 +; CI-NEXT: s_add_i32 s2, s2, 0x80000000 ; CI-NEXT: s_waitcnt 
lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_mov_b32 flat_scratch_lo, s13 ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -501,13 +497,11 @@ define amdgpu_kernel void @v_fneg_v2bf16(ptr addrspace(1) %out, ptr addrspace(1) ; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; CI-NEXT: flat_load_dword v2, v[0:1] +; CI-NEXT: s_mov_b32 s0, 0xffff ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v3, -1.0, v3 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_mul_f32_e32 v2, -1.0, v2 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2 +; CI-NEXT: v_bfi_b32 v2, s0, v3, v2 +; CI-NEXT: v_add_i32_e32 v2, vcc, 0x80000000, v2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -570,13 +564,13 @@ define amdgpu_kernel void @fneg_free_v2bf16(ptr addrspace(1) %out, i32 %in) #0 { ; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_and_b32 s3, s2, 0xffff0000 -; CI-NEXT: s_lshl_b32 s2, s2, 16 -; CI-NEXT: v_mul_f32_e64 v0, -1.0, s3 -; CI-NEXT: v_mul_f32_e64 v1, -1.0, s2 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_alignbit_b32 v2, v0, v1, 16 +; CI-NEXT: s_xor_b32 s2, s2, 0x8000 +; CI-NEXT: s_and_b32 s2, s2, 0xffff +; CI-NEXT: s_or_b32 s2, s2, s3 +; CI-NEXT: s_add_i32 s2, s2, 0x80000000 ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -637,16 +631,14 @@ define amdgpu_kernel void @v_fneg_fold_v2bf16(ptr addrspace(1) %out, ptr addrspa ; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v3, 0xffff0000, v2 -; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; CI-NEXT: v_mul_f32_e32 v4, -1.0, v3 -; CI-NEXT: v_mul_f32_e32 v5, -1.0, v2 -; CI-NEXT: v_and_b32_e32 v4, 0xffff0000, v4 -; CI-NEXT: v_and_b32_e32 v5, 0xffff0000, v5 -; CI-NEXT: v_mul_f32_e32 v3, v4, v3 -; CI-NEXT: v_mul_f32_e32 v2, v5, v2 -; CI-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; CI-NEXT: v_xor_b32_e32 v3, 0x8000, v2 +; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v2 +; CI-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 +; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; CI-NEXT: v_mul_f32_e64 v2, -v2, v2 +; CI-NEXT: v_mul_f32_e32 v3, v3, v4 +; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_alignbit_b32 v2, v2, v3, 16 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm ; @@ -912,12 +904,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2bf16(ptr addrspace(1) %in) # ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: flat_load_dword v0, v[0:1] ; CI-NEXT: s_waitcnt vmcnt(0) -; CI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0 -; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; CI-NEXT: v_mul_f32_e32 v1, -1.0, v1 -; CI-NEXT: v_mul_f32_e32 v0, -1.0, v0 -; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; CI-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; CI-NEXT: v_xor_b32_e32 v1, 0x8000, v1 ; CI-NEXT: flat_store_short v[0:1], v0 ; CI-NEXT: s_waitcnt vmcnt(0) ; CI-NEXT: flat_store_short v[0:1], v1 diff --git a/llvm/test/CodeGen/ARM/fp16-promote.ll b/llvm/test/CodeGen/ARM/fp16-promote.ll index 800ee87b95ca8..8230e47259dd8 100644 --- 
a/llvm/test/CodeGen/ARM/fp16-promote.ll +++ b/llvm/test/CodeGen/ARM/fp16-promote.ll @@ -1572,26 +1572,11 @@ define void @test_fma(ptr %p, ptr %q, ptr %r) #0 { } define void @test_fabs(ptr %p) { -; CHECK-FP16-LABEL: test_fabs: -; CHECK-FP16: ldrh r1, [r0] -; CHECK-FP16-NEXT: vmov s0, r1 -; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FP16-NEXT: vabs.f32 s0, s0 -; CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FP16-NEXT: vmov r1, s0 -; CHECK-FP16-NEXT: strh r1, [r0] -; CHECK-FP16-NEXT: bx lr -; -; CHECK-LIBCALL-LABEL: test_fabs: -; CHECK-LIBCALL: .save {r4, lr} -; CHECK-LIBCALL-NEXT: push {r4, lr} -; CHECK-LIBCALL-NEXT: mov r4, r0 -; CHECK-LIBCALL-NEXT: ldrh r0, [r0] -; CHECK-LIBCALL-NEXT: bl __aeabi_h2f -; CHECK-LIBCALL-NEXT: bic r0, r0, #-2147483648 -; CHECK-LIBCALL-NEXT: bl __aeabi_f2h -; CHECK-LIBCALL-NEXT: strh r0, [r4] -; CHECK-LIBCALL-NEXT: pop {r4, pc} +; CHECK-ALL-LABEL: test_fabs: +; CHECK-ALL: ldrh r1, [r0] +; CHECK-ALL-NEXT: bfc r1, #15, #17 +; CHECK-ALL-NEXT: strh r1, [r0] +; CHECK-ALL-NEXT: bx lr %a = load half, ptr %p, align 2 %r = call half @llvm.fabs.f16(half %a) store half %r, ptr %p @@ -2454,26 +2439,11 @@ define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 { } define void @test_fneg(ptr %p1, ptr %p2) #0 { -; CHECK-FP16-LABEL: test_fneg: -; CHECK-FP16: ldrh r0, [r0] -; CHECK-FP16-NEXT: vmov s0, r0 -; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 -; CHECK-FP16-NEXT: vneg.f32 s0, s0 -; CHECK-FP16-NEXT: vcvtb.f16.f32 s0, s0 -; CHECK-FP16-NEXT: vmov r0, s0 -; CHECK-FP16-NEXT: strh r0, [r1] -; CHECK-FP16-NEXT: bx lr -; -; CHECK-LIBCALL-LABEL: test_fneg: -; CHECK-LIBCALL: .save {r4, lr} -; CHECK-LIBCALL-NEXT: push {r4, lr} -; CHECK-LIBCALL-NEXT: ldrh r0, [r0] -; CHECK-LIBCALL-NEXT: mov r4, r1 -; CHECK-LIBCALL-NEXT: bl __aeabi_h2f -; CHECK-LIBCALL-NEXT: eor r0, r0, #-2147483648 -; CHECK-LIBCALL-NEXT: bl __aeabi_f2h -; CHECK-LIBCALL-NEXT: strh r0, [r4] -; CHECK-LIBCALL-NEXT: pop {r4, pc} +; CHECK-ALL-LABEL: test_fneg: +; CHECK-ALL: ldrh r0, [r0] +; CHECK-ALL-NEXT: eor r0, r0, #32768 +; CHECK-ALL-NEXT: strh r0, [r1] +; CHECK-ALL-NEXT: bx lr %v = load half, ptr %p1, align 2 %res = fneg half %v store half %res, ptr %p2, align 2 diff --git a/llvm/test/CodeGen/Generic/bfloat-op.ll b/llvm/test/CodeGen/Generic/bfloat-op.ll new file mode 100644 index 0000000000000..d59332856a2e9 --- /dev/null +++ b/llvm/test/CodeGen/Generic/bfloat-op.ll @@ -0,0 +1,104 @@ +; Same as `bfloat.ll`, but for `fneg`, `fabs`, `copysign` and `fma`. +; Can be merged back into `bfloat.ll` once they have the same platform coverage. +; Once all targets are fixed, the `CHECK-*` prefixes should all be merged into a single `CHECK` prefix and the `BAD-*` prefixes should be removed. 
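As context for the checks in this file: the new SoftPromoteHalfRes_FABS and SoftPromoteHalfRes_FNEG handlers above lower fabs and fneg as integer operations on the i16 storage of the soft-promoted value, so no conversion libcall is ever involved. A minimal C++ sketch of that bit manipulation, mirroring the 0x7fff/0x8000 constants in the handlers (illustrative only; the copysign helper extrapolates the same scheme and is not code from this patch):

#include <cstdint>

// Sign-bit-only operations on the raw 16-bit storage of half/bfloat.
// No conversion to float happens, so signalling-NaN payloads are preserved.
uint16_t fabs_bits(uint16_t h) { return static_cast<uint16_t>(h & 0x7fff); } // clear the sign bit
uint16_t fneg_bits(uint16_t h) { return static_cast<uint16_t>(h ^ 0x8000); } // invert the sign bit
uint16_t copysign_bits(uint16_t mag, uint16_t sgn) {                         // splice the sign bit
  return static_cast<uint16_t>((mag & 0x7fff) | (sgn & 0x8000));
}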
+ +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-apple-darwin | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=arm64ec-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: BPF has a compiler error +; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: hard float csky crashes +; FIXME: directx has a compiler error +; FIXME: hexagon crashes +; RUN: %if lanai-registered-target %{ llc %s -o - -mtriple=lanai-unknown-unknown | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if m68k-registered-target %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: mips crashes +; RUN: %if msp430-registered-target %{ llc %s -o - -mtriple=msp430-none-elf | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if nvptx-registered-target %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda | FileCheck %s --check-prefixes=NOCRASH %} +; FIXME: powerpc crashes +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: sparc crashes +; FIXME: spirv crashes +; FIXME: s390x crashes +; FIXME: ve crashes +; FIXME: wasm crashes +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if xcore-registered-target %{ llc %s -o - 
-mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if xtensa-registered-target %{ llc %s -o - -mtriple=xtensa-none-elf | FileCheck %s --check-prefixes=ALL,BAD-COPYSIGN,CHECK-FMA %}
+
+; Note that arm64ec labels are quoted, hence the `{{"?}}:`.
+
+; Codegen tests don't work the same for graphics targets. Add a dummy directive
+; for filecheck, just make sure we don't crash.
+; NOCRASH: {{.*}}
+
+; fneg, fabs and copysign all need to not quieten signalling NaNs, so should not call any conversion functions which do.
+; These tests won't catch cases where everything is done using native instructions instead of builtins.
+
+define void @test_fneg(ptr %p1, ptr %p2) #0 {
+; ALL-LABEL: test_fneg{{"?}}:
+; ALL-NEG-NOT: __extend
+; ALL-NEG-NOT: __trunc
+; ALL-NEG-NOT: __gnu
+; ALL-NEG-NOT: __aeabi
+  %v = load bfloat, ptr %p1
+  %res = fneg bfloat %v
+  store bfloat %res, ptr %p2
+  ret void
+}
+
+define void @test_fabs(ptr %p1, ptr %p2) {
+; ALL-LABEL: test_fabs{{"?}}:
+; ALL-ABS-NOT: __extend
+; ALL-ABS-NOT: __trunc
+; ALL-ABS-NOT: __gnu
+; ALL-ABS-NOT: __aeabi
+  %a = load bfloat, ptr %p1
+  %r = call bfloat @llvm.fabs.bf16(bfloat %a)
+  store bfloat %r, ptr %p2
+  ret void
+}
+
+define void @test_copysign(ptr %p1, ptr %p2, ptr %p3) {
+; ALL-LABEL: test_copysign{{"?}}:
+; CHECK-COPYSIGN-NOT: __extend
+; CHECK-COPYSIGN-NOT: __trunc
+; CHECK-COPYSIGN-NOT: __gnu
+; CHECK-COPYSIGN-NOT: __aeabi
+; BAD-COPYSIGN: __truncsfbf2
+  %a = load bfloat, ptr %p1
+  %b = load bfloat, ptr %p2
+  %r = call bfloat @llvm.copysign.bf16(bfloat %a, bfloat %b)
+  store bfloat %r, ptr %p3
+  ret void
+}
+
+; There is no floating-point type LLVM supports that is large enough to promote bfloat FMA to
+; without causing double rounding issues. This checks for libcalls to f32/f64 fma and truncating
+; f32/f64 to bf16. See https://github.com/llvm/llvm-project/issues/131531
+
+define void @test_fma(ptr %p1, ptr %p2, ptr %p3, ptr %p4) {
+; ALL-LABEL: test_fma{{"?}}:
+; CHECK-FMA-NOT: {{\bfmaf?\b}}
+; CHECK-FMA-NOT: __truncsfbf2
+; CHECK-FMA-NOT: __truncdfbf2
+; BAD-FMA: {{__truncsfbf2|\bfmaf?\b}}
+  %a = load bfloat, ptr %p1
+  %b = load bfloat, ptr %p2
+  %c = load bfloat, ptr %p3
+  %r = call bfloat @llvm.fma.bf16(bfloat %a, bfloat %b, bfloat %c)
+  store bfloat %r, ptr %p4
+  ret void
+}
diff --git a/llvm/test/CodeGen/Generic/bfloat.ll b/llvm/test/CodeGen/Generic/bfloat.ll
new file mode 100644
index 0000000000000..83c6711de89cd
--- /dev/null
+++ b/llvm/test/CodeGen/Generic/bfloat.ll
@@ -0,0 +1,75 @@
+; Simple cross-platform smoke checks for basic bf16 operations.
+;
+; There shouldn't be any architectures that crash when trying to use `bfloat`;
+; check that here. Additionally do a small handful of smoke tests that work
+; well cross-platform.
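The from_bits/to_bits tests further below pin down that a bitcast between bfloat and i16 stays a pure bit move and never becomes a value conversion through a libcall such as __truncsfbf2. A rough C++ analogue of that contract (the bf16 wrapper type is a hypothetical stand-in for illustration):

#include <bit>
#include <cstdint>

// A bitcast reinterprets the 16 stored bits unchanged; it should compile to
// a plain register move (or nothing at all), not a float conversion.
struct bf16 { uint16_t bits; };

bf16 from_bits(uint16_t bits) { return std::bit_cast<bf16>(bits); }
uint16_t to_bits(bf16 f) { return std::bit_cast<uint16_t>(f); }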
+ +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-apple-darwin | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; FIXME: arm64ec crashes when passing/returning bfloat +; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if bpf-registered-target %{ llc %s -o - -mtriple=bpfel | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 | FileCheck %s --check-prefixes=ALL,CHECK %} +; FIXME: hard float csky crashes +; RUN: %if directx-registered-target %{ llc %s -o - -mtriple=dxil-pc-shadermodel6.3-library | FileCheck %s --check-prefixes=NOCRASH %} +; FIXME: hexagon crashes +; RUN: %if lanai-registered-target %{ llc %s -o - -mtriple=lanai-unknown-unknown | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if m68k-registered-target %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; FIXME: mips crashes +; RUN: %if msp430-registered-target %{ llc %s -o - -mtriple=msp430-none-elf | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if nvptx-registered-target %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda | FileCheck %s --check-prefixes=NOCRASH %} +; FIXME: powerpc crashes +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; FIXME: sparc crashes +; FIXME: spirv crashes +; FIXME: s390x crashes +; FIXME: ve crashes +; FIXME: wasm crashes +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if xcore-registered-target %{ llc %s -o - -mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,CHECK %} +; RUN: %if xtensa-registered-target %{ llc %s -o - -mtriple=xtensa-none-elf | FileCheck %s --check-prefixes=ALL,CHECK %} + +; Note that arm64ec labels are quoted, hence the `{{"?}}:`. 
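A side note on the FMA tests in bfloat-op.ll above and half-op.ll further below: promoting a half FMA to f32 rounds the exact a*b+c twice (once to f32, then to f16), which can disagree with rounding it to f16 once, whereas an f64 intermediate is wide enough to remain correctly rounded for half operands. A brute-force search sketch for such a double-rounding case, assuming a host compiler with _Float16 support (e.g. recent Clang on x86-64):

#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

static uint64_t State = 0x9e3779b97f4a7c15u;

// Crude LCG returning the bits of a random finite, non-negative half.
static uint16_t randomFiniteHalf() {
  State = State * 6364136223846793005u + 1442695040888963407u;
  return static_cast<uint16_t>((State >> 24) % 0x7c00);
}

int main() {
  for (long I = 0; I < 100000000; ++I) {
    uint16_t HA = randomFiniteHalf(), HB = randomFiniteHalf(), HC = randomFiniteHalf();
    _Float16 A, B, C;
    std::memcpy(&A, &HA, 2);
    std::memcpy(&B, &HB, 2);
    std::memcpy(&C, &HC, 2);
    // f32 intermediate: rounds twice, can be off by one half ulp pattern.
    _Float16 ViaF32 = static_cast<_Float16>(std::fmaf(A, B, C));
    // f64 intermediate: wide enough to be correctly rounded for half.
    _Float16 ViaF64 = static_cast<_Float16>(std::fma(
        static_cast<double>(A), static_cast<double>(B), static_cast<double>(C)));
    if (std::memcmp(&ViaF32, &ViaF64, 2) != 0) {
      std::printf("double rounding: a=%04x b=%04x c=%04x\n", HA, HB, HC);
      return 0;
    }
  }
  std::printf("no double-rounding case in this sample\n");
  return 0;
}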
+ +; Codegen tests don't work the same for graphics targets. Add a dummy directive +; for filecheck, just make sure we don't crash. +; NOCRASH: {{.*}} + +; All backends need to be able to bitcast without converting to another format, +; so we assert against libcalls (specifically __truncsfbf2). This won't catch hardware conversions. + +define bfloat @from_bits(i16 %bits) nounwind { +; ALL-LABEL: from_bits{{"?}}: +; ALL-NOT: __extend +; ALL-NOT: __trunc +; ALL-NOT: __gnu + %f = bitcast i16 %bits to bfloat + ret bfloat %f +} + +define i16 @to_bits(bfloat %f) nounwind { +; ALL-LABEL: to_bits{{"?}}: +; CHECK-NOT: __extend +; CHECK-NOT: __trunc +; CHECK-NOT: __gnu +; BAD: __truncsfbf2 + %bits = bitcast bfloat %f to i16 + ret i16 %bits +} + +define bfloat @check_freeze(bfloat %f) nounwind { +; ALL-LABEL: check_freeze{{"?}}: + %t0 = freeze bfloat %f + ret bfloat %t0 +} diff --git a/llvm/test/CodeGen/Generic/half-op.ll b/llvm/test/CodeGen/Generic/half-op.ll new file mode 100644 index 0000000000000..1037d8e20cc10 --- /dev/null +++ b/llvm/test/CodeGen/Generic/half-op.ll @@ -0,0 +1,115 @@ +; Same as `half.ll`, but for `fneg`, `fabs`, `copysign` and `fma`. +; Can be merged back into `half.ll` once BPF doesn't have a compiler error. +; Once all targets are fixed, the `CHECK-*` prefixes should all be merged into a single `CHECK` prefix and the `BAD-*` prefixes should be removed. + +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-apple-darwin | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=aarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if aarch64-registered-target %{ llc %s -o - -mtriple=arm64ec-pc-windows-msvc | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if amdgpu-registered-target %{ llc %s -o - -mtriple=amdgcn-amd-amdhsa | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,CHECK-FMA %} +; RUN: %if arc-registered-target %{ llc %s -o - -mtriple=arc-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=arm-unknown-linux-gnueabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if arm-registered-target %{ llc %s -o - -mtriple=thumbv7em-none-eabi | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if avr-registered-target %{ llc %s -o - -mtriple=avr-none | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; FIXME: BPF has a compiler error +; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if csky-registered-target %{ llc %s -o - -mtriple=csky-unknown-linux-gnuabiv2 -mcpu=ck860fv -mattr=+hard-float | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; FIXME: directx has a compiler error +; RUN: %if hexagon-registered-target %{ llc %s -o - -mtriple=hexagon-unknown-linux-musl | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if lanai-registered-target %{ llc %s -o - -mtriple=lanai-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if 
loongarch-registered-target %{ llc %s -o - -mtriple=loongarch32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if loongarch-registered-target %{ llc %s -o - -mtriple=loongarch64-unknown-linux-gnu -mattr=+f | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if m68k-registered-target %{ llc %s -o - -mtriple=m68k-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mips64el-unknown-linux-gnuabi64 | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if mips-registered-target %{ llc %s -o - -mtriple=mipsel-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if msp430-registered-target %{ llc %s -o - -mtriple=msp430-none-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if nvptx-registered-target %{ llc %s -o - -mtriple=nvptx64-nvidia-cuda | FileCheck %s --check-prefixes=NOCRASH %} +; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if powerpc-registered-target %{ llc %s -o - -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv32-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if riscv-registered-target %{ llc %s -o - -mtriple=riscv64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if sparc-registered-target %{ llc %s -o - -mtriple=sparc64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if spirv-registered-target %{ llc %s -o - -mtriple=spirv-unknown-unknown | FileCheck %s --check-prefixes=NOCRASH %} +; RUN: %if systemz-registered-target %{ llc %s -o - -mtriple=s390x-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if ve-registered-target %{ llc %s -o - -mtriple=ve-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if webassembly-registered-target %{ llc %s -o - -mtriple=wasm32-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %} +; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-pc-windows-msvc | 
FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if x86-registered-target %{ llc %s -o - -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=ALL,CHECK-NEG-ABS,CHECK-COPYSIGN,BAD-FMA %}
+; RUN: %if xcore-registered-target %{ llc %s -o - -mtriple=xcore-unknown-unknown | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,BAD-FMA %}
+; RUN: %if xtensa-registered-target %{ llc %s -o - -mtriple=xtensa-none-elf | FileCheck %s --check-prefixes=ALL,BAD-NEG-ABS,BAD-COPYSIGN,CHECK-FMA %}
+
+; Note that arm64ec labels are quoted, hence the `{{"?}}:`.
+
+; Codegen tests don't work the same for graphics targets. Add a dummy directive
+; for filecheck, just make sure we don't crash.
+; NOCRASH: {{.*}}
+
+; fneg, fabs and copysign all need to not quieten signalling NaNs, so should not call any conversion functions which do.
+; These tests won't catch cases where everything is done using native instructions instead of builtins.
+; See https://github.com/llvm/llvm-project/issues/104915
+
+define void @test_fneg(ptr %p1, ptr %p2) #0 {
+; ALL-LABEL: test_fneg{{"?}}:
+; CHECK-NEG-ABS-NOT: __extend
+; CHECK-NEG-ABS-NOT: __trunc
+; CHECK-NEG-ABS-NOT: __gnu
+; CHECK-NEG-ABS-NOT: __aeabi
+; BAD-NEG-ABS: {{__extendhfsf2|__gnu_h2f_ieee|__aeabi_h2f}}
+  %v = load half, ptr %p1
+  %res = fneg half %v
+  store half %res, ptr %p2
+  ret void
+}
+
+define void @test_fabs(ptr %p1, ptr %p2) {
+; ALL-LABEL: test_fabs{{"?}}:
+; CHECK-NEG-ABS-NOT: __extend
+; CHECK-NEG-ABS-NOT: __trunc
+; CHECK-NEG-ABS-NOT: __gnu
+; CHECK-NEG-ABS-NOT: __aeabi
+; BAD-NEG-ABS: {{__extendhfsf2|__gnu_h2f_ieee|__aeabi_h2f}}
+  %a = load half, ptr %p1
+  %r = call half @llvm.fabs.f16(half %a)
+  store half %r, ptr %p2
+  ret void
+}
+
+define void @test_copysign(ptr %p1, ptr %p2, ptr %p3) {
+; ALL-LABEL: test_copysign{{"?}}:
+; CHECK-COPYSIGN-NOT: __extend
+; CHECK-COPYSIGN-NOT: __trunc
+; CHECK-COPYSIGN-NOT: __gnu
+; CHECK-COPYSIGN-NOT: __aeabi
+; BAD-COPYSIGN: {{__extendhfsf2|__gnu_h2f_ieee}}
+  %a = load half, ptr %p1
+  %b = load half, ptr %p2
+  %r = call half @llvm.copysign.f16(half %a, half %b)
+  store half %r, ptr %p3
+  ret void
+}
+
+; If promoting, fma must promote at least to f64 to avoid double rounding issues.
+; This checks for calls to f32 fmaf and truncating f32 to f16.
+; See https://github.com/llvm/llvm-project/issues/98389 + +define void @test_fma(ptr %p1, ptr %p2, ptr %p3, ptr %p4) { +; ALL-LABEL: test_fma{{"?}}: +; Allow fmaf16 +; CHECK-FMA-NOT: fmaf{{\b}} +; CHECK-FMA-NOT: __truncsfhf2 +; CHECK-FMA-NOT: __gnu_f2h_ieee +; CHECK-FMA-NOT: __aeabi_f2h +; BAD-FMA: {{__truncsfhf2|__gnu_f2h_ieee|__aeabi_f2h|fmaf\b}} + %a = load half, ptr %p1 + %b = load half, ptr %p2 + %c = load half, ptr %p3 + %r = call half @llvm.fma.f16(half %a, half %b, half %c) + store half %r, ptr %p4 + ret void +} diff --git a/llvm/test/CodeGen/RISCV/half-arith.ll b/llvm/test/CodeGen/RISCV/half-arith.ll index 84163b52bb98d..a11bce9bae319 100644 --- a/llvm/test/CodeGen/RISCV/half-arith.ll +++ b/llvm/test/CodeGen/RISCV/half-arith.ll @@ -514,6 +514,7 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV32I-NEXT: sw ra, 12(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s0, 8(sp) # 4-byte Folded Spill ; RV32I-NEXT: sw s1, 4(sp) # 4-byte Folded Spill +; RV32I-NEXT: sw s2, 0(sp) # 4-byte Folded Spill ; RV32I-NEXT: lui a1, 16 ; RV32I-NEXT: addi s1, a1, -1 ; RV32I-NEXT: and a0, a0, s1 @@ -521,13 +522,12 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: lui a1, 8 +; RV32I-NEXT: xor s2, a0, a1 ; RV32I-NEXT: and a0, a0, s1 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv s0, a0 -; RV32I-NEXT: lui a0, 524288 -; RV32I-NEXT: xor a0, s0, a0 -; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s1 +; RV32I-NEXT: and a0, s2, s1 ; RV32I-NEXT: call __extendhfsf2 ; RV32I-NEXT: mv a1, a0 ; RV32I-NEXT: mv a0, s0 @@ -536,6 +536,7 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s0, 8(sp) # 4-byte Folded Reload ; RV32I-NEXT: lw s1, 4(sp) # 4-byte Folded Reload +; RV32I-NEXT: lw s2, 0(sp) # 4-byte Folded Reload ; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: ret ; @@ -545,6 +546,7 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV64I-NEXT: sd ra, 24(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s0, 16(sp) # 8-byte Folded Spill ; RV64I-NEXT: sd s1, 8(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 0(sp) # 8-byte Folded Spill ; RV64I-NEXT: lui a1, 16 ; RV64I-NEXT: addi s1, a1, -1 ; RV64I-NEXT: and a0, a0, s1 @@ -552,13 +554,12 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: call __addsf3 ; RV64I-NEXT: call __truncsfhf2 +; RV64I-NEXT: lui a1, 8 +; RV64I-NEXT: xor s2, a0, a1 ; RV64I-NEXT: and a0, a0, s1 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv s0, a0 -; RV64I-NEXT: lui a0, 524288 -; RV64I-NEXT: xor a0, s0, a0 -; RV64I-NEXT: call __truncsfhf2 -; RV64I-NEXT: and a0, a0, s1 +; RV64I-NEXT: and a0, s2, s1 ; RV64I-NEXT: call __extendhfsf2 ; RV64I-NEXT: mv a1, a0 ; RV64I-NEXT: mv a0, s0 @@ -567,6 +568,7 @@ define i32 @fneg_h(half %a, half %b) nounwind { ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s0, 16(sp) # 8-byte Folded Reload ; RV64I-NEXT: ld s1, 8(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 0(sp) # 8-byte Folded Reload ; RV64I-NEXT: addi sp, sp, 32 ; RV64I-NEXT: ret ; @@ -638,11 +640,7 @@ define half @fsgnjn_h(half %a, half %b) nounwind { ; RV32I-NEXT: mv a0, s2 ; RV32I-NEXT: call __addsf3 ; RV32I-NEXT: call __truncsfhf2 -; RV32I-NEXT: and a0, a0, s3 -; RV32I-NEXT: call __extendhfsf2 -; RV32I-NEXT: lui a1, 524288 -; RV32I-NEXT: xor a0, a0, a1 -; RV32I-NEXT: call __truncsfhf2 +; RV32I-NEXT: not a0, a0 ; RV32I-NEXT: lui a1, 1048568 ; RV32I-NEXT: slli s1, s1, 17 ; 
RV32I-NEXT: and a0, a0, a1
@@ -677,11 +675,7 @@ define half @fsgnjn_h(half %a, half %b) nounwind {
 ; RV64I-NEXT: mv a0, s2
 ; RV64I-NEXT: call __addsf3
 ; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: and a0, a0, s3
-; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: lui a1, 524288
-; RV64I-NEXT: xor a0, a0, a1
-; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: not a0, a0
 ; RV64I-NEXT: lui a1, 1048568
 ; RV64I-NEXT: slli s1, s1, 49
 ; RV64I-NEXT: and a0, a0, a1
@@ -804,15 +798,14 @@ define half @fabs_h(half %a, half %b) nounwind {
 ; RV32I-NEXT: mv a0, s1
 ; RV32I-NEXT: call __addsf3
 ; RV32I-NEXT: call __truncsfhf2
+; RV32I-NEXT: slli s0, a0, 17
+; RV32I-NEXT: srli s0, s0, 17
 ; RV32I-NEXT: and a0, a0, s2
 ; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: slli a0, a0, 1
-; RV32I-NEXT: srli a0, a0, 1
-; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: and a0, a0, s2
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: mv a0, s0
 ; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv a1, s0
+; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: call __addsf3
 ; RV32I-NEXT: call __truncsfhf2
 ; RV32I-NEXT: lw ra, 12(sp) # 4-byte Folded Reload
@@ -841,15 +834,14 @@ define half @fabs_h(half %a, half %b) nounwind {
 ; RV64I-NEXT: mv a0, s1
 ; RV64I-NEXT: call __addsf3
 ; RV64I-NEXT: call __truncsfhf2
+; RV64I-NEXT: slli s0, a0, 49
+; RV64I-NEXT: srli s0, s0, 49
 ; RV64I-NEXT: and a0, a0, s2
 ; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv s0, a0
-; RV64I-NEXT: slli a0, a0, 33
-; RV64I-NEXT: srli a0, a0, 33
-; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: and a0, a0, s2
+; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: mv a0, s0
 ; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv a1, s0
+; RV64I-NEXT: mv a1, s1
 ; RV64I-NEXT: call __addsf3
 ; RV64I-NEXT: call __truncsfhf2
 ; RV64I-NEXT: ld ra, 24(sp) # 8-byte Folded Reload
@@ -1217,25 +1209,21 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT: mv s0, a1
 ; RV32I-NEXT: mv s1, a0
 ; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and a0, a2, s3
+; RV32I-NEXT: addi s2, a0, -1
+; RV32I-NEXT: and a0, a2, s2
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: li a1, 0
 ; RV32I-NEXT: call __addsf3
 ; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: and a0, a0, s3
-; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: lui a1, 524288
-; RV32I-NEXT: xor a0, a0, a1
-; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: lui a1, 8
+; RV32I-NEXT: xor s3, a0, a1
+; RV32I-NEXT: and a0, s1, s2
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
+; RV32I-NEXT: and a0, s0, s2
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: and a0, s2, s3
+; RV32I-NEXT: and a0, s3, s2
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv a2, a0
 ; RV32I-NEXT: mv a0, s1
@@ -1261,25 +1249,21 @@ define half @fmsub_h(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT: mv s0, a1
 ; RV64I-NEXT: mv s1, a0
 ; RV64I-NEXT: lui a0, 16
-; RV64I-NEXT: addi s3, a0, -1
-; RV64I-NEXT: and a0, a2, s3
+; RV64I-NEXT: addi s2, a0, -1
+; RV64I-NEXT: and a0, a2, s2
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: li a1, 0
 ; RV64I-NEXT: call __addsf3
 ; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: and a0, a0, s3
-; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: lui a1, 524288
-; RV64I-NEXT: xor a0, a0, a1
-; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: mv s2, a0
-; RV64I-NEXT: and a0, s1, s3
+; RV64I-NEXT: lui a1, 8
+; RV64I-NEXT: xor s3, a0, a1
+; RV64I-NEXT: and a0, s1, s2
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: and a0, s0, s3
+; RV64I-NEXT: and a0, s0, s2
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv s0, a0
-; RV64I-NEXT: and a0, s2, s3
+; RV64I-NEXT: and a0, s3, s2
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv a2, a0
 ; RV64I-NEXT: mv a0, s1
@@ -1355,43 +1339,34 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s1, a2
-; RV32I-NEXT: mv s0, a1
-; RV32I-NEXT: lui s3, 16
-; RV32I-NEXT: addi s3, s3, -1
+; RV32I-NEXT: mv s0, a2
+; RV32I-NEXT: mv s1, a1
+; RV32I-NEXT: lui a1, 16
+; RV32I-NEXT: addi s3, a1, -1
 ; RV32I-NEXT: and a0, a0, s3
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: li a1, 0
 ; RV32I-NEXT: call __addsf3
 ; RV32I-NEXT: call __truncsfhf2
 ; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: and a0, s0, s3
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: li a1, 0
 ; RV32I-NEXT: call __addsf3
 ; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s2, s3
-; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: lui s4, 524288
-; RV32I-NEXT: xor a0, a0, s4
-; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: mv s2, a0
+; RV32I-NEXT: lui a1, 8
+; RV32I-NEXT: xor s2, s2, a1
+; RV32I-NEXT: xor s4, a0, a1
 ; RV32I-NEXT: and a0, s1, s3
 ; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: xor a0, a0, s4
-; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
-; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv s0, a0
 ; RV32I-NEXT: and a0, s2, s3
 ; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: and a0, s4, s3
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv a2, a0
-; RV32I-NEXT: mv a0, s2
+; RV32I-NEXT: mv a0, s1
 ; RV32I-NEXT: mv a1, s0
 ; RV32I-NEXT: call fmaf
 ; RV32I-NEXT: call __truncsfhf2
@@ -1413,43 +1388,34 @@ define half @fnmadd_h(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT: mv s1, a2
-; RV64I-NEXT: mv s0, a1
-; RV64I-NEXT: lui s3, 16
-; RV64I-NEXT: addi s3, s3, -1
+; RV64I-NEXT: mv s0, a2
+; RV64I-NEXT: mv s1, a1
+; RV64I-NEXT: lui a1, 16
+; RV64I-NEXT: addi s3, a1, -1
 ; RV64I-NEXT: and a0, a0, s3
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: li a1, 0
 ; RV64I-NEXT: call __addsf3
 ; RV64I-NEXT: call __truncsfhf2
 ; RV64I-NEXT: mv s2, a0
-; RV64I-NEXT: and a0, s1, s3
+; RV64I-NEXT: and a0, s0, s3
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: li a1, 0
 ; RV64I-NEXT: call __addsf3
 ; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: and a0, s2, s3
-; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: lui s4, 524288
-; RV64I-NEXT: xor a0, a0, s4
-; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: mv s2, a0
+; RV64I-NEXT: lui a1, 8
+; RV64I-NEXT: xor s2, s2, a1
+; RV64I-NEXT: xor s4, a0, a1
 ; RV64I-NEXT: and a0, s1, s3
 ; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: xor a0, a0, s4
-; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: and a0, s0, s3
-; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv s0, a0
 ; RV64I-NEXT: and a0, s2, s3
 ; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv s2, a0
-; RV64I-NEXT: and a0, s1, s3
+; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: and a0, s4, s3
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv a2, a0
-; RV64I-NEXT: mv a0, s2
+; RV64I-NEXT: mv a0, s1
 ; RV64I-NEXT: mv a1, s0
 ; RV64I-NEXT: call fmaf
 ; RV64I-NEXT: call __truncsfhf2
@@ -1535,44 +1501,35 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s3, 12(sp) # 4-byte Folded Spill
 ; RV32I-NEXT: sw s4, 8(sp) # 4-byte Folded Spill
-; RV32I-NEXT: mv s1, a2
-; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: lui s3, 16
-; RV32I-NEXT: addi s3, s3, -1
+; RV32I-NEXT: mv s0, a2
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: lui a0, 16
+; RV32I-NEXT: addi s3, a0, -1
 ; RV32I-NEXT: and a0, a1, s3
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: li a1, 0
 ; RV32I-NEXT: call __addsf3
 ; RV32I-NEXT: call __truncsfhf2
 ; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: and a0, s0, s3
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: li a1, 0
 ; RV32I-NEXT: call __addsf3
 ; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s2, s3
-; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: lui s4, 524288
-; RV32I-NEXT: xor a0, a0, s4
-; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: mv s2, a0
+; RV32I-NEXT: lui a1, 8
+; RV32I-NEXT: xor s2, s2, a1
+; RV32I-NEXT: xor s4, a0, a1
 ; RV32I-NEXT: and a0, s1, s3
 ; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: xor a0, a0, s4
-; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
-; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv s0, a0
 ; RV32I-NEXT: and a0, s2, s3
 ; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: and a0, s4, s3
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv a2, a0
 ; RV32I-NEXT: mv a0, s0
-; RV32I-NEXT: mv a1, s2
+; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: call fmaf
 ; RV32I-NEXT: call __truncsfhf2
 ; RV32I-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
@@ -1593,44 +1550,35 @@ define half @fnmadd_h_2(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT: sd s2, 16(sp) # 8-byte Folded Spill
 ; RV64I-NEXT: sd s3, 8(sp) # 8-byte Folded Spill
 ; RV64I-NEXT: sd s4, 0(sp) # 8-byte Folded Spill
-; RV64I-NEXT: mv s1, a2
-; RV64I-NEXT: mv s0, a0
-; RV64I-NEXT: lui s3, 16
-; RV64I-NEXT: addi s3, s3, -1
+; RV64I-NEXT: mv s0, a2
+; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: lui a0, 16
+; RV64I-NEXT: addi s3, a0, -1
 ; RV64I-NEXT: and a0, a1, s3
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: li a1, 0
 ; RV64I-NEXT: call __addsf3
 ; RV64I-NEXT: call __truncsfhf2
 ; RV64I-NEXT: mv s2, a0
-; RV64I-NEXT: and a0, s1, s3
+; RV64I-NEXT: and a0, s0, s3
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: li a1, 0
 ; RV64I-NEXT: call __addsf3
 ; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: and a0, s2, s3
-; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: lui s4, 524288
-; RV64I-NEXT: xor a0, a0, s4
-; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: mv s2, a0
+; RV64I-NEXT: lui a1, 8
+; RV64I-NEXT: xor s2, s2, a1
+; RV64I-NEXT: xor s4, a0, a1
 ; RV64I-NEXT: and a0, s1, s3
 ; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: xor a0, a0, s4
-; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: and a0, s0, s3
-; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv s0, a0
 ; RV64I-NEXT: and a0, s2, s3
 ; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: mv s2, a0
-; RV64I-NEXT: and a0, s1, s3
+; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: and a0, s4, s3
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv a2, a0
 ; RV64I-NEXT: mv a0, s0
-; RV64I-NEXT: mv a1, s2
+; RV64I-NEXT: mv a1, s1
 ; RV64I-NEXT: call fmaf
 ; RV64I-NEXT: call __truncsfhf2
 ; RV64I-NEXT: ld ra, 40(sp) # 8-byte Folded Reload
@@ -1960,25 +1908,21 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT: mv s0, a2
 ; RV32I-NEXT: mv s1, a1
 ; RV32I-NEXT: lui a1, 16
-; RV32I-NEXT: addi s3, a1, -1
-; RV32I-NEXT: and a0, a0, s3
+; RV32I-NEXT: addi s2, a1, -1
+; RV32I-NEXT: and a0, a0, s2
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: li a1, 0
 ; RV32I-NEXT: call __addsf3
 ; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: and a0, a0, s3
-; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: lui a1, 524288
-; RV32I-NEXT: xor a0, a0, a1
-; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: lui a1, 8
+; RV32I-NEXT: xor s3, a0, a1
+; RV32I-NEXT: and a0, s1, s2
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
+; RV32I-NEXT: and a0, s0, s2
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: and a0, s2, s3
+; RV32I-NEXT: and a0, s3, s2
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv a1, s1
 ; RV32I-NEXT: mv a2, s0
@@ -2003,25 +1947,21 @@ define half @fnmsub_h(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT: mv s0, a2
 ; RV64I-NEXT: mv s1, a1
 ; RV64I-NEXT: lui a1, 16
-; RV64I-NEXT: addi s3, a1, -1
-; RV64I-NEXT: and a0, a0, s3
+; RV64I-NEXT: addi s2, a1, -1
+; RV64I-NEXT: and a0, a0, s2
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: li a1, 0
 ; RV64I-NEXT: call __addsf3
 ; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: and a0, a0, s3
-; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: lui a1, 524288
-; RV64I-NEXT: xor a0, a0, a1
-; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: mv s2, a0
-; RV64I-NEXT: and a0, s1, s3
+; RV64I-NEXT: lui a1, 8
+; RV64I-NEXT: xor s3, a0, a1
+; RV64I-NEXT: and a0, s1, s2
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: and a0, s0, s3
+; RV64I-NEXT: and a0, s0, s2
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv s0, a0
-; RV64I-NEXT: and a0, s2, s3
+; RV64I-NEXT: and a0, s3, s2
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv a1, s1
 ; RV64I-NEXT: mv a2, s0
@@ -2096,25 +2036,21 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT: mv s0, a2
 ; RV32I-NEXT: mv s1, a0
 ; RV32I-NEXT: lui a0, 16
-; RV32I-NEXT: addi s3, a0, -1
-; RV32I-NEXT: and a0, a1, s3
+; RV32I-NEXT: addi s2, a0, -1
+; RV32I-NEXT: and a0, a1, s2
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: li a1, 0
 ; RV32I-NEXT: call __addsf3
 ; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: and a0, a0, s3
-; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: lui a1, 524288
-; RV32I-NEXT: xor a0, a0, a1
-; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: mv s2, a0
-; RV32I-NEXT: and a0, s1, s3
+; RV32I-NEXT: lui a1, 8
+; RV32I-NEXT: xor s3, a0, a1
+; RV32I-NEXT: and a0, s1, s2
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv s1, a0
-; RV32I-NEXT: and a0, s0, s3
+; RV32I-NEXT: and a0, s0, s2
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv s0, a0
-; RV32I-NEXT: and a0, s2, s3
+; RV32I-NEXT: and a0, s3, s2
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv a1, a0
 ; RV32I-NEXT: mv a0, s1
@@ -2140,25 +2076,21 @@ define half @fnmsub_h_2(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT: mv s0, a2
 ; RV64I-NEXT: mv s1, a0
 ; RV64I-NEXT: lui a0, 16
-; RV64I-NEXT: addi s3, a0, -1
-; RV64I-NEXT: and a0, a1, s3
+; RV64I-NEXT: addi s2, a0, -1
+; RV64I-NEXT: and a0, a1, s2
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: li a1, 0
 ; RV64I-NEXT: call __addsf3
 ; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: and a0, a0, s3
-; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: lui a1, 524288
-; RV64I-NEXT: xor a0, a0, a1
-; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: mv s2, a0
-; RV64I-NEXT: and a0, s1, s3
+; RV64I-NEXT: lui a1, 8
+; RV64I-NEXT: xor s3, a0, a1
+; RV64I-NEXT: and a0, s1, s2
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv s1, a0
-; RV64I-NEXT: and a0, s0, s3
+; RV64I-NEXT: and a0, s0, s2
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv s0, a0
-; RV64I-NEXT: and a0, s2, s3
+; RV64I-NEXT: and a0, s3, s2
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv a1, a0
 ; RV64I-NEXT: mv a0, s1
@@ -2519,12 +2451,8 @@ define half @fnmadd_h_contract(half %a, half %b, half %c) nounwind {
 ; RV32I-NEXT: mv a0, s2
 ; RV32I-NEXT: call __mulsf3
 ; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: and a0, a0, s3
-; RV32I-NEXT: call __extendhfsf2
-; RV32I-NEXT: lui a1, 524288
-; RV32I-NEXT: xor a0, a0, a1
-; RV32I-NEXT: call __truncsfhf2
-; RV32I-NEXT: mv s1, a0
+; RV32I-NEXT: lui a1, 8
+; RV32I-NEXT: xor s1, a0, a1
 ; RV32I-NEXT: and a0, s0, s3
 ; RV32I-NEXT: call __extendhfsf2
 ; RV32I-NEXT: mv s0, a0
@@ -2580,12 +2508,8 @@ define half @fnmadd_h_contract(half %a, half %b, half %c) nounwind {
 ; RV64I-NEXT: mv a0, s2
 ; RV64I-NEXT: call __mulsf3
 ; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: and a0, a0, s3
-; RV64I-NEXT: call __extendhfsf2
-; RV64I-NEXT: lui a1, 524288
-; RV64I-NEXT: xor a0, a0, a1
-; RV64I-NEXT: call __truncsfhf2
-; RV64I-NEXT: mv s1, a0
+; RV64I-NEXT: lui a1, 8
+; RV64I-NEXT: xor s1, a0, a1
 ; RV64I-NEXT: and a0, s0, s3
 ; RV64I-NEXT: call __extendhfsf2
 ; RV64I-NEXT: mv s0, a0
diff --git a/llvm/test/CodeGen/Thumb2/mve-vabd.ll b/llvm/test/CodeGen/Thumb2/mve-vabd.ll
index 8d52fe52d9360..3c35a29c0a84c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vabd.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vabd.ll
@@ -63,34 +63,30 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) {
 ; CHECK-MVE-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
 ; CHECK-MVE-NEXT: vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-MVE-NEXT: mov r4, r0
-; CHECK-MVE-NEXT: vmov.u16 r0, q1[0]
+; CHECK-MVE-NEXT: vmov.u16 r0, q1[1]
 ; CHECK-MVE-NEXT: vmov q5, q1
 ; CHECK-MVE-NEXT: vmov q4, q0
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
 ; CHECK-MVE-NEXT: mov r5, r0
-; CHECK-MVE-NEXT: vmov.u16 r0, q4[0]
+; CHECK-MVE-NEXT: vmov.u16 r0, q4[1]
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
 ; CHECK-MVE-NEXT: mov r1, r5
 ; CHECK-MVE-NEXT: bl __aeabi_fsub
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
-; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: bic r0, r0, #-2147483648
-; CHECK-MVE-NEXT: bl __aeabi_f2h
 ; CHECK-MVE-NEXT: mov r5, r0
-; CHECK-MVE-NEXT: vmov.u16 r0, q5[1]
+; CHECK-MVE-NEXT: vmov.u16 r0, q5[0]
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
 ; CHECK-MVE-NEXT: mov r6, r0
-; CHECK-MVE-NEXT: vmov.u16 r0, q4[1]
+; CHECK-MVE-NEXT: vmov.u16 r0, q4[0]
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
 ; CHECK-MVE-NEXT: mov r1, r6
 ; CHECK-MVE-NEXT: bl __aeabi_fsub
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
-; CHECK-MVE-NEXT: vmov.16 q6[0], r5
-; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: bic r0, r0, #-2147483648
-; CHECK-MVE-NEXT: bl __aeabi_f2h
-; CHECK-MVE-NEXT: vmov.16 q6[1], r0
+; CHECK-MVE-NEXT: bfc r0, #15, #17
+; CHECK-MVE-NEXT: bfc r5, #15, #17
+; CHECK-MVE-NEXT: vmov.16 q6[0], r0
 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[2]
+; CHECK-MVE-NEXT: vmov.16 q6[1], r5
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
 ; CHECK-MVE-NEXT: mov r5, r0
 ; CHECK-MVE-NEXT: vmov.u16 r0, q4[2]
@@ -98,9 +94,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) {
 ; CHECK-MVE-NEXT: mov r1, r5
 ; CHECK-MVE-NEXT: bl __aeabi_fsub
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
-; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: bic r0, r0, #-2147483648
-; CHECK-MVE-NEXT: bl __aeabi_f2h
+; CHECK-MVE-NEXT: bfc r0, #15, #17
 ; CHECK-MVE-NEXT: vmov.16 q6[2], r0
 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[3]
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
@@ -110,9 +104,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) {
 ; CHECK-MVE-NEXT: mov r1, r5
 ; CHECK-MVE-NEXT: bl __aeabi_fsub
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
-; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: bic r0, r0, #-2147483648
-; CHECK-MVE-NEXT: bl __aeabi_f2h
+; CHECK-MVE-NEXT: bfc r0, #15, #17
 ; CHECK-MVE-NEXT: vmov.16 q6[3], r0
 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[4]
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
@@ -122,9 +114,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) {
 ; CHECK-MVE-NEXT: mov r1, r5
 ; CHECK-MVE-NEXT: bl __aeabi_fsub
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
-; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: bic r0, r0, #-2147483648
-; CHECK-MVE-NEXT: bl __aeabi_f2h
+; CHECK-MVE-NEXT: bfc r0, #15, #17
 ; CHECK-MVE-NEXT: vmov.16 q6[4], r0
 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[5]
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
@@ -134,9 +124,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) {
 ; CHECK-MVE-NEXT: mov r1, r5
 ; CHECK-MVE-NEXT: bl __aeabi_fsub
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
-; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: bic r0, r0, #-2147483648
-; CHECK-MVE-NEXT: bl __aeabi_f2h
+; CHECK-MVE-NEXT: bfc r0, #15, #17
 ; CHECK-MVE-NEXT: vmov.16 q6[5], r0
 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[6]
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
@@ -146,9 +134,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) {
 ; CHECK-MVE-NEXT: mov r1, r5
 ; CHECK-MVE-NEXT: bl __aeabi_fsub
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
-; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: bic r0, r0, #-2147483648
-; CHECK-MVE-NEXT: bl __aeabi_f2h
+; CHECK-MVE-NEXT: bfc r0, #15, #17
 ; CHECK-MVE-NEXT: vmov.16 q6[6], r0
 ; CHECK-MVE-NEXT: vmov.u16 r0, q5[7]
 ; CHECK-MVE-NEXT: bl __aeabi_h2f
@@ -158,9 +144,7 @@ define arm_aapcs_vfpcc void @vabd_v8f16(<8 x half> %x, <8 x half> %y, ptr %z) {
 ; CHECK-MVE-NEXT: mov r1, r5
 ; CHECK-MVE-NEXT: bl __aeabi_fsub
 ; CHECK-MVE-NEXT: bl __aeabi_f2h
-; CHECK-MVE-NEXT: bl __aeabi_h2f
-; CHECK-MVE-NEXT: bic r0, r0, #-2147483648
-; CHECK-MVE-NEXT: bl __aeabi_f2h
+; CHECK-MVE-NEXT: bfc r0, #15, #17
 ; CHECK-MVE-NEXT: vmov.16 q6[7], r0
 ; CHECK-MVE-NEXT: vstrw.32 q6, [r4]
 ; CHECK-MVE-NEXT: vpop {d8, d9, d10, d11, d12, d13}
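
Note on the test deltas above (commentary, not part of the patch): every hunk replaces an `__extendhfsf2` / sign-bit op / `__truncsfhf2` round trip with a single integer operation on the raw half bits. The new `lui a1, 8` / `xor` sequences flip bit 15 (0x8000, the binary16 sign bit) for fneg, while the `bfc r0, #15, #17` and `slli`+`srli` by 17 (RV32) or 49 (RV64) sequences clear it for fabs. A minimal standalone C++ sketch of the identity these sequences rely on, with hypothetical helper names:

```cpp
#include <cstdint>
#include <cstdio>

// Hypothetical helpers, not from the patch: operate directly on the raw
// IEEE-754 binary16 bits held in a uint16_t, as the updated tests do.
static uint16_t fabsHalfBits(uint16_t Bits) { return Bits & 0x7fff; } // clear sign bit 15
static uint16_t fnegHalfBits(uint16_t Bits) { return Bits ^ 0x8000; } // flip sign bit 15

int main() {
  const uint16_t NegTwo = 0xc000; // -2.0 in binary16 (sign=1, exp=16, mantissa=0)
  printf("fabs(-2.0) bits: 0x%04x\n", (unsigned)fabsHalfBits(NegTwo));
  printf("fneg(-2.0) bits: 0x%04x\n", (unsigned)fnegHalfBits(NegTwo));
  // Both print 0x4000, the bit pattern of +2.0: sign manipulation on a
  // soft-promoted half never needs the extend/truncate libcall round trip.
  return 0;
}
```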