diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index d5d54337306c0..98a06670e3d90 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -815,7 +815,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, if (Fix16BitCopies) { if (((Size == 16) != (SrcSize == 16))) { // Non-VGPR Src and Dst will later be expanded back to 32 bits. - assert(ST.hasTrue16BitInsts()); + assert(ST.useRealTrue16Insts()); Register &RegToFix = (Size == 32) ? DestReg : SrcReg; MCRegister SubReg = RI.getSubReg(RegToFix, AMDGPU::lo16); RegToFix = SubReg; @@ -989,7 +989,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } - if (ST.hasTrue16BitInsts()) { + if (ST.useRealTrue16Insts()) { if (IsSGPRSrc) { assert(SrcLow); SrcReg = NewSrcReg; @@ -5579,9 +5579,11 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { return ST.useRealTrue16Insts() ? AMDGPU::V_FLOOR_F16_t16_e64 : AMDGPU::V_FLOOR_F16_fake16_e64; case AMDGPU::S_TRUNC_F16: - return AMDGPU::V_TRUNC_F16_fake16_e64; + return ST.useRealTrue16Insts() ? AMDGPU::V_TRUNC_F16_t16_e64 + : AMDGPU::V_TRUNC_F16_fake16_e64; case AMDGPU::S_RNDNE_F16: - return AMDGPU::V_RNDNE_F16_fake16_e64; + return ST.useRealTrue16Insts() ? AMDGPU::V_RNDNE_F16_t16_e64 + : AMDGPU::V_RNDNE_F16_fake16_e64; case AMDGPU::S_ADD_F32: return AMDGPU::V_ADD_F32_e64; case AMDGPU::S_SUB_F32: return AMDGPU::V_SUB_F32_e64; case AMDGPU::S_MIN_F32: return AMDGPU::V_MIN_F32_e64; @@ -5589,17 +5591,27 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_MINIMUM_F32: return AMDGPU::V_MINIMUM_F32_e64; case AMDGPU::S_MAXIMUM_F32: return AMDGPU::V_MAXIMUM_F32_e64; case AMDGPU::S_MUL_F32: return AMDGPU::V_MUL_F32_e64; - case AMDGPU::S_ADD_F16: return AMDGPU::V_ADD_F16_fake16_e64; - case AMDGPU::S_SUB_F16: return AMDGPU::V_SUB_F16_fake16_e64; - case AMDGPU::S_MIN_F16: return AMDGPU::V_MIN_F16_fake16_e64; - case AMDGPU::S_MAX_F16: return AMDGPU::V_MAX_F16_fake16_e64; + case AMDGPU::S_ADD_F16: + return ST.useRealTrue16Insts() ? AMDGPU::V_ADD_F16_t16_e64 + : AMDGPU::V_ADD_F16_fake16_e64; + case AMDGPU::S_SUB_F16: + return ST.useRealTrue16Insts() ? AMDGPU::V_SUB_F16_t16_e64 + : AMDGPU::V_SUB_F16_fake16_e64; + case AMDGPU::S_MIN_F16: + return ST.useRealTrue16Insts() ? AMDGPU::V_MIN_F16_t16_e64 + : AMDGPU::V_MIN_F16_fake16_e64; + case AMDGPU::S_MAX_F16: + return ST.useRealTrue16Insts() ? AMDGPU::V_MAX_F16_t16_e64 + : AMDGPU::V_MAX_F16_fake16_e64; case AMDGPU::S_MINIMUM_F16: return ST.useRealTrue16Insts() ? AMDGPU::V_MINIMUM_F16_t16_e64 : AMDGPU::V_MINIMUM_F16_fake16_e64; case AMDGPU::S_MAXIMUM_F16: return ST.useRealTrue16Insts() ? AMDGPU::V_MAXIMUM_F16_t16_e64 : AMDGPU::V_MAXIMUM_F16_fake16_e64; - case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64; + case AMDGPU::S_MUL_F16: + return ST.useRealTrue16Insts() ? AMDGPU::V_MUL_F16_t16_e64 + : AMDGPU::V_MUL_F16_fake16_e64; case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64; case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64; case AMDGPU::S_FMAC_F16: @@ -5664,15 +5676,25 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { return ST.useRealTrue16Insts() ? AMDGPU::V_CMP_NLT_F16_t16_e64 : AMDGPU::V_CMP_NLT_F16_fake16_e64; case AMDGPU::V_S_EXP_F32_e64: return AMDGPU::V_EXP_F32_e64; - case AMDGPU::V_S_EXP_F16_e64: return AMDGPU::V_EXP_F16_fake16_e64; + case AMDGPU::V_S_EXP_F16_e64: + return ST.useRealTrue16Insts() ? AMDGPU::V_EXP_F16_t16_e64 + : AMDGPU::V_EXP_F16_fake16_e64; case AMDGPU::V_S_LOG_F32_e64: return AMDGPU::V_LOG_F32_e64; - case AMDGPU::V_S_LOG_F16_e64: return AMDGPU::V_LOG_F16_fake16_e64; + case AMDGPU::V_S_LOG_F16_e64: + return ST.useRealTrue16Insts() ? AMDGPU::V_LOG_F16_t16_e64 + : AMDGPU::V_LOG_F16_fake16_e64; case AMDGPU::V_S_RCP_F32_e64: return AMDGPU::V_RCP_F32_e64; - case AMDGPU::V_S_RCP_F16_e64: return AMDGPU::V_RCP_F16_fake16_e64; + case AMDGPU::V_S_RCP_F16_e64: + return ST.useRealTrue16Insts() ? AMDGPU::V_RCP_F16_t16_e64 + : AMDGPU::V_RCP_F16_fake16_e64; case AMDGPU::V_S_RSQ_F32_e64: return AMDGPU::V_RSQ_F32_e64; - case AMDGPU::V_S_RSQ_F16_e64: return AMDGPU::V_RSQ_F16_fake16_e64; + case AMDGPU::V_S_RSQ_F16_e64: + return ST.useRealTrue16Insts() ? AMDGPU::V_RSQ_F16_t16_e64 + : AMDGPU::V_RSQ_F16_fake16_e64; case AMDGPU::V_S_SQRT_F32_e64: return AMDGPU::V_SQRT_F32_e64; - case AMDGPU::V_S_SQRT_F16_e64: return AMDGPU::V_SQRT_F16_fake16_e64; + case AMDGPU::V_S_SQRT_F16_e64: + return ST.useRealTrue16Insts() ? AMDGPU::V_SQRT_F16_t16_e64 + : AMDGPU::V_SQRT_F16_fake16_e64; } llvm_unreachable( "Unexpected scalar opcode without corresponding vector one!"); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll index d81faf91801b0..235ec22ba5c60 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll @@ -3,7 +3,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define float @v_pow_f32(float %x, float %y) { ; GFX6-LABEL: v_pow_f32: @@ -371,19 +372,33 @@ define half @v_pow_f16(half %x, half %y) { ; GFX10-NEXT: v_exp_f16_e32 v0, v0 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_pow_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_log_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_pow_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_pow_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %pow = call half @llvm.pow.f16(half %x, half %y) ret half %pow } @@ -474,31 +489,54 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) { ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_pow_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_log_f16_e32 v2, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_log_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_exp_f16_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_pow_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v3, v1 :: v_dual_mul_dx9_zero_f32 v2, v0, v2 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_pow_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_log_f16_e32 v2, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y) ret <2 x half> %pow } @@ -597,33 +635,57 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) { ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_pow_v2f16_fneg_lhs: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_log_f16_e32 v2, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_log_f16_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_exp_f16_e32 v1, v1 -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_lhs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v3, v1 :: v_dual_mul_dx9_zero_f32 v2, v0, v2 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_lhs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_log_f16_e32 v2, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y) ret <2 x half> %pow @@ -723,32 +785,56 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) { ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_pow_v2f16_fneg_rhs: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_log_f16_e32 v2, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_log_f16_e32 v0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_exp_f16_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_rhs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v3, v1 :: v_dual_mul_dx9_zero_f32 v2, v0, v2 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_rhs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_log_f16_e32 v2, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x, <2 x half> %y.fneg) ret <2 x half> %pow @@ -856,34 +942,60 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) { ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 16, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_pow_v2f16_fneg_lhs_rhs: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_log_f16_e32 v2, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_log_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v3 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v2 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_exp_f16_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_pow_v2f16_fneg_lhs_rhs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.h, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v3, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v2.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v3, v1 :: v_dual_mul_dx9_zero_f32 v2, v0, v2 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v2 +; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_pow_v2f16_fneg_lhs_rhs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v0 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_log_f16_e32 v2, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v3, v3 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_mul_dx9_zero_f32 v1, v2, v1 :: v_dual_mul_dx9_zero_f32 v0, v0, v3 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_exp_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %x.fneg = fneg <2 x half> %x %y.fneg = fneg <2 x half> %y %pow = call <2 x half> @llvm.pow.v2f16(<2 x half> %x.fneg, <2 x half> %y.fneg) diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index 2cc5159c29f7f..e9877ae5144f5 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -2,7 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX89,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 %s define amdgpu_kernel void @fmul_f16( ; SI-LABEL: fmul_f16: @@ -54,29 +55,55 @@ define amdgpu_kernel void @fmul_f16( ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX89-NEXT: s_endpgm ; -; GFX11-LABEL: fmul_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fmul_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fmul_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) { @@ -127,23 +154,41 @@ define amdgpu_kernel void @fmul_f16_imm_a( ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX89-NEXT: s_endpgm ; -; GFX11-LABEL: fmul_f16_imm_a: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: v_mul_f16_e32 v0, 0x4200, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fmul_f16_imm_a: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x4200, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fmul_f16_imm_a: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) { entry: @@ -192,23 +237,41 @@ define amdgpu_kernel void @fmul_f16_imm_b( ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX89-NEXT: s_endpgm ; -; GFX11-LABEL: fmul_f16_imm_b: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: v_mul_f16_e32 v0, 4.0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fmul_f16_imm_b: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 4.0, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fmul_f16_imm_b: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, 4.0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll index b78ea1033baac..c6ea12dd61651 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll @@ -1,10 +1,14 @@ ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s declare half @llvm.amdgcn.rcp.f16(half %a) ; GCN-LABEL: {{^}}rcp_f16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; VI: v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GFX11-TRUE16: v_rcp_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l +; GFX11-FAKE16: v_rcp_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @rcp_f16( diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll index 8c003c96dddfe..0924e9a5c2314 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll @@ -1,10 +1,14 @@ ; RUN: llc -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11-FAKE16 %s declare half @llvm.amdgcn.rsq.f16(half %a) ; GCN-LABEL: {{^}}rsq_f16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; VI: v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]] +; GFX11-TRUE16: v_rsq_f16_e32 v[[A_F16:[0-9]+]].l, v[[A_F16]].l +; GFX11-FAKE16: v_rsq_f16_e32 v[[A_F16:[0-9]+]], v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @rsq_f16( diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll index bbfb88a4b22a3..aa994558162db 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sqrt.f16.ll @@ -1,6 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG-FAKE16 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11-GISEL-FAKE16 %s define half @v_sqrt_f16(half %src) { ; GCN-LABEL: v_sqrt_f16: @@ -8,6 +12,30 @@ define half @v_sqrt_f16(half %src) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sqrt_f16_e32 v0, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-TRUE16-LABEL: v_sqrt_f16: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_sqrt_f16: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: v_sqrt_f16: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: v_sqrt_f16: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %sqrt = call half @llvm.amdgcn.sqrt.f16(half %src) ret half %sqrt } @@ -18,6 +46,30 @@ define half @v_fabs_sqrt_f16(half %src) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sqrt_f16_e64 v0, |v0| ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-TRUE16-LABEL: v_fabs_sqrt_f16: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_sqrt_f16_e64 v0.l, |v0.l| +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_fabs_sqrt_f16: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_sqrt_f16_e64 v0, |v0| +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: v_fabs_sqrt_f16: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_sqrt_f16_e64 v0.l, |v0.l| +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: v_fabs_sqrt_f16: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_sqrt_f16_e64 v0, |v0| +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %fabs.src = call half @llvm.fabs.f16(half %src) %sqrt = call half @llvm.amdgcn.sqrt.f16(half %fabs.src) ret half %sqrt @@ -29,6 +81,30 @@ define half @v_fneg_fabs_sqrt_f16(half %src) { ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_sqrt_f16_e64 v0, -|v0| ; GCN-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-TRUE16-LABEL: v_fneg_fabs_sqrt_f16: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_sqrt_f16_e64 v0.l, -|v0.l| +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_fneg_fabs_sqrt_f16: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_sqrt_f16_e64 v0, -|v0| +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: v_fneg_fabs_sqrt_f16: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_sqrt_f16_e64 v0.l, -|v0.l| +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: v_fneg_fabs_sqrt_f16: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_sqrt_f16_e64 v0, -|v0| +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %fabs.src = call half @llvm.fabs.f16(half %src) %neg.fabs.src = fneg half %fabs.src %sqrt = call half @llvm.amdgcn.sqrt.f16(half %neg.fabs.src) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 0517e41e3d651..3faf84e5d58c8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=VI %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s declare half @llvm.maxnum.f16(half %a, half %b) declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -113,32 +114,60 @@ define amdgpu_kernel void @maxnum_f16( ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: maxnum_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: maxnum_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v0.h, v0.h +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: maxnum_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { @@ -228,25 +257,45 @@ define amdgpu_kernel void @maxnum_f16_imm_a( ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: maxnum_f16_imm_a: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, 0x4200, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: maxnum_f16_imm_a: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, 0x4200, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: maxnum_f16_imm_a: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, 0x4200, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { entry: @@ -334,25 +383,45 @@ define amdgpu_kernel void @maxnum_f16_imm_b( ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: maxnum_f16_imm_b: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_max_f16_e32 v0, 4.0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: maxnum_f16_imm_b: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, 4.0, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: maxnum_f16_imm_b: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, 4.0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll index 795ed6d542a13..3a2bf9d009460 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -2,7 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck --check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FAKE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s declare half @llvm.rint.f16(half %a) @@ -47,23 +48,41 @@ define amdgpu_kernel void @rint_f16( ; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX89-NEXT: s_endpgm ; -; GFX11-LABEL: rint_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_rndne_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: rint_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: rint_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_rndne_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: rint_f16: ; GFX12: ; %bb.0: ; %entry @@ -166,27 +185,49 @@ define amdgpu_kernel void @rint_v2f16( ; GFX9-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: rint_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_rndne_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_rndne_f16_e32 v1, v1 -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: rint_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_rndne_f16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: rint_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_rndne_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_rndne_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: rint_v2f16: ; GFX12: ; %bb.0: ; %entry diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index 0d58afd1812de..c1ba985d37453 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s declare half @llvm.trunc.f16(half %a) @@ -46,23 +47,41 @@ define amdgpu_kernel void @trunc_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: trunc_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_trunc_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: trunc_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: trunc_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: trunc_f16: ; GFX12: ; %bb.0: ; %entry @@ -145,27 +164,49 @@ define amdgpu_kernel void @trunc_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: trunc_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_trunc_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_trunc_f16_e32 v1, v1 -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: trunc_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: trunc_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_trunc_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ; ; GFX12-LABEL: trunc_v2f16: ; GFX12: ; %bb.0: ; %entry