diff --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll index 867025adca944..644c88457714b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.f16.ll @@ -3,8 +3,10 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-FAKE16 %s define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: cos_f16: @@ -69,31 +71,57 @@ define amdgpu_kernel void @cos_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: cos_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cos_f16_e32 v1, v1 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: cos_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm ; -; GFX12-LABEL: cos_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_cos_f16_e32 v1, v1 -; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX12-NEXT: s_endpgm +; GFX11-FAKE16-LABEL: cos_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cos_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm +; +; GFX12-TRUE16-LABEL: cos_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: cos_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_cos_f16_e32 v1, v1 +; GFX12-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-FAKE16-NEXT: s_endpgm %a.val = load half, ptr addrspace(1) %a %r.val = call half @llvm.cos.f16(half %a.val) store half %r.val, ptr addrspace(1) %r @@ -184,42 +212,79 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: cos_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 -; GFX11-NEXT: v_cos_f16_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cos_f16_e32 v2, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: cos_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l +; GFX11-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: cos_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; GFX11-FAKE16-NEXT: v_cos_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cos_f16_e32 v2, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm +; +; GFX12-TRUE16-LABEL: cos_v2f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3] +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l +; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l +; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: v_cos_f16_e32 v0.h, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-TRUE16-NEXT: s_endpgm ; -; GFX12-LABEL: cos_v2f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 -; GFX12-NEXT: v_cos_f16_e32 v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX12-NEXT: v_cos_f16_e32 v2, v2 -; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_endpgm +; GFX12-FAKE16-LABEL: cos_v2f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; GFX12-FAKE16-NEXT: v_cos_f16_e32 v1, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-FAKE16-NEXT: v_cos_f16_e32 v2, v2 +; GFX12-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-FAKE16-NEXT: s_endpgm %a.val = load <2 x half>, ptr addrspace(1) %a %r.val = call <2 x half> @llvm.cos.v2f16(<2 x half> %a.val) store <2 x half> %r.val, ptr addrspace(1) %r @@ -228,3 +293,6 @@ define amdgpu_kernel void @cos_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { declare half @llvm.cos.f16(half %a) declare <2 x half> @llvm.cos.v2f16(<2 x half> %a) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll index 814f44477f528..61991c8b409dd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll @@ -5,8 +5,10 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=VI-DENORM %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-FLUSH %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DENORM %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-DENORM %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH,GFX11-FLUSH-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FLUSH,GFX11-FLUSH-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-DENORM,GFX11-DENORM-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -denormal-fp-math=ieee -denormal-fp-math-f32=preserve-sign -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-DENORM,GFX11-DENORM-FAKE16 %s declare half @llvm.fmuladd.f16(half %a, half %b, half %c) declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) @@ -160,64 +162,122 @@ define amdgpu_kernel void @fmuladd_f16( ; GFX10-DENORM-NEXT: buffer_store_short v2, off, s[0:3], 0 ; GFX10-DENORM-NEXT: s_endpgm ; -; GFX11-FLUSH-LABEL: fmuladd_f16: -; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 -; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 -; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX11-FLUSH-NEXT: s_mov_b32 s18, s10 -; GFX11-FLUSH-NEXT: s_mov_b32 s19, s11 -; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2 -; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3 -; GFX11-FLUSH-NEXT: s_mov_b32 s16, s4 -; GFX11-FLUSH-NEXT: s_mov_b32 s17, s5 -; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 -; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[16:19], 0 -; GFX11-FLUSH-NEXT: s_mov_b32 s4, s6 -; GFX11-FLUSH-NEXT: s_mov_b32 s5, s7 -; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10 -; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11 -; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0 -; GFX11-FLUSH-NEXT: buffer_load_u16 v2, off, s[4:7], 0 -; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1 -; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(1) -; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-FLUSH-NEXT: s_endpgm +; GFX11-FLUSH-TRUE16-LABEL: fmuladd_f16: +; GFX11-FLUSH-TRUE16: ; %bb.0: +; GFX11-FLUSH-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s18, s10 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s19, s11 +; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s16, s4 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s17, s5 +; GFX11-FLUSH-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-FLUSH-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FLUSH-TRUE16-NEXT: buffer_load_u16 v2, off, s[12:15], 0 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-FLUSH-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FLUSH-TRUE16-NEXT: s_endpgm ; -; GFX11-DENORM-LABEL: fmuladd_f16: -; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 -; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 -; GFX11-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX11-DENORM-NEXT: s_mov_b32 s18, s10 -; GFX11-DENORM-NEXT: s_mov_b32 s19, s11 -; GFX11-DENORM-NEXT: s_mov_b32 s22, s10 -; GFX11-DENORM-NEXT: s_mov_b32 s23, s11 -; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_mov_b32 s12, s2 -; GFX11-DENORM-NEXT: s_mov_b32 s13, s3 -; GFX11-DENORM-NEXT: s_mov_b32 s16, s4 -; GFX11-DENORM-NEXT: s_mov_b32 s17, s5 -; GFX11-DENORM-NEXT: s_mov_b32 s20, s6 -; GFX11-DENORM-NEXT: s_mov_b32 s21, s7 -; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 -; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[16:19], 0 -; GFX11-DENORM-NEXT: buffer_load_u16 v2, off, s[20:23], 0 -; GFX11-DENORM-NEXT: s_mov_b32 s8, s0 -; GFX11-DENORM-NEXT: s_mov_b32 s9, s1 -; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-NEXT: v_fmac_f16_e32 v2, v0, v1 -; GFX11-DENORM-NEXT: buffer_store_b16 v2, off, s[8:11], 0 -; GFX11-DENORM-NEXT: s_endpgm +; GFX11-FLUSH-FAKE16-LABEL: fmuladd_f16: +; GFX11-FLUSH-FAKE16: ; %bb.0: +; GFX11-FLUSH-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX11-FLUSH-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-FLUSH-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s4, s6 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s5, s7 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FLUSH-FAKE16-NEXT: buffer_load_u16 v2, off, s[4:7], 0 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FLUSH-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-FLUSH-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FLUSH-FAKE16-NEXT: s_endpgm +; +; GFX11-DENORM-TRUE16-LABEL: fmuladd_f16: +; GFX11-DENORM-TRUE16: ; %bb.0: +; GFX11-DENORM-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s18, s10 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s19, s11 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s22, s10 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX11-DENORM-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s16, s4 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s17, s5 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s20, s6 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s21, s7 +; GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-DENORM-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l +; GFX11-DENORM-TRUE16-NEXT: buffer_store_b16 v2, off, s[8:11], 0 +; GFX11-DENORM-TRUE16-NEXT: s_endpgm +; +; GFX11-DENORM-FAKE16-LABEL: fmuladd_f16: +; GFX11-DENORM-FAKE16: ; %bb.0: +; GFX11-DENORM-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX11-DENORM-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 +; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-DENORM-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-FAKE16-NEXT: v_fmac_f16_e32 v2, v0, v1 +; GFX11-DENORM-FAKE16-NEXT: buffer_store_b16 v2, off, s[8:11], 0 +; GFX11-DENORM-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -353,55 +413,105 @@ define amdgpu_kernel void @fmuladd_f16_imm_a( ; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-DENORM-NEXT: s_endpgm ; -; GFX11-FLUSH-LABEL: fmuladd_f16_imm_a: -; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FLUSH-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 -; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 -; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10 -; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11 -; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2 -; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3 -; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0 -; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1 -; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-FLUSH-NEXT: s_endpgm +; GFX11-FLUSH-TRUE16-LABEL: fmuladd_f16_imm_a: +; GFX11-FLUSH-TRUE16: ; %bb.0: +; GFX11-FLUSH-TRUE16-NEXT: s_clause 0x1 +; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FLUSH-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x4200, v0.l +; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-FLUSH-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FLUSH-TRUE16-NEXT: s_endpgm ; -; GFX11-DENORM-LABEL: fmuladd_f16_imm_a: -; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-DENORM-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 -; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 -; GFX11-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX11-DENORM-NEXT: s_mov_b32 s6, s10 -; GFX11-DENORM-NEXT: s_mov_b32 s7, s11 -; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_mov_b32 s12, s2 -; GFX11-DENORM-NEXT: s_mov_b32 s13, s3 -; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-NEXT: s_mov_b32 s8, s0 -; GFX11-DENORM-NEXT: s_mov_b32 s9, s1 -; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0 -; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0 -; GFX11-DENORM-NEXT: s_endpgm +; GFX11-FLUSH-FAKE16-LABEL: fmuladd_f16_imm_a: +; GFX11-FLUSH-FAKE16: ; %bb.0: +; GFX11-FLUSH-FAKE16-NEXT: s_clause 0x1 +; GFX11-FLUSH-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FLUSH-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FLUSH-FAKE16-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FLUSH-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FLUSH-FAKE16-NEXT: s_endpgm +; +; GFX11-DENORM-TRUE16-LABEL: fmuladd_f16_imm_a: +; GFX11-DENORM-TRUE16: ; %bb.0: +; GFX11-DENORM-TRUE16-NEXT: s_clause 0x1 +; GFX11-DENORM-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DENORM-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-DENORM-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-DENORM-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-DENORM-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-DENORM-TRUE16-NEXT: v_fmac_f16_e32 v1.l, 0x4200, v0.l +; GFX11-DENORM-TRUE16-NEXT: buffer_store_b16 v1, off, s[8:11], 0 +; GFX11-DENORM-TRUE16-NEXT: s_endpgm +; +; GFX11-DENORM-FAKE16-LABEL: fmuladd_f16_imm_a: +; GFX11-DENORM-FAKE16: ; %bb.0: +; GFX11-DENORM-FAKE16-NEXT: s_clause 0x1 +; GFX11-DENORM-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DENORM-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-DENORM-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-DENORM-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-DENORM-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-DENORM-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x4200, v0 +; GFX11-DENORM-FAKE16-NEXT: buffer_store_b16 v1, off, s[8:11], 0 +; GFX11-DENORM-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, ptr addrspace(1) %c) { @@ -535,55 +645,105 @@ define amdgpu_kernel void @fmuladd_f16_imm_b( ; GFX10-DENORM-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-DENORM-NEXT: s_endpgm ; -; GFX11-FLUSH-LABEL: fmuladd_f16_imm_b: -; GFX11-FLUSH: ; %bb.0: -; GFX11-FLUSH-NEXT: s_clause 0x1 -; GFX11-FLUSH-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-FLUSH-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-FLUSH-NEXT: s_mov_b32 s10, -1 -; GFX11-FLUSH-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-FLUSH-NEXT: s_mov_b32 s14, s10 -; GFX11-FLUSH-NEXT: s_mov_b32 s15, s11 -; GFX11-FLUSH-NEXT: s_mov_b32 s6, s10 -; GFX11-FLUSH-NEXT: s_mov_b32 s7, s11 -; GFX11-FLUSH-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FLUSH-NEXT: s_mov_b32 s12, s2 -; GFX11-FLUSH-NEXT: s_mov_b32 s13, s3 -; GFX11-FLUSH-NEXT: s_mov_b32 s8, s0 -; GFX11-FLUSH-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) -; GFX11-FLUSH-NEXT: s_mov_b32 s9, s1 -; GFX11-FLUSH-NEXT: v_mul_f16_e32 v0, 0x4200, v0 -; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FLUSH-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX11-FLUSH-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-FLUSH-NEXT: s_endpgm +; GFX11-FLUSH-TRUE16-LABEL: fmuladd_f16_imm_b: +; GFX11-FLUSH-TRUE16: ; %bb.0: +; GFX11-FLUSH-TRUE16-NEXT: s_clause 0x1 +; GFX11-FLUSH-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FLUSH-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FLUSH-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FLUSH-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FLUSH-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x4200, v0.l +; GFX11-FLUSH-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-FLUSH-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FLUSH-TRUE16-NEXT: s_endpgm ; -; GFX11-DENORM-LABEL: fmuladd_f16_imm_b: -; GFX11-DENORM: ; %bb.0: -; GFX11-DENORM-NEXT: s_clause 0x1 -; GFX11-DENORM-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-DENORM-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-DENORM-NEXT: s_mov_b32 s10, -1 -; GFX11-DENORM-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-DENORM-NEXT: s_mov_b32 s14, s10 -; GFX11-DENORM-NEXT: s_mov_b32 s15, s11 -; GFX11-DENORM-NEXT: s_mov_b32 s6, s10 -; GFX11-DENORM-NEXT: s_mov_b32 s7, s11 -; GFX11-DENORM-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-DENORM-NEXT: s_mov_b32 s12, s2 -; GFX11-DENORM-NEXT: s_mov_b32 s13, s3 -; GFX11-DENORM-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-DENORM-NEXT: s_waitcnt vmcnt(0) -; GFX11-DENORM-NEXT: s_mov_b32 s8, s0 -; GFX11-DENORM-NEXT: s_mov_b32 s9, s1 -; GFX11-DENORM-NEXT: v_fmac_f16_e32 v1, 0x4200, v0 -; GFX11-DENORM-NEXT: buffer_store_b16 v1, off, s[8:11], 0 -; GFX11-DENORM-NEXT: s_endpgm +; GFX11-FLUSH-FAKE16-LABEL: fmuladd_f16_imm_b: +; GFX11-FLUSH-FAKE16: ; %bb.0: +; GFX11-FLUSH-FAKE16-NEXT: s_clause 0x1 +; GFX11-FLUSH-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FLUSH-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FLUSH-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FLUSH-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FLUSH-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FLUSH-FAKE16-NEXT: v_mul_f16_e32 v0, 0x4200, v0 +; GFX11-FLUSH-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FLUSH-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FLUSH-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FLUSH-FAKE16-NEXT: s_endpgm +; +; GFX11-DENORM-TRUE16-LABEL: fmuladd_f16_imm_b: +; GFX11-DENORM-TRUE16: ; %bb.0: +; GFX11-DENORM-TRUE16-NEXT: s_clause 0x1 +; GFX11-DENORM-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DENORM-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-DENORM-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-DENORM-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-DENORM-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-DENORM-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-DENORM-TRUE16-NEXT: v_fmac_f16_e32 v1.l, 0x4200, v0.l +; GFX11-DENORM-TRUE16-NEXT: buffer_store_b16 v1, off, s[8:11], 0 +; GFX11-DENORM-TRUE16-NEXT: s_endpgm +; +; GFX11-DENORM-FAKE16-LABEL: fmuladd_f16_imm_b: +; GFX11-DENORM-FAKE16: ; %bb.0: +; GFX11-DENORM-FAKE16-NEXT: s_clause 0x1 +; GFX11-DENORM-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-DENORM-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-DENORM-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-DENORM-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-DENORM-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-DENORM-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-DENORM-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x4200, v0 +; GFX11-DENORM-FAKE16-NEXT: buffer_store_b16 v1, off, s[8:11], 0 +; GFX11-DENORM-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %c) { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index 480d978fa530b..d329a7428115a 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -8,8 +8,10 @@ ; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GFX9CHECK,GFX9GLISEL %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck --check-prefixes=GFX10CHECK,GFX10SELDAG %s ; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck --check-prefixes=GFX10CHECK,GFX10GLISEL %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11SELDAG %s -; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11GLISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11SELDAG,GFX11SELDAG-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11SELDAG,GFX11SELDAG-FAKE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11GLISEL,GFX11GLISEL-TRUE16 %s +; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11GLISEL,GFX11GLISEL-FAKE16 %s define amdgpu_kernel void @sgpr_isnan_bf16(ptr addrspace(1) %out, bfloat %x) { ; GFX7CHECK-LABEL: sgpr_isnan_bf16: @@ -203,15 +205,25 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: snan_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0 -; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: snan_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: snan_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1) ; 0x001 ret i1 %1 } @@ -253,13 +265,21 @@ define i1 @qnan_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: qnan_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: qnan_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: qnan_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 2) ; 0x002 ret i1 %1 } @@ -298,12 +318,19 @@ define i1 @posinf_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: posinf_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: posinf_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: posinf_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 512) ; 0x200 ret i1 %1 } @@ -342,12 +369,19 @@ define i1 @neginf_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: neginf_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: neginf_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: neginf_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 4) ; 0x004 ret i1 %1 } @@ -403,16 +437,27 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: posnormal_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 -; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1 -; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: posnormal_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, 0xff80, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v0.h +; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: posnormal_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, 0xff80, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1 +; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 256) ; 0x100 ret i1 %1 } @@ -468,16 +513,27 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: negnormal_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 -; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1 -; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: negnormal_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, 0xff80, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v0.h +; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: negnormal_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, 0xff80, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1 +; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 8) ; 0x008 ret i1 %1 } @@ -521,13 +577,21 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: possubnormal_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, -1 -; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: possubnormal_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, -1 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: possubnormal_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v0, v0, -1 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 128) ; 0x080 ret i1 %1 } @@ -582,16 +646,27 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: negsubnormal_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v1, v1, -1 -; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v1 -; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: negsubnormal_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v0.h, -1 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v0.h +; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: negsubnormal_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, v1, -1 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v1 +; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 16) ; 0x010 ret i1 %1 } @@ -627,12 +702,19 @@ define i1 @poszero_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: poszero_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: poszero_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: poszero_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 64) ; 0x040 ret i1 %1 } @@ -671,12 +753,19 @@ define i1 @negzero_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: negzero_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: negzero_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: negzero_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x8000, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 32) ; 0x020 ret i1 %1 } @@ -715,12 +804,19 @@ define i1 @posfinite_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: posfinite_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: posfinite_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: posfinite_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 448) ; 0x1c0 ret i1 %1 } @@ -771,15 +867,25 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: negfinite_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v1 -; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: negfinite_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v0.h +; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: negfinite_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v1 +; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 56) ; 0x038 ret i1 %1 } @@ -821,13 +927,21 @@ define i1 @isnan_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isnan_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnan_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnan_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 3) ; nan ret i1 %1 } @@ -869,13 +983,21 @@ define i1 @not_isnan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_isnan_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_isnan_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_isnan_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1020) ; ~nan ret i1 %class } @@ -929,16 +1051,27 @@ define <2 x i1> @isnan_v2bf16(<2 x bfloat> %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isnan_v2bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnan_v2bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11SELDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnan_v2bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call <2 x i1> @llvm.is.fpclass.v2bf16(<2 x bfloat> %x, i32 3) ; nan ret <2 x i1> %1 } @@ -1005,19 +1138,33 @@ define <3 x i1> @isnan_v3bf16(<3 x bfloat> %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isnan_v3bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX11CHECK-NEXT: v_and_b32_e32 v3, 0x7fff, v1 -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v2 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnan_v3bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff, v1 +; GFX11SELDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v2.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnan_v3bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff, v1 +; GFX11SELDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v2 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call <3 x i1> @llvm.is.fpclass.v3bf16(<3 x bfloat> %x, i32 3) ; nan ret <3 x i1> %1 } @@ -1095,22 +1242,39 @@ define <4 x i1> @isnan_v4bf16(<4 x bfloat> %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isnan_v4bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11CHECK-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v4 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnan_v4bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11SELDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v4.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnan_v4bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11SELDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v4 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call <4 x i1> @llvm.is.fpclass.v4bf16(<4 x bfloat> %x, i32 3) ; nan ret <4 x i1> %1 } @@ -1158,13 +1322,21 @@ define i1 @isinf_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isinf_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isinf_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isinf_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 516) ; 0x204 = "inf" ret i1 %1 } @@ -1206,13 +1378,21 @@ define i1 @isfinite_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isfinite_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isfinite_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isfinite_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 504) ; 0x1f8 = "finite" ret i1 %1 } @@ -1252,13 +1432,21 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: issubnormal_or_zero_bf16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: issubnormal_or_zero_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: issubnormal_or_zero_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 240) ; 0xf0 = "subnormal|zero" ret i1 %class @@ -1299,13 +1487,21 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_issubnormal_or_zero_bf16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_issubnormal_or_zero_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_issubnormal_or_zero_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 783) ; ~0xf0 = "~(subnormal|zero)" ret i1 %class @@ -1353,14 +1549,23 @@ define i1 @isnormal_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isnormal_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0 -; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnormal_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xff80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnormal_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v0, 0xff80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 264) ; 0x108 = "normal" ret i1 %class } @@ -1407,14 +1612,23 @@ define i1 @not_isnormal_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_isnormal_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0 -; GFX11CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_isnormal_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xff80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_isnormal_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v0, 0xff80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 759) ; ~0x108 = "~normal" ret i1 %class } @@ -1470,16 +1684,27 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_is_plus_normal_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 -; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1 -; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_is_plus_normal_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, 0xff80, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v0.h +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_is_plus_normal_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, 0xff80, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 767) ; ~0x100 = ~"+normal" ret i1 %class } @@ -1535,16 +1760,27 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_is_neg_normal_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 -; GFX11CHECK-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1 -; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_is_neg_normal_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, 0xff80, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v0.h +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_is_neg_normal_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, 0xff80, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1015) ; ~0x008 = ~"-normal" ret i1 %class } @@ -1590,14 +1826,23 @@ define i1 @issubnormal_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: issubnormal_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, -1 -; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: issubnormal_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, -1 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: issubnormal_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v0, v0, -1 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 144) ; 0x90 = "subnormal" ret i1 %class } @@ -1643,14 +1888,23 @@ define i1 @not_issubnormal_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_issubnormal_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v0, v0, -1 -; GFX11CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_issubnormal_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, -1 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_issubnormal_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v0, v0, -1 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7e, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 879) ; ~0x90 = ~"subnormal" ret i1 %class } @@ -1689,13 +1943,21 @@ define i1 @iszero_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: iszero_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: iszero_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: iszero_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 96) ; 0x60 = "zero" ret i1 %class } @@ -1734,13 +1996,21 @@ define i1 @not_iszero_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_iszero_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_iszero_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_iszero_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 927) ; ~0x60 = ~"zero" ret i1 %class } @@ -1779,12 +2049,19 @@ define i1 @ispositive_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: ispositive_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: ispositive_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: ispositive_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 960) ; fcPositive ret i1 %class } @@ -1855,19 +2132,33 @@ define i1 @not_ispositive_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_ispositive_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s1, 0xff80, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v1 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s2, 0x7f80, v1 -; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: s_or_b32 s0, s0, s1 -; GFX11CHECK-NEXT: s_or_b32 s0, s0, s2 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_ispositive_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0xff80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e64 s2, 0x7f80, v0.h +; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_ispositive_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s1, 0xff80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e64 s2, 0x7f80, v1 +; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 63) ; ~fcPositive ret i1 %class } @@ -1930,17 +2221,29 @@ define i1 @isnegative_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isnegative_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s1, 0xff80, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v1 -; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: s_or_b32 s0, s0, s1 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnegative_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0xff80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v0.h +; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnegative_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s1, 0xff80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v1 +; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 60) ; fcNegative ret i1 %class } @@ -1994,15 +2297,25 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_isnegative_bf16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v1 -; GFX11CHECK-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_isnegative_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0.h +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_isnegative_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v1 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 963) ; ~fcNegative ret i1 %class } @@ -2052,15 +2365,25 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: iszero_or_nan_bf16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 -; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99) ; 0x60|0x3 = "zero|nan" ret i1 %0 @@ -2111,15 +2434,25 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: iszero_or_nan_f_daz: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 -; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f_daz: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f_daz: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99) ; 0x60|0x3 = "zero|nan" ret i1 %0 @@ -2170,15 +2503,25 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: iszero_or_nan_f_maybe_daz: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 -; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f_maybe_daz: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f_maybe_daz: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99) ; 0x60|0x3 = "zero|nan" ret i1 %0 @@ -2229,15 +2572,25 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_iszero_or_nan_bf16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 -; GFX11CHECK-NEXT: v_cmp_ne_u16_e64 s0, 0, v0 -; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_ne_u16_e64 s0, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_ne_u16_e64 s0, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924) ; ~0x60 = "~(zero|nan)" ret i1 %0 @@ -2288,15 +2641,25 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_iszero_or_nan_f_daz: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 -; GFX11CHECK-NEXT: v_cmp_ne_u16_e64 s0, 0, v0 -; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f_daz: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_ne_u16_e64 s0, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f_daz: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_ne_u16_e64 s0, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924) ; ~(0x60|0x3) = "~(zero|nan)" ret i1 %0 @@ -2347,15 +2710,25 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_iszero_or_nan_f_maybe_daz: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 -; GFX11CHECK-NEXT: v_cmp_ne_u16_e64 s0, 0, v0 -; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f_maybe_daz: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_ne_u16_e64 s0, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f_maybe_daz: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_ne_u16_e64 s0, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924) ; ~(0x60|0x3) = "~(zero|nan)" ret i1 %0 @@ -2406,15 +2779,25 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: iszero_or_qnan_bf16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 -; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: iszero_or_qnan_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: iszero_or_qnan_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 98) ; 0x60|0x2 = "zero|qnan" ret i1 %0 @@ -2476,17 +2859,29 @@ define i1 @iszero_or_snan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: iszero_or_snan_bf16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s1, 0, v0 -; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: s_or_b32 s0, s1, s0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: iszero_or_snan_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: iszero_or_snan_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 97) ; 0x60|0x1 = "zero|snan" ret i1 %0 @@ -2579,23 +2974,41 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_iszero_or_qnan_bf16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v1, v0, -1 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e64 s1, 0x7f80, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0 -; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s2, 0x7f, v1 -; GFX11CHECK-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0 -; GFX11CHECK-NEXT: s_or_b32 s1, s2, s1 -; GFX11CHECK-NEXT: s_or_b32 s0, s1, s0 -; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_qnan_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v0.l, -1 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xff80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e64 s2, 0x7f, v0.h +; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0.l +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_qnan_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, v0, -1 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s1, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v0, 0xff80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e64 s2, 0x7f, v1 +; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s1, s2, s1 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 925) ; ~(0x60|0x2) = "~(zero|qnan)" ret i1 %0 @@ -2680,21 +3093,37 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_iszero_or_snan_bf16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_add_nc_u16 v1, v0, -1 -; GFX11CHECK-NEXT: v_add_nc_u16 v2, 0xff80, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e64 s1, 0x7fbf, v0 -; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v1 -; GFX11CHECK-NEXT: v_cmp_gt_u16_e64 s2, 0x7f00, v2 -; GFX11CHECK-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11CHECK-NEXT: s_or_b32 s0, s0, s1 -; GFX11CHECK-NEXT: s_or_b32 s0, s0, s2 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_snan_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v0.l, -1 +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, 0xff80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e64 s1, 0x7fbf, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e64 s2, 0x7f00, v1.l +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_snan_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, v0, -1 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v2, 0xff80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e64 s1, 0x7fbf, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e64 s2, 0x7f00, v2 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, s1 +; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, s2 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 926) ; ~(0x60|0x1) = "~(zero|snan)" ret i1 %0 @@ -2737,13 +3166,21 @@ define i1 @isinf_or_nan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isinf_or_nan_bf16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isinf_or_nan_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isinf_or_nan_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f7f, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 519) ; 0x204|0x3 = "inf|nan" ret i1 %0 @@ -2786,13 +3223,21 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_isinf_or_nan_bf16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_isinf_or_nan_bf16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_isinf_or_nan_bf16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 504) ; ~(0x204|0x3) = "~(inf|nan)" ret i1 %0 @@ -2835,13 +3280,21 @@ define i1 @isfinite_or_nan_f(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isfinite_or_nan_f: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isfinite_or_nan_f: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isfinite_or_nan_f: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 507) ; 0x1f8|0x3 = "finite|nan" ret i1 %0 @@ -2884,13 +3337,21 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_isfinite_or_nan_f: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_isfinite_or_nan_f: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_isfinite_or_nan_f: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 516) ; ~(0x1f8|0x3) = "~(finite|nan)" ret i1 %0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index 1d869559d9e77..8c0393b627110 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -7,8 +7,10 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GFX9CHECK,GFX9GLISEL %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck --check-prefixes=GFX10CHECK,GFX10SELDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck --check-prefixes=GFX10CHECK,GFX10GLISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11SELDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11GLISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11SELDAG,GFX11SELDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11SELDAG,GFX11SELDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11GLISEL,GFX11GLISEL-TRUE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefixes=GFX11CHECK,GFX11GLISEL,GFX11GLISEL-FAKE16 %s define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX7SELDAG-LABEL: sgpr_isnan_f16: @@ -76,17 +78,42 @@ define amdgpu_kernel void @sgpr_isnan_f16(ptr addrspace(1) %out, half %x) { ; GFX10CHECK-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10CHECK-NEXT: s_endpgm ; -; GFX11CHECK-LABEL: sgpr_isnan_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_clause 0x1 -; GFX11CHECK-NEXT: s_load_b32 s2, s[4:5], 0x2c -; GFX11CHECK-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11CHECK-NEXT: v_mov_b32_e32 v0, 0 -; GFX11CHECK-NEXT: s_waitcnt lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s2, s2, 3 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 -; GFX11CHECK-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11CHECK-NEXT: s_endpgm +; GFX11SELDAG-TRUE16-LABEL: sgpr_isnan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_clause 0x1 +; GFX11SELDAG-TRUE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11SELDAG-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s2, v0.l, 3 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1, s2 +; GFX11SELDAG-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11SELDAG-TRUE16-NEXT: s_endpgm +; +; GFX11SELDAG-FAKE16-LABEL: sgpr_isnan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_clause 0x1 +; GFX11SELDAG-FAKE16-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11SELDAG-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11SELDAG-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s2, s2, 3 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 +; GFX11SELDAG-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11SELDAG-FAKE16-NEXT: s_endpgm +; +; GFX11GLISEL-LABEL: sgpr_isnan_f16: +; GFX11GLISEL: ; %bb.0: +; GFX11GLISEL-NEXT: s_clause 0x1 +; GFX11GLISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c +; GFX11GLISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX11GLISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11GLISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s2, s2, 3 +; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s2 +; GFX11GLISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11GLISEL-NEXT: s_endpgm %result = call i1 @llvm.is.fpclass.f16(half %x, i32 3) %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -208,12 +235,33 @@ define i1 @snan_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: snan_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 1 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: snan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 1 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: snan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: snan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 1 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: snan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 1 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 1) ; 0x001 ret i1 %1 } @@ -260,12 +308,33 @@ define i1 @qnan_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: qnan_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 2 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: qnan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 2 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: qnan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 2 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: qnan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 2 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: qnan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 2 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 2) ; 0x002 ret i1 %1 } @@ -312,12 +381,33 @@ define i1 @posinf_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: posinf_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x200 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: posinf_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x200 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: posinf_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x200 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: posinf_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x200 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: posinf_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x200 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 512) ; 0x200 ret i1 %1 } @@ -362,12 +452,33 @@ define i1 @neginf_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: neginf_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 4 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: neginf_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 4 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: neginf_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 4 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: neginf_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 4 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: neginf_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 4 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 4) ; 0x004 ret i1 %1 } @@ -426,12 +537,33 @@ define i1 @posnormal_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: posnormal_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x100 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: posnormal_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x100 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: posnormal_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x100 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: posnormal_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x100 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: posnormal_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x100 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 256) ; 0x100 ret i1 %1 } @@ -488,12 +620,33 @@ define i1 @negnormal_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: negnormal_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 8 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: negnormal_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 8 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: negnormal_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 8 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: negnormal_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 8 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: negnormal_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 8 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 8) ; 0x008 ret i1 %1 } @@ -543,12 +696,33 @@ define i1 @possubnormal_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: possubnormal_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x80 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: possubnormal_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x80 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: possubnormal_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x80 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: possubnormal_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x80 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: possubnormal_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x80 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 128) ; 0x080 ret i1 %1 } @@ -604,12 +778,33 @@ define i1 @negsubnormal_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: negsubnormal_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 16 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: negsubnormal_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 16 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: negsubnormal_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 16 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: negsubnormal_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 16 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: negsubnormal_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 16 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 16) ; 0x010 ret i1 %1 } @@ -652,12 +847,33 @@ define i1 @poszero_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: poszero_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 64 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: poszero_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 64 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: poszero_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 64 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: poszero_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 64 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: poszero_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 64 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 64) ; 0x040 ret i1 %1 } @@ -702,12 +918,33 @@ define i1 @negzero_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: negzero_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 32 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: negzero_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 32 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: negzero_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 32 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: negzero_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 32 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: negzero_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 32 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 32) ; 0x020 ret i1 %1 } @@ -754,12 +991,33 @@ define i1 @posfinite_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: posfinite_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1c0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: posfinite_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1c0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: posfinite_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1c0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: posfinite_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1c0 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: posfinite_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1c0 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 448) ; 0x1c0 ret i1 %1 } @@ -812,12 +1070,33 @@ define i1 @negfinite_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: negfinite_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 56 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: negfinite_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 56 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: negfinite_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 56 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: negfinite_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 56 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: negfinite_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 56 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 56) ; 0x038 ret i1 %1 } @@ -864,12 +1143,33 @@ define i1 @isnan_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isnan_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 3 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 3 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: isnan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: isnan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 3 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 3) ; nan ret i1 %1 } @@ -918,12 +1218,33 @@ define i1 @not_isnan_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_isnan_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3fc -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_isnan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3fc +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_isnan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3fc +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_isnan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3fc +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_isnan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3fc +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = call i1 @llvm.is.fpclass.f16(half %x, i32 1020) ; ~nan ret i1 %class } @@ -1018,25 +1339,45 @@ define <2 x i1> @isnan_v2f16(<2 x half> %x) nounwind { ; GFX10GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 ; GFX10GLISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11SELDAG-LABEL: isnan_v2f16: -; GFX11SELDAG: ; %bb.0: -; GFX11SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11SELDAG-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v0 -; GFX11SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11SELDAG-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v1 -; GFX11SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11SELDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11GLISEL-LABEL: isnan_v2f16: -; GFX11GLISEL: ; %bb.0: -; GFX11GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s0, v0, 3 -; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s0, v1, 3 -; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX11GLISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnan_v2f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v0.l, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v1.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnan_v2f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: isnan_v2f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.h, 3 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: isnan_v2f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 3 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v1, 3 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call <2 x i1> @llvm.is.fpclass.v2f16(<2 x half> %x, i32 3) ; nan ret <2 x i1> %1 } @@ -1156,31 +1497,56 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind { ; GFX10GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 ; GFX10GLISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11SELDAG-LABEL: isnan_v3f16: -; GFX11SELDAG: ; %bb.0: -; GFX11SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11SELDAG-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v0 -; GFX11SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11SELDAG-NEXT: v_cmp_u_f16_e32 vcc_lo, v2, v2 -; GFX11SELDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11SELDAG-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v1 -; GFX11SELDAG-NEXT: v_mov_b32_e32 v1, v3 -; GFX11SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11SELDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11GLISEL-LABEL: isnan_v3f16: -; GFX11GLISEL: ; %bb.0: -; GFX11GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s0, v0, 3 -; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s0, v2, 3 -; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s0, v1, 3 -; GFX11GLISEL-NEXT: v_mov_b32_e32 v1, v3 -; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX11GLISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnan_v3f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11SELDAG-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v0.l, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v2.l, v2.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v1.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnan_v3f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v2, v2 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v1 +; GFX11SELDAG-FAKE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: isnan_v3f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.h, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v1.l, 3 +; GFX11GLISEL-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: isnan_v3f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 3 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v2, 3 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v1, 3 +; GFX11GLISEL-FAKE16-NEXT: v_mov_b32_e32 v1, v3 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call <3 x i1> @llvm.is.fpclass.v3f16(<3 x half> %x, i32 3) ; nan ret <3 x i1> %1 } @@ -1322,35 +1688,65 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind { ; GFX10GLISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 ; GFX10GLISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11SELDAG-LABEL: isnan_v4f16: -; GFX11SELDAG: ; %bb.0: -; GFX11SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v0 -; GFX11SELDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11SELDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11SELDAG-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v1 -; GFX11SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11SELDAG-NEXT: v_cmp_u_f16_e32 vcc_lo, v4, v4 -; GFX11SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11SELDAG-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v3 -; GFX11SELDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo -; GFX11SELDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11GLISEL-LABEL: isnan_v4f16: -; GFX11GLISEL: ; %bb.0: -; GFX11GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s0, v0, 3 -; GFX11GLISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11GLISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s0, v1, 3 -; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 -; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s0, v3, 3 -; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX11GLISEL-NEXT: v_cmp_class_f16_e64 s0, v4, 3 -; GFX11GLISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX11GLISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnan_v4f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v0.l, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11SELDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v1.l, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v4.l, v4.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v3.l, v3.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnan_v4f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11SELDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v1, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v4, v4 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_u_f16_e32 vcc_lo, v3, v3 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: isnan_v4f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.h, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v1.l, 3 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v4 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v1.h, 3 +; GFX11GLISEL-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: isnan_v4f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 3 +; GFX11GLISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11GLISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v1, 3 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v3, 3 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v4, 3 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call <4 x i1> @llvm.is.fpclass.v4f16(<4 x half> %x, i32 3) ; nan ret <4 x i1> %1 } @@ -1400,12 +1796,33 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isnan_f16_strictfp: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 3 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnan_f16_strictfp: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnan_f16_strictfp: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 3 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: isnan_f16_strictfp: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: isnan_f16_strictfp: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 3 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 3) strictfp ; nan ret i1 %1 } @@ -1454,12 +1871,33 @@ define i1 @isinf_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isinf_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isinf_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isinf_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: isinf_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: isinf_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 516) ; 0x204 = "inf" ret i1 %1 } @@ -1508,12 +1946,33 @@ define i1 @isfinite_f16(half %x) nounwind { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isfinite_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1f8 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isfinite_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isfinite_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1f8 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: isfinite_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: isfinite_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1f8 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.f16(half %x, i32 504) ; 0x1f8 = "finite" ret i1 %1 } @@ -1560,12 +2019,33 @@ define i1 @issubnormal_or_zero_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: issubnormal_or_zero_f16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0xf0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: issubnormal_or_zero_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0xf0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: issubnormal_or_zero_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0xf0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: issubnormal_or_zero_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0xf0 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: issubnormal_or_zero_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0xf0 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 240) ; 0xf0 = "subnormal|zero" ret i1 %class @@ -1619,12 +2099,33 @@ define i1 @not_issubnormal_or_zero_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_issubnormal_or_zero_f16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x30f -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_issubnormal_or_zero_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x30f +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_issubnormal_or_zero_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x30f +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_issubnormal_or_zero_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x30f +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_issubnormal_or_zero_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x30f +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 783) ; ~0xf0 = "~(subnormal|zero)" ret i1 %class @@ -1677,12 +2178,33 @@ define i1 @isnormal_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isnormal_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x108 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnormal_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x108 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnormal_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x108 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: isnormal_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x108 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: isnormal_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x108 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 264) ; 0x108 = "normal" ret i1 %class } @@ -1739,12 +2261,33 @@ define i1 @not_isnormal_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_isnormal_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x2f7 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_isnormal_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2f7 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_isnormal_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x2f7 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_isnormal_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2f7 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_isnormal_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x2f7 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 759) ; ~0x108 = "~normal" ret i1 %class } @@ -1812,12 +2355,33 @@ define i1 @not_is_plus_normal_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_is_plus_normal_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x2ff -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_is_plus_normal_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2ff +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_is_plus_normal_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x2ff +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_is_plus_normal_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x2ff +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_is_plus_normal_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x2ff +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 767) ; ~0x100 = ~"+normal" ret i1 %class } @@ -1885,12 +2449,33 @@ define i1 @not_is_neg_normal_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_is_neg_normal_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3f7 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_is_neg_normal_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3f7 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_is_neg_normal_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3f7 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_is_neg_normal_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3f7 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_is_neg_normal_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3f7 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 1015) ; ~0x008 = ~"-normal" ret i1 %class } @@ -1941,12 +2526,33 @@ define i1 @issubnormal_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: issubnormal_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x90 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: issubnormal_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x90 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: issubnormal_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x90 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: issubnormal_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x90 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: issubnormal_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x90 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 144) ; 0x90 = "subnormal" ret i1 %class } @@ -2005,12 +2611,33 @@ define i1 @not_issubnormal_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_issubnormal_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x36f -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_issubnormal_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x36f +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_issubnormal_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x36f +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_issubnormal_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x36f +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_issubnormal_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x36f +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 879) ; ~0x90 = ~"subnormal" ret i1 %class } @@ -2057,12 +2684,33 @@ define i1 @iszero_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: iszero_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x60 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: iszero_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x60 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: iszero_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x60 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: iszero_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x60 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: iszero_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x60 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 96) ; 0x60 = "zero" ret i1 %class } @@ -2122,12 +2770,33 @@ define i1 @not_iszero_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_iszero_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39f -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_iszero_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39f +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_iszero_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39f +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_iszero_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39f +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_iszero_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39f +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 927) ; ~0x60 = ~"zero" ret i1 %class } @@ -2174,12 +2843,33 @@ define i1 @ispositive_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: ispositive_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3c0 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: ispositive_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: ispositive_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3c0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: ispositive_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c0 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: ispositive_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3c0 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 960) ; fcPositive ret i1 %class } @@ -2242,12 +2932,33 @@ define i1 @not_ispositive_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_ispositive_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 63 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_ispositive_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 63 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_ispositive_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 63 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_ispositive_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 63 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_ispositive_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 63 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 63) ; ~fcPositive ret i1 %class } @@ -2306,12 +3017,33 @@ define i1 @isnegative_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isnegative_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 60 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isnegative_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 60 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isnegative_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 60 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: isnegative_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 60 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: isnegative_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 60 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 60) ; fcNegative ret i1 %class } @@ -2367,12 +3099,33 @@ define i1 @not_isnegative_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_isnegative_f16: -; GFX11CHECK: ; %bb.0: -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3c3 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_isnegative_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c3 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_isnegative_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3c3 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_isnegative_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x3c3 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_isnegative_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x3c3 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.f16(half %x, i32 963) ; ~fcNegative ret i1 %class } @@ -2424,12 +3177,33 @@ define i1 @iszero_or_nan_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: iszero_or_nan_f16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x63 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x63 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: iszero_or_nan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: iszero_or_nan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x63 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 99) ; 0x60|0x3 = "zero|nan" ret i1 %0 @@ -2482,12 +3256,33 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: iszero_or_nan_f_daz: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x63 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f_daz: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f_daz: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x63 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: iszero_or_nan_f_daz: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: iszero_or_nan_f_daz: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x63 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 99) ; 0x60|0x3 = "zero|nan" ret i1 %0 @@ -2540,12 +3335,33 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: iszero_or_nan_f_maybe_daz: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x63 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f_maybe_daz: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f_maybe_daz: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x63 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: iszero_or_nan_f_maybe_daz: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x63 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: iszero_or_nan_f_maybe_daz: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x63 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 99) ; 0x60|0x3 = "zero|nan" ret i1 %0 @@ -2607,12 +3423,33 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_iszero_or_nan_f16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39c -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39c +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_nan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_nan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39c +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 924) ; ~0x60 = "~(zero|nan)" ret i1 %0 @@ -2674,12 +3511,33 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_iszero_or_nan_f_daz: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39c -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f_daz: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f_daz: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39c +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_nan_f_daz: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_nan_f_daz: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39c +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 924) ; ~(0x60|0x3) = "~(zero|nan)" ret i1 %0 @@ -2741,12 +3599,33 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_iszero_or_nan_f_maybe_daz: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39c -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f_maybe_daz: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f_maybe_daz: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39c +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_nan_f_maybe_daz: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39c +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_nan_f_maybe_daz: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39c +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 924) ; ~(0x60|0x3) = "~(zero|nan)" ret i1 %0 @@ -2799,12 +3678,33 @@ define i1 @iszero_or_qnan_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: iszero_or_qnan_f16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x62 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: iszero_or_qnan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x62 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: iszero_or_qnan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x62 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: iszero_or_qnan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x62 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: iszero_or_qnan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x62 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 98) ; 0x60|0x2 = "zero|qnan" ret i1 %0 @@ -2862,12 +3762,33 @@ define i1 @iszero_or_snan_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: iszero_or_snan_f16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x61 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: iszero_or_snan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x61 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: iszero_or_snan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x61 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: iszero_or_snan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x61 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: iszero_or_snan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x61 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 97) ; 0x60|0x1 = "zero|snan" ret i1 %0 @@ -2945,12 +3866,33 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_iszero_or_qnan_f16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39d -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_qnan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39d +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_qnan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39d +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_qnan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39d +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_qnan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39d +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 925) ; ~(0x60|0x2) = "~(zero|qnan)" ret i1 %0 @@ -3025,12 +3967,33 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_iszero_or_snan_f16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39e -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_snan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39e +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_snan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39e +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_iszero_or_snan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x39e +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_iszero_or_snan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x39e +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 926) ; ~(0x60|0x1) = "~(zero|snan)" ret i1 %0 @@ -3080,12 +4043,33 @@ define i1 @isinf_or_nan_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isinf_or_nan_f16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x207 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isinf_or_nan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x207 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isinf_or_nan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x207 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: isinf_or_nan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x207 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: isinf_or_nan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x207 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 519) ; 0x204|0x3 = "inf|nan" ret i1 %0 @@ -3135,12 +4119,33 @@ define i1 @not_isinf_or_nan_f16(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_isinf_or_nan_f16: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1f8 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_isinf_or_nan_f16: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_isinf_or_nan_f16: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1f8 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_isinf_or_nan_f16: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1f8 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_isinf_or_nan_f16: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1f8 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 504) ; ~(0x204|0x3) = "~(inf|nan)" ret i1 %0 @@ -3190,12 +4195,33 @@ define i1 @isfinite_or_nan_f(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: isfinite_or_nan_f: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1fb -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: isfinite_or_nan_f: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1fb +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: isfinite_or_nan_f: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1fb +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: isfinite_or_nan_f: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x1fb +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: isfinite_or_nan_f: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x1fb +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 507) ; 0x1f8|0x3 = "finite|nan" ret i1 %0 @@ -3245,12 +4271,33 @@ define i1 @not_isfinite_or_nan_f(half %x) { ; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; -; GFX11CHECK-LABEL: not_isfinite_or_nan_f: -; GFX11CHECK: ; %bb.0: ; %entry -; GFX11CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11CHECK-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 -; GFX11CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX11SELDAG-TRUE16-LABEL: not_isfinite_or_nan_f: +; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry +; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11SELDAG-FAKE16-LABEL: not_isfinite_or_nan_f: +; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry +; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-TRUE16-LABEL: not_isfinite_or_nan_f: +; GFX11GLISEL-TRUE16: ; %bb.0: ; %entry +; GFX11GLISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-TRUE16-NEXT: v_cmp_class_f16_e64 s0, v0.l, 0x204 +; GFX11GLISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11GLISEL-FAKE16-LABEL: not_isfinite_or_nan_f: +; GFX11GLISEL-FAKE16: ; %bb.0: ; %entry +; GFX11GLISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11GLISEL-FAKE16-NEXT: v_cmp_class_f16_e64 s0, v0, 0x204 +; GFX11GLISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11GLISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.f16(half %x, i32 516) ; ~(0x1f8|0x3) = "~(finite|nan)" ret i1 %0 @@ -3266,3 +4313,5 @@ attributes #0 = { "denormal-fp-math"="ieee,preserve-sign" } ; Maybe daz attributes #1 = { "denormal-fp-math"="ieee,dynamic" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11SELDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll index 76ca99059d58d..afede06001736 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll @@ -4,8 +4,10 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s define half @v_maximum_f16(half %src0, half %src1) { ; GFX7-LABEL: v_maximum_f16: @@ -53,24 +55,43 @@ define half @v_maximum_f16(half %src0, half %src1) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_maximum_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_maximum_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_maximum_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_maximum_f16 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_maximum_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_maximum_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %op = call half @llvm.maximum.f16(half %src0, half %src1) ret half %op } @@ -110,21 +131,37 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) { ; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_f16__nnan: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_maximum_f16__nnan: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_maximum_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_f16__nnan: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_f16__nnan: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_maximum_f16__nnan: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_maximum_f16 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_maximum_f16__nnan: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_maximum_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %op = call nnan half @llvm.maximum.f16(half %src0, half %src1) ret half %op } @@ -175,24 +212,43 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_f16__nsz: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_maximum_f16__nsz: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_maximum_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_f16__nsz: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_f16__nsz: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_maximum_f16__nsz: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_maximum_f16 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_maximum_f16__nsz: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_maximum_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %op = call nsz half @llvm.maximum.f16(half %src0, half %src1) ret half %op } @@ -232,21 +288,37 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) { ; GFX10-NEXT: v_max_f16_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_f16__nnan_nsz: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_maximum_f16__nnan_nsz: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_maximum_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_f16__nnan_nsz: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_f16__nnan_nsz: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_maximum_f16__nnan_nsz: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_maximum_f16 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_maximum_f16__nnan_nsz: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_maximum_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %op = call nnan nsz half @llvm.maximum.f16(half %src0, half %src1) ret half %op } @@ -302,27 +374,49 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_f16__nnan_src0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_maximum_f16__nnan_src0: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_maximum_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_f16__nnan_src0: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_f16__nnan_src0: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_maximum_f16__nnan_src0: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_maximum_f16 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_maximum_f16__nnan_src0: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_maximum_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %src0 = fadd nnan half %arg0, 1.0 %op = call half @llvm.maximum.f16(half %src0, half %src1) ret half %op @@ -379,27 +473,49 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_f16__nnan_src1: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_max_f16_e32 v2, v0, v1 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_maximum_f16__nnan_src1: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_maximum_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_f16__nnan_src1: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.h, 1.0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_f16__nnan_src1: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_maximum_f16__nnan_src1: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 1.0, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_maximum_f16 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_maximum_f16__nnan_src1: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_maximum_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %src1 = fadd nnan half %arg1, 1.0 %op = call half @llvm.maximum.f16(half %src0, half %src1) ret half %op @@ -475,18 +591,31 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) { ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: s_maximum_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f16_e64 v0, s0, s1 -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: s_maximum_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1 +; GFX11-TRUE16-NEXT: v_max_f16_e64 v0.l, s0, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use v0 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: s_maximum_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e64 v0, s0, s1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: s_maximum_f16: ; GFX12: ; %bb.0: @@ -580,21 +709,36 @@ define <2 x half> @v_maximum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v4, v0, v1 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f16: ; GFX12: ; %bb.0: @@ -739,21 +883,36 @@ define <2 x half> @v_maximum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_v2f16__nsz: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v2, v0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_v2f16__nsz: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v4, v0, v1 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_v2f16__nsz: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_max_f16 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v2f16__nsz: ; GFX12: ; %bb.0: @@ -929,25 +1088,50 @@ define void @s_maximum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: s_maximum_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v0, s0, s1 -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 -; GFX11-NEXT: s_lshr_b32 s2, s1, 16 -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: s_maximum_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v2, s0, s1 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use v0 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: s_maximum_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_max_f16 v0, s0, s1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: s_maximum_v2f16: ; GFX12: ; %bb.0: @@ -1053,25 +1237,43 @@ define <3 x half> @v_maximum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_v3f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_v3f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v5.l, v4.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v6.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_v3f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f16: ; GFX12: ; %bb.0: @@ -1245,25 +1447,43 @@ define <3 x half> @v_maximum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_v3f16__nsz: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_v3f16__nsz: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v6, v0, v2 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v5.l, v4.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v6.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_v3f16__nsz: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_max_f16 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v3f16__nsz: ; GFX12: ; %bb.0: @@ -1459,30 +1679,52 @@ define <4 x half> @v_maximum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_v4f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_pk_max_f16 v7, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v8, v0, v2 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v5.l, v4.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v7.l, v6.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v8.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v3.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v7, v0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f16: ; GFX12: ; %bb.0: @@ -1685,30 +1927,52 @@ define <4 x half> @v_maximum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_v4f16__nsz: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v4, v1, v3 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_pk_max_f16 v7, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_v4f16__nsz: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v8, v0, v2 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v5.l, v4.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v7.l, v6.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v8.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v3.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_v4f16__nsz: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_max_f16 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v7, v0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v4f16__nsz: ; GFX12: ; %bb.0: @@ -1989,48 +2253,85 @@ define <8 x half> @v_maximum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX10-NEXT: v_perm_b32 v3, v3, v10, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_v8f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v8, v3, v7 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_pk_max_f16 v10, v2, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX11-NEXT: v_pk_max_f16 v14, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11 -; GFX11-NEXT: v_pk_max_f16 v11, v0, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v13, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_v8f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3.l, v7.l +; GFX11-TRUE16-NEXT: v_pk_max_f16 v8, v3, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_pk_max_f16 v10, v2, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v8.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v9.l, v7.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v10.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v11.l, v6.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v12, v0, v4 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v8.l, v6.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s3, v0.l, v4.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s4, v11.l, v9.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v12.l, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v4.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v5.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x7e00, v6.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, 0x7e00, v7.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_v8f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_max_f16 v8, v3, v7 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v10, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v14, v1, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v11, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v11 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v13, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v8f16: ; GFX12: ; %bb.0: @@ -2392,90 +2693,159 @@ define <16 x half> @v_maximum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_maximum_v16f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_max_f16 v16, v7, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v15 -; GFX11-NEXT: v_pk_max_f16 v15, v6, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_pk_max_f16 v20, v4, v12 -; GFX11-NEXT: v_pk_max_f16 v22, v2, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; GFX11-NEXT: v_pk_max_f16 v14, v5, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v7, v16, v7, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v6, v15, v6, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 -; GFX11-NEXT: v_pk_max_f16 v17, v3, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v11 -; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v20, v19 -; GFX11-NEXT: v_pk_max_f16 v19, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v3, v11, v3, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_pk_max_f16 v22, v0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8 -; GFX11-NEXT: v_perm_b32 v1, v1, v21, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v24, v23 -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v4, v14, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_maximum_v16f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v18, v7, v15 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v7.l, v15.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v6.l, v14.l +; GFX11-TRUE16-NEXT: v_pk_max_f16 v19, v5, v13 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v17.l, v16.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v16, v6, v14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, 0x7e00, v18.l, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, 0x7e00, v15.l, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, 0x7e00, v16.l, s1 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v5.l, v13.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v15.l, v14.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v18.l, v17.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v4.l, v12.l +; GFX11-TRUE16-NEXT: v_pk_max_f16 v15, v4, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, 0x7e00, v14.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, 0x7e00, v19.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, 0x7e00, v13.l, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, 0x7e00, v15.l, s2 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v16.l, v12.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v3.l, v11.l +; GFX11-TRUE16-NEXT: v_pk_max_f16 v13, v3, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v2.l, v10.l +; GFX11-TRUE16-NEXT: v_pk_max_f16 v15, v2, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v13.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v14.l, v11.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v15.l, s1 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v16.l, v10.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v17, v0, v8 +; GFX11-TRUE16-NEXT: v_pk_max_f16 v1, v1, v9 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s3, v13.l, v10.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s4, v0.l, v8.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s5, v16.l, v14.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v17.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v8.l, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v9.l, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x7e00, v10.l, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, 0x7e00, v11.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, 0x7e00, v12.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_maximum_v16f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_max_f16 v16, v7, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v15 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v15, v6, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v20, v4, v12 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v22, v2, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v14, v5, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v16, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v15, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v17, v3, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v11 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v20, v19 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v19, v1, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v11, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_pk_max_f16 v22, v0, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v24, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v14, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_maximum_v16f16: ; GFX12: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll index c472ee39a41e4..a645a8ab5d2f8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f16.ll @@ -5,8 +5,10 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX900 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx950 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX950 %s ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GCN,GFX10 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GCN,GFX12 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s define half @v_minimum_f16(half %src0, half %src1) { ; GFX8-LABEL: v_minimum_f16: @@ -41,24 +43,43 @@ define half @v_minimum_f16(half %src0, half %src1) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_minimum_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_minimum_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_minimum_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_minimum_f16 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_minimum_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_minimum_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %op = call half @llvm.minimum.f16(half %src0, half %src1) ret half %op } @@ -88,21 +109,37 @@ define half @v_minimum_f16__nnan(half %src0, half %src1) { ; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_f16__nnan: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_minimum_f16__nnan: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_minimum_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_f16__nnan: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_f16__nnan: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_minimum_f16__nnan: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_minimum_f16 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_minimum_f16__nnan: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_minimum_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %op = call nnan half @llvm.minimum.f16(half %src0, half %src1) ret half %op } @@ -140,24 +177,43 @@ define half @v_minimum_f16__nsz(half %src0, half %src1) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_f16__nsz: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_minimum_f16__nsz: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_minimum_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_f16__nsz: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_f16__nsz: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_minimum_f16__nsz: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_minimum_f16 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_minimum_f16__nsz: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_minimum_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %op = call nsz half @llvm.minimum.f16(half %src0, half %src1) ret half %op } @@ -187,21 +243,37 @@ define half @v_minimum_f16__nnan_nsz(half %src0, half %src1) { ; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_f16__nnan_nsz: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_minimum_f16__nnan_nsz: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_minimum_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_f16__nnan_nsz: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_f16__nnan_nsz: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_minimum_f16__nnan_nsz: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_minimum_f16 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_minimum_f16__nnan_nsz: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_minimum_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %op = call nnan nsz half @llvm.minimum.f16(half %src0, half %src1) ret half %op } @@ -243,27 +315,49 @@ define half @v_minimum_f16__nnan_src0(half %arg0, half %src1) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_f16__nnan_src0: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_minimum_f16__nnan_src0: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_f16_e32 v0, 1.0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_minimum_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_f16__nnan_src0: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_f16__nnan_src0: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_minimum_f16__nnan_src0: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, 1.0, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_minimum_f16 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_minimum_f16__nnan_src0: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, 1.0, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_minimum_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %src0 = fadd nnan half %arg0, 1.0 %op = call half @llvm.minimum.f16(half %src0, half %src1) ret half %op @@ -306,27 +400,49 @@ define half @v_minimum_f16__nnan_src1(half %src0, half %arg1) { ; GFX10-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_f16__nnan_src1: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_min_f16_e32 v2, v0, v1 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: v_minimum_f16__nnan_src1: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_add_f16_e32 v1, 1.0, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_minimum_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_f16__nnan_src1: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.h, 1.0, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_f16__nnan_src1: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_min_f16_e32 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-TRUE16-LABEL: v_minimum_f16__nnan_src1: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: v_add_f16_e32 v1.l, 1.0, v1.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_minimum_f16 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-FAKE16-LABEL: v_minimum_f16__nnan_src1: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: v_add_f16_e32 v1, 1.0, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_minimum_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %src1 = fadd nnan half %arg1, 1.0 %op = call half @llvm.minimum.f16(half %src0, half %src1) ret half %op @@ -385,18 +501,31 @@ define void @s_minimum_f16(half inreg %src0, half inreg %src1) { ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: s_minimum_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_min_f16_e64 v0, s0, s1 -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: s_minimum_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, s0, s1 +; GFX11-TRUE16-NEXT: v_min_f16_e64 v0.l, s0, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v0.l, s2 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use v0 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: s_minimum_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_min_f16_e64 v0, s0, s1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: s_minimum_f16: ; GFX12: ; %bb.0: @@ -470,21 +599,36 @@ define <2 x half> @v_minimum_v2f16(<2 x half> %src0, <2 x half> %src1) { ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v4, v0, v1 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f16: ; GFX12: ; %bb.0: @@ -594,21 +738,36 @@ define <2 x half> @v_minimum_v2f16__nsz(<2 x half> %src0, <2 x half> %src1) { ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_v2f16__nsz: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v2, v0, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_v2f16__nsz: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v4, v0, v1 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v3.l, v2.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v4.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v1.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_v2f16__nsz: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_min_f16 v2, v0, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v2f16__nsz: ; GFX12: ; %bb.0: @@ -742,25 +901,50 @@ define void @s_minimum_v2f16(<2 x half> inreg %src0, <2 x half> inreg %src1) { ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: s_minimum_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v0, s0, s1 -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 -; GFX11-NEXT: s_lshr_b32 s2, s1, 16 -; GFX11-NEXT: s_lshr_b32 s0, s0, 16 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v0 -; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: s_minimum_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, s1 +; GFX11-TRUE16-NEXT: s_lshr_b32 s2, s0, 16 +; GFX11-TRUE16-NEXT: s_lshr_b32 s3, s1, 16 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, s2 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, s3 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v2, s0, s1 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v1.l, v1.h +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: ;;#ASMSTART +; GFX11-TRUE16-NEXT: ; use v0 +; GFX11-TRUE16-NEXT: ;;#ASMEND +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: s_minimum_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_min_f16 v0, s0, s1 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s1 +; GFX11-FAKE16-NEXT: s_lshr_b32 s2, s1, 16 +; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, 16 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e64 vcc_lo, s0, s2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: ;;#ASMSTART +; GFX11-FAKE16-NEXT: ; use v0 +; GFX11-FAKE16-NEXT: ;;#ASMEND +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: s_minimum_v2f16: ; GFX12: ; %bb.0: @@ -839,25 +1023,43 @@ define <3 x half> @v_minimum_v3f16(<3 x half> %src0, <3 x half> %src1) { ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_v3f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_v3f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v5.l, v4.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v6.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_v3f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f16: ; GFX12: ; %bb.0: @@ -984,25 +1186,43 @@ define <3 x half> @v_minimum_v3f16__nsz(<3 x half> %src0, <3 x half> %src1) { ; GFX10-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_v3f16__nsz: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v4, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_v3f16__nsz: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v6, v0, v2 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v5.l, v4.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v6 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v6.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, s1 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_v3f16__nsz: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_min_f16 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v3f16__nsz: ; GFX12: ; %bb.0: @@ -1144,30 +1364,52 @@ define <4 x half> @v_minimum_v4f16(<4 x half> %src0, <4 x half> %src1) { ; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_v4f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_pk_min_f16 v7, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v8, v0, v2 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v5.l, v4.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v7.l, v6.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v8.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v3.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v7, v0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f16: ; GFX12: ; %bb.0: @@ -1311,30 +1553,52 @@ define <4 x half> @v_minimum_v4f16__nsz(<4 x half> %src0, <4 x half> %src1) { ; GFX10-NEXT: v_perm_b32 v1, v1, v6, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_v4f16__nsz: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v4, v1, v3 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-NEXT: v_pk_min_f16 v7, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_v4f16__nsz: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v8, v0, v2 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v1, v1, v3 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v5.l, v4.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v7.l, v6.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v8.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v2.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v3.l, s0 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_v4f16__nsz: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_min_f16 v4, v1, v3 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v7, v0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v9, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v5 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v4f16__nsz: ; GFX12: ; %bb.0: @@ -1528,48 +1792,85 @@ define <8 x half> @v_minimum_v8f16(<8 x half> %src0, <8 x half> %src1) { ; GFX10-NEXT: v_perm_b32 v3, v3, v10, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_v8f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v8, v3, v7 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_pk_min_f16 v10, v2, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v2 -; GFX11-NEXT: v_pk_min_f16 v14, v1, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11 -; GFX11-NEXT: v_pk_min_f16 v11, v0, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v13, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v15, 16, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX11-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v13, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 -; GFX11-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_v8f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3.l, v7.l +; GFX11-TRUE16-NEXT: v_pk_min_f16 v8, v3, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v2.l, v6.l +; GFX11-TRUE16-NEXT: v_pk_min_f16 v10, v2, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v8.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v9.l, v7.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v10.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v11.l, v6.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v1.l, v5.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v12, v0, v4 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v1, v1, v5 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v8.l, v6.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s3, v0.l, v4.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s4, v11.l, v9.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v12.l, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v4.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v5.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x7e00, v6.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, 0x7e00, v7.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_v8f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_min_f16 v8, v3, v7 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v10, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v2 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v14, v1, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, 0x7e00, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v12, v11 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v11, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v13, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v11 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, 0x7e00, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v6, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v11, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v13, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v4, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v7 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v3, v9, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v8f16: ; GFX12: ; %bb.0: @@ -1811,90 +2112,159 @@ define <16 x half> @v_minimum_v16f16(<16 x half> %src0, <16 x half> %src1) { ; GFX10-NEXT: v_perm_b32 v4, v4, v13, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_minimum_v16f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_min_f16 v16, v7, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v7 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v15 -; GFX11-NEXT: v_pk_min_f16 v15, v6, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v16 -; GFX11-NEXT: v_pk_min_f16 v20, v4, v12 -; GFX11-NEXT: v_pk_min_f16 v22, v2, v10 -; GFX11-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v23, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v24, 16, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v15 -; GFX11-NEXT: v_pk_min_f16 v14, v5, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v7, v16, v7, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v6, v15, v6, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 -; GFX11-NEXT: v_pk_min_f16 v17, v3, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v17 -; GFX11-NEXT: v_lshrrev_b32_e32 v12, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v3 -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v11 -; GFX11-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v20, v19 -; GFX11-NEXT: v_pk_min_f16 v19, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_perm_b32 v3, v11, v3, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX11-NEXT: v_pk_min_f16 v22, v0, v8 -; GFX11-NEXT: v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v19 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v25, 16, v22 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8 -; GFX11-NEXT: v_perm_b32 v1, v1, v21, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v24, v23 -; GFX11-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo -; GFX11-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 -; GFX11-NEXT: v_perm_b32 v2, v2, v17, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v4, v4, v14, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_minimum_v16f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v18, v7, v15 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v7.l, v15.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v6.l, v14.l +; GFX11-TRUE16-NEXT: v_pk_min_f16 v19, v5, v13 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v17.l, v16.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v18 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v16, v6, v14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, 0x7e00, v18.l, vcc_lo +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, 0x7e00, v15.l, s0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, 0x7e00, v16.l, s1 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v5.l, v13.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v19 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v15.l, v14.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v16 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v18.l, v17.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v4.l, v12.l +; GFX11-TRUE16-NEXT: v_pk_min_f16 v15, v4, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, 0x7e00, v14.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, 0x7e00, v19.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, 0x7e00, v13.l, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, 0x7e00, v15.l, s2 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v16.l, v12.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v15 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v3.l, v11.l +; GFX11-TRUE16-NEXT: v_pk_min_f16 v13, v3, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v3 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v2.l, v10.l +; GFX11-TRUE16-NEXT: v_pk_min_f16 v15, v2, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, 0x7e00, v13.l, s0 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s0, v14.l, v11.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, 0x7e00, v15.l, s1 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s1, v16.l, v10.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s2, v1.l, v9.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v14, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v0 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v17, v0, v8 +; GFX11-TRUE16-NEXT: v_pk_min_f16 v1, v1, v9 +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s3, v13.l, v10.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s4, v0.l, v8.l +; GFX11-TRUE16-NEXT: v_cmp_o_f16_e64 s5, v16.l, v14.l +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v17 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x7e00, v1.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, v17.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x7e00, v8.l, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x7e00, v9.l, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, 0x7e00, v10.l, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, 0x7e00, v11.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, 0x7e00, v12.l, vcc_lo +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_minimum_v16f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_min_f16 v16, v7, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v7 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v7, v15 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v15, v6, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v16 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v20, v4, v12 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v22, v2, v10 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, 0x7e00, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v23, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v24, 16, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, 0x7e00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v6, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v15 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v14, v5, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v16, v7, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, 0x7e00, v15, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, 0x7e00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v5, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v15, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, 0x7e00, v14, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v18, v17 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v17, v3, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v20 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, 0x7e00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v17 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, 0x7e00, v20, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v3, v11 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v13, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, 0x7e00, v17, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v20, v19 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v19, v1, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, 0x7e00, v21, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v11, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v17, 0x7e00, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; GFX11-FAKE16-NEXT: v_pk_min_f16 v22, v0, v8 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v21, 0x7e00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v19 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v1, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v25, 16, v22 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x7e00, v19, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v0, v8 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v1, v21, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x7e00, v22, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v24, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, 0x7e00, v25, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v2, v10 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v8, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, 0x7e00, v20, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_o_f16_e32 vcc_lo, v4, v12 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v2, v17, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, 0x7e00, v18, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v4, v14, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: v_minimum_v16f16: ; GFX12: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 8a2c6e2ad97e9..7e8c30161c1c8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s declare half @llvm.minnum.f16(half %a, half %b) declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) @@ -113,31 +114,57 @@ define amdgpu_kernel void @minnum_f16_ieee( ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: minnum_f16_ieee: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_max_f16_e32 v1, v1, v1 -; GFX11-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: minnum_f16_ieee: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.h, v1.l, v1.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: minnum_f16_ieee: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: v_max_f16_e32 v1, v1, v1 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { @@ -169,10 +196,20 @@ define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 { ; GFX9-NEXT: v_min_f16_e32 v0, v0, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: minnum_f16_no_ieee: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_min_f16_e32 v0, v0, v1 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: minnum_f16_no_ieee: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: minnum_f16_no_ieee: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: minnum_f16_no_ieee: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %r.val = call half @llvm.minnum.f16(half %a, half %b) ret half %r.val } @@ -255,24 +292,43 @@ define amdgpu_kernel void @minnum_f16_imm_a( ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: minnum_f16_imm_a: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_min_f16_e32 v0, 0x4200, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: minnum_f16_imm_a: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, 0x4200, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: minnum_f16_imm_a: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, 0x4200, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b) #0 { entry: @@ -360,24 +416,43 @@ define amdgpu_kernel void @minnum_f16_imm_b( ; GFX10-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: minnum_f16_imm_b: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_max_f16_e32 v0, v0, v0 -; GFX11-NEXT: v_min_f16_e32 v0, 4.0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: minnum_f16_imm_b: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_max_f16_e32 v0.l, v0.l, v0.l +; GFX11-TRUE16-NEXT: v_min_f16_e32 v0.l, 4.0, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: minnum_f16_imm_b: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_max_f16_e32 v0, v0, v0 +; GFX11-FAKE16-NEXT: v_min_f16_e32 v0, 4.0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll index 3a2bf9d009460..27ec1cfadd9d2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.rint.f16.ll @@ -4,7 +4,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX89,GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-TRUE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-FAKE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12,GFX12-FAKE16 %s declare half @llvm.rint.f16(half %a) declare <2 x half> @llvm.rint.v2f16(<2 x half> %a) @@ -84,23 +85,41 @@ define amdgpu_kernel void @rint_f16( ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX12-LABEL: rint_f16: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_load_u16 v0, off, s[8:11], null -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_rndne_f16_e32 v0, v0 -; GFX12-NEXT: buffer_store_b16 v0, off, s[4:7], null -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: rint_f16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: rint_f16: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_rndne_f16_e32 v0, v0 +; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -229,27 +248,49 @@ define amdgpu_kernel void @rint_v2f16( ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX12-LABEL: rint_v2f16: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-NEXT: v_rndne_f16_e32 v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_rndne_f16_e32 v1, v1 -; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: rint_v2f16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null +; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: v_rndne_f16_e32 v0.h, v1.l +; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: rint_v2f16: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-FAKE16-NEXT: v_rndne_f16_e32 v0, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_rndne_f16_e32 v1, v1 +; GFX12-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -258,3 +299,5 @@ entry: store <2 x half> %r.val, ptr addrspace(1) %r ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll index 6927636ad04aa..e16540fec0229 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.f16.ll @@ -3,8 +3,10 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX8 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX9 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX10 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX11 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefix=GFX12 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX11,GFX11-FAKE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -enable-var-scope --check-prefixes=GFX12,GFX12-FAKE16 %s define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX6-LABEL: sin_f16: @@ -69,31 +71,57 @@ define amdgpu_kernel void @sin_f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX10-NEXT: global_store_short v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: sin_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sin_f16_e32 v1, v1 -; GFX11-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sin_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm ; -; GFX12-LABEL: sin_f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_sin_f16_e32 v1, v1 -; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX12-NEXT: s_endpgm +; GFX11-FAKE16-LABEL: sin_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sin_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm +; +; GFX12-TRUE16-LABEL: sin_f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: sin_f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_sin_f16_e32 v1, v1 +; GFX12-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-FAKE16-NEXT: s_endpgm %a.val = load half, ptr addrspace(1) %a %r.val = call half @llvm.sin.f16(half %a.val) store half %r.val, ptr addrspace(1) %r @@ -184,42 +212,79 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: sin_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX11-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 -; GFX11-NEXT: v_sin_f16_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sin_f16_e32 v2, v2 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sin_v2f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v0, v1, s[2:3] +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l +; GFX11-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sin_f16_e32 v0.h, v0.h +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sin_v2f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; GFX11-FAKE16-NEXT: v_sin_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sin_f16_e32 v2, v2 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX11-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-FAKE16-NEXT: s_endpgm +; +; GFX12-TRUE16-LABEL: sin_v2f16: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_b32 v2, v1, s[2:3] +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0.15915494, v2.l +; GFX12-TRUE16-NEXT: v_mul_f16_e32 v0.h, 0.15915494, v2.l +; GFX12-TRUE16-NEXT: ; kill: def $vgpr2 killed $vgpr2_lo16 killed $exec +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: v_sin_f16_e32 v0.h, v0.h +; GFX12-TRUE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1) +; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX12-TRUE16-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-TRUE16-NEXT: s_endpgm ; -; GFX12-LABEL: sin_v2f16: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v1, v0, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v2, 16, v1 -; GFX12-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 -; GFX12-NEXT: v_sin_f16_e32 v1, v1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) -; GFX12-NEXT: v_sin_f16_e32 v2, v2 -; GFX12-NEXT: v_pack_b32_f16 v1, v1, v2 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: s_endpgm +; GFX12-FAKE16-LABEL: sin_v2f16: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-FAKE16-NEXT: v_mul_f16_e32 v1, 0.15915494, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-FAKE16-NEXT: v_mul_f16_e32 v2, 0.15915494, v2 +; GFX12-FAKE16-NEXT: v_sin_f16_e32 v1, v1 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) +; GFX12-FAKE16-NEXT: v_sin_f16_e32 v2, v2 +; GFX12-FAKE16-NEXT: v_pack_b32_f16 v1, v1, v2 +; GFX12-FAKE16-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-FAKE16-NEXT: s_endpgm %a.val = load <2 x half>, ptr addrspace(1) %a %r.val = call <2 x half> @llvm.sin.v2f16(<2 x half> %a.val) store <2 x half> %r.val, ptr addrspace(1) %r @@ -228,3 +293,6 @@ define amdgpu_kernel void @sin_v2f16(ptr addrspace(1) %r, ptr addrspace(1) %a) { declare half @llvm.sin.f16(half %a) declare <2 x half> @llvm.sin.v2f16(<2 x half> %a) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll index 716dd3fbd4c74..2996a4e22a3ef 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s declare half @llvm.sqrt.f16(half %a) declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a) @@ -45,23 +46,41 @@ define amdgpu_kernel void @sqrt_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sqrt_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sqrt_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sqrt_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -125,28 +144,51 @@ define amdgpu_kernel void @sqrt_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sqrt_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_sqrt_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sqrt_f16_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sqrt_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_sqrt_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_sqrt_f16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sqrt_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_sqrt_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -155,3 +197,5 @@ entry: store <2 x half> %r.val, ptr addrspace(1) %r ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll index c1ba985d37453..ae41f4381251d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.trunc.f16.ll @@ -3,7 +3,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s declare half @llvm.trunc.f16(half %a) declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a) @@ -83,23 +84,41 @@ define amdgpu_kernel void @trunc_f16( ; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX12-LABEL: trunc_f16: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_load_u16 v0, off, s[8:11], null -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_trunc_f16_e32 v0, v0 -; GFX12-NEXT: buffer_store_b16 v0, off, s[4:7], null -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: trunc_f16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX12-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v0.l +; GFX12-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: trunc_f16: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX12-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], null +; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_trunc_f16_e32 v0, v0 +; GFX12-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], null +; GFX12-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -208,27 +227,49 @@ define amdgpu_kernel void @trunc_v2f16( ; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: s_endpgm ; -; GFX12-LABEL: trunc_v2f16: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_mov_b32 s6, -1 -; GFX12-NEXT: s_mov_b32 s7, 0x31016000 -; GFX12-NEXT: s_mov_b32 s10, s6 -; GFX12-NEXT: s_mov_b32 s11, s7 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_mov_b32 s8, s2 -; GFX12-NEXT: s_mov_b32 s9, s3 -; GFX12-NEXT: s_mov_b32 s4, s0 -; GFX12-NEXT: buffer_load_b32 v0, off, s[8:11], null -; GFX12-NEXT: s_mov_b32 s5, s1 -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-NEXT: v_trunc_f16_e32 v0, v0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_trunc_f16_e32 v1, v1 -; GFX12-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX12-NEXT: buffer_store_b32 v0, off, s[4:7], null -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: trunc_v2f16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX12-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX12-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX12-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX12-TRUE16-NEXT: buffer_load_b32 v1, off, s[8:11], null +; GFX12-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.l, v1.l +; GFX12-TRUE16-NEXT: v_trunc_f16_e32 v0.h, v1.l +; GFX12-TRUE16-NEXT: ; kill: def $vgpr1 killed $vgpr1_lo16 killed $exec +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h +; GFX12-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: trunc_v2f16: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX12-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX12-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX12-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX12-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX12-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX12-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], null +; GFX12-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-FAKE16-NEXT: v_trunc_f16_e32 v0, v0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_trunc_f16_e32 v1, v1 +; GFX12-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX12-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], null +; GFX12-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -237,3 +278,5 @@ entry: store <2 x half> %r.val, ptr addrspace(1) %r ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll index b5c05d609b100..f6e9f152dca5e 100644 --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -3,7 +3,8 @@ ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-HSA %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=GCN-NOHSA-VI %s ; RUN: llc -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck --check-prefix=EG %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspace(4) %in) { ; GCN-NOHSA-SI-LABEL: constant_load_i16: @@ -75,15 +76,25 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX12-LABEL: constant_load_i16: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v0, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v1, v0, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b16 v0, v1, s[0:1] -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: constant_load_i16: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1] +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: constant_load_i16: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_u16 v1, v0, s[2:3] +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_store_b16 v0, v1, s[0:1] +; GFX12-FAKE16-NEXT: s_endpgm entry: %ld = load i16, ptr addrspace(4) %in store i16 %ld, ptr addrspace(1) %out @@ -722,41 +733,83 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) # ; EG-NEXT: MOV * T2.X, literal.x, ; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) ; -; GFX12-LABEL: constant_load_v16i16_align2: -; GFX12: ; %bb.0: ; %entry -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v8, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_clause 0x7 -; GFX12-NEXT: global_load_u16 v3, v8, s[0:1] offset:28 -; GFX12-NEXT: global_load_u16 v2, v8, s[0:1] offset:24 -; GFX12-NEXT: global_load_u16 v1, v8, s[0:1] offset:20 -; GFX12-NEXT: global_load_u16 v0, v8, s[0:1] offset:16 -; GFX12-NEXT: global_load_u16 v7, v8, s[0:1] offset:12 -; GFX12-NEXT: global_load_u16 v6, v8, s[0:1] offset:8 -; GFX12-NEXT: global_load_u16 v5, v8, s[0:1] offset:4 -; GFX12-NEXT: global_load_u16 v4, v8, s[0:1] -; GFX12-NEXT: s_wait_loadcnt 0x7 -; GFX12-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30 -; GFX12-NEXT: s_wait_loadcnt 0x7 -; GFX12-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26 -; GFX12-NEXT: s_wait_loadcnt 0x7 -; GFX12-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22 -; GFX12-NEXT: s_wait_loadcnt 0x7 -; GFX12-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18 -; GFX12-NEXT: s_wait_loadcnt 0x7 -; GFX12-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14 -; GFX12-NEXT: s_wait_loadcnt 0x7 -; GFX12-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10 -; GFX12-NEXT: s_wait_loadcnt 0x7 -; GFX12-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6 -; GFX12-NEXT: s_wait_loadcnt 0x7 -; GFX12-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2 -; GFX12-NEXT: s_wait_loadcnt 0x4 -; GFX12-NEXT: global_store_b128 v[0:1], v[0:3], off -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_store_b128 v[0:1], v[4:7], off -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: constant_load_v16i16_align2: +; GFX12-TRUE16: ; %bb.0: ; %entry +; GFX12-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v9, 0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_clause 0x7 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v9, s[0:1] offset:16 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v9, s[0:1] offset:12 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v1, v9, s[0:1] offset:8 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v9, s[0:1] offset:4 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v4, v9, s[0:1] offset:28 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v9, s[0:1] offset:24 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v5, v9, s[0:1] offset:20 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v8, v9, s[0:1] +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x6 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v3.l, v0.h +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x5 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v2.l, v1.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x4 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x3 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x2 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v6.l, v4.h +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v4.l, v0.l +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, v8.l +; GFX12-TRUE16-NEXT: s_clause 0x7 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v7, v9, s[0:1] offset:30 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v6, v9, s[0:1] offset:26 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v5, v9, s[0:1] offset:22 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v4, v9, s[0:1] offset:18 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v3, v9, s[0:1] offset:14 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v2, v9, s[0:1] offset:10 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v1, v9, s[0:1] offset:6 +; GFX12-TRUE16-NEXT: global_load_d16_hi_b16 v0, v9, s[0:1] offset:2 +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: s_clause 0x1 +; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX12-TRUE16-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: constant_load_v16i16_align2: +; GFX12-FAKE16: ; %bb.0: ; %entry +; GFX12-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v8, 0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_clause 0x7 +; GFX12-FAKE16-NEXT: global_load_u16 v3, v8, s[0:1] offset:28 +; GFX12-FAKE16-NEXT: global_load_u16 v2, v8, s[0:1] offset:24 +; GFX12-FAKE16-NEXT: global_load_u16 v1, v8, s[0:1] offset:20 +; GFX12-FAKE16-NEXT: global_load_u16 v0, v8, s[0:1] offset:16 +; GFX12-FAKE16-NEXT: global_load_u16 v7, v8, s[0:1] offset:12 +; GFX12-FAKE16-NEXT: global_load_u16 v6, v8, s[0:1] offset:8 +; GFX12-FAKE16-NEXT: global_load_u16 v5, v8, s[0:1] offset:4 +; GFX12-FAKE16-NEXT: global_load_u16 v4, v8, s[0:1] +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v3, v8, s[0:1] offset:30 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v2, v8, s[0:1] offset:26 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v1, v8, s[0:1] offset:22 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v0, v8, s[0:1] offset:18 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v7, v8, s[0:1] offset:14 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v6, v8, s[0:1] offset:10 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v5, v8, s[0:1] offset:6 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x7 +; GFX12-FAKE16-NEXT: global_load_d16_hi_b16 v4, v8, s[0:1] offset:2 +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x4 +; GFX12-FAKE16-NEXT: global_store_b128 v[0:1], v[0:3], off +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: global_store_b128 v[0:1], v[4:7], off +; GFX12-FAKE16-NEXT: s_endpgm entry: %ld = load <16 x i16>, ptr addrspace(4) %ptr0, align 2 store <16 x i16> %ld, ptr addrspace(1) poison, align 32 @@ -5379,16 +5432,27 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX12-LABEL: constant_zextload_i16_to_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: constant_zextload_i16_to_i64: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-TRUE16-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: constant_zextload_i16_to_i64: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_u16 v0, v1, s[2:3] +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-FAKE16-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-FAKE16-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %in %ext = zext i16 %a to i64 store i64 %ext, ptr addrspace(1) %out @@ -5467,18 +5531,31 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p ; EG-NEXT: ASHR * T0.Y, PV.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; -; GFX12-LABEL: constant_sextload_i16_to_i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: constant_sextload_i16_to_i64: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: constant_sextload_i16_to_i64: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_u16 v0, v2, s[2:3] +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-FAKE16-NEXT: s_endpgm %a = load i16, ptr addrspace(4) %in %ext = sext i16 %a to i64 store i64 %ext, ptr addrspace(1) %out @@ -5550,16 +5627,27 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX12-LABEL: constant_zextload_v1i16_to_v1i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v1, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v0, v1, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: constant_zextload_v1i16_to_v1i64: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3] +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-TRUE16-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: constant_zextload_v1i16_to_v1i64: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_u16 v0, v1, s[2:3] +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX12-FAKE16-NEXT: global_store_b64 v1, v[0:1], s[0:1] +; GFX12-FAKE16-NEXT: s_endpgm %load = load <1 x i16>, ptr addrspace(4) %in %ext = zext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(1) %out @@ -5633,18 +5721,31 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou ; EG-NEXT: ASHR * T0.Y, PV.X, literal.x, ; EG-NEXT: 31(4.344025e-44), 0(0.000000e+00) ; -; GFX12-LABEL: constant_sextload_v1i16_to_v1i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: v_mov_b32_e32 v2, 0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_u16 v0, v2, s[2:3] -; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: constant_sextload_v1i16_to_v1i64: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: global_load_d16_b16 v0, v2, s[2:3] +; GFX12-TRUE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-TRUE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: constant_sextload_v1i16_to_v1i64: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: global_load_u16 v0, v2, s[2:3] +; GFX12-FAKE16-NEXT: s_wait_loadcnt 0x0 +; GFX12-FAKE16-NEXT: v_bfe_i32 v0, v0, 0, 16 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX12-FAKE16-NEXT: global_store_b64 v2, v[0:1], s[0:1] +; GFX12-FAKE16-NEXT: s_endpgm %load = load <1 x i16>, ptr addrspace(4) %in %ext = sext <1 x i16> %load to <1 x i64> store <1 x i64> %ext, ptr addrspace(1) %out @@ -5724,20 +5825,36 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou ; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.y, ; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) ; -; GFX12-LABEL: constant_zextload_v2i16_to_v2i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b32 s2, s[2:3], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_and_b32 s3, 0xffff, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 -; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: constant_zextload_v2i16_to_v2i64: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s3, s2 +; GFX12-TRUE16-NEXT: s_pack_hl_b32_b16 s2, s2, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-TRUE16-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: constant_zextload_v2i16_to_v2i64: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_load_b32 s2, s[2:3], 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_and_b32 s3, 0xffff, s2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3 +; GFX12-FAKE16-NEXT: s_pack_hl_b32_b16 s2, s2, 0 +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1 +; GFX12-FAKE16-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-FAKE16-NEXT: s_endpgm %load = load <2 x i16>, ptr addrspace(4) %in %ext = zext <2 x i16> %load to <2 x i64> store <2 x i64> %ext, ptr addrspace(1) %out @@ -5947,26 +6064,47 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou ; EG-NEXT: LSHR * T8.X, PV.W, literal.x, ; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) ; -; GFX12-LABEL: constant_zextload_v4i16_to_v4i64: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_and_b32 s4, 0xffff, s2 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 -; GFX12-NEXT: s_pack_hl_b32_b16 s2, s2, 0 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1 -; GFX12-NEXT: s_pack_hl_b32_b16 s2, s3, 0 -; GFX12-NEXT: s_and_b32 s3, 0xffff, s3 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_mov_b32_e32 v0, s3 -; GFX12-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 -; GFX12-NEXT: s_endpgm +; GFX12-TRUE16-LABEL: constant_zextload_v4i16_to_v4i64: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-TRUE16-NEXT: s_mov_b32 s4, s2 +; GFX12-TRUE16-NEXT: s_pack_hl_b32_b16 s2, s2, 0 +; GFX12-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 +; GFX12-TRUE16-NEXT: s_and_b32 s4, 0xffff, s4 +; GFX12-TRUE16-NEXT: s_pack_hl_b32_b16 s2, s3, 0 +; GFX12-TRUE16-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v3, v1 +; GFX12-TRUE16-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX12-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-TRUE16-NEXT: s_wait_alu 0xfffe +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-TRUE16-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-TRUE16-NEXT: s_endpgm +; +; GFX12-FAKE16-LABEL: constant_zextload_v4i16_to_v4i64: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_load_b64 s[2:3], s[2:3], 0x0 +; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-FAKE16-NEXT: s_and_b32 s4, 0xffff, s2 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4 +; GFX12-FAKE16-NEXT: s_pack_hl_b32_b16 s2, s2, 0 +; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX12-FAKE16-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, v1 +; GFX12-FAKE16-NEXT: s_pack_hl_b32_b16 s2, s3, 0 +; GFX12-FAKE16-NEXT: s_and_b32 s3, 0xffff, s3 +; GFX12-FAKE16-NEXT: global_store_b128 v1, v[0:3], s[0:1] +; GFX12-FAKE16-NEXT: s_wait_alu 0xfffe +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s3 +; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-FAKE16-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16 +; GFX12-FAKE16-NEXT: s_endpgm %load = load <4 x i16>, ptr addrspace(4) %in %ext = zext <4 x i16> %load to <4 x i64> store <4 x i64> %ext, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll index 7c1da18de70f8..030c332850124 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll @@ -1,11 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SAFE %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,VI-SAFE %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SAFE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SAFE,GFX11-SAFE-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SAFE,GFX11-SAFE-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=CI,CI-NSZ %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=VI,VI-NSZ %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=GFX11,GFX11-NSZ %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=GFX11,GFX11-NSZ,GFX11-NSZ-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=GFX11,GFX11-NSZ,GFX11-NSZ-FAKE16 %s define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; CI-LABEL: add_select_fabs_fabs_f16: @@ -30,14 +32,41 @@ define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; VI-NEXT: v_add_f16_e64 v0, |v0|, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_fabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_fabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v3.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_fabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_fabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v3.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_fabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fabs.y = call half @llvm.fabs.f16(half %y) @@ -73,15 +102,45 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h ; VI-NEXT: v_add_f16_e64 v1, |v1|, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_multi_use_lhs_fabs_fabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_add_f16_e64 v1, |v1|, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_lhs_fabs_fabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e64 v1.l, |v1.l|, v3.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v4.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_multi_use_lhs_fabs_fabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e64 v1, |v1|, v3 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v4 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_multi_use_lhs_fabs_fabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e64 v1.l, |v1.l|, v3.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v4.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_multi_use_lhs_fabs_fabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e64 v1, |v1|, v3 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v4 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fabs.y = call half @llvm.fabs.f16(half %y) @@ -119,15 +178,45 @@ define { half, half } @add_select_multi_store_use_lhs_fabs_fabs_f16(i32 %c, half ; VI-NEXT: v_mov_b32_e32 v1, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v3.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v3.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fabs.y = call half @llvm.fabs.f16(half %y) @@ -165,15 +254,45 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h ; VI-NEXT: v_add_f16_e64 v1, |v2|, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_multi_use_rhs_fabs_fabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_add_f16_e64 v1, |v2|, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_rhs_fabs_fabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e64 v1.l, |v2.l|, v4.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v3.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_multi_use_rhs_fabs_fabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e64 v1, |v2|, v4 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_multi_use_rhs_fabs_fabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e64 v1.l, |v2.l|, v4.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v3.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_multi_use_rhs_fabs_fabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e64 v1, |v2|, v4 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fabs.y = call half @llvm.fabs.f16(half %y) @@ -209,15 +328,45 @@ define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) { ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_var_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_var_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_var_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_var_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %select = select i1 %cmp, half %fabs.x, half %y @@ -248,15 +397,45 @@ define half @add_select_fabs_negk_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_add_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_negk_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negk_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_negk_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negk_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_negk_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs = call half @llvm.fabs.f16(half %x) %select = select i1 %cmp, half %fabs, half -1.0 @@ -286,15 +465,45 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) { ; VI-NEXT: v_add_f16_e64 v0, |v0|, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_negk_negk_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo -; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negk_negk_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0xc000 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_negk_negk_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v2, 0xc000 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v1 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negk_negk_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0xc000 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_negk_negk_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v2, 0xc000 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v1 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, half -2.0, half -1.0 %fabs = call half @llvm.fabs.f16(half %select) @@ -323,15 +532,45 @@ define half @add_select_posk_posk_f16(i32 %c, half %x) { ; VI-NEXT: v_add_f16_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_posk_posk_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 0x4000 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_posk_posk_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x4000 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v2.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_posk_posk_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_posk_posk_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0x4000 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v2.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_posk_posk_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v2, 0x4000 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, half 2.0, half 1.0 %add = fadd half %select, %x @@ -361,15 +600,45 @@ define half @add_select_negk_fabs_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_add_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negk_fabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negk_fabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negk_fabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negk_fabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negk_fabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs = call half @llvm.fabs.f16(half %x) %select = select i1 %cmp, half -1.0, half %fabs @@ -401,15 +670,45 @@ define half @add_select_negliteralk_fabs_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_add_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negliteralk_fabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xe400, v1, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negliteralk_fabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xe400, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negliteralk_fabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xe400, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negliteralk_fabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xe400, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negliteralk_fabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xe400, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs = call half @llvm.fabs.f16(half %x) %select = select i1 %cmp, half -1024.0, half %fabs @@ -439,14 +738,41 @@ define half @add_select_fabs_posk_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_add_f16_e64 v0, |v0|, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_posk_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_posk_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_posk_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_posk_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_posk_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs = call half @llvm.fabs.f16(half %x) %select = select i1 %cmp, half %fabs, half 1.0 @@ -476,14 +802,41 @@ define half @add_select_posk_fabs_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_add_f16_e64 v0, |v0|, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_posk_fabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e64 v0, |v0|, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_posk_fabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_posk_fabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_posk_fabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e64 v0.l, |v0.l|, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_posk_fabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e64 v0, |v0|, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs = call half @llvm.fabs.f16(half %x) %select = select i1 %cmp, half 1.0, half %fabs @@ -514,14 +867,41 @@ define half @add_select_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) { ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_fneg_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_fneg_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_fneg_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_fneg_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_fneg_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %fneg.y = fneg half %y @@ -557,15 +937,45 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h ; VI-NEXT: v_sub_f16_e32 v1, v4, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_multi_use_lhs_fneg_fneg_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_sub_f16_e32 v1, v4, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_lhs_fneg_fneg_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v1.l, v4.l, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_multi_use_lhs_fneg_fneg_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v1, v4, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_multi_use_lhs_fneg_fneg_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v1.l, v4.l, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_multi_use_lhs_fneg_fneg_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v1, v4, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %fneg.y = fneg half %y @@ -603,15 +1013,45 @@ define { half, half } @add_select_multi_store_use_lhs_fneg_fneg_f16(i32 %c, half ; VI-NEXT: v_mov_b32_e32 v1, v4 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %fneg.y = fneg half %y @@ -649,15 +1089,45 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h ; VI-NEXT: v_sub_f16_e32 v1, v4, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_multi_use_rhs_fneg_fneg_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_sub_f16_e32 v1, v4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_rhs_fneg_fneg_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v1.l, v4.l, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_multi_use_rhs_fneg_fneg_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v1, v4, v2 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_multi_use_rhs_fneg_fneg_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v1.l, v4.l, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_multi_use_rhs_fneg_fneg_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v1, v4, v2 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %fneg.y = fneg half %y @@ -693,15 +1163,45 @@ define half @add_select_fneg_var_f16(i32 %c, half %x, half %y, half %z) { ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_var_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_var_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_var_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_var_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_var_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half %fneg.x, half %y @@ -731,14 +1231,41 @@ define half @add_select_fneg_negk_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_sub_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_negk_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_negk_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_negk_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_negk_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_negk_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half %fneg.x, half -1.0 @@ -769,14 +1296,41 @@ define half @add_select_fneg_inv2pi_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_sub_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_inv2pi_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xb118, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_inv2pi_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xb118, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_inv2pi_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xb118, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_inv2pi_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xb118, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_inv2pi_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xb118, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half %fneg.x, half 0xH3118 @@ -807,14 +1361,41 @@ define half @add_select_fneg_neginv2pi_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_sub_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_neginv2pi_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3118, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_neginv2pi_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3118, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_neginv2pi_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3118, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_neginv2pi_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3118, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_neginv2pi_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3118, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half %fneg.x, half 0xHB118 @@ -843,15 +1424,45 @@ define half @add_select_negk_negk_f16(i32 %c, half %x) { ; VI-NEXT: v_add_f16_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negk_negk_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negk_negk_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0xc000 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negk_negk_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v2, 0xc000 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negk_negk_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0xc000 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negk_negk_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v2, 0xc000 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, half -2.0, half -1.0 %add = fadd half %select, %x @@ -881,15 +1492,45 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) { ; VI-NEXT: v_add_f16_e32 v0, v0, v1 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negliteralk_negliteralk_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 0xe800 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xec00, v2, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negliteralk_negliteralk_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0xe800 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xec00, v2.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negliteralk_negliteralk_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v2, 0xe800 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xec00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negliteralk_negliteralk_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0xe800 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xec00, v2.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negliteralk_negliteralk_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v2, 0xe800 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xec00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, half -2048.0, half -4096.0 %add = fadd half %select, %x @@ -917,15 +1558,45 @@ define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) { ; VI-NEXT: v_sub_f16_e32 v0, v1, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_negk_negk_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v2, 0xc000 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo -; GFX11-NEXT: v_sub_f16_e32 v0, v1, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_negk_negk_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0xc000 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v1.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_negk_negk_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v2, 0xc000 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v1, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_negk_negk_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0xc000 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v1.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_negk_negk_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v2, 0xc000 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v1, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %select = select i1 %cmp, half -2.0, half -1.0 %fneg.x = fneg half %select @@ -955,14 +1626,41 @@ define half @add_select_negk_fneg_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_sub_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negk_fneg_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negk_fneg_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negk_fneg_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negk_fneg_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negk_fneg_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half -1.0, half %fneg.x @@ -992,14 +1690,41 @@ define half @add_select_fneg_posk_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_sub_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_posk_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_posk_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_posk_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_posk_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_posk_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half %fneg.x, half 1.0 @@ -1029,14 +1754,41 @@ define half @add_select_posk_fneg_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_sub_f16_e32 v0, v2, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_posk_fneg_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_sub_f16_e32 v0, v2, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_posk_fneg_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_posk_fneg_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_posk_fneg_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v2.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_posk_fneg_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v2, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fneg half %x %select = select i1 %cmp, half 1.0, half %fneg.x @@ -1069,16 +1821,49 @@ define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) { ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negfabs_fabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negfabs_fabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v1.l +; GFX11-SAFE-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negfabs_fabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negfabs_fabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v1.l +; GFX11-NSZ-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negfabs_fabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.fabs.x = fsub half -0.000000e+00, %fabs.x @@ -1113,16 +1898,49 @@ define half @add_select_fabs_negfabs_f16(i32 %c, half %x, half %y, half %z) { ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_negfabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_or_b32_e32 v2, 0x8000, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negfabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-SAFE-TRUE16-NEXT: v_or_b16 v0.h, 0x8000, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_negfabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v2, 0x8000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negfabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-NSZ-TRUE16-NEXT: v_or_b16 v0.h, 0x8000, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_negfabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v2, 0x8000, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fabs.y = call half @llvm.fabs.f16(half %y) @@ -1157,16 +1975,49 @@ define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) { ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_neg_fabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_neg_fabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l +; GFX11-SAFE-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_neg_fabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_neg_fabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l +; GFX11-NSZ-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_neg_fabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fsub half -0.000000e+00, %x %fabs.y = call half @llvm.fabs.f16(half %y) @@ -1200,16 +2051,49 @@ define half @add_select_fabs_neg_f16(i32 %c, half %x, half %y, half %z) { ; VI-NEXT: v_add_f16_e32 v0, v0, v3 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_neg_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_xor_b32_e32 v2, 0x8000, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_add_f16_e32 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_neg_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_neg_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_neg_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-NSZ-TRUE16-NEXT: v_xor_b16 v0.h, 0x8000, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v3.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_neg_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v2, 0x8000, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_add_f16_e32 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.y = fsub half -0.000000e+00, %y @@ -1242,15 +2126,45 @@ define half @add_select_neg_negfabs_f16(i32 %c, half %x, half %y, half %z) { ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_neg_negfabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_neg_negfabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_neg_negfabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_neg_negfabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_neg_negfabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fneg.x = fsub half -0.000000e+00, %x %fabs.y = call half @llvm.fabs.f16(half %y) @@ -1284,15 +2198,45 @@ define half @add_select_negfabs_neg_f16(i32 %c, half %x, half %y, half %z) { ; VI-NEXT: v_sub_f16_e32 v0, v3, v0 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negfabs_neg_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo -; GFX11-NEXT: v_sub_f16_e32 v0, v3, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negfabs_neg_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negfabs_neg_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negfabs_neg_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v2.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, v3.l, v0.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negfabs_neg_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v0, v3, v0 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.fabs.x = fsub half -0.000000e+00, %fabs.x @@ -1325,15 +2269,45 @@ define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: mul_select_negfabs_posk_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo -; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: mul_select_negfabs_posk_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4400, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: mul_select_negfabs_posk_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: mul_select_negfabs_posk_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4400, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: mul_select_negfabs_posk_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.fabs.x = fsub half -0.000000e+00, %fabs.x @@ -1365,15 +2339,45 @@ define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: mul_select_posk_negfabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo -; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: mul_select_posk_negfabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4400, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: mul_select_posk_negfabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: mul_select_posk_negfabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4400, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: mul_select_posk_negfabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4400, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.fabs.x = fsub half -0.000000e+00, %fabs.x @@ -1405,15 +2409,45 @@ define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: mul_select_negfabs_negk_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo -; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: mul_select_negfabs_negk_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xc400, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: mul_select_negfabs_negk_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: mul_select_negfabs_negk_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xc400, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: mul_select_negfabs_negk_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.fabs.x = fsub half -0.000000e+00, %fabs.x @@ -1445,15 +2479,45 @@ define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) { ; VI-NEXT: v_mul_f16_e32 v0, v0, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: mul_select_negk_negfabs_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v1, 0x8000, v1 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo -; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: mul_select_negk_negfabs_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xc400, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: mul_select_negk_negfabs_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: mul_select_negk_negfabs_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xc400, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: mul_select_negk_negfabs_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xc400, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fabs.x = call half @llvm.fabs.f16(half %x) %fneg.fabs.x = fsub half -0.000000e+00, %fabs.x @@ -1487,15 +2551,25 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SAFE-LABEL: select_fneg_posk_src_add_f16: -; GFX11-SAFE: ; %bb.0: -; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_add_f16_e32 v1, 4.0, v1 -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo -; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_add_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v1.l, 4.0, v1.l +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_add_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v1, 4.0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_add_f16: ; CI-NSZ: ; %bb.0: @@ -1516,14 +2590,23 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-NSZ-LABEL: select_fneg_posk_src_add_f16: -; GFX11-NSZ: ; %bb.0: -; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NSZ-NEXT: v_sub_f16_e32 v1, -4.0, v1 -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo -; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_add_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, -4.0, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_add_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v1, -4.0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %add = fadd half %x, 4.0 %fneg = fneg half %add @@ -1552,15 +2635,25 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SAFE-LABEL: select_fneg_posk_src_sub_f16: -; GFX11-SAFE: ; %bb.0: -; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_add_f16_e32 v1, -4.0, v1 -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo -; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_sub_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v1.l +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_sub_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_sub_f16: ; CI-NSZ: ; %bb.0: @@ -1581,14 +2674,23 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-NSZ-LABEL: select_fneg_posk_src_sub_f16: -; GFX11-NSZ: ; %bb.0: -; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NSZ-NEXT: v_sub_f16_e32 v1, 4.0, v1 -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo -; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_sub_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_sub_f16_e32 v0.l, 4.0, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_sub_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_sub_f16_e32 v1, 4.0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %add = fsub half %x, 4.0 %fneg = fneg half %add @@ -1616,14 +2718,41 @@ define half @select_fneg_posk_src_mul_f16(i32 %c, half %x) { ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: select_fneg_posk_src_mul_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mul_f16_e32 v1, -4.0, v1 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_mul_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, -4.0, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_mul_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_mul_f16_e32 v1, -4.0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_mul_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, -4.0, v1.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_mul_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_mul_f16_e32 v1, -4.0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %mul = fmul half %x, 4.0 %fneg = fneg half %mul @@ -1654,15 +2783,25 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) { ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SAFE-LABEL: select_fneg_posk_src_fma_f16: -; GFX11-SAFE: ; %bb.0: -; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, 4.0, v1 -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo -; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_fma_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 4.0, v1.l +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_fma_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_fmac_f16_e32 v2, 4.0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_fma_f16: ; CI-NSZ: ; %bb.0: @@ -1685,14 +2824,23 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) { ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-NSZ-LABEL: select_fneg_posk_src_fma_f16: -; GFX11-NSZ: ; %bb.0: -; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NSZ-NEXT: v_fma_f16 v1, v1, -4.0, -v2 -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo -; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_fma_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v1.l, -4.0, -v2.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_fma_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v1, v1, -4.0, -v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fma = call half @llvm.fma.f16(half %x, half 4.0, half %z) %fneg = fneg half %fma @@ -1724,15 +2872,25 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) { ; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-SAFE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SAFE-LABEL: select_fneg_posk_src_fmad_f16: -; GFX11-SAFE: ; %bb.0: -; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_fmac_f16_e32 v2, 4.0, v1 -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo -; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_fmad_f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 4.0, v1.l +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_fmad_f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_fmac_f16_e32 v2, 4.0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_fmad_f16: ; CI-NSZ: ; %bb.0: @@ -1756,14 +2914,23 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) { ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc ; VI-NSZ-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-NSZ-LABEL: select_fneg_posk_src_fmad_f16: -; GFX11-NSZ: ; %bb.0: -; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NSZ-NEXT: v_fma_f16 v1, v1, -4.0, -v2 -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo -; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_fmad_f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_fma_f16 v0.l, v1.l, -4.0, -v2.l +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_fmad_f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_fma_f16 v1, v1, -4.0, -v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq i32 %c, 0 %fmad = call half @llvm.fmuladd.f16(half %x, half 4.0, half %z) %fneg = fneg half %fmad @@ -1776,3 +2943,7 @@ declare half @llvm.fma.f16(half, half, half) #0 declare half @llvm.fmuladd.f16(half, half, half) #0 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} +; GFX11-NSZ: {{.*}} +; GFX11-SAFE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index d2bb971b68030..82d706f073258 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -2,12 +2,14 @@ ; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefixes=CI,CI-SAFE %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=VI,VI-SAFE %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9,GFX9-SAFE %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11,GFX11-SAFE %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SAFE,GFX11-SAFE-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SAFE,GFX11-SAFE-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=CI,CI-NSZ %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=VI,VI-NSZ %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=GFX9,GFX9-NSZ %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=GFX11,GFX11-NSZ %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=GFX11,GFX11-NSZ,GFX11-NSZ-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -enable-no-signed-zeros-fp-math < %s | FileCheck -check-prefixes=GFX11,GFX11-NSZ,GFX11-NSZ-FAKE16 %s define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x half> %y, <2 x half> %z) { ; CI-LABEL: add_select_fabs_fabs_v2f16: @@ -65,23 +67,75 @@ define <2 x half> @add_select_fabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_fabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_fabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, v4 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_fabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_fabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, v4 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_fabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fabs.y = call <2 x half> @llvm.fabs.v2f16(<2 x half> %y) @@ -156,24 +210,79 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fabs_fabs_v2f16(<2 x ; GFX9-NEXT: v_pk_add_f16 v1, v2, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v1, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_add_f16 v0, v0, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v1, v2, v4 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v3, v5 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v1, v2, v4 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v5 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v1, v2, v4 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v3, v5 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_multi_use_lhs_fabs_fabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v1, v2, v4 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v5 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fabs.y = call <2 x half> @llvm.fabs.v2f16(<2 x half> %y) @@ -243,24 +352,79 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fabs_fabs_v2f1 ; GFX9-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_mov_b32_e32 v1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v3, v4 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v3.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v3.h, v1.l, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v3, v4 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_multi_store_use_lhs_fabs_fabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v1, v2 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fabs.y = call <2 x half> @llvm.fabs.v2f16(<2 x half> %y) @@ -337,24 +501,79 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fabs_fabs_v2f16(<2 x ; GFX9-NEXT: v_pk_add_f16 v1, v3, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v1, v3, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v1, v3, v5 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, v4 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v1, v3, v5 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v1, v3, v5 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, v4 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_multi_use_rhs_fabs_fabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v1, v3, v5 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fabs.y = call <2 x half> @llvm.fabs.v2f16(<2 x half> %y) @@ -420,21 +639,67 @@ define <2 x half> @add_select_fabs_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_var_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_var_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_var_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_var_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_var_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %select = select <2 x i1> %cmp, <2 x half> %fabs.x, <2 x half> %y @@ -492,20 +757,63 @@ define <2 x half> @add_select_fabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_negk_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negk_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, 0xbc00, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, 0xbc00, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v1, v3 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_negk_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negk_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.l, 0xbc00, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.h, 0xbc00, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v1, v3 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_negk_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %select = select <2 x i1> %cmp, <2 x half> %fabs, <2 x half> @@ -562,20 +870,67 @@ define <2 x half> @add_select_fabs_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) ; GFX9-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_negk_negk_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, 0xc000 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 -; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negk_negk_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc000 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v3.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_negk_negk_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v3, 0xc000 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negk_negk_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc000 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v3.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_negk_negk_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v3, 0xc000 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %select = select <2 x i1> %cmp, <2 x half> , <2 x half> %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %select) @@ -626,19 +981,61 @@ define <2 x half> @add_select_posk_posk_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX9-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_posk_posk_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, 0x4000 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_posk_posk_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4000 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v3.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x3c00, v3.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_posk_posk_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v3, 0x4000 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_posk_posk_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0x4000 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v3.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x3c00, v3.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_posk_posk_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v3, 0x4000 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %select = select <2 x i1> %cmp, <2 x half> , <2 x half> %add = fadd <2 x half> %select, %x @@ -695,20 +1092,63 @@ define <2 x half> @add_select_negk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negk_fabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negk_fabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, 0xbc00, v2.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, 0xbc00, v0.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v1, v3 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negk_fabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negk_fabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.l, 0xbc00, v2.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.h, 0xbc00, v0.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v1, v3 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negk_fabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %select = select <2 x i1> %cmp, <2 x half> , <2 x half> %fabs @@ -767,20 +1207,63 @@ define <2 x half> @add_select_negliteralk_fabs_v2f16(<2 x i32> %c, <2 x half> %x ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negliteralk_fabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xe400, v2, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xe400, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negliteralk_fabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, 0xe400, v2.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, 0xe400, v0.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v1, v3 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negliteralk_fabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xe400, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xe400, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negliteralk_fabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.l, 0xe400, v2.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.h, 0xe400, v0.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v1, v3 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negliteralk_fabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xe400, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xe400, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %select = select <2 x i1> %cmp, <2 x half> , <2 x half> %fabs @@ -838,20 +1321,63 @@ define <2 x half> @add_select_fabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_posk_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_posk_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3c00, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x3c00, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v1, v3 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_posk_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_posk_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3c00, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x3c00, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v1, v3 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_posk_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %select = select <2 x i1> %cmp, <2 x half> %fabs, <2 x half> @@ -909,20 +1435,63 @@ define <2 x half> @add_select_posk_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_pk_add_f16 v0, v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_posk_fabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_posk_fabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3c00, v2.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x3c00, v0.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v1, v3 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_posk_fabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_posk_fabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3c00, v2.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x3c00, v0.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v1, v3 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_posk_fabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %select = select <2 x i1> %cmp, <2 x half> , <2 x half> %fabs @@ -982,20 +1551,63 @@ define <2 x half> @add_select_fneg_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_fneg_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_fneg_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_fneg_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_fneg_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_fneg_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %fneg.y = fneg <2 x half> %y @@ -1066,21 +1678,67 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_lhs_fneg_fneg_v2f16(<2 x ; GFX9-NEXT: v_pk_add_f16 v1, v5, v2 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v1, v5, v2 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v1, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v1, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v1, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_multi_use_lhs_fneg_fneg_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v1, v5, v2 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %fneg.y = fneg <2 x half> %y @@ -1150,21 +1808,67 @@ define { <2 x half>, <2 x half> } @add_select_multi_store_use_lhs_fneg_fneg_v2f1 ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_multi_store_use_lhs_fneg_fneg_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v1, 0x80008000, v2 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %fneg.y = fneg <2 x half> %y @@ -1237,21 +1941,67 @@ define { <2 x half>, <2 x half> } @add_select_multi_use_rhs_fneg_fneg_v2f16(<2 x ; GFX9-NEXT: v_pk_add_f16 v1, v5, v3 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v1, v5, v3 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v1, v5, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v1, v5, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, v6.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v1, v5, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_multi_use_rhs_fneg_fneg_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v7, v6, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v1, v5, v3 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %fneg.y = fneg <2 x half> %y @@ -1321,21 +2071,67 @@ define <2 x half> @add_select_fneg_var_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_var_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_var_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_var_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_var_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, v1.l, v5.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_var_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v6, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %select = select <2 x i1> %cmp, <2 x half> %fneg.x, <2 x half> %y @@ -1391,19 +2187,59 @@ define <2 x half> @add_select_fneg_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_negk_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_negk_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x3c00, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_negk_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_negk_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x3c00, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_negk_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %select = select <2 x i1> %cmp, <2 x half> %fneg.x, <2 x half> @@ -1460,19 +2296,59 @@ define <2 x half> @add_select_fneg_inv2pi_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_inv2pi_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xb118, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xb118, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_inv2pi_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xb118, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xb118, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_inv2pi_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xb118, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xb118, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_inv2pi_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xb118, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xb118, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_inv2pi_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xb118, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xb118, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %select = select <2 x i1> %cmp, <2 x half> %fneg.x, <2 x half> @@ -1529,19 +2405,59 @@ define <2 x half> @add_select_fneg_neginv2pi_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_neginv2pi_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3118, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3118, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_neginv2pi_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x3118, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3118, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_neginv2pi_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3118, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3118, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_neginv2pi_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x3118, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3118, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_neginv2pi_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3118, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3118, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %select = select <2 x i1> %cmp, <2 x half> %fneg.x, <2 x half> @@ -1592,19 +2508,61 @@ define <2 x half> @add_select_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX9-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negk_negk_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, 0xc000 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negk_negk_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc000 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v3.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negk_negk_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v3, 0xc000 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negk_negk_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc000 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v3.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negk_negk_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v3, 0xc000 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %select = select <2 x i1> %cmp, <2 x half> , <2 x half> %add = fadd <2 x half> %select, %x @@ -1656,19 +2614,61 @@ define <2 x half> @add_select_negliteralk_negliteralk_v2f16(<2 x i32> %c, <2 x h ; GFX9-NEXT: v_pk_add_f16 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negliteralk_negliteralk_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, 0xe800 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xec00, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xec00, v3, vcc_lo -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v0, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negliteralk_negliteralk_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xe800 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xec00, v3.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xec00, v3.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negliteralk_negliteralk_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v3, 0xe800 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xec00, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xec00, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negliteralk_negliteralk_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xe800 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xec00, v3.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xec00, v3.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negliteralk_negliteralk_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v3, 0xe800 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xec00, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xec00, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %select = select <2 x i1> %cmp, <2 x half> , <2 x half> %add = fadd <2 x half> %select, %x @@ -1718,19 +2718,61 @@ define <2 x half> @add_select_fneg_negk_negk_v2f16(<2 x i32> %c, <2 x half> %x) ; GFX9-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_negk_negk_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v3, 0xc000 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_negk_negk_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc000 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v3.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_negk_negk_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_mov_b32_e32 v3, 0xc000 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_negk_negk_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0xc000 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v3.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v3.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pack_b32_f16 v0, v0.h, v0.l +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_negk_negk_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_mov_b32_e32 v3, 0xc000 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v2, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %select = select <2 x i1> %cmp, <2 x half> , <2 x half> %fneg.x = fneg <2 x half> %select @@ -1786,19 +2828,59 @@ define <2 x half> @add_select_negk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negk_fneg_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negk_fneg_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x3c00, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negk_fneg_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negk_fneg_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x3c00, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3c00, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negk_fneg_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3c00, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3c00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %select = select <2 x i1> %cmp, <2 x half> , <2 x half> %fneg.x @@ -1854,19 +2936,59 @@ define <2 x half> @add_select_fneg_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fneg_posk_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fneg_posk_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fneg_posk_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fneg_posk_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fneg_posk_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %select = select <2 x i1> %cmp, <2 x half> %fneg.x, <2 x half> @@ -1922,19 +3044,59 @@ define <2 x half> @add_select_posk_fneg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x ; GFX9-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_posk_fneg_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_posk_fneg_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_posk_fneg_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_posk_fneg_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0xbc00, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0xbc00, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_posk_fneg_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xbc00, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xbc00, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v3, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %select = select <2 x i1> %cmp, <2 x half> , <2 x half> %fneg.x @@ -2002,23 +3164,75 @@ define <2 x half> @add_select_negfabs_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negfabs_fabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negfabs_fabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, v4 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negfabs_fabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negfabs_fabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, v4 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negfabs_fabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fneg.fabs.x = fneg <2 x half> %fabs.x @@ -2088,23 +3302,75 @@ define <2 x half> @add_select_fabs_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_negfabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v3, 0x80008000, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_negfabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v3, 0x80008000, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, v4 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_negfabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v3, 0x80008000, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_negfabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v3, 0x80008000, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, v4 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_negfabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v3, 0x80008000, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fabs.y = call <2 x half> @llvm.fabs.v2f16(<2 x half> %y) @@ -2174,23 +3440,75 @@ define <2 x half> @add_select_neg_fabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_neg_fabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_neg_fabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, v4 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_neg_fabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_neg_fabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, v4 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_neg_fabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %fabs.y = call <2 x half> @llvm.fabs.v2f16(<2 x half> %y) @@ -2259,23 +3577,75 @@ define <2 x half> @add_select_fabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 x h ; GFX9-NEXT: v_pk_add_f16 v0, v0, v4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_fabs_neg_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pk_add_f16 v0, v0, v4 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_fabs_neg_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v2, v4 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_fabs_neg_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_fabs_neg_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v2.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v2.h, v1.l, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v2, v4 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_fabs_neg_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_xor_b32_e32 v3, 0x80008000, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v0, v4 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fneg.y = fneg <2 x half> %y @@ -2338,21 +3708,67 @@ define <2 x half> @add_select_neg_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_neg_negfabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_neg_negfabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_neg_negfabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_neg_negfabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v3.l, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_neg_negfabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v3 +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fneg.x = fneg <2 x half> %x %fabs.y = call <2 x half> @llvm.fabs.v2f16(<2 x half> %y) @@ -2416,21 +3832,67 @@ define <2 x half> @add_select_negfabs_neg_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: add_select_negfabs_neg_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: add_select_negfabs_neg_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v3.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: add_select_negfabs_neg_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: add_select_negfabs_neg_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v3.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, v5.l, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: add_select_negfabs_neg_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v3 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v2 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v0, v4, v0 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fneg.fabs.x = fneg <2 x half> %fabs.x @@ -2494,20 +3956,63 @@ define <2 x half> @mul_select_negfabs_posk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: mul_select_negfabs_posk_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_mul_f16 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: mul_select_negfabs_posk_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x4400, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x4400, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_pk_mul_f16 v0, v1, v3 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: mul_select_negfabs_posk_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4400, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_mul_f16 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: mul_select_negfabs_posk_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x4400, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x4400, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_pk_mul_f16 v0, v1, v3 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: mul_select_negfabs_posk_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4400, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_mul_f16 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fneg.fabs.x = fneg <2 x half> %fabs.x @@ -2570,20 +4075,63 @@ define <2 x half> @mul_select_posk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: mul_select_posk_negfabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4400, v2, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_mul_f16 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: mul_select_posk_negfabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x4400, v2.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x4400, v0.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_mul_f16 v0, v1, v3 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: mul_select_posk_negfabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4400, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_mul_f16 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: mul_select_posk_negfabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x4400, v2.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.h, 0x4400, v0.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_mul_f16 v0, v1, v3 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: mul_select_posk_negfabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4400, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4400, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_mul_f16 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fneg.fabs.x = fneg <2 x half> %fabs.x @@ -2646,20 +4194,63 @@ define <2 x half> @mul_select_negfabs_negk_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: mul_select_negfabs_negk_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xc400, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_mul_f16 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: mul_select_negfabs_negk_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, 0xc400, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, 0xc400, v0.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_pk_mul_f16 v0, v1, v3 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: mul_select_negfabs_negk_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xc400, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xc400, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_mul_f16 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: mul_select_negfabs_negk_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.l, 0xc400, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.h, 0xc400, v0.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_pk_mul_f16 v0, v1, v3 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: mul_select_negfabs_negk_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xc400, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xc400, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_mul_f16 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fneg.fabs.x = fneg <2 x half> %fabs.x @@ -2722,20 +4313,63 @@ define <2 x half> @mul_select_negk_negfabs_v2f16(<2 x i32> %c, <2 x half> %x, <2 ; GFX9-NEXT: v_pk_mul_f16 v0, v0, v3 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: mul_select_negk_negfabs_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_or_b32_e32 v2, 0x80008000, v2 -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0xc400, v2, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0xc400, v4, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: v_pk_mul_f16 v0, v0, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: mul_select_negk_negfabs_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.l, 0xc400, v2.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v1.h, 0xc400, v0.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_pk_mul_f16 v0, v1, v3 +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: mul_select_negk_negfabs_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xc400, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xc400, v4, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: v_pk_mul_f16 v0, v0, v3 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: mul_select_negk_negfabs_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.l, 0xc400, v2.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v1.h, 0xc400, v0.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-TRUE16-NEXT: v_pk_mul_f16 v0, v1, v3 +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: mul_select_negk_negfabs_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_or_b32_e32 v2, 0x80008000, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0xc400, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0xc400, v4, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: v_pk_mul_f16 v0, v0, v3 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fabs.x = call <2 x half> @llvm.fabs.v2f16(<2 x half> %x) %fneg.fabs.x = fneg <2 x half> %fabs.x @@ -2805,20 +4439,34 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SAFE-LABEL: select_fneg_posk_src_add_v2f16: -; GFX11-SAFE: ; %bb.0: -; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-SAFE-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_add_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_add_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_add_v2f16: ; CI-NSZ: ; %bb.0: @@ -2864,19 +4512,32 @@ define <2 x half> @select_fneg_posk_src_add_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-NSZ-LABEL: select_fneg_posk_src_add_v2f16: -; GFX11-NSZ: ; %bb.0: -; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NSZ-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_add_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_add_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %add = fadd <2 x half> %x, %fneg = fneg <2 x half> %add @@ -2941,20 +4602,34 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SAFE-LABEL: select_fneg_posk_src_sub_v2f16: -; GFX11-SAFE: ; %bb.0: -; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-SAFE-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_sub_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_sub_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_sub_v2f16: ; CI-NSZ: ; %bb.0: @@ -3000,19 +4675,32 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-NSZ-LABEL: select_fneg_posk_src_sub_v2f16: -; GFX11-NSZ: ; %bb.0: -; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NSZ-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_sub_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_sub_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %add = fsub <2 x half> %x, %fneg = fneg <2 x half> %add @@ -3065,19 +4753,59 @@ define <2 x half> @select_fneg_posk_src_mul_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX9-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: select_fneg_posk_src_mul_v2f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0] -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_mul_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_mul_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_mul_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0] +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_mul_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_pk_mul_f16 v2, v2, -4.0 op_sel_hi:[1,0] +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %mul = fmul <2 x half> %x, %fneg = fneg <2 x half> %mul @@ -3148,20 +4876,34 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SAFE-LABEL: select_fneg_posk_src_fma_v2f16: -; GFX11-SAFE: ; %bb.0: -; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-SAFE-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_fma_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_fma_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; VI-NSZ-LABEL: select_fneg_posk_src_fma_v2f16: ; VI-NSZ: ; %bb.0: @@ -3193,19 +4935,32 @@ define <2 x half> @select_fneg_posk_src_fma_v2f16(<2 x i32> %c, <2 x half> %x, < ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-NSZ-LABEL: select_fneg_posk_src_fma_v2f16: -; GFX11-NSZ: ; %bb.0: -; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NSZ-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_fma_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_fma_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %x, <2 x half> , <2 x half> %z) %fneg = fneg <2 x half> %fma @@ -3278,20 +5033,34 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX9-SAFE-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-SAFE-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SAFE-LABEL: select_fneg_posk_src_fmad_v2f16: -; GFX11-SAFE: ; %bb.0: -; GFX11-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; GFX11-SAFE-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-SAFE-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-SAFE-NEXT: s_setpc_b64 s[30:31] +; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_fmad_v2f16: +; GFX11-SAFE-TRUE16: ; %bb.0: +; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-TRUE16-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v2.l, s0 +; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v1.l, vcc_lo +; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_fmad_v2f16: +; GFX11-SAFE-FAKE16: ; %bb.0: +; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SAFE-FAKE16-NEXT: v_pk_fma_f16 v2, v2, 4.0, v3 op_sel_hi:[1,0,1] +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-SAFE-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-SAFE-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; CI-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: ; CI-NSZ: ; %bb.0: @@ -3344,19 +5113,32 @@ define <2 x half> @select_fneg_posk_src_fmad_v2f16(<2 x i32> %c, <2 x half> %x, ; GFX9-NSZ-NEXT: v_perm_b32 v0, v1, v0, s4 ; GFX9-NSZ-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-NSZ-LABEL: select_fneg_posk_src_fmad_v2f16: -; GFX11-NSZ: ; %bb.0: -; GFX11-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NSZ-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-NSZ-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo -; GFX11-NSZ-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-NSZ-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-NSZ-NEXT: s_setpc_b64 s[30:31] +; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_fmad_v2f16: +; GFX11-NSZ-TRUE16: ; %bb.0: +; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-TRUE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v2.l, s0 +; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v1.l, vcc_lo +; GFX11-NSZ-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-NSZ-FAKE16-LABEL: select_fneg_posk_src_fmad_v2f16: +; GFX11-NSZ-FAKE16: ; %bb.0: +; GFX11-NSZ-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NSZ-FAKE16-NEXT: v_pk_fma_f16 v2, v2, -4.0, v3 op_sel_hi:[1,0,1] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-NSZ-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX11-NSZ-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-NSZ-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NSZ-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-NSZ-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer %fmad = call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %x, <2 x half> , <2 x half> %z) %fneg = fneg <2 x half> %fmad @@ -3369,3 +5151,7 @@ declare <2 x half> @llvm.fma.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #0 attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} +; GFX11-NSZ: {{.*}} +; GFX11-SAFE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index e920fdee51815..5ae6b1d78b70e 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI ; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI -; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-TRUE16 +; RUN: llc < %s -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 define amdgpu_kernel void @select_f16( ; SI-LABEL: select_f16: @@ -81,42 +82,81 @@ define amdgpu_kernel void @select_f16( ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: select_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s18, s6 -; GFX11-NEXT: s_mov_b32 s19, s7 -; GFX11-NEXT: s_mov_b32 s22, s6 -; GFX11-NEXT: s_mov_b32 s23, s7 -; GFX11-NEXT: s_mov_b32 s26, s6 -; GFX11-NEXT: s_mov_b32 s27, s7 -; GFX11-NEXT: s_mov_b32 s2, s6 -; GFX11-NEXT: s_mov_b32 s3, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s16, s10 -; GFX11-NEXT: s_mov_b32 s17, s11 -; GFX11-NEXT: s_mov_b32 s20, s12 -; GFX11-NEXT: s_mov_b32 s21, s13 -; GFX11-NEXT: s_mov_b32 s24, s14 -; GFX11-NEXT: s_mov_b32 s25, s15 -; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v2, off, s[24:27], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s4, s8 -; GFX11-NEXT: s_mov_b32 s5, s9 -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: select_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s26, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s27, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s20, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s21, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s24, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s25, s15 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v3, off, s[24:27], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v2, off, s[0:3], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9 +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v3.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: select_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s19, s7 +; GFX11-FAKE16-NEXT: s_mov_b32 s22, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s23, s7 +; GFX11-FAKE16-NEXT: s_mov_b32 s26, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s27, s7 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s16, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s17, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s20, s12 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s13 +; GFX11-FAKE16-NEXT: s_mov_b32 s24, s14 +; GFX11-FAKE16-NEXT: s_mov_b32 s25, s15 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v2, off, s[24:27], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v3, off, s[0:3], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s8 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9 +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -201,36 +241,69 @@ define amdgpu_kernel void @select_f16_imm_a( ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: select_f16_imm_a: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s18, s10 -; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: select_f16_imm_a: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s5 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s20, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s21, s7 +; GFX11-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: select_f16_imm_a: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, ptr addrspace(1) %c, @@ -312,36 +385,69 @@ define amdgpu_kernel void @select_f16_imm_b( ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: select_f16_imm_b: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s18, s10 -; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: select_f16_imm_b: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s5 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s20, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s21, s7 +; GFX11-TRUE16-NEXT: buffer_load_u16 v2, off, s[16:19], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v1.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: select_f16_imm_b: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %c, @@ -424,36 +530,67 @@ define amdgpu_kernel void @select_f16_imm_c( ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: select_f16_imm_c: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s18, s10 -; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: select_f16_imm_c: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s5 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v2, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3800, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: select_f16_imm_c: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -536,36 +673,67 @@ define amdgpu_kernel void @select_f16_imm_d( ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: select_f16_imm_d: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s18, s10 -; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: select_f16_imm_d: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s5 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v2, off, s[12:15], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3800, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: select_f16_imm_d: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[16:19], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v2, off, s[20:23], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -679,50 +847,97 @@ define amdgpu_kernel void @select_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: select_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x44 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s6, s2 -; GFX11-NEXT: s_mov_b32 s7, s3 -; GFX11-NEXT: s_mov_b32 s22, s2 -; GFX11-NEXT: s_mov_b32 s23, s3 -; GFX11-NEXT: s_mov_b32 s18, s2 -; GFX11-NEXT: s_mov_b32 s19, s3 -; GFX11-NEXT: s_mov_b32 s26, s2 -; GFX11-NEXT: s_mov_b32 s27, s3 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s20, s12 -; GFX11-NEXT: s_mov_b32 s21, s13 -; GFX11-NEXT: s_mov_b32 s16, s10 -; GFX11-NEXT: s_mov_b32 s17, s11 -; GFX11-NEXT: s_mov_b32 s24, s14 -; GFX11-NEXT: s_mov_b32 s25, s15 -; GFX11-NEXT: buffer_load_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 -; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 -; GFX11-NEXT: buffer_load_b32 v3, off, s[24:27], 0 -; GFX11-NEXT: s_mov_b32 s0, s8 -; GFX11-NEXT: s_mov_b32 s1, s9 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v2, v1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v6, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0 -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: select_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x44 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s26, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s27, s3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s20, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s21, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s11 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[20:23], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s24, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s25, s15 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v3, off, s[24:27], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s9 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v3 +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v5.l, v4.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v3.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v1.l, v6.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: select_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x44 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s22, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s23, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s19, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s26, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s27, s3 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s20, s12 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s13 +; GFX11-FAKE16-NEXT: s_mov_b32 s16, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s17, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s24, s14 +; GFX11-FAKE16-NEXT: s_mov_b32 s25, s15 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[20:23], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[16:19], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v3, off, s[24:27], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, s8 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, s9 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v2, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v6, v5 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_dual_cndmask_b32 v1, v4, v7 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -825,45 +1040,85 @@ define amdgpu_kernel void @select_v2f16_imm_a( ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: select_v2f16_imm_a: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s18, s10 -; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 -; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0x3900, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: select_v2f16_imm_a: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, 0x3900, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v4.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: select_v2f16_imm_a: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0.5, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, 0x3900, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %b, ptr addrspace(1) %c, @@ -964,45 +1219,85 @@ define amdgpu_kernel void @select_v2f16_imm_b( ; VI-NEXT: buffer_store_dword v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: select_v2f16_imm_b: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s18, s10 -; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[12:15], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[16:19], 0 -; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo -; GFX11-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0x3900, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: select_v2f16_imm_b: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0x3900, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v4.l, s0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: select_v2f16_imm_b: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[16:19], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0.5, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0x3900, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %c, @@ -1105,45 +1400,85 @@ define amdgpu_kernel void @select_v2f16_imm_c( ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: select_v2f16_imm_c: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s18, s10 -; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo -; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: select_v2f16_imm_c: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_nlt_f16_e64 s0, v4.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3800, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3900, v1.l, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: select_v2f16_imm_c: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -1246,45 +1581,85 @@ define amdgpu_kernel void @select_v2f16_imm_d( ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: select_v2f16_imm_d: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s18, s10 -; GFX11-NEXT: s_mov_b32 s19, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s22, s10 -; GFX11-NEXT: s_mov_b32 s23, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s16, s4 -; GFX11-NEXT: s_mov_b32 s17, s5 -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: s_mov_b32 s20, s6 -; GFX11-NEXT: s_mov_b32 s21, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo -; GFX11-NEXT: v_cmp_lt_f16_e32 vcc_lo, v4, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: select_v2f16_imm_d: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s3 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s20, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s21, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s5 +; GFX11-TRUE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1.l, v0.l +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cmp_lt_f16_e64 s0, v4.l, v3.l +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x3800, v2.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, 0x3900, v1.l, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s4 +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: select_v2f16_imm_d: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s19, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s22, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s23, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s16, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s17, s5 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s20, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s7 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[16:19], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v2, off, s[20:23], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v1, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x3800, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_lt_f16_e32 vcc_lo, v4, v3 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x3900, v5, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b, @@ -1400,26 +1775,45 @@ define <4 x half> @v_vselect_v4f16(<4 x half> %a, <4 x half> %b, <4 x i32> %cond ; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_vselect_v4f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v3 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX11-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_vselect_v4f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 0, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v7.l, v5.l, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v9.l, v8.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v3.l, v1.l, s2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_vselect_v4f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v3 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v5 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v11, v10, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v6 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v5, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v7, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <4 x i32> %cond, zeroinitializer %select = select <4 x i1> %cmp, <4 x half> %a, <4 x half> %b ret <4 x half> %select @@ -1593,41 +1987,70 @@ define <8 x half> @v_vselect_v8f16(<8 x half> %a, <8 x half> %b, <8 x i32> %cond ; VI-NEXT: v_or_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_vselect_v8f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v7 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v15 -; GFX11-NEXT: v_lshrrev_b32_e32 v18, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v19, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v20, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v21, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc_lo -; GFX11-NEXT: v_lshrrev_b32_e32 v16, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v17, 16, v6 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v19, v18, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v21, v20, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10 -; GFX11-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v14 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo -; GFX11-NEXT: v_perm_b32 v3, v15, v3, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_vselect_v8f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v10 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 0, v12 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v9 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 0, v11 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v13 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v12, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v13, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v15, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v4 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v9.l, v8.l, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v11.l, v10.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v13.l, v12.l, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v16.l, v15.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v4.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v5.l, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v6.l, v2.l, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v7.l, v3.l, s6 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_vselect_v8f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v7 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v15 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v18, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v19, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v20, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v21, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v16, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v17, 16, v6 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v13 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v11 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v19, v18, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v9 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v21, v20, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v8 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v10 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v9, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v14 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v15, v3, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <8 x i32> %cond, zeroinitializer %select = select <8 x i1> %cmp, <8 x half> %a, <8 x half> %b ret <8 x half> %select @@ -1971,72 +2394,128 @@ define <16 x half> @v_vselect_v16f16(<16 x half> %a, <16 x half> %b, <16 x i32> ; VI-NEXT: v_or_b32_sdwa v7, v7, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_vselect_v16f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: scratch_load_b32 v31, off, s32 -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v38, 16, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v48, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v50, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v52, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v35, 16, v14 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v54, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v37, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v55, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v39, 16, v12 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v53, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v51, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v49, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v33, 16, v15 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v22 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v20 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v35, v34, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v37, v36, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v6, v8, v6, 0x5040100 -; GFX11-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v39, v38, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v19 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v53, v52, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v17 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v4, v10, v4, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v55, v54, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v21 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v51, v50, vcc_lo -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v23 -; GFX11-NEXT: v_cndmask_b32_e32 v14, v49, v48, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 -; GFX11-NEXT: v_perm_b32 v3, v14, v3, 0x5040100 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v33, v32, vcc_lo -; GFX11-NEXT: v_perm_b32 v0, v12, v0, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v7, v11, v7, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_vselect_v16f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 0, v24 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v18 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 0, v20 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v22 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v26 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v28 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v30 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 0, v17 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 0, v19 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 0, v21 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 0, v23 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 0, v25 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 0, v27 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 0, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v1 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v12.l, v4.l, s3 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v19.l, v18.l, s13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v21.l, v20.l, s12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v23.l, v22.l, s11 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v25.l, v24.l, s10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v27.l, v26.l, s9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v29.l, v28.l, s8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v32.l, v30.l, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v15.l, v7.l, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v14.l, v6.l, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v13.l, v5.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v8.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v9.l, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v10.l, v2.l, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v11.l, v3.l, s2 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 0, v31 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v17.l, v16.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_vselect_v16f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v34, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v36, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v38, 16, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v48, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v50, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v52, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v35, 16, v14 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v14, v6, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v54, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v37, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v55, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v39, 16, v12 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v13, v5, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v53, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v51, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v49, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v33, 16, v15 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v12, v4, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v22 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v20 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v18 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v29 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v35, v34, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v27 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v37, v36, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v25 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v8, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v9, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v39, v38, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v19 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v53, v52, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v17 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v10, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v11, v1, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v55, v54, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v21 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v51, v50, vcc_lo +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v23 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v49, v48, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v13, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v14, v3, 0x5040100 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v33, v32, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v12, v0, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v11, v7, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <16 x i32> %cond, zeroinitializer %select = select <16 x i1> %cmp, <16 x half> %a, <16 x half> %b ret <16 x half> %select @@ -2903,196 +3382,364 @@ define <32 x half> @v_vselect_v32f16(<32 x half> %a, <32 x half> %b, <32 x i32> ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_vselect_v32f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_clause 0x1f -; GFX11-NEXT: scratch_load_b32 v31, off, s32 offset:120 -; GFX11-NEXT: scratch_load_b32 v32, off, s32 offset:112 -; GFX11-NEXT: scratch_load_b32 v33, off, s32 -; GFX11-NEXT: scratch_load_b32 v34, off, s32 offset:104 -; GFX11-NEXT: scratch_load_b32 v35, off, s32 offset:96 -; GFX11-NEXT: scratch_load_b32 v36, off, s32 offset:88 -; GFX11-NEXT: scratch_load_b32 v37, off, s32 offset:80 -; GFX11-NEXT: scratch_load_b32 v38, off, s32 offset:72 -; GFX11-NEXT: scratch_load_b32 v39, off, s32 offset:64 -; GFX11-NEXT: scratch_load_b32 v48, off, s32 offset:56 -; GFX11-NEXT: scratch_load_b32 v49, off, s32 offset:48 -; GFX11-NEXT: scratch_load_b32 v50, off, s32 offset:40 -; GFX11-NEXT: scratch_load_b32 v51, off, s32 offset:32 -; GFX11-NEXT: scratch_load_b32 v52, off, s32 offset:24 -; GFX11-NEXT: scratch_load_b32 v53, off, s32 offset:16 -; GFX11-NEXT: scratch_load_b32 v54, off, s32 offset:8 -; GFX11-NEXT: scratch_load_b32 v55, off, s32 offset:124 -; GFX11-NEXT: scratch_load_b32 v64, off, s32 offset:116 -; GFX11-NEXT: scratch_load_b32 v65, off, s32 offset:108 -; GFX11-NEXT: scratch_load_b32 v66, off, s32 offset:100 -; GFX11-NEXT: scratch_load_b32 v67, off, s32 offset:92 -; GFX11-NEXT: scratch_load_b32 v68, off, s32 offset:84 -; GFX11-NEXT: scratch_load_b32 v69, off, s32 offset:76 -; GFX11-NEXT: scratch_load_b32 v70, off, s32 offset:68 -; GFX11-NEXT: scratch_load_b32 v71, off, s32 offset:60 -; GFX11-NEXT: scratch_load_b32 v80, off, s32 offset:52 -; GFX11-NEXT: scratch_load_b32 v81, off, s32 offset:44 -; GFX11-NEXT: scratch_load_b32 v82, off, s32 offset:36 -; GFX11-NEXT: scratch_load_b32 v83, off, s32 offset:28 -; GFX11-NEXT: scratch_load_b32 v84, off, s32 offset:12 -; GFX11-NEXT: scratch_load_b32 v85, off, s32 offset:4 -; GFX11-NEXT: scratch_load_b32 v86, off, s32 offset:20 -; GFX11-NEXT: scratch_load_b32 v87, off, s32 offset:128 -; GFX11-NEXT: v_lshrrev_b32_e32 v97, 16, v14 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v30 -; GFX11-NEXT: v_lshrrev_b32_e32 v99, 16, v13 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v29 -; GFX11-NEXT: v_lshrrev_b32_e32 v101, 16, v12 -; GFX11-NEXT: v_lshrrev_b32_e32 v102, 16, v28 -; GFX11-NEXT: v_lshrrev_b32_e32 v103, 16, v11 -; GFX11-NEXT: v_lshrrev_b32_e32 v112, 16, v27 -; GFX11-NEXT: v_lshrrev_b32_e32 v113, 16, v10 -; GFX11-NEXT: v_lshrrev_b32_e32 v114, 16, v26 -; GFX11-NEXT: v_lshrrev_b32_e32 v115, 16, v9 -; GFX11-NEXT: v_lshrrev_b32_e32 v116, 16, v25 -; GFX11-NEXT: v_lshrrev_b32_e32 v117, 16, v8 -; GFX11-NEXT: v_lshrrev_b32_e32 v118, 16, v24 -; GFX11-NEXT: v_lshrrev_b32_e32 v119, 16, v7 -; GFX11-NEXT: v_lshrrev_b32_e32 v128, 16, v23 -; GFX11-NEXT: v_lshrrev_b32_e32 v129, 16, v6 -; GFX11-NEXT: v_lshrrev_b32_e32 v130, 16, v22 -; GFX11-NEXT: v_lshrrev_b32_e32 v131, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v132, 16, v21 -; GFX11-NEXT: v_lshrrev_b32_e32 v133, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v134, 16, v20 -; GFX11-NEXT: v_lshrrev_b32_e32 v135, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v144, 16, v19 -; GFX11-NEXT: v_lshrrev_b32_e32 v145, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v146, 16, v18 -; GFX11-NEXT: v_lshrrev_b32_e32 v147, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v96, 16, v15 -; GFX11-NEXT: s_waitcnt vmcnt(32) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 -; GFX11-NEXT: v_lshrrev_b32_e32 v31, 16, v17 -; GFX11-NEXT: v_cndmask_b32_e32 v97, v98, v97, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(31) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v32 -; GFX11-NEXT: v_lshrrev_b32_e32 v98, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v32, 16, v16 -; GFX11-NEXT: v_cndmask_b32_e32 v99, v100, v99, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(29) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v34 -; GFX11-NEXT: v_lshrrev_b32_e32 v100, 16, v33 -; GFX11-NEXT: v_cndmask_b32_e32 v34, v102, v101, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(28) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v35 -; GFX11-NEXT: v_cndmask_b32_e32 v35, v112, v103, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(27) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v36 -; GFX11-NEXT: v_cndmask_b32_e32 v36, v114, v113, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(26) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v37 -; GFX11-NEXT: v_cndmask_b32_e32 v37, v116, v115, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(25) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v38 -; GFX11-NEXT: v_cndmask_b32_e32 v38, v118, v117, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(24) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v39 -; GFX11-NEXT: v_cndmask_b32_e32 v39, v128, v119, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(23) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v48 -; GFX11-NEXT: v_cndmask_b32_e32 v48, v130, v129, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(22) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v49 -; GFX11-NEXT: v_cndmask_b32_e32 v49, v132, v131, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(21) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v50 -; GFX11-NEXT: v_cndmask_b32_e32 v50, v134, v133, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(20) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v51 -; GFX11-NEXT: v_cndmask_b32_e32 v51, v144, v135, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(19) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v52 -; GFX11-NEXT: v_cndmask_b32_e32 v52, v146, v145, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(18) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v53 -; GFX11-NEXT: v_cndmask_b32_e32 v31, v31, v147, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(17) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v54 -; GFX11-NEXT: v_cndmask_b32_e32 v32, v32, v98, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(16) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55 -; GFX11-NEXT: v_cndmask_b32_e32 v15, v33, v15, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(15) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v64 -; GFX11-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(14) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v65 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v14, v97, v14, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(13) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66 -; GFX11-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(12) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(11) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v68 -; GFX11-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v69 -; GFX11-NEXT: v_perm_b32 v13, v99, v13, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(9) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v70 -; GFX11-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71 -; GFX11-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v80 -; GFX11-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v81 -; GFX11-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v82 -; GFX11-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83 -; GFX11-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v4, v50, v4, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v84 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86 -; GFX11-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_perm_b32 v0, v32, v0, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v87 -; GFX11-NEXT: v_perm_b32 v3, v51, v3, 0x5040100 -; GFX11-NEXT: v_perm_b32 v2, v52, v2, 0x5040100 -; GFX11-NEXT: v_cndmask_b32_e32 v16, v100, v96, vcc_lo -; GFX11-NEXT: v_perm_b32 v1, v31, v1, 0x5040100 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_vselect_v32f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_clause 0x1f +; GFX11-TRUE16-NEXT: scratch_load_b32 v31, off, s32 offset:4 +; GFX11-TRUE16-NEXT: scratch_load_b32 v32, off, s32 offset:12 +; GFX11-TRUE16-NEXT: scratch_load_b32 v33, off, s32 offset:20 +; GFX11-TRUE16-NEXT: scratch_load_b32 v34, off, s32 offset:28 +; GFX11-TRUE16-NEXT: scratch_load_b32 v35, off, s32 offset:36 +; GFX11-TRUE16-NEXT: scratch_load_b32 v36, off, s32 offset:44 +; GFX11-TRUE16-NEXT: scratch_load_b32 v37, off, s32 offset:52 +; GFX11-TRUE16-NEXT: scratch_load_b32 v38, off, s32 offset:60 +; GFX11-TRUE16-NEXT: scratch_load_b32 v39, off, s32 offset:68 +; GFX11-TRUE16-NEXT: scratch_load_b32 v48, off, s32 offset:76 +; GFX11-TRUE16-NEXT: scratch_load_b32 v49, off, s32 offset:84 +; GFX11-TRUE16-NEXT: scratch_load_b32 v50, off, s32 offset:92 +; GFX11-TRUE16-NEXT: scratch_load_b32 v51, off, s32 offset:100 +; GFX11-TRUE16-NEXT: scratch_load_b32 v52, off, s32 offset:108 +; GFX11-TRUE16-NEXT: scratch_load_b32 v53, off, s32 offset:116 +; GFX11-TRUE16-NEXT: scratch_load_b32 v54, off, s32 offset:124 +; GFX11-TRUE16-NEXT: scratch_load_b32 v55, off, s32 offset:8 +; GFX11-TRUE16-NEXT: scratch_load_b32 v64, off, s32 offset:16 +; GFX11-TRUE16-NEXT: scratch_load_b32 v65, off, s32 offset:24 +; GFX11-TRUE16-NEXT: scratch_load_b32 v66, off, s32 offset:32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v67, off, s32 offset:40 +; GFX11-TRUE16-NEXT: scratch_load_b32 v68, off, s32 offset:48 +; GFX11-TRUE16-NEXT: scratch_load_b32 v69, off, s32 offset:56 +; GFX11-TRUE16-NEXT: scratch_load_b32 v70, off, s32 offset:64 +; GFX11-TRUE16-NEXT: scratch_load_b32 v71, off, s32 offset:72 +; GFX11-TRUE16-NEXT: scratch_load_b32 v80, off, s32 offset:80 +; GFX11-TRUE16-NEXT: scratch_load_b32 v81, off, s32 offset:88 +; GFX11-TRUE16-NEXT: scratch_load_b32 v82, off, s32 offset:128 +; GFX11-TRUE16-NEXT: scratch_load_b32 v83, off, s32 +; GFX11-TRUE16-NEXT: scratch_load_b32 v84, off, s32 offset:120 +; GFX11-TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:112 +; GFX11-TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:104 +; GFX11-TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:96 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v96, 16, v15 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v98, 16, v30 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v99, 16, v13 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v100, 16, v29 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v101, 16, v12 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v102, 16, v28 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v103, 16, v11 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v112, 16, v27 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v113, 16, v10 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v114, 16, v26 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v115, 16, v9 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v116, 16, v25 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v117, 16, v8 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v118, 16, v24 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v119, 16, v7 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v128, 16, v23 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v129, 16, v6 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v130, 16, v22 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v131, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v132, 16, v21 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v133, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v134, 16, v20 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v135, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v144, 16, v19 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v145, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v146, 16, v18 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v147, 16, v1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v32 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(30) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s1, 0, v33 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s2, 0, v34 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s3, 0, v35 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s4, 0, v36 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s5, 0, v37 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s6, 0, v38 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s7, 0, v39 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s8, 0, v48 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s9, 0, v49 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s10, 0, v50 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s11, 0, v51 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s12, 0, v52 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 0, v53 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s14, 0, v54 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 0, v55 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s16, 0, v64 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 0, v65 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s18, 0, v66 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 0, v67 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s20, 0, v68 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 0, v69 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s22, 0, v70 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s23, 0, v71 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s24, 0, v80 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s25, 0, v81 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 0, v82 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v34, 16, v83 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s27, 0, v84 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s28, 0, v85 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 0, v86 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s40, 0, v87 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.h, v34.l, v96.l, s26 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.h, v98.l, v97.l, s27 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.h, v100.l, v99.l, s28 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.h, v102.l, v101.l, s29 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.h, v112.l, v103.l, s40 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.h, v114.l, v113.l, s25 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.h, v116.l, v115.l, s24 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.h, v118.l, v117.l, s23 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.h, v128.l, v119.l, s22 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.h, v130.l, v129.l, s21 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.h, v132.l, v131.l, s20 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.h, v134.l, v133.l, s19 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.h, v144.l, v135.l, s18 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.h, v146.l, v145.l, s17 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.h, v31.l, v147.l, s16 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, v33.l, v32.l, s15 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v15.l, v83.l, v15.l, s14 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v14.l, v30.l, v14.l, s13 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v13.l, v29.l, v13.l, s12 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v12.l, v28.l, v12.l, s11 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v11.l, v27.l, v11.l, s10 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v10.l, v26.l, v10.l, s9 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v9.l, v25.l, v9.l, s8 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v8.l, v24.l, v8.l, s7 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v7.l, v23.l, v7.l, s6 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v6.l, v22.l, v6.l, s5 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v5.l, v21.l, v5.l, s4 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, v16.l, v0.l, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b16 v1.l, v17.l, v1.l, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v2.l, v18.l, v2.l, s1 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v3.l, v19.l, v3.l, s2 +; GFX11-TRUE16-NEXT: v_cndmask_b16 v4.l, v20.l, v4.l, s3 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_vselect_v32f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_clause 0x1f +; GFX11-FAKE16-NEXT: scratch_load_b32 v31, off, s32 offset:120 +; GFX11-FAKE16-NEXT: scratch_load_b32 v32, off, s32 offset:112 +; GFX11-FAKE16-NEXT: scratch_load_b32 v33, off, s32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v34, off, s32 offset:104 +; GFX11-FAKE16-NEXT: scratch_load_b32 v35, off, s32 offset:96 +; GFX11-FAKE16-NEXT: scratch_load_b32 v36, off, s32 offset:88 +; GFX11-FAKE16-NEXT: scratch_load_b32 v37, off, s32 offset:80 +; GFX11-FAKE16-NEXT: scratch_load_b32 v38, off, s32 offset:72 +; GFX11-FAKE16-NEXT: scratch_load_b32 v39, off, s32 offset:64 +; GFX11-FAKE16-NEXT: scratch_load_b32 v48, off, s32 offset:56 +; GFX11-FAKE16-NEXT: scratch_load_b32 v49, off, s32 offset:48 +; GFX11-FAKE16-NEXT: scratch_load_b32 v50, off, s32 offset:40 +; GFX11-FAKE16-NEXT: scratch_load_b32 v51, off, s32 offset:32 +; GFX11-FAKE16-NEXT: scratch_load_b32 v52, off, s32 offset:24 +; GFX11-FAKE16-NEXT: scratch_load_b32 v53, off, s32 offset:16 +; GFX11-FAKE16-NEXT: scratch_load_b32 v54, off, s32 offset:8 +; GFX11-FAKE16-NEXT: scratch_load_b32 v55, off, s32 offset:124 +; GFX11-FAKE16-NEXT: scratch_load_b32 v64, off, s32 offset:116 +; GFX11-FAKE16-NEXT: scratch_load_b32 v65, off, s32 offset:108 +; GFX11-FAKE16-NEXT: scratch_load_b32 v66, off, s32 offset:100 +; GFX11-FAKE16-NEXT: scratch_load_b32 v67, off, s32 offset:92 +; GFX11-FAKE16-NEXT: scratch_load_b32 v68, off, s32 offset:84 +; GFX11-FAKE16-NEXT: scratch_load_b32 v69, off, s32 offset:76 +; GFX11-FAKE16-NEXT: scratch_load_b32 v70, off, s32 offset:68 +; GFX11-FAKE16-NEXT: scratch_load_b32 v71, off, s32 offset:60 +; GFX11-FAKE16-NEXT: scratch_load_b32 v80, off, s32 offset:52 +; GFX11-FAKE16-NEXT: scratch_load_b32 v81, off, s32 offset:44 +; GFX11-FAKE16-NEXT: scratch_load_b32 v82, off, s32 offset:36 +; GFX11-FAKE16-NEXT: scratch_load_b32 v83, off, s32 offset:28 +; GFX11-FAKE16-NEXT: scratch_load_b32 v84, off, s32 offset:12 +; GFX11-FAKE16-NEXT: scratch_load_b32 v85, off, s32 offset:4 +; GFX11-FAKE16-NEXT: scratch_load_b32 v86, off, s32 offset:20 +; GFX11-FAKE16-NEXT: scratch_load_b32 v87, off, s32 offset:128 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v97, 16, v14 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v30 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v99, 16, v13 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v29 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v101, 16, v12 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v102, 16, v28 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v103, 16, v11 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v112, 16, v27 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v113, 16, v10 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v114, 16, v26 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v115, 16, v9 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v116, 16, v25 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v117, 16, v8 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v118, 16, v24 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v119, 16, v7 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v128, 16, v23 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v129, 16, v6 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v130, 16, v22 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v131, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v132, 16, v21 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v133, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v134, 16, v20 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v135, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v144, 16, v19 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v145, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v146, 16, v18 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v147, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v96, 16, v15 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(32) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v31 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v31, 16, v17 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v97, v98, v97, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(31) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v32 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v98, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v32, 16, v16 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v99, v100, v99, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(29) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v34 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v100, 16, v33 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v34, v102, v101, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(28) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v35 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v35, v112, v103, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(27) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v36 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v36, v114, v113, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(26) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v37 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v37, v116, v115, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(25) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v38 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v38, v118, v117, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(24) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v39 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v39, v128, v119, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(23) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v48 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v48, v130, v129, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(22) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v49 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v49, v132, v131, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(21) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v50 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v50, v134, v133, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(20) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v51 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v51, v144, v135, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(19) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v52 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v52, v146, v145, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(18) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v53 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v31, v31, v147, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(17) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v54 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v32, v32, v98, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(16) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v55 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v15, v33, v15, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(15) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v64 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v14, v30, v14, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(14) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v65 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v14, v97, v14, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v13, v29, v13, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(13) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v66 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v12, v28, v12, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(12) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v67 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v12, v34, v12, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v11, v27, v11, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(11) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v68 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v10, v26, v10, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(10) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v69 +; GFX11-FAKE16-NEXT: v_perm_b32 v13, v99, v13, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v10, v36, v10, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v9, v25, v9, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(9) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v70 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v8, v24, v8, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(8) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v71 +; GFX11-FAKE16-NEXT: v_perm_b32 v11, v35, v11, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v8, v38, v8, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v7, v23, v7, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(7) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v80 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v6, v22, v6, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(6) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v81 +; GFX11-FAKE16-NEXT: v_perm_b32 v9, v37, v9, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v6, v48, v6, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(5) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v82 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(4) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v83 +; GFX11-FAKE16-NEXT: v_perm_b32 v7, v39, v7, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v4, v50, v4, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(3) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v84 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(2) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v85 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v86 +; GFX11-FAKE16-NEXT: v_perm_b32 v5, v49, v5, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v32, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v87 +; GFX11-FAKE16-NEXT: v_perm_b32 v3, v51, v3, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v2, v52, v2, 0x5040100 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v16, v100, v96, vcc_lo +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v31, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <32 x i32> %cond, zeroinitializer %select = select <32 x i1> %cmp, <32 x half> %a, <32 x half> %b ret <32 x half> %select diff --git a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll index f3be926f76bef..ce31f2a74d16a 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fadd.f16.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-FAKE16 %s ; FIXME: promotion not handled without f16 insts define half @v_constained_fadd_f16_fpexcept_strict(half %x, half %y) #0 { @@ -12,11 +13,23 @@ define half @v_constained_fadd_f16_fpexcept_strict(half %x, half %y) #0 { ; GCN-NEXT: v_add_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_strict: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constained_fadd_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_constained_fadd_f16_fpexcept_strict: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constained_fadd_f16_fpexcept_strict: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -28,11 +41,23 @@ define half @v_constained_fadd_f16_fpexcept_ignore(half %x, half %y) #0 { ; GCN-NEXT: v_add_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_ignore: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constained_fadd_f16_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_constained_fadd_f16_fpexcept_ignore: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constained_fadd_f16_fpexcept_ignore: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret half %val } @@ -44,11 +69,23 @@ define half @v_constained_fadd_f16_fpexcept_maytrap(half %x, half %y) #0 { ; GCN-NEXT: v_add_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_constained_fadd_f16_fpexcept_maytrap: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constained_fadd_f16_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_constained_fadd_f16_fpexcept_maytrap: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constained_fadd_f16_fpexcept_maytrap: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret half %val } @@ -142,12 +179,26 @@ define <3 x half> @v_constained_fadd_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX8-NEXT: v_add_f16_e32 v1, v1, v3 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_constained_fadd_v3f16_fpexcept_strict: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_pk_add_f16 v0, v0, v2 -; GFX10PLUS-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constained_fadd_v3f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX10-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-TRUE16-LABEL: v_constained_fadd_v3f16_fpexcept_strict: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constained_fadd_v3f16_fpexcept_strict: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_add_f16 v0, v0, v2 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fadd.v3f16(<3 x half> %x, <3 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val } @@ -188,20 +239,33 @@ define <4 x half> @v_constained_fadd_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX10-NEXT: v_perm_b32 v1, v4, v1, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_constained_fadd_v4f16_fpexcept_strict: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-NEXT: v_add_f16_e32 v1, v1, v3 -; GFX11-NEXT: v_add_f16_e32 v0, v0, v2 -; GFX11-NEXT: v_add_f16_e32 v2, v6, v5 -; GFX11-NEXT: v_add_f16_e32 v3, v7, v4 -; GFX11-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_constained_fadd_v4f16_fpexcept_strict: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.h, v6.l, v5.l +; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.h, v7.l, v4.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constained_fadd_v4f16_fpexcept_strict: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, v1, v3 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v2, v6, v5 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v3, v7, v4 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fadd.v4f16(<4 x half> %x, <4 x half> %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val } @@ -213,10 +277,20 @@ define amdgpu_ps half @s_constained_fadd_f16_fpexcept_strict(half inreg %x, half ; GCN-NEXT: v_add_f16_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_constained_fadd_f16_fpexcept_strict: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_f16_e64 v0, s2, s3 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_constained_fadd_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_f16_e64 v0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: s_constained_fadd_f16_fpexcept_strict: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s2, s3 +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-FAKE16-LABEL: s_constained_fadd_f16_fpexcept_strict: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s2, s3 +; GFX11-FAKE16-NEXT: ; return to shader part epilog %val = call half @llvm.experimental.constrained.fadd.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -255,3 +329,5 @@ declare <4 x half> @llvm.experimental.constrained.fadd.v4f16(<4 x half>, <4 x ha attributes #0 = { strictfp } attributes #1 = { inaccessiblememonly nounwind willreturn } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll index 407bb002483ec..ccd21b74f49bd 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fma.f16.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8 %s ; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define half @v_constained_fma_f16_fpexcept_strict(half %x, half %y, half %z) #0 { ; GCN-LABEL: v_constained_fma_f16_fpexcept_strict: @@ -17,11 +18,17 @@ define half @v_constained_fma_f16_fpexcept_strict(half %x, half %y, half %z) #0 ; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_constained_fma_f16_fpexcept_strict: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constained_fma_f16_fpexcept_strict: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -88,12 +95,20 @@ define <3 x half> @v_constained_fma_v3f16_fpexcept_strict(<3 x half> %x, <3 x ha ; GFX10-NEXT: v_fma_f16 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_constained_fma_v3f16_fpexcept_strict: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_fma_f16 v0, v0, v2, v4 -; GFX11-NEXT: v_fma_f16 v1, v1, v3, v5 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_constained_fma_v3f16_fpexcept_strict: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v5.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constained_fma_v3f16_fpexcept_strict: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_pk_fma_f16 v0, v0, v2, v4 +; GFX11-FAKE16-NEXT: v_fma_f16 v1, v1, v3, v5 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call <3 x half> @llvm.experimental.constrained.fma.v3f16(<3 x half> %x, <3 x half> %y, <3 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <3 x half> %val } @@ -153,22 +168,40 @@ define <4 x half> @v_constained_fma_v4f16_fpexcept_strict(<4 x half> %x, <4 x ha ; GFX10-NEXT: v_perm_b32 v0, v9, v4, 0x5040100 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_constained_fma_v4f16_fpexcept_strict: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v5 -; GFX11-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-NEXT: v_lshrrev_b32_e32 v8, 16, v1 -; GFX11-NEXT: v_lshrrev_b32_e32 v9, 16, v4 -; GFX11-NEXT: v_lshrrev_b32_e32 v10, 16, v2 -; GFX11-NEXT: v_lshrrev_b32_e32 v11, 16, v0 -; GFX11-NEXT: v_fmac_f16_e32 v4, v0, v2 -; GFX11-NEXT: v_fmac_f16_e32 v6, v8, v7 -; GFX11-NEXT: v_fmac_f16_e32 v5, v1, v3 -; GFX11-NEXT: v_fmac_f16_e32 v9, v11, v10 -; GFX11-NEXT: v_perm_b32 v1, v6, v5, 0x5040100 -; GFX11-NEXT: v_perm_b32 v0, v9, v4, 0x5040100 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_constained_fma_v4f16_fpexcept_strict: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v10, 16, v0 +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v11, 16, v1 +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v5.l, v1.l, v3.l +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v4.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v8.l, v10.l, v9.l +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v6.l, v11.l, v7.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v4.h, v8.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v5.h, v6.l +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constained_fma_v4f16_fpexcept_strict: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v5 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v1 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v4 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v2 +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX11-FAKE16-NEXT: v_fmac_f16_e32 v4, v0, v2 +; GFX11-FAKE16-NEXT: v_fmac_f16_e32 v6, v8, v7 +; GFX11-FAKE16-NEXT: v_fmac_f16_e32 v5, v1, v3 +; GFX11-FAKE16-NEXT: v_fmac_f16_e32 v9, v11, v10 +; GFX11-FAKE16-NEXT: v_perm_b32 v1, v6, v5, 0x5040100 +; GFX11-FAKE16-NEXT: v_perm_b32 v0, v9, v4, 0x5040100 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call <4 x half> @llvm.experimental.constrained.fma.v4f16(<4 x half> %x, <4 x half> %y, <4 x half> %z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret <4 x half> %val } @@ -186,11 +219,17 @@ define half @v_constained_fma_f16_fpexcept_strict_fneg(half %x, half %y, half %z ; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fneg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, v1, -v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_constained_fma_f16_fpexcept_strict_fneg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constained_fma_f16_fpexcept_strict_fneg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.z = fneg half %z %val = call half @llvm.experimental.constrained.fma.f16(half %x, half %y, half %neg.z, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val @@ -209,11 +248,17 @@ define half @v_constained_fma_f16_fpexcept_strict_fneg_fneg(half %x, half %y, ha ; GFX10-NEXT: v_fma_f16 v0, -v0, -v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, -v0, -v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, -v1.l, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constained_fma_f16_fpexcept_strict_fneg_fneg: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, -v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg half %x %neg.y = fneg half %y %val = call half @llvm.experimental.constrained.fma.f16(half %neg.x, half %neg.y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -233,11 +278,17 @@ define half @v_constained_fma_f16_fpexcept_strict_fabs_fabs(half %x, half %y, ha ; GFX10-NEXT: v_fma_f16 v0, |v0|, |v1|, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, |v0|, |v1|, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, |v0.l|, |v1.l|, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_constained_fma_f16_fpexcept_strict_fabs_fabs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, |v0|, |v1|, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.x = call half @llvm.fabs.f16(half %x) #0 %neg.y = call half @llvm.fabs.f16(half %y) #0 %val = call half @llvm.experimental.constrained.fma.f16(half %neg.x, half %neg.y, half %z, metadata !"round.tonearest", metadata !"fpexcept.strict") diff --git a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll index d798166a67839..b2ebae84a961c 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fmul.f16.ll @@ -8,8 +8,10 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10,GFX10-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX1-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX1-GISEL,GFX1-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11,GFX1-GISEL,GFX1-GISEL-FAKE16 %s ; FIXME: promotion not handled without f16 insts @@ -21,11 +23,35 @@ define half @v_constained_fmul_f16_fpexcept_strict(half %x, half %y) #0 { ; GCN-NEXT: v_mul_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_constained_fmul_f16_fpexcept_strict: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constained_fmul_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-TRUE16-LABEL: v_constained_fmul_f16_fpexcept_strict: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fmul_f16_fpexcept_strict: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1-GISEL-TRUE16-LABEL: v_constained_fmul_f16_fpexcept_strict: +; GFX1-GISEL-TRUE16: ; %bb.0: +; GFX1-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1-GISEL-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX1-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1-GISEL-FAKE16-LABEL: v_constained_fmul_f16_fpexcept_strict: +; GFX1-GISEL-FAKE16: ; %bb.0: +; GFX1-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1-GISEL-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -37,11 +63,35 @@ define half @v_constained_fmul_f16_fpexcept_ignore(half %x, half %y) #0 { ; GCN-NEXT: v_mul_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_constained_fmul_f16_fpexcept_ignore: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constained_fmul_f16_fpexcept_ignore: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-TRUE16-LABEL: v_constained_fmul_f16_fpexcept_ignore: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fmul_f16_fpexcept_ignore: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1-GISEL-TRUE16-LABEL: v_constained_fmul_f16_fpexcept_ignore: +; GFX1-GISEL-TRUE16: ; %bb.0: +; GFX1-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1-GISEL-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX1-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1-GISEL-FAKE16-LABEL: v_constained_fmul_f16_fpexcept_ignore: +; GFX1-GISEL-FAKE16: ; %bb.0: +; GFX1-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1-GISEL-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret half %val } @@ -53,11 +103,35 @@ define half @v_constained_fmul_f16_fpexcept_maytrap(half %x, half %y) #0 { ; GCN-NEXT: v_mul_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_constained_fmul_f16_fpexcept_maytrap: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: v_constained_fmul_f16_fpexcept_maytrap: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX10-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-TRUE16-LABEL: v_constained_fmul_f16_fpexcept_maytrap: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fmul_f16_fpexcept_maytrap: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1-GISEL-TRUE16-LABEL: v_constained_fmul_f16_fpexcept_maytrap: +; GFX1-GISEL-TRUE16: ; %bb.0: +; GFX1-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1-GISEL-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX1-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX1-GISEL-FAKE16-LABEL: v_constained_fmul_f16_fpexcept_maytrap: +; GFX1-GISEL-FAKE16: ; %bb.0: +; GFX1-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX1-GISEL-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX1-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret half %val } @@ -205,12 +279,19 @@ define <3 x half> @v_constained_fmul_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX10-GISEL-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_constained_fmul_v3f16_fpexcept_strict: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_pk_mul_f16 v0, v0, v2 -; GFX11-SDAG-NEXT: v_mul_f16_e32 v1, v1, v3 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: v_constained_fmul_v3f16_fpexcept_strict: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fmul_v3f16_fpexcept_strict: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_pk_mul_f16 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1-GISEL-LABEL: v_constained_fmul_v3f16_fpexcept_strict: ; GFX1-GISEL: ; %bb.0: @@ -283,20 +364,33 @@ define <4 x half> @v_constained_fmul_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX10-GISEL-NEXT: v_pk_mul_f16 v1, v1, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: v_constained_fmul_v4f16_fpexcept_strict: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v3 -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v5, 16, v2 -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v0 -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v1 -; GFX11-SDAG-NEXT: v_mul_f16_e32 v1, v1, v3 -; GFX11-SDAG-NEXT: v_mul_f16_e32 v0, v0, v2 -; GFX11-SDAG-NEXT: v_mul_f16_e32 v2, v6, v5 -; GFX11-SDAG-NEXT: v_mul_f16_e32 v3, v7, v4 -; GFX11-SDAG-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SDAG-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: v_constained_fmul_v4f16_fpexcept_strict: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v0.h, v6.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.h, v7.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fmul_v4f16_fpexcept_strict: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v1, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, v6, v5 +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v3, v7, v4 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX1-GISEL-LABEL: v_constained_fmul_v4f16_fpexcept_strict: ; GFX1-GISEL: ; %bb.0: @@ -315,10 +409,30 @@ define amdgpu_ps half @s_constained_fmul_f16_fpexcept_strict(half inreg %x, half ; GCN-NEXT: v_mul_f16_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_constained_fmul_f16_fpexcept_strict: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_mul_f16_e64 v0, s2, s3 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_constained_fmul_f16_fpexcept_strict: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_mul_f16_e64 v0, s2, s3 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-SDAG-TRUE16-LABEL: s_constained_fmul_f16_fpexcept_strict: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e64 v0.l, s2, s3 +; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-SDAG-FAKE16-LABEL: s_constained_fmul_f16_fpexcept_strict: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e64 v0, s2, s3 +; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX1-GISEL-TRUE16-LABEL: s_constained_fmul_f16_fpexcept_strict: +; GFX1-GISEL-TRUE16: ; %bb.0: +; GFX1-GISEL-TRUE16-NEXT: v_mul_f16_e64 v0.l, s2, s3 +; GFX1-GISEL-TRUE16-NEXT: ; return to shader part epilog +; +; GFX1-GISEL-FAKE16-LABEL: s_constained_fmul_f16_fpexcept_strict: +; GFX1-GISEL-FAKE16: ; %bb.0: +; GFX1-GISEL-FAKE16-NEXT: v_mul_f16_e64 v0, s2, s3 +; GFX1-GISEL-FAKE16-NEXT: ; return to shader part epilog %val = call half @llvm.experimental.constrained.fmul.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -370,6 +484,6 @@ declare <4 x half> @llvm.experimental.constrained.fmul.v4f16(<4 x half>, <4 x ha attributes #0 = { strictfp } attributes #1 = { inaccessiblememonly nounwind willreturn } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX10: {{.*}} ; GFX11: {{.*}} +; GFX11-SDAG: {{.*}} ; GFX8: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll index 3420596da2aac..d6c5c937fd83e 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_fsub.f16.ll @@ -5,11 +5,13 @@ ; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s ; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-GISEL %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-SDAG %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10PLUS-GISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11-SDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11-SDAG-FAKE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11-GISEL,GFX11-GISEL-FAKE16 %s ; FIXME: promotion not handled without f16 insts @@ -20,11 +22,41 @@ define half @v_constained_fsub_f16_fpexcept_strict(half %x, half %y) #0 { ; GCN-NEXT: v_sub_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_strict: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_constained_fsub_f16_fpexcept_strict: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_constained_fsub_f16_fpexcept_strict: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-TRUE16-LABEL: v_constained_fsub_f16_fpexcept_strict: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fsub_f16_fpexcept_strict: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: v_constained_fsub_f16_fpexcept_strict: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: v_constained_fsub_f16_fpexcept_strict: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -36,11 +68,41 @@ define half @v_constained_fsub_f16_fpexcept_ignore(half %x, half %y) #0 { ; GCN-NEXT: v_sub_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_ignore: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_constained_fsub_f16_fpexcept_ignore: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_constained_fsub_f16_fpexcept_ignore: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-TRUE16-LABEL: v_constained_fsub_f16_fpexcept_ignore: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fsub_f16_fpexcept_ignore: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: v_constained_fsub_f16_fpexcept_ignore: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: v_constained_fsub_f16_fpexcept_ignore: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.ignore") ret half %val } @@ -52,11 +114,41 @@ define half @v_constained_fsub_f16_fpexcept_maytrap(half %x, half %y) #0 { ; GCN-NEXT: v_sub_f16_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] ; -; GFX10PLUS-LABEL: v_constained_fsub_f16_fpexcept_maytrap: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10PLUS-NEXT: v_sub_f16_e32 v0, v0, v1 -; GFX10PLUS-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: v_constained_fsub_f16_fpexcept_maytrap: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: v_constained_fsub_f16_fpexcept_maytrap: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-TRUE16-LABEL: v_constained_fsub_f16_fpexcept_maytrap: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fsub_f16_fpexcept_maytrap: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: v_constained_fsub_f16_fpexcept_maytrap: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: v_constained_fsub_f16_fpexcept_maytrap: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.maytrap") ret half %val } @@ -108,6 +200,30 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x h ; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-SDAG-TRUE16-LABEL: v_constained_fsub_v2f16_fpexcept_strict: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.h, v3.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fsub_v2f16_fpexcept_strict: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v2, v3, v2 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_strict: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_strict: ; GFX10PLUS-SDAG: ; %bb.0: ; GFX10PLUS-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -117,7 +233,6 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_strict(<2 x half> %x, <2 x h ; GFX10PLUS-SDAG-NEXT: v_sub_f16_e32 v2, v3, v2 ; GFX10PLUS-SDAG-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX10PLUS-SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_strict: ; GFX10PLUS-GISEL: ; %bb.0: ; GFX10PLUS-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -174,6 +289,30 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h ; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-SDAG-TRUE16-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.h, v3.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v2, v3, v2 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: ; GFX10PLUS-SDAG: ; %bb.0: ; GFX10PLUS-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -183,7 +322,6 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_ignore(<2 x half> %x, <2 x h ; GFX10PLUS-SDAG-NEXT: v_sub_f16_e32 v2, v3, v2 ; GFX10PLUS-SDAG-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX10PLUS-SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_ignore: ; GFX10PLUS-GISEL: ; %bb.0: ; GFX10PLUS-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -240,6 +378,30 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x ; GFX10-GISEL-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-SDAG-TRUE16-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.h, v3.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v2, v3, v2 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, v0, v1 neg_lo:[0,1] neg_hi:[0,1] +; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: ; GFX10PLUS-SDAG: ; %bb.0: ; GFX10PLUS-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -249,7 +411,6 @@ define <2 x half> @v_constained_fsub_v2f16_fpexcept_maytrap(<2 x half> %x, <2 x ; GFX10PLUS-SDAG-NEXT: v_sub_f16_e32 v2, v3, v2 ; GFX10PLUS-SDAG-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX10PLUS-SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v2f16_fpexcept_maytrap: ; GFX10PLUS-GISEL: ; %bb.0: ; GFX10PLUS-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -316,6 +477,46 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX10-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v2 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-SDAG-TRUE16-LABEL: v_constained_fsub_v3f16_fpexcept_strict: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.h, v5.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fsub_v3f16_fpexcept_strict: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v2, v5, v4 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: v_constained_fsub_v3f16_fpexcept_strict: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.h, v0.h, v2.h +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: v_constained_fsub_v3f16_fpexcept_strict: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v2, v4, v5 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v3f16_fpexcept_strict: ; GFX10PLUS-SDAG: ; %bb.0: ; GFX10PLUS-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -326,7 +527,6 @@ define <3 x half> @v_constained_fsub_v3f16_fpexcept_strict(<3 x half> %x, <3 x h ; GFX10PLUS-SDAG-NEXT: v_sub_f16_e32 v2, v5, v4 ; GFX10PLUS-SDAG-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX10PLUS-SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v3f16_fpexcept_strict: ; GFX10PLUS-GISEL: ; %bb.0: ; GFX10PLUS-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -413,6 +613,59 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX10-GISEL-NEXT: v_lshl_or_b32 v1, v1, 16, v3 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; +; GFX11-SDAG-TRUE16-LABEL: v_constained_fsub_v4f16_fpexcept_strict: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.h, v6.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v1.h, v7.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: v_constained_fsub_v4f16_fpexcept_strict: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v2, v6, v5 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e32 v3, v7, v4 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: v_constained_fsub_v4f16_fpexcept_strict: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v0.h, v0.h, v2.h +; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v3.l +; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e32 v1.h, v1.h, v3.h +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: v_constained_fsub_v4f16_fpexcept_strict: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v0 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v0, v0, v2 +; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v1, v1, v3 +; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v2, v4, v6 +; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e32 v3, v5, v7 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v0 +; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v1, v3, 16, v1 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; GFX10PLUS-SDAG-LABEL: v_constained_fsub_v4f16_fpexcept_strict: ; GFX10PLUS-SDAG: ; %bb.0: ; GFX10PLUS-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -427,7 +680,6 @@ define <4 x half> @v_constained_fsub_v4f16_fpexcept_strict(<4 x half> %x, <4 x h ; GFX10PLUS-SDAG-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 ; GFX10PLUS-SDAG-NEXT: v_perm_b32 v1, v3, v1, 0x5040100 ; GFX10PLUS-SDAG-NEXT: s_setpc_b64 s[30:31] -; ; GFX10PLUS-GISEL-LABEL: v_constained_fsub_v4f16_fpexcept_strict: ; GFX10PLUS-GISEL: ; %bb.0: ; GFX10PLUS-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) @@ -455,10 +707,35 @@ define amdgpu_ps half @s_constained_fsub_f16_fpexcept_strict(half inreg %x, half ; GCN-NEXT: v_sub_f16_e32 v0, s2, v0 ; GCN-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_constained_fsub_f16_fpexcept_strict: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_f16_e64 v0, s2, s3 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-SDAG-LABEL: s_constained_fsub_f16_fpexcept_strict: +; GFX10-SDAG: ; %bb.0: +; GFX10-SDAG-NEXT: v_sub_f16_e64 v0, s2, s3 +; GFX10-SDAG-NEXT: ; return to shader part epilog +; +; GFX10-GISEL-LABEL: s_constained_fsub_f16_fpexcept_strict: +; GFX10-GISEL: ; %bb.0: +; GFX10-GISEL-NEXT: v_sub_f16_e64 v0, s2, s3 +; GFX10-GISEL-NEXT: ; return to shader part epilog +; +; GFX11-SDAG-TRUE16-LABEL: s_constained_fsub_f16_fpexcept_strict: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e64 v0.l, s2, s3 +; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-SDAG-FAKE16-LABEL: s_constained_fsub_f16_fpexcept_strict: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e64 v0, s2, s3 +; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-GISEL-TRUE16-LABEL: s_constained_fsub_f16_fpexcept_strict: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: v_sub_f16_e64 v0.l, s2, s3 +; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-GISEL-FAKE16-LABEL: s_constained_fsub_f16_fpexcept_strict: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: v_sub_f16_e64 v0, s2, s3 +; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog %val = call half @llvm.experimental.constrained.fsub.f16(half %x, half %y, metadata !"round.tonearest", metadata !"fpexcept.strict") ret half %val } @@ -523,6 +800,35 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; GFX10-GISEL-NEXT: v_pk_add_f16 v0, s2, s0 ; GFX10-GISEL-NEXT: ; return to shader part epilog ; +; GFX11-SDAG-TRUE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s0, s2, 16 +; GFX11-SDAG-TRUE16-NEXT: s_lshr_b32 s1, s3, 16 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, s0 +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, s1 +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_sub_f16_e32 v1.l, v1.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-SDAG-FAKE16-LABEL: s_constained_fsub_v2f16_fpexcept_strict: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e64 v0, s2, s3 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s0, s3, 16 +; GFX11-SDAG-FAKE16-NEXT: s_lshr_b32 s1, s2, 16 +; GFX11-SDAG-FAKE16-NEXT: v_sub_f16_e64 v1, s1, s0 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-SDAG-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000 +; GFX11-GISEL-NEXT: v_pk_add_f16 v0, s2, s0 +; GFX11-GISEL-NEXT: ; return to shader part epilog ; GFX10PLUS-SDAG-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX10PLUS-SDAG: ; %bb.0: ; GFX10PLUS-SDAG-NEXT: v_sub_f16_e64 v0, s2, s3 @@ -532,7 +838,6 @@ define amdgpu_ps <2 x half> @s_constained_fsub_v2f16_fpexcept_strict(<2 x half> ; GFX10PLUS-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX10PLUS-SDAG-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GFX10PLUS-SDAG-NEXT: ; return to shader part epilog -; ; GFX10PLUS-GISEL-LABEL: s_constained_fsub_v2f16_fpexcept_strict: ; GFX10PLUS-GISEL: ; %bb.0: ; GFX10PLUS-GISEL-NEXT: s_xor_b32 s0, s3, 0x80008000 diff --git a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll index 1f3f17c3e0c46..c46e3a08a6a0c 100644 --- a/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/strict_ldexp.f16.ll @@ -3,12 +3,14 @@ ; XUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-SDAG %s ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-SDAG %s -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s ; XUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,GFX6,GFX6-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,GFX8,GFX8-GISEL %s ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-GISEL %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s ; define half @test_ldexp_f16_i16(ptr addrspace(1) %out, half %a, i16 %b) #0 { ; %result = call half @llvm.experimental.constrained.ldexp.f16.i16(half %a, i16 %b, metadata !"round.dynamic", metadata !"fpexcept.strict") @@ -34,14 +36,23 @@ define half @test_ldexp_f16_i32(ptr addrspace(1) %out, half %a, i32 %b) #0 { ; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v0 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: test_ldexp_f16_i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_med3_i32 v0, v3, s0, 0x7fff -; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v0 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_ldexp_f16_i32: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v0, v3, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v2.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_ldexp_f16_i32: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v0, v3, s0, 0x7fff +; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_f16_i32: ; GFX8-GISEL: ; %bb.0: @@ -61,14 +72,23 @@ define half @test_ldexp_f16_i32(ptr addrspace(1) %out, half %a, i32 %b) #0 { ; GFX9-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: test_ldexp_f16_i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7fff -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v3, v0 -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v2, v0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-GISEL-TRUE16-LABEL: test_ldexp_f16_i32: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v0, 0xffff8000, v3, v0 +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v2.l, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_ldexp_f16_i32: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_med3_i32 v0, 0xffff8000, v3, v0 +; GFX11-GISEL-FAKE16-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call half @llvm.experimental.constrained.ldexp.f16.i32(half %a, i32 %b, metadata !"round.dynamic", metadata !"fpexcept.strict") ret half %result } @@ -104,19 +124,31 @@ define <2 x half> @test_ldexp_v2f16_v2i32(ptr addrspace(1) %out, <2 x half> %a, ; GFX9-SDAG-NEXT: v_perm_b32 v0, v0, v1, s4 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: test_ldexp_v2f16_v2i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_med3_i32 v0, v3, s0, 0x7fff -; GFX11-SDAG-NEXT: v_med3_i32 v1, v4, s0, 0x7fff -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v1, v3, v1 -; GFX11-SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v2f16_v2i32: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v0, v4, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v3, v3, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v2.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v2f16_v2i32: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v0, v3, s0, 0x7fff +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, v4, s0, 0x7fff +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v1, v3, v1 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v2f16_v2i32: ; GFX8-GISEL: ; %bb.0: @@ -142,21 +174,33 @@ define <2 x half> @test_ldexp_v2f16_v2i32(ptr addrspace(1) %out, <2 x half> %a, ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v3 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: test_ldexp_v2f16_v2i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7fff -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v3, v0 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v4, v0 -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v2, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v0, v3, v0 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v0, 16, v1 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v2f16_v2i32: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v1, 0xffff8000, v3, v0 +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v3, 0xffff8000, v4, v0 +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v2.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v2.h, v3.l +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v2f16_v2i32: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-FAKE16-NEXT: v_med3_i32 v1, 0xffff8000, v3, v0 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX11-GISEL-FAKE16-NEXT: v_med3_i32 v0, 0xffff8000, v4, v0 +; GFX11-GISEL-FAKE16-NEXT: v_ldexp_f16_e32 v1, v2, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_ldexp_f16_e32 v0, v3, v0 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call <2 x half> @llvm.experimental.constrained.ldexp.v2f16.v2i32(<2 x half> %a, <2 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <2 x half> %result } @@ -191,22 +235,37 @@ define <3 x half> @test_ldexp_v3f16_v3i32(ptr addrspace(1) %out, <3 x half> %a, ; GFX9-SDAG-NEXT: v_ldexp_f16_e32 v1, v3, v1 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: test_ldexp_v3f16_v3i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_med3_i32 v0, v4, s0, 0x7fff -; GFX11-SDAG-NEXT: v_med3_i32 v1, v5, s0, 0x7fff -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v0 -; GFX11-SDAG-NEXT: v_med3_i32 v2, v6, s0, 0x7fff -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v1, v4, v1 -; GFX11-SDAG-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v1, v3, v2 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v3f16_v3i32: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v0, v5, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v5, v6, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v2.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v3.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v3f16_v3i32: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v0, v4, s0, 0x7fff +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, v5, s0, 0x7fff +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v0, v2, v0 +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v2, v6, s0, 0x7fff +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v1, v4, v1 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v1, v3, v2 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v3f16_v3i32: ; GFX8-GISEL: ; %bb.0: @@ -236,23 +295,37 @@ define <3 x half> @test_ldexp_v3f16_v3i32(ptr addrspace(1) %out, <3 x half> %a, ; GFX9-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v4 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: test_ldexp_v3f16_v3i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7fff -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_med3_i32 v1, 0xffff8000, v4, v0 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v4, 16, v2 -; GFX11-GISEL-NEXT: v_med3_i32 v5, 0xffff8000, v5, v0 -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v2, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v2, v4, v5 -; GFX11-GISEL-NEXT: v_med3_i32 v4, 0xffff8000, v6, v0 -; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v2, 16, v1 -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v3, v4 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v3f16_v3i32: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v1, 0xffff8000, v6, v0 +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v0 +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v5, 0xffff8000, v5, v0 +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v3.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v2.l, v4.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v2.h, v5.l +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v3f16_v3i32: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3) +; GFX11-GISEL-FAKE16-NEXT: v_med3_i32 v1, 0xffff8000, v4, v0 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v2 +; GFX11-GISEL-FAKE16-NEXT: v_med3_i32 v5, 0xffff8000, v5, v0 +; GFX11-GISEL-FAKE16-NEXT: v_ldexp_f16_e32 v1, v2, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-GISEL-FAKE16-NEXT: v_ldexp_f16_e32 v2, v4, v5 +; GFX11-GISEL-FAKE16-NEXT: v_med3_i32 v4, 0xffff8000, v6, v0 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v2, 16, v1 +; GFX11-GISEL-FAKE16-NEXT: v_ldexp_f16_e32 v1, v3, v4 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call <3 x half> @llvm.experimental.constrained.ldexp.v3f16.v3i32(<3 x half> %a, <3 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <3 x half> %result } @@ -293,26 +366,44 @@ define <4 x half> @test_ldexp_v4f16_v4i32(ptr addrspace(1) %out, <4 x half> %a, ; GFX9-SDAG-NEXT: v_perm_b32 v1, v3, v1, s4 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: test_ldexp_v4f16_v4i32: -; GFX11-SDAG: ; %bb.0: -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x8000 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-SDAG-NEXT: v_med3_i32 v0, v6, s0, 0x7fff -; GFX11-SDAG-NEXT: v_med3_i32 v1, v7, s0, 0x7fff -; GFX11-SDAG-NEXT: v_med3_i32 v4, v4, s0, 0x7fff -; GFX11-SDAG-NEXT: v_med3_i32 v5, v5, s0, 0x7fff -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v6, 16, v2 -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v3 -; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v3, v3, v0 -; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v0, v2, v4 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v2, v6, v5 -; GFX11-SDAG-NEXT: v_ldexp_f16_e32 v1, v7, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 -; GFX11-SDAG-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_ldexp_v4f16_v4i32: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v3 +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v0, v7, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v5, v5, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v7, 16, v2 +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_med3_i32 v6, v6, s0, 0x7fff +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v1.l, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v7.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v2.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v3.l, v6.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_ldexp_v4f16_v4i32: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x8000 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v0, v6, s0, 0x7fff +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v1, v7, s0, 0x7fff +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v4, v4, s0, 0x7fff +; GFX11-SDAG-FAKE16-NEXT: v_med3_i32 v5, v5, s0, 0x7fff +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v3, v3, v0 +; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v0, v2, v4 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v2, v6, v5 +; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v1, v7, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v0, v2, v0, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: v_perm_b32 v1, v1, v3, 0x5040100 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_ldexp_v4f16_v4i32: ; GFX8-GISEL: ; %bb.0: @@ -348,30 +439,47 @@ define <4 x half> @test_ldexp_v4f16_v4i32(ptr addrspace(1) %out, <4 x half> %a, ; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v1, 16, v5 ; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-GISEL-LABEL: test_ldexp_v4f16_v4i32: -; GFX11-GISEL: ; %bb.0: -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0x7fff -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v8, 16, v3 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_med3_i32 v4, 0xffff8000, v4, v0 -; GFX11-GISEL-NEXT: v_med3_i32 v6, 0xffff8000, v6, v0 -; GFX11-GISEL-NEXT: v_med3_i32 v5, 0xffff8000, v5, v0 -; GFX11-GISEL-NEXT: v_med3_i32 v0, 0xffff8000, v7, v0 -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v2, v2, v4 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v3, v3, v6 -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v1, v1, v5 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_ldexp_f16_e32 v4, v8, v0 -; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v3 -; GFX11-GISEL-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_lshl_or_b32 v1, v4, 16, v2 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-GISEL-TRUE16-LABEL: test_ldexp_v4f16_v4i32: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v1, 0xffff8000, v4, v0 +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v4, 0xffff8000, v5, v0 +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v5, 0xffff8000, v6, v0 +; GFX11-GISEL-TRUE16-NEXT: v_med3_i32 v6, 0xffff8000, v7, v0 +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.l, v2.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v0.h, v2.h, v4.l +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.l, v3.l, v5.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e32 v1.h, v3.h, v6.l +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_ldexp_v4f16_v4i32: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, 0x7fff +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v3 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-GISEL-FAKE16-NEXT: v_med3_i32 v4, 0xffff8000, v4, v0 +; GFX11-GISEL-FAKE16-NEXT: v_med3_i32 v6, 0xffff8000, v6, v0 +; GFX11-GISEL-FAKE16-NEXT: v_med3_i32 v5, 0xffff8000, v5, v0 +; GFX11-GISEL-FAKE16-NEXT: v_med3_i32 v0, 0xffff8000, v7, v0 +; GFX11-GISEL-FAKE16-NEXT: v_ldexp_f16_e32 v2, v2, v4 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-FAKE16-NEXT: v_ldexp_f16_e32 v3, v3, v6 +; GFX11-GISEL-FAKE16-NEXT: v_ldexp_f16_e32 v1, v1, v5 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-GISEL-FAKE16-NEXT: v_ldexp_f16_e32 v4, v8, v0 +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff, v3 +; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_lshl_or_b32 v1, v4, 16, v2 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] %result = call <4 x half> @llvm.experimental.constrained.ldexp.v4f16.v4i32(<4 x half> %a, <4 x i32> %b, metadata !"round.dynamic", metadata !"fpexcept.strict") ret <4 x half> %result } @@ -388,5 +496,7 @@ attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessib ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; GCN: {{.*}} ; GFX11: {{.*}} +; GFX11-GISEL: {{.*}} +; GFX11-SDAG: {{.*}} ; GFX8: {{.*}} ; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll index c9547e2c68c82..5ec19a54403e0 100644 --- a/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/sub.v2i16.ll @@ -2,7 +2,8 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,GFX9 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GCN,VI ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX10 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11,GFX11-TRUE16 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s -enable-var-scope -check-prefixes=GFX11,GFX11-FAKE16 ; FIXME: Need to handle non-uniform case for function below (load without gep). define amdgpu_kernel void @v_test_sub_v2i16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) #1 { @@ -785,28 +786,53 @@ define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(ptr addrspace(1) %out, ; GFX10-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX10-NEXT: s_endpgm ; -; GFX11-LABEL: v_test_sub_v2i16_zext_to_v2i64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: v_pk_sub_i16 v0, v1, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_alignbit_b32 v2, 0, v0, 16 -; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: v_test_sub_v2i16_zext_to_v2i64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v2, 0, 16, v2 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: v_test_sub_v2i16_zext_to_v2i64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v1, v0, s[2:3] glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: global_load_b32 v0, v0, s[4:5] glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: v_pk_sub_i16 v0, v1, v0 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_alignbit_b32 v2, 0, v0, 16 +; GFX11-FAKE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %gep.out = getelementptr inbounds <2 x i64>, ptr addrspace(1) %out, i32 %tid %gep.in0 = getelementptr inbounds <2 x i16>, ptr addrspace(1) %in0, i32 %tid diff --git a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll index ee16dad2d7d11..580938f922a04 100644 --- a/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=tahiti -verify-machineinstrs | FileCheck %s --check-prefix=SI ; RUN: llc < %s -mtriple=amdgcn-- -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=VI -; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefix=GFX11 +; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-TRUE16 +; RUN: llc < %s -mtriple=amdgcn-- -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX11-FAKE16 define amdgpu_kernel void @madak_f16( ; SI-LABEL: madak_f16: @@ -52,30 +53,55 @@ define amdgpu_kernel void @madak_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: madak_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_mov_b32 s6, s10 -; GFX11-NEXT: s_mov_b32 s7, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s2 -; GFX11-NEXT: s_mov_b32 s13, s3 -; GFX11-NEXT: buffer_load_u16 v0, off, s[12:15], 0 -; GFX11-NEXT: buffer_load_u16 v1, off, s[4:7], 0 -; GFX11-NEXT: s_mov_b32 s8, s0 -; GFX11-NEXT: s_mov_b32 s9, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mul_f16_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: madak_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0x4900, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: madak_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 0x4900, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a, ptr addrspace(1) %b) #0 { @@ -164,42 +190,79 @@ define amdgpu_kernel void @madak_f16_use_2( ; VI-NEXT: buffer_store_short v3, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: madak_f16_use_2: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s18, s6 -; GFX11-NEXT: s_mov_b32 s19, s7 -; GFX11-NEXT: s_mov_b32 s22, s6 -; GFX11-NEXT: s_mov_b32 s23, s7 -; GFX11-NEXT: s_mov_b32 s2, s6 -; GFX11-NEXT: s_mov_b32 s3, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s16, s12 -; GFX11-NEXT: s_mov_b32 s17, s13 -; GFX11-NEXT: s_mov_b32 s20, s14 -; GFX11-NEXT: s_mov_b32 s21, s15 -; GFX11-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: buffer_load_u16 v2, off, s[0:3], 0 glc dlc -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: s_mov_b32 s4, s8 -; GFX11-NEXT: s_mov_b32 s5, s9 -; GFX11-NEXT: s_mov_b32 s0, s10 -; GFX11-NEXT: s_mov_b32 s1, s11 -; GFX11-NEXT: v_mul_f16_e32 v1, v0, v1 -; GFX11-NEXT: v_mul_f16_e32 v0, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_f16_e32 v1, 0x4900, v1 -; GFX11-NEXT: v_add_f16_e32 v0, 0x4900, v0 -; GFX11-NEXT: buffer_store_b16 v1, off, s[4:7], 0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[0:3], 0 -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: madak_f16_use_2: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s18, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s19, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s22, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s23, s7 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s16, s12 +; GFX11-TRUE16-NEXT: s_mov_b32 s17, s13 +; GFX11-TRUE16-NEXT: s_mov_b32 s20, s14 +; GFX11-TRUE16-NEXT: s_mov_b32 s21, s15 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: buffer_load_u16 v2, off, s[0:3], 0 glc dlc +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s0, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s1, s11 +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.h, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mul_f16_e32 v0.l, v0.l, v2.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_add_f16_e32 v1.l, 0x4900, v0.h +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, 0x4900, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: madak_f16_use_2: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b256 s[8:15], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x44 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s18, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s19, s7 +; GFX11-FAKE16-NEXT: s_mov_b32 s22, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s23, s7 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s16, s12 +; GFX11-FAKE16-NEXT: s_mov_b32 s17, s13 +; GFX11-FAKE16-NEXT: s_mov_b32 s20, s14 +; GFX11-FAKE16-NEXT: s_mov_b32 s21, s15 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[16:19], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v1, off, s[20:23], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: buffer_load_u16 v2, off, s[0:3], 0 glc dlc +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s8 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9 +; GFX11-FAKE16-NEXT: s_mov_b32 s0, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s1, s11 +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v1, v0, v1 +; GFX11-FAKE16-NEXT: v_mul_f16_e32 v0, v0, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_add_f16_e32 v1, 0x4900, v1 +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, 0x4900, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r0, ptr addrspace(1) %r1, ptr addrspace(1) %a, @@ -221,3 +284,5 @@ entry: } attributes #0 = { "denormal-fp-math"="preserve-sign,preserve-sign" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll index b81f5d0a19ba8..556c553cfd7d5 100644 --- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll +++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll @@ -2,12 +2,14 @@ ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-GFX12,SDAG-GFX12-FAKE16 %s ; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-VI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1101 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GFX11,GISEL-GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs -global-isel < %s | FileCheck -check-prefixes=GISEL-GFX12,GISEL-GFX12-FAKE16 %s ; @basic_smax_smin(i16 %src0, i16 %src1) { ; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: basic_smax_smin: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff -; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-FAKE16-LABEL: basic_smax_smin: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smax_smin: ; GISEL-VI: ; %bb.0: @@ -92,19 +105,30 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) { ; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX12-LABEL: basic_smax_smin: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; GISEL-GFX12-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-FAKE16-LABEL: basic_smax_smin: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX12-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) @@ -158,18 +182,34 @@ define amdgpu_kernel void @basic_smax_smin_sgpr(ptr addrspace(1) %out, i32 inreg ; SDAG-GFX11-NEXT: global_store_b32 v2, v0, s[0:1] ; SDAG-GFX11-NEXT: s_endpgm ; -; SDAG-GFX12-LABEL: basic_smax_smin_sgpr: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; SDAG-GFX12-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_med3_i16 v0, s2, 0, 0xff -; SDAG-GFX12-NEXT: v_med3_i16 v1, s3, 0, 0xff -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; SDAG-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; SDAG-GFX12-NEXT: global_store_b32 v2, v0, s[0:1] -; SDAG-GFX12-NEXT: s_endpgm +; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_sgpr: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; SDAG-GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2 +; SDAG-GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.h, s3 +; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v1.l, v0.h, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX12-TRUE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; SDAG-GFX12-TRUE16-NEXT: global_store_b32 v2, v0, s[0:1] +; SDAG-GFX12-TRUE16-NEXT: s_endpgm +; +; SDAG-GFX12-FAKE16-LABEL: basic_smax_smin_sgpr: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; SDAG-GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, 0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v0, s2, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v1, s3, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; SDAG-GFX12-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; SDAG-GFX12-FAKE16-NEXT: global_store_b32 v2, v0, s[0:1] +; SDAG-GFX12-FAKE16-NEXT: s_endpgm ; ; GISEL-VI-LABEL: basic_smax_smin_sgpr: ; GISEL-VI: ; %bb.0: @@ -300,18 +340,29 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) { ; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: basic_smin_smax: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff -; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX12-TRUE16-LABEL: basic_smin_smax: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-FAKE16-LABEL: basic_smin_smax: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smin_smax: ; GISEL-VI: ; %bb.0: @@ -344,19 +395,30 @@ define <2 x i16> @basic_smin_smax(i16 %src0, i16 %src1) { ; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX12-LABEL: basic_smin_smax: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX12-TRUE16-LABEL: basic_smin_smax: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; GISEL-GFX12-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-FAKE16-LABEL: basic_smin_smax: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX12-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255) %src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0) @@ -398,18 +460,29 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) { ; SDAG-GFX11-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: basic_smin_smax_combined: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff -; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX12-TRUE16-LABEL: basic_smin_smax_combined: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-FAKE16-LABEL: basic_smin_smax_combined: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 +; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smin_smax_combined: ; GISEL-VI: ; %bb.0: @@ -442,19 +515,30 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) { ; GISEL-GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX12-LABEL: basic_smin_smax_combined: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GISEL-GFX12-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX12-TRUE16-LABEL: basic_smin_smax_combined: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; GISEL-GFX12-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-FAKE16-LABEL: basic_smin_smax_combined: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GISEL-GFX12-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %src0.min = call i16 @llvm.smin.i16(i16 %src0, i16 255) %src0.clamp = call i16 @llvm.smax.i16(i16 %src0.min, i16 0) @@ -825,19 +909,33 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) { ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: basic_smax_smin_bit_or: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff -; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_bit_or: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; SDAG-GFX12-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-FAKE16-LABEL: basic_smax_smin_bit_or: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smax_smin_bit_or: ; GISEL-VI: ; %bb.0: @@ -860,19 +958,33 @@ define i16 @basic_smax_smin_bit_or(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX12-LABEL: basic_smax_smin_bit_or: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_bit_or: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; GISEL-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; GISEL-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; GISEL-GFX12-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-FAKE16-LABEL: basic_smax_smin_bit_or: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) @@ -912,19 +1024,33 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) { ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: basic_umax_umin_bit_or: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_min_u16 v1, 0xff, v1 -; SDAG-GFX12-NEXT: v_min_u16 v0, 0xff, v0 -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX12-TRUE16-LABEL: basic_umax_umin_bit_or: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_min_u16 v0.h, 0xff, v1.l +; SDAG-GFX12-TRUE16-NEXT: v_min_u16 v0.l, 0xff, v0.l +; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; SDAG-GFX12-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-FAKE16-LABEL: basic_umax_umin_bit_or: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_min_u16 v1, 0xff, v1 +; SDAG-GFX12-FAKE16-NEXT: v_min_u16 v0, 0xff, v0 +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_umax_umin_bit_or: ; GISEL-VI: ; %bb.0: @@ -944,19 +1070,33 @@ define i16 @basic_umax_umin_bit_or(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX12-LABEL: basic_umax_umin_bit_or: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_min_u16 v1, 0xff, v1 -; GISEL-GFX12-NEXT: v_min_u16 v0, 0xff, v0 -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX12-TRUE16-LABEL: basic_umax_umin_bit_or: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: v_min_u16 v0.h, 0xff, v1.l +; GISEL-GFX12-TRUE16-NEXT: v_min_u16 v0.l, 0xff, v0.l +; GISEL-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; GISEL-GFX12-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-FAKE16-LABEL: basic_umax_umin_bit_or: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: v_min_u16 v1, 0xff, v1 +; GISEL-GFX12-FAKE16-NEXT: v_min_u16 v0, 0xff, v0 +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %src0.max = call i16 @llvm.umax.i16(i16 %src0, i16 0) %src0.clamp = call i16 @llvm.umin.i16(i16 %src0.max, i16 255) @@ -1001,21 +1141,37 @@ define i16 @basic_smax_smin_vec_cast(i16 %src0, i16 %src1) { ; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: basic_smax_smin_vec_cast: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff -; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_vec_cast: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; SDAG-GFX12-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v0.l +; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-FAKE16-LABEL: basic_smax_smin_vec_cast: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-GFX12-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-GFX12-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smax_smin_vec_cast: ; GISEL-VI: ; %bb.0: @@ -1048,19 +1204,33 @@ define i16 @basic_smax_smin_vec_cast(i16 %src0, i16 %src1) { ; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX12-LABEL: basic_smax_smin_vec_cast: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_med3_i16 v1, v1, 0, 0xff -; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_vec_cast: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: v_med3_i16 v0.h, v1.l, 0, 0xff +; GISEL-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; GISEL-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; GISEL-GFX12-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-FAKE16-LABEL: basic_smax_smin_vec_cast: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: v_med3_i16 v1, v1, 0, 0xff +; GISEL-GFX12-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) @@ -1103,19 +1273,33 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) { ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: basic_smax_smin_bit_shl: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_max_i16 v1, v1, 0 -; SDAG-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_bit_shl: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_max_i16 v0.h, v1.l, 0 +; SDAG-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; SDAG-GFX12-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-FAKE16-LABEL: basic_smax_smin_bit_shl: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_max_i16 v1, v1, 0 +; SDAG-GFX12-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smax_smin_bit_shl: ; GISEL-VI: ; %bb.0: @@ -1137,19 +1321,33 @@ define i16 @basic_smax_smin_bit_shl(i16 %src0, i16 %src1) { ; GISEL-GFX9-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX12-LABEL: basic_smax_smin_bit_shl: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_max_i16 v1, v1, 0 -; GISEL-GFX12-NEXT: v_med3_i16 v0, v0, 0, 0xff -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_bit_shl: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: v_max_i16 v0.h, v1.l, 0 +; GISEL-GFX12-TRUE16-NEXT: v_med3_i16 v0.l, v0.l, 0, 0xff +; GISEL-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.h, 8, v0.h +; GISEL-GFX12-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h +; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-FAKE16-LABEL: basic_smax_smin_bit_shl: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: v_max_i16 v1, v1, 0 +; GISEL-GFX12-FAKE16-NEXT: v_med3_i16 v0, v0, 0, 0xff +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %src0.max = call i16 @llvm.smax.i16(i16 %src0, i16 0) %src0.clamp = call i16 @llvm.smin.i16(i16 %src0.max, i16 255) @@ -1194,21 +1392,37 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) { ; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: basic_smax_smin_vec_input: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 -; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_vec_input: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-TRUE16-NEXT: v_pk_max_i16 v1, v0, 0 +; SDAG-GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l +; SDAG-GFX12-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-FAKE16-LABEL: basic_smax_smin_vec_input: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-FAKE16-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smax_smin_vec_input: ; GISEL-VI: ; %bb.0: @@ -1249,24 +1463,42 @@ define i16 @basic_smax_smin_vec_input(<2 x i16> %src) { ; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX12-LABEL: basic_smax_smin_vec_input: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0 -; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_vec_input: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-TRUE16-NEXT: v_pk_max_i16 v1, 0, v0 +; GISEL-GFX12-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.h +; GISEL-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GISEL-GFX12-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l +; GISEL-GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l +; GISEL-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX12-TRUE16-NEXT: v_or_b16 v0.l, v0.h, v0.l +; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-FAKE16-LABEL: basic_smax_smin_vec_input: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX12-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %smin = call <2 x i16> @llvm.smin.v2i16(<2 x i16> , <2 x i16> %src) %smed = call <2 x i16> @llvm.smax.v2i16(<2 x i16> , <2 x i16> %smin) @@ -1310,21 +1542,37 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { ; SDAG-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; SDAG-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; SDAG-GFX12-LABEL: basic_smax_smin_vec_input_rev: -; SDAG-GFX12: ; %bb.0: -; SDAG-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; SDAG-GFX12-NEXT: s_wait_expcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_samplecnt 0x0 -; SDAG-GFX12-NEXT: s_wait_bvhcnt 0x0 -; SDAG-GFX12-NEXT: s_wait_kmcnt 0x0 -; SDAG-GFX12-NEXT: v_pk_max_i16 v0, v0, 0 -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] -; SDAG-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; SDAG-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; SDAG-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; SDAG-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 -; SDAG-GFX12-NEXT: s_setpc_b64 s[30:31] +; SDAG-GFX12-TRUE16-LABEL: basic_smax_smin_vec_input_rev: +; SDAG-GFX12-TRUE16: ; %bb.0: +; SDAG-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-TRUE16-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-TRUE16-NEXT: v_pk_min_i16 v1, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 16, v1 +; SDAG-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v0.l +; SDAG-GFX12-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; SDAG-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; SDAG-GFX12-FAKE16-LABEL: basic_smax_smin_vec_input_rev: +; SDAG-GFX12-FAKE16: ; %bb.0: +; SDAG-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; SDAG-GFX12-FAKE16-NEXT: v_pk_max_i16 v0, v0, 0 +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-FAKE16-NEXT: v_pk_min_i16 v0, 0xff, v0 op_sel_hi:[0,1] +; SDAG-GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; SDAG-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-GFX12-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; SDAG-GFX12-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-VI-LABEL: basic_smax_smin_vec_input_rev: ; GISEL-VI: ; %bb.0: @@ -1364,24 +1612,39 @@ define i16 @basic_smax_smin_vec_input_rev(<2 x i16> %src) { ; GISEL-GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31] ; -; GISEL-GFX12-LABEL: basic_smax_smin_vec_input_rev: -; GISEL-GFX12: ; %bb.0: -; GISEL-GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GISEL-GFX12-NEXT: s_wait_expcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_samplecnt 0x0 -; GISEL-GFX12-NEXT: s_wait_bvhcnt 0x0 -; GISEL-GFX12-NEXT: s_wait_kmcnt 0x0 -; GISEL-GFX12-NEXT: v_pk_max_i16 v0, 0, v0 -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 -; GISEL-GFX12-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GISEL-GFX12-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_and_b32_e32 v1, 0xff, v1 -; GISEL-GFX12-NEXT: v_lshlrev_b16 v1, 8, v1 -; GISEL-GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GISEL-GFX12-NEXT: v_or_b32_e32 v0, v0, v1 -; GISEL-GFX12-NEXT: s_setpc_b64 s[30:31] +; GISEL-GFX12-TRUE16-LABEL: basic_smax_smin_vec_input_rev: +; GISEL-GFX12-TRUE16: ; %bb.0: +; GISEL-GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-TRUE16-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-TRUE16-NEXT: v_pk_min_i16 v1, 0xff00ff, v0 +; GISEL-GFX12-TRUE16-NEXT: v_lshlrev_b16 v0.l, 8, v1.h +; GISEL-GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX12-TRUE16-NEXT: v_or_b16 v0.l, v1.l, v0.l +; GISEL-GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GISEL-GFX12-FAKE16-LABEL: basic_smax_smin_vec_input_rev: +; GISEL-GFX12-FAKE16: ; %bb.0: +; GISEL-GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GISEL-GFX12-FAKE16-NEXT: v_pk_max_i16 v0, 0, v0 +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_pk_min_i16 v0, 0xff00ff, v0 +; GISEL-GFX12-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GISEL-GFX12-FAKE16-NEXT: v_and_b32_e32 v0, 0xff, v0 +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_and_b32_e32 v1, 0xff, v1 +; GISEL-GFX12-FAKE16-NEXT: v_lshlrev_b16 v1, 8, v1 +; GISEL-GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GISEL-GFX12-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GISEL-GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] %smax = call <2 x i16> @llvm.smax.v2i16(<2 x i16> , <2 x i16> %src) %smed = call <2 x i16> @llvm.smin.v2i16(<2 x i16> , <2 x i16> %smax)