-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[AMDGPU][NFC] Enable gfx942 for more tests #154363
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Janek van Oirschot (JanekvO) Changes: Enable gfx942 for tests that are affected by an AMDGPU bitcast constant combine (#154115). Expecting to see more tests affected in the aforementioned PR after it is rebased on top of this PR. Patch is 2.25 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/154363.diff 37 Files Affected:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index c7385e4324e2c..c6ad5c93fb7fa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -2,6 +2,7 @@
; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX942 %s
define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; CHECK-LABEL: @udiv_i32(
@@ -98,6 +99,37 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: udiv_i32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT: s_sub_i32 s4, 0, s3
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s5, v0
+; GFX942-NEXT: s_mul_i32 s4, s4, s5
+; GFX942-NEXT: s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT: s_add_i32 s5, s5, s4
+; GFX942-NEXT: s_mul_hi_u32 s4, s2, s5
+; GFX942-NEXT: s_mul_i32 s5, s4, s3
+; GFX942-NEXT: s_sub_i32 s2, s2, s5
+; GFX942-NEXT: s_add_i32 s6, s4, 1
+; GFX942-NEXT: s_sub_i32 s5, s2, s3
+; GFX942-NEXT: s_cmp_ge_u32 s2, s3
+; GFX942-NEXT: s_cselect_b32 s4, s6, s4
+; GFX942-NEXT: s_cselect_b32 s2, s5, s2
+; GFX942-NEXT: s_add_i32 s5, s4, 1
+; GFX942-NEXT: s_cmp_ge_u32 s2, s3
+; GFX942-NEXT: s_cselect_b32 s2, s5, s4
+; GFX942-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = udiv i32 %x, %y
store i32 %r, ptr addrspace(1) %out
ret void
@@ -191,6 +223,35 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: urem_i32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT: s_sub_i32 s4, 0, s3
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s5, v0
+; GFX942-NEXT: s_mul_i32 s4, s4, s5
+; GFX942-NEXT: s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT: s_add_i32 s5, s5, s4
+; GFX942-NEXT: s_mul_hi_u32 s4, s2, s5
+; GFX942-NEXT: s_mul_i32 s4, s4, s3
+; GFX942-NEXT: s_sub_i32 s2, s2, s4
+; GFX942-NEXT: s_sub_i32 s4, s2, s3
+; GFX942-NEXT: s_cmp_ge_u32 s2, s3
+; GFX942-NEXT: s_cselect_b32 s2, s4, s2
+; GFX942-NEXT: s_sub_i32 s4, s2, s3
+; GFX942-NEXT: s_cmp_ge_u32 s2, s3
+; GFX942-NEXT: s_cselect_b32 s2, s4, s2
+; GFX942-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = urem i32 %x, %y
store i32 %r, ptr addrspace(1) %out
ret void
@@ -312,6 +373,42 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: sdiv_i32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_abs_i32 s4, s3
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s4
+; GFX942-NEXT: s_sub_i32 s5, 0, s4
+; GFX942-NEXT: s_xor_b32 s3, s2, s3
+; GFX942-NEXT: s_abs_i32 s2, s2
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT: s_ashr_i32 s3, s3, 31
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s6, v0
+; GFX942-NEXT: s_mul_i32 s5, s5, s6
+; GFX942-NEXT: s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT: s_add_i32 s6, s6, s5
+; GFX942-NEXT: s_mul_hi_u32 s5, s2, s6
+; GFX942-NEXT: s_mul_i32 s6, s5, s4
+; GFX942-NEXT: s_sub_i32 s2, s2, s6
+; GFX942-NEXT: s_add_i32 s7, s5, 1
+; GFX942-NEXT: s_sub_i32 s6, s2, s4
+; GFX942-NEXT: s_cmp_ge_u32 s2, s4
+; GFX942-NEXT: s_cselect_b32 s5, s7, s5
+; GFX942-NEXT: s_cselect_b32 s2, s6, s2
+; GFX942-NEXT: s_add_i32 s6, s5, 1
+; GFX942-NEXT: s_cmp_ge_u32 s2, s4
+; GFX942-NEXT: s_cselect_b32 s2, s6, s5
+; GFX942-NEXT: s_xor_b32 s2, s2, s3
+; GFX942-NEXT: s_sub_i32 s2, s2, s3
+; GFX942-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = sdiv i32 %x, %y
store i32 %r, ptr addrspace(1) %out
ret void
@@ -423,6 +520,40 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: srem_i32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_abs_i32 s3, s3
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT: s_sub_i32 s5, 0, s3
+; GFX942-NEXT: s_ashr_i32 s4, s2, 31
+; GFX942-NEXT: s_abs_i32 s2, s2
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_readfirstlane_b32 s6, v0
+; GFX942-NEXT: s_mul_i32 s5, s5, s6
+; GFX942-NEXT: s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT: s_add_i32 s6, s6, s5
+; GFX942-NEXT: s_mul_hi_u32 s5, s2, s6
+; GFX942-NEXT: s_mul_i32 s5, s5, s3
+; GFX942-NEXT: s_sub_i32 s2, s2, s5
+; GFX942-NEXT: s_sub_i32 s5, s2, s3
+; GFX942-NEXT: s_cmp_ge_u32 s2, s3
+; GFX942-NEXT: s_cselect_b32 s2, s5, s2
+; GFX942-NEXT: s_sub_i32 s5, s2, s3
+; GFX942-NEXT: s_cmp_ge_u32 s2, s3
+; GFX942-NEXT: s_cselect_b32 s2, s5, s2
+; GFX942-NEXT: s_xor_b32 s2, s2, s4
+; GFX942-NEXT: s_sub_i32 s2, s2, s4
+; GFX942-NEXT: v_mov_b32_e32 v0, s2
+; GFX942-NEXT: global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = srem i32 %x, %y
store i32 %r, ptr addrspace(1) %out
ret void
@@ -492,6 +623,29 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_short v3, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: udiv_i16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_lshr_b32 s1, s0, 16
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s1
+; GFX942-NEXT: s_and_b32 s0, s0, 0xffff
+; GFX942-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f32_e32 v2, v1, v2
+; GFX942-NEXT: v_trunc_f32_e32 v2, v2
+; GFX942-NEXT: v_cvt_u32_f32_e32 v4, v2
+; GFX942-NEXT: v_fma_f32 v1, -v2, v0, v1
+; GFX942-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_short v3, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = udiv i16 %x, %y
store i16 %r, ptr addrspace(1) %out
ret void
@@ -567,6 +721,31 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_short v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: urem_i16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT: v_mov_b32_e32 v3, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_lshr_b32 s3, s2, 16
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT: s_and_b32 s0, s2, 0xffff
+; GFX942-NEXT: v_cvt_f32_u32_e32 v1, s0
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mul_f32_e32 v2, v1, v2
+; GFX942-NEXT: v_trunc_f32_e32 v2, v2
+; GFX942-NEXT: v_cvt_u32_f32_e32 v4, v2
+; GFX942-NEXT: v_fma_f32 v1, -v2, v0, v1
+; GFX942-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT: v_mul_lo_u32 v0, v0, s3
+; GFX942-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: global_store_short v3, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = urem i16 %x, %y
store i16 %r, ptr addrspace(1) %out
ret void
@@ -648,6 +827,31 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: v_add_u32_e32 v0, s2, v3
; GFX9-NEXT: global_store_short v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: sdiv_i16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_ashr_i32 s3, s2, 16
+; GFX942-NEXT: v_cvt_f32_i32_e32 v0, s3
+; GFX942-NEXT: s_sext_i32_i16 s2, s2
+; GFX942-NEXT: v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT: s_xor_b32 s2, s2, s3
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT: s_ashr_i32 s2, s2, 30
+; GFX942-NEXT: s_or_b32 s4, s2, 1
+; GFX942-NEXT: v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT: v_trunc_f32_e32 v3, v3
+; GFX942-NEXT: v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT: s_cselect_b32 s2, s4, 0
+; GFX942-NEXT: v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT: global_store_short v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = sdiv i16 %x, %y
store i16 %r, ptr addrspace(1) %out
ret void
@@ -735,6 +939,33 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0
; GFX9-NEXT: global_store_short v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: srem_i16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_ashr_i32 s4, s6, 16
+; GFX942-NEXT: v_cvt_f32_i32_e32 v0, s4
+; GFX942-NEXT: s_sext_i32_i16 s2, s6
+; GFX942-NEXT: v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT: s_xor_b32 s2, s2, s4
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT: s_ashr_i32 s2, s2, 30
+; GFX942-NEXT: s_or_b32 s5, s2, 1
+; GFX942-NEXT: v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT: v_trunc_f32_e32 v3, v3
+; GFX942-NEXT: v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT: s_cselect_b32 s2, s5, 0
+; GFX942-NEXT: v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT: v_mul_lo_u32 v0, v0, s4
+; GFX942-NEXT: v_sub_u32_e32 v0, s6, v0
+; GFX942-NEXT: global_store_short v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = srem i16 %x, %y
store i16 %r, ptr addrspace(1) %out
ret void
@@ -798,6 +1029,25 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
; GFX9-NEXT: global_store_byte v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: udiv_i8:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_ubyte1_e32 v0, s2
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v1, v0
+; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v3, s2
+; GFX942-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX942-NEXT: v_trunc_f32_e32 v1, v1
+; GFX942-NEXT: v_cvt_u32_f32_e32 v4, v1
+; GFX942-NEXT: v_fma_f32 v1, -v1, v0, v3
+; GFX942-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT: global_store_byte v2, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = udiv i8 %x, %y
store i8 %r, ptr addrspace(1) %out
ret void
@@ -869,6 +1119,28 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0
; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: urem_i8:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_ubyte1_e32 v0, s2
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v1, v0
+; GFX942-NEXT: v_cvt_f32_ubyte0_e32 v3, s2
+; GFX942-NEXT: s_lshr_b32 s3, s2, 8
+; GFX942-NEXT: v_mul_f32_e32 v1, v3, v1
+; GFX942-NEXT: v_trunc_f32_e32 v1, v1
+; GFX942-NEXT: v_cvt_u32_f32_e32 v4, v1
+; GFX942-NEXT: v_fma_f32 v1, -v1, v0, v3
+; GFX942-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT: v_mul_lo_u32 v0, v0, s3
+; GFX942-NEXT: v_sub_u32_e32 v0, s2, v0
+; GFX942-NEXT: global_store_byte v2, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = urem i8 %x, %y
store i8 %r, ptr addrspace(1) %out
ret void
@@ -950,6 +1222,31 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
; GFX9-NEXT: v_add_u32_e32 v0, s2, v3
; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: sdiv_i8:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v1, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_bfe_i32 s3, s2, 0x80008
+; GFX942-NEXT: v_cvt_f32_i32_e32 v0, s3
+; GFX942-NEXT: s_sext_i32_i8 s2, s2
+; GFX942-NEXT: v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT: s_xor_b32 s2, s2, s3
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT: s_ashr_i32 s2, s2, 30
+; GFX942-NEXT: s_or_b32 s4, s2, 1
+; GFX942-NEXT: v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT: v_trunc_f32_e32 v3, v3
+; GFX942-NEXT: v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT: s_cselect_b32 s2, s4, 0
+; GFX942-NEXT: v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT: global_store_byte v1, v0, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = sdiv i8 %x, %y
store i8 %r, ptr addrspace(1) %out
ret void
@@ -1039,6 +1336,34 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
; GFX9-NEXT: v_sub_u32_e32 v0, s6, v0
; GFX9-NEXT: global_store_byte v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: srem_i8:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_bfe_i32 s2, s6, 0x80008
+; GFX942-NEXT: v_cvt_f32_i32_e32 v1, s2
+; GFX942-NEXT: s_sext_i32_i8 s3, s6
+; GFX942-NEXT: v_cvt_f32_i32_e32 v2, s3
+; GFX942-NEXT: s_xor_b32 s2, s3, s2
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v3, v1
+; GFX942-NEXT: s_ashr_i32 s2, s2, 30
+; GFX942-NEXT: s_lshr_b32 s4, s6, 8
+; GFX942-NEXT: s_or_b32 s5, s2, 1
+; GFX942-NEXT: v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT: v_trunc_f32_e32 v3, v3
+; GFX942-NEXT: v_fma_f32 v2, -v3, v1, v2
+; GFX942-NEXT: v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT: v_cmp_ge_f32_e64 s[2:3], |v2|, |v1|
+; GFX942-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT: s_cselect_b32 s2, s5, 0
+; GFX942-NEXT: v_add_u32_e32 v1, s2, v3
+; GFX942-NEXT: v_mul_lo_u32 v1, v1, s4
+; GFX942-NEXT: v_sub_u32_e32 v1, s6, v1
+; GFX942-NEXT: global_store_byte v0, v1, s[0:1]
+; GFX942-NEXT: s_endpgm
%r = srem i8 %x, %y
store i8 %r, ptr addrspace(1) %out
ret void
@@ -1367,6 +1692,99 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
; GFX9-NEXT: v_mov_b32_e32 v3, s5
; GFX9-NEXT: global_store_dwordx4 v4, v[0:3], s[0:1]
; GFX9-NEXT: s_endpgm
+;
+; GFX942-LABEL: udiv_v4i32:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v2, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s12
+; GFX942-NEXT: v_cvt_f32_u32_e32 v1, s13
+; GFX942-NEXT: s_sub_i32 s2, 0, s12
+; GFX942-NEXT: v_cvt_f32_u32_e32 v3, s14
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT: v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT: v_readfirstlane_b32 s3, v0
+; GFX942-NEXT: s_mul_i32 s2, s2, s3
+; GFX942-NEXT: s_mul_hi_u32 s2, s3, s2
+; GFX942-NEXT: s_add_i32 s3, s3, s2
+; GFX942-NEXT: s_mul_hi_u32 s2, s8, s3
+; GFX942-NEXT: s_mul_i32 s3, s2, s12
+; GFX942-NEXT: s_sub_i32 s3, s8, s3
+; GFX942-NEXT: s_add_i32 s5, s2, 1
+; GFX942-NEXT: s_sub_i32 s6, s3, s12
+; GFX942-NEXT: s_cmp_ge_u32 s3, s12
+; GFX942-NEXT: s_cselect_b32 s2, s5, s2
+; GFX942-NEXT: s_cselect_b32 s3, s6, s3
+; GFX942-NEXT: s_add_i32 s5, s2, 1
+; GFX942-NEXT: s_cmp_ge_u32 s3, s12
+; GFX942-NEXT: v_readfirstlane_b32 s4, v1
+; GFX942-NEXT: s_cselect_b32 s2, s5, s2
+; GFX942-NEXT: s_sub_i32 s3, 0, s13
+; GFX942-NEXT: s_mul_i32 s3, s3, s4
+; GFX942-NEXT: s_mul_hi_u32 s3, s4, s3
+; GFX942-NEXT: s_add_i32 s4, s4, s3
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v3
+; GFX942-NEXT: s_mul_hi_u32 s3, s9, s4
+; GFX942-NEXT: s_mul_i32 s4, s3, s13
+; GFX942-NEXT: s_sub_i32 s4, s9, s4
+; GFX942-NEXT: s_add_i32 s5, s3, 1
+; GFX942-NEXT: s_sub_i32 s6, s4, s13
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: s_cmp_ge_u32 s4, s13
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: s_cselect_b32 s3, s5, s3
+; GFX942-NEXT: s_cselect_b32 s4, s6, s4
+; GFX942-NEXT: s_add_i32 s5, s3, 1
+; GFX942-NEXT: s_cmp_ge_u32 s4, s13
+; GFX942-NEXT: s_cselect_b32 s3, s5, s3
+; GFX942-NEXT: v_readfirstlane_b32 s5, v0
+; GFX942-NEXT: v_cvt_f32_u32_e32 v0, s15
+; GFX942-NEXT: s_sub_i32 s4, 0, s14
+; GFX942-NEXT: s_mul_i32 s4, s4, s5
+; GFX942-NEXT: s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT: s_add_i32 s5, s5, s4
+; GFX942-NEXT: v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT: s_mul_hi_u32 s4, s10, s5
+; GFX942-NEXT: s_mul_i32 s5, s4, s14
+; GFX942-NEXT: s_sub_i32 s5, s10, s5
+; GFX942-NEXT: s_add_i32 s6, s4, 1
+; GFX942-NEXT: s_sub_i32 s7, s5, s14
+; GFX942-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT: s_cmp_ge_u32 s5, s14
+; GFX942-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT: s_cselect_b32 s4, s6, s4
+; GFX942-NEXT: s_cselect_b32 s5, s7, s5
+; GFX942-NEXT: s_add_i32 s6, s4, 1
+; GFX942-NEXT: s_cmp_ge_u32 s5, s14
+; GFX942-NEXT: s_cselect_b...
[truncated]
|
|
I've rebased locally on top of the changes in #154115, and it does affect about 28 of the 37 tests I'm enabling gfx942 for here; let me know if I should cut the number of tests added in this PR to only those 28. |
arsenm
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not every test that changes needs to run with 942, this can be more targeted.
I was hoping the original commit had a more targeted test, but 7900334 looks like a bunch of random test changes. The int_to_fp*s removed FIXMEs so those are probably more interesting.
Sorry, which FIXMEs? Also, I have only enabled gfx942 for the int_to_fp* tests that were affected by the bitcast combine, so let me know if I should do so for the other int_to_fp* tests as well. |
The ones in the test diffs from 7900334 |
|
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/10/builds/11822 Here is the relevant piece of the build log for reference. |
Enable gfx942 for tests that are affected by an AMDGPU bitcast constant combine (#154115).
Expecting to see more tests affected in the aforementioned PR after it is rebased on top of this PR.