-
Notifications
You must be signed in to change notification settings - Fork 15.3k
[AMDGPU][True16][CodeGen] remove 32 bit and/or/xor pattern from true16 #131634
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
|
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) Changes Full diff: https://github.com/llvm/llvm-project/pull/131634.diff 8 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index de77401eb0137..3d0bd4f28881c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1964,6 +1964,8 @@ def : GCNPat <
(S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000)))
>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let SubtargetPredicate = p in {
foreach fp16vt = [f16, bf16] in {
def : GCNPat <
(UniformUnaryFrag<fneg> (fp16vt SReg_32:$src)),
@@ -1980,6 +1982,7 @@ def : GCNPat <
(S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
>;
} // End foreach fp16vt = ...
+} // let SubtargetPredicate = p
def : GCNPat <
(UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)),
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index efcaa8807367b..d6ce57990d57f 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -18652,12 +18652,20 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_fabs_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11TRUE16-LABEL: s_fabs_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, s0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11FAKE16-LABEL: s_fabs_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX11FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11FAKE16-NEXT: ; return to shader part epilog
%op = call bfloat @llvm.fabs.bf16(bfloat %a)
%cast = bitcast bfloat %op to i16
%zext = zext i16 %cast to i32
@@ -18747,12 +18755,20 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_fneg_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_xor_b32 s0, s0, 0x8000
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11TRUE16-LABEL: s_fneg_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, s0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11FAKE16-LABEL: s_fneg_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX11FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11FAKE16-NEXT: ; return to shader part epilog
%op = fneg bfloat %a
%cast = bitcast bfloat %op to i16
%zext = zext i16 %cast to i32
@@ -18859,12 +18875,20 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
; GFX10-NEXT: ; return to shader part epilog
;
-; GFX11-LABEL: s_fneg_fabs_bf16:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_bitset1_b32 s0, 15
-; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT: ; return to shader part epilog
+; GFX11TRUE16-LABEL: s_fneg_fabs_bf16:
+; GFX11TRUE16: ; %bb.0:
+; GFX11TRUE16-NEXT: v_or_b16 v0.l, 0x8000, s0
+; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT: ; return to shader part epilog
+;
+; GFX11FAKE16-LABEL: s_fneg_fabs_bf16:
+; GFX11FAKE16: ; %bb.0:
+; GFX11FAKE16-NEXT: s_bitset1_b32 s0, 15
+; GFX11FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
+; GFX11FAKE16-NEXT: ; return to shader part epilog
%fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
%op = fneg bfloat %fabs
%cast = bitcast bfloat %op to i16
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 365588eaec3ac..b2158613d400d 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -52,9 +52,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -118,9 +118,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 620273a360439..265902b83d071 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -136,9 +136,9 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -201,9 +201,9 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -266,9 +266,9 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -331,9 +331,9 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -396,9 +396,9 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 5ea39997938ad..b45366443311f 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -695,12 +695,10 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0, v0.l
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, 0, v0.l
-; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0.l
-; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
-; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0/*Invalid register, operand has 'VS_16' register class*/, s0, vcc_lo
-; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
+; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 9642b36ecb7e8..7604c81c0787a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -204,9 +204,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -271,9 +271,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -327,7 +327,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
+; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 23e4ba9fd4ed7..77fa5bce09781 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -49,9 +49,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
@@ -190,9 +190,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
; GFX11-TRUE16-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
index 59ba9b72e2911..9d166f603473a 100644
--- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -724,12 +724,10 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
@@ -927,12 +925,10 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l|
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1130,12 +1126,10 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
-; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0
+; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
-; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x8000, v1
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT: v_or_b16 v1.l, 0x8000, v0.l
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l|
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
|
Sisyph
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This patch doesn't make sense to me. The Pats are for UniformUnaryFrag, so they would go to the SALU and shouldn't have anything to do with True16. In all the test cases this is turning SALU instructions into VALU instructions, which is not good.
Hi Joe, thanks for the comment. Let me try removing this downstream and check the results. |
remove S_XOR_B32/S_AND_B32/S_OR_B32 pattern from true16 mode