Skip to content

Commit 9c6e87e

Browse files
committed
fix and/or/xor pattern
1 parent 2ff370f commit 9c6e87e

File tree

8 files changed

+77
-58
lines changed

8 files changed

+77
-58
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1964,6 +1964,8 @@ def : GCNPat <
19641964
(S_XOR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80000000)))
19651965
>;
19661966

1967+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
1968+
let SubtargetPredicate = p in {
19671969
foreach fp16vt = [f16, bf16] in {
19681970
def : GCNPat <
19691971
(UniformUnaryFrag<fneg> (fp16vt SReg_32:$src)),
@@ -1980,6 +1982,7 @@ def : GCNPat <
19801982
(S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
19811983
>;
19821984
} // End foreach fp16vt = ...
1985+
} // End foreach p = ..., let SubtargetPredicate = p
19831986

19841987
def : GCNPat <
19851988
(UniformUnaryFrag<fneg> (v2f16 SReg_32:$src)),

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18652,12 +18652,20 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
1865218652
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
1865318653
; GFX10-NEXT: ; return to shader part epilog
1865418654
;
18655-
; GFX11-LABEL: s_fabs_bf16:
18656-
; GFX11: ; %bb.0:
18657-
; GFX11-NEXT: s_and_b32 s0, s0, 0x7fff
18658-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
18659-
; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
18660-
; GFX11-NEXT: ; return to shader part epilog
18655+
; GFX11TRUE16-LABEL: s_fabs_bf16:
18656+
; GFX11TRUE16: ; %bb.0:
18657+
; GFX11TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, s0
18658+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
18659+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
18660+
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
18661+
; GFX11TRUE16-NEXT: ; return to shader part epilog
18662+
;
18663+
; GFX11FAKE16-LABEL: s_fabs_bf16:
18664+
; GFX11FAKE16: ; %bb.0:
18665+
; GFX11FAKE16-NEXT: s_and_b32 s0, s0, 0x7fff
18666+
; GFX11FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
18667+
; GFX11FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
18668+
; GFX11FAKE16-NEXT: ; return to shader part epilog
1866118669
%op = call bfloat @llvm.fabs.bf16(bfloat %a)
1866218670
%cast = bitcast bfloat %op to i16
1866318671
%zext = zext i16 %cast to i32
@@ -18747,12 +18755,20 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
1874718755
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
1874818756
; GFX10-NEXT: ; return to shader part epilog
1874918757
;
18750-
; GFX11-LABEL: s_fneg_bf16:
18751-
; GFX11: ; %bb.0:
18752-
; GFX11-NEXT: s_xor_b32 s0, s0, 0x8000
18753-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
18754-
; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
18755-
; GFX11-NEXT: ; return to shader part epilog
18758+
; GFX11TRUE16-LABEL: s_fneg_bf16:
18759+
; GFX11TRUE16: ; %bb.0:
18760+
; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, s0
18761+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
18762+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
18763+
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
18764+
; GFX11TRUE16-NEXT: ; return to shader part epilog
18765+
;
18766+
; GFX11FAKE16-LABEL: s_fneg_bf16:
18767+
; GFX11FAKE16: ; %bb.0:
18768+
; GFX11FAKE16-NEXT: s_xor_b32 s0, s0, 0x8000
18769+
; GFX11FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
18770+
; GFX11FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
18771+
; GFX11FAKE16-NEXT: ; return to shader part epilog
1875618772
%op = fneg bfloat %a
1875718773
%cast = bitcast bfloat %op to i16
1875818774
%zext = zext i16 %cast to i32
@@ -18859,12 +18875,20 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
1885918875
; GFX10-NEXT: s_and_b32 s0, 0xffff, s0
1886018876
; GFX10-NEXT: ; return to shader part epilog
1886118877
;
18862-
; GFX11-LABEL: s_fneg_fabs_bf16:
18863-
; GFX11: ; %bb.0:
18864-
; GFX11-NEXT: s_bitset1_b32 s0, 15
18865-
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
18866-
; GFX11-NEXT: s_and_b32 s0, 0xffff, s0
18867-
; GFX11-NEXT: ; return to shader part epilog
18878+
; GFX11TRUE16-LABEL: s_fneg_fabs_bf16:
18879+
; GFX11TRUE16: ; %bb.0:
18880+
; GFX11TRUE16-NEXT: v_or_b16 v0.l, 0x8000, s0
18881+
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
18882+
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
18883+
; GFX11TRUE16-NEXT: v_readfirstlane_b32 s0, v0
18884+
; GFX11TRUE16-NEXT: ; return to shader part epilog
18885+
;
18886+
; GFX11FAKE16-LABEL: s_fneg_fabs_bf16:
18887+
; GFX11FAKE16: ; %bb.0:
18888+
; GFX11FAKE16-NEXT: s_bitset1_b32 s0, 15
18889+
; GFX11FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
18890+
; GFX11FAKE16-NEXT: s_and_b32 s0, 0xffff, s0
18891+
; GFX11FAKE16-NEXT: ; return to shader part epilog
1886818892
%fabs = call bfloat @llvm.fabs.bf16(bfloat %a)
1886918893
%op = fneg bfloat %fabs
1887018894
%cast = bitcast bfloat %op to i16

llvm/test/CodeGen/AMDGPU/fabs.f16.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -52,9 +52,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
5252
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5353
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
5454
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
55-
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
56-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5755
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
56+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
57+
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
5858
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
5959
; GFX11-TRUE16-NEXT: s_endpgm
6060
;
@@ -118,9 +118,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
118118
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
119119
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
120120
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
121-
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
122-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
123121
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
122+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
123+
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
124124
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
125125
; GFX11-TRUE16-NEXT: s_endpgm
126126
;

llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -136,9 +136,9 @@ define amdgpu_kernel void @s_test_copysign_f16_0(ptr addrspace(1) %out, half %ma
136136
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
137137
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
138138
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
139-
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
140-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
141139
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
140+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
141+
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
142142
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
143143
; GFX11-TRUE16-NEXT: s_endpgm
144144
;
@@ -201,9 +201,9 @@ define amdgpu_kernel void @s_test_copysign_f16_1(ptr addrspace(1) %out, half %ma
201201
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
202202
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
203203
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
204-
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
205-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
206204
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
205+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
206+
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
207207
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
208208
; GFX11-TRUE16-NEXT: s_endpgm
209209
;
@@ -266,9 +266,9 @@ define amdgpu_kernel void @s_test_copysign_f16_10.0(ptr addrspace(1) %out, half
266266
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
267267
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
268268
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
269-
; GFX11-TRUE16-NEXT: s_and_b32 s2, s2, 0x7fff
270-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
271269
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
270+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
271+
; GFX11-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l
272272
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
273273
; GFX11-TRUE16-NEXT: s_endpgm
274274
;
@@ -331,9 +331,9 @@ define amdgpu_kernel void @s_test_copysign_f16_neg1(ptr addrspace(1) %out, half
331331
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
332332
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
333333
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
334-
; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
335-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
336334
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
335+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
336+
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
337337
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
338338
; GFX11-TRUE16-NEXT: s_endpgm
339339
;
@@ -396,9 +396,9 @@ define amdgpu_kernel void @s_test_copysign_f16_neg10(ptr addrspace(1) %out, half
396396
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
397397
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
398398
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
399-
; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
400-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
401399
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
400+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
401+
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
402402
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
403403
; GFX11-TRUE16-NEXT: s_endpgm
404404
;

llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -695,12 +695,10 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x
695695
; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0, v0.l
696696
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
697697
; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, 0, v0.l
698-
; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l
699698
; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0.l
700-
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
701-
; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1
702-
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0/*Invalid register, operand has 'VS_16' register class*/, s0, vcc_lo
703-
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
699+
; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
700+
; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
701+
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, vcc_lo
704702
; GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l
705703
; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo
706704
; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog

llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -204,9 +204,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
204204
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
205205
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
206206
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
207-
; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
208-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
209207
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
208+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
209+
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
210210
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
211211
; GFX11-TRUE16-NEXT: s_endpgm
212212
;
@@ -271,9 +271,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
271271
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
272272
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
273273
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
274-
; GFX11-TRUE16-NEXT: s_bitset1_b32 s2, 15
275-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
276274
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
275+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
276+
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
277277
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
278278
; GFX11-TRUE16-NEXT: s_endpgm
279279
;
@@ -327,7 +327,7 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
327327
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
328328
; GFX11-TRUE16-NEXT: global_load_d16_b16 v0, v1, s[2:3]
329329
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
330-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, 0x8000, v0
330+
; GFX11-TRUE16-NEXT: v_or_b16 v0.l, 0x8000, v0.l
331331
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
332332
; GFX11-TRUE16-NEXT: s_endpgm
333333
;

llvm/test/CodeGen/AMDGPU/fneg.f16.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -49,9 +49,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
4949
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
5050
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
5151
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
52-
; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
53-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
5452
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
53+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
54+
; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
5555
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
5656
; GFX11-TRUE16-NEXT: s_endpgm
5757
;
@@ -190,9 +190,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
190190
; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
191191
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0
192192
; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
193-
; GFX11-TRUE16-NEXT: s_xor_b32 s2, s2, 0x8000
194-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
195193
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s2
194+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
195+
; GFX11-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
196196
; GFX11-TRUE16-NEXT: global_store_b16 v1, v0, s[0:1]
197197
; GFX11-TRUE16-NEXT: s_endpgm
198198
;

llvm/test/CodeGen/AMDGPU/fpext.f16.ll

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -724,12 +724,10 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
724724
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
725725
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
726726
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
727-
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0
727+
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
728728
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
729729
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
730-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
731-
; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1
732-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
730+
; GFX11-TRUE16-NEXT: v_xor_b16 v1.l, 0x8000, v0.l
733731
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l
734732
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
735733
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
@@ -927,12 +925,10 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
927925
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
928926
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
929927
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
930-
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0
928+
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
931929
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
932930
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
933-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
934-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1
935-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
931+
; GFX11-TRUE16-NEXT: v_and_b16 v1.l, 0x7fff, v0.l
936932
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l|
937933
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
938934
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1130,12 +1126,10 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
11301126
; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2
11311127
; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3
11321128
; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0
1133-
; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0
1129+
; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0
11341130
; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1
11351131
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0)
1136-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
1137-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x8000, v1
1138-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
1132+
; GFX11-TRUE16-NEXT: v_or_b16 v1.l, 0x8000, v0.l
11391133
; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l|
11401134
; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc
11411135
; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0

0 commit comments

Comments
 (0)