Skip to content

Commit 2e036dc

Browse files
committed
[AMDGPU] Fix a few LIT tests
1 parent: 2051941 · commit: 2e036dc

File tree

3 files changed: +39 -77 lines changed

3 files changed: +39 -77 lines changed

llvm/test/CodeGen/AMDGPU/bf16-math.ll

Lines changed: 21 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -368,7 +368,10 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
368368
define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
369369
; GCN-LABEL: test_clamp_bf16_folding:
370370
; GCN: ; %bb.0:
371-
; GCN-NEXT: v_exp_bf16_e64 v0, v0 clamp
371+
; GCN-NEXT: v_exp_bf16_e32 v0, v0
372+
; GCN-NEXT: v_nop
373+
; GCN-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
374+
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
372375
; GCN-NEXT: ; return to shader part epilog
373376
%exp = call bfloat @llvm.exp2.bf16(bfloat %src)
374377
%max = call bfloat @llvm.maxnum.bf16(bfloat %exp, bfloat 0.0)
@@ -379,7 +382,9 @@ define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
379382
define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
380383
; GCN-LABEL: test_clamp_v2bf16_folding:
381384
; GCN: ; %bb.0:
382-
; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1 clamp
385+
; GCN-NEXT: v_pk_mul_bf16 v0, v0, v1
386+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
387+
; GCN-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
383388
; GCN-NEXT: ; return to shader part epilog
384389
%mul = fmul <2 x bfloat> %src0, %src1
385390
%max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %mul, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
@@ -391,7 +396,9 @@ define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloa
391396
define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
392397
; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
393398
; GCN: ; %bb.0:
394-
; GCN-NEXT: v_pk_fma_bf16 v2, v2, v3, v4
399+
; GCN-NEXT: v_pk_mul_bf16 v2, v2, v3
400+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
401+
; GCN-NEXT: v_pk_add_bf16 v2, v2, v4
395402
; GCN-NEXT: global_store_b32 v[0:1], v2, off
396403
; GCN-NEXT: s_endpgm
397404
%mul = fmul contract <2 x bfloat> %a, %b
@@ -403,7 +410,9 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfl
403410
define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
404411
; GCN-LABEL: v_test_mul_add_v2bf16_vss:
405412
; GCN: ; %bb.0:
406-
; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, s1
413+
; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
414+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
415+
; GCN-NEXT: v_pk_add_bf16 v2, v2, s1
407416
; GCN-NEXT: global_store_b32 v[0:1], v2, off
408417
; GCN-NEXT: s_endpgm
409418
%mul = fmul contract <2 x bfloat> %a, %b
@@ -415,9 +424,9 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfl
415424
define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
416425
; GCN-LABEL: v_test_mul_add_v2bf16_sss:
417426
; GCN: ; %bb.0:
418-
; GCN-NEXT: v_mov_b32_e32 v2, s2
427+
; GCN-NEXT: v_pk_mul_bf16 v2, s0, s1
419428
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
420-
; GCN-NEXT: v_pk_fma_bf16 v2, s0, s1, v2
429+
; GCN-NEXT: v_pk_add_bf16 v2, v2, s2
421430
; GCN-NEXT: global_store_b32 v[0:1], v2, off
422431
; GCN-NEXT: s_endpgm
423432
%mul = fmul contract <2 x bfloat> %a, %b
@@ -429,7 +438,9 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfl
429438
define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
430439
; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
431440
; GCN: ; %bb.0:
432-
; GCN-NEXT: v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0]
441+
; GCN-NEXT: v_pk_mul_bf16 v2, v2, s0
442+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
443+
; GCN-NEXT: v_pk_add_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
433444
; GCN-NEXT: global_store_b32 v[0:1], v2, off
434445
; GCN-NEXT: s_endpgm
435446
%mul = fmul contract <2 x bfloat> %a, %b
@@ -441,9 +452,9 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfl
441452
define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
442453
; GCN-LABEL: v_test_mul_add_v2bf16_vll:
443454
; GCN: ; %bb.0:
444-
; GCN-NEXT: s_mov_b32 s0, 0x43484000
445-
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
446-
; GCN-NEXT: v_pk_fma_bf16 v2, 0x42c83f80, v2, s0
455+
; GCN-NEXT: v_pk_mul_bf16 v2, 0x42c83f80, v2
456+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
457+
; GCN-NEXT: v_pk_add_bf16 v2, 0x43484000, v2
447458
; GCN-NEXT: global_store_b32 v[0:1], v2, off
448459
; GCN-NEXT: s_endpgm
449460
%mul = fmul contract <2 x bfloat> %a, <bfloat 1.0, bfloat 100.0>

llvm/test/CodeGen/AMDGPU/mad-mix-hi-bf16.ll

Lines changed: 4 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -121,18 +121,7 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt
121121
; GFX1250: ; %bb.0:
122122
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
123123
; GFX1250-NEXT: s_wait_kmcnt 0x0
124-
; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1]
125-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
126-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
127-
; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
128-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
129-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
130-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
131-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
132-
; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
133-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
134-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
135-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
124+
; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
136125
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
137126
%src0.ext = fpext bfloat %src0 to float
138127
%src1.ext = fpext bfloat %src1 to float
@@ -150,20 +139,10 @@ define <2 x bfloat> @v_mad_mixhi_bf16_bf16lo_bf16lo_bf16lo_undeflo_clamp_postcvt
150139
; GFX1250: ; %bb.0:
151140
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
152141
; GFX1250-NEXT: s_wait_kmcnt 0x0
153-
; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v0, v1, v2 op_sel_hi:[1,1,1]
154-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
155-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1
156-
; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
157-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
158-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
159-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
160-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
161-
; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
162-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
163-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
164-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
142+
; GFX1250-NEXT: v_fma_mixlo_bf16 v3, v0, v1, v2 op_sel_hi:[1,1,1]
143+
; GFX1250-NEXT: v_fma_mixhi_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
165144
; GFX1250-NEXT: s_wait_storecnt 0x0
166-
; GFX1250-NEXT: global_store_b16 v[0:1], v1, off scope:SCOPE_SYS
145+
; GFX1250-NEXT: global_store_b16 v[0:1], v3, off scope:SCOPE_SYS
167146
; GFX1250-NEXT: s_wait_storecnt 0x0
168147
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
169148
%src0.ext = fpext bfloat %src0 to float

llvm/test/CodeGen/AMDGPU/mad-mix-lo-bf16.ll

Lines changed: 14 additions & 42 deletions
Original file line number | Diff line number | Diff line change
@@ -75,15 +75,8 @@ define bfloat @v_mad_mixlo_bf16_bf16lo_bf16lo_f32_clamp_post_cvt(bfloat %src0, b
7575
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
7676
; GFX1250-NEXT: s_wait_kmcnt 0x0
7777
; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v0, v1, v2 op_sel_hi:[1,1,0]
78-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
79-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
80-
; GFX1250-NEXT: v_max_num_f32_e32 v0, 0, v0
81-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
82-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
83-
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
84-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
85-
; GFX1250-NEXT: v_min_num_f32_e32 v0, 1.0, v0
86-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
78+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
79+
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
8780
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
8881
%src0.ext = fpext bfloat %src0 to float
8982
%src1.ext = fpext bfloat %src1 to float
@@ -199,9 +192,8 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt(<2 x bfloat> %src0, <2 x bflo
199192
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
200193
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
201194
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
202-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
203-
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0
204-
; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0]
195+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
196+
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
205197
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
206198
%src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
207199
%src1.ext = fpext <2 x bfloat> %src1 to <2 x float>
@@ -219,16 +211,13 @@ define <3 x bfloat> @v_mad_mix_v3f32_clamp_postcvt(<3 x bfloat> %src0, <3 x bflo
219211
; GFX1250: ; %bb.0:
220212
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
221213
; GFX1250-NEXT: s_wait_kmcnt 0x0
222-
; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
223-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
224-
; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
225-
; GFX1250-NEXT: v_fma_mixlo_bf16 v0, v1, v3, v5 op_sel_hi:[1,1,1]
226-
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v6, 0
214+
; GFX1250-NEXT: v_fma_mixlo_bf16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
215+
; GFX1250-NEXT: v_fma_mixlo_bf16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
227216
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
228-
; GFX1250-NEXT: v_pk_max_num_bf16 v2, v0, 0
229-
; GFX1250-NEXT: v_pk_min_num_bf16 v0, v1, 1.0 op_sel_hi:[1,0]
217+
; GFX1250-NEXT: v_fma_mixhi_bf16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
218+
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
230219
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
231-
; GFX1250-NEXT: v_pk_min_num_bf16 v1, v2, 1.0
220+
; GFX1250-NEXT: v_mov_b32_e32 v0, v6
232221
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
233222
%src0.ext = fpext <3 x bfloat> %src0 to <3 x float>
234223
%src1.ext = fpext <3 x bfloat> %src1 to <3 x float>
@@ -261,11 +250,8 @@ define <4 x bfloat> @v_mad_mix_v4f32_clamp_postcvt(<4 x bfloat> %src0, <4 x bflo
261250
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
262251
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v2, v3
263252
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
264-
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, 0
265-
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, 0
266-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
267-
; GFX1250-NEXT: v_pk_min_num_bf16 v0, v0, 1.0 op_sel_hi:[1,0]
268-
; GFX1250-NEXT: v_pk_min_num_bf16 v1, v1, 1.0 op_sel_hi:[1,0]
253+
; GFX1250-NEXT: v_pk_max_num_bf16 v0, v0, v0 clamp
254+
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
269255
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
270256
%src0.ext = fpext <4 x bfloat> %src0 to <4 x float>
271257
%src1.ext = fpext <4 x bfloat> %src1 to <4 x float>
@@ -291,15 +277,7 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x bfloat> %src0, <2 x b
291277
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
292278
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
293279
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
294-
; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0
295-
; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1
296-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
297-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
298-
; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1
299-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
300-
; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1
301-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
302-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
280+
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v0, v0 clamp
303281
; GFX1250-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
304282
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
305283
%src0.ext = fpext <2 x bfloat> %src0 to <2 x float>
@@ -328,14 +306,8 @@ define <2 x bfloat> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x bfloat> %src0, <2 x b
328306
; GFX1250-NEXT: v_pk_fma_f32 v[0:1], v[4:5], v[6:7], v[0:1]
329307
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, v1
330308
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
331-
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
332-
; GFX1250-NEXT: v_max_num_f32_e32 v1, 0, v1
333-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
334-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
335-
; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v1
336-
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
337-
; GFX1250-NEXT: v_min_num_f32_e32 v1, 1.0, v1
338-
; GFX1250-NEXT: v_cvt_pk_bf16_f32 v1, v1, s0
309+
; GFX1250-NEXT: v_lshrrev_b32_e32 v1, 16, v0
310+
; GFX1250-NEXT: v_pk_max_num_bf16 v1, v1, v1 clamp
339311
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
340312
; GFX1250-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
341313
; GFX1250-NEXT: s_set_pc_i64 s[30:31]

0 commit comments

Comments (0)