Skip to content

Commit dfdfc4e

Browse files
authored
[AMDGPU][True16][Codegen] remove another build_vector pattern from true16 (#149861)
Remove another build_vector pattern that takes an i16 placed in a VGPR_32 from true16 mode. This stops ISel from generating the illegal copy "vgpr_32 = COPY vgpr_16". In true16 mode, ISel will use the vgpr16 build_vector pattern instead.
1 parent 84b5620 commit dfdfc4e

39 files changed

+14501
-12845
lines changed

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2201,7 +2201,6 @@ def : GCNPat <
22012201
}
22022202

22032203
foreach fp16vt = [f16, bf16] in {
2204-
22052204
def : GCNPat <
22062205
(fcopysign fp16vt:$src0, fp16vt:$src1),
22072206
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
@@ -3694,13 +3693,24 @@ def : GCNPat <
36943693
>;
36953694

36963695
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
3697-
let True16Predicate = p in
3696+
let True16Predicate = p in {
36983697
// Take the lower 16 bits from each VGPR_32 and concat them
36993698
def : GCNPat <
37003699
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
37013700
(V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
37023701
>;
37033702

3703+
// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
3704+
// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
3705+
def : GCNPat <
3706+
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
3707+
(Ty !if(!eq(Ty, i16),
3708+
(Ty (trunc (srl VGPR_32:$b, (i32 16)))),
3709+
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
3710+
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b)
3711+
>;
3712+
}
3713+
37043714
let True16Predicate = UseRealTrue16Insts in {
37053715
def : GCNPat <
37063716
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
@@ -3726,18 +3736,6 @@ def : GCNPat <
37263736
(V_AND_B32_e64 (S_MOV_B32 (i32 0xffff0000)), VGPR_32:$b)
37273737
>;
37283738

3729-
3730-
// Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
3731-
// Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
3732-
def : GCNPat <
3733-
(vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a),
3734-
(Ty !if(!eq(Ty, i16),
3735-
(Ty (trunc (srl VGPR_32:$b, (i32 16)))),
3736-
(Ty (bitconvert (i16 (trunc (srl VGPR_32:$b, (i32 16)))))))))),
3737-
(V_BFI_B32_e64 (S_MOV_B32 (i32 0x0000ffff)), VGPR_32:$a, VGPR_32:$b)
3738-
>;
3739-
3740-
37413739
// Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
37423740
// Special case, can use V_ALIGNBIT (always uses encoded literal)
37433741
let True16Predicate = NotHasTrue16BitInsts in {

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.1024bit.ll

Lines changed: 3930 additions & 3639 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.128bit.ll

Lines changed: 388 additions & 357 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.256bit.ll

Lines changed: 822 additions & 819 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.32bit.ll

Lines changed: 77 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1303,13 +1303,18 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
13031303
; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_i32:
13041304
; GFX11-TRUE16: ; %bb.0:
13051305
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1306-
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
1307-
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
1308-
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
1306+
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
1307+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
1308+
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
13091309
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1310+
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
1311+
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
1312+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
1313+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
1314+
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
13101315
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
1311-
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
1312-
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
1316+
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_4
1317+
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
13131318
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
13141319
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
13151320
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1322,15 +1327,16 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
13221327
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
13231328
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
13241329
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
1325-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1326-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
1330+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
1331+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
13271332
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
1328-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
1329-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
1330-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
1331-
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
1332-
; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
1333+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
1334+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
1335+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
1336+
; GFX11-TRUE16-NEXT: .LBB14_4: ; %end
13331337
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
1338+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1339+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
13341340
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
13351341
;
13361342
; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_i32:
@@ -3543,13 +3549,18 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
35433549
; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_f32:
35443550
; GFX11-TRUE16: ; %bb.0:
35453551
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3546-
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
3547-
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
3548-
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
3552+
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
3553+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
3554+
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
35493555
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3556+
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
3557+
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
3558+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
3559+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
3560+
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
35503561
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
3551-
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2
3552-
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
3562+
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_4
3563+
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
35533564
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
35543565
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
35553566
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3562,15 +3573,16 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
35623573
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
35633574
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
35643575
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
3565-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
3566-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
3576+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
3577+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
35673578
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
3568-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
3569-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
3570-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
3571-
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
3572-
; GFX11-TRUE16-NEXT: .LBB34_2: ; %end
3579+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
3580+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3581+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
3582+
; GFX11-TRUE16-NEXT: .LBB34_4: ; %end
35733583
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
3584+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
3585+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
35743586
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
35753587
;
35763588
; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_f32:
@@ -7051,13 +7063,18 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
70517063
; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v2f16:
70527064
; GFX11-TRUE16: ; %bb.0:
70537065
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7054-
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
7055-
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
7056-
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
7066+
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
7067+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
7068+
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
70577069
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
7070+
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
7071+
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
7072+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
7073+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
7074+
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
70587075
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
7059-
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
7060-
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
7076+
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_4
7077+
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
70617078
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
70627079
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
70637080
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7070,15 +7087,16 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
70707087
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
70717088
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
70727089
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
7073-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
7074-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
7090+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
7091+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
70757092
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
7076-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
7077-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
7078-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
7079-
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
7080-
; GFX11-TRUE16-NEXT: .LBB62_2: ; %end
7093+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
7094+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
7095+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
7096+
; GFX11-TRUE16-NEXT: .LBB62_4: ; %end
70817097
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
7098+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
7099+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
70827100
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
70837101
;
70847102
; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v2f16:
@@ -8488,13 +8506,18 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
84888506
; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v1i32:
84898507
; GFX11-TRUE16: ; %bb.0:
84908508
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8491-
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
8492-
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
8493-
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
8509+
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
8510+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
8511+
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
84948512
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
8513+
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
8514+
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
8515+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
8516+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
8517+
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
84958518
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
8496-
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2
8497-
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
8519+
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_4
8520+
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
84988521
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
84998522
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
85008523
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -8507,15 +8530,16 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
85078530
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
85088531
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
85098532
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
8510-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
8511-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v2, v4, vcc_lo
8533+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
8534+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc_lo
85128535
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
8513-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
8514-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
8515-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
8516-
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
8517-
; GFX11-TRUE16-NEXT: .LBB72_2: ; %end
8536+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
8537+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
8538+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
8539+
; GFX11-TRUE16-NEXT: .LBB72_4: ; %end
85188540
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
8541+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
8542+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
85198543
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
85208544
;
85218545
; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v1i32:
@@ -9062,15 +9086,14 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
90629086
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
90639087
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
90649088
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
9065-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
9089+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4)
90669090
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
90679091
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9068-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
90699092
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
9070-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
9071-
; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
9072-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1
9073-
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
9093+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
9094+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
9095+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v2
9096+
; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
90749097
; GFX11-TRUE16-NEXT: .LBB76_4: ; %end
90759098
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
90769099
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h

llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.48bit.ll

Lines changed: 33 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -138,46 +138,51 @@ define <3 x half> @bitcast_v3bf16_to_v3f16(<3 x bfloat> %a, i32 %b) {
138138
; GFX11-TRUE16-LABEL: bitcast_v3bf16_to_v3f16:
139139
; GFX11-TRUE16: ; %bb.0:
140140
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
141-
; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
142-
; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v2
143-
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
141+
; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
142+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr2_vgpr3
143+
; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
144144
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
145+
; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
146+
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
147+
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
148+
; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
149+
; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
145150
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
146-
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB0_2
147-
; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.true
151+
; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB0_4
152+
; GFX11-TRUE16-NEXT: ; %bb.3: ; %cmp.true
148153
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, 0
149154
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v1.l
150-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
155+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
151156
; GFX11-TRUE16-NEXT: v_add_f32_e32 v1, 0x40c00000, v2
152157
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.h, v0.l
153-
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
154158
; GFX11-TRUE16-NEXT: v_add_f32_e32 v2, 0x40c00000, v2
155-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
156-
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
157-
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v1, 16, 1
159+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
158160
; GFX11-TRUE16-NEXT: v_bfe_u32 v3, v2, 16, 1
159-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
160-
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v0, 16, 1
161-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v2
161+
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v2
162162
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v2, v2
163-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v7, 0x400000, v0
164163
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v2, 0x7fff
165-
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v0, 0x7fff
166-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
167-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
168-
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
169-
; GFX11-TRUE16-NEXT: v_or_b32_e32 v3, 0x400000, v1
170-
; GFX11-TRUE16-NEXT: v_add3_u32 v5, v6, v1, 0x7fff
171-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
172-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc_lo
164+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
165+
; GFX11-TRUE16-NEXT: v_dual_cndmask_b32 v7, v3, v7 :: v_dual_and_b32 v0, 0xffff0000, v0
166+
; GFX11-TRUE16-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
167+
; GFX11-TRUE16-NEXT: v_bfe_u32 v4, v1, 16, 1
168+
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
173169
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
174-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0x7fc0
175-
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
176-
; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v2, v0
177-
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc_lo
178-
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v3.h
179-
; GFX11-TRUE16-NEXT: .LBB0_2: ; %end
170+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.h, 0x7fc0
171+
; GFX11-TRUE16-NEXT: v_bfe_u32 v6, v0, 16, 1
172+
; GFX11-TRUE16-NEXT: v_add3_u32 v4, v4, v1, 0x7fff
173+
; GFX11-TRUE16-NEXT: v_or_b32_e32 v8, 0x400000, v0
174+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
175+
; GFX11-TRUE16-NEXT: v_add3_u32 v6, v6, v0, 0x7fff
176+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo
177+
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
178+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
179+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, v1.h
180+
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v6, v8, vcc_lo
181+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v7.h
182+
; GFX11-TRUE16-NEXT: .LBB0_4: ; %end
180183
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
184+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
185+
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
181186
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
182187
;
183188
; GFX11-FAKE16-LABEL: bitcast_v3bf16_to_v3f16:

0 commit comments

Comments
 (0)