@@ -1303,13 +1303,18 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
1303
1303
; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_i32:
1304
1304
; GFX11-TRUE16: ; %bb.0:
1305
1305
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
1306
- ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
1307
- ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
1308
- ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
1306
+ ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
1307
+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
1308
+ ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
1309
1309
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
1310
+ ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
1311
+ ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
1312
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
1313
+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
1314
+ ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
1310
1315
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
1311
- ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_2
1312
- ; GFX11-TRUE16-NEXT: ; %bb.1 : ; %cmp.true
1316
+ ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB14_4
1317
+ ; GFX11-TRUE16-NEXT: ; %bb.3 : ; %cmp.true
1313
1318
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
1314
1319
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
1315
1320
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1322,15 +1327,16 @@ define i32 @bitcast_v2bf16_to_i32(<2 x bfloat> %a, i32 %b) {
1322
1327
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
1323
1328
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
1324
1329
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
1325
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2 )
1326
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1 , v2, v4, vcc_lo
1330
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3 )
1331
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2 , v2, v4, vcc_lo
1327
1332
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
1328
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
1329
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
1330
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
1331
- ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
1332
- ; GFX11-TRUE16-NEXT: .LBB14_2: ; %end
1333
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
1334
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
1335
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
1336
+ ; GFX11-TRUE16-NEXT: .LBB14_4: ; %end
1333
1337
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
1338
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
1339
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
1334
1340
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
1335
1341
;
1336
1342
; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_i32:
@@ -3543,13 +3549,18 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
3543
3549
; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_f32:
3544
3550
; GFX11-TRUE16: ; %bb.0:
3545
3551
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
3546
- ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
3547
- ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
3548
- ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
3552
+ ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
3553
+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
3554
+ ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
3549
3555
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
3556
+ ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
3557
+ ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
3558
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
3559
+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
3560
+ ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
3550
3561
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
3551
- ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_2
3552
- ; GFX11-TRUE16-NEXT: ; %bb.1 : ; %cmp.true
3562
+ ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB34_4
3563
+ ; GFX11-TRUE16-NEXT: ; %bb.3 : ; %cmp.true
3553
3564
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
3554
3565
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
3555
3566
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3562,15 +3573,16 @@ define float @bitcast_v2bf16_to_f32(<2 x bfloat> %a, i32 %b) {
3562
3573
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
3563
3574
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
3564
3575
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
3565
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2 )
3566
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1 , v2, v4, vcc_lo
3576
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3 )
3577
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2 , v2, v4, vcc_lo
3567
3578
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
3568
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
3569
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
3570
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
3571
- ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
3572
- ; GFX11-TRUE16-NEXT: .LBB34_2: ; %end
3579
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
3580
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
3581
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
3582
+ ; GFX11-TRUE16-NEXT: .LBB34_4: ; %end
3573
3583
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
3584
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
3585
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
3574
3586
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
3575
3587
;
3576
3588
; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_f32:
@@ -7051,13 +7063,18 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
7051
7063
; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v2f16:
7052
7064
; GFX11-TRUE16: ; %bb.0:
7053
7065
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
7054
- ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
7055
- ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
7056
- ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
7066
+ ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
7067
+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
7068
+ ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
7057
7069
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
7070
+ ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
7071
+ ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
7072
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
7073
+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
7074
+ ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
7058
7075
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
7059
- ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_2
7060
- ; GFX11-TRUE16-NEXT: ; %bb.1 : ; %cmp.true
7076
+ ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB62_4
7077
+ ; GFX11-TRUE16-NEXT: ; %bb.3 : ; %cmp.true
7061
7078
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
7062
7079
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
7063
7080
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7070,15 +7087,16 @@ define <2 x half> @bitcast_v2bf16_to_v2f16(<2 x bfloat> %a, i32 %b) {
7070
7087
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
7071
7088
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
7072
7089
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
7073
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2 )
7074
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1 , v2, v4, vcc_lo
7090
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3 )
7091
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2 , v2, v4, vcc_lo
7075
7092
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
7076
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
7077
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
7078
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
7079
- ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
7080
- ; GFX11-TRUE16-NEXT: .LBB62_2: ; %end
7093
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
7094
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
7095
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
7096
+ ; GFX11-TRUE16-NEXT: .LBB62_4: ; %end
7081
7097
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
7098
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
7099
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
7082
7100
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
7083
7101
;
7084
7102
; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v2f16:
@@ -8488,13 +8506,18 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
8488
8506
; GFX11-TRUE16-LABEL: bitcast_v2bf16_to_v1i32:
8489
8507
; GFX11-TRUE16: ; %bb.0:
8490
8508
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
8491
- ; GFX11-TRUE16-NEXT: s_mov_b32 s0, exec_lo
8492
- ; GFX11-TRUE16-NEXT: v_cmpx_ne_u32_e32 0, v1
8493
- ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
8509
+ ; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
8510
+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr1
8511
+ ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, vcc_lo
8494
8512
; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
8513
+ ; GFX11-TRUE16-NEXT: s_xor_b32 s0, exec_lo, s0
8514
+ ; GFX11-TRUE16-NEXT: ; %bb.1: ; %cmp.false
8515
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, v0
8516
+ ; GFX11-TRUE16-NEXT: ; implicit-def: $vgpr0
8517
+ ; GFX11-TRUE16-NEXT: ; %bb.2: ; %Flow
8495
8518
; GFX11-TRUE16-NEXT: s_and_not1_saveexec_b32 s0, s0
8496
- ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_2
8497
- ; GFX11-TRUE16-NEXT: ; %bb.1 : ; %cmp.true
8519
+ ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB72_4
8520
+ ; GFX11-TRUE16-NEXT: ; %bb.3 : ; %cmp.true
8498
8521
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
8499
8522
; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
8500
8523
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -8507,15 +8530,16 @@ define <1 x i32> @bitcast_v2bf16_to_v1i32(<2 x bfloat> %a, i32 %b) {
8507
8530
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v0
8508
8531
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v1, 0x7fff
8509
8532
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v0, 0x7fff
8510
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2 )
8511
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1 , v2, v4, vcc_lo
8533
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3 )
8534
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2 , v2, v4, vcc_lo
8512
8535
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v0, v0
8513
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
8514
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
8515
- ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v3, v5, vcc_lo
8516
- ; GFX11-TRUE16-NEXT: v_bfi_b32 v0, 0xffff, v1, v0
8517
- ; GFX11-TRUE16-NEXT: .LBB72_2: ; %end
8536
+ ; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc_lo
8537
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
8538
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.h
8539
+ ; GFX11-TRUE16-NEXT: .LBB72_4: ; %end
8518
8540
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
8541
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
8542
+ ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v1
8519
8543
; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
8520
8544
;
8521
8545
; GFX11-FAKE16-LABEL: bitcast_v2bf16_to_v1i32:
@@ -9062,15 +9086,14 @@ define <4 x i8> @bitcast_v2bf16_to_v4i8(<2 x bfloat> %a, i32 %b) {
9062
9086
; GFX11-TRUE16-NEXT: v_or_b32_e32 v5, 0x400000, v1
9063
9087
; GFX11-TRUE16-NEXT: v_add3_u32 v3, v3, v1, 0x7fff
9064
9088
; GFX11-TRUE16-NEXT: v_add3_u32 v2, v2, v0, 0x7fff
9065
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2 )
9089
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4 )
9066
9090
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
9067
9091
; GFX11-TRUE16-NEXT: v_cmp_u_f32_e32 vcc_lo, v1, v1
9068
- ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h
9069
9092
; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v2, v3, v5, vcc_lo
9070
- ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1 ) | instskip(NEXT) | instid1(VALU_DEP_1)
9071
- ; GFX11-TRUE16-NEXT: v_bfi_b32 v1, 0xffff, v1, v2
9072
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v1
9073
- ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v1
9093
+ ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3 ) | instskip(NEXT) | instid1(VALU_DEP_1)
9094
+ ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h
9095
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v2
9096
+ ; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
9074
9097
; GFX11-TRUE16-NEXT: .LBB76_4: ; %end
9075
9098
; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s0
9076
9099
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h
0 commit comments