@@ -2520,6 +2520,7 @@ define void @test_load_store_f32_to_bf16(ptr addrspace(1) %in, ptr addrspace(1)
25202520; GFX1250-NEXT: s_wait_kmcnt 0x0
25212521; GFX1250-NEXT: global_load_b32 v0, v[0:1], off
25222522; GFX1250-NEXT: s_wait_loadcnt 0x0
2523+ ; GFX1250-NEXT: s_wait_xcnt 0x0
25232524; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0
25242525; GFX1250-NEXT: global_store_b16 v[2:3], v0, off
25252526; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2783,6 +2784,7 @@ define void @test_load_store_bf16_to_f32(ptr addrspace(1) %in, ptr addrspace(1)
27832784; GFX1250-NEXT: s_wait_kmcnt 0x0
27842785; GFX1250-NEXT: global_load_u16 v0, v[0:1], off
27852786; GFX1250-NEXT: s_wait_loadcnt 0x0
2787+ ; GFX1250-NEXT: s_wait_xcnt 0x0
27862788; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
27872789; GFX1250-NEXT: global_store_b32 v[2:3], v0, off
27882790; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -2872,6 +2874,7 @@ define void @test_load_store_bf16_to_f64(ptr addrspace(1) %in, ptr addrspace(1)
28722874; GFX1250-NEXT: s_wait_kmcnt 0x0
28732875; GFX1250-NEXT: global_load_u16 v0, v[0:1], off
28742876; GFX1250-NEXT: s_wait_loadcnt 0x0
2877+ ; GFX1250-NEXT: s_wait_xcnt 0x0
28752878; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
28762879; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
28772880; GFX1250-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
@@ -6850,6 +6853,7 @@ define <2 x float> @global_extload_v2bf16_to_v2f32(ptr addrspace(1) %ptr) {
68506853; GFX1250-NEXT: s_wait_kmcnt 0x0
68516854; GFX1250-NEXT: global_load_b32 v1, v[0:1], off
68526855; GFX1250-NEXT: s_wait_loadcnt 0x0
6856+ ; GFX1250-NEXT: s_wait_xcnt 0x0
68536857; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v1
68546858; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
68556859; GFX1250-NEXT: s_set_pc_i64 s[30:31]
@@ -6943,6 +6947,7 @@ define <3 x float> @global_extload_v3bf16_to_v3f32(ptr addrspace(1) %ptr) {
69436947; GFX1250-NEXT: s_wait_kmcnt 0x0
69446948; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off
69456949; GFX1250-NEXT: s_wait_loadcnt 0x0
6950+ ; GFX1250-NEXT: s_wait_xcnt 0x0
69466951; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
69476952; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
69486953; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -7033,6 +7038,7 @@ define <4 x float> @global_extload_v4bf16_to_v4f32(ptr addrspace(1) %ptr) {
70337038; GFX1250-NEXT: s_wait_kmcnt 0x0
70347039; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off
70357040; GFX1250-NEXT: s_wait_loadcnt 0x0
7041+ ; GFX1250-NEXT: s_wait_xcnt 0x0
70367042; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
70377043; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
70387044; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -7134,6 +7140,7 @@ define <5 x float> @global_extload_v5bf16_to_v5f32(ptr addrspace(1) %ptr) {
71347140; GFX1250-NEXT: s_wait_kmcnt 0x0
71357141; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off
71367142; GFX1250-NEXT: s_wait_loadcnt 0x0
7143+ ; GFX1250-NEXT: s_wait_xcnt 0x0
71377144; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v2
71387145; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
71397146; GFX1250-NEXT: v_lshlrev_b32_e32 v2, 16, v3
@@ -7251,6 +7258,7 @@ define <6 x float> @global_extload_v6bf16_to_v6f32(ptr addrspace(1) %ptr) {
72517258; GFX1250-NEXT: s_wait_kmcnt 0x0
72527259; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off
72537260; GFX1250-NEXT: s_wait_loadcnt 0x0
7261+ ; GFX1250-NEXT: s_wait_xcnt 0x0
72547262; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5
72557263; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
72567264; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
@@ -7367,6 +7375,7 @@ define <8 x float> @global_extload_v8bf16_to_v8f32(ptr addrspace(1) %ptr) {
73677375; GFX1250-NEXT: s_wait_kmcnt 0x0
73687376; GFX1250-NEXT: global_load_b128 v[4:7], v[0:1], off
73697377; GFX1250-NEXT: s_wait_loadcnt 0x0
7378+ ; GFX1250-NEXT: s_wait_xcnt 0x0
73707379; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v4 :: v_dual_lshlrev_b32 v2, 16, v5
73717380; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v4
73727381; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v5
@@ -8001,6 +8010,7 @@ define <2 x double> @global_extload_v2bf16_to_v2f64(ptr addrspace(1) %ptr) {
80018010; GFX1250-NEXT: s_wait_kmcnt 0x0
80028011; GFX1250-NEXT: global_load_b32 v0, v[0:1], off
80038012; GFX1250-NEXT: s_wait_loadcnt 0x0
8013+ ; GFX1250-NEXT: s_wait_xcnt 0x0
80048014; GFX1250-NEXT: v_lshlrev_b32_e32 v1, 16, v0
80058015; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
80068016; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -8241,6 +8251,7 @@ define <4 x double> @global_extload_v4bf16_to_v4f64(ptr addrspace(1) %ptr) {
82418251; GFX1250-NEXT: s_wait_kmcnt 0x0
82428252; GFX1250-NEXT: global_load_b64 v[2:3], v[0:1], off
82438253; GFX1250-NEXT: s_wait_loadcnt 0x0
8254+ ; GFX1250-NEXT: s_wait_xcnt 0x0
82448255; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v4, 16, v3
82458256; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
82468257; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
@@ -8377,6 +8388,7 @@ define <5 x double> @global_extload_v5bf16_to_v5f64(ptr addrspace(1) %ptr) {
83778388; GFX1250-NEXT: s_wait_kmcnt 0x0
83788389; GFX1250-NEXT: global_load_b128 v[2:5], v[0:1], off
83798390; GFX1250-NEXT: s_wait_loadcnt 0x0
8391+ ; GFX1250-NEXT: s_wait_xcnt 0x0
83808392; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v2 :: v_dual_lshlrev_b32 v5, 16, v3
83818393; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
83828394; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
@@ -8522,6 +8534,7 @@ define <6 x double> @global_extload_v6bf16_to_v6f64(ptr addrspace(1) %ptr) {
85228534; GFX1250-NEXT: s_wait_kmcnt 0x0
85238535; GFX1250-NEXT: global_load_b96 v[4:6], v[0:1], off
85248536; GFX1250-NEXT: s_wait_loadcnt 0x0
8537+ ; GFX1250-NEXT: s_wait_xcnt 0x0
85258538; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v4
85268539; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v4
85278540; GFX1250-NEXT: v_lshlrev_b32_e32 v4, 16, v5
@@ -8693,6 +8706,7 @@ define <8 x double> @global_extload_v8bf16_to_v8f64(ptr addrspace(1) %ptr) {
86938706; GFX1250-NEXT: s_wait_kmcnt 0x0
86948707; GFX1250-NEXT: global_load_b128 v[8:11], v[0:1], off
86958708; GFX1250-NEXT: s_wait_loadcnt 0x0
8709+ ; GFX1250-NEXT: s_wait_xcnt 0x0
86968710; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v8 :: v_dual_lshlrev_b32 v4, 16, v9
86978711; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v8
86988712; GFX1250-NEXT: v_and_b32_e32 v6, 0xffff0000, v9
0 commit comments