@@ -87,21 +87,37 @@ define amdgpu_kernel void @sqrt_f16(
8787; GFX12-TRUE16-LABEL: sqrt_f16: 
8888; GFX12-TRUE16:       ; %bb.0: ; %entry 
8989; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 
90- ; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, 0 
90+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s6, -1 
91+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000 
92+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s10, s6 
93+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s11, s7 
9194; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0 
92- ; GFX12-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 
93- ; GFX12-TRUE16-NEXT:    s_mov_b32 s2, -1 
94- ; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null 
95+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2 
96+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3 
97+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0 
98+ ; GFX12-TRUE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null 
99+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1 
100+ ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0 
101+ ; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, v0.l 
102+ ; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null 
95103; GFX12-TRUE16-NEXT:    s_endpgm 
96104; 
97105; GFX12-FAKE16-LABEL: sqrt_f16: 
98106; GFX12-FAKE16:       ; %bb.0: ; %entry 
99107; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 
100- ; GFX12-FAKE16-NEXT:    v_sqrt_f16_e32 v0, 0 
108+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s6, -1 
109+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000 
110+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s10, s6 
111+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s11, s7 
101112; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0 
102- ; GFX12-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 
103- ; GFX12-FAKE16-NEXT:    s_mov_b32 s2, -1 
104- ; GFX12-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null 
113+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s2 
114+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s3 
115+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s4, s0 
116+ ; GFX12-FAKE16-NEXT:    buffer_load_u16 v0, off, s[8:11], null 
117+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, s1 
118+ ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0 
119+ ; GFX12-FAKE16-NEXT:    v_sqrt_f16_e32 v0, v0 
120+ ; GFX12-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null 
105121; GFX12-FAKE16-NEXT:    s_endpgm 
106122    ptr  addrspace (1 ) %r ,
107123    ptr  addrspace (1 ) %a ) {
@@ -215,27 +231,45 @@ define amdgpu_kernel void @sqrt_v2f16(
215231; GFX12-TRUE16-LABEL: sqrt_v2f16: 
216232; GFX12-TRUE16:       ; %bb.0: ; %entry 
217233; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 
218- ; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, 0 
219- ; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v0.h, 0 
234+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s6, -1 
235+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000 
236+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s10, s6 
237+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s11, s7 
220238; GFX12-TRUE16-NEXT:    s_wait_kmcnt 0x0 
221- ; GFX12-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000 
222- ; GFX12-TRUE16-NEXT:    s_mov_b32 s2, -1 
223- ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) 
239+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s8, s2 
240+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s9, s3 
241+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s4, s0 
242+ ; GFX12-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null 
243+ ; GFX12-TRUE16-NEXT:    s_mov_b32 s5, s1 
244+ ; GFX12-TRUE16-NEXT:    s_wait_loadcnt 0x0 
245+ ; GFX12-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0 
246+ ; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v0.l, v0.l 
247+ ; GFX12-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) 
248+ ; GFX12-TRUE16-NEXT:    v_sqrt_f16_e32 v0.h, v1.l 
224249; GFX12-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h 
225- ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3 ], null 
250+ ; GFX12-TRUE16-NEXT:    buffer_store_b32 v0, off, s[4:7 ], null 
226251; GFX12-TRUE16-NEXT:    s_endpgm 
227252; 
228253; GFX12-FAKE16-LABEL: sqrt_v2f16: 
229254; GFX12-FAKE16:       ; %bb.0: ; %entry 
230255; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24 
231- ; GFX12-FAKE16-NEXT:    v_sqrt_f16_e32 v0, 0 
232- ; GFX12-FAKE16-NEXT:    v_sqrt_f16_e32 v1, 0 
256+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s6, -1 
257+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000 
258+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s10, s6 
259+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s11, s7 
233260; GFX12-FAKE16-NEXT:    s_wait_kmcnt 0x0 
234- ; GFX12-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000 
235- ; GFX12-FAKE16-NEXT:    s_mov_b32 s2, -1 
236- ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(TRANS32_DEP_1) 
261+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s8, s2 
262+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s9, s3 
263+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s4, s0 
264+ ; GFX12-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null 
265+ ; GFX12-FAKE16-NEXT:    s_mov_b32 s5, s1 
266+ ; GFX12-FAKE16-NEXT:    s_wait_loadcnt 0x0 
267+ ; GFX12-FAKE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v0 
268+ ; GFX12-FAKE16-NEXT:    v_sqrt_f16_e32 v0, v0 
269+ ; GFX12-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(TRANS32_DEP_1) 
270+ ; GFX12-FAKE16-NEXT:    v_sqrt_f16_e32 v1, v1 
237271; GFX12-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1 
238- ; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[0:3 ], null 
272+ ; GFX12-FAKE16-NEXT:    buffer_store_b32 v0, off, s[4:7 ], null 
239273; GFX12-FAKE16-NEXT:    s_endpgm 
240274    ptr  addrspace (1 ) %r ,
241275    ptr  addrspace (1 ) %a ) {
0 commit comments