@@ -518,14 +518,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
518518; GFX908-NEXT: s_waitcnt lgkmcnt(0)
519519; GFX908-NEXT: v_cvt_f32_u32_e32 v0, s1
520520; GFX908-NEXT: s_sub_i32 s3, 0, s1
521- ; GFX908-NEXT: v_cvt_f32_f16_e32 v17, s2
522- ; GFX908-NEXT: v_mov_b32_e32 v19, 0
523- ; GFX908-NEXT: v_rcp_iflag_f32_e32 v2, v0
524- ; GFX908-NEXT: v_mov_b32_e32 v0, 0
525- ; GFX908-NEXT: v_mov_b32_e32 v1, 0
526- ; GFX908-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
527- ; GFX908-NEXT: v_cvt_u32_f32_e32 v2, v2
528- ; GFX908-NEXT: v_readfirstlane_b32 s5, v2
521+ ; GFX908-NEXT: v_cvt_f32_f16_e32 v18, s2
522+ ; GFX908-NEXT: v_mov_b32_e32 v17, 0
523+ ; GFX908-NEXT: v_rcp_iflag_f32_e32 v0, v0
524+ ; GFX908-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
525+ ; GFX908-NEXT: v_cvt_u32_f32_e32 v0, v0
526+ ; GFX908-NEXT: v_readfirstlane_b32 s5, v0
529527; GFX908-NEXT: s_mul_i32 s3, s3, s5
530528; GFX908-NEXT: s_mul_hi_u32 s3, s5, s3
531529; GFX908-NEXT: s_add_i32 s5, s5, s3
@@ -541,12 +539,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
541539; GFX908-NEXT: s_cmp_ge_u32 s0, s1
542540; GFX908-NEXT: s_cselect_b32 s8, s5, s3
543541; GFX908-NEXT: s_lshr_b32 s2, s2, 16
544- ; GFX908-NEXT: v_cvt_f32_f16_e32 v18 , s2
542+ ; GFX908-NEXT: v_cvt_f32_f16_e32 v19 , s2
545543; GFX908-NEXT: s_lshl_b64 s[12:13], s[6:7], 5
544+ ; GFX908-NEXT: v_mov_b32_e32 v0, 0
546545; GFX908-NEXT: s_lshl_b64 s[10:11], s[16:17], 5
547546; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1]
548547; GFX908-NEXT: s_or_b32 s12, s12, 28
549548; GFX908-NEXT: s_lshl_b64 s[14:15], s[8:9], 5
549+ ; GFX908-NEXT: v_mov_b32_e32 v1, 0
550550; GFX908-NEXT: s_waitcnt vmcnt(0)
551551; GFX908-NEXT: v_readfirstlane_b32 s2, v16
552552; GFX908-NEXT: s_and_b32 s2, 0xffff, s2
@@ -610,7 +610,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
610610; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
611611; GFX908-NEXT: s_add_u32 s22, s20, s5
612612; GFX908-NEXT: s_addc_u32 s23, s21, s9
613- ; GFX908-NEXT: global_load_dword v21, v19 , s[22:23] offset:-12 glc
613+ ; GFX908-NEXT: global_load_dword v21, v17 , s[22:23] offset:-12 glc
614614; GFX908-NEXT: s_waitcnt vmcnt(0)
615615; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:-8 glc
616616; GFX908-NEXT: s_waitcnt vmcnt(0)
@@ -685,12 +685,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
685685; GFX90A-NEXT: v_cvt_f32_u32_e32 v0, s1
686686; GFX90A-NEXT: s_sub_i32 s3, 0, s1
687687; GFX90A-NEXT: v_mov_b32_e32 v19, 0
688- ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v2, v0
689- ; GFX90A-NEXT: v_pk_mov_b32 v[0:1], 0, 0
690- ; GFX90A-NEXT: v_mul_f32_e32 v2 , 0x4f7ffffe, v2
691- ; GFX90A-NEXT: v_cvt_u32_f32_e32 v3, v2
692- ; GFX90A-NEXT: v_cvt_f32_f16_e32 v2 , s2
693- ; GFX90A-NEXT: v_readfirstlane_b32 s5, v3
688+ ; GFX90A-NEXT: v_pk_mov_b32 v[2:3], 0, 0
689+ ; GFX90A-NEXT: v_rcp_iflag_f32_e32 v0, v0
690+ ; GFX90A-NEXT: v_mul_f32_e32 v0 , 0x4f7ffffe, v0
691+ ; GFX90A-NEXT: v_cvt_u32_f32_e32 v1, v0
692+ ; GFX90A-NEXT: v_cvt_f32_f16_e32 v0 , s2
693+ ; GFX90A-NEXT: v_readfirstlane_b32 s5, v1
694694; GFX90A-NEXT: s_mul_i32 s3, s3, s5
695695; GFX90A-NEXT: s_mul_hi_u32 s3, s5, s3
696696; GFX90A-NEXT: s_add_i32 s5, s5, s3
@@ -706,7 +706,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
706706; GFX90A-NEXT: s_cmp_ge_u32 s0, s1
707707; GFX90A-NEXT: s_cselect_b32 s8, s5, s3
708708; GFX90A-NEXT: s_lshr_b32 s2, s2, 16
709- ; GFX90A-NEXT: v_cvt_f32_f16_e32 v3 , s2
709+ ; GFX90A-NEXT: v_cvt_f32_f16_e32 v1 , s2
710710; GFX90A-NEXT: s_lshl_b64 s[12:13], s[6:7], 5
711711; GFX90A-NEXT: s_lshl_b64 s[10:11], s[16:17], 5
712712; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1]
@@ -733,7 +733,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
733733; GFX90A-NEXT: s_cbranch_vccz .LBB3_10
734734; GFX90A-NEXT: ; %bb.3: ; %bb14
735735; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
736- ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1 ], off
736+ ; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3 ], off
737737; GFX90A-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], -1
738738; GFX90A-NEXT: s_mov_b32 s5, s4
739739; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[2:3]
0 commit comments