@@ -885,7 +885,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
885885; SI-NEXT: s_mov_b64 s[0:1], exec
886886; SI-NEXT: s_wqm_b64 exec, exec
887887; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
888- ; SI-NEXT: s_mov_b32 s4 , 0
888+ ; SI-NEXT: s_mov_b32 s6 , 0
889889; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
890890; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
891891; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -894,24 +894,24 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
894894; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
895895; SI-NEXT: s_cbranch_scc0 .LBB7_9
896896; SI-NEXT: ; %bb.2: ; %.demote0
897- ; SI-NEXT: s_wqm_b64 s[6:7 ], s[0:1]
898- ; SI-NEXT: s_and_b64 exec, exec, s[6:7 ]
897+ ; SI-NEXT: s_wqm_b64 s[4:5 ], s[0:1]
898+ ; SI-NEXT: s_and_b64 exec, exec, s[4:5 ]
899899; SI-NEXT: .LBB7_3: ; %.continue0.preheader
900900; SI-NEXT: s_or_b64 exec, exec, s[2:3]
901+ ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
901902; SI-NEXT: s_mov_b64 s[2:3], 0
902- ; SI-NEXT: v_mov_b32_e32 v0, s4
903+ ; SI-NEXT: v_mov_b32_e32 v0, s6
903904; SI-NEXT: s_branch .LBB7_5
904905; SI-NEXT: .LBB7_4: ; %.continue1
905906; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
906- ; SI-NEXT: s_or_b64 exec, exec, s[4:5 ]
907+ ; SI-NEXT: s_or_b64 exec, exec, s[6:7 ]
907908; SI-NEXT: v_add_u32_e32 v0, vcc, 1, v0
908909; SI-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
909910; SI-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
910911; SI-NEXT: s_andn2_b64 exec, exec, s[2:3]
911912; SI-NEXT: s_cbranch_execz .LBB7_8
912913; SI-NEXT: .LBB7_5: ; %.continue0
913914; SI-NEXT: ; =>This Inner Loop Header: Depth=1
914- ; SI-NEXT: s_mov_b64 s[4:5], s[0:1]
915915; SI-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
916916; SI-NEXT: v_mov_b32_e32 v3, v2
917917; SI-NEXT: s_nop 1
@@ -920,19 +920,19 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
920920; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
921921; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
922922; SI-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
923- ; SI-NEXT: s_and_b64 s[4:5 ], s[0:1 ], vcc
924- ; SI-NEXT: s_xor_b64 s[4:5 ], s[4:5 ], -1
925- ; SI-NEXT: s_and_saveexec_b64 s[6:7 ], s[4:5 ]
926- ; SI-NEXT: s_xor_b64 s[4:5 ], exec, s[6:7 ]
923+ ; SI-NEXT: s_and_b64 s[6:7 ], s[4:5 ], vcc
924+ ; SI-NEXT: s_xor_b64 s[6:7 ], s[6:7 ], -1
925+ ; SI-NEXT: s_and_saveexec_b64 s[8:9 ], s[6:7 ]
926+ ; SI-NEXT: s_xor_b64 s[6:7 ], exec, s[8:9 ]
927927; SI-NEXT: s_cbranch_execz .LBB7_4
928928; SI-NEXT: ; %bb.6: ; %.demote1
929929; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
930930; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
931931; SI-NEXT: s_cbranch_scc0 .LBB7_9
932932; SI-NEXT: ; %bb.7: ; %.demote1
933933; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1
934- ; SI-NEXT: s_wqm_b64 s[6:7 ], s[0:1]
935- ; SI-NEXT: s_and_b64 exec, exec, s[6:7 ]
934+ ; SI-NEXT: s_wqm_b64 s[8:9 ], s[0:1]
935+ ; SI-NEXT: s_and_b64 exec, exec, s[8:9 ]
936936; SI-NEXT: s_branch .LBB7_4
937937; SI-NEXT: .LBB7_8: ; %.return
938938; SI-NEXT: s_or_b64 exec, exec, s[2:3]
@@ -951,7 +951,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
951951; GFX9-NEXT: s_mov_b64 s[0:1], exec
952952; GFX9-NEXT: s_wqm_b64 exec, exec
953953; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
954- ; GFX9-NEXT: s_mov_b32 s4 , 0
954+ ; GFX9-NEXT: s_mov_b32 s6 , 0
955955; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
956956; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
957957; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -960,24 +960,24 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
960960; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
961961; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
962962; GFX9-NEXT: ; %bb.2: ; %.demote0
963- ; GFX9-NEXT: s_wqm_b64 s[6:7 ], s[0:1]
964- ; GFX9-NEXT: s_and_b64 exec, exec, s[6:7 ]
963+ ; GFX9-NEXT: s_wqm_b64 s[4:5 ], s[0:1]
964+ ; GFX9-NEXT: s_and_b64 exec, exec, s[4:5 ]
965965; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader
966966; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
967+ ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
967968; GFX9-NEXT: s_mov_b64 s[2:3], 0
968- ; GFX9-NEXT: v_mov_b32_e32 v0, s4
969+ ; GFX9-NEXT: v_mov_b32_e32 v0, s6
969970; GFX9-NEXT: s_branch .LBB7_5
970971; GFX9-NEXT: .LBB7_4: ; %.continue1
971972; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
972- ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5 ]
973+ ; GFX9-NEXT: s_or_b64 exec, exec, s[6:7 ]
973974; GFX9-NEXT: v_add_u32_e32 v0, 1, v0
974975; GFX9-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
975976; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
976977; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
977978; GFX9-NEXT: s_cbranch_execz .LBB7_8
978979; GFX9-NEXT: .LBB7_5: ; %.continue0
979980; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
980- ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
981981; GFX9-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
982982; GFX9-NEXT: v_mov_b32_e32 v3, v2
983983; GFX9-NEXT: s_nop 1
@@ -986,19 +986,19 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
986986; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
987987; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
988988; GFX9-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
989- ; GFX9-NEXT: s_and_b64 s[4:5 ], s[0:1 ], vcc
990- ; GFX9-NEXT: s_xor_b64 s[4:5 ], s[4:5 ], -1
991- ; GFX9-NEXT: s_and_saveexec_b64 s[6:7 ], s[4:5 ]
992- ; GFX9-NEXT: s_xor_b64 s[4:5 ], exec, s[6:7 ]
989+ ; GFX9-NEXT: s_and_b64 s[6:7 ], s[4:5 ], vcc
990+ ; GFX9-NEXT: s_xor_b64 s[6:7 ], s[6:7 ], -1
991+ ; GFX9-NEXT: s_and_saveexec_b64 s[8:9 ], s[6:7 ]
992+ ; GFX9-NEXT: s_xor_b64 s[6:7 ], exec, s[8:9 ]
993993; GFX9-NEXT: s_cbranch_execz .LBB7_4
994994; GFX9-NEXT: ; %bb.6: ; %.demote1
995995; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
996996; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
997997; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
998998; GFX9-NEXT: ; %bb.7: ; %.demote1
999999; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1
1000- ; GFX9-NEXT: s_wqm_b64 s[6:7 ], s[0:1]
1001- ; GFX9-NEXT: s_and_b64 exec, exec, s[6:7 ]
1000+ ; GFX9-NEXT: s_wqm_b64 s[8:9 ], s[0:1]
1001+ ; GFX9-NEXT: s_and_b64 exec, exec, s[8:9 ]
10021002; GFX9-NEXT: s_branch .LBB7_4
10031003; GFX9-NEXT: .LBB7_8: ; %.return
10041004; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
@@ -1031,37 +1031,37 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
10311031; GFX10-32-NEXT: .LBB7_3: ; %.continue0.preheader
10321032; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
10331033; GFX10-32-NEXT: v_mov_b32_e32 v0, s1
1034+ ; GFX10-32-NEXT: s_mov_b32 s2, s0
10341035; GFX10-32-NEXT: s_branch .LBB7_5
10351036; GFX10-32-NEXT: .LBB7_4: ; %.continue1
10361037; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
1037- ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s2
1038+ ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s3
10381039; GFX10-32-NEXT: v_add_nc_u32_e32 v0, 1, v0
10391040; GFX10-32-NEXT: v_cmp_ge_i32_e32 vcc_lo, v0, v1
10401041; GFX10-32-NEXT: s_or_b32 s1, vcc_lo, s1
10411042; GFX10-32-NEXT: s_andn2_b32 exec_lo, exec_lo, s1
10421043; GFX10-32-NEXT: s_cbranch_execz .LBB7_8
10431044; GFX10-32-NEXT: .LBB7_5: ; %.continue0
10441045; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1
1045- ; GFX10-32-NEXT: s_mov_b32 s2, s0
10461046; GFX10-32-NEXT: v_cndmask_b32_e64 v2, v0, 0, s2
10471047; GFX10-32-NEXT: v_mov_b32_e32 v3, v2
10481048; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
10491049; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
10501050; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
10511051; GFX10-32-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v2
1052- ; GFX10-32-NEXT: s_and_b32 s2, s0 , vcc_lo
1053- ; GFX10-32-NEXT: s_xor_b32 s2, s2 , -1
1054- ; GFX10-32-NEXT: s_and_saveexec_b32 s3, s2
1055- ; GFX10-32-NEXT: s_xor_b32 s2 , exec_lo, s3
1052+ ; GFX10-32-NEXT: s_and_b32 s3, s2 , vcc_lo
1053+ ; GFX10-32-NEXT: s_xor_b32 s3, s3 , -1
1054+ ; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3
1055+ ; GFX10-32-NEXT: s_xor_b32 s3 , exec_lo, s4
10561056; GFX10-32-NEXT: s_cbranch_execz .LBB7_4
10571057; GFX10-32-NEXT: ; %bb.6: ; %.demote1
10581058; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
10591059; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo
10601060; GFX10-32-NEXT: s_cbranch_scc0 .LBB7_9
10611061; GFX10-32-NEXT: ; %bb.7: ; %.demote1
10621062; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1
1063- ; GFX10-32-NEXT: s_wqm_b32 s3 , s0
1064- ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s3
1063+ ; GFX10-32-NEXT: s_wqm_b32 s4 , s0
1064+ ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s4
10651065; GFX10-32-NEXT: s_branch .LBB7_4
10661066; GFX10-32-NEXT: .LBB7_8: ; %.return
10671067; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1
@@ -1094,41 +1094,41 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
10941094; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader
10951095; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
10961096; GFX10-64-NEXT: v_mov_b32_e32 v0, s4
1097- ; GFX10-64-NEXT: s_mov_b64 s[2:3], 0
1097+ ; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1]
1098+ ; GFX10-64-NEXT: s_mov_b64 s[4:5], 0
10981099; GFX10-64-NEXT: s_branch .LBB7_5
10991100; GFX10-64-NEXT: .LBB7_4: ; %.continue1
11001101; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
1101- ; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5 ]
1102+ ; GFX10-64-NEXT: s_or_b64 exec, exec, s[6:7 ]
11021103; GFX10-64-NEXT: v_add_nc_u32_e32 v0, 1, v0
11031104; GFX10-64-NEXT: v_cmp_ge_i32_e32 vcc, v0, v1
1104- ; GFX10-64-NEXT: s_or_b64 s[2:3 ], vcc, s[2:3 ]
1105- ; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[2:3 ]
1105+ ; GFX10-64-NEXT: s_or_b64 s[4:5 ], vcc, s[4:5 ]
1106+ ; GFX10-64-NEXT: s_andn2_b64 exec, exec, s[4:5 ]
11061107; GFX10-64-NEXT: s_cbranch_execz .LBB7_8
11071108; GFX10-64-NEXT: .LBB7_5: ; %.continue0
11081109; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1
1109- ; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1]
1110- ; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[4:5]
1110+ ; GFX10-64-NEXT: v_cndmask_b32_e64 v2, v0, 0, s[2:3]
11111111; GFX10-64-NEXT: v_mov_b32_e32 v3, v2
11121112; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1
11131113; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
11141114; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec
11151115; GFX10-64-NEXT: v_cmp_eq_f32_e32 vcc, 0, v2
1116- ; GFX10-64-NEXT: s_and_b64 s[4:5 ], s[0:1 ], vcc
1117- ; GFX10-64-NEXT: s_xor_b64 s[4:5 ], s[4:5 ], -1
1118- ; GFX10-64-NEXT: s_and_saveexec_b64 s[6:7 ], s[4:5 ]
1119- ; GFX10-64-NEXT: s_xor_b64 s[4:5 ], exec, s[6:7 ]
1116+ ; GFX10-64-NEXT: s_and_b64 s[6:7 ], s[2:3 ], vcc
1117+ ; GFX10-64-NEXT: s_xor_b64 s[6:7 ], s[6:7 ], -1
1118+ ; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9 ], s[6:7 ]
1119+ ; GFX10-64-NEXT: s_xor_b64 s[6:7 ], exec, s[8:9 ]
11201120; GFX10-64-NEXT: s_cbranch_execz .LBB7_4
11211121; GFX10-64-NEXT: ; %bb.6: ; %.demote1
11221122; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
11231123; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
11241124; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
11251125; GFX10-64-NEXT: ; %bb.7: ; %.demote1
11261126; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1
1127- ; GFX10-64-NEXT: s_wqm_b64 s[6:7 ], s[0:1]
1128- ; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7 ]
1127+ ; GFX10-64-NEXT: s_wqm_b64 s[8:9 ], s[0:1]
1128+ ; GFX10-64-NEXT: s_and_b64 exec, exec, s[8:9 ]
11291129; GFX10-64-NEXT: s_branch .LBB7_4
11301130; GFX10-64-NEXT: .LBB7_8: ; %.return
1131- ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3 ]
1131+ ; GFX10-64-NEXT: s_or_b64 exec, exec, s[4:5 ]
11321132; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1]
11331133; GFX10-64-NEXT: v_mov_b32_e32 v0, 0x3c00
11341134; GFX10-64-NEXT: v_bfrev_b32_e32 v1, 60
0 commit comments