@@ -557,11 +557,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
557557; GFX908-NEXT: s_mul_hi_u32 s9, s0, s7
558558; GFX908-NEXT: s_mul_i32 s0, s0, s7
559559; GFX908-NEXT: s_add_i32 s1, s9, s1
560- ; GFX908-NEXT: s_lshl_b64 s[14:15 ], s[0:1], 5
560+ ; GFX908-NEXT: s_lshl_b64 s[0:1 ], s[0:1], 5
561561; GFX908-NEXT: s_branch .LBB3_2
562562; GFX908-NEXT: .LBB3_1: ; %Flow20
563563; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
564- ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1 ]
564+ ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15 ]
565565; GFX908-NEXT: s_cbranch_vccz .LBB3_12
566566; GFX908-NEXT: .LBB3_2: ; %bb9
567567; GFX908-NEXT: ; =>This Loop Header: Depth=1
@@ -571,17 +571,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
571571; GFX908-NEXT: ; %bb.3: ; %bb14
572572; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
573573; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
574- ; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
575574; GFX908-NEXT: s_mov_b32 s7, s6
576- ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
577575; GFX908-NEXT: v_mov_b32_e32 v4, s6
578- ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6
579576; GFX908-NEXT: v_mov_b32_e32 v6, s6
580577; GFX908-NEXT: v_mov_b32_e32 v9, s7
581578; GFX908-NEXT: v_mov_b32_e32 v5, s7
582579; GFX908-NEXT: v_mov_b32_e32 v7, s7
583580; GFX908-NEXT: v_mov_b32_e32 v8, s6
584- ; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
581+ ; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
582+ ; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1
585583; GFX908-NEXT: v_mov_b32_e32 v11, v5
586584; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11]
587585; GFX908-NEXT: v_mov_b32_e32 v10, v4
@@ -601,9 +599,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
601599; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
602600; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
603601; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
604- ; GFX908-NEXT: s_add_u32 s18, s18, s14
602+ ; GFX908-NEXT: s_add_u32 s18, s18, s0
605603; GFX908-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[2:3]
606- ; GFX908-NEXT: s_addc_u32 s19, s19, s15
604+ ; GFX908-NEXT: s_addc_u32 s19, s19, s1
607605; GFX908-NEXT: s_mov_b64 s[20:21], 0
608606; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23]
609607; GFX908-NEXT: s_cbranch_vccz .LBB3_9
@@ -622,7 +620,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
622620; GFX908-NEXT: s_waitcnt vmcnt(0)
623621; GFX908-NEXT: ds_read_b64 v[12:13], v19
624622; GFX908-NEXT: ds_read_b64 v[14:15], v0
625- ; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1 ]
623+ ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17 ]
626624; GFX908-NEXT: s_waitcnt lgkmcnt(0)
627625; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
628626; GFX908-NEXT: ; %bb.6: ; %bb51
@@ -650,7 +648,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
650648; GFX908-NEXT: s_mov_b64 s[20:21], -1
651649; GFX908-NEXT: s_branch .LBB3_4
652650; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
653- ; GFX908-NEXT: s_mov_b64 s[20:21], s[16:17 ]
651+ ; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15 ]
654652; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21]
655653; GFX908-NEXT: s_cbranch_vccz .LBB3_4
656654; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
@@ -661,7 +659,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
661659; GFX908-NEXT: s_xor_b64 s[16:17], s[20:21], -1
662660; GFX908-NEXT: .LBB3_10: ; %Flow19
663661; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
664- ; GFX908-NEXT: s_mov_b64 s[0:1 ], -1
662+ ; GFX908-NEXT: s_mov_b64 s[14:15 ], -1
665663; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17]
666664; GFX908-NEXT: s_cbranch_vccz .LBB3_1
667665; GFX908-NEXT: ; %bb.11: ; %bb12
@@ -670,7 +668,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
670668; GFX908-NEXT: s_addc_u32 s5, s5, 0
671669; GFX908-NEXT: s_add_u32 s10, s10, s12
672670; GFX908-NEXT: s_addc_u32 s11, s11, s13
673- ; GFX908-NEXT: s_mov_b64 s[0:1 ], 0
671+ ; GFX908-NEXT: s_mov_b64 s[14:15 ], 0
674672; GFX908-NEXT: s_branch .LBB3_1
675673; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock
676674; GFX908-NEXT: s_endpgm
@@ -720,11 +718,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
720718; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s7
721719; GFX90A-NEXT: s_mul_i32 s0, s0, s7
722720; GFX90A-NEXT: s_add_i32 s1, s9, s1
723- ; GFX90A-NEXT: s_lshl_b64 s[14:15 ], s[0:1], 5
721+ ; GFX90A-NEXT: s_lshl_b64 s[0:1 ], s[0:1], 5
724722; GFX90A-NEXT: s_branch .LBB3_2
725723; GFX90A-NEXT: .LBB3_1: ; %Flow20
726724; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
727- ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1 ]
725+ ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15 ]
728726; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
729727; GFX90A-NEXT: .LBB3_2: ; %bb9
730728; GFX90A-NEXT: ; =>This Loop Header: Depth=1
@@ -734,14 +732,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
734732; GFX90A-NEXT: ; %bb.3: ; %bb14
735733; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
736734; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
737- ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
738735; GFX90A-NEXT: s_mov_b32 s7, s6
739- ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1]
740736; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
741- ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8
742737; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[6:7], s[6:7] op_sel:[0,1]
743738; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
744- ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
739+ ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
740+ ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1
745741; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11]
746742; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
747743; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -760,8 +756,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
760756; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
761757; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
762758; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
763- ; GFX90A-NEXT: s_add_u32 s18, s18, s14
764- ; GFX90A-NEXT: s_addc_u32 s19, s19, s15
759+ ; GFX90A-NEXT: s_add_u32 s18, s18, s0
760+ ; GFX90A-NEXT: s_addc_u32 s19, s19, s1
765761; GFX90A-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[4:5]
766762; GFX90A-NEXT: s_mov_b64 s[20:21], 0
767763; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
@@ -781,7 +777,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
781777; GFX90A-NEXT: s_waitcnt vmcnt(0)
782778; GFX90A-NEXT: ds_read_b64 v[14:15], v19
783779; GFX90A-NEXT: ds_read_b64 v[16:17], v0
784- ; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1 ]
780+ ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17 ]
785781; GFX90A-NEXT: ; kill: killed $sgpr20 killed $sgpr21
786782; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
787783; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
@@ -802,7 +798,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
802798; GFX90A-NEXT: s_mov_b64 s[20:21], -1
803799; GFX90A-NEXT: s_branch .LBB3_4
804800; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
805- ; GFX90A-NEXT: s_mov_b64 s[20:21], s[16:17 ]
801+ ; GFX90A-NEXT: s_mov_b64 s[20:21], s[14:15 ]
806802; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[20:21]
807803; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
808804; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
@@ -813,7 +809,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
813809; GFX90A-NEXT: s_xor_b64 s[16:17], s[20:21], -1
814810; GFX90A-NEXT: .LBB3_10: ; %Flow19
815811; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
816- ; GFX90A-NEXT: s_mov_b64 s[0:1 ], -1
812+ ; GFX90A-NEXT: s_mov_b64 s[14:15 ], -1
817813; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17]
818814; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
819815; GFX90A-NEXT: ; %bb.11: ; %bb12
@@ -822,7 +818,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
822818; GFX90A-NEXT: s_addc_u32 s5, s5, 0
823819; GFX90A-NEXT: s_add_u32 s10, s10, s12
824820; GFX90A-NEXT: s_addc_u32 s11, s11, s13
825- ; GFX90A-NEXT: s_mov_b64 s[0:1 ], 0
821+ ; GFX90A-NEXT: s_mov_b64 s[14:15 ], 0
826822; GFX90A-NEXT: s_branch .LBB3_1
827823; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock
828824; GFX90A-NEXT: s_endpgm
0 commit comments