@@ -565,8 +565,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
565565; GFX908-NEXT: s_cbranch_vccz .LBB3_12
566566; GFX908-NEXT: .LBB3_2: ; %bb9
567567; GFX908-NEXT: ; =>This Loop Header: Depth=1
568- ; GFX908-NEXT: ; Child Loop BB3_5 Depth 2
569- ; GFX908-NEXT: s_mov_b64 s[18:19 ], -1
568+ ; GFX908-NEXT: ; Child Loop BB3_6 Depth 2
569+ ; GFX908-NEXT: s_mov_b64 s[22:23 ], -1
570570; GFX908-NEXT: s_mov_b64 vcc, s[0:1]
571571; GFX908-NEXT: s_cbranch_vccz .LBB3_10
572572; GFX908-NEXT: ; %bb.3: ; %bb14
@@ -597,18 +597,25 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
597597; GFX908-NEXT: s_add_i32 s13, s22, s13
598598; GFX908-NEXT: s_mul_i32 s9, s6, s9
599599; GFX908-NEXT: s_add_i32 s13, s13, s23
600- ; GFX908-NEXT: s_branch .LBB3_5
600+ ; GFX908-NEXT: s_branch .LBB3_6
601601; GFX908-NEXT: .LBB3_4: ; %bb58
602- ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
602+ ; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
603603; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
604604; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
605605; GFX908-NEXT: s_add_u32 s20, s20, s4
606606; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
607607; GFX908-NEXT: s_addc_u32 s21, s21, s5
608608; GFX908-NEXT: s_mov_b64 s[22:23], 0
609+ ; GFX908-NEXT: .LBB3_5: ; %Flow18
610+ ; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
611+ ; GFX908-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[22:23]
612+ ; GFX908-NEXT: v_readfirstlane_b32 s22, v12
613+ ; GFX908-NEXT: s_not_b32 s22, s22
614+ ; GFX908-NEXT: s_bitcmp1_b32 s22, 0
615+ ; GFX908-NEXT: s_cselect_b64 s[22:23], -1, 0
609616; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
610- ; GFX908-NEXT: s_cbranch_vccz .LBB3_9
611- ; GFX908-NEXT: .LBB3_5 : ; %bb16
617+ ; GFX908-NEXT: s_cbranch_vccz .LBB3_10
618+ ; GFX908-NEXT: .LBB3_6 : ; %bb16
612619; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
613620; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
614621; GFX908-NEXT: s_add_u32 s22, s20, s9
@@ -625,9 +632,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
625632; GFX908-NEXT: ds_read_b64 v[14:15], v0
626633; GFX908-NEXT: s_and_b64 vcc, exec, s[2:3]
627634; GFX908-NEXT: s_waitcnt lgkmcnt(0)
628- ; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
629- ; GFX908-NEXT: ; %bb.6 : ; %bb51
630- ; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
635+ ; GFX908-NEXT: s_cbranch_vccnz .LBB3_8
636+ ; GFX908-NEXT: ; %bb.7 : ; %bb51
637+ ; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
631638; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
632639; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21
633640; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -649,21 +656,20 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
649656; GFX908-NEXT: v_add_f32_e32 v10, v10, v12
650657; GFX908-NEXT: v_add_f32_e32 v11, v11, v13
651658; GFX908-NEXT: s_branch .LBB3_4
652- ; GFX908-NEXT: .LBB3_7 : ; in Loop: Header=BB3_5 Depth=2
659+ ; GFX908-NEXT: .LBB3_8 : ; in Loop: Header=BB3_6 Depth=2
653660; GFX908-NEXT: s_mov_b64 s[22:23], s[18:19]
654661; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23]
655662; GFX908-NEXT: s_cbranch_vccz .LBB3_4
656- ; GFX908-NEXT: ; %bb.8 : ; in Loop: Header=BB3_2 Depth=1
663+ ; GFX908-NEXT: ; %bb.9 : ; in Loop: Header=BB3_6 Depth=2
657664; GFX908-NEXT: s_mov_b64 s[22:23], -1
658665; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
659666; GFX908-NEXT: ; implicit-def: $sgpr20_sgpr21
660- ; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard
661- ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
662- ; GFX908-NEXT: s_xor_b64 s[18:19], s[22:23], -1
667+ ; GFX908-NEXT: s_mov_b64 s[24:25], -1
668+ ; GFX908-NEXT: s_branch .LBB3_5
663669; GFX908-NEXT: .LBB3_10: ; %Flow19
664670; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
665671; GFX908-NEXT: s_mov_b64 s[2:3], -1
666- ; GFX908-NEXT: s_and_b64 vcc, exec, s[18:19 ]
672+ ; GFX908-NEXT: s_and_b64 vcc, exec, s[22:23 ]
667673; GFX908-NEXT: s_cbranch_vccz .LBB3_1
668674; GFX908-NEXT: ; %bb.11: ; %bb12
669675; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
@@ -730,8 +736,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
730736; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
731737; GFX90A-NEXT: .LBB3_2: ; %bb9
732738; GFX90A-NEXT: ; =>This Loop Header: Depth=1
733- ; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2
734- ; GFX90A-NEXT: s_mov_b64 s[18:19 ], -1
739+ ; GFX90A-NEXT: ; Child Loop BB3_6 Depth 2
740+ ; GFX90A-NEXT: s_mov_b64 s[22:23 ], -1
735741; GFX90A-NEXT: s_mov_b64 vcc, s[0:1]
736742; GFX90A-NEXT: s_cbranch_vccz .LBB3_10
737743; GFX90A-NEXT: ; %bb.3: ; %bb14
@@ -758,18 +764,25 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
758764; GFX90A-NEXT: s_add_i32 s13, s22, s13
759765; GFX90A-NEXT: s_mul_i32 s9, s6, s9
760766; GFX90A-NEXT: s_add_i32 s13, s13, s23
761- ; GFX90A-NEXT: s_branch .LBB3_5
767+ ; GFX90A-NEXT: s_branch .LBB3_6
762768; GFX90A-NEXT: .LBB3_4: ; %bb58
763- ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
769+ ; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2
764770; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
765771; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
766772; GFX90A-NEXT: s_add_u32 s20, s20, s4
767773; GFX90A-NEXT: s_addc_u32 s21, s21, s5
768774; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
769775; GFX90A-NEXT: s_mov_b64 s[22:23], 0
776+ ; GFX90A-NEXT: .LBB3_5: ; %Flow18
777+ ; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2
778+ ; GFX90A-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[22:23]
779+ ; GFX90A-NEXT: v_readfirstlane_b32 s22, v14
780+ ; GFX90A-NEXT: s_not_b32 s22, s22
781+ ; GFX90A-NEXT: s_bitcmp1_b32 s22, 0
782+ ; GFX90A-NEXT: s_cselect_b64 s[22:23], -1, 0
770783; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
771- ; GFX90A-NEXT: s_cbranch_vccz .LBB3_9
772- ; GFX90A-NEXT: .LBB3_5 : ; %bb16
784+ ; GFX90A-NEXT: s_cbranch_vccz .LBB3_10
785+ ; GFX90A-NEXT: .LBB3_6 : ; %bb16
773786; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1
774787; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
775788; GFX90A-NEXT: s_add_u32 s22, s20, s9
@@ -787,9 +800,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
787800; GFX90A-NEXT: s_and_b64 vcc, exec, s[2:3]
788801; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23
789802; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
790- ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
791- ; GFX90A-NEXT: ; %bb.6 : ; %bb51
792- ; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
803+ ; GFX90A-NEXT: s_cbranch_vccnz .LBB3_8
804+ ; GFX90A-NEXT: ; %bb.7 : ; %bb51
805+ ; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2
793806; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
794807; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v21
795808; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -803,21 +816,20 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
803816; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17]
804817; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15]
805818; GFX90A-NEXT: s_branch .LBB3_4
806- ; GFX90A-NEXT: .LBB3_7 : ; in Loop: Header=BB3_5 Depth=2
819+ ; GFX90A-NEXT: .LBB3_8 : ; in Loop: Header=BB3_6 Depth=2
807820; GFX90A-NEXT: s_mov_b64 s[22:23], s[18:19]
808821; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
809822; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
810- ; GFX90A-NEXT: ; %bb.8 : ; in Loop: Header=BB3_2 Depth=1
823+ ; GFX90A-NEXT: ; %bb.9 : ; in Loop: Header=BB3_6 Depth=2
811824; GFX90A-NEXT: s_mov_b64 s[22:23], -1
812825; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
813826; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21
814- ; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard
815- ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
816- ; GFX90A-NEXT: s_xor_b64 s[18:19], s[22:23], -1
827+ ; GFX90A-NEXT: s_mov_b64 s[24:25], -1
828+ ; GFX90A-NEXT: s_branch .LBB3_5
817829; GFX90A-NEXT: .LBB3_10: ; %Flow19
818830; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
819831; GFX90A-NEXT: s_mov_b64 s[2:3], -1
820- ; GFX90A-NEXT: s_and_b64 vcc, exec, s[18:19 ]
832+ ; GFX90A-NEXT: s_and_b64 vcc, exec, s[22:23 ]
821833; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
822834; GFX90A-NEXT: ; %bb.11: ; %bb12
823835; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
0 commit comments