Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
65 commits
Select commit Hold shift + click to select a range
aec2e4f
Initial commit
justinfargnoli Jul 1, 2024
33efd19
Prefer early return
justinfargnoli Jul 8, 2024
c557840
clang-format
justinfargnoli Jul 8, 2024
7212420
Add negative test for `ISD::TRUNCATE`
justinfargnoli Jul 8, 2024
01ad890
Add cvt and cvt_not test
justinfargnoli Jul 12, 2024
a015d23
Prefer early return
justinfargnoli Jul 12, 2024
18bc242
clang-format
justinfargnoli Jul 12, 2024
04f1ec4
Correct trunc_cvt_not test
justinfargnoli Jul 12, 2024
289fd4b
Delete NVPTXISelLowering implementation
justinfargnoli Jul 15, 2024
f9b01f1
Perform optimization via DAGCombine + TLI
justinfargnoli Jul 15, 2024
e6b2e1e
Update variadics-backend.ll
justinfargnoli Jul 16, 2024
4d1cc02
Save work
justinfargnoli Jul 17, 2024
8f5a351
Modify boolean-patterns.ll
justinfargnoli Jul 17, 2024
7cc72b3
Revert "Save work"
justinfargnoli Jul 17, 2024
0d44023
Save work
justinfargnoli Jul 17, 2024
f0777f3
Remove improper TLI use
justinfargnoli Jul 17, 2024
6d603cf
Remove white space
justinfargnoli Jul 17, 2024
e5be174
clang-format
justinfargnoli Jul 17, 2024
058c35e
Implement transform in DAG combine
justinfargnoli Jul 17, 2024
3ea5867
Save work
justinfargnoli Jul 22, 2024
b2afe79
Remove isTruncateFree
justinfargnoli Jul 22, 2024
04ae5d4
Use ptr in test
justinfargnoli Jul 22, 2024
830442a
Only run on free truncs
justinfargnoli Jul 22, 2024
401c847
Add comment
justinfargnoli Jul 22, 2024
6bec31d
clang-format
justinfargnoli Jul 22, 2024
bf0620d
explicitly mention GPU in comment
justinfargnoli Jul 22, 2024
3e7aa00
clang-format
justinfargnoli Jul 22, 2024
e8046cb
Add comment to TLI function
justinfargnoli Jul 22, 2024
0d58daf
clang-format
justinfargnoli Jul 22, 2024
09b2394
Update boolean-patterns.ll
justinfargnoli Jul 22, 2024
30c5993
Use isNarrowingProfitable instead of shouldReduceRegisterPressure
justinfargnoli Jul 23, 2024
9cd0dee
clang-format
justinfargnoli Jul 23, 2024
efff174
Update tests
justinfargnoli Sep 12, 2024
0c33486
Update more tests
justinfargnoli Sep 12, 2024
41df970
Update even more tests
justinfargnoli Sep 12, 2024
b1eb61a
Narrowing vectors isn't profitable | Update an extraordinary number o…
justinfargnoli Sep 12, 2024
939885f
Changes with additional mov instructions
justinfargnoli Sep 14, 2024
f0f4ad0
Changes with non-trivial diffs
justinfargnoli Sep 14, 2024
3767c76
Changes with a higher instruction count
justinfargnoli Sep 14, 2024
eef0dbc
Non-trivial diff
justinfargnoli Sep 15, 2024
4d804df
Prefer readable condition
justinfargnoli Sep 15, 2024
6c2ae42
Update 2 X86 tests that I missed
justinfargnoli Sep 15, 2024
e3f8d9f
Update register names for NVPTX/boolean-patterns.ll
justinfargnoli Sep 15, 2024
13edcaa
Move isTypeLegal condition up
justinfargnoli Sep 16, 2024
a52b459
Update AMDGPU/computeNumSignBits-mul.ll
justinfargnoli Sep 19, 2024
06a0e68
Address comment
justinfargnoli Sep 20, 2024
401bb07
Prevent infinite loop by checking TLI.IsDesirableToPromoteOp()
justinfargnoli Sep 24, 2024
d887b68
Fixup previous commit
justinfargnoli Sep 24, 2024
5811e11
clang-format
justinfargnoli Sep 24, 2024
0e931f3
Use new isNarrowingProfitable API
justinfargnoli Sep 24, 2024
281f897
Update tests on ToT
justinfargnoli Sep 24, 2024
f1fee8e
Update NVPTX isNarrowingProfitable API
justinfargnoli Sep 24, 2024
47fadcd
Update AMDGPU tests
justinfargnoli Sep 24, 2024
9ed8f93
Pass the correct SDNode to isNarrowingProfitable in DAGCombiner
justinfargnoli Sep 24, 2024
2aa5bbc
Update AMDGPU tests
justinfargnoli Sep 24, 2024
2f9995f
Update amdgpu-codegenprepare-idiv.ll
justinfargnoli Sep 25, 2024
1d681b9
Update NVPTX tests
justinfargnoli Oct 31, 2024
bbf1635
Address comments
justinfargnoli Oct 31, 2024
a6c2465
[AMDGPU] Cleanup xor.ll
justinfargnoli Nov 1, 2024
7f5bd91
[AMDGPU] revert xor.ll si-annotate-cf.ll changes
justinfargnoli Nov 1, 2024
5ab861f
[AMDGPU] Revert and.ll changes
justinfargnoli Nov 1, 2024
6dfabf6
[AMDGPU] It's not profitable to narrow to i1
justinfargnoli Nov 1, 2024
d3b68b1
Fix rebase
justinfargnoli May 6, 2025
53965f2
Fix AMD infinite loop
justinfargnoli May 6, 2025
99da17d
Update tests
justinfargnoli May 6, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 33 additions & 2 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6036,6 +6036,10 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
HandOpcode == ISD::ANY_EXTEND_VECTOR_INREG) &&
LegalTypes && !TLI.isTypeDesirableForOp(LogicOpcode, XVT))
return SDValue();
// Prevent an infinite loop if the target prefers the inverse
// transformation.
if (TLI.isNarrowingProfitable(N, XVT, VT))
return SDValue();
// logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y)
SDNodeFlags LogicFlags;
LogicFlags.setDisjoint(N->getFlags().hasDisjoint() &&
Expand All @@ -6048,6 +6052,9 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {

// logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
if (HandOpcode == ISD::TRUNCATE) {
// Don't create a logic op on an illegal type.
if (!TLI.isTypeLegal(XVT))
return SDValue();
// If both operands have other uses, this transform would create extra
// instructions without eliminating anything.
if (!N0.hasOneUse() && !N1.hasOneUse())
Expand All @@ -6059,10 +6066,12 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
return SDValue();
// Be extra careful sinking truncate. If it's free, there's no benefit in
// widening a binop. Also, don't create a logic op on an illegal type.
// widening a binop.
if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
return SDValue();
if (!TLI.isTypeLegal(XVT))
// Prevent an infinite loop if the target prefers the inverse
// transformation.
if (TLI.isNarrowingProfitable(N, XVT, VT))
return SDValue();
SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
return DAG.getNode(HandOpcode, DL, VT, Logic);
Expand Down Expand Up @@ -15869,6 +15878,28 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
break;
}

if (!LegalOperations || TLI.isOperationLegal(N0.getOpcode(), VT)) {
switch (N0.getOpcode()) {
case ISD::ADD:
case ISD::SUB:
case ISD::MUL:
case ISD::AND:
case ISD::OR:
case ISD::XOR:
if (!N0.hasOneUse() || !VT.isScalarInteger())
break;
if (!TLI.isNarrowingProfitable(N0.getNode(), SrcVT, VT))
break;
SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
SDValue TruncatedOp =
DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
if (TLI.IsDesirableToPromoteOp(TruncatedOp, SrcVT))
break;
Comment on lines +15895 to +15898
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needing to speculatively create some nodes in case it's profitable is unfortunate, is there a way to avoid this

return TruncatedOp;
}
}

return SDValue();
}

Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,8 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
case ISD::MUL:
case ISD::SETCC:
case ISD::SELECT:
if (DestVT.getScalarSizeInBits() == 1)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

getScalarType() == i1

return false;
if (Subtarget->has16BitInsts() &&
(!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
// Don't narrow back down to i16 if promoted to i32 already.
Expand Down
8 changes: 8 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,14 @@ class NVPTXTargetLowering : public TargetLowering {
DstTy->getPrimitiveSizeInBits() == 32;
}

bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override {
  // Narrowing is only worthwhile for scalar integers; on NVPTX the
  // 64-bit -> 32-bit truncation is free in SASS.
  if (SrcVT.isScalarInteger() && DestVT.isScalarInteger())
    return SrcVT.getFixedSizeInBits() == 64 &&
           DestVT.getFixedSizeInBits() == 32;
  return false;
}

EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
EVT VT) const override {
if (VT.isVector())
Expand Down
4 changes: 2 additions & 2 deletions llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1438,8 +1438,8 @@ def : Pat<(mul i1:$a, imm:$b), (ANDb1ri $a, imm:$b)>;
// These transformations were once reliably performed by instcombine, but thanks
// to poison semantics they are no longer safe for LLVM IR, perform them here
// instead.
def : Pat<(select i1:$a, i1:$b, 0), (ANDb1rr $a, $b)>;
def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr $a, $b)>;
def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>;

// Lower logical v2i16/v4i8 ops as bitwise ops on b32.
foreach vt = [v2i16, v4i8] in {
Expand Down
2 changes: 2 additions & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35425,6 +35425,8 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                              EVT DestVT) const {
  // Only scalar integer narrowing is considered profitable here.
  if (!SrcVT.isScalarInteger() || !DestVT.isScalarInteger())
    return false;
  // i16 instructions are longer (0x66 prefix) and potentially slower, so
  // narrowing i32 down to i16 is not profitable.
  return SrcVT != MVT::i32 || DestVT != MVT::i16;
}

Expand Down
67 changes: 67 additions & 0 deletions llvm/test/CodeGen/AMDGPU/add_i1.ll
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s
Expand All @@ -6,6 +7,20 @@
; GFX9: v_xor_b32_e32
; GFX10: v_xor_b32_e32
define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) {
; GFX9-LABEL: add_var_var_i1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%a = load volatile i1, ptr addrspace(1) %in0
%b = load volatile i1, ptr addrspace(1) %in1
%add = add i1 %a, %b
Expand All @@ -17,6 +32,17 @@ define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1
; GFX9: s_xor_b64
; GFX10: s_xor_b32
define amdgpu_kernel void @add_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; GFX9-LABEL: add_var_imm_i1:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_not_b32_e32 v1, v1
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
%a = load volatile i1, ptr addrspace(1) %in
%add = add i1 %a, 1
store i1 %add, ptr addrspace(1) %out
Expand All @@ -28,6 +54,44 @@ define amdgpu_kernel void @add_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1
; GFX9: s_xor_b64
; GFX10: s_xor_b32
define amdgpu_kernel void @add_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) {
; GFX9-LABEL: add_i1_cf:
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34
; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0
; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5
; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; GFX9-NEXT: s_cbranch_execz .LBB2_2
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[8:9] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0
; GFX9-NEXT: .LBB2_2: ; %Flow
; GFX9-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; GFX9-NEXT: s_cbranch_execz .LBB2_4
; GFX9-NEXT: ; %bb.3: ; %if
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] glc
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: s_andn2_b64 s[2:3], s[4:5], exec
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec
; GFX9-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5]
; GFX9-NEXT: .LBB2_4: ; %endif
; GFX9-NEXT: s_or_b64 exec, exec, s[6:7]
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5]
; GFX9-NEXT: v_not_b32_e32 v1, v1
; GFX9-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NEXT: v_and_b32_e32 v1, 1, v1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_byte v0, v1, s[0:1]
; GFX9-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%d_cmp = icmp ult i32 %tid, 16
Expand All @@ -49,3 +113,6 @@ endif:
}

declare i32 @llvm.amdgcn.workitem.id.x()
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; GCN: {{.*}}
; GFX10: {{.*}}
72 changes: 42 additions & 30 deletions llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
Original file line number Diff line number Diff line change
Expand Up @@ -565,8 +565,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_cbranch_vccz .LBB3_12
; GFX908-NEXT: .LBB3_2: ; %bb9
; GFX908-NEXT: ; =>This Loop Header: Depth=1
; GFX908-NEXT: ; Child Loop BB3_5 Depth 2
; GFX908-NEXT: s_mov_b64 s[18:19], -1
; GFX908-NEXT: ; Child Loop BB3_6 Depth 2
; GFX908-NEXT: s_mov_b64 s[22:23], -1
; GFX908-NEXT: s_mov_b64 vcc, s[0:1]
; GFX908-NEXT: s_cbranch_vccz .LBB3_10
; GFX908-NEXT: ; %bb.3: ; %bb14
Expand Down Expand Up @@ -597,18 +597,25 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_add_i32 s13, s22, s13
; GFX908-NEXT: s_mul_i32 s9, s6, s9
; GFX908-NEXT: s_add_i32 s13, s13, s23
; GFX908-NEXT: s_branch .LBB3_5
; GFX908-NEXT: s_branch .LBB3_6
; GFX908-NEXT: .LBB3_4: ; %bb58
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
; GFX908-NEXT: s_add_u32 s20, s20, s4
; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
; GFX908-NEXT: s_addc_u32 s21, s21, s5
; GFX908-NEXT: s_mov_b64 s[22:23], 0
; GFX908-NEXT: .LBB3_5: ; %Flow18
; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX908-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[22:23]
; GFX908-NEXT: v_readfirstlane_b32 s22, v12
; GFX908-NEXT: s_not_b32 s22, s22
; GFX908-NEXT: s_bitcmp1_b32 s22, 0
; GFX908-NEXT: s_cselect_b64 s[22:23], -1, 0
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
; GFX908-NEXT: s_cbranch_vccz .LBB3_9
; GFX908-NEXT: .LBB3_5: ; %bb16
; GFX908-NEXT: s_cbranch_vccz .LBB3_10
; GFX908-NEXT: .LBB3_6: ; %bb16
; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: s_add_u32 s22, s20, s9
Expand All @@ -625,9 +632,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: ds_read_b64 v[14:15], v0
; GFX908-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
; GFX908-NEXT: ; %bb.6: ; %bb51
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT: s_cbranch_vccnz .LBB3_8
; GFX908-NEXT: ; %bb.7: ; %bb51
; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21
; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
Expand All @@ -649,21 +656,20 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: v_add_f32_e32 v10, v10, v12
; GFX908-NEXT: v_add_f32_e32 v11, v11, v13
; GFX908-NEXT: s_branch .LBB3_4
; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
; GFX908-NEXT: .LBB3_8: ; in Loop: Header=BB3_6 Depth=2
; GFX908-NEXT: s_mov_b64 s[22:23], s[18:19]
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23]
; GFX908-NEXT: s_cbranch_vccz .LBB3_4
; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: ; %bb.9: ; in Loop: Header=BB3_6 Depth=2
; GFX908-NEXT: s_mov_b64 s[22:23], -1
; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX908-NEXT: ; implicit-def: $sgpr20_sgpr21
; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: s_xor_b64 s[18:19], s[22:23], -1
; GFX908-NEXT: s_mov_b64 s[24:25], -1
; GFX908-NEXT: s_branch .LBB3_5
; GFX908-NEXT: .LBB3_10: ; %Flow19
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: s_mov_b64 s[2:3], -1
; GFX908-NEXT: s_and_b64 vcc, exec, s[18:19]
; GFX908-NEXT: s_and_b64 vcc, exec, s[22:23]
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
; GFX908-NEXT: ; %bb.11: ; %bb12
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
Expand Down Expand Up @@ -730,8 +736,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
; GFX90A-NEXT: .LBB3_2: ; %bb9
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2
; GFX90A-NEXT: s_mov_b64 s[18:19], -1
; GFX90A-NEXT: ; Child Loop BB3_6 Depth 2
; GFX90A-NEXT: s_mov_b64 s[22:23], -1
; GFX90A-NEXT: s_mov_b64 vcc, s[0:1]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_10
; GFX90A-NEXT: ; %bb.3: ; %bb14
Expand All @@ -758,18 +764,25 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_add_i32 s13, s22, s13
; GFX90A-NEXT: s_mul_i32 s9, s6, s9
; GFX90A-NEXT: s_add_i32 s13, s13, s23
; GFX90A-NEXT: s_branch .LBB3_5
; GFX90A-NEXT: s_branch .LBB3_6
; GFX90A-NEXT: .LBB3_4: ; %bb58
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
; GFX90A-NEXT: s_add_u32 s20, s20, s4
; GFX90A-NEXT: s_addc_u32 s21, s21, s5
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
; GFX90A-NEXT: s_mov_b64 s[22:23], 0
; GFX90A-NEXT: .LBB3_5: ; %Flow18
; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX90A-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[22:23]
; GFX90A-NEXT: v_readfirstlane_b32 s22, v14
; GFX90A-NEXT: s_not_b32 s22, s22
; GFX90A-NEXT: s_bitcmp1_b32 s22, 0
; GFX90A-NEXT: s_cselect_b64 s[22:23], -1, 0
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_9
; GFX90A-NEXT: .LBB3_5: ; %bb16
; GFX90A-NEXT: s_cbranch_vccz .LBB3_10
; GFX90A-NEXT: .LBB3_6: ; %bb16
; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: s_add_u32 s22, s20, s9
Expand All @@ -787,9 +800,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_and_b64 vcc, exec, s[2:3]
; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
; GFX90A-NEXT: ; %bb.6: ; %bb51
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT: s_cbranch_vccnz .LBB3_8
; GFX90A-NEXT: ; %bb.7: ; %bb51
; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v21
; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
Expand All @@ -803,21 +816,20 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17]
; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15]
; GFX90A-NEXT: s_branch .LBB3_4
; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
; GFX90A-NEXT: .LBB3_8: ; in Loop: Header=BB3_6 Depth=2
; GFX90A-NEXT: s_mov_b64 s[22:23], s[18:19]
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: ; %bb.9: ; in Loop: Header=BB3_6 Depth=2
; GFX90A-NEXT: s_mov_b64 s[22:23], -1
; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21
; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: s_xor_b64 s[18:19], s[22:23], -1
; GFX90A-NEXT: s_mov_b64 s[24:25], -1
; GFX90A-NEXT: s_branch .LBB3_5
; GFX90A-NEXT: .LBB3_10: ; %Flow19
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: s_mov_b64 s[2:3], -1
; GFX90A-NEXT: s_and_b64 vcc, exec, s[18:19]
; GFX90A-NEXT: s_and_b64 vcc, exec, s[22:23]
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
; GFX90A-NEXT: ; %bb.11: ; %bb12
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
Expand Down
Loading
Loading