fixup! AMDGPU: Fix assert when multi operands to update after folding imm

macurtis-amd · macurtis-amd · commit 8e02ce545f87 · 2025-07-16T05:14:01.000-05:00
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1761,7 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
   for (MachineInstr *Copy : CopiesToReplace)
     Copy->addImplicitDefUseOperands(*MF);
 
-  SmallVector<MachineInstr *, 4> ConstantFoldCandidates;
+  SetVector<MachineInstr *> ConstantFoldCandidates;
   for (FoldCandidate &Fold : FoldList) {
     assert(!Fold.isReg() || Fold.Def.OpToFold);
     if (Fold.isReg() && Fold.getReg().isVirtual()) {
@@ -1784,8 +1784,8 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
                         << static_cast<int>(Fold.UseOpNo) << " of "
                         << *Fold.UseMI);
 
-      if (Fold.isImm() && !is_contained(ConstantFoldCandidates, Fold.UseMI))
-        ConstantFoldCandidates.push_back(Fold.UseMI);
+      if (Fold.isImm())
+        ConstantFoldCandidates.insert(Fold.UseMI);
 
     } else if (Fold.Commuted) {
       // Restoring instruction's original operand order if fold has failed.
diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll
diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir
@@ -0,0 +1,128 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s
+--- |
+  %struct.bar = type { %struct.bar.0, %struct.bar.0, %struct.bar.0 }
+  %struct.bar.0 = type { %struct.blam }
+  %struct.blam = type { i32, i32, i32, i32 }
+
+  @global = external addrspace(3) global %struct.bar
+
+  define void @snork() {
+  bb:
+    %call = call float @llvm.amdgcn.rcp.f32(float 0.000000e+00)
+    %fmul = fmul ninf float %call, 0.000000e+00
+    %fptoui = fptoui float %fmul to i32
+    %zext = zext i32 %fptoui to i64
+    %mul = mul i64 2, %zext
+    %trunc = trunc i64 %mul to i32
+    %0 = insertelement <4 x i32> poison, i32 %trunc, i32 0
+    %1 = insertelement <4 x i32> %0, i32 0, i32 1
+    %2 = insertelement <4 x i32> %1, i32 0, i32 2
+    %3 = insertelement <4 x i32> %2, i32 %trunc, i32 3
+    store <4 x i32> %3, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32), align 16
+    %load = load <4 x i32>, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32), align 16
+    %extractelement = extractelement <4 x i32> %load, i64 0
+    %icmp = icmp ne i32 %extractelement, 0
+    %extractelement1 = extractelement <4 x i32> %load, i64 3
+    %icmp2 = icmp ne i32 %extractelement1, 0
+    %select = select i1 %icmp, i1 true, i1 %icmp2
+    %select.inv = xor i1 %select, true
+    br i1 %select.inv, label %bb3, label %bb5, !amdgpu.uniform !0
+
+  bb3:                                              ; preds = %bb
+    %and = and <4 x i32> %load, splat (i32 1)
+    br label %bb5, !amdgpu.uniform !0
+
+  bb5:                                              ; preds = %bb3, %bb
+    ret void
+  }
+
+  ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+  declare float @llvm.amdgcn.rcp.f32(float)
+
+  !0 = !{}
+...
+---
+name:            snork
+alignment:       1
+tracksRegLiveness: true
+noPhis:          false
+isSSA:           true
+noVRegs:         false
+hasFakeUses:     false
+registers:
+  - { id: 0, class: sgpr_128 }
+  - { id: 1, class: sgpr_64 }
+  - { id: 2, class: sgpr_64 }
+  - { id: 3, class: sgpr_64 }
+  - { id: 4, class: sgpr_64 }
+  - { id: 5, class: sgpr_32 }
+  - { id: 6, class: sgpr_32 }
+  - { id: 7, class: sgpr_32 }
+  - { id: 8, class: sgpr_32 }
+  - { id: 9, class: sreg_32 }
+  - { id: 10, class: sgpr_128 }
+  - { id: 11, class: vgpr_32 }
+  - { id: 12, class: vreg_128 }
+  - { id: 13, class: sreg_32 }
+  - { id: 14, class: sreg_32 }
+  - { id: 15, class: sreg_32 }
+frameInfo:
+  maxAlignment:    1
+machineFunctionInfo:
+  maxKernArgAlign: 1
+  scratchRSrcReg:  '$sgpr0_sgpr1_sgpr2_sgpr3'
+  frameOffsetReg:  '$sgpr33'
+  stackPtrOffsetReg: '$sgpr32'
+  argumentInfo:
+    privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+    dispatchPtr:     { reg: '$sgpr4_sgpr5' }
+    queuePtr:        { reg: '$sgpr6_sgpr7' }
+    dispatchID:      { reg: '$sgpr10_sgpr11' }
+    workGroupIDX:    { reg: '$sgpr12' }
+    workGroupIDY:    { reg: '$sgpr13' }
+    workGroupIDZ:    { reg: '$sgpr14' }
+    LDSKernelId:     { reg: '$sgpr15' }
+    implicitArgPtr:  { reg: '$sgpr8_sgpr9' }
+    workItemIDX:     { reg: '$vgpr31', mask: 1023 }
+    workItemIDY:     { reg: '$vgpr31', mask: 1047552 }
+    workItemIDZ:     { reg: '$vgpr31', mask: 1072693248 }
+  occupancy:       16
+  sgprForEXECCopy: '$sgpr105'
+body:             |
+  ; CHECK-LABEL: name: snork
+  ; CHECK: bb.0.bb:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
+  ; CHECK-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
+  ; CHECK-NEXT:   [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
+  ; CHECK-NEXT:   DS_WRITE_B128_gfx9 killed [[V_MOV_B32_e32_]], [[COPY]], 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3)
+  ; CHECK-NEXT:   S_CMP_LG_U32 0, 0, implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_SCC1 %bb.2, implicit $scc
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1.bb3:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.bb5:
+  ; CHECK-NEXT:   SI_RETURN
+  bb.0.bb:
+    successors: %bb.1, %bb.2
+
+    %9:sreg_32 = S_MOV_B32 0 ; the single immediate (0) that every later use traces back to
+    %10:sgpr_128 = REG_SEQUENCE %9, %subreg.sub0, %9, %subreg.sub1, %9, %subreg.sub2, %9, %subreg.sub3 ; all four subregisters of %10 are built from %9
+    %11:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec
+    %12:vreg_128 = COPY %10
+    DS_WRITE_B128_gfx9 killed %11, %12, 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3)
+    %15:sreg_32 = S_OR_B32 %10.sub0, %10.sub3, implicit-def dead $scc ; both sources come from %9, so two operands of this one instruction receive the same immediate - presumably the multi-operand case this test targets
+    S_CMP_LG_U32 killed %15, 0, implicit-def $scc ; the CHECK lines expect the S_OR_B32 to be gone and this to read S_CMP_LG_U32 0, 0
+    S_CBRANCH_SCC1 %bb.2, implicit $scc
+    S_BRANCH %bb.1
+
+  bb.1.bb3:
+
+  bb.2.bb5:
+    SI_RETURN
+...