From 3ccef46d239ee1df7dddba4c2ef0e5d2c538bfd3 Mon Sep 17 00:00:00 2001 From: Matthew Curtis Date: Fri, 11 Jul 2025 05:33:54 -0500 Subject: [PATCH 1/7] AMDGPU: Fix assert when multi operands to update after folding imm In the original motivating test case, FoldList had entries: #0: UseMI: %224:sreg_32 = S_OR_B32 %219.sub0:sreg_64, %219.sub1:sreg_64, implicit-def dead $scc UseOpNo: 1 #1: UseMI: %224:sreg_32 = S_OR_B32 %219.sub0:sreg_64, %219.sub1:sreg_64, implicit-def dead $scc UseOpNo: 2 After calling updateOperand(#0), tryConstantFoldOp(#0.UseMI) removed operand 1, and entry #1.UseOpNo was no longer valid, resulting in an assert. This change defers constant folding until after all operands have been updated so that UseOpNo values remain valid. --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 14 +++-- ...bug-multi-operands-to-update-after-fold.ll | 58 +++++++++++++++++++ 2 files changed, 68 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 0ed06c37507af..0f2a932f984b1 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1761,6 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(*MF); + SmallVector ConstantFoldCandidates; for (FoldCandidate &Fold : FoldList) { assert(!Fold.isReg() || Fold.Def.OpToFold); if (Fold.isReg() && Fold.getReg().isVirtual()) { @@ -1783,16 +1784,21 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, << static_cast(Fold.UseOpNo) << " of " << *Fold.UseMI); - if (Fold.isImm() && tryConstantFoldOp(Fold.UseMI)) { - LLVM_DEBUG(dbgs() << "Constant folded " << *Fold.UseMI); - Changed = true; - } + if (Fold.isImm() && !is_contained(ConstantFoldCandidates, Fold.UseMI)) + ConstantFoldCandidates.push_back(Fold.UseMI); } else if (Fold.Commuted) { // Restoring instruction's original operand order if fold has failed. TII->commuteInstruction(*Fold.UseMI, false); } } + + for (MachineInstr *MI : ConstantFoldCandidates) { + if (tryConstantFoldOp(MI)) { + LLVM_DEBUG(dbgs() << "Constant folded " << *MI); + Changed = true; + } + } return true; } diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll new file mode 100644 index 0000000000000..a81fc6a25e43e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll @@ -0,0 +1,58 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -O3 -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -o - < %s | FileCheck %s + +%struct.bar = type { %struct.bar.0, %struct.bar.0, %struct.bar.0 } +%struct.bar.0 = type { %struct.blam } +%struct.blam = type { i32, i32, i32, i32 } + +@global = external addrspace(3) global %struct.bar + +define void @snork() { +; CHECK-LABEL: snork: +; CHECK: ; %bb.0: ; %bb +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: v_mov_b32_e32 v4, global@abs32@lo +; CHECK-NEXT: s_mov_b32 s5, s4 +; CHECK-NEXT: s_mov_b32 s6, s4 +; CHECK-NEXT: s_mov_b32 s7, s4 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: v_mov_b32_e32 v2, s6 +; CHECK-NEXT: v_mov_b32_e32 v3, s7 +; CHECK-NEXT: s_cmp_lg_u32 0, 0 +; CHECK-NEXT: ds_write_b128 v4, v[0:3] offset:32 +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: s_setpc_b64 s[30:31] +bb: + %call = call float @llvm.amdgcn.rcp.f32(float 0.000000e+00) + %fmul = fmul ninf float %call, 0.000000e+00 + %fptoui = fptoui float %fmul to i32 + %zext = zext i32 %fptoui to i64 + %mul = mul i64 2, %zext + %trunc = trunc i64 %mul to i32 + store i32 %trunc, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @global, i32 0, i32 2), align 16 + store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 36), align 4 + store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 40), align 8 + store i32 %trunc, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 44), align 4 + %load = load <4 x i32>, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @global, i32 0, i32 2), align 16 + %extractelement = extractelement <4 x i32> %load, i64 0 + %icmp = icmp ne i32 %extractelement, 0 + %extractelement1 = extractelement <4 x i32> %load, i64 3 + %icmp2 = icmp ne i32 %extractelement1, 0 + %select = select i1 %icmp, i1 true, i1 %icmp2 + br i1 %select, label %bb5, label %bb3 + +bb3: ; preds = %bb + %and = and <4 x i32> %load, splat (i32 1) + %extractelement4 = extractelement <4 x i32> %and, i64 0 + br label %bb5 + +bb5: ; preds = %bb3, %bb + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare float @llvm.amdgcn.rcp.f32(float) #0 + +attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } From 8e02ce545f87c4e94ddbe01594bac844e3c90b08 Mon Sep 17 00:00:00 2001 From: Matthew Curtis Date: Fri, 11 Jul 2025 12:28:36 -0500 Subject: [PATCH 2/7] fixup! AMDGPU: Fix assert when multi operands to update after folding imm --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 6 +- ...bug-multi-operands-to-update-after-fold.ll | 58 -------- ...ug-multi-operands-to-update-after-fold.mir | 128 ++++++++++++++++++ 3 files changed, 131 insertions(+), 61 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll create mode 100644 llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 0f2a932f984b1..e172c0b63189b 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -1761,7 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, for (MachineInstr *Copy : CopiesToReplace) Copy->addImplicitDefUseOperands(*MF); - SmallVector ConstantFoldCandidates; + SetVector ConstantFoldCandidates; for (FoldCandidate &Fold : FoldList) { assert(!Fold.isReg() || Fold.Def.OpToFold); if (Fold.isReg() && Fold.getReg().isVirtual()) { @@ -1784,8 +1784,8 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI, << static_cast(Fold.UseOpNo) << " of " << *Fold.UseMI); - if (Fold.isImm() && !is_contained(ConstantFoldCandidates, Fold.UseMI)) - ConstantFoldCandidates.push_back(Fold.UseMI); + if (Fold.isImm()) + ConstantFoldCandidates.insert(Fold.UseMI); } else if (Fold.Commuted) { // Restoring instruction's original operand order if fold has failed. diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll deleted file mode 100644 index a81fc6a25e43e..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll +++ /dev/null @@ -1,58 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -O3 -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -o - < %s | FileCheck %s - -%struct.bar = type { %struct.bar.0, %struct.bar.0, %struct.bar.0 } -%struct.bar.0 = type { %struct.blam } -%struct.blam = type { i32, i32, i32, i32 } - -@global = external addrspace(3) global %struct.bar - -define void @snork() { -; CHECK-LABEL: snork: -; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s4, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, global@abs32@lo -; CHECK-NEXT: s_mov_b32 s5, s4 -; CHECK-NEXT: s_mov_b32 s6, s4 -; CHECK-NEXT: s_mov_b32 s7, s4 -; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 -; CHECK-NEXT: s_cmp_lg_u32 0, 0 -; CHECK-NEXT: ds_write_b128 v4, v[0:3] offset:32 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] -bb: - %call = call float @llvm.amdgcn.rcp.f32(float 0.000000e+00) - %fmul = fmul ninf float %call, 0.000000e+00 - %fptoui = fptoui float %fmul to i32 - %zext = zext i32 %fptoui to i64 - %mul = mul i64 2, %zext - %trunc = trunc i64 %mul to i32 - store i32 %trunc, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @global, i32 0, i32 2), align 16 - store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 36), align 4 - store i32 0, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 40), align 8 - store i32 %trunc, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 44), align 4 - %load = load <4 x i32>, ptr addrspace(3) getelementptr inbounds (%struct.bar, ptr addrspace(3) @global, i32 0, i32 2), align 16 - %extractelement = extractelement <4 x i32> %load, i64 0 - %icmp = icmp ne i32 %extractelement, 0 - %extractelement1 = extractelement <4 x i32> %load, i64 3 - %icmp2 = icmp ne i32 %extractelement1, 0 - %select = select i1 %icmp, i1 true, i1 %icmp2 - br i1 %select, label %bb5, label %bb3 - -bb3: ; preds = %bb - %and = and <4 x i32> %load, splat (i32 1) - %extractelement4 = extractelement <4 x i32> %and, i64 0 - br label %bb5 - -bb5: ; preds = %bb3, %bb - ret void -} - -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare float @llvm.amdgcn.rcp.f32(float) #0 - -attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir new file mode 100644 index 0000000000000..da362bdacc90f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir @@ -0,0 +1,128 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s +--- | + %struct.bar = type { %struct.bar.0, %struct.bar.0, %struct.bar.0 } + %struct.bar.0 = type { %struct.blam } + %struct.blam = type { i32, i32, i32, i32 } + + @global = external addrspace(3) global %struct.bar + + define void @snork() { + bb: + %call = call float @llvm.amdgcn.rcp.f32(float 0.000000e+00) + %fmul = fmul ninf float %call, 0.000000e+00 + %fptoui = fptoui float %fmul to i32 + %zext = zext i32 %fptoui to i64 + %mul = mul i64 2, %zext + %trunc = trunc i64 %mul to i32 + %0 = insertelement <4 x i32> poison, i32 %trunc, i32 0 + %1 = insertelement <4 x i32> %0, i32 0, i32 1 + %2 = insertelement <4 x i32> %1, i32 0, i32 2 + %3 = insertelement <4 x i32> %2, i32 %trunc, i32 3 + store <4 x i32> %3, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32), align 16 + %load = load <4 x i32>, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32), align 16 + %extractelement = extractelement <4 x i32> %load, i64 0 + %icmp = icmp ne i32 %extractelement, 0 + %extractelement1 = extractelement <4 x i32> %load, i64 3 + %icmp2 = icmp ne i32 %extractelement1, 0 + %select = select i1 %icmp, i1 true, i1 %icmp2 + %select.inv = xor i1 %select, true + br i1 %select.inv, label %bb3, label %bb5, !amdgpu.uniform !0 + + bb3: ; preds = %bb + %and = and <4 x i32> %load, splat (i32 1) + br label %bb5, !amdgpu.uniform !0 + + bb5: ; preds = %bb3, %bb + ret void + } + + ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) + declare float @llvm.amdgcn.rcp.f32(float) + + !0 = !{} +... +--- +name: snork +alignment: 1 +tracksRegLiveness: true +noPhis: false +isSSA: true +noVRegs: false +hasFakeUses: false +registers: + - { id: 0, class: sgpr_128 } + - { id: 1, class: sgpr_64 } + - { id: 2, class: sgpr_64 } + - { id: 3, class: sgpr_64 } + - { id: 4, class: sgpr_64 } + - { id: 5, class: sgpr_32 } + - { id: 6, class: sgpr_32 } + - { id: 7, class: sgpr_32 } + - { id: 8, class: sgpr_32 } + - { id: 9, class: sreg_32 } + - { id: 10, class: sgpr_128 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vreg_128 } + - { id: 13, class: sreg_32 } + - { id: 14, class: sreg_32 } + - { id: 15, class: sreg_32 } +frameInfo: + maxAlignment: 1 +machineFunctionInfo: + maxKernArgAlign: 1 + scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' + frameOffsetReg: '$sgpr33' + stackPtrOffsetReg: '$sgpr32' + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + queuePtr: { reg: '$sgpr6_sgpr7' } + dispatchID: { reg: '$sgpr10_sgpr11' } + workGroupIDX: { reg: '$sgpr12' } + workGroupIDY: { reg: '$sgpr13' } + workGroupIDZ: { reg: '$sgpr14' } + LDSKernelId: { reg: '$sgpr15' } + implicitArgPtr: { reg: '$sgpr8_sgpr9' } + workItemIDX: { reg: '$vgpr31', mask: 1023 } + workItemIDY: { reg: '$vgpr31', mask: 1047552 } + workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } + occupancy: 16 + sgprForEXECCopy: '$sgpr105' +body: | + ; CHECK-LABEL: name: snork + ; CHECK: bb.0.bb: + ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] + ; CHECK-NEXT: DS_WRITE_B128_gfx9 killed [[V_MOV_B32_e32_]], [[COPY]], 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3) + ; CHECK-NEXT: S_CMP_LG_U32 0, 0, implicit-def $scc + ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.bb3: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.bb5: + ; CHECK-NEXT: SI_RETURN + bb.0.bb: + successors: %bb.1, %bb.2 + + %9:sreg_32 = S_MOV_B32 0 + %10:sgpr_128 = REG_SEQUENCE %9, %subreg.sub0, %9, %subreg.sub1, %9, %subreg.sub2, %9, %subreg.sub3 + %11:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec + %12:vreg_128 = COPY %10 + DS_WRITE_B128_gfx9 killed %11, %12, 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3) + %15:sreg_32 = S_OR_B32 %10.sub0, %10.sub3, implicit-def dead $scc + S_CMP_LG_U32 killed %15, 0, implicit-def $scc + S_CBRANCH_SCC1 %bb.2, implicit $scc + S_BRANCH %bb.1 + + bb.1.bb3: + + bb.2.bb5: + SI_RETURN +... From be9c98ffa95d2e55dd917bf7f944b2421db860f3 Mon Sep 17 00:00:00 2001 From: Matthew Curtis Date: Fri, 11 Jul 2025 12:36:56 -0500 Subject: [PATCH 3/7] fixup! AMDGPU: Fix assert when multi operands to update after folding imm --- .../CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir index da362bdacc90f..95095c132f879 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir @@ -37,7 +37,6 @@ ret void } - ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) declare float @llvm.amdgcn.rcp.f32(float) !0 = !{} From 34c61fc3ef615430b2912e79716350439740ec84 Mon Sep 17 00:00:00 2001 From: Matthew Curtis Date: Fri, 11 Jul 2025 15:51:48 -0500 Subject: [PATCH 4/7] fixup! AMDGPU: Fix assert when multi operands to update after folding imm --- ...ug-multi-operands-to-update-after-fold.mir | 82 ++----------------- 1 file changed, 5 insertions(+), 77 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir index 95095c132f879..027b6fd0b014d 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir @@ -1,93 +1,21 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s --- | - %struct.bar = type { %struct.bar.0, %struct.bar.0, %struct.bar.0 } - %struct.bar.0 = type { %struct.blam } - %struct.blam = type { i32, i32, i32, i32 } - - @global = external addrspace(3) global %struct.bar + @global = external addrspace(3) global i32 define void @snork() { bb: - %call = call float @llvm.amdgcn.rcp.f32(float 0.000000e+00) - %fmul = fmul ninf float %call, 0.000000e+00 - %fptoui = fptoui float %fmul to i32 - %zext = zext i32 %fptoui to i64 - %mul = mul i64 2, %zext - %trunc = trunc i64 %mul to i32 - %0 = insertelement <4 x i32> poison, i32 %trunc, i32 0 - %1 = insertelement <4 x i32> %0, i32 0, i32 1 - %2 = insertelement <4 x i32> %1, i32 0, i32 2 - %3 = insertelement <4 x i32> %2, i32 %trunc, i32 3 - store <4 x i32> %3, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32), align 16 - %load = load <4 x i32>, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32), align 16 - %extractelement = extractelement <4 x i32> %load, i64 0 - %icmp = icmp ne i32 %extractelement, 0 - %extractelement1 = extractelement <4 x i32> %load, i64 3 - %icmp2 = icmp ne i32 %extractelement1, 0 - %select = select i1 %icmp, i1 true, i1 %icmp2 - %select.inv = xor i1 %select, true - br i1 %select.inv, label %bb3, label %bb5, !amdgpu.uniform !0 + br label %bb3 - bb3: ; preds = %bb - %and = and <4 x i32> %load, splat (i32 1) - br label %bb5, !amdgpu.uniform !0 + bb3: + br label %bb5 - bb5: ; preds = %bb3, %bb + bb5: ret void } - - declare float @llvm.amdgcn.rcp.f32(float) - - !0 = !{} ... --- name: snork -alignment: 1 -tracksRegLiveness: true -noPhis: false -isSSA: true -noVRegs: false -hasFakeUses: false -registers: - - { id: 0, class: sgpr_128 } - - { id: 1, class: sgpr_64 } - - { id: 2, class: sgpr_64 } - - { id: 3, class: sgpr_64 } - - { id: 4, class: sgpr_64 } - - { id: 5, class: sgpr_32 } - - { id: 6, class: sgpr_32 } - - { id: 7, class: sgpr_32 } - - { id: 8, class: sgpr_32 } - - { id: 9, class: sreg_32 } - - { id: 10, class: sgpr_128 } - - { id: 11, class: vgpr_32 } - - { id: 12, class: vreg_128 } - - { id: 13, class: sreg_32 } - - { id: 14, class: sreg_32 } - - { id: 15, class: sreg_32 } -frameInfo: - maxAlignment: 1 -machineFunctionInfo: - maxKernArgAlign: 1 - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - frameOffsetReg: '$sgpr33' - stackPtrOffsetReg: '$sgpr32' - argumentInfo: - privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } - dispatchPtr: { reg: '$sgpr4_sgpr5' } - queuePtr: { reg: '$sgpr6_sgpr7' } - dispatchID: { reg: '$sgpr10_sgpr11' } - workGroupIDX: { reg: '$sgpr12' } - workGroupIDY: { reg: '$sgpr13' } - workGroupIDZ: { reg: '$sgpr14' } - LDSKernelId: { reg: '$sgpr15' } - implicitArgPtr: { reg: '$sgpr8_sgpr9' } - workItemIDX: { reg: '$vgpr31', mask: 1023 } - workItemIDY: { reg: '$vgpr31', mask: 1047552 } - workItemIDZ: { reg: '$vgpr31', mask: 1072693248 } - occupancy: 16 - sgprForEXECCopy: '$sgpr105' body: | ; CHECK-LABEL: name: snork ; CHECK: bb.0.bb: From 269cae130061e31d7fbbfe02fa81048928e0d747 Mon Sep 17 00:00:00 2001 From: Matthew Curtis Date: Mon, 14 Jul 2025 08:01:43 -0500 Subject: [PATCH 5/7] fixup! AMDGPU: Fix assert when multi operands to update after folding imm --- ...ug-multi-operands-to-update-after-fold.mir | 55 ++++--------------- 1 file changed, 10 insertions(+), 45 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir index 027b6fd0b014d..ec4ed94e25b79 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir @@ -1,55 +1,20 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s --- | - @global = external addrspace(3) global i32 - define void @snork() { - bb: - br label %bb3 - - bb3: - br label %bb5 - - bb5: ret void } ... --- -name: snork -body: | - ; CHECK-LABEL: name: snork - ; CHECK: bb.0.bb: - ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]] - ; CHECK-NEXT: DS_WRITE_B128_gfx9 killed [[V_MOV_B32_e32_]], [[COPY]], 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3) - ; CHECK-NEXT: S_CMP_LG_U32 0, 0, implicit-def $scc - ; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc - ; CHECK-NEXT: S_BRANCH %bb.1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1.bb3: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2.bb5: - ; CHECK-NEXT: SI_RETURN - bb.0.bb: - successors: %bb.1, %bb.2 - - %9:sreg_32 = S_MOV_B32 0 - %10:sgpr_128 = REG_SEQUENCE %9, %subreg.sub0, %9, %subreg.sub1, %9, %subreg.sub2, %9, %subreg.sub3 - %11:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec - %12:vreg_128 = COPY %10 - DS_WRITE_B128_gfx9 killed %11, %12, 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3) - %15:sreg_32 = S_OR_B32 %10.sub0, %10.sub3, implicit-def dead $scc - S_CMP_LG_U32 killed %15, 0, implicit-def $scc - S_CBRANCH_SCC1 %bb.2, implicit $scc - S_BRANCH %bb.1 - - bb.1.bb3: - - bb.2.bb5: +name: snork +body: | + bb.0: + ; CHECK-LABEL: name: snork + ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE undef [[S_MOV_B32_]], %subreg.sub0, undef [[S_MOV_B32_]], %subreg.sub1, undef [[S_MOV_B32_]], %subreg.sub2, undef [[S_MOV_B32_]], %subreg.sub3 + ; CHECK-NEXT: SI_RETURN + %0:sreg_32 = S_MOV_B32 0 + %1:sgpr_128 = REG_SEQUENCE undef %0, %subreg.sub0, undef %0, %subreg.sub1, undef %0, %subreg.sub2, undef %0, %subreg.sub3 + %2:sreg_32 = S_OR_B32 undef %1.sub0, undef %1.sub3, implicit-def dead $scc SI_RETURN ... From bc2d5b14615243e160c1d706a94b12044a6a8211 Mon Sep 17 00:00:00 2001 From: Matthew Curtis Date: Tue, 15 Jul 2025 03:57:06 -0500 Subject: [PATCH 6/7] fixup! AMDGPU: Fix assert when multi operands to update after folding imm --- .../AMDGPU/bug-multi-operands-to-update-after-fold.mir | 5 ----- 1 file changed, 5 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir index ec4ed94e25b79..66c13b3f969a6 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir @@ -1,10 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 # RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s ---- | - define void @snork() { - ret void - } -... --- name: snork body: | From 4c444512288204730d09d541cfec0f0807879546 Mon Sep 17 00:00:00 2001 From: Matthew Curtis Date: Tue, 15 Jul 2025 10:34:04 -0500 Subject: [PATCH 7/7] fixup! AMDGPU: Fix assert when multi operands to update after folding imm --- .../AMDGPU/bug-multi-operands-to-update-after-fold.mir | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir index 66c13b3f969a6..d0c9740c6954e 100644 --- a/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir +++ b/llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.mir @@ -6,10 +6,10 @@ body: | bb.0: ; CHECK-LABEL: name: snork ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 - ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE undef [[S_MOV_B32_]], %subreg.sub0, undef [[S_MOV_B32_]], %subreg.sub1, undef [[S_MOV_B32_]], %subreg.sub2, undef [[S_MOV_B32_]], %subreg.sub3 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 ; CHECK-NEXT: SI_RETURN %0:sreg_32 = S_MOV_B32 0 - %1:sgpr_128 = REG_SEQUENCE undef %0, %subreg.sub0, undef %0, %subreg.sub1, undef %0, %subreg.sub2, undef %0, %subreg.sub3 - %2:sreg_32 = S_OR_B32 undef %1.sub0, undef %1.sub3, implicit-def dead $scc + %1:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %0, %subreg.sub1, %0, %subreg.sub2, %0, %subreg.sub3 + %2:sreg_32 = S_OR_B32 %1.sub0, %1.sub3, implicit-def dead $scc SI_RETURN ...