Skip to content

Commit 8e02ce5

Browse files
committed
fixup! AMDGPU: Fix assert when multi operands to update after folding imm
1 parent 3ccef46 commit 8e02ce5

File tree

3 files changed

+131
-61
lines changed

3 files changed

+131
-61
lines changed

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1761,7 +1761,7 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
17611761
for (MachineInstr *Copy : CopiesToReplace)
17621762
Copy->addImplicitDefUseOperands(*MF);
17631763

1764-
SmallVector<MachineInstr *, 4> ConstantFoldCandidates;
1764+
SetVector<MachineInstr *> ConstantFoldCandidates;
17651765
for (FoldCandidate &Fold : FoldList) {
17661766
assert(!Fold.isReg() || Fold.Def.OpToFold);
17671767
if (Fold.isReg() && Fold.getReg().isVirtual()) {
@@ -1784,8 +1784,8 @@ bool SIFoldOperandsImpl::foldInstOperand(MachineInstr &MI,
17841784
<< static_cast<int>(Fold.UseOpNo) << " of "
17851785
<< *Fold.UseMI);
17861786

1787-
if (Fold.isImm() && !is_contained(ConstantFoldCandidates, Fold.UseMI))
1788-
ConstantFoldCandidates.push_back(Fold.UseMI);
1787+
if (Fold.isImm())
1788+
ConstantFoldCandidates.insert(Fold.UseMI);
17891789

17901790
} else if (Fold.Commuted) {
17911791
// Restoring instruction's original operand order if fold has failed.

llvm/test/CodeGen/AMDGPU/bug-multi-operands-to-update-after-fold.ll

Lines changed: 0 additions & 58 deletions
This file was deleted.
Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx1031 -run-pass=si-fold-operands -o - %s | FileCheck %s
3+
--- |
4+
%struct.bar = type { %struct.bar.0, %struct.bar.0, %struct.bar.0 }
5+
%struct.bar.0 = type { %struct.blam }
6+
%struct.blam = type { i32, i32, i32, i32 }
7+
8+
@global = external addrspace(3) global %struct.bar
9+
10+
define void @snork() {
11+
bb:
12+
%call = call float @llvm.amdgcn.rcp.f32(float 0.000000e+00)
13+
%fmul = fmul ninf float %call, 0.000000e+00
14+
%fptoui = fptoui float %fmul to i32
15+
%zext = zext i32 %fptoui to i64
16+
%mul = mul i64 2, %zext
17+
%trunc = trunc i64 %mul to i32
18+
%0 = insertelement <4 x i32> poison, i32 %trunc, i32 0
19+
%1 = insertelement <4 x i32> %0, i32 0, i32 1
20+
%2 = insertelement <4 x i32> %1, i32 0, i32 2
21+
%3 = insertelement <4 x i32> %2, i32 %trunc, i32 3
22+
store <4 x i32> %3, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32), align 16
23+
%load = load <4 x i32>, ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32), align 16
24+
%extractelement = extractelement <4 x i32> %load, i64 0
25+
%icmp = icmp ne i32 %extractelement, 0
26+
%extractelement1 = extractelement <4 x i32> %load, i64 3
27+
%icmp2 = icmp ne i32 %extractelement1, 0
28+
%select = select i1 %icmp, i1 true, i1 %icmp2
29+
%select.inv = xor i1 %select, true
30+
br i1 %select.inv, label %bb3, label %bb5, !amdgpu.uniform !0
31+
32+
bb3: ; preds = %bb
33+
%and = and <4 x i32> %load, splat (i32 1)
34+
br label %bb5, !amdgpu.uniform !0
35+
36+
bb5: ; preds = %bb3, %bb
37+
ret void
38+
}
39+
40+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
41+
declare float @llvm.amdgcn.rcp.f32(float)
42+
43+
!0 = !{}
44+
...
45+
---
46+
name: snork
47+
alignment: 1
48+
tracksRegLiveness: true
49+
noPhis: false
50+
isSSA: true
51+
noVRegs: false
52+
hasFakeUses: false
53+
registers:
54+
- { id: 0, class: sgpr_128 }
55+
- { id: 1, class: sgpr_64 }
56+
- { id: 2, class: sgpr_64 }
57+
- { id: 3, class: sgpr_64 }
58+
- { id: 4, class: sgpr_64 }
59+
- { id: 5, class: sgpr_32 }
60+
- { id: 6, class: sgpr_32 }
61+
- { id: 7, class: sgpr_32 }
62+
- { id: 8, class: sgpr_32 }
63+
- { id: 9, class: sreg_32 }
64+
- { id: 10, class: sgpr_128 }
65+
- { id: 11, class: vgpr_32 }
66+
- { id: 12, class: vreg_128 }
67+
- { id: 13, class: sreg_32 }
68+
- { id: 14, class: sreg_32 }
69+
- { id: 15, class: sreg_32 }
70+
frameInfo:
71+
maxAlignment: 1
72+
machineFunctionInfo:
73+
maxKernArgAlign: 1
74+
scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
75+
frameOffsetReg: '$sgpr33'
76+
stackPtrOffsetReg: '$sgpr32'
77+
argumentInfo:
78+
privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
79+
dispatchPtr: { reg: '$sgpr4_sgpr5' }
80+
queuePtr: { reg: '$sgpr6_sgpr7' }
81+
dispatchID: { reg: '$sgpr10_sgpr11' }
82+
workGroupIDX: { reg: '$sgpr12' }
83+
workGroupIDY: { reg: '$sgpr13' }
84+
workGroupIDZ: { reg: '$sgpr14' }
85+
LDSKernelId: { reg: '$sgpr15' }
86+
implicitArgPtr: { reg: '$sgpr8_sgpr9' }
87+
workItemIDX: { reg: '$vgpr31', mask: 1023 }
88+
workItemIDY: { reg: '$vgpr31', mask: 1047552 }
89+
workItemIDZ: { reg: '$vgpr31', mask: 1072693248 }
90+
occupancy: 16
91+
sgprForEXECCopy: '$sgpr105'
92+
body: |
93+
; CHECK-LABEL: name: snork
94+
; CHECK: bb.0.bb:
95+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
96+
; CHECK-NEXT: {{ $}}
97+
; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
98+
; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3
99+
; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec
100+
; CHECK-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY [[REG_SEQUENCE]]
101+
; CHECK-NEXT: DS_WRITE_B128_gfx9 killed [[V_MOV_B32_e32_]], [[COPY]], 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3)
102+
; CHECK-NEXT: S_CMP_LG_U32 0, 0, implicit-def $scc
103+
; CHECK-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
104+
; CHECK-NEXT: S_BRANCH %bb.1
105+
; CHECK-NEXT: {{ $}}
106+
; CHECK-NEXT: bb.1.bb3:
107+
; CHECK-NEXT: successors: %bb.2(0x80000000)
108+
; CHECK-NEXT: {{ $}}
109+
; CHECK-NEXT: bb.2.bb5:
110+
; CHECK-NEXT: SI_RETURN
111+
bb.0.bb:
112+
successors: %bb.1, %bb.2
113+
114+
%9:sreg_32 = S_MOV_B32 0
115+
%10:sgpr_128 = REG_SEQUENCE %9, %subreg.sub0, %9, %subreg.sub1, %9, %subreg.sub2, %9, %subreg.sub3
116+
%11:vgpr_32 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @global, implicit $exec
117+
%12:vreg_128 = COPY %10
118+
DS_WRITE_B128_gfx9 killed %11, %12, 32, 0, implicit $exec :: (store (s128) into `ptr addrspace(3) getelementptr inbounds nuw (i8, ptr addrspace(3) @global, i32 32)`, addrspace 3)
119+
%15:sreg_32 = S_OR_B32 %10.sub0, %10.sub3, implicit-def dead $scc
120+
S_CMP_LG_U32 killed %15, 0, implicit-def $scc
121+
S_CBRANCH_SCC1 %bb.2, implicit $scc
122+
S_BRANCH %bb.1
123+
124+
bb.1.bb3:
125+
126+
bb.2.bb5:
127+
SI_RETURN
128+
...

0 commit comments

Comments
 (0)