Skip to content

Commit cb2e331

Browse files
author
git apple-llvm automerger
committed
Merge commit '857a04cd7670' from llvm.org/main into next
2 parents 00bffa9 + 857a04c commit cb2e331

File tree

2 files changed

+252
-0
lines changed

2 files changed

+252
-0
lines changed

llvm/lib/CodeGen/MachinePipeliner.cpp

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3297,6 +3297,32 @@ bool SMSchedule::normalizeNonPipelinedInstructions(
32973297
<< ") is not pipelined; moving from cycle " << OldCycle
32983298
<< " to " << NewCycle << " Instr:" << *SU.getInstr());
32993299
}
3300+
3301+
// We traverse the SUs in the order of the original basic block. Computing
3302+
// NewCycle in this order normally works fine because all dependencies
3303+
// (except for loop-carried dependencies) don't violate the original order.
3304+
// However, an artificial dependency (e.g., added by CopyToPhiMutation) can
3305+
// break it. That is, there may exist an artificial dependency from
3306+
// bottom to top. In such a case, NewCycle may become too large to be
3307+
// scheduled in Stage 0. For example, assume that Inst0 is in DNP in the
3308+
// following case:
3309+
//
3310+
// | Inst0 <-+
3311+
// SU order | | artificial dep
3312+
// | Inst1 --+
3313+
// v
3314+
//
3315+
// If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle of
3316+
// Inst0 must be greater than or equal to N so that Inst0 is not
3317+
// scheduled at Stage 0. In such cases, we reject this schedule at this
3318+
// time.
3319+
// FIXME: The reason for this is the existence of artificial dependencies
3320+
// that contradict the original SU order. If ignoring artificial
3321+
// dependencies does not affect correctness, then it is better to ignore
3322+
// them.
3323+
if (FirstCycle + InitiationInterval <= NewCycle)
3324+
return false;
3325+
33003326
NewLastCycle = std::max(NewLastCycle, NewCycle);
33013327
}
33023328
LastCycle = NewLastCycle;
Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
# RUN: llc --verify-machineinstrs -mtriple=aarch64 -run-pass=pipeliner -o - %s -aarch64-enable-pipeliner -pipeliner-enable-copytophi=1
2+
3+
--- |
4+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
5+
6+
@glb = internal unnamed_addr global { [256 x i32], [256 x i32], [256 x i32] } zeroinitializer
7+
8+
; Function Attrs: nounwind vscale_range(1,16)
9+
define internal void @f(i32 %0, i32 %1) #0 {
10+
entry:
11+
%reass.sub = sub i32 %1, %0
12+
%invariant.op = add i32 %0, 1
13+
%invariant.op3 = add i32 %0, 2
14+
%omp_loop.cmp5.not = icmp eq i32 %reass.sub, -1
15+
br i1 %omp_loop.cmp5.not, label %exit, label %preheader
16+
17+
preheader: ; preds = %entry
18+
%2 = add i32 %1, 1
19+
%3 = icmp slt i32 %2, %invariant.op
20+
br i1 %3, label %body.preheader, label %vector.ph
21+
22+
body.preheader: ; preds = %preheader
23+
%4 = add i32 %1, 1
24+
%5 = sub i32 %4, %0
25+
br label %body
26+
27+
vector.ph: ; preds = %preheader
28+
%6 = add i32 %1, 1
29+
%7 = sub i32 %6, %0
30+
%8 = tail call i32 @llvm.vscale.i32()
31+
%9 = shl nuw nsw i32 %8, 2
32+
%10 = tail call i32 @llvm.vscale.i32()
33+
%11 = shl nuw nsw i32 %10, 2
34+
%12 = call i32 @llvm.usub.sat.i32(i32 %7, i32 %11)
35+
%active.lane.mask.entry = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 0, i32 %7)
36+
%13 = tail call <vscale x 4 x i32> @llvm.stepvector.nxv4i32()
37+
%.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %9, i64 0
38+
%.splat = shufflevector <vscale x 4 x i32> %.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
39+
%broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %invariant.op, i64 0
40+
%broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
41+
%broadcast.splatinsert7 = insertelement <vscale x 4 x i32> poison, i32 %invariant.op3, i64 0
42+
%broadcast.splat8 = shufflevector <vscale x 4 x i32> %broadcast.splatinsert7, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
43+
br label %vector.body
44+
45+
vector.body: ; preds = %vector.body, %vector.ph
46+
%index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
47+
%active.lane.mask = phi <vscale x 4 x i1> [ %active.lane.mask.entry, %vector.ph ], [ %active.lane.mask.next, %vector.body ]
48+
%vec.ind = phi <vscale x 4 x i32> [ %13, %vector.ph ], [ %vec.ind.next, %vector.body ]
49+
%14 = add <vscale x 4 x i32> %vec.ind, %broadcast.splat
50+
%15 = extractelement <vscale x 4 x i32> %14, i64 0
51+
%16 = sext i32 %15 to i64
52+
%17 = add nsw i64 %16, -1
53+
%18 = getelementptr i32, ptr @glb, i64 %17
54+
call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %14, ptr %18, i32 4, <vscale x 4 x i1> %active.lane.mask)
55+
%19 = add <vscale x 4 x i32> %vec.ind, %broadcast.splat8
56+
%20 = mul <vscale x 4 x i32> %14, %19
57+
%21 = sdiv <vscale x 4 x i32> %20, splat (i32 2)
58+
%22 = getelementptr i32, ptr getelementptr inbounds nuw (i8, ptr @glb, i64 1024), i64 %17
59+
call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %21, ptr %22, i32 4, <vscale x 4 x i1> %active.lane.mask)
60+
%23 = getelementptr i32, ptr getelementptr inbounds nuw (i8, ptr @glb, i64 2048), i64 %17
61+
%wide.masked.load = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr %23, i32 4, <vscale x 4 x i1> %active.lane.mask, <vscale x 4 x i32> poison)
62+
%24 = add <vscale x 4 x i32> %wide.masked.load, %21
63+
call void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32> %24, ptr %23, i32 4, <vscale x 4 x i1> %active.lane.mask)
64+
%25 = tail call i32 @llvm.vscale.i32()
65+
%26 = shl nuw nsw i32 %25, 2
66+
%index.next = add i32 %index, %26
67+
%active.lane.mask.next = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32 %index, i32 %12)
68+
%vec.ind.next = add <vscale x 4 x i32> %vec.ind, %.splat
69+
%27 = extractelement <vscale x 4 x i1> %active.lane.mask.next, i64 0
70+
br i1 %27, label %vector.body, label %exit
71+
72+
exit: ; preds = %vector.body, %body, %entry
73+
ret void
74+
75+
body: ; preds = %body.preheader, %body
76+
%lsr.iv2 = phi i32 [ %invariant.op3, %body.preheader ], [ %lsr.iv.next3, %body ]
77+
%lsr.iv = phi i32 [ %5, %body.preheader ], [ %lsr.iv.next, %body ]
78+
%28 = add i32 %lsr.iv2, -1
79+
%29 = sext i32 %28 to i64
80+
%30 = add nsw i64 %29, -1
81+
%31 = getelementptr i32, ptr @glb, i64 %30
82+
store i32 %28, ptr %31, align 4
83+
%32 = mul i32 %28, %lsr.iv2
84+
%33 = sdiv i32 %32, 2
85+
%34 = getelementptr i32, ptr getelementptr inbounds nuw (i8, ptr @glb, i64 1024), i64 %30
86+
store i32 %33, ptr %34, align 4
87+
%35 = getelementptr i32, ptr getelementptr inbounds nuw (i8, ptr @glb, i64 2048), i64 %30
88+
%36 = load i32, ptr %35, align 4
89+
%37 = add i32 %36, %33
90+
store i32 %37, ptr %35, align 4
91+
%lsr.iv.next = add i32 %lsr.iv, -1
92+
%lsr.iv.next3 = add i32 %lsr.iv2, 1
93+
%exitcond.not = icmp eq i32 %lsr.iv.next, 0
94+
br i1 %exitcond.not, label %exit, label %body
95+
}
96+
97+
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
98+
declare <vscale x 4 x i32> @llvm.stepvector.nxv4i32() #1
99+
100+
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
101+
declare i32 @llvm.vscale.i32() #1
102+
103+
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
104+
declare <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i32(i32, i32) #1
105+
106+
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
107+
declare void @llvm.masked.store.nxv4i32.p0(<vscale x 4 x i32>, ptr captures(none), i32 immarg, <vscale x 4 x i1>) #2
108+
109+
; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: read)
110+
declare <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0(ptr captures(none), i32 immarg, <vscale x 4 x i1>, <vscale x 4 x i32>) #3
111+
112+
; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
113+
declare i32 @llvm.usub.sat.i32(i32, i32) #4
114+
115+
attributes #0 = { nounwind vscale_range(1,16) "frame-pointer"="non-leaf" "target-cpu"="neoverse-v1" "target-features"="+sve" }
116+
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
117+
attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
118+
attributes #3 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) }
119+
attributes #4 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
120+
121+
...
122+
---
123+
name: f
124+
tracksRegLiveness: true
125+
body: |
126+
bb.0.entry:
127+
successors: %bb.5(0x30000000), %bb.1(0x50000000)
128+
liveins: $w0, $w1
129+
130+
%20:gpr32common = COPY $w1
131+
%19:gpr32common = COPY $w0
132+
%21:gpr32common = SUBWrr %20, %19
133+
dead $wzr = ADDSWri %21, 1, 0, implicit-def $nzcv
134+
Bcc 0, %bb.5, implicit $nzcv
135+
B %bb.1
136+
137+
bb.1.preheader:
138+
successors: %bb.2(0x40000000), %bb.3(0x40000000)
139+
140+
%22:gpr32common = ADDWri %19, 1, 0
141+
%23:gpr32sp = ADDWri %19, 2, 0
142+
%25:gpr32common = ADDWri %20, 1, 0
143+
dead $wzr = SUBSWrr killed %25, %22, implicit-def $nzcv
144+
Bcc 10, %bb.3, implicit $nzcv
145+
B %bb.2
146+
147+
bb.2.body.preheader:
148+
successors: %bb.6(0x80000000)
149+
150+
%1:gpr32sp = COPY %23
151+
%55:gpr32sp = ADDWri %21, 1, 0
152+
%2:gpr32all = COPY %55
153+
%57:gpr64common = MOVaddr target-flags(aarch64-page) @glb, target-flags(aarch64-pageoff, aarch64-nc) @glb
154+
B %bb.6
155+
156+
bb.3.vector.ph:
157+
successors: %bb.4(0x80000000)
158+
159+
%29:gpr32common = ADDWri %21, 1, 0
160+
%30:gpr64 = CNTW_XPiI 31, 1, implicit $vg
161+
%31:gpr32common = COPY %30.sub_32
162+
%32:gpr32 = SUBSWrr %29, %31, implicit-def $nzcv
163+
%33:gpr32 = COPY $wzr
164+
%34:gpr32 = CSELWr %33, killed %32, 3, implicit $nzcv
165+
%4:ppr = WHILELO_PWW_S %33, %29, implicit-def dead $nzcv
166+
%5:zpr = INDEX_II_S 0, 1, implicit $vg
167+
%6:zpr = DUP_ZR_S %31
168+
%7:zpr = DUP_ZR_S %22
169+
%8:zpr = DUP_ZR_S %23
170+
%27:gpr32all = COPY %33
171+
%37:gpr64common = MOVaddr target-flags(aarch64-page) @glb, target-flags(aarch64-pageoff, aarch64-nc) @glb
172+
%39:gpr64common = MOVi64imm -1
173+
%41:ppr_3b = PTRUE_S 31, implicit $vg
174+
%44:gpr64common = MOVi64imm 255
175+
%45:gpr64common = MOVi64imm 511
176+
177+
bb.4.vector.body:
178+
successors: %bb.4(0x7c000000), %bb.5(0x04000000)
179+
180+
%9:gpr32 = PHI %27, %bb.3, %12, %bb.4
181+
%10:ppr_3b = PHI %4, %bb.3, %13, %bb.4
182+
%11:zpr = PHI %5, %bb.3, %14, %bb.4
183+
%35:zpr = ADD_ZZZ_S %11, %7
184+
%36:gpr32 = COPY %35.ssub
185+
%38:gpr64sp = ADDXrx %37, killed %36, 50
186+
ST1W %35, %10, %38, %39 :: (store unknown-size into %ir.18, align 4)
187+
%40:zpr = ADD_ZZZ_S %11, %8
188+
%42:zpr = MUL_ZPZZ_S_UNDEF %41, %35, killed %40
189+
%43:zpr = ASRD_ZPmI_S %41, %42, 1
190+
ST1W %43, %10, %38, %44 :: (store unknown-size into %ir.22, align 4)
191+
%46:zpr = LD1W %10, %38, %45 :: (load unknown-size from %ir.23, align 4)
192+
%47:zpr = ADD_ZZZ_S killed %46, %43
193+
ST1W killed %47, %10, %38, %45 :: (store unknown-size into %ir.23, align 4)
194+
%50:gpr32 = ADDWrr %9, %31
195+
%12:gpr32all = COPY %50
196+
%13:ppr = WHILELO_PWW_S %9, %34, implicit-def $nzcv
197+
%14:zpr = ADD_ZZZ_S %11, %6
198+
Bcc 4, %bb.4, implicit $nzcv
199+
B %bb.5
200+
201+
bb.5.exit:
202+
RET_ReallyLR
203+
204+
bb.6.body:
205+
successors: %bb.5(0x04000000), %bb.6(0x7c000000)
206+
207+
%15:gpr32common = PHI %1, %bb.2, %18, %bb.6
208+
%16:gpr32sp = PHI %2, %bb.2, %17, %bb.6
209+
%56:gpr32common = SUBWri %15, 1, 0
210+
%58:gpr64sp = ADDXrx %57, %56, 50
211+
STURWi %56, %58, -4 :: (store (s32) into %ir.31)
212+
%59:gpr32 = MADDWrrr %56, %15, $wzr
213+
%60:gpr32 = ADDWrs %59, %59, 95
214+
%61:gpr32 = SBFMWri killed %60, 1, 31
215+
STRWui %61, %58, 255 :: (store (s32) into %ir.34)
216+
%62:gpr32 = LDRWui %58, 511 :: (load (s32) from %ir.35)
217+
%63:gpr32 = ADDWrr killed %62, %61
218+
STRWui killed %63, %58, 511 :: (store (s32) into %ir.35)
219+
%64:gpr32 = SUBSWri %16, 1, 0, implicit-def $nzcv
220+
%17:gpr32all = COPY %64
221+
%65:gpr32sp = ADDWri %15, 1, 0
222+
%18:gpr32all = COPY %65
223+
Bcc 0, %bb.5, implicit $nzcv
224+
B %bb.6
225+
226+
...

0 commit comments

Comments
 (0)