Skip to content

Commit c0abb5d

Browse files
committed
Limit some extreme unrolling
1 parent 4fe3575 commit c0abb5d

File tree

1 file changed

+11
-6
lines changed

1 file changed

+11
-6
lines changed

src/modeling/determinestrategy.jl

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ end
208208
function unroll_no_reductions(ls, order, vloopsym)
209209
size_T = biggest_type_size(ls)
210210
W, Wshift = lsvecwidthshift(ls, vloopsym, size_T)
211-
211+
212212
compute_rt = load_rt = store_rt = 0.0
213213
unrolled = last(order)
214214
i = 0
@@ -222,7 +222,8 @@ function unroll_no_reductions(ls, order, vloopsym)
222222
unrolled = unrolled_candidate
223223
end
224224
end
225-
# latency not a concern, because no depchains
225+
# latency not a concern, because no depchains
226+
innerloop = last(order)
226227
compute_l = 0.0
227228
rpp = 0 # register pressure proportional to unrolling
228229
rpc = 0 # register pressure independent of unroll factor
@@ -246,8 +247,12 @@ function unroll_no_reductions(ls, order, vloopsym)
246247
u = if compute_rt ≤ 1
247248
4
248249
elseif compute_rt > memory_rt
249-
clamp(round(Int, compute_l / compute_rt), 1, 4)
250-
# max(1, VectorizationBase.nextpow2( min( 4, round(Int, 8 / compute_rt) ) ))
250+
# @show load_rt, store_rt, compute_rt, compute_l, rpc, rpp
251+
# if compute_rt > 40
252+
# max(VectorizationBase.nextpow2( min( 4, round(Int, compute_rt / memory_rt) ) ), 1)
253+
# else
254+
clamp(round(Int, compute_l / compute_rt), 1, Core.ifelse(compute_rt>80, 2, 4))
255+
# end
251256
elseif iszero(load_rt)
252257
iszero(store_rt) ? 4 : max(1, min(4, round(Int, 2compute_rt / store_rt)))
253258
else
@@ -269,7 +274,7 @@ function unroll_no_reductions(ls, order, vloopsym)
269274
# motivation for skipping division by loads here: https://github.com/microhh/stencilbuilder/blob/master/julia/stencil_julia_4th.jl
270275
# Some values:
271276
# (load_rt, store_rt, compute_rt, compute_l, u, rpc, rpp) = (52.0, 3.0, 92.0, 736.0, 4, 0.0, 52.0)
272-
# This is fastest when `u = 4`, but `reg_constraint` was restricting it to 1.
277+
# This is fastest when `u = 4`, but `reg_constraint` was restricting it to 1. ## later benchmarks were faster with u = 2?
273278
# Obviously, this limitation on number of registers didn't seem so important in practice.
274279
# So, heuristically I check if compute latency dominates the problem, in which case unrolling could be expected to benefit us.
275280
# Ideally, we'd count the number of loads that actually have to be live at a given time. But this heuristic is hopefully okay for now.
@@ -1462,7 +1467,7 @@ function choose_order_cost(ls::LoopSet, v::Int = 0)
14621467
fill_children!(ls)
14631468
resize!(ls.loop_order, length(ls.loopsymbols))
14641469
sld = store_load_deps(operations(ls))
1465-
if num_loops(ls) > 1
1470+
if (num_loops(ls) > 1) && (length(ls.operations) ≤ 100)
14661471
torder, tunroll, ttile, tvec, tU, tT, tc, shouldinline = choose_tile(ls, sld, v)
14671472
else
14681473
torder = names(ls) # dummy

0 commit comments

Comments
 (0)