208
208
function unroll_no_reductions (ls, order, vloopsym)
209
209
size_T = biggest_type_size (ls)
210
210
W, Wshift = lsvecwidthshift (ls, vloopsym, size_T)
211
-
211
+
212
212
compute_rt = load_rt = store_rt = 0.0
213
213
unrolled = last (order)
214
214
i = 0
@@ -222,7 +222,8 @@ function unroll_no_reductions(ls, order, vloopsym)
222
222
unrolled = unrolled_candidate
223
223
end
224
224
end
225
- # latency not a concern, because no depchains
225
+ # Latency is not a concern here, because there are no dependency chains.
226
+ innerloop = last (order)
226
227
compute_l = 0.0
227
228
rpp = 0 # register pressure proportional to unrolling
228
229
rpc = 0 # register pressure independent of unroll factor
@@ -246,8 +247,12 @@ function unroll_no_reductions(ls, order, vloopsym)
246
247
u = if compute_rt ≤ 1
247
248
4
248
249
elseif compute_rt > memory_rt
249
- clamp (round (Int, compute_l / compute_rt), 1 , 4 )
250
- # max(1, VectorizationBase.nextpow2( min( 4, round(Int, 8 / compute_rt) ) ))
250
+ # @show load_rt, store_rt, compute_rt, compute_l, rpc, rpp
251
+ # if compute_rt > 40
252
+ # max(VectorizationBase.nextpow2( min( 4, round(Int, compute_rt / memory_rt) ) ), 1)
253
+ # else
254
+ clamp (round (Int, compute_l / compute_rt), 1 , Core. ifelse (compute_rt> 80 , 2 , 4 ))
255
+ # end
251
256
elseif iszero (load_rt)
252
257
iszero (store_rt) ? 4 : max (1 , min (4 , round (Int, 2 compute_rt / store_rt)))
253
258
else
@@ -269,7 +274,7 @@ function unroll_no_reductions(ls, order, vloopsym)
269
274
# motivation for skipping division by loads here: https://github.com/microhh/stencilbuilder/blob/master/julia/stencil_julia_4th.jl
270
275
# Some values:
271
276
# (load_rt, store_rt, compute_rt, compute_l, u, rpc, rpp) = (52.0, 3.0, 92.0, 736.0, 4, 0.0, 52.0)
272
- # This is fastest when `u = 4`, but `reg_constraint` was restricting it to 1.
277
+ # This is fastest when `u = 4`, but `reg_constraint` was restricting it to 1. NOTE: later benchmarks appeared to be faster with `u = 2` (unconfirmed).
273
278
# Obviously, this limitation on number of registers didn't seem so important in practice.
274
279
# So, heuristically I check if compute latency dominates the problem, in which case unrolling could be expected to benefit us.
275
280
# Ideally, we'd count the number of loads that actually have to be live at a given time. But this heuristic is hopefully okay for now.
@@ -1462,7 +1467,7 @@ function choose_order_cost(ls::LoopSet, v::Int = 0)
1462
1467
fill_children! (ls)
1463
1468
resize! (ls. loop_order, length (ls. loopsymbols))
1464
1469
sld = store_load_deps (operations (ls))
1465
- if num_loops (ls) > 1
1470
+ if ( num_loops (ls) > 1 ) && ( length (ls . operations) ≤ 100 )
1466
1471
torder, tunroll, ttile, tvec, tU, tT, tc, shouldinline = choose_tile (ls, sld, v)
1467
1472
else
1468
1473
torder = names (ls) # dummy
0 commit comments