@@ -232,7 +232,7 @@ function unroll_no_reductions(ls, order, vloopsym)
232
232
if iscompute (op)
233
233
compute_rt += rt
234
234
compute_l += sl
235
- rpc += rpop # constant loads for special functions reused with unrolling
235
+ rpc += max ( zero ( rpop),rpop - one (rpop)) # constant loads for special functions reused with unrolling
236
236
elseif isload (op)
237
237
load_rt += rt
238
238
rpp += rpop # loads are proportional to unrolling
@@ -253,7 +253,7 @@ function unroll_no_reductions(ls, order, vloopsym)
253
253
else
254
254
max (1 , min (4 , round (Int, 1.75 compute_rt / load_rt)))
255
255
end
256
- # @show load_rt, store_rt, compute_rt, compute_l, u
256
+ # @show load_rt, store_rt, compute_rt, compute_l, u, rpc, rpp
257
257
# u = min(u, max(1, (reg_count(ls) ÷ max(1,round(Int,rp)))))
258
258
# the code commented out below decides whether to align loops
259
259
# if memory_rt > compute_rt && isone(u) && (length(order) > 1) && (last(order) === vloopsym) && length(getloop(ls, last(order))) > 8W
@@ -265,7 +265,18 @@ function unroll_no_reductions(ls, order, vloopsym)
265
265
u = demote_unroll_factor (ls, u, vloopsym)
266
266
end
267
267
remaining_reg = max (8 , (reg_count (ls) - round (Int,rpc))) # spilling a few consts isn't so bad
268
- reg_constraint = max (1 , remaining_reg ÷ max (1 ,round (Int,rpp)))
268
+ if compute_l ≥ 4 compute_rt ≥ 4 rpp
269
+ # motivation for skipping division by loads here: https://github.com/microhh/stencilbuilder/blob/master/julia/stencil_julia_4th.jl
270
+ # Some values:
271
+ # (load_rt, store_rt, compute_rt, compute_l, u, rpc, rpp) = (52.0, 3.0, 92.0, 736.0, 4, 0.0, 52.0)
272
+ # This is fastest when `u = 4`, but `reg_constraint` was restricting it to 1.
273
+ # Evidently, this limit on the number of registers is not that important in practice.
274
+ # So, heuristically I check if compute latency dominates the problem, in which case unrolling could be expected to benefit us.
275
+ # Ideally, we'd count the number of loads that actually have to be live at a given time. But this heuristic is hopefully okay for now.
276
+ reg_constraint = max (1 , remaining_reg)
277
+ else
278
+ reg_constraint = max (1 , remaining_reg ÷ max (1 ,round (Int,rpp)))
279
+ end
269
280
clamp (u, 1 , reg_constraint), unrolled
270
281
# rt = max(compute_rt, load_rt + store_rt)
271
282
# # (iszero(rt) ? 4 : max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))), unrolled
0 commit comments