Skip to content

Commit fa589f2

Browse files
committed
don't consider register loads for some compute and latency heavy operations
1 parent 281b304 commit fa589f2

File tree

1 file changed

+14
-3
lines changed

1 file changed

+14
-3
lines changed

src/modeling/determinestrategy.jl

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ function unroll_no_reductions(ls, order, vloopsym)
232232
if iscompute(op)
233233
compute_rt += rt
234234
compute_l += sl
235-
rpc += rpop # constant loads for special functions reused with unrolling
235+
rpc += max(zero(rpop),rpop - one(rpop)) # constant loads for special functions reused with unrolling
236236
elseif isload(op)
237237
load_rt += rt
238238
rpp += rpop # loads are proportional to unrolling
@@ -253,7 +253,7 @@ function unroll_no_reductions(ls, order, vloopsym)
253253
else
254254
max(1, min(4, round(Int, 1.75compute_rt / load_rt)))
255255
end
256-
# @show load_rt, store_rt, compute_rt, compute_l, u
256+
# @show load_rt, store_rt, compute_rt, compute_l, u, rpc, rpp
257257
# u = min(u, max(1, (reg_count(ls) ÷ max(1,round(Int,rp)))))
258258
# commented out here is to decide to align loops
259259
# if memory_rt > compute_rt && isone(u) && (length(order) > 1) && (last(order) === vloopsym) && length(getloop(ls, last(order))) > 8W
@@ -265,7 +265,18 @@ function unroll_no_reductions(ls, order, vloopsym)
265265
u = demote_unroll_factor(ls, u, vloopsym)
266266
end
267267
remaining_reg = max(8, (reg_count(ls) - round(Int,rpc))) # spilling a few consts isn't so bad
268-
reg_constraint = max(1, remaining_reg ÷ max(1,round(Int,rpp)))
268+
if compute_l ≥ 4compute_rt ≥ 4rpp
269+
# motivation for skipping division by loads here: https://github.com/microhh/stencilbuilder/blob/master/julia/stencil_julia_4th.jl
270+
# Some values:
271+
# (load_rt, store_rt, compute_rt, compute_l, u, rpc, rpp) = (52.0, 3.0, 92.0, 736.0, 4, 0.0, 52.0)
272+
# This is fastest when `u = 4`, but `reg_constraint` was restricting it to 1.
273+
# Obviously, this limitation on number of registers didn't seem so important in practice.
274+
# So, heuristically I check if compute latency dominates the problem, in which case unrolling could be expected to benefit us.
275+
# Ideally, we'd count the number of loads that actually have to be live at a given time. But this heuristic is hopefully okay for now.
276+
reg_constraint = max(1, remaining_reg)
277+
else
278+
reg_constraint = max(1, remaining_reg ÷ max(1,round(Int,rpp)))
279+
end
269280
clamp(u, 1, reg_constraint), unrolled
270281
# rt = max(compute_rt, load_rt + store_rt)
271282
# # (iszero(rt) ? 4 : max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))), unrolled

0 commit comments

Comments
 (0)