@@ -240,16 +240,18 @@ function unroll_no_reductions(ls, order, vloopsym)
240
240
end
241
241
# latency not a concern, because no depchains
242
242
compute_l = 0.0
243
- # rp = 0
243
+ rpp = 0 # register pressure proportional to unrolling
244
+ rpc = 0 # register pressure independent of unroll factor
244
245
for op ∈ operations (ls)
245
246
isu₁unrolled (op) || continue
246
247
rt, sl, rpop = cost (ls, op, (unrolled,Symbol (" " )), vloopsym, Wshift, size_T)
247
- # rp += rpop
248
248
if iscompute (op)
249
249
compute_rt += rt
250
250
compute_l += sl
251
+ rpc += rpop # constant loads for special functions reused with unrolling
251
252
elseif isload (op)
252
253
load_rt += rt
254
+ rpp += rpop # loads are proportional to unrolling
253
255
elseif isstore (op)
254
256
store_rt += rt
255
257
end
@@ -277,7 +279,9 @@ function unroll_no_reductions(ls, order, vloopsym)
277
279
if unrolled === vloopsym
278
280
u = demote_unroll_factor (ls, u, vloopsym)
279
281
end
280
- u, unrolled
282
+ remaining_reg = max (8 , (reg_count (ls) - round (Int,rpc))) # spilling a few consts isn't so bad
283
+ reg_constraint = max (1 , remaining_reg ÷ round (Int,rpp))
284
+ clamp (u, 1 , reg_constraint), unrolled
281
285
# rt = max(compute_rt, load_rt + store_rt)
282
286
# # (iszero(rt) ? 4 : max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))), unrolled
283
287
# (iszero(rt) ? 4 : max(1, VectorizationBase.nextpow2( min( 4, round(Int, 8 / rt) ) ))), unrolled
0 commit comments