@@ -70,7 +70,7 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
70
70
if opisvectorized
71
71
if ! unitstride (ls, op, vectorized)# || !isdense(op) # need gather/scatter
72
72
r = (1 << Wshift)
73
- srt *= r
73
+ srt *= r# * 2
74
74
sl *= r
75
75
elseif isload (op) & (length (loopdependencies (op)) > 1 )# vmov(a/u)pd
76
76
# penalize vectorized loads with more than 1 loopdep
@@ -258,7 +258,7 @@ function determine_unroll_factor(
258
258
# So if num_reductions > 0, we set the unroll factor to be high enough so that the CPU can be kept busy
259
259
# if there are, U = max(1, round(Int, max(latency) * throughput / num_reductions)) = max(1, round(Int, latency / (recip_throughput * num_reductions)))
260
260
# We also make sure register pressure is not too high.
261
- latency = 1
261
+ latency = 1.0
262
262
# compute_recip_throughput_u = 0.0
263
263
compute_recip_throughput = 0.0
264
264
visited_nodes = fill (false , length (operations (ls)))
341
341
"""
    unroll_cost(X, u₁, u₂, u₁L, u₂L)

Evaluate the four-term cost model for the unroll factors `u₁` and `u₂`.

Each loop contributes a scale factor `num_iterations(L, u) / L`, where `L` is the
matching `u₁L` / `u₂L` argument. The result is

    X[1] + X[2] * scale₂ + X[3] * scale₁ + X[4] * scale₁ * scale₂

so `X[1]` is unscaled, `X[2]` is scaled only by the `u₂` factor, `X[3]` only by the
`u₁` factor, and `X[4]` by both.

NOTE(review): `num_iterations` is defined outside this block; it presumably returns
the iteration count of a loop of length `L` unrolled by `u` -- confirm against its
definition.
"""
function unroll_cost(X, u₁, u₂, u₁L, u₂L)
    # Per-loop scale factors (u₂ first, matching the original evaluation order).
    scale₂ = num_iterations(u₂L, u₂) / u₂L
    scale₁ = num_iterations(u₁L, u₁) / u₁L

    # Individual cost terms, evaluated in the same left-to-right order as before.
    fixed = X[1]
    along₂ = X[2] * scale₂
    along₁ = X[3] * scale₁
    joint = X[4] * scale₁ * scale₂

    return fixed + along₂ + along₁ + joint
end
@@ -366,6 +368,7 @@ function solve_unroll_iter(X, R, u₁L, u₂L, u₁range, u₂range)
366
368
for u₂temp ∈ u₂range
367
369
RR ≥ u₁temp* u₂temp* R₁ + u₁temp* R₂ + u₂temp* R₅ || continue
368
370
tempcost = unroll_cost (X, u₁temp, u₂temp, u₁L, u₂L)
371
+ # @show u₁temp, u₂temp, tempcost
369
372
if tempcost ≤ bestcost
370
373
bestcost = tempcost
371
374
u₁best, u₂best = u₁temp, u₂temp
@@ -394,7 +397,7 @@ function solve_unroll(X, R, u₁L, u₂L, u₁step, u₂step)
394
397
return solve_unroll_iter (X, R, u₁L, u₂L, u₁low: u₁step: u₁high, u₂low: u₂step: u₂high)
395
398
end
396
399
u₁low = floor (Int, u₁float)
397
- u₂low = max (u₂step, floor (Int, u ₂float)) # must be at least 1
400
+ u₂low = max (u₂step, floor (Int, 0.8 u ₂float)) # must be at least 1
398
401
u₁high = solve_unroll_constT (R, u₂low) + u₁step
399
402
u₂high = solve_unroll_constU (R, u₁low) + u₂step
400
403
maxunroll = REGISTER_COUNT == 32 ? (((X₂ > 0 ) & (X₃ > 0 )) ? 10 : 8 ) : 6
@@ -498,19 +501,30 @@ function solve_unroll(
498
501
u₁ = isstaticloop (u₁loop) ? maybedemotesize (u₁, u₁L) : u₁
499
502
return u₁, u₂L, unroll_cost (cost_vec, u₁, u₂L, u₁L, u₂L)
500
503
end
501
- u₂L = u₂loopsym === vectorized ? cld (u₂L,W) : u₂L
502
- maxu₂ = min (4 maxu₂, u₂L )
504
+ u₂Ltemp = u₂loopsym === vectorized ? cld (u₂L, W) : u₂L
505
+ maxu₂ = min (4 maxu₂, u₂Ltemp )
503
506
end
504
507
if isstaticloop (u₁loop)
505
508
if u₁loopsym != = vectorized && u₁L ≤ 4
506
509
u₂ = max (1 , solve_unroll_constU (reg_pressure, u₁L))
507
510
u₂ = isstaticloop (u₂loop) ? maybedemotesize (u₂, u₂L) : u₂
508
511
return u₁L, u₂, unroll_cost (cost_vec, u₁L, u₂, u₁L, u₂L)
509
512
end
510
- u₁L = u₁loopsym === vectorized ? cld (u₁L,W) : u₁L
511
- maxu₁ = min (4 maxu₁, u₁L )
513
+ u₁Ltemp = u₁loopsym === vectorized ? cld (u₁L, W) : u₁L
514
+ maxu₁ = min (4 maxu₁, u₁Ltemp )
512
515
end
513
- u₁, u₂, cost = solve_unroll (cost_vec, reg_pressure, maxu₁, maxu₂, length (u₁loop), length (u₂loop), u₁step, u₂step)
516
+ if u₁loopsym === vectorized
517
+ u₁Lf = u₁L / W
518
+ else
519
+ u₁Lf = Float64 (u₁L)
520
+ end
521
+ if u₂loopsym === vectorized
522
+ u₂Lf = u₂L / W
523
+ else
524
+ u₂Lf = Float64 (u₂L)
525
+ end
526
+ # @show u₁Lf, u₂Lf, u₁L, length(u₁loop)
527
+ u₁, u₂, cost = solve_unroll (cost_vec, reg_pressure, maxu₁, maxu₂, u₁Lf, u₂Lf, u₁step, u₂step)
514
528
# heuristic to more evenly divide small numbers of iterations
515
529
if isstaticloop (u₂loop)
516
530
u₂ = maybedemotesize (u₂, length (u₂loop), u₁, u₁loop, maxu₂base)
@@ -884,7 +898,7 @@ function evaluate_cost_tile(
884
898
if isstore (op) & (! u₁reducesrt) & (! u₂reducesrt)
885
899
irreducible_storecosts += rt
886
900
end
887
- # @show u₁reducesrt, u₂reducesrt, op, rt, rto, rp
901
+ # iiter = convert(Int, iters[id]); @show u₁reducesrt, u₂reducesrt, op, rt, rto, rp, iiter
888
902
update_costs! (cost_vec, rt, u₁reducesrt, u₂reducesrt)
889
903
update_costs! (reg_pressure, rp, u₁reducesrp, u₂reducesrp)
890
904
end
@@ -1070,6 +1084,9 @@ function choose_order_cost(ls::LoopSet)
1070
1084
if num_loops (ls) > 1
1071
1085
torder, tunroll, ttile, tvec, tU, tT, tc, shouldinline = choose_tile (ls)
1072
1086
else
1087
+ torder = names (ls) # dummy
1088
+ tunroll = ttile = tvec = Symbol (" ##undefined##" ) # dummy
1089
+ tU = tT = 0 # dummy
1073
1090
tc = Inf
1074
1091
end
1075
1092
uorder, uvec, uc = choose_unroll_order (ls, tc)
0 commit comments