49
49
# evaluates cost of evaluating loop in given order
50
50
# heuristically, could simplify analysis by just unrolling outer loop?
51
51
function evaluate_cost_unroll (
52
- ls:: LoopSet , order:: Vector{Symbol} , max_cost = typemax (Float64), unrolled :: Symbol = first (order)
52
+ ls:: LoopSet , order:: Vector{Symbol} , max_cost = typemax (Float64), vectorized :: Symbol = first (order)
53
53
)
54
54
# included_vars = Set{UInt}()
55
55
included_vars = fill (false , length (operations (ls)))
@@ -58,12 +58,12 @@ function evaluate_cost_unroll(
58
58
iter = 1.0
59
59
# Need to check if fusion is possible
60
60
size_T = biggest_type_size (ls)
61
- W, Wshift = VectorizationBase. pick_vector_width_shift (length (ls, unrolled ), size_T):: Tuple{Int,Int}
61
+ W, Wshift = VectorizationBase. pick_vector_width_shift (length (ls, vectorized ), size_T):: Tuple{Int,Int}
62
62
for itersym ∈ order
63
63
# Add to set of defined symbols
64
64
push! (nested_loop_syms, itersym)
65
65
liter = Float64 (length (ls, itersym))
66
- if itersym === unrolled
66
+ if itersym === vectorized
67
67
liter /= W
68
68
end
69
69
iter *= liter
@@ -79,27 +79,27 @@ function evaluate_cost_unroll(
79
79
hasintersection (rd, nested_loop_syms[1 : end - length (rd)]) && return Inf
80
80
included_vars[id] = true
81
81
82
- total_cost += iter * first (cost (op, unrolled , Wshift, size_T))
82
+ total_cost += iter * first (cost (op, vectorized , Wshift, size_T))
83
83
total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
84
84
end
85
85
end
86
86
total_cost
87
87
end
88
88
89
- # only covers unrolled ops; everything else considered lifted?
89
+ # only covers vectorized ops; everything else considered lifted?
90
90
function depchain_cost! (
91
- skip:: Vector{Bool} , op:: Operation , unrolled :: Symbol , Wshift:: Int , size_T:: Int , rt:: Float64 = 0.0 , sl:: Int = 0
91
+ skip:: Vector{Bool} , op:: Operation , vectorized :: Symbol , Wshift:: Int , size_T:: Int , rt:: Float64 = 0.0 , sl:: Int = 0
92
92
)
93
93
skip[identifier (op)] = true
94
94
# depth first search
95
95
for opp ∈ parents (op)
96
96
skip[identifier (opp)] && continue
97
- rt, sl = depchain_cost! (skip, opp, unrolled , Wshift, size_T, rt, sl)
97
+ rt, sl = depchain_cost! (skip, opp, vectorized , Wshift, size_T, rt, sl)
98
98
end
99
99
# Basically assuming memory and compute don't conflict, but everything else does
100
100
# Ie, ignoring the fact that integer and floating point operations likely don't either
101
101
if iscompute (op)
102
- rtᵢ, slᵢ = cost (op, unrolled , Wshift, size_T)
102
+ rtᵢ, slᵢ = cost (op, vectorized , Wshift, size_T)
103
103
rt += rtᵢ; sl += slᵢ
104
104
end
105
105
rt, sl
@@ -111,10 +111,10 @@ function parentsnotreduction(op::Operation)
111
111
return true
112
112
end
113
113
function determine_unroll_factor (
114
- ls:: LoopSet , order:: Vector{Symbol} , unrolled:: Symbol = first (order)
114
+ ls:: LoopSet , order:: Vector{Symbol} , unrolled:: Symbol , vectorized :: Symbol = first (order)
115
115
)
116
116
size_T = biggest_type_size (ls)
117
- W, Wshift = VectorizationBase. pick_vector_width_shift (length (ls, unrolled ), size_T):: Tuple{Int,Int}
117
+ W, Wshift = VectorizationBase. pick_vector_width_shift (length (ls, vectorized ), size_T):: Tuple{Int,Int}
118
118
119
119
# The strategy is to use an unroll factor of 1, unless there appears to be loop carried dependencies (ie, num_reductions > 0)
120
120
# The assumption here is that unrolling provides no real benefit, unless it is needed to enable OOO execution by breaking up these dependency chains
@@ -139,13 +139,13 @@ function determine_unroll_factor(
139
139
for op ∈ operations (ls)
140
140
dependson (op, unrolled) || continue
141
141
if isreduction (op)
142
- rt, sl = depchain_cost! (visited_nodes, op, unrolled , Wshift, size_T)
142
+ rt, sl = depchain_cost! (visited_nodes, op, vectorized , Wshift, size_T)
143
143
latency = max (sl, latency)
144
144
compute_recip_throughput += rt
145
145
elseif isload (op)
146
- load_recip_throughput += first (cost (op, unrolled , Wshift, size_T))
146
+ load_recip_throughput += first (cost (op, vectorized , Wshift, size_T))
147
147
elseif isstore (op)
148
- store_recip_throughput += first (cost (op, unrolled , Wshift, size_T))
148
+ store_recip_throughput += first (cost (op, vectorized , Wshift, size_T))
149
149
end
150
150
end
151
151
recip_throughput = max (
@@ -240,16 +240,22 @@ function solve_tilesize(
240
240
cost_vec:: AbstractVector{Float64} = @view (ls. cost_vec[:,1 ]),
241
241
reg_pressure:: AbstractVector{Int} = @view (ls. reg_pres[:,1 ])
242
242
)
243
- maxT = isstaticloop (ls, tiled) ? looprangehint (ls, tiled) : 4 # REGISTER_COUNT
244
- maxU = isstaticloop (ls, unrolled) ? looprangehint (ls, unrolled) : 8 # REGISTER_COUNT
243
+ maxT = 4
244
+ maxU = 8
245
+ if isstaticloop (ls, tiled)
246
+ maxT = min (maxT, looprangehint (ls, tiled))
247
+ end
248
+ if isstaticloop (ls, unrolled)
249
+ maxU = min (maxU, looprangehint (ls, unrolled))
250
+ end
245
251
solve_tilesize (cost_vec, reg_pressure, maxU, maxT)
246
252
end
247
253
248
254
# Just tile outer two loops?
249
255
# But optimal order within tile must still be determined
250
256
# as well as size of the tiles.
251
257
function evaluate_cost_tile (
252
- ls:: LoopSet , order:: Vector{Symbol}
258
+ ls:: LoopSet , order:: Vector{Symbol} , vectorized :: Symbol
253
259
)
254
260
N = length (order)
255
261
@assert N ≥ 2 " Cannot tile merely $N loops!"
@@ -260,7 +266,7 @@ function evaluate_cost_tile(
260
266
iter = 1.0
261
267
# Need to check if fusion is possible
262
268
size_T = biggest_type_size (ls)
263
- W, Wshift = VectorizationBase. pick_vector_width_shift (length (ls, unrolled ), size_T):: Tuple{Int,Int}
269
+ W, Wshift = VectorizationBase. pick_vector_width_shift (length (ls, vectorized ), size_T):: Tuple{Int,Int}
264
270
# costs =
265
271
# cost_mat[1] / ( unrolled * tiled)
266
272
# cost_mat[2] / ( tiled)
@@ -293,7 +299,7 @@ function evaluate_cost_tile(
293
299
rd = reduceddependencies (op)
294
300
hasintersection (rd, nested_loop_syms[1 : end - length (rd)]) && return 0 ,0 ,Inf
295
301
included_vars[id] = true
296
- rt, lat, rp = cost (op, unrolled , Wshift, size_T)
302
+ rt, lat, rp = cost (op, vectorized , Wshift, size_T)
297
303
# @show instruction(op), rt, lat, rp, iter
298
304
rt *= iter
299
305
isunrolled = unrolled ∈ loopdependencies (op)
@@ -367,48 +373,54 @@ function choose_unroll_order(ls::LoopSet, lowest_cost::Float64 = Inf)
367
373
lo = LoopOrders (ls)
368
374
best_order = lo. syms
369
375
new_order, state = iterate (lo) # right now, new_order === best_order
376
+ best_vec = first (new_order)
370
377
while true
371
- cost_temp = evaluate_cost_unroll (ls, new_order, lowest_cost)
372
- if cost_temp < lowest_cost
373
- lowest_cost = cost_temp
374
- best_order = new_order
378
+ for new_vec ∈ new_order
379
+ cost_temp = evaluate_cost_unroll (ls, new_order, lowest_cost, new_vec)
380
+ if cost_temp < lowest_cost
381
+ lowest_cost = cost_temp
382
+ best_order = new_order
383
+ best_vec = new_vec
384
+ end
375
385
end
376
386
iter = iterate (lo, state)
377
- iter === nothing && return best_order, lowest_cost
387
+ iter === nothing && return best_order, best_vec, lowest_cost
378
388
new_order, state = iter
379
389
end
380
390
end
381
391
# Search for the cheapest tiled schedule.
#
# Walks every loop order yielded by `LoopOrders(ls)` and, for each order, tries
# every loop except the first one as the vectorized loop. Each candidate is
# scored with `evaluate_cost_tile`; the strictly cheapest candidate wins.
#
# Returns `(best_order, best_vec, U, T, lowest_cost)`. `best_order` is the
# buffer `ls.loop_order.bestorder`, which is overwritten in place. If no
# candidate has a finite cost, `U` and `T` stay `0` and `lowest_cost` stays `Inf`.
function choose_tile(ls::LoopSet)
    lo = LoopOrders(ls)
    # `copyto!` returns its destination, so `best_order` aliases the buffer in `ls`.
    best_order = copyto!(ls.loop_order.bestorder, lo.syms)
    best_vec = first(best_order) # filler until some candidate actually wins
    U, T, lowest_cost = 0, 0, Inf
    new_order, state = iterate(lo)
    while true
        # The first loop of the order is never tried as the vectorized loop.
        for new_vec ∈ @view(new_order[2:end])
            U_temp, T_temp, cost_temp = evaluate_cost_tile(ls, new_order, new_vec)
            # Only a strict improvement replaces the current best.
            cost_temp < lowest_cost || continue
            lowest_cost = cost_temp
            U, T = U_temp, T_temp
            best_vec = new_vec
            copyto!(best_order, new_order)
            # NOTE(review): presumably records the cost data of the winning
            # candidate inside `ls` — confirm against `save_tilecost!`.
            save_tilecost!(ls)
        end
        iter = iterate(lo, state)
        # Iterator exhausted: report the best candidate seen.
        iter === nothing && return best_order, best_vec, U, T, lowest_cost
        new_order, state = iter
    end
end
399
413
# Pick between a tiled schedule and a plain unrolled schedule for `ls`.
#
# Tiling is only considered when there is more than one loop. The tiled cost
# `tc` (or `Inf` when tiling is not considered) seeds `choose_unroll_order`,
# which replaces its best candidate only on a strictly lower cost. So
# `uc == tc` means no unrolled candidate beat the tiled schedule, and
# `uorder`/`uvec` are still their initial filler values.
#
# Returns a 4-tuple:
#   * tiled:    `(torder, tvec, tU, tT)`
#   * unrolled: `(uorder, uvec, unroll_factor, -1)`; the trailing `-1` takes
#     the place of the tile size.
function choose_order(ls::LoopSet)
    if num_loops(ls) > 1
        torder, tvec, tU, tT, tc = choose_tile(ls)
    else
        tc = Inf
    end
    uorder, uvec, uc = choose_unroll_order(ls, tc)
    # Ties go to the tiled schedule (`≤`, not `<`): on a tie the unrolled
    # search found nothing better, so its result is not a real candidate.
    if num_loops(ls) > 1 && tc ≤ uc
        return torder, tvec, tU, tT
    else
        # The first loop of `uorder` is the unrolled loop; `uvec` is the vectorized one.
        return uorder, uvec, determine_unroll_factor(ls, uorder, first(uorder), uvec), -1
    end
end
414
426
0 commit comments