125
125
"""
    biggest_type_size(ls::LoopSet)

Return the largest value of `elsize(op)` taken over every `op` in `operations(ls)`.
"""
function biggest_type_size(ls::LoopSet)
    ops = operations(ls)
    # Fold `max` over the per-operation element sizes.
    return mapreduce(elsize, max, ops)
end
128
- # function VectorizationBase.pick_vector_width(ls::LoopSet, u::Symbol)
129
- # VectorizationBase.pick_vector_width(length(ls, u), biggest_type_size(ls))
130
- # end
131
- # function VectorizationBase.pick_vector_width_shift(ls::LoopSet, u::Symbol)
132
- # VectorizationBase.pick_vector_width_shift(length(ls, u), biggest_type_size(ls))
133
- # end
134
128
function hasintersection (a, b)
135
129
for aᵢ ∈ a, bᵢ ∈ b
136
130
aᵢ === bᵢ && return true
242
236
function unroll_no_reductions (ls, order, vloopsym)
243
237
size_T = biggest_type_size (ls)
244
238
W, Wshift = lsvecwidthshift (ls, vloopsym, size_T)
245
- # W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vloopsym), size_T)::Tuple{Int,Int}
246
239
247
240
compute_rt = load_rt = store_rt = 0.0
248
241
unrolled = last (order)
@@ -361,11 +354,25 @@ function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::S
361
354
num_reductions = count_reductions (ls)
362
355
# The strategy is to use an unroll factor of 1, unless there appear to be loop-carried dependencies (i.e., num_reductions > 0).
363
356
# The assumption here is that unrolling provides no real benefit unless it is needed to enable out-of-order (OOO) execution by breaking up these dependency chains.
364
- if iszero (num_reductions)
365
- # if only 1 loop, no need to unroll
366
- # if more than 1 loop, there is some cost. Picking 2 here as a heuristic.
367
- return unroll_no_reductions (ls, order, vloopsym)
357
+ loopindexesbit = ls. loopindexesbit
358
+ if iszero (length (loopindexesbit)) || ((! loopindexesbit[getloopid (ls, vloopsym)]))
359
+ if iszero (num_reductions)
360
+ return unroll_no_reductions (ls, order, vloopsym)
361
+ else
362
+ return determine_unroll_factor (ls, order, vloopsym, num_reductions)
363
+ end
364
+ elseif iszero (num_reductions)
365
+ return 8 ÷ ls. vector_width[], vloopsym
366
+ else
367
+ rttemp, ltemp = determine_unroll_factor (ls, order, vloopsym, vloopsym)
368
+ UF = min (8 , VectorizationBase. nextpow2 (max (1 , round (Int, ltemp / (rttemp * num_reductions) ) )))
369
+ UFfactor = 8 ÷ ls. vector_width[]
370
+ cld (UF, UFfactor)* UFfactor, vloopsym
368
371
end
372
+ end
373
+ # function scale_unrolled()
374
+ # end
375
+ function determine_unroll_factor (ls:: LoopSet , order:: Vector{Symbol} , vloopsym:: Symbol , num_reductions:: Int )
369
376
innermost_loop = last (order)
370
377
rt = Inf ; rtcomp = Inf ; latency = Inf ; best_unrolled = Symbol (" " )
371
378
for unrolled ∈ order
@@ -533,12 +540,17 @@ function solve_unroll(
533
540
W:: Int , vloopsym:: Symbol , rounduᵢ:: Int
534
541
)
535
542
(u₁step, u₂step) = if rounduᵢ == 1 # max is to safeguard against some weird arch I've never heard of.
536
- (max (1 ,cache_lnsze (ls) ÷ reg_size (ls)), 1 )
543
+ (max (1 , cache_lnsze (ls) ÷ reg_size (ls)), 1 )
537
544
elseif rounduᵢ == 2
538
545
(1 , max (1 ,cache_lnsze (ls) ÷ reg_size (ls)))
546
+ elseif rounduᵢ == - 1
547
+ (8 ÷ ls. vector_width[], 1 )
548
+ elseif rounduᵢ == - 2
549
+ (1 , 8 ÷ ls. vector_width[])
539
550
else
540
551
(1 , 1 )
541
552
end
553
+ # @show u₁step, u₂step
542
554
u₁loop = getloop (ls, u₁loopsym)
543
555
u₂loop = getloop (ls, u₂loopsym)
544
556
solve_unroll (
921
933
# But optimal order within tile must still be determined
922
934
# as well as size of the tiles.
923
935
function evaluate_cost_tile (
924
- ls:: LoopSet , order:: Vector{Symbol} , unrollsyms:: UnrollSymbols
936
+ ls:: LoopSet , order:: Vector{Symbol} , unrollsyms:: UnrollSymbols , anyisbit :: Bool
925
937
)
926
938
N = length (order)
927
939
@assert N ≥ 2 " Cannot tile merely $N loops!"
@@ -940,7 +952,6 @@ function evaluate_cost_tile(
940
952
# Need to check if fusion is possible
941
953
size_T = biggest_type_size (ls)
942
954
W, Wshift = lsvecwidthshift (ls, vloopsym, size_T)
943
- # W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vloopsym), size_T)::Tuple{Int,Int}
944
955
# costs =
945
956
# cost_mat[1] / ( unrolled * u₂loopsym)
946
957
# cost_mat[2] / ( u₂loopsym)
@@ -1019,10 +1030,8 @@ function evaluate_cost_tile(
1019
1030
# elseif isconstant(op)
1020
1031
end
1021
1032
rt, lat, rp = cost (ls, op, (u₁loopsym, u₂loopsym), vloopsym, Wshift, size_T)
1022
- if isload (op)
1023
- if ! prefetch_good_idea
1024
- prefetch_good_idea = prefetchisagoodidea (ls, op, UnrollArgs (ls, 4 , unrollsyms, 4 , 0 )) ≠ 0
1025
- end
1033
+ if isload (op) & (! prefetch_good_idea)
1034
+ prefetch_good_idea = prefetchisagoodidea (ls, op, UnrollArgs (ls, 4 , unrollsyms, 4 , 0 )) ≠ 0
1026
1035
end
1027
1036
# rp = (opisininnerloop && !(loadintostore(ls, op))) ? rp : zero(rp) # we only care about register pressure within the inner most loop
1028
1037
rp = opisininnerloop ? rp : zero (rp) # we only care about register pressure within the inner most loop
@@ -1041,10 +1050,22 @@ function evaluate_cost_tile(
1041
1050
# reg_pres[4] == remaining_registers
1042
1051
costpenalty = ((reg_pressure[1 ] + reg_pressure[2 ] + reg_pressure[3 ]) > reg_pressure[4 ]) ? 2 : 1
1043
1052
u₁v = vloopsym === u₁loopsym; u₂v = vloopsym === u₂loopsym
1044
- round_uᵢ = prefetch_good_idea ? (u₁v ? 1 : (u₂v ? 2 : 0 )) : 0
1053
+ visbit = anyisbit && ls. loopindexesbit[getloopid (ls,vloopsym)]
1054
+ round_uᵢ = if visbit
1055
+ (u₁v ? - 1 : (u₂v ? - 2 : 0 ))
1056
+ elseif prefetch_good_idea
1057
+ (u₁v ? 1 : (u₂v ? 2 : 0 ))
1058
+ else
1059
+ 0
1060
+ end
1045
1061
# @show (irreducible_storecosts / sum(cost_vec))
1046
1062
if (irreducible_storecosts / sum (cost_vec) ≥ 0.5 ) && ! any (op -> loadintostore (ls, op), operations (ls))
1047
- u₁, u₂ = (1 , 1 )
1063
+ u₁, u₂ = if visbit
1064
+ vecsforbyte = 8 ÷ ls. vector_width[]
1065
+ u₁v ? (vecsforbyte,1 ) : (1 ,vecsforbyte)
1066
+ else
1067
+ (1 , 1 )
1068
+ end
1048
1069
ucost = unroll_cost (cost_vec, 1 , 1 , length (getloop (ls, u₁loopsym)), length (getloop (ls, u₂loopsym)))
1049
1070
else
1050
1071
u₁, u₂, ucost = solve_unroll (ls, u₁loopsym, u₂loopsym, cost_vec, reg_pressure, W, vloopsym, round_uᵢ)
@@ -1198,6 +1219,7 @@ function choose_tile(ls::LoopSet)
1198
1219
best_order = copyto! (ls. loop_order. bestorder, lo. syms)
1199
1220
bestu₁ = bestu₂ = best_vec = first (best_order) # filler
1200
1221
u₁ = u₂ = 0 ; lowest_cost = Inf ; shouldinline = false
1222
+ anyisbit = any (ls. loopindexesbit)
1201
1223
for newu₂ ∈ lo. syms
1202
1224
reject_reorder (ls, newu₂) && continue
1203
1225
for newu₁ ∈ lo. syms# @view(new_order[nt+1:end])
@@ -1207,7 +1229,11 @@ function choose_tile(ls::LoopSet)
1207
1229
while true
1208
1230
for new_vec ∈ new_order # view to skip first
1209
1231
reject_reorder (ls, new_vec) && continue
1210
- u₁temp, u₂temp, cost_temp, shouldinline_temp = evaluate_cost_tile (ls, new_order, UnrollSymbols (newu₁, newu₂, new_vec))
1232
+ if anyisbit && ls. loopindexesbit[getloopid (ls,new_vec)]
1233
+ # ((new_vec === newu₁) || (new_vec === newu₂)) || continue
1234
+ (new_vec === newu₁) || continue
1235
+ end
1236
+ u₁temp, u₂temp, cost_temp, shouldinline_temp = evaluate_cost_tile (ls, new_order, UnrollSymbols (newu₁, newu₂, new_vec), anyisbit)
1211
1237
# if cost_temp < lowest_cost # leads to 4 vmovapds
1212
1238
if cost_temp ≤ lowest_cost # lead to 2 vmovapds
1213
1239
lowest_cost = cost_temp
0 commit comments