@@ -170,9 +170,16 @@ function roundpow2(i::Integer)
170
170
ld = i - l
171
171
ud > ld ? l : u
172
172
end
173
- function unroll_no_reductions (ls, order, unrolled, vectorized, Wshift, size_T)
173
+ function unroll_no_reductions (ls, order, vectorized)
174
+ size_T = biggest_type_size (ls)
175
+ W, Wshift = VectorizationBase. pick_vector_width_shift (length (ls, vectorized), size_T):: Tuple{Int,Int}
176
+
174
177
compute_rt = 0.0
175
178
load_rt = 0.0
179
+ unrolled = last (order)
180
+ if unrolled === vectorized && length (order) > 1
181
+ unrolled = order[end - 1 ]
182
+ end
176
183
# latency not a concern, because no depchains
177
184
for op ∈ operations (ls)
178
185
dependson (op, unrolled) || continue
@@ -186,28 +193,14 @@ function unroll_no_reductions(ls, order, unrolled, vectorized, Wshift, size_T)
186
193
# @show compute_rt, load_rt
187
194
# roundpow2(min(4, round(Int, (compute_rt + load_rt + 1) / compute_rt)))
188
195
rt = max (compute_rt, load_rt)
189
- iszero (rt) && return 4
190
- max (1 , roundpow2 ( min ( 4 , round (Int, 16 / rt) ) ))
196
+ (iszero (rt) ? 4 : max (1 , roundpow2 ( min ( 4 , round (Int, 16 / rt) ) ))), unrolled
191
197
end
192
198
function determine_unroll_factor (
193
- ls:: LoopSet , order:: Vector{Symbol} , unrolled:: Symbol , vectorized:: Symbol = first (order)
199
+ ls:: LoopSet , order:: Vector{Symbol} , unrolled:: Symbol , vectorized:: Symbol
194
200
)
195
201
size_T = biggest_type_size (ls)
196
202
W, Wshift = VectorizationBase. pick_vector_width_shift (length (ls, vectorized), size_T):: Tuple{Int,Int}
197
203
198
- # The strategy is to use an unroll factor of 1, unless there appears to be loop carried dependencies (ie, num_reductions > 0)
199
- # The assumption here is that unrolling provides no real benefit, unless it is needed to enable OOO execution by breaking up these dependency chains
200
- num_reductions = 0 # sum(isreduction, operations(ls))
201
- for op ∈ operations (ls)
202
- if isreduction (op) & iscompute (op) && parentsnotreduction (op)
203
- num_reductions += 1
204
- end
205
- end
206
- if iszero (num_reductions)
207
- # if only 1 loop, no need to unroll
208
- # if more than 1 loop, there is some cost. Picking 2 here as a heuristic.
209
- return unroll_no_reductions (ls, order, unrolled, vectorized, Wshift, size_T)
210
- end
211
204
# So if num_reductions > 0, we set the unroll factor to be high enough so that the CPU can be kept busy
212
205
# if there are, U = max(1, round(Int, max(latency) * throughput / num_reductions)) = max(1, round(Int, latency / (recip_throughput * num_reductions)))
213
206
# We also make sure register pressure is not too high.
@@ -233,7 +226,40 @@ function determine_unroll_factor(
233
226
load_recip_throughput,
234
227
store_recip_throughput
235
228
)
236
- min (8 , roundpow2 (max (1 , round (Int, latency / (recip_throughput * num_reductions) ) )))
229
+ recip_throughput, latency
230
+ end
231
"""
    count_reductions(ls::LoopSet)

Return the number of operations `op` in `operations(ls)` for which
`isreduction(op)`, `iscompute(op)` and `parentsnotreduction(op)` all hold.
"""
function count_reductions(ls::LoopSet)
    nreductions = 0
    for op in operations(ls)
        # Both predicates are always evaluated (`&` does not short-circuit).
        (isreduction(op) & iscompute(op)) || continue
        # Only consulted once the operation is known to be a compute reduction.
        parentsnotreduction(op) || continue
        nreductions += 1
    end
    return nreductions
end
240
+
241
"""
    determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vectorized::Symbol)

Select the loop to unroll and the unroll factor, returned as the tuple
`(factor, unrolled)`.

When `count_reductions(ls)` is zero the decision is delegated to
`unroll_no_reductions(ls, order, vectorized)`. Otherwise every loop in `order`
is tried as the unrolled loop through the four-argument method, which yields a
`(recip_throughput, latency)` pair. The candidate with the smallest reciprocal
throughput wins; a candidate equal to `vectorized` is penalised by `0.01`, and
ties go to the earlier entry of `order`. The factor is
`latency / (recip_throughput * num_reductions)`, rounded, clamped below at 1,
passed through `roundpow2`, and capped at 8.
"""
function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vectorized::Symbol)
    nreductions = count_reductions(ls)
    # Default strategy: without loop-carried dependencies (no reductions) unrolling
    # is assumed to bring little benefit, because its main purpose is to break up
    # dependency chains so that out-of-order execution can keep the CPU busy.
    if iszero(nreductions)
        return unroll_no_reductions(ls, order, vectorized)
    end

    best_rt = Inf
    best_score = Inf
    best_latency = Inf
    # Placeholder; replaced by the first candidate whose score is below `Inf`.
    best_unrolled = Symbol(" ")
    for candidate in order
        cand_rt, cand_latency = determine_unroll_factor(ls, order, candidate, vectorized)
        # Slightly disfavour unrolling the loop that is also being vectorized.
        cand_score = cand_rt + (0.01 * (vectorized === candidate))
        # Strict comparison: an equal score keeps the earlier candidate.
        cand_score < best_score || continue
        best_rt = cand_rt
        best_score = cand_score
        best_latency = cand_latency
        best_unrolled = candidate
    end

    raw_factor = round(Int, best_latency / (best_rt * nreductions))
    unroll_factor = min(8, roundpow2(max(1, raw_factor)))
    return unroll_factor, best_unrolled
end
238
264
239
265
function unroll_cost (X, u₁, u₂, u₁L, u₂L)
@@ -728,6 +754,7 @@ function evaluate_cost_tile(
728
754
reg_pressure[1 ] += rp
729
755
end
730
756
end
757
+ # @inbounds ((cost_vec[4] > 0) || ((cost_vec[2] > 0) & (cost_vec[3] > 0))) || return 0,0,Inf,false
731
758
# @show cost_vec reg_pressure
732
759
costpenalty = (sum (reg_pressure) > REGISTER_COUNT) ? 2 : 1
733
760
# @show order, vectorized cost_vec reg_pressure
@@ -914,7 +941,8 @@ function choose_order_cost(ls::LoopSet)
914
941
# return torder, tvec, 4, 4#5, 5
915
942
else
916
943
copyto! (ls. loop_order. bestorder, uorder)
917
- return uorder, first (uorder), Symbol (" ##undefined##" ), uvec, determine_unroll_factor (ls, uorder, first (uorder), uvec), - 1 , uc, true
944
+ UF, uunroll = determine_unroll_factor (ls, uorder, uvec)
945
+ return uorder, uunroll, Symbol (" ##undefined##" ), uvec, UF, - 1 , uc, true
918
946
end
919
947
end
920
948
function choose_order (ls:: LoopSet )
0 commit comments