Skip to content

Commit b7199a5

Browse files
committed
Allow singly unrolled loops to unroll something other than the outer-most loop, and improve performance of a few vreduce methods.
1 parent e582fe2 commit b7199a5

File tree

2 files changed

+73
-28
lines changed

2 files changed

+73
-28
lines changed

src/determinestrategy.jl

Lines changed: 47 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -170,9 +170,16 @@ function roundpow2(i::Integer)
170170
ld = i - l
171171
ud > ld ? l : u
172172
end
173-
function unroll_no_reductions(ls, order, unrolled, vectorized, Wshift, size_T)
173+
function unroll_no_reductions(ls, order, vectorized)
174+
size_T = biggest_type_size(ls)
175+
W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vectorized), size_T)::Tuple{Int,Int}
176+
174177
compute_rt = 0.0
175178
load_rt = 0.0
179+
unrolled = last(order)
180+
if unrolled === vectorized && length(order) > 1
181+
unrolled = order[end-1]
182+
end
176183
# latency not a concern, because no depchains
177184
for op ∈ operations(ls)
178185
dependson(op, unrolled) || continue
@@ -186,28 +193,14 @@ function unroll_no_reductions(ls, order, unrolled, vectorized, Wshift, size_T)
186193
# @show compute_rt, load_rt
187194
# roundpow2(min(4, round(Int, (compute_rt + load_rt + 1) / compute_rt)))
188195
rt = max(compute_rt, load_rt)
189-
iszero(rt) && return 4
190-
max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))
196+
(iszero(rt) ? 4 : max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))), unrolled
191197
end
192198
function determine_unroll_factor(
193-
ls::LoopSet, order::Vector{Symbol}, unrolled::Symbol, vectorized::Symbol = first(order)
199+
ls::LoopSet, order::Vector{Symbol}, unrolled::Symbol, vectorized::Symbol
194200
)
195201
size_T = biggest_type_size(ls)
196202
W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vectorized), size_T)::Tuple{Int,Int}
197203

198-
# The strategy is to use an unroll factor of 1, unless there appear to be loop-carried dependencies (i.e., num_reductions > 0)
199-
# The assumption here is that unrolling provides no real benefit, unless it is needed to enable OOO execution by breaking up these dependency chains
200-
num_reductions = 0#sum(isreduction, operations(ls))
201-
for op ∈ operations(ls)
202-
if isreduction(op) & iscompute(op) && parentsnotreduction(op)
203-
num_reductions += 1
204-
end
205-
end
206-
if iszero(num_reductions)
207-
# if only 1 loop, no need to unroll
208-
# if more than 1 loop, there is some cost. Picking 2 here as a heuristic.
209-
return unroll_no_reductions(ls, order, unrolled, vectorized, Wshift, size_T)
210-
end
211204
# So if num_reductions > 0, we set the unroll factor to be high enough so that the CPU can be kept busy
212205
# if there are, U = max(1, round(Int, max(latency) * throughput / num_reductions)) = max(1, round(Int, latency / (recip_throughput * num_reductions)))
213206
# We also make sure register pressure is not too high.
@@ -233,7 +226,40 @@ function determine_unroll_factor(
233226
load_recip_throughput,
234227
store_recip_throughput
235228
)
236-
min(8, roundpow2(max(1, round(Int, latency / (recip_throughput * num_reductions) ) )))
229+
recip_throughput, latency
230+
end
231+
function count_reductions(ls::LoopSet)
232+
num_reductions = 0
233+
for op ∈ operations(ls)
234+
if isreduction(op) & iscompute(op) && parentsnotreduction(op)
235+
num_reductions += 1
236+
end
237+
end
238+
num_reductions
239+
end
240+
241+
function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vectorized::Symbol)
242+
num_reductions = count_reductions(ls)
243+
# The strategy is to use an unroll factor of 1, unless there appear to be loop-carried dependencies (i.e., num_reductions > 0)
244+
# The assumption here is that unrolling provides no real benefit, unless it is needed to enable OOO execution by breaking up these dependency chains
245+
if iszero(num_reductions)
246+
# if only 1 loop, no need to unroll
247+
# if more than 1 loop, there is some cost. Picking 2 here as a heuristic.
248+
return unroll_no_reductions(ls, order, vectorized)
249+
end
250+
251+
rt = Inf; rtcomp = Inf; latency = Inf; best_unrolled = Symbol("")
252+
for unrolled ∈ order
253+
rttemp, ltemp = determine_unroll_factor(ls, order, unrolled, vectorized)
254+
rtcomptemp = rttemp + (0.01 * (vectorized === unrolled))
255+
if rtcomptemp < rtcomp
256+
rt = rttemp
257+
rtcomp = rtcomptemp
258+
latency = ltemp
259+
best_unrolled = unrolled
260+
end
261+
end
262+
min(8, roundpow2(max(1, round(Int, latency / (rt * num_reductions) ) ))), best_unrolled
237263
end
238264

239265
function unroll_cost(X, u₁, u₂, u₁L, u₂L)
@@ -728,6 +754,7 @@ function evaluate_cost_tile(
728754
reg_pressure[1] += rp
729755
end
730756
end
757+
# @inbounds ((cost_vec[4] > 0) || ((cost_vec[2] > 0) & (cost_vec[3] > 0))) || return 0,0,Inf,false
731758
# @show cost_vec reg_pressure
732759
costpenalty = (sum(reg_pressure) > REGISTER_COUNT) ? 2 : 1
733760
# @show order, vectorized cost_vec reg_pressure
@@ -914,7 +941,8 @@ function choose_order_cost(ls::LoopSet)
914941
# return torder, tvec, 4, 4#5, 5
915942
else
916943
copyto!(ls.loop_order.bestorder, uorder)
917-
return uorder, first(uorder), Symbol("##undefined##"), uvec, determine_unroll_factor(ls, uorder, first(uorder), uvec), -1, uc, true
944+
UF, uunroll = determine_unroll_factor(ls, uorder, uvec)
945+
return uorder, uunroll, Symbol("##undefined##"), uvec, UF, -1, uc, true
918946
end
919947
end
920948
function choose_order(ls::LoopSet)

src/mapreduce.jl

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,8 @@ end
7171

7272
@inline vmapreduce(f, op, args...) = mapreduce(f, op, args...)
7373

74+
length_one_axis(::Base.OneTo) = Base.OneTo(1)
75+
length_one_axis(::Any) = 1:1
7476

7577
"""
7678
vreduce(op, destination, A::DenseArray...)
@@ -79,26 +81,41 @@ Vectorized version of `reduce`. Reduces the array `A` using the operator `op`.
7981
"""
8082
@inline vreduce(op, arg) = vmapreduce(identity, op, arg)
8183

82-
for (op, init) in zip((:+, :max, :min), (:zero, :identity, :identity))
84+
for (op, init) in zip((:+, :max, :min), (:zero, :typemin, :typemax))
8385
@eval function vreduce(::typeof($op), arg; dims = nothing)
8486
isnothing(dims) && return _vreduce($op, arg)
8587
@assert length(dims) == 1
86-
out = $init(arg[ntuple(d -> d == dims ? (1:1) : (1:size(arg, d)), ndims(arg))...])
87-
Rpre = CartesianIndices(axes(arg)[1:dims-1])
88-
Rpost = CartesianIndices(axes(arg)[dims+1:end])
89-
_vreduce_dims!(out, $op, Rpre, 1:size(arg, dims), Rpost, arg)
88+
axes_arg = axes(arg)
89+
axes_out = Base.setindex(axes_arg, length_one_axis(axes_arg[dims]), dims)
90+
out = similar(arg, axes_out)
91+
# fill!(out, $init(first(arg)))
92+
# TODO: use a generated function so the Base.Cartesian.@nif branch count is set to ndims(arg) instead of the hard-coded 5
93+
Base.Cartesian.@nif 5 d -> (d <= ndims(arg) && dims == d) d -> begin
94+
Rpre = CartesianIndices(ntuple(i -> axes_arg[i], d-1))
95+
Rpost = CartesianIndices(ntuple(i -> axes_arg[i+d], ndims(arg) - d))
96+
_vreduce_dims!(out, $op, Rpre, 1:size(arg, dims), Rpost, arg)
97+
end d -> begin
98+
Rpre = CartesianIndices(axes_arg[1:dims-1])
99+
Rpost = CartesianIndices(axes_arg[dims+1:end])
100+
_vreduce_dims!(out, $op, Rpre, 1:size(arg, dims), Rpost, arg)
101+
end
90102
end
91103

92104
@eval function _vreduce_dims!(out, ::typeof($op), Rpre, is, Rpost, arg)
93-
@avx for Ipost in Rpost, i in is, Ipre in Rpre
94-
out[Ipre, 1, Ipost] = $op(out[Ipre, 1, Ipost], arg[Ipre, i, Ipost])
105+
s = $init(first(arg))
106+
@avx for Ipost in Rpost, Ipre in Rpre
107+
accum = s
108+
for i in is
109+
accum = $op(accum, arg[Ipre, i, Ipost])
110+
end
111+
out[Ipre, 1, Ipost] = accum
95112
end
96113
return out
97114
end
98115

99116
@eval function _vreduce(::typeof($op), arg)
100-
s = $init(arg[1])
101-
@avx for i in 1:length(arg)
117+
s = $init(first(arg))
118+
@avx for i in eachindex(arg)
102119
s = $op(s, arg[i])
103120
end
104121
return s

0 commit comments

Comments
 (0)