Skip to content

Commit 2dd9f27

Browse files
committed
Started working on decoupling vectorization and unrolling.
1 parent 7148b19 commit 2dd9f27

File tree

6 files changed

+242
-144
lines changed

6 files changed

+242
-144
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ MacroTools = "0.5"
1616
Parameters = "0.12.0"
1717
SIMDPirates = "0.1.1"
1818
SLEEFPirates = "0.1.1"
19-
VectorizationBase = "0.1.3"
19+
VectorizationBase = "0.1.4"
2020
julia = "1.3.0"
2121

2222
[extras]

src/LoopVectorization.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
module LoopVectorization
22

33
using VectorizationBase, SIMDPirates, SLEEFPirates, MacroTools, Parameters
4-
using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector_load_expr, mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd
4+
using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector_load_expr,
5+
mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valadd, valsub
56
using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod
67
using Base.Broadcast: Broadcasted, DefaultArrayStyle
78
using LinearAlgebra: Adjoint, Transpose

src/broadcast.jl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,21 @@ function add_broadcast!(
127127
) where {T,N}
128128
add_load!(ls, destname, ArrayReference(bcname, @view(loopsyms[1:N]), Ref{Bool}(false)), elementbytes)
129129
end
130+
function add_broadcast!(
131+
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol}, ::Type{T}, elementbytes::Int = 8
132+
) where {T<:Union{Integer,Float32,Float64}}
133+
pushpreamble!(ls, Expr(:(=), destname, bcname))
134+
add_constant!(ls, destname, elementbytes) # or replace elementbytes with sizeof(T) ?
135+
end
136+
function add_broadcast!(
137+
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol},
138+
::Type{SubArray{T,N,A,S,B}}, elementbytes::Int = 8
139+
) where {T,N,N2,A<:AbstractArray{T,N2},B,N3,S <: Tuple{Int,Vararg{Any,N3}}}
140+
inds = Vector{Union{Int,Symbol}}(undef, N+1)
141+
inds[1] = Symbol("##DISCONTIGUOUSSUBARRAY##")
142+
inds[2:end] .= @view(loopsyms[1:N])
143+
add_load!(ls, destname, ArrayReference(bcname, inds, Ref{Bool}(false)), elementbytes)
144+
end
130145
function add_broadcast!(
131146
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol},
132147
::Type{Broadcasted{DefaultArrayStyle{N},Nothing,F,A}},

src/determinestrategy.jl

Lines changed: 49 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ end
4949
# evaluates cost of evaluating loop in given order
5050
# heuristically, could simplify analysis by just unrolling outer loop?
5151
function evaluate_cost_unroll(
52-
ls::LoopSet, order::Vector{Symbol}, max_cost = typemax(Float64), unrolled::Symbol = first(order)
52+
ls::LoopSet, order::Vector{Symbol}, max_cost = typemax(Float64), vectorized::Symbol = first(order)
5353
)
5454
# included_vars = Set{UInt}()
5555
included_vars = fill(false, length(operations(ls)))
@@ -58,12 +58,12 @@ function evaluate_cost_unroll(
5858
iter = 1.0
5959
# Need to check if fusion is possible
6060
size_T = biggest_type_size(ls)
61-
W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, unrolled), size_T)::Tuple{Int,Int}
61+
W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vectorized), size_T)::Tuple{Int,Int}
6262
for itersym ∈ order
6363
# Add to set of defined symbols
6464
push!(nested_loop_syms, itersym)
6565
liter = Float64(length(ls, itersym))
66-
if itersym === unrolled
66+
if itersym === vectorized
6767
liter /= W
6868
end
6969
iter *= liter
@@ -79,27 +79,27 @@ function evaluate_cost_unroll(
7979
hasintersection(rd, nested_loop_syms[1:end-length(rd)]) && return Inf
8080
included_vars[id] = true
8181

82-
total_cost += iter * first(cost(op, unrolled, Wshift, size_T))
82+
total_cost += iter * first(cost(op, vectorized, Wshift, size_T))
8383
total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
8484
end
8585
end
8686
total_cost
8787
end
8888

89-
# only covers unrolled ops; everything else considered lifted?
89+
# only covers vectorized ops; everything else considered lifted?
9090
function depchain_cost!(
91-
skip::Vector{Bool}, op::Operation, unrolled::Symbol, Wshift::Int, size_T::Int, rt::Float64 = 0.0, sl::Int = 0
91+
skip::Vector{Bool}, op::Operation, vectorized::Symbol, Wshift::Int, size_T::Int, rt::Float64 = 0.0, sl::Int = 0
9292
)
9393
skip[identifier(op)] = true
9494
# depth first search
9595
for opp ∈ parents(op)
9696
skip[identifier(opp)] && continue
97-
rt, sl = depchain_cost!(skip, opp, unrolled, Wshift, size_T, rt, sl)
97+
rt, sl = depchain_cost!(skip, opp, vectorized, Wshift, size_T, rt, sl)
9898
end
9999
# Basically assuming memory and compute don't conflict, but everything else does
100100
# Ie, ignoring the fact that integer and floating point operations likely don't either
101101
if iscompute(op)
102-
rtᵢ, slᵢ = cost(op, unrolled, Wshift, size_T)
102+
rtᵢ, slᵢ = cost(op, vectorized, Wshift, size_T)
103103
rt += rtᵢ; sl += slᵢ
104104
end
105105
rt, sl
@@ -111,10 +111,10 @@ function parentsnotreduction(op::Operation)
111111
return true
112112
end
113113
function determine_unroll_factor(
114-
ls::LoopSet, order::Vector{Symbol}, unrolled::Symbol = first(order)
114+
ls::LoopSet, order::Vector{Symbol}, unrolled::Symbol, vectorized::Symbol = first(order)
115115
)
116116
size_T = biggest_type_size(ls)
117-
W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, unrolled), size_T)::Tuple{Int,Int}
117+
W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vectorized), size_T)::Tuple{Int,Int}
118118

119119
# The strategy is to use an unroll factor of 1, unless there appears to be loop carried dependencies (ie, num_reductions > 0)
120120
# The assumption here is that unrolling provides no real benefit, unless it is needed to enable OOO execution by breaking up these dependency chains
@@ -139,13 +139,13 @@ function determine_unroll_factor(
139139
for op ∈ operations(ls)
140140
dependson(op, unrolled) || continue
141141
if isreduction(op)
142-
rt, sl = depchain_cost!(visited_nodes, op, unrolled, Wshift, size_T)
142+
rt, sl = depchain_cost!(visited_nodes, op, vectorized, Wshift, size_T)
143143
latency = max(sl, latency)
144144
compute_recip_throughput += rt
145145
elseif isload(op)
146-
load_recip_throughput += first(cost(op, unrolled, Wshift, size_T))
146+
load_recip_throughput += first(cost(op, vectorized, Wshift, size_T))
147147
elseif isstore(op)
148-
store_recip_throughput += first(cost(op, unrolled, Wshift, size_T))
148+
store_recip_throughput += first(cost(op, vectorized, Wshift, size_T))
149149
end
150150
end
151151
recip_throughput = max(
@@ -240,16 +240,22 @@ function solve_tilesize(
240240
cost_vec::AbstractVector{Float64} = @view(ls.cost_vec[:,1]),
241241
reg_pressure::AbstractVector{Int} = @view(ls.reg_pres[:,1])
242242
)
243-
maxT = isstaticloop(ls, tiled) ? looprangehint(ls, tiled) : 4#REGISTER_COUNT
244-
maxU = isstaticloop(ls, unrolled) ? looprangehint(ls, unrolled) : 8#REGISTER_COUNT
243+
maxT = 4
244+
maxU = 8
245+
if isstaticloop(ls, tiled)
246+
maxT = min(maxT, looprangehint(ls, tiled))
247+
end
248+
if isstaticloop(ls, unrolled)
249+
maxU = min(maxU, looprangehint(ls, unrolled))
250+
end
245251
solve_tilesize(cost_vec, reg_pressure, maxU, maxT)
246252
end
247253

248254
# Just tile outer two loops?
249255
# But optimal order within tile must still be determined
250256
# as well as size of the tiles.
251257
function evaluate_cost_tile(
252-
ls::LoopSet, order::Vector{Symbol}
258+
ls::LoopSet, order::Vector{Symbol}, vectorized::Symbol
253259
)
254260
N = length(order)
255261
@assert N ≥ 2 "Cannot tile merely $N loops!"
@@ -260,7 +266,7 @@ function evaluate_cost_tile(
260266
iter = 1.0
261267
# Need to check if fusion is possible
262268
size_T = biggest_type_size(ls)
263-
W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, unrolled), size_T)::Tuple{Int,Int}
269+
W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vectorized), size_T)::Tuple{Int,Int}
264270
# costs =
265271
# cost_mat[1] / ( unrolled * tiled)
266272
# cost_mat[2] / ( tiled)
@@ -293,7 +299,7 @@ function evaluate_cost_tile(
293299
rd = reduceddependencies(op)
294300
hasintersection(rd, nested_loop_syms[1:end-length(rd)]) && return 0,0,Inf
295301
included_vars[id] = true
296-
rt, lat, rp = cost(op, unrolled, Wshift, size_T)
302+
rt, lat, rp = cost(op, vectorized, Wshift, size_T)
297303
# @show instruction(op), rt, lat, rp, iter
298304
rt *= iter
299305
isunrolled = unrolled ∈ loopdependencies(op)
@@ -367,48 +373,54 @@ function choose_unroll_order(ls::LoopSet, lowest_cost::Float64 = Inf)
367373
lo = LoopOrders(ls)
368374
best_order = lo.syms
369375
new_order, state = iterate(lo) # right now, new_order === best_order
376+
best_vec = first(new_order)
370377
while true
371-
cost_temp = evaluate_cost_unroll(ls, new_order, lowest_cost)
372-
if cost_temp < lowest_cost
373-
lowest_cost = cost_temp
374-
best_order = new_order
378+
for new_vec ∈ new_order
379+
cost_temp = evaluate_cost_unroll(ls, new_order, lowest_cost, new_vec)
380+
if cost_temp < lowest_cost
381+
lowest_cost = cost_temp
382+
best_order = new_order
383+
best_vec = new_vec
384+
end
375385
end
376386
iter = iterate(lo, state)
377-
iter === nothing && return best_order, lowest_cost
387+
iter === nothing && return best_order, best_vec, lowest_cost
378388
new_order, state = iter
379389
end
380390
end
381391
function choose_tile(ls::LoopSet)
382392
lo = LoopOrders(ls)
383393
best_order = copyto!(ls.loop_order.bestorder, lo.syms)
394+
best_vec = first(best_order) # filler
384395
new_order, state = iterate(lo) # right now, new_order === best_order
385396
U, T, lowest_cost = 0, 0, Inf
386397
while true
387-
U_temp, T_temp, cost_temp = evaluate_cost_tile(ls, new_order)
388-
if cost_temp < lowest_cost
389-
lowest_cost = cost_temp
390-
U, T = U_temp, T_temp
391-
copyto!(best_order, new_order)
392-
save_tilecost!(ls)
398+
for new_vec ∈ @view(new_order[2:end]) # view to skip first
399+
U_temp, T_temp, cost_temp = evaluate_cost_tile(ls, new_order, new_vec)
400+
if cost_temp < lowest_cost
401+
lowest_cost = cost_temp
402+
U, T = U_temp, T_temp
403+
best_vec = new_vec
404+
copyto!(best_order, new_order)
405+
save_tilecost!(ls)
406+
end
393407
end
394408
iter = iterate(lo, state)
395-
iter === nothing && return best_order, U, T, lowest_cost
409+
iter === nothing && return best_order, best_vec, U, T, lowest_cost
396410
new_order, state = iter
397411
end
398412
end
399413
function choose_order(ls::LoopSet)
400414
if num_loops(ls) > 1
401-
torder, tU, tT, tc = choose_tile(ls)
415+
torder, tvec, tU, tT, tc = choose_tile(ls)
402416
else
403417
tc = Inf
404418
end
405-
uorder, uc = choose_unroll_order(ls, tc)
406-
if num_loops(ls) <= 1 || tc > uc # if tc == uc, then that probably means we want tc, and no unrolled managed to beat the tiled cost
407-
# copyto!(ls.loop_order.loopnames, uorder)
408-
return uorder, determine_unroll_factor(ls, uorder), -1
419+
uorder, uvec, uc = choose_unroll_order(ls, tc)
420+
if num_loops(ls) > 1 && tc < uc
421+
return torder, tvec, tU, tT
409422
else
410-
# copyto!(ls.loop_order.loopnames, torder)
411-
return torder, tU, tT
423+
return uorder, uvec, determine_unroll_factor(ls, uorder, first(uorder), uvec), -1
412424
end
413425
end
414426

src/graphs.jl

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,28 @@ looprangesym(ls::LoopSet, s::Symbol) = ls.loops[s].rangesym
161161
getop(ls::LoopSet, s::Symbol) = ls.opdict[s]
162162
getop(ls::LoopSet, i::Int) = ls.operations[i + 1]
163163

164+
@inline extract_val(::Val{N}) where {N} = N
165+
function determine_veced_increment(ls::LoopSet, iter::Symbol, isunrolled::Bool, W::Symbol, U::Int) # , istiled::Bool, ..., T::Int # may not be tiled
166+
if isunrolled
167+
Expr(:call, lv(:valmul), W, U)
168+
# elseif istiled
169+
# Expr(:call, lv(:valmul), W, T)
170+
else
171+
Expr(:call, lv(:extract_val), W)
172+
end
173+
end
174+
function vec_looprange(ls::LoopSet, s::Symbol, isunrolled::Bool, W::Symbol, U::Int, loop = ls.loops[s])
175+
incr = if isunrolled
176+
Expr(:call, lv(:valmuladd), W, U, -1)
177+
else
178+
Expr(:call, lv(:valsub), W, 1)
179+
end
180+
if loop.hintexact
181+
Expr(:call, :<, mangledname, Expr(:call, :-, loop.rangehint, incr))
182+
else
183+
Expr(:call, :<, mangledname, Expr(:call, :-, loop.rangesym, incr))
184+
end
185+
end
164186
function looprange(ls::LoopSet, s::Symbol, incr::Int = 1, mangledname::Symbol = s, loop = ls.loops[s])
165187
incr -= 1
166188
if iszero(incr)
@@ -178,6 +200,7 @@ function looprange(ls::LoopSet, s::Symbol, incr::Expr, mangledname::Symbol = s,
178200
end
179201
Expr(:call, :<, mangledname, increxpr)
180202
end
203+
181204
function Base.length(ls::LoopSet, is::Symbol)
182205
ls.loops[is].rangehint
183206
end

0 commit comments

Comments
 (0)