Skip to content

Commit c115196

Browse files
committed
Some more internal restructuring; loop indices are now internally symbols; ints are CSE-ed as views (expressions are bound to symbols/converted to a more SSA-like form).
1 parent 07a97c2 commit c115196

13 files changed

+190
-106
lines changed

Project.toml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,11 +13,11 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1313

1414
[compat]
1515
MacroTools = "0.5"
16-
Parameters = "0.12.0"
17-
SIMDPirates = "0.1.6"
18-
SLEEFPirates = "0.1.3"
19-
VectorizationBase = "0.1.9"
20-
julia = "1.3.0"
16+
Parameters = ">=0.12.0"
17+
SIMDPirates = ">=0.1.7"
18+
SLEEFPirates = ">=0.1.3"
19+
VectorizationBase = ">=0.1.10"
20+
julia = "1.3"
2121

2222
[extras]
2323
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

src/LoopVectorization.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ module LoopVectorization
33
using VectorizationBase, SIMDPirates, SLEEFPirates, MacroTools, Parameters
44
using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector_load_expr,
55
mask, masktable, pick_vector_width_val, valmul, valrem, valmuladd, valadd, valsub, _MM,
6-
maybestaticlength, maybestaticsize, Static, staticm1
6+
maybestaticlength, maybestaticsize, Static, staticm1, subsetview
77
using SIMDPirates: VECTOR_SYMBOLS, evadd, evmul, vrange, reduced_add, reduced_prod, reduce_to_add, reduce_to_prod
88
using Base.Broadcast: Broadcasted, DefaultArrayStyle
99
using LinearAlgebra: Adjoint, Transpose

src/add_loads.jl

Lines changed: 1 addition & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ function add_load!(
99
ls::LoopSet, var::Symbol, mpref::ArrayReferenceMetaPosition, elementbytes::Int = 8
1010
)
1111
length(mpref.loopdependencies) == 0 && return add_constant!(ls, var, mpref, elementbytes)
12-
ref = mpref.mref.ref
12+
ref = mpref.mref
1313
# try to CSE
1414
id = findfirst(r -> r == ref, ls.refs_aliasing_syms)
1515
if id === nothing
@@ -29,13 +29,6 @@ end
2929
function add_simple_load!(
3030
ls::LoopSet, var::Symbol, ref::ArrayReference, elementbytes::Int = 8
3131
)
32-
# if ref.loaded[] == true
33-
# op = getop(ls, var, elementbytes)
34-
# @assert var === op.variable
35-
# return op
36-
# end
37-
# loopset = keys(ls.loops)
38-
# loopdeps = Symbol[s for s ∈ loopdependencies(ref) if (s isa Symbol && s ∈ loopset)]
3932
loopdeps = Symbol[s for s ∈ ref.indices]
4033
mref = ArrayReferenceMeta(
4134
ref, fill(true, length(loopdeps))

src/add_stores.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ function add_store!(
2626
id = nops
2727
if pvar ∉ ls.syms_aliasing_refs
2828
push!(ls.syms_aliasing_refs, pvar)
29-
push!(ls.refs_aliasing_syms, mpref.mref.ref)
29+
push!(ls.refs_aliasing_syms, mpref.mref)
3030
# add_unique_store!(ls, mref, parents, ldref, reduceddeps, elementbytes)
3131
else
3232
# try to cse store

src/broadcast.jl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ function add_broadcast!(
5252
pushpreamble!(ls, Expr(:(=), K, Expr(:call, :size, mB, 1)))
5353

5454
k = gensym(:k)
55-
ls.loops[k] = Loop(k, 0, K)
55+
add_loop!(ls, Loop(k, 0, K), k)
5656
m = loopsyms[1];
5757
if ndims(B) == 1
5858
bloopsyms = Symbol[k]
@@ -100,7 +100,7 @@ function add_broadcast!(
100100
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol},
101101
::Type{<:LowDimArray{D,T,N}}, elementbytes::Int = 8
102102
) where {D,T,N}
103-
fulldims = Union{Symbol,Int}[loopsyms[n] for n ∈ 1:N if D[n]]
103+
fulldims = Symbol[loopsyms[n] for n ∈ 1:N if D[n]]
104104
ref = ArrayReference(bcname, fulldims)
105105
add_simple_load!(ls, destname, ref, elementbytes )::Operation
106106
end
@@ -109,13 +109,13 @@ function add_broadcast_adjoint_array!(
109109
) where {T,N,A<:AbstractArray{T,N}}
110110
parent = gensym(:parent)
111111
pushpreamble!(ls, Expr(:(=), parent, Expr(:call, :parent, bcname)))
112-
ref = ArrayReference(parent, Union{Symbol,Int}[loopsyms[N + 1 - n] for n ∈ 1:N])
112+
ref = ArrayReference(parent, Symbol[loopsyms[N + 1 - n] for n ∈ 1:N])
113113
add_simple_load!( ls, destname, ref, elementbytes )::Operation
114114
end
115115
function add_broadcast_adjoint_array!(
116116
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol}, ::Type{<:AbstractVector}, elementbytes::Int = 8
117117
)
118-
ref = ArrayReference(bcname, Union{Symbol,Int}[loopsyms[2]])
118+
ref = ArrayReference(bcname, Symbol[loopsyms[2]])
119119
add_simple_load!( ls, destname, ref, elementbytes )
120120
end
121121
function add_broadcast!(
@@ -147,7 +147,7 @@ function add_broadcast!(
147147
ls::LoopSet, destname::Symbol, bcname::Symbol, loopsyms::Vector{Symbol},
148148
::Type{SubArray{T,N,A,S,B}}, elementbytes::Int = 8
149149
) where {T,N,N2,A<:AbstractArray{T,N2},B,N3,S <: Tuple{Int,Vararg{Any,N3}}}
150-
inds = Vector{Union{Int,Symbol}}(undef, N+1)
150+
inds = Vector{Symbol}(undef, N+1)
151151
inds[1] = Symbol("##DISCONTIGUOUSSUBARRAY##")
152152
inds[2:end] .= @view(loopsyms[1:N])
153153
add_simple_load!(ls, destname, ArrayReference(bcname, inds), elementbytes)
@@ -192,7 +192,7 @@ end
192192
sizes = Expr(:tuple)
193193
for (n,itersym) ∈ enumerate(loopsyms)
194194
Nsym = gensym(:N)
195-
ls.loops[itersym] = Loop(itersym, 0, Nsym)
195+
add_loop!(ls, Loop(itersym, 0, Nsym), itersym)
196196
push!(sizes.args, Nsym)
197197
end
198198
pushpreamble!(ls, Expr(:(=), sizes, Expr(:call, :size, :dest)))
@@ -217,7 +217,7 @@ end
217217
sizes = Expr(:tuple)
218218
for (n,itersym) ∈ enumerate(loopsyms)
219219
Nsym = gensym(:N)
220-
ls.loops[itersym] = Loop(itersym, 0, Nsym)
220+
add_loop!(ls, Loop(itersym, 0, Nsym), itersym)
221221
push!(sizes.args, Nsym)
222222
end
223223
pushpreamble!(ls, Expr(:(=), sizes, Expr(:call, :size, :dest′)))

src/condense_loopset.jl

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,65 @@
11

2+
@enum IndexType::UInt8 NotAnIndex=0 LoopIndex=1 ComputedIndex=2 SymbolicIndex=3 LiteralIndex=4
3+
4+
struct ArrayRefStruct
5+
index_types::UInt64
6+
indices::UInt64
7+
end
8+
tup_to_vec(t::NTuple{W,T}) where {W,T} = ntuple(Val(W)) do w @inbounds Core.VecElement(t[w]) end
9+
vec_to_tup(v::Vec{W,T}) where {W,T} = ntuple(Val(W)) do w @inbounds (v[w]).value end
10+
vec_to_tup(v::SVec{W,T}) where {W,T} = ntuple(Val(W)) do w @inbounds (v[w]) end
11+
function ArrayRefStruct(ls::LoopSet, mref::ArrayReferenceMeta)
12+
index_types = zero(UInt64)
13+
indices = vbroadcast(SVec{8,UInt64}, zero(UInt64))
14+
indv = mref.ref.indices
15+
start = 1 + (first(indv) === Symbol("##DISCONTIGUOUSSUBARRAY##"))
16+
for (n,ind) ∈ enumerate(@view(indv[start:end]))
17+
index_types <<= 8
18+
indices <<= 16
19+
if ind isa Int
20+
21+
elseif mref.loopindex[n]
22+
else
23+
end
24+
end
25+
ArrayRefStruct( index_types, vec_to_tup(indices) )
26+
end
27+
28+
struct OperationStruct
29+
instruction::Instruction
30+
loopdeps::UInt64
31+
reduceddeps::UInt64
32+
parents::UInt64
33+
array::UInt64
34+
end
35+
function findmatchingarray(ls::LoopSet, array::Symbol)
36+
id = zero(UInt64)
37+
for (as,_) ∈ ls.includedarrays
38+
id += one(UInt64)
39+
if as === arraysym
40+
return id
41+
end
42+
end
43+
zero(UInt64)
44+
end
45+
filled_4byte_chunks(u::UInt64) = leading_zeros(u) >> 2
46+
num_loop_deps(os::OperationStruct) = filled_4byte_chunks(os.loopdeps)
47+
num_reduced_deps(os::OperationStruct) = filled_4byte_chunks(os.reduced_deps)
48+
num_parents(os::OperationStruct) = filled_4byte_chunks(os.parents)
49+
50+
function loodeps_uint(ls::LoopSet, op::Operation)
51+
ld = zero(UInt64) # leading_zeros(ld) >> 2 yields the number of loopdeps
52+
for d ∈ loopdependencies(op)
53+
ld <<= 4
54+
ld |= getloopid(ls, d)
55+
end
56+
ld
57+
end
58+
function OperationStruct(ls::LoopSet, op::Operation)
59+
instr = instruction(op)
60+
array = accesses_memory(op) ? findmatchingarray(ls, name(op.ref)) : zero(UInt64)
61+
62+
end
263
## turn a LoopSet into a type object which can be used to reconstruct the LoopSet.
364

465

src/costs.jl

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@ Base.convert(::Type{Instruction}, instr::Symbol) = Instruction(instr)
88
lower(instr::Instruction) = Expr(:(.), instr.mod, QuoteNode(instr.instr))
99
Base.Expr(instr::Instruction, args...) = Expr(:call, lower(instr), args...)::Expr
1010
Base.hash(instr::Instruction, h::UInt64) = hash(instr.instr, hash(instr.mod, h))
11+
function Base.isless(instr1::Instruction, instr2::Instruction)
12+
if instr1.mod === instr2.mod
13+
isless(instr1.instr, instr2.instr)
14+
else
15+
isless(instr1.mod, instr2.mod)
16+
end
17+
end
1118

1219
const LOOPCONSTANT = Instruction(gensym())
1320

src/determinestrategy.jl

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -287,8 +287,8 @@ function solve_tilesize(
287287
)
288288
maxT = 4#8
289289
maxU = 4#8
290-
tiledloop = ls.loops[tiled]
291-
unrolledloop = ls.loops[unrolled]
290+
tiledloop = getloop(ls, tiled)
291+
unrolledloop = getloop(ls, unrolled)
292292
if isstaticloop(tiledloop)
293293
maxT = min(4maxT, length(tiledloop))
294294
end
@@ -314,6 +314,27 @@ function set_upstream_family!(adal::Vector{T}, op::Operation, val::T) where {T}
314314
end
315315
end
316316

317+
function stride_penalty(ls::LoopSet, op::Operation, order::Vector{Symbol})
318+
num_loops = length(order)
319+
contigsym = first(loopdependencies(op))
320+
contigsym == Symbol("##DISCONTIGUOUSSUBARRAY##") && return 0
321+
iter = 0
322+
for i ∈ 0:num_loops - 1
323+
loopsym = order[num_loops - i]
324+
loopsym === contigsym && return iter
325+
iter *= length(getloop(ls, loopsym))
326+
end
327+
iter
328+
end
329+
function stride_penalty(ls::LoopSet, order::Vector{Symbol})
330+
stridepenalty = 0
331+
for op ∈ operations(ls)
332+
if accesses_memory(op)
333+
stridepenalty += stride_penalty(ls, op, order)
334+
end
335+
end
336+
stridepenalty * 1e-9
337+
end
317338
# Just tile outer two loops?
318339
# But optimal order within tile must still be determined
319340
# as well as size of the tiles.
@@ -402,7 +423,8 @@ function evaluate_cost_tile(
402423
end
403424
# @show order, vectorized cost_vec reg_pressure
404425
# @show solve_tilesize(ls, unrolled, tiled, cost_vec, reg_pressure)
405-
solve_tilesize(ls, unrolled, tiled, cost_vec, reg_pressure)
426+
U, T, tcost = solve_tilesize(ls, unrolled, tiled, cost_vec, reg_pressure)
427+
U, T, tcost + stride_penalty(ls, order)
406428
end
407429

408430

@@ -411,7 +433,7 @@ struct LoopOrders
411433
buff::Vector{Symbol}
412434
end
413435
function LoopOrders(ls::LoopSet)
414-
syms = [s for s ∈ keys(ls.loops)]
436+
syms = copy(ls.loopsymbols)
415437
LoopOrders(syms, similar(syms))
416438
end
417439
function Base.iterate(lo::LoopOrders)

src/graphs.jl

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,8 @@ Base.@propagate_inbounds Base.getindex(lo::LoopOrder, i...) = lo.oporder[LinearI
143143
# Must make it easy to iterate
144144
# outer_reductions is a vector of indices (within operation vectors) of the reduction operation, eg the vmuladd op in a dot product
145145
struct LoopSet
146-
loops::Dict{Symbol,Loop} # sym === loops[sym].itersymbol
146+
loopsymbols::Vector{Symbol}
147+
loops::Vector{Loop}
147148
opdict::Dict{Symbol,Operation}
148149
operations::Vector{Operation} # Split them to make it easier to iterate over just a subset
149150
outer_reductions::Vector{Int} # IDs of reduction operations that need to be reduced at end.
@@ -155,9 +156,9 @@ struct LoopSet
155156
preamble_symfloat::Vector{Tuple{Int,Float64}}
156157
preamble_zeros::Vector{Int}
157158
preamble_ones::Vector{Int}
158-
includedarrays::Vector{Tuple{Symbol,Int}}
159+
includedarrays::Vector{Symbol}
159160
syms_aliasing_refs::Vector{Symbol} # O(N) search is faster at small sizes
160-
refs_aliasing_syms::Vector{ArrayReference}
161+
refs_aliasing_syms::Vector{ArrayReferenceMeta}
161162
cost_vec::Matrix{Float64}
162163
reg_pres::Matrix{Int}
163164
included_vars::Vector{Bool}
@@ -211,15 +212,11 @@ function pushpreamble!(ls::LoopSet, op::Operation, RHS::Expr)
211212
nothing
212213
end
213214

214-
function includesarray(ls::LoopSet, array::Symbol)
215-
for (a,i) ∈ ls.includedarrays
216-
a === array && return i
217-
end
218-
-1
219-
end
215+
includesarray(ls::LoopSet, array::Symbol) = array ∈ ls.includedarrays
216+
220217
function LoopSet()
221218
LoopSet(
222-
Dict{Symbol,Loop}(),
219+
Symbol[], Loop[],
223220
Dict{Symbol,Operation}(),
224221
Operation[],
225222
Int[],
@@ -231,7 +228,7 @@ function LoopSet()
231228
Int[],Int[],
232229
Tuple{Symbol,Int}[],
233230
Symbol[],
234-
ArrayReference[],
231+
ArrayReferenceMeta[],
235232
Matrix{Float64}(undef, 4, 2),
236233
Matrix{Int}(undef, 4, 2),
237234
Bool[], Bool[], gensym(:W), gensym(:T)
@@ -243,11 +240,17 @@ function oporder(ls::LoopSet)
243240
reshape(ls.loop_order.oporder, (2,2,2,N))
244241
end
245242
names(ls::LoopSet) = ls.loop_order.loopnames
246-
Base.length(ls::LoopSet, s::Symbol) = length(ls.loops[s])
247-
isstaticloop(ls::LoopSet, s::Symbol) = isstaticloop(ls.loops[s])
248-
looprangehint(ls::LoopSet, s::Symbol) = length(ls.loops[s])
249-
looprangesym(ls::LoopSet, s::Symbol) = ls.loops[s].rangesym
250-
# itersyms(ls::LoopSet) = keys(ls.loops)
243+
function getloopid(ls::LoopSet, s::Symbol)::Int
244+
for (loopnum,sym) ∈ enumerate(ls.loopsymbols)
245+
s === sym && return loopnum
246+
end
247+
end
248+
getloop(ls::LoopSet, s::Symbol) = ls.loops[getloopid(ls, s)]
249+
Base.length(ls::LoopSet, s::Symbol) = length(getloop(ls, s))
250+
251+
isstaticloop(ls::LoopSet, s::Symbol) = isstaticloop(getloop(ls,s))
252+
looprangehint(ls::LoopSet, s::Symbol) = length(getloop(ls, s))
253+
looprangesym(ls::LoopSet, s::Symbol) = getloop(ls, s).rangesym
251254
function getop(ls::LoopSet, var::Symbol, elementbytes::Int = 8)
252255
get!(ls.opdict, var) do
253256
# might add constant
@@ -377,7 +380,7 @@ function register_single_loop!(ls::LoopSet, looprange::Expr)
377380
else
378381
throw("Unrecognized loop range type: $r.")
379382
end
380-
ls.loops[itersym] = loop
383+
add_loop!(ls, loop, itersym)
381384
nothing
382385
end
383386
function register_loop!(ls::LoopSet, looprange::Expr)
@@ -399,8 +402,10 @@ function add_loop!(ls::LoopSet, q::Expr, elementbytes::Int = 8)
399402
push!(ls, q, elementbytes)
400403
end
401404
end
402-
function add_loop!(ls::LoopSet, loop::Loop)
403-
ls.loops[loop.itersym] = loop
405+
function add_loop!(ls::LoopSet, loop::Loop, itersym::Symbol = loop.itersym)
406+
push!(ls.loopsymbols, itersym)
407+
push!(ls.loops, loop)
408+
nothing
404409
end
405410

406411
function instruction(x)
@@ -422,7 +427,7 @@ function add_operation!(
422427
elseif f === :zero || f === :one
423428
c = gensym(f)
424429
# pushpreamble!(ls, Expr(:(=), c, RHS))
425-
op = add_constant!(ls, c, [keys(ls.loops)...], LHS, f, elementbytes)
430+
op = add_constant!(ls, c, copy(ls.loopsymbols), LHS, f, elementbytes)
426431
push!(f === :zero ? ls.preamble_zeros : ls.preamble_ones, identifier(op))
427432
op
428433
else
@@ -447,7 +452,7 @@ function add_operation!(
447452
elseif f === :zero || f === :one
448453
c = gensym(f)
449454
# pushpreamble!(ls, Expr(:(=), c, RHS))
450-
op = add_constant!(ls, c, [keys(ls.loops)...], LHS_sym, f, elementbytes)
455+
op = add_constant!(ls, c, copy(ls.loopsymbols), LHS_sym, f, elementbytes)
451456
push!(f === :zero ? ls.preamble_zeros : ls.preamble_ones, identifier(op))
452457
op
453458
else
@@ -475,7 +480,7 @@ function Base.push!(ls::LoopSet, ex::Expr, elementbytes::Int = 8)
475480
if RHS isa Expr
476481
add_operation!(ls, LHS, RHS, elementbytes)
477482
else
478-
deps = [keys(ls.loops)...]
483+
deps = copy(ls.loopsymbols)
479484
if RHS isa Number
480485
fisone = false
481486
fiszero = false

0 commit comments

Comments
 (0)