Skip to content

Commit 15d5c66

Browse files
committed
WIP: support CartesianIndex
1 parent f09c753 commit 15d5c66

10 files changed

+90
-52
lines changed

src/add_loads.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ function add_load!(
2424
ls::LoopSet, var::Symbol, mpref::ArrayReferenceMetaPosition, elementbytes::Int
2525
)
2626
length(mpref.loopdependencies) == 0 && return add_constant!(ls, var, mpref, elementbytes)
27-
ref = mpref.mref
2827
op = Operation( ls, var, elementbytes, :getindex, memload, mpref )
2928
add_load!(ls, op, true, false)
3029
end

src/condense_loopset.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,11 @@
44
Base.:|(u::Unsigned, it::IndexType) = u | UInt8(it)
55
Base.:(==)(u::Unsigned, it::IndexType) = (u % UInt8) == UInt8(it)
66

7+
"""
8+
`ArrayRefStruct` stores a representation of an array-reference expression such as `A[i,j]`.
9+
It supports array-references with up to 8 indexes, where the data for each consecutive index is packed into corresponding 8-bit fields
10+
of `index_types` (storing the enum `IndexType`), `indices` (the `id` for each index symbol), and `offsets` (currently unused).
11+
"""
712
struct ArrayRefStruct
813
index_types::UInt64
914
indices::UInt64
@@ -392,4 +397,3 @@ function setup_call(ls::LoopSet, inline = Int8(2), U = zero(Int8), T = zero(Int8
392397
setup_call_noinline(ls, U, T)
393398
end
394399
end
395-

src/determinestrategy.jl

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ function findparent(ls::LoopSet, s::Symbol)#opdict isn't filled when reconstruct
2121
end
2222
function unitstride(ls::LoopSet, op::Operation, s::Symbol)
2323
inds = getindices(op)
24-
li = op.ref.loopedindex
24+
li, lookup = op.ref.loopedindex, op.ref.indexlookup
2525
# The first index is allowed to be indexed by `s`
2626
fi = first(inds)
2727
if fi === Symbol("##DISCONTIGUOUSSUBARRAY##")
@@ -32,7 +32,7 @@ function unitstride(ls::LoopSet, op::Operation, s::Symbol)
3232
indexappearences(parent, s) > 1 && return false
3333
end
3434
for i ∈ 2:length(inds)
35-
if li[i]
35+
if li[lookup[i]]
3636
s === inds[i] && return false
3737
else
3838
parent = findparent(ls, inds[i])
@@ -344,7 +344,7 @@ function maybedemotesize(T::Int, N::Int, U::Int, Uloop::Loop, maxTbase::Int)
344344
end
345345
function solve_tilesize(
346346
ls::LoopSet, unrolled::Symbol, tiled::Symbol,
347-
cost_vec::AbstractVector{Float64},
347+
cost_vec::AbstractVector{Float64},
348348
reg_pressure::AbstractVector{Int},
349349
W::Int, vectorized::Symbol
350350
)
@@ -436,7 +436,7 @@ function evaluate_cost_tile(
436436
# Need to check if fusion is possible
437437
size_T = biggest_type_size(ls)
438438
W, Wshift = VectorizationBase.pick_vector_width_shift(length(ls, vectorized), size_T)::Tuple{Int,Int}
439-
# costs =
439+
# costs =
440440
# cost_mat[1] / ( unrolled * tiled)
441441
# cost_mat[2] / ( tiled)
442442
# cost_mat[3] / ( unrolled)
@@ -569,7 +569,7 @@ function choose_unroll_order(ls::LoopSet, lowest_cost::Float64 = Inf)
569569
iter = iterate(lo, state)
570570
iter === nothing && return best_order, best_vec, lowest_cost
571571
new_order, state = iter
572-
end
572+
end
573573
end
574574
function choose_tile(ls::LoopSet)
575575
lo = LoopOrders(ls)
@@ -627,4 +627,3 @@ function register_pressure(ls::LoopSet)
627627
tU * tT * rp[1] + tU * rp[2] + rp[3] + rp[4]
628628
end
629629
end
630-

src/graphs.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ Base.@propagate_inbounds Base.getindex(lo::LoopOrder, i...) = lo.oporder[LinearI
174174
# O(N) search is faster at small sizes
175175
struct LoopSet
176176
loopsymbols::Vector{Symbol}
177+
loopsymbol_offsets::Vector{Int} # symbol loopsymbols[i] corresponds to loops[lso[i]+1:lso[i+1]] (CartesianIndex handling)
177178
loops::Vector{Loop}
178179
opdict::Dict{Symbol,Operation}
179180
operations::Vector{Operation} # Split them to make it easier to iterate over just a subset
@@ -281,7 +282,7 @@ includesarray(ls::LoopSet, array::Symbol) = array ∈ ls.includedarrays
281282

282283
function LoopSet(mod::Symbol)# = :LoopVectorization)
283284
LoopSet(
284-
Symbol[], Loop[],
285+
Symbol[], [0], Loop[],
285286
Dict{Symbol,Operation}(),
286287
Operation[],
287288
Int[],

src/lower_load.jl

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ function pushvectorload!(q::Expr, op::Operation, var::Symbol, td::UnrollArgs, U:
88
end
99
push!(q.args, Expr(:(=), name, instrcall))
1010
end
11-
function lower_load_scalar!(
11+
function lower_load_scalar!(
1212
q::Expr, op::Operation, vectorized::Symbol, W::Symbol, unrolled::Symbol, tiled::Symbol, U::Int,
1313
suffix::Union{Nothing,Int}, mask::Union{Nothing,Symbol,Unsigned} = nothing
1414
)
@@ -60,6 +60,3 @@ function lower_load!(
6060
lower_load_scalar!(q, op, vectorized, W, unrolled, tiled, U, suffix, mask)
6161
end
6262
end
63-
64-
65-

src/lower_memory_common.jl

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,12 @@ function mem_offset(op::Operation, td::UnrollArgs)
2727
# @assert accesses_memory(op) "Computing memory offset only makes sense for operations that access memory."
2828
ret = Expr(:tuple)
2929
indices = getindices(op)
30-
loopedindex = op.ref.loopedindex
30+
loopedindex, indexlookup = op.ref.loopedindex, op.ref.indexlookup
3131
start = (first(indices) === Symbol("##DISCONTIGUOUSSUBARRAY##")) + 1
3232
for (n,ind) ∈ enumerate(@view(indices[start:end]))
3333
if ind isa Int
3434
push!(ret.args, ind)
35-
elseif loopedindex[n]
35+
elseif loopedindex[indexlookup[n]]
3636
push!(ret.args, ind)
3737
else
3838
push!(ret.args, symbolind(ind, op, td))
@@ -46,7 +46,7 @@ function mem_offset_u(op::Operation, td::UnrollArgs)
4646
incr = u
4747
ret = Expr(:tuple)
4848
indices = getindices(op)
49-
loopedindex = op.ref.loopedindex
49+
loopedindex, indexlookup = op.ref.loopedindex, op.ref.indexlookup
5050
if incr == 0
5151
return mem_offset(op, td)
5252
# append_inds!(ret, indices, loopedindex)
@@ -57,7 +57,7 @@ function mem_offset_u(op::Operation, td::UnrollArgs)
5757
push!(ret.args, ind)
5858
elseif ind === unrolled
5959
push!(ret.args, Expr(:call, :+, ind, incr))
60-
elseif loopedindex[n]
60+
elseif loopedindex[indexlookup[n]]
6161
push!(ret.args, ind)
6262
else
6363
push!(ret.args, symbolind(ind, op, td))
@@ -117,4 +117,3 @@ function name_memoffset(var::Symbol, op::Operation, td::UnrollArgs, W::Symbol, v
117117
end
118118
name, mo
119119
end
120-

src/memory_ops_common.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ function array_reference_meta!(ls::LoopSet, array::Symbol, rawindices, elementby
6464
else
6565
indop = get(ls.opdict, ind, nothing)
6666
if indop !== nothing && !isconstant(indop)
67-
pushparent!(parents, loopdependencies, reduceddeps, parent)
67+
pushparent!(parents, loopdependencies, reduceddeps, parent) # FIXME where does `parent` come from?
6868
# var = get(ls.opdict, ind, nothing)
6969
push!(indices, name(parent)); ninds += 1
7070
push!(loopedindex, false)

src/operations.jl

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,11 @@ struct ArrayReferenceMeta
2020
ref::ArrayReference
2121
loopedindex::Vector{Bool}
2222
ptr::Symbol
23+
indexlookup::Vector{Int}
2324
end
24-
function ArrayReferenceMeta(ref::ArrayReference, loopedindex, ptr = vptr(ref))
25+
function ArrayReferenceMeta(ref::ArrayReference, loopedindex, ptr = vptr(ref), indexlookup = [i for i in 1:length(loopedindex)])
2526
ArrayReferenceMeta(
26-
ref, loopedindex, ptr
27+
ref, loopedindex, ptr, indexlookup
2728
)
2829
end
2930
# function Base.hash(x::ArrayReference, h::UInt)
@@ -174,7 +175,7 @@ These names will be further processed if op is tiled and/or unrolled.
174175
if tiled ∈ loopdependencies(op) # `suffix` is tilenumber
175176
mvar = Symbol(op, suffix, :_)
176177
end
177-
if unrolled ∈ loopdependencies(op) # `u` is unroll number
178+
if unrolled ∈ loopdependencies(op) # `u` is unroll number
178179
mvar = Symbol(op, u)
179180
end
180181
```
@@ -240,6 +241,3 @@ getindices(op::Operation) = op.ref.ref.indices
240241
# # access stride info?
241242
# op.numerical_metadata[symposition(op,sym)]
242243
# end
243-
244-
245-

src/reconstruct_loopset.jl

Lines changed: 46 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,31 @@ function Loop(ls, l, ::Type{StaticUnitRange{L,U}}) where {L,U}
2525
Loop(gensym(:n), L, U, Symbol(""), Symbol(""), true, true)::Loop
2626
end
2727

28+
function Loop(ls::LoopSet, l::Int, k::Int, ::Type{<:CartesianIndices{N}}) where N
29+
start = gensym(:loopstart); stop = gensym(:loopstop)
30+
axisexpr = Expr(:ref, Expr(:., Expr(:ref, :lb, l), QuoteNode(:indices)), k)
31+
pushpreamble!(ls, Expr(:(=), start, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:(.), axisexpr, QuoteNode(:start)))))
32+
pushpreamble!(ls, Expr(:(=), stop, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__, Symbol(@__FILE__)), Expr(:(.), axisexpr, QuoteNode(:stop)))))
33+
Loop(gensym(:n), 0, 1024, start, stop, false, false)::Loop
34+
end
35+
2836
function add_loops!(ls::LoopSet, LB)
29-
loopsyms = [gensym(:n) for _ ∈ eachindex(LB)]
3037
for (i,l) ∈ enumerate(LB)
31-
add_loop!(ls, Loop(ls, i, l)::Loop)
38+
if l<:CartesianIndices
39+
add_loops!(ls, i, l)
40+
else
41+
add_loop!(ls, Loop(ls, i, l)::Loop)
42+
push!(ls.loopsymbol_offsets, ls.loopsymbol_offsets[end]+1)
43+
end
3244
end
3345
end
46+
function add_loops!(ls, i, l::Type{<:CartesianIndices{N}}) where N
47+
for k = N:-1:1
48+
add_loop!(ls, Loop(ls, i, k, l)::Loop)
49+
end
50+
push!(ls.loopsymbol_offsets, ls.loopsymbol_offsets[end]+N)
51+
end
52+
3453
function ArrayReferenceMeta(
3554
ls::LoopSet, ar::ArrayRefStruct, arraysymbolinds::Vector{Symbol}, opsymbols::Vector{Symbol},
3655
array::Symbol, vp::Symbol
@@ -39,21 +58,28 @@ function ArrayReferenceMeta(
3958
indices = ar.indices
4059
offsets = ar.offsets
4160
ni = filled_8byte_chunks(index_types)
42-
index_vec = Vector{Symbol}(undef, ni)
61+
index_vec = Symbol[]
4362
offset_vec = Vector{Int8}(undef, ni)
4463
loopedindex = fill(false, ni)
64+
indexlookup = Int[]
4565
while index_types != zero(UInt64)
4666
ind = indices % UInt8
47-
symind = if index_types == LoopIndex
67+
if index_types == LoopIndex
68+
for inda in ls.loopsymbol_offsets[ind]+1:ls.loopsymbol_offsets[ind+1]
69+
pushfirst!(index_vec, ls.loopsymbols[inda])
70+
pushfirst!(indexlookup, ni)
71+
end
4872
loopedindex[ni] = true
49-
ls.loopsymbols[ind]
50-
elseif index_types == ComputedIndex
51-
opsymbols[ind]
5273
else
53-
@assert index_types == SymbolicIndex
54-
arraysymbolinds[ind]
74+
symind = if index_types == ComputedIndex
75+
opsymbols[ind]
76+
else
77+
@assert index_types == SymbolicIndex
78+
arraysymbolinds[ind]
79+
end
80+
pushfirst!(index_vec, symind)
81+
pushfirst!(indexlookup, ni)
5582
end
56-
index_vec[ni] = symind
5783
offset_vec[ni] = offsets % Int8
5884
index_types >>>= 8
5985
indices >>>= 8
@@ -62,7 +88,7 @@ function ArrayReferenceMeta(
6288
end
6389
ArrayReferenceMeta(
6490
ArrayReference(array, index_vec, offset_vec),
65-
loopedindex, vp
91+
loopedindex, vp, indexlookup
6692
)
6793
end
6894

@@ -134,14 +160,16 @@ function process_metadata!(ls::LoopSet, AM, num_arrays::Int)::Vector{Symbol}
134160
arraysymbolinds
135161
end
136162
function parents_symvec(ls::LoopSet, u::Unsigned)
137-
i = filled_4byte_chunks(u)
138-
loops = Vector{Symbol}(undef, i)
163+
loops = Symbol[]
164+
offsets = ls.loopsymbol_offsets
139165
while u != zero(u)
140-
loops[i] = getloopsym(ls, ( u % UInt8 ) & 0x0f )
141-
i -= 1
166+
idx = ( u % UInt8 ) & 0x0f
167+
for j = offsets[idx]+1:offsets[idx+1]
168+
push!(loops, getloopsym(ls, j))
169+
end
142170
u >>= 4
143171
end
144-
loops
172+
return reverse!(loops)
145173
end
146174
loopdependencies(ls::LoopSet, os::OperationStruct) = parents_symvec(ls, os.loopdeps)
147175
reduceddependencies(ls::LoopSet, os::OperationStruct) = parents_symvec(ls, os.reduceddeps)
@@ -227,7 +255,7 @@ function avx_loopset(instr, ops, arf, AM, LB, vargs)
227255
num_arrays = length(arf)
228256
elementbytes = sizeofeltypes(vargs, num_arrays)
229257
add_loops!(ls, LB)
230-
resize!(ls.loop_order, length(LB))
258+
resize!(ls.loop_order, ls.loopsymbol_offsets[end])
231259
arraysymbolinds = process_metadata!(ls, AM, length(arf))
232260
opsymbols = [gensym(:op) for _ ∈ eachindex(ops)]
233261
mrefs = create_mrefs!(ls, arf, arraysymbolinds, opsymbols, vargs)
@@ -249,7 +277,7 @@ function _avx_loopset_debug(::Type{OPS}, ::Type{ARF}, ::Type{AM}, ::Type{LB}, va
249277
@show OPS ARF AM LB vargs
250278
_avx_loopset(OPS.parameters, ARF.parameters, AM.parameters, LB.parameters, typeof.(vargs))
251279
end
252-
function _avx_loopset(OPSsv, ARFsv, AMsv, LBsv, vargs) where {UT, OPS, ARF, AM, LB}
280+
function _avx_loopset(OPSsv, ARFsv, AMsv, LBsv, vargs)
253281
nops = length(OPSsv) ÷ 3
254282
instr = Instruction[Instruction(OPSsv[3i+1], OPSsv[3i+2]) for i ∈ 0:nops-1]
255283
ops = OperationStruct[ OPSsv[3i] for i ∈ 1:nops ]

test/offsetarrays.jl

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ using Test
8383
end
8484

8585

86-
86+
8787
struct SizedOffsetMatrix{T,LR,UR,LC,RC} <: AbstractMatrix{T}
8888
data::Matrix{T}
8989
end
@@ -141,25 +141,37 @@ using Test
141141
# lsuq = LoopVectorization.LoopSet(macroexpand(Base, uq));
142142
# LoopVectorization.choose_order(lsuq)
143143

144-
144+
145+
function avxgeneric!(out, A, kern, R=CartesianIndices(out), z=zero(eltype(out)))
146+
Rk = CartesianIndices(kern)
147+
@avx for I in R
148+
tmp = z
149+
for J in Rk
150+
tmp += A[I+J]*kern[J]
151+
end
152+
out[I] = tmp
153+
end
154+
out
155+
end
156+
145157
for T ∈ (Float32, Float64)
146158
@show T, @__LINE__
147159
A = rand(T, 100, 100);
148160
kern = OffsetArray(rand(T, 3, 3), -1:1, -1:1);
149161
skern = SizedOffsetMatrix{T,-1,1,-1,1}(parent(kern));
150162
out1 = OffsetArray(similar(A, size(A).-2), 1, 1); # stay away from the edges of A
151-
out2 = similar(out1); out3 = similar(out1);
163+
out2 = similar(out1); out3 = similar(out1); out4 = similar(out1)
152164

153165
old2d!(out1, A, kern);
154166
avx2d!(out2, A, kern);
155167
@test out1 ≈ out2
156-
168+
157169
avx2douter!(out3, A, kern);
158170
@test out1 ≈ out3
159171

160172
fill!(out2, NaN); avx2d!(out2, A, skern);
161173
@test out1 ≈ out2
162-
174+
163175
fill!(out3, NaN); avx2douter!(out3, A, skern);
164176
@test out1 ≈ out3
165177

@@ -168,11 +180,12 @@ using Test
168180

169181
fill!(out3, NaN); avx2dunrolled2x2!(out3, A, skern);
170182
@test out1 ≈ out3
171-
183+
172184
fill!(out3, NaN); avx2dunrolled3x3!(out3, A, skern);
173185
@test out1 ≈ out3
174-
end
175186

176-
177-
end
187+
@test_broken avxgeneric!(out4, A, kern) ≈ out1
188+
end
178189

190+
191+
end

0 commit comments

Comments
 (0)