Skip to content

Commit dfddc8d

Browse files
committed
Merge branch 'master' of github.com:chriselrod/LoopVectorization.jl
2 parents 49aa252 + 14370c9 commit dfddc8d

11 files changed

+321
-306
lines changed

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.38"
4+
version = "0.12.42"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -30,5 +30,5 @@ Static = "0.2"
3030
StrideArraysCore = "0.1.12"
3131
ThreadingUtilities = "0.4.2"
3232
UnPack = "1"
33-
VectorizationBase = "0.20.16"
33+
VectorizationBase = "0.20.17"
3434
julia = "1.5"

src/LoopVectorization.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ using Static: StaticInt, gt
44
using VectorizationBase, SLEEFPirates, UnPack, OffsetArrays
55
using VectorizationBase: register_size, register_count, cache_linesize, cache_size, has_opmask_registers,
66
mask, pick_vector_width, MM, AbstractMask, data, grouped_strided_pointer, AbstractSIMD,
7-
maybestaticlength, maybestaticsize, vzero, maybestaticrange, offsetprecalc, lazymul,
7+
vzero, offsetprecalc, lazymul,
88
vadd_nw, vadd_nsw, vadd_nuw, vsub_nw, vsub_nsw, vsub_nuw, vmul_nw, vmul_nsw, vmul_nuw,
99
maybestaticfirst, maybestaticlast, gep, gesp, NativeTypes, #llvmptr,
1010
vfmadd, vfmsub, vfnmadd, vfnmsub, vfmadd_fast, vfmsub_fast, vfnmadd_fast, vfnmsub_fast, vfmadd231, vfmsub231, vfnmadd231, vfnmsub231,
@@ -23,6 +23,7 @@ using VectorizationBase: register_size, register_count, cache_linesize, cache_si
2323
num_threads, num_cores,
2424
max_mask#,zero_mask
2525

26+
using VectorizationBase: maybestaticsize # for compatibility
2627

2728
using IfElse: ifelse
2829

src/broadcast.jl

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -385,7 +385,7 @@ function add_broadcast_loops!(ls::LoopSet, loopsyms::Vector{Symbol}, destsym::Sy
385385
push!(axes_tuple.args, Nrange)
386386
pushpreamble!(ls, Expr(:(=), Nlower, Expr(:call, lv(:maybestaticfirst), Nrange)))
387387
pushpreamble!(ls, Expr(:(=), Nupper, Expr(:call, lv(:maybestaticlast), Nrange)))
388-
pushpreamble!(ls, Expr(:(=), Nlen, Expr(:call, lv(:maybestaticlength), Nrange)))
388+
pushpreamble!(ls, Expr(:(=), Nlen, Expr(:call, GlobalRef(ArrayInterface,:static_length), Nrange)))
389389
end
390390
end
391391
# size of dest determines loops
@@ -465,9 +465,8 @@ end
465465
bc::Broadcasted, ::Val{Mod}, ::Val{UNROLL}
466466
) where {Mod,UNROLL}
467467
ElType = Base.Broadcast.combine_eltypes(bc.f, bc.args)
468-
@show ElType
469-
dest = similar(bc, ElType)
470-
vmaterialize!(dest, bc, Val{Mod}(), Val{UNROLL}())
468+
dest = similar(bc, ElType)
469+
vmaterialize!(dest, bc, Val{Mod}(), Val{UNROLL}())
471470
end
472471

473472
vmaterialize!(dest, bc, ::Val, ::Val, ::StaticInt, ::StaticInt, ::StaticInt) = Base.Broadcast.materialize!(dest, bc)

src/codegen/operation_evaluation_order.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ function load_short_static_reduction_first!(ls::LoopSet, u₁loop::Symbol, u₂l
142142
end
143143

144144
function fillorder!(ls::LoopSet, order::Vector{Symbol}, u₁loop::Symbol, u₂loop::Symbol, u₂max::Int, vectorized::Symbol)
145-
load_short_static_reduction_first!(ls, u₁loop, u₂loop, vectorized)
145+
load_short_static_reduction_first!(ls, u₁loop, u₂loop, vectorized)
146146
lo = ls.loop_order
147147
resize!(lo, length(ls.loopsymbols))
148148
ro = lo.loopnames # reverse order; will have same order as lo

src/codegen/split_loops.jl

Lines changed: 34 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -89,39 +89,41 @@ function returned_ops(ls::LoopSet)
8989
end
9090

9191
function lower_and_split_loops(ls::LoopSet, inline::Int)
92-
split_candidates = returned_ops(ls)
93-
length(split_candidates) > 1 || return lower(ls, inline)
94-
order_fused, unrolled_fused, tiled_fused, vectorized_fused, U_fused, T_fused, cost_fused, shouldinline_fused = choose_order_cost(ls)
95-
remaining_ops = Vector{Int}(undef, length(split_candidates) - 1); split_1 = Int[0];
96-
# for (ind,i) ∈ enumerate(split_candidates)
97-
for (ind,i) ∈ enumerate(split_candidates)
98-
split_1[1] = i
99-
ls_1 = split_loopset(ls, split_1)
100-
order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, cost_1, shouldinline_1 = choose_order_cost(ls_1)
101-
remaining_ops[1:ind-1] .= @view(split_candidates[1:ind-1]); remaining_ops[ind:end] .= @view(split_candidates[ind+1:end])
102-
ls_2 = split_loopset(ls, remaining_ops)
103-
order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = choose_order_cost(ls_2)
104-
# U_1 = T_1 = U_2 = T_2 = 2
105-
#@show cost_1 + cost_2 ≤ cost_fused, cost_1, cost_2, cost_fused
106-
if cost_1 + cost_2 ≤ cost_fused
107-
ls_2_lowered = if length(remaining_ops) > 1
108-
inline = iszero(inline) ? (shouldinline_1 % Int) : inline
109-
lower_and_split_loops(ls_2, inline)
110-
else
111-
doinline = inlinedecision(inline, shouldinline_1 | shouldinline_2)
112-
lower(ls_2, order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, doinline)
113-
end
114-
return Expr(
115-
:block,
116-
ls.preamble,
117-
lower(ls_1, order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, false),
118-
ls_2_lowered,
119-
nothing
120-
)
121-
end
92+
split_candidates = returned_ops(ls)
93+
length(split_candidates) > 1 || return lower(ls, inline)
94+
order_fused, unrolled_fused, tiled_fused, vectorized_fused, U_fused, T_fused, cost_fused, shouldinline_fused = choose_order_cost(ls)
95+
remaining_ops = Vector{Int}(undef, length(split_candidates) - 1); split_1 = Int[0];
96+
# for (ind,i) ∈ enumerate(split_candidates)
97+
for (ind,i) ∈ enumerate(split_candidates)
98+
split_1[1] = i
99+
ls_1 = split_loopset(ls, split_1)
100+
order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, cost_1, shouldinline_1 = choose_order_cost(ls_1)
101+
remaining_ops[1:ind-1] .= @view(split_candidates[1:ind-1]); remaining_ops[ind:end] .= @view(split_candidates[ind+1:end])
102+
ls_2 = split_loopset(ls, remaining_ops)
103+
order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = choose_order_cost(ls_2)
104+
# U_1 = T_1 = U_2 = T_2 = 2
105+
# return ls_1, ls_2
106+
# @show cost_1 + cost_2 ≤ cost_fused, cost_1, cost_2, cost_fused
107+
if cost_1 + cost_2 ≤ 0.9cost_fused
108+
ls_2_lowered = if length(remaining_ops) > 1
109+
inline = iszero(inline) ? (shouldinline_1 % Int) : inline
110+
lower_and_split_loops(ls_2, inline)
111+
else
112+
doinline = inlinedecision(inline, shouldinline_1 | shouldinline_2)
113+
lower(ls_2, order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, doinline)
114+
end
115+
return Expr(
116+
:block,
117+
ls.preamble,
118+
lower(ls_1, order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, false),
119+
ls_2_lowered,
120+
nothing
121+
)
122122
end
123-
doinline = inlinedecision(inline, shouldinline_fused)
124-
lower(ls, order_fused, unrolled_fused, tiled_fused, vectorized_fused, U_fused, T_fused, doinline)
123+
length(split_candidates) == 2 && break
124+
end
125+
doinline = inlinedecision(inline, shouldinline_fused)
126+
lower(ls, order_fused, unrolled_fused, tiled_fused, vectorized_fused, U_fused, T_fused, doinline)
125127
end
126128

127129

src/condense_loopset.jl

Lines changed: 76 additions & 82 deletions
Original file line numberDiff line numberDiff line change
@@ -75,57 +75,57 @@ It supports array-references with up to 8 indexes, where the data for each conse
7575
of `index_types` (storing the enum `IndexType`), `indices` (the `id` for each index symbol), and `offsets` (currently unused).
7676
"""
7777
struct ArrayRefStruct{array,ptr}
78-
index_types::UInt64
79-
indices::UInt64
80-
offsets::UInt64
81-
strides::UInt64
78+
index_types::UInt128
79+
indices::UInt128
80+
offsets::UInt128
81+
strides::UInt128
8282
end
8383
array_and_ptr(@nospecialize(ar::ArrayRefStruct{a,p})) where {a,p} = (a::Symbol,p::Symbol)
8484
# array(@nospecialize(ar::ArrayRefStruct{a,p})) where {a,p} = a::Symbol
8585
# ptr(@nospecialize(ar::ArrayRefStruct{a,p})) where {a,p} = p::Symbol
8686

8787
function findindoradd!(v::Vector{T}, s::T) where {T}
88-
ind = findfirst(==(s), v)
89-
ind === nothing || return ind
90-
push!(v, s)
91-
length(v)
88+
ind = findfirst(==(s), v)
89+
ind === nothing || return ind
90+
push!(v, s)
91+
length(v)
9292
end
9393
function ArrayRefStruct(ls::LoopSet, mref::ArrayReferenceMeta, arraysymbolinds::Vector{Symbol}, ids::Vector{Int})
94-
index_types = zero(UInt64)
95-
indices = zero(UInt64)
96-
offsets = zero(UInt64)
97-
strides = zero(UInt64)
98-
@unpack loopedindex, ref = mref
99-
indv = ref.indices
100-
offv = ref.offsets
101-
strv = ref.strides
102-
# we can discard that the array was considered discontiguous, as it should be recovered from type information
103-
start = 1 + (first(indv) === DISCONTIGUOUS)
104-
for (n,ind) ∈ enumerate(@view(indv[start:end]))
105-
index_types <<= 8
106-
indices <<= 8
107-
offsets <<= 8
108-
offsets |= (offv[n] % UInt8)
109-
strides <<= 8
110-
strides |= (strv[n] % UInt8)
111-
if loopedindex[n]
112-
index_types |= LoopIndex
113-
if strv[n] ≠ 0
114-
indices |= getloopid(ls, ind)
115-
end
116-
else
117-
parent = get(ls.opdict, ind, nothing)
118-
@assert !(parent === nothing) "Index $ind not found in array."
119-
# if parent === nothing
120-
# index_types |= SymbolicIndex
121-
# indices |= findindoradd!(arraysymbolinds, ind)
122-
# else
123-
index_types |= ComputedIndex
124-
indices |= ids[identifier(parent)]
125-
# end
126-
end
94+
index_types = zero(UInt128)
95+
indices = zero(UInt128)
96+
offsets = zero(UInt128)
97+
strides = zero(UInt128)
98+
@unpack loopedindex, ref = mref
99+
indv = ref.indices
100+
offv = ref.offsets
101+
strv = ref.strides
102+
# we can discard that the array was considered discontiguous, as it should be recovered from type information
103+
start = 1 + (first(indv) === DISCONTIGUOUS)
104+
for (n,ind) ∈ enumerate(@view(indv[start:end]))
105+
index_types <<= 8
106+
indices <<= 8
107+
offsets <<= 8
108+
offsets |= (offv[n] % UInt8)
109+
strides <<= 8
110+
strides |= (strv[n] % UInt8)
111+
if loopedindex[n]
112+
index_types |= LoopIndex
113+
if strv[n] ≠ 0
114+
indices |= getloopid(ls, ind)
115+
end
116+
else
117+
parent = get(ls.opdict, ind, nothing)
118+
@assert !(parent === nothing) "Index $ind not found in array."
119+
# if parent === nothing
120+
# index_types |= SymbolicIndex
121+
# indices |= findindoradd!(arraysymbolinds, ind)
122+
# else
123+
index_types |= ComputedIndex
124+
indices |= ids[identifier(parent)]
125+
# end
127126
end
128-
ArrayRefStruct{mref.ref.array,mref.ptr}( index_types, indices, offsets, strides )
127+
end
128+
ArrayRefStruct{mref.ref.array,mref.ptr}( index_types, indices, offsets, strides )
129129
end
130130

131131
"""
@@ -135,62 +135,56 @@ A condensed representation of an [`Operation`](@ref).
135135
"""
136136
struct OperationStruct <: AbstractLoopOperation
137137
# instruction::Instruction
138-
loopdeps::UInt64
139-
reduceddeps::UInt64
140-
childdeps::UInt64
141-
parents::UInt64
142-
node_type::OperationType
143-
array::UInt8
144-
symid::UInt8
138+
loopdeps::UInt128
139+
reduceddeps::UInt128
140+
childdeps::UInt128
141+
parents::UInt128
142+
node_type::OperationType
143+
array::UInt8
144+
symid::UInt8
145145
end
146146
optype(os) = os.node_type
147147

148148
function findmatchingarray(ls::LoopSet, mref::ArrayReferenceMeta)
149-
id = 0x01
150-
for r ∈ ls.refs_aliasing_syms
151-
r == mref && return id
152-
id += 0x01
153-
end
154-
0x00
149+
id = 0x01
150+
for r ∈ ls.refs_aliasing_syms
151+
r == mref && return id
152+
id += 0x01
153+
end
154+
0x00
155155
end
156-
# filled_4byte_chunks(u::UInt64) = 16 - (leading_zeros(u) >>> 2)
157-
filled_8byte_chunks(u::UInt64) = 8 - (leading_zeros(u) >>> 3)
158-
159-
# num_loop_deps(os::OperationStruct) = filled_4byte_chunks(os.loopdeps)
160-
# num_reduced_deps(os::OperationStruct) = filled_4byte_chunks(os.reduceddeps)
161-
# num_child_deps(os::OperationStruct) = filled_4byte_chunks(os.childdeps)
162-
# num_parents(os::OperationStruct) = filled_4byte_chunks(os.parents)
156+
filled_8byte_chunks(u::T) where {T<:Unsigned} = sizeof(T) - (leading_zeros(u) >>> 3)
163157

164158
function shifted_loopset(ls::LoopSet, loopsyms::Vector{Symbol})
165-
ld = zero(UInt64) # leading_zeros(ld) >> 2 yields the number of loopdeps
166-
for d ∈ loopsyms
167-
ld <<= 4
168-
ld |= getloopid(ls, d)::Int
169-
end
170-
ld
159+
ld = zero(UInt128) # leading_zeros(ld) >> 2 yields the number of loopdeps
160+
for d ∈ loopsyms
161+
ld <<= 4
162+
ld |= getloopid(ls, d)::Int
163+
end
164+
ld
171165
end
172166
loopdeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, loopdependencies(op))
173167
reduceddeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, reduceddependencies(op))
174168
childdeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, reducedchildren(op))
175169
function parents_uint(ls::LoopSet, op::Operation)
176-
p = zero(UInt64)
177-
for parent ∈ parents(op)
178-
p <<= 8
179-
p |= identifier(parent)
180-
end
181-
p
170+
p = zero(UInt128)
171+
for parent ∈ parents(op)
172+
p <<= 8
173+
p |= identifier(parent)
174+
end
175+
p
182176
end
183177
function recursively_set_parents_true!(x::Vector{Bool}, op::Operation)
184-
x[identifier(op)] && return nothing # don't redescend
185-
x[identifier(op)] = true
186-
for opp ∈ parents(op)
187-
recursively_set_parents_true!(x, opp)
188-
end
189-
return nothing
178+
x[identifier(op)] && return nothing # don't redescend
179+
x[identifier(op)] = true
180+
for opp ∈ parents(op)
181+
recursively_set_parents_true!(x, opp)
182+
end
183+
return nothing
190184
end
191185
function getroots(ls::LoopSet)::Vector{Bool}
192-
rooted = Vector{Bool}(undef, length(operations(ls)))
193-
getroots!(rooted, ls)
186+
rooted = Vector{Bool}(undef, length(operations(ls)))
187+
getroots!(rooted, ls)
194188
end
195189
function getroots!(rooted::Vector{Bool}, ls::LoopSet)
196190
fill!(rooted, false)

0 commit comments

Comments
 (0)