Skip to content

Commit 45a370c

Browse files
authored
Merge pull request #195 from chriselrod/preferences
VectorizationBase 0.18 support
2 parents 41f0d77 + a92d448 commit 45a370c

26 files changed

+353
-216
lines changed

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.11.1"
4+
version = "0.11.2"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -24,7 +24,7 @@ Requires = "1"
2424
SLEEFPirates = "0.6.7"
2525
ThreadingUtilities = "0.2.3"
2626
UnPack = "1"
27-
VectorizationBase = "0.17"
27+
VectorizationBase = "0.18.1"
2828
julia = "1.5"
2929

3030
[extras]

benchmark/loadsharedlibs.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,15 @@ end
3636
eigenfile = joinpath(LOOPVECBENCHDIR, "looptestseigen.cpp")
3737
if !isfile(LIBEIGENTEST) || mtime(eigenfile) > mtime(LIBEIGENTEST)
3838
# Clang seems to have trouble finding includes
39-
if LoopVectorization.VectorizationBase.has_feature("x86_64_avx512f")
39+
if Bool(LoopVectorization.VectorizationBase.has_feature(Val(:x86_64_avx512f)))
4040
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
4141
else
4242
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
4343
end
4444
end
4545
if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
4646
# run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/c++/9 -I/usr/include/c++/9/x86_64-generic-linux -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
47-
if LoopVectorization.VectorizationBase.has_feature("x86_64_avx512f")
47+
if Bool(LoopVectorization.VectorizationBase.has_feature(Val(:x86_64_avx512f)))
4848
run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
4949
else
5050
run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)

src/LoopVectorization.jl

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,16 +5,16 @@ module LoopVectorization
55
# end
66

77
using VectorizationBase, SLEEFPirates, UnPack, OffsetArrays
8-
using VectorizationBase: register_size, register_count, dynamic_register_size, dynamic_register_count, data,
9-
mask, pick_vector_width_val, MM,
8+
using VectorizationBase: register_size, register_count, cache_linesize, has_opmask_registers,
9+
mask, pick_vector_width, MM, data,
1010
maybestaticlength, maybestaticsize, staticm1, staticp1, staticmul, vzero,
11-
Zero, maybestaticrange, offsetprecalc, lazymul,
11+
maybestaticrange, offsetprecalc, lazymul,
1212
maybestaticfirst, maybestaticlast, scalar_less, scalar_greaterequal, gep, gesp, pointerforcomparison, NativeTypes,
1313
vfmadd, vfmsub, vfnmadd, vfnmsub, vfmadd_fast, vfmsub_fast, vfnmadd_fast, vfnmsub_fast, vfmadd231, vfmsub231, vfnmadd231, vfnmsub231,
1414
vfma_fast, vmuladd_fast, vdiv_fast, vadd_fast, vsub_fast, vmul_fast,
1515
relu, stridedpointer, StridedPointer, StridedBitPointer, AbstractStridedPointer,
1616
reduced_add, reduced_prod, reduce_to_add, reduce_to_prod, reduced_max, reduced_min, reduce_to_max, reduce_to_min,
17-
vsum, vprod, vmaximum, vminimum, vstorent!
17+
vsum, vprod, vmaximum, vminimum, unwrap
1818

1919
using IfElse: ifelse
2020

@@ -30,7 +30,7 @@ using Base.FastMath: add_fast, sub_fast, mul_fast, div_fast, inv_fast, abs2_fast
3030

3131

3232
using ArrayInterface
33-
using ArrayInterface: OptionallyStaticUnitRange, Zero, One#, static_length
33+
using ArrayInterface: OptionallyStaticUnitRange, Zero, One, StaticBool, True, False#, static_length
3434
const Static = ArrayInterface.StaticInt
3535

3636
using Requires
@@ -41,10 +41,6 @@ export LowDimArray, stridedpointer,
4141
vmap, vmap!, vmapt, vmapt!, vmapnt, vmapnt!, vmapntt, vmapntt!, tanh_fast, sigmoid_fast,
4242
vfilter, vfilter!, vmapreduce, vreduce
4343

44-
@inline unwrap(::Val{N}) where {N} = N
45-
@inline unwrap(::Static{N}) where {N} = N
46-
@inline unwrap(x) = x
47-
4844
const VECTORWIDTHSYMBOL, ELTYPESYMBOL = Symbol("##Wvecwidth##"), Symbol("##Tloopeltype##")
4945

5046
include("vectorizationbase_compat/contract_pass.jl")

src/broadcast.jl

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -355,13 +355,15 @@ end
355355
# size of dest determines loops
356356
# function vmaterialize!(
357357
@generated function vmaterialize!(
358-
dest::AbstractArray{T,N}, bc::BC, ::Val{Mod}
359-
) where {T <: NativeTypes, N, BC <: Union{Broadcasted,Product}, Mod}
358+
dest::AbstractArray{T,N}, bc::BC,
359+
::Val{Mod}, ::StaticInt{RS}, ::StaticInt{RC}, ::StaticInt{CLS}
360+
) where {T <: NativeTypes, N, BC <: Union{Broadcasted,Product}, Mod, RS, RC, CLS}
360361
# 2+1
361362
# we have an N dimensional loop.
362363
# need to construct the LoopSet
363364
# @show typeof(dest)
364365
ls = LoopSet(Mod)
366+
set_hw!(ls, RS, RC, CLS)
365367
loopsyms = [gensym!(ls, "n") for n ∈ 1:N]
366368
ls.isbroadcast[] = true
367369
add_broadcast_loops!(ls, loopsyms, :dest)
@@ -385,11 +387,13 @@ end
385387
# ls
386388
end
387389
@generated function vmaterialize!(
388-
dest′::Union{Adjoint{T,A},Transpose{T,A}}, bc::BC, ::Val{Mod}
389-
) where {T <: NativeTypes, N, A <: AbstractArray{T,N}, BC <: Union{Broadcasted,Product}, Mod}
390+
dest′::Union{Adjoint{T,A},Transpose{T,A}}, bc::BC,
391+
::Val{Mod}, ::StaticInt{RS}, ::StaticInt{RC}, ::StaticInt{CLS}
392+
) where {T <: NativeTypes, N, A <: AbstractArray{T,N}, BC <: Union{Broadcasted,Product}, Mod, RS, RC, CLS}
390393
# we have an N dimensional loop.
391394
# need to construct the LoopSet
392395
ls = LoopSet(Mod)
396+
set_hw!(ls, RS, RC, CLS)
393397
loopsyms = [gensym!(ls, "n") for n ∈ 1:N]
394398
ls.isbroadcast[] = true
395399
pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
@@ -411,7 +415,8 @@ end
411415
end
412416
# these are marked `@inline` so the `@avx` itself can choose whether or not to inline.
413417
@inline function vmaterialize!(
414-
dest::AbstractArray{T,N}, bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0},Nothing,typeof(identity),Tuple{T2}}, ::Val{Mod}
418+
dest::AbstractArray{T,N}, bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0},Nothing,typeof(identity),Tuple{T2}},
419+
::Val{Mod}, RS::Static, RC::Static, CLS::Static
415420
) where {T <: NativeTypes, N, T2 <: Number, Mod}
416421
arg = T(first(bc.args))
417422
@avx for i ∈ eachindex(dest)
@@ -420,7 +425,8 @@ end
420425
dest
421426
end
422427
@inline function vmaterialize!(
423-
dest′::Union{Adjoint{T,A},Transpose{T,A}}, bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0},Nothing,typeof(identity),Tuple{T2}}, ::Val{Mod}
428+
dest′::Union{Adjoint{T,A},Transpose{T,A}}, bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0},Nothing,typeof(identity),Tuple{T2}},
429+
::Val{Mod}, RS::Static, RC::Static, CLS::Static
424430
) where {T <: NativeTypes, N, A <: AbstractArray{T,N}, T2 <: Number, Mod}
425431
arg = T(first(bc.args))
426432
dest = parent(dest′)
@@ -430,10 +436,10 @@ end
430436
dest′
431437
end
432438

433-
@inline function vmaterialize(bc::Broadcasted, ::Val{Mod}) where {Mod}
439+
@inline function vmaterialize(bc::Broadcasted, ::Val{Mod}, RS::Static, RC::Static, CLS::Static) where {Mod}
434440
ElType = Base.Broadcast.combine_eltypes(bc.f, bc.args)
435-
vmaterialize!(similar(bc, ElType), bc, Val{Mod}())
441+
vmaterialize!(similar(bc, ElType), bc, Val{Mod}(), RS, RC, CLS)
436442
end
437443

438-
vmaterialize!(dest, bc, ::Val{mod}) where {mod} = Base.Broadcast.materialize!(dest, bc)
444+
vmaterialize!(dest, bc, ::Val{mod}, ::StaticInt, ::StaticInt, ::StaticInt) where {mod} = Base.Broadcast.materialize!(dest, bc)
439445

src/condense_loopset.jl

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ function check_if_empty(ls::LoopSet, q::Expr)
220220
end
221221

222222
val(x) = Expr(:call, Expr(:curly, :Val, x))
223+
223224
# Try to condense in type stable manner
224225
function generate_call(ls::LoopSet, inline_unroll::NTuple{3,Int8}, debug::Bool = false)
225226
operation_descriptions = Expr(:tuple)
@@ -239,8 +240,15 @@ function generate_call(ls::LoopSet, inline_unroll::NTuple{3,Int8}, debug::Bool =
239240
inline, u₁, u₂ = inline_unroll
240241
func = debug ? lv(:_avx_loopset_debug) : lv(:_avx_!)
241242
lbarg = debug ? Expr(:call, :typeof, loop_bounds) : loop_bounds
243+
unroll_param_tup = Expr(
244+
:tuple, inline, u₁, u₂,
245+
Expr(:call, lv(:unwrap), VECTORWIDTHSYMBOL),
246+
Expr(:call, lv(:unwrap), Expr(:call, lv(:register_size))),
247+
Expr(:call, lv(:unwrap), Expr(:call, lv(:available_registers))),
248+
Expr(:call, lv(:unwrap), Expr(:call, lv(:cache_linesize)))
249+
)
242250
q = Expr(
243-
:call, func, val(Expr(:tuple, inline, u₁, u₂, Expr(:call, lv(:unwrap), VECTORWIDTHSYMBOL))),
251+
:call, func, val(unroll_param_tup),
244252
val(operation_descriptions), val(arrayref_descriptions), val(argmeta), val(loop_syms)
245253
)
246254
# debug && deleteat!(q.args, 2)

src/constructors.jl

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,12 @@ function add_ci_call!(q::Expr, @nospecialize(f), args, syms, i, mod = nothing)
2929
push!(call.args, arg)
3030
end
3131
end
32-
mod === nothing || push!(call.args, Expr(:call, Expr(:curly, :Val, QuoteNode(mod))))
32+
if mod !== nothing # indicates it's `vmaterialize(!)`
33+
reg_size = Expr(:call, lv(:register_size))
34+
reg_count = Expr(:call, lv(:available_registers))
35+
cache_lnsze = Expr(:call, lv(:cache_linesize))
36+
push!(call.args, Expr(:call, Expr(:curly, :Val, QuoteNode(mod))), reg_size, reg_count, cache_lnsze)
37+
end
3338
push!(q.args, Expr(:(=), syms[i], call))
3439
end
3540

@@ -65,6 +70,12 @@ function LoopSet(q::Expr, mod::Symbol = :Main)
6570
end
6671
LoopSet(q::Expr, m::Module) = LoopSet(macroexpand(m, q)::Expr, Symbol(m))
6772

73+
function loopset(q::Expr) # for interactive use only
74+
ls = LoopSet(q)
75+
set_hw!(ls)
76+
ls
77+
end
78+
6879
"""
6980
@avx
7081
@@ -235,13 +246,15 @@ This macro accepts the `inline` and `unroll` keyword arguments like `@avx`, but
235246
macro _avx(q)
236247
q = macroexpand(__module__, q)
237248
ls = LoopSet(q, __module__)
249+
set_hw!(ls)
238250
esc(Expr(:block, ls.prepreamble, lower_and_split_loops(ls, -1)))
239251
end
240252
macro _avx(arg, q)
241253
@assert q.head === :for
242254
q = macroexpand(__module__, q)
243255
inline, check_empty, u₁, u₂ = check_macro_kwarg(arg)
244256
ls = LoopSet(q, __module__)
257+
set_hw!(ls)
245258
esc(Expr(:block, ls.prepreamble, lower(ls, u₁ % Int, u₂ % Int, -1)))
246259
end
247260

0 commit comments

Comments
 (0)