Skip to content

Commit 76a3613

Browse files
committed
Progress in supporting VectorizationBase 0.18
1 parent 41f0d77 commit 76a3613

24 files changed

+314
-203
lines changed

Project.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.11.1"
4+
version = "0.11.2"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -24,8 +24,8 @@ Requires = "1"
2424
SLEEFPirates = "0.6.7"
2525
ThreadingUtilities = "0.2.3"
2626
UnPack = "1"
27-
VectorizationBase = "0.17"
28-
julia = "1.5"
27+
VectorizationBase = "0.18"
28+
julia = "1.6"
2929

3030
[extras]
3131
Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"

src/LoopVectorization.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@ module LoopVectorization
55
# end
66

77
using VectorizationBase, SLEEFPirates, UnPack, OffsetArrays
8-
using VectorizationBase: register_size, register_count, dynamic_register_size, dynamic_register_count, data,
9-
mask, pick_vector_width_val, MM,
8+
using VectorizationBase: register_size, register_count, cache_linesize, has_opmask_registers,
9+
mask, pick_vector_width, MM, data,
1010
maybestaticlength, maybestaticsize, staticm1, staticp1, staticmul, vzero,
11-
Zero, maybestaticrange, offsetprecalc, lazymul,
11+
maybestaticrange, offsetprecalc, lazymul,
1212
maybestaticfirst, maybestaticlast, scalar_less, scalar_greaterequal, gep, gesp, pointerforcomparison, NativeTypes,
1313
vfmadd, vfmsub, vfnmadd, vfnmsub, vfmadd_fast, vfmsub_fast, vfnmadd_fast, vfnmsub_fast, vfmadd231, vfmsub231, vfnmadd231, vfnmsub231,
1414
vfma_fast, vmuladd_fast, vdiv_fast, vadd_fast, vsub_fast, vmul_fast,
@@ -30,7 +30,7 @@ using Base.FastMath: add_fast, sub_fast, mul_fast, div_fast, inv_fast, abs2_fast
3030

3131

3232
using ArrayInterface
33-
using ArrayInterface: OptionallyStaticUnitRange, Zero, One#, static_length
33+
using ArrayInterface: OptionallyStaticUnitRange, Zero, One, StaticBool, True, False#, static_length
3434
const Static = ArrayInterface.StaticInt
3535

3636
using Requires

src/broadcast.jl

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -355,13 +355,15 @@ end
355355
# size of dest determines loops
356356
# function vmaterialize!(
357357
@generated function vmaterialize!(
358-
dest::AbstractArray{T,N}, bc::BC, ::Val{Mod}
359-
) where {T <: NativeTypes, N, BC <: Union{Broadcasted,Product}, Mod}
358+
dest::AbstractArray{T,N}, bc::BC,
359+
::Val{Mod}, ::StaticInt{RS}, ::StaticInt{RC}, ::StaticInt{CLS}
360+
) where {T <: NativeTypes, N, BC <: Union{Broadcasted,Product}, Mod, RS, RC, CLS}
360361
# 2+1
361362
# we have an N dimensional loop.
362363
# need to construct the LoopSet
363364
# @show typeof(dest)
364365
ls = LoopSet(Mod)
366+
set_hw!(ls, RS, RC, CLS)
365367
loopsyms = [gensym!(ls, "n") for n ∈ 1:N]
366368
ls.isbroadcast[] = true
367369
add_broadcast_loops!(ls, loopsyms, :dest)
@@ -385,11 +387,13 @@ end
385387
# ls
386388
end
387389
@generated function vmaterialize!(
388-
dest′::Union{Adjoint{T,A},Transpose{T,A}}, bc::BC, ::Val{Mod}
389-
) where {T <: NativeTypes, N, A <: AbstractArray{T,N}, BC <: Union{Broadcasted,Product}, Mod}
390+
dest′::Union{Adjoint{T,A},Transpose{T,A}}, bc::BC,
391+
::Val{Mod}, ::StaticInt{RS}, ::StaticInt{RC}, ::StaticInt{CLS}
392+
) where {T <: NativeTypes, N, A <: AbstractArray{T,N}, BC <: Union{Broadcasted,Product}, Mod, RS, RC, CLS}
390393
# we have an N dimensional loop.
391394
# need to construct the LoopSet
392395
ls = LoopSet(Mod)
396+
set_hw!(ls, RS, RC, CLS)
393397
loopsyms = [gensym!(ls, "n") for n ∈ 1:N]
394398
ls.isbroadcast[] = true
395399
pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
@@ -411,7 +415,8 @@ end
411415
end
412416
# these are marked `@inline` so the `@avx` itself can choose whether or not to inline.
413417
@inline function vmaterialize!(
414-
dest::AbstractArray{T,N}, bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0},Nothing,typeof(identity),Tuple{T2}}, ::Val{Mod}
418+
dest::AbstractArray{T,N}, bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0},Nothing,typeof(identity),Tuple{T2}},
419+
::Val{Mod}, RS::Static, RC::Static, CLS::Static
415420
) where {T <: NativeTypes, N, T2 <: Number, Mod}
416421
arg = T(first(bc.args))
417422
@avx for i ∈ eachindex(dest)
@@ -420,7 +425,8 @@ end
420425
dest
421426
end
422427
@inline function vmaterialize!(
423-
dest′::Union{Adjoint{T,A},Transpose{T,A}}, bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0},Nothing,typeof(identity),Tuple{T2}}, ::Val{Mod}
428+
dest′::Union{Adjoint{T,A},Transpose{T,A}}, bc::Broadcasted{Base.Broadcast.DefaultArrayStyle{0},Nothing,typeof(identity),Tuple{T2}},
429+
::Val{Mod}, RS::Static, RC::Static, CLS::Static
424430
) where {T <: NativeTypes, N, A <: AbstractArray{T,N}, T2 <: Number, Mod}
425431
arg = T(first(bc.args))
426432
dest = parent(dest′)
@@ -430,10 +436,10 @@ end
430436
dest′
431437
end
432438

433-
@inline function vmaterialize(bc::Broadcasted, ::Val{Mod}) where {Mod}
439+
@inline function vmaterialize(bc::Broadcasted, ::Val{Mod}, RS::Static, RC::Static, CLS::Static) where {Mod}
434440
ElType = Base.Broadcast.combine_eltypes(bc.f, bc.args)
435-
vmaterialize!(similar(bc, ElType), bc, Val{Mod}())
441+
vmaterialize!(similar(bc, ElType), bc, Val{Mod}(), RS, RC, CLS)
436442
end
437443

438-
vmaterialize!(dest, bc, ::Val{mod}) where {mod} = Base.Broadcast.materialize!(dest, bc)
444+
vmaterialize!(dest, bc, ::Val{mod}, ::StaticInt, ::StaticInt, ::StaticInt) where {mod} = Base.Broadcast.materialize!(dest, bc)
439445

src/condense_loopset.jl

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -239,8 +239,21 @@ function generate_call(ls::LoopSet, inline_unroll::NTuple{3,Int8}, debug::Bool =
239239
inline, u₁, u₂ = inline_unroll
240240
func = debug ? lv(:_avx_loopset_debug) : lv(:_avx_!)
241241
lbarg = debug ? Expr(:call, :typeof, loop_bounds) : loop_bounds
242+
unroll_param_tup = Expr(
243+
:tuple, inline, u₁, u₂,
244+
Expr(:call, lv(:unwrap), VECTORWIDTHSYMBOL),
245+
Expr(:call, lv(:unwrap), Expr(:call, lv(:register_size))),
246+
Expr(:call, lv(:unwrap),
247+
Expr(:call, lv(:ifelse),
248+
Expr(:call, lv(:unwrap), Expr(:call, lv(:has_opmask_registers))),
249+
Expr(:call, lv(:unwrap), Expr(:call, lv(:register_count))),
250+
Expr(:call, lv(:unwrap), Expr(:call, :(-), Expr(:call, lv(:register_count)), Expr(:call, lv(:One))))
251+
)
252+
),
253+
Expr(:call, lv(:unwrap), Expr(:call, lv(:cache_linesize)))
254+
)
242255
q = Expr(
243-
:call, func, val(Expr(:tuple, inline, u₁, u₂, Expr(:call, lv(:unwrap), VECTORWIDTHSYMBOL))),
256+
:call, func, val(unroll_param_tup),
244257
val(operation_descriptions), val(arrayref_descriptions), val(argmeta), val(loop_syms)
245258
)
246259
# debug && deleteat!(q.args, 2)

src/constructors.jl

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,12 @@ function add_ci_call!(q::Expr, @nospecialize(f), args, syms, i, mod = nothing)
2929
push!(call.args, arg)
3030
end
3131
end
32-
mod === nothing || push!(call.args, Expr(:call, Expr(:curly, :Val, QuoteNode(mod))))
32+
if mod !== nothing # indicates it's `vmaterialize(!)`
33+
reg_size = Expr(:call, lv(:register_size))
34+
reg_count = Expr(:call, lv(:register_count)) # number of registers (RC), not their size (RS)
35+
cache_lnsze = Expr(:call, lv(:cache_linesize))
36+
push!(call.args, Expr(:call, Expr(:curly, :Val, QuoteNode(mod))), reg_size, reg_count, cache_lnsze)
37+
end
3338
push!(q.args, Expr(:(=), syms[i], call))
3439
end
3540

@@ -65,6 +70,12 @@ function LoopSet(q::Expr, mod::Symbol = :Main)
6570
end
6671
LoopSet(q::Expr, m::Module) = LoopSet(macroexpand(m, q)::Expr, Symbol(m))
6772

73+
function loopset(q::Expr) # for interactive use only
74+
ls = LoopSet(q)
75+
set_hw!(ls)
76+
ls
77+
end
78+
6879
"""
6980
@avx
7081
@@ -235,13 +246,15 @@ This macro accepts the `inline` and `unroll` keyword arguments like `@avx`, but
235246
macro _avx(q)
236247
q = macroexpand(__module__, q)
237248
ls = LoopSet(q, __module__)
249+
set_hw!(ls)
238250
esc(Expr(:block, ls.prepreamble, lower_and_split_loops(ls, -1)))
239251
end
240252
macro _avx(arg, q)
241253
@assert q.head === :for
242254
q = macroexpand(__module__, q)
243255
inline, check_empty, u₁, u₂ = check_macro_kwarg(arg)
244256
ls = LoopSet(q, __module__)
257+
set_hw!(ls)
245258
esc(Expr(:block, ls.prepreamble, lower(ls, u₁ % Int, u₂ % Int, -1)))
246259
end
247260

0 commit comments

Comments
 (0)