Skip to content

Commit 2465016

Browse files
authored
Allow more than 256 operations in a loopset, add option to choose which loop to vectorize (#308)
1 parent 3b7344c commit 2465016

File tree

10 files changed

+193
-139
lines changed

10 files changed

+193
-139
lines changed

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.54"
4+
version = "0.12.55"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -25,7 +25,7 @@ IfElse = "0.1"
2525
OffsetArrays = "1.4.1"
2626
Polyester = "0.3"
2727
Requires = "1"
28-
SLEEFPirates = "0.6.18"
28+
SLEEFPirates = "0.6.23"
2929
Static = "0.2, 0.3"
3030
StrideArraysCore = "0.1.12"
3131
ThreadingUtilities = "0.4.5"

src/broadcast.jl

Lines changed: 32 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -393,44 +393,44 @@ end
393393
@generated function vmaterialize!(
394394
dest::AbstractArray{T,N}, bc::BC, ::Val{Mod}, ::Val{UNROLL}
395395
) where {T <: NativeTypes, N, BC <: Union{Broadcasted,Product}, Mod, UNROLL}
396-
# 2+1
397-
# we have an N dimensional loop.
398-
# need to construct the LoopSet
399-
# @show typeof(dest)
400-
ls = LoopSet(Mod)
401-
inline, u₁, u₂, isbroadcast, W, rs, rc, cls, l1, l2, l3, threads, warncheckarg = UNROLL
402-
set_hw!(ls, rs, rc, cls, l1, l2, l3)
403-
ls.isbroadcast = isbroadcast # maybe set `false` in a DiffEq-like `@..` macro
404-
loopsyms = [gensym!(ls, "n") for n ∈ 1:N]
405-
add_broadcast_loops!(ls, loopsyms, :dest)
406-
elementbytes = sizeof(T)
407-
add_broadcast!(ls, :dest, :bc, loopsyms, BC, elementbytes)
408-
storeop = add_simple_store!(ls, :dest, ArrayReference(:dest, loopsyms), elementbytes)
409-
doaddref!(ls, storeop)
410-
resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product
396+
# 2+1
397+
# we have an N dimensional loop.
398+
# need to construct the LoopSet
399+
# @show typeof(dest)
400+
ls = LoopSet(Mod)
401+
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, threads, warncheckarg = UNROLL
402+
set_hw!(ls, rs, rc, cls, l1, l2, l3)
403+
ls.isbroadcast = isbroadcast # maybe set `false` in a DiffEq-like `@..` macro
404+
loopsyms = [gensym!(ls, "n") for n ∈ 1:N]
405+
add_broadcast_loops!(ls, loopsyms, :dest)
406+
elementbytes = sizeof(T)
407+
add_broadcast!(ls, :dest, :bc, loopsyms, BC, elementbytes)
408+
storeop = add_simple_store!(ls, :dest, ArrayReference(:dest, loopsyms), elementbytes)
409+
doaddref!(ls, storeop)
410+
resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product
411411
# return ls
412-
sc = setup_call(ls, :(Base.Broadcast.materialize!(dest, bc)), LineNumberNode(0), inline, false, u₁, u₂, threads%Int, warncheckarg)
412+
sc = setup_call(ls, :(Base.Broadcast.materialize!(dest, bc)), LineNumberNode(0), inline, false, u₁, u₂, v, threads%Int, warncheckarg)
413413
# return sc
414-
Expr(:block, Expr(:meta,:inline), sc, :dest)
414+
Expr(:block, Expr(:meta,:inline), sc, :dest)
415415
end
416416
@generated function vmaterialize!(
417417
dest′::Union{Adjoint{T,A},Transpose{T,A}}, bc::BC, ::Val{Mod}, ::Val{UNROLL}
418418
) where {T <: NativeTypes, N, A <: AbstractArray{T,N}, BC <: Union{Broadcasted,Product}, Mod, UNROLL}
419-
# we have an N dimensional loop.
420-
# need to construct the LoopSet
421-
ls = LoopSet(Mod)
422-
inline, u₁, u₂, isbroadcast, W, rs, rc, cls, l1, l2, l3, threads, warncheckarg = UNROLL
423-
set_hw!(ls, rs, rc, cls, l1, l2, l3)
424-
ls.isbroadcast = isbroadcast # maybe set `false` in a DiffEq-like `@..` macro
425-
loopsyms = [gensym!(ls, "n") for n ∈ 1:N]
426-
pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
427-
add_broadcast_loops!(ls, loopsyms, :dest′)
428-
elementbytes = sizeof(T)
429-
add_broadcast!(ls, :dest, :bc, loopsyms, BC, elementbytes)
430-
storeop = add_simple_store!(ls, :dest, ArrayReference(:dest, reverse(loopsyms)), elementbytes)
431-
doaddref!(ls, storeop)
432-
resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product
433-
Expr(:block, Expr(:meta,:inline), setup_call(ls, :(Base.Broadcast.materialize!(dest′, bc)), LineNumberNode(0), inline, false, u₁, u₂, threads%Int, warncheckarg), :dest′)
419+
# we have an N dimensional loop.
420+
# need to construct the LoopSet
421+
ls = LoopSet(Mod)
422+
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, l1, l2, l3, threads, warncheckarg = UNROLL
423+
set_hw!(ls, rs, rc, cls, l1, l2, l3)
424+
ls.isbroadcast = isbroadcast # maybe set `false` in a DiffEq-like `@..` macro
425+
loopsyms = [gensym!(ls, "n") for n ∈ 1:N]
426+
pushprepreamble!(ls, Expr(:(=), :dest, Expr(:call, :parent, :dest′)))
427+
add_broadcast_loops!(ls, loopsyms, :dest′)
428+
elementbytes = sizeof(T)
429+
add_broadcast!(ls, :dest, :bc, loopsyms, BC, elementbytes)
430+
storeop = add_simple_store!(ls, :dest, ArrayReference(:dest, reverse(loopsyms)), elementbytes)
431+
doaddref!(ls, storeop)
432+
resize!(ls.loop_order, num_loops(ls)) # num_loops may be greater than N, eg Product
433+
Expr(:block, Expr(:meta,:inline), setup_call(ls, :(Base.Broadcast.materialize!(dest′, bc)), LineNumberNode(0), inline, false, u₁, u₂, v, threads%Int, warncheckarg), :dest′)
434434
end
435435
# these are marked `@inline` so the `@turbo` itself can choose whether or not to inline.
436436
@generated function vmaterialize!(

src/codegen/lower_threads.jl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -337,9 +337,9 @@ function scale_cost(c, looplen)
337337
c
338338
end
339339
function thread_one_loops_expr(
340-
ls::LoopSet, ua::UnrollArgs, valid_thread_loop::Vector{Bool}, ntmax::UInt, c::Float64,
341-
UNROLL::Tuple{Bool,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt}, OPS::Expr, ARF::Expr, AM::Expr, LPSYM::Expr
342-
)
340+
ls::LoopSet, ua::UnrollArgs, valid_thread_loop::Vector{Bool}, ntmax::UInt, c::Float64,
341+
UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt}, OPS::Expr, ARF::Expr, AM::Expr, LPSYM::Expr
342+
)
343343
looplen = looplengthprod(ls)
344344
c = scale_cost(c, looplen)
345345
if all(isstaticloop, ls.loops)
@@ -473,9 +473,9 @@ function define_thread_blocks(threadedloop1, threadedloop2, vloop, u₁loop, u
473473
end
474474
end
475475
function thread_two_loops_expr(
476-
ls::LoopSet, ua::UnrollArgs, valid_thread_loop::Vector{Bool}, ntmax::UInt, c::Float64,
477-
UNROLL::Tuple{Bool,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt}, OPS::Expr, ARF::Expr, AM::Expr, LPSYM::Expr
478-
)
476+
ls::LoopSet, ua::UnrollArgs, valid_thread_loop::Vector{Bool}, ntmax::UInt, c::Float64,
477+
UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt}, OPS::Expr, ARF::Expr, AM::Expr, LPSYM::Expr
478+
)
479479
looplen = looplengthprod(ls)
480480
# c = 0.0225 * c / looplen
481481
c = scale_cost(c, looplen)
@@ -677,7 +677,7 @@ function valid_thread_loops(ls::LoopSet)
677677
valid_thread_loop, ua, c
678678
end
679679
function avx_threads_expr(
680-
ls::LoopSet, UNROLL::Tuple{Bool,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt},
680+
ls::LoopSet, UNROLL::Tuple{Bool,Int8,Int8,Int8,Bool,Int,Int,Int,Int,Int,Int,Int,UInt},
681681
nt::UInt, OPS::Expr, ARF::Expr, AM::Expr, LPSYM::Expr
682682
)
683683
valid_thread_loop, ua, c = valid_thread_loops(ls)

src/codegen/lowering.jl

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -661,7 +661,7 @@ function push_outer_reduct_types!(pt::Expr, ls::LoopSet, ortypdefined::Bool)
661661
for j ∈ ls.outer_reductions
662662
oreducop = ls.operations[j]
663663
if ortypdefined
664-
push!(pt.args, typeof_expr(oreducop))
664+
push!(pt.args, eltype_expr(oreducop))
665665
else
666666
push!(pt.args, outer_reduct_init_typename(oreducop))
667667
end
@@ -679,7 +679,7 @@ function determine_eltype(ls::LoopSet, ortypdefined::Bool)::Union{Symbol,Expr}
679679
else
680680
oreducop = ls.operations[ls.outer_reductions[1]]
681681
if ortypdefined
682-
return typeof_expr(oreducop)
682+
return eltype_expr(oreducop)
683683
else
684684
return outer_reduct_init_typename(oreducop)
685685
end
@@ -886,18 +886,20 @@ function lower(ls::LoopSet, inline::Int = -1)
886886
order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = choose_order_cost(ls)
887887
lower(ls, order, u₁loop, u₂loop, vectorized, u₁, u₂, inlinedecision(inline, shouldinline))
888888
end
889-
function lower(ls::LoopSet, u₁::Int, u₂::Int, inline::Int)
889+
function lower(ls::LoopSet, u₁::Int, u₂::Int, v::Int, inline::Int)
890890
fill_offset_memop_collection!(ls)
891891
fill_children!(ls)
892892
if u₂ > 1
893893
@assert num_loops(ls) > 1 "There is only $(num_loops(ls)) loop, but specified blocking parameter u₂ is $u₂."
894-
order, u₁loop, u₂loop, vectorized, _u₁, _u₂, c, shouldinline = choose_tile(ls)
894+
order, u₁loop, u₂loop, vectorized, _u₁, _u₂, c, shouldinline = choose_tile(ls, store_load_deps(operations(ls)), v)
895895
copyto!(ls.loop_order.bestorder, order)
896-
else
896+
elseif u₁ > 0
897897
u₂ = -1
898-
order, vectorized, c = choose_unroll_order(ls, Inf)
898+
order, vectorized, c = choose_unroll_order(ls, Inf, store_load_deps(operations(ls)), v)
899899
u₁loop = first(order); u₂loop = Symbol("##undefined##"); shouldinline = true
900900
copyto!(ls.loop_order.bestorder, order)
901+
else
902+
order, u₁loop, u₂loop, vectorized, u₁, u₂, c, shouldinline = choose_order_cost(ls, v)
901903
end
902904
doinline = inlinedecision(inline, shouldinline)
903905
lower(ls, order, u₁loop, u₂loop, vectorized, u₁, u₂, doinline)

src/condense_loopset.jl

Lines changed: 41 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -138,10 +138,11 @@ struct OperationStruct <: AbstractLoopOperation
138138
loopdeps::UInt128
139139
reduceddeps::UInt128
140140
childdeps::UInt128
141-
parents::UInt128
141+
parents₀::UInt128
142+
parents₁::UInt128
142143
node_type::OperationType
144+
symid::UInt16
143145
array::UInt8
144-
symid::UInt8
145146
end
146147
optype(os) = os.node_type
147148

@@ -166,14 +167,22 @@ end
166167
loopdeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, loopdependencies(op))
167168
reduceddeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, reduceddependencies(op))
168169
childdeps_uint(ls::LoopSet, op::Operation) = shifted_loopset(ls, reducedchildren(op))
169-
function parents_uint(ls::LoopSet, op::Operation)
170+
function parents_uint(oppv::AbstractVector{Operation})
170171
p = zero(UInt128)
171-
for parent ∈ parents(op)
172-
p <<= 8
172+
for parent ∈ oppv
173+
p <<= 16
173174
p |= identifier(parent)
174175
end
175176
p
176177
end
178+
function parents_uint(op::Operation)
179+
opv = parents(op)
180+
N = length(opv)
181+
@assert N ≤ 16
182+
p0 = parents_uint(view(opv, 1:min(8,N)))
183+
p1 = N > 8 ? parents_uint(view(opv, 9:N)) : zero(p0)
184+
p0, p1
185+
end
177186
function recursively_set_parents_true!(x::Vector{Bool}, op::Operation)
178187
x[identifier(op)] && return nothing # don't redescend
179188
x[identifier(op)] = true
@@ -199,16 +208,16 @@ function getroots!(rooted::Vector{Bool}, ls::LoopSet)
199208
return rooted
200209
end
201210
function OperationStruct!(varnames::Vector{Symbol}, ids::Vector{Int}, ls::LoopSet, op::Operation)
202-
instr = instruction(op)
203-
ld = loopdeps_uint(ls, op)
204-
rd = reduceddeps_uint(ls, op)
205-
cd = childdeps_uint(ls, op)
206-
p = parents_uint(ls, op)
207-
array = accesses_memory(op) ? findmatchingarray(ls, op.ref) : 0x00
208-
ids[identifier(op)] = id = findindoradd!(varnames, name(op))
209-
OperationStruct(
210-
ld, rd, cd, p, op.node_type, array, id
211-
)
211+
instr = instruction(op)
212+
ld = loopdeps_uint(ls, op)
213+
rd = reduceddeps_uint(ls, op)
214+
cd = childdeps_uint(ls, op)
215+
p0, p1 = parents_uint(op)
216+
array = accesses_memory(op) ? findmatchingarray(ls, op.ref) : 0x00
217+
ids[identifier(op)] = id = findindoradd!(varnames, name(op))
218+
OperationStruct(
219+
ld, rd, cd, p0, p1, op.node_type, id, array
220+
)
212221
end
213222
## turn a LoopSet into a type object which can be used to reconstruct the LoopSet.
214223

@@ -527,10 +536,10 @@ end
527536
::Val{CNFARG}, ::StaticInt{W}, ::StaticInt{RS}, ::StaticInt{AR}, ::StaticInt{NT},
528537
::StaticInt{CLS}, ::StaticInt{L1}, ::StaticInt{L2}, ::StaticInt{L3}
529538
) where {CNFARG,W,RS,AR,CLS,L1,L2,L3,NT}
530-
inline,u₁,u₂,BROADCAST,thread = CNFARG
539+
inline,u₁,u₂,v,BROADCAST,thread = CNFARG
531540
nt = min(thread % UInt, NT % UInt)
532-
t = Expr(:tuple, inline, u₁, u₂, BROADCAST, W, RS, AR, CLS, L1,L2,L3, nt)
533-
length(CNFARG) == 6 && push!(t.args, last(CNFARG))
541+
t = Expr(:tuple, inline, u₁, u₂, v, BROADCAST, W, RS, AR, CLS, L1, L2, L3, nt)
542+
length(CNFARG) == 7 && push!(t.args, CNFARG[7])
534543
Expr(:call, Expr(:curly, :Val, t))
535544
end
536545
@inline function avx_config_val(
@@ -563,7 +572,8 @@ end
563572

564573

565574
function split_ifelse!(
566-
ls::LoopSet, preserve::Vector{Symbol}, shouldindbyind::Vector{Bool}, roots::Vector{Bool}, extra_args::Expr, k::Int, inlineu₁u₂::Tuple{Bool,Int8,Int8}, thread::UInt, debug::Bool
575+
ls::LoopSet, preserve::Vector{Symbol}, shouldindbyind::Vector{Bool}, roots::Vector{Bool}, extra_args::Expr, k::Int,
576+
inlineu₁u₂::Tuple{Bool,Int8,Int8,Int8}, thread::UInt, debug::Bool
567577
)
568578
roots[k] = false
569579
op = operations(ls)[k]
@@ -617,13 +627,14 @@ function split_ifelse!(
617627
prepre
618628
end
619629

620-
function generate_call(ls::LoopSet, inlineu₁u₂::Tuple{Bool,Int8,Int8}, thread::UInt, debug::Bool)
630+
function generate_call(ls::LoopSet, inlineu₁u₂::Tuple{Bool,Int8,Int8,Int8}, thread::UInt, debug::Bool)
621631
extra_args = Expr(:tuple)
622632
preserve, shouldindbyind, roots = add_grouped_strided_pointer!(extra_args, ls)
623633
generate_call_split(ls, preserve, shouldindbyind, roots, extra_args, inlineu₁u₂, thread, debug)
624634
end
625635
function generate_call_split(
626-
ls::LoopSet, preserve::Vector{Symbol}, shouldindbyind::Vector{Bool}, roots::Vector{Bool}, extra_args::Expr, inlineu₁u₂::Tuple{Bool,Int8,Int8}, thread::UInt, debug::Bool
636+
ls::LoopSet, preserve::Vector{Symbol}, shouldindbyind::Vector{Bool}, roots::Vector{Bool}, extra_args::Expr,
637+
inlineu₁u₂::Tuple{Bool,Int8,Int8,Int8}, thread::UInt, debug::Bool
627638
)
628639
for (k,op) ∈ enumerate(operations(ls))
629640
parents_op = parents(op)
@@ -636,7 +647,8 @@ end
636647

637648
# Try to condense in type stable manner
638649
function generate_call_types(
639-
ls::LoopSet, preserve::Vector{Symbol}, shouldindbyind::Vector{Bool}, roots::Vector{Bool}, extra_args::Expr, (inline,u₁,u₂)::Tuple{Bool,Int8,Int8}, thread::UInt, debug::Bool
650+
ls::LoopSet, preserve::Vector{Symbol}, shouldindbyind::Vector{Bool}, roots::Vector{Bool}, extra_args::Expr,
651+
(inline,u₁,u₂,v)::Tuple{Bool,Int8,Int8,Int8}, thread::UInt, debug::Bool
640652
)
641653
# good place to check for split
642654
operation_descriptions = Expr(:tuple)
@@ -665,7 +677,7 @@ function generate_call_types(
665677
loop_syms = tuple_expr(QuoteNode, ls.loopsymbols)
666678
func = debug ? lv(:_turbo_loopset_debug) : lv(:_turbo_!)
667679
lbarg = debug ? Expr(:call, :typeof, loop_bounds) : loop_bounds
668-
configarg = (inline,u₁,u₂,ls.isbroadcast,thread)
680+
configarg = (inline,u₁,u₂,v,ls.isbroadcast,thread)
669681
unroll_param_tup = Expr(:call, lv(:avx_config_val), :(Val{$configarg}()), VECTORWIDTHSYMBOL)
670682
q = Expr(:call, func, unroll_param_tup, val(operation_descriptions), val(arrayref_descriptions), val(argmeta), val(loop_syms))
671683

@@ -697,9 +709,10 @@ function generate_call_types(
697709
end
698710
# @inline reductinittype(::T) where {T} = StaticType{T}()
699711
typeof_expr(op::Operation) = Expr(:call, GlobalRef(Base,:typeof), name(op))
712+
eltype_expr(op::Operation) = Expr(:call, GlobalRef(Base,:eltype), name(op))
700713
function add_outerreduct_types!(extra_args::Expr, ls::LoopSet) # extract_outerreduct_types!
701714
for or ∈ ls.outer_reductions
702-
push!(extra_args.args, typeof_expr(operations(ls)[or]))
715+
push!(extra_args.args, eltype_expr(operations(ls)[or]))
703716
end
704717
end
705718
"""
@@ -735,6 +748,7 @@ Returns true if the element type is supported.
735748
"""
736749
@inline check_type(::Type{T}) where {T <: NativeTypes} = true
737750
@inline check_type(::Type{T}) where {T} = false
751+
@inline check_type(::Type{T}) where {T <: AbstractSIMD} = true
738752
@inline check_device(::ArrayInterface.CPUPointer) = true
739753
@inline check_device(::ArrayInterface.CPUTuple) = true
740754
@inline check_device(x) = false
@@ -787,10 +801,10 @@ function setup_call_final(ls::LoopSet, q::Expr)
787801
return ls.preamble
788802
end
789803
function setup_call_debug(ls::LoopSet)
790-
generate_call(ls, (false,zero(Int8),zero(Int8)), zero(UInt), true)
804+
generate_call(ls, (false,zero(Int8),zero(Int8),zero(Int8)), zero(UInt), true)
791805
end
792806
function setup_call(
793-
ls::LoopSet, q::Expr, source::LineNumberNode, inline::Bool, check_empty::Bool, u₁::Int8, u₂::Int8, thread::Int, warncheckarg::Int
807+
ls::LoopSet, q::Expr, source::LineNumberNode, inline::Bool, check_empty::Bool, u₁::Int8, u₂::Int8, v::Int8, thread::Int, warncheckarg::Int
794808
)
795809
# We outline/inline at the macro level by creating/not creating an anonymous function.
796810
# The old API instead was based on inlining or not inline the generated function, but
@@ -799,7 +813,7 @@ function setup_call(
799813
# inlining the generated function into the loop preamble.
800814
lnns = extract_all_lnns(q)
801815
pushfirst!(lnns, source)
802-
call = generate_call(ls, (inline, u₁, u₂), thread%UInt, false)
816+
call = generate_call(ls, (inline, u₁, u₂, v), thread%UInt, false)
803817
call = check_empty ? check_if_empty(ls, call) : call
804818
argfailure = make_crashy(make_fast(q))
805819
if warncheckarg ≠ 0

0 commit comments

Comments
 (0)