Skip to content

Commit e5189a8

Browse files
committed
handle many args better
1 parent 0841acb commit e5189a8

File tree

4 files changed

+109
-45
lines changed

4 files changed

+109
-45
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.140"
4+
version = "0.12.141"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

src/broadcast.jl

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,9 @@ end
4444
@inline ArrayInterface.device(::LowDimArray) = ArrayInterface.CPUPointer()
4545
@generated function ArrayInterface.size(A::LowDimArray{D,T,N}) where {D,T,N}
4646
t = Expr(:tuple)
47-
gf = GlobalRef(Core, :getfield)
4847
for n ∈ 1:N
4948
if n > length(D) || D[n]
50-
push!(t.args, Expr(:call, gf, :s, n, false))
49+
push!(t.args, Expr(:call, getfield, :s, n))
5150
else
5251
push!(t.args, Expr(:call, Expr(:curly, lv(:StaticInt), 1)))
5352
end
@@ -64,10 +63,9 @@ ArrayInterface.offsets(A::LowDimArray) = ArrayInterface.offsets(parent(A))
6463

6564
@generated function _lowdimfilter(::Val{D}, tup::Tuple{Vararg{Any,N}}) where {D,N}
6665
t = Expr(:tuple)
67-
gf = GlobalRef(Core, :getfield)
6866
for n ∈ 1:N
6967
if n > length(D) || D[n]
70-
push!(t.args, Expr(:call, gf, :tup, n, false))
68+
push!(t.args, Expr(:call, getfield, :tup, n))
7169
end
7270
end
7371
Expr(:block, Expr(:meta, :inline), t)
@@ -178,7 +176,6 @@ function _strides_expr(@nospecialize(s), @nospecialize(x), R::Vector{Int}, D::Ve
178176
N = length(R)
179177
q = Expr(:block, Expr(:meta, :inline))
180178
strd_tup = Expr(:tuple)
181-
gf = GlobalRef(Core, :getfield)
182179
ifel = GlobalRef(Core, :ifelse)
183180
Nrange = 1:1:N # type stability w/ respect to reverse
184181
use_stride_acc = true
@@ -207,7 +204,7 @@ function _strides_expr(@nospecialize(s), @nospecialize(x), R::Vector{Int}, D::Ve
207204
elseif stride_acc ≠ 0
208205
push!(strd_tup.args, staticexpr(stride_acc))
209206
else
210-
push!(strd_tup.args, :($gf(x, $n, false)))
207+
push!(strd_tup.args, :($getfield(x, $n)))
211208
end
212209
else
213210
if xₙ_static
@@ -217,7 +214,7 @@ function _strides_expr(@nospecialize(s), @nospecialize(x), R::Vector{Int}, D::Ve
217214
else
218215
push!(
219216
strd_tup.args,
220-
:($ifel(isone($gf(s, $n, false)), zero($xₙ_type), $gf(x, $n, false))),
217+
:($ifel(isone($getfield(s, $n)), zero($xₙ_type), $getfield(x, $n))),
221218
)
222219
end
223220
end
@@ -326,10 +323,9 @@ function add_broadcast!(
326323
Klen = gensym!(ls, "K")
327324
mA = gensym!(ls, "Aₘₖ")
328325
mB = gensym!(ls, "Bₖₙ")
329-
gf = GlobalRef(Core, :getfield)
330326
pushprepreamble!(ls, Expr(:(=), mA, Expr(:(.), bcname, QuoteNode(:a))))
331327
pushprepreamble!(ls, Expr(:(=), mB, Expr(:(.), bcname, QuoteNode(:b))))
332-
pushprepreamble!(ls, Expr(:(=), Klen, Expr(:call, gf, Expr(:call, :size, mB), 1, false)))
328+
pushprepreamble!(ls, Expr(:(=), Klen, Expr(:call, getfield, Expr(:call, :size, mB), 1)))
333329
pushpreamble!(ls, Expr(:(=), Krange, Expr(:call, :(:), staticexpr(1), Klen)))
334330
k = gensym!(ls, "k")
335331
add_loop!(ls, Loop(k, 1, Klen, 1, Krange, Klen), k)
@@ -481,10 +477,9 @@ function add_broadcast!(
481477
parents = Operation[]
482478
deps = Symbol[]
483479
# reduceddeps = Symbol[]
484-
gf = GlobalRef(Core, :getfield)
485480
for (i, arg) ∈ enumerate(args)
486481
argname = gensym!(ls, "arg")
487-
pushprepreamble!(ls, Expr(:(=), argname, Expr(:call, gf, bcargs, i, false)))
482+
pushprepreamble!(ls, Expr(:(=), argname, Expr(:call, getfield, bcargs, i)))
488483
# dynamic dispatch
489484
parent = add_broadcast!(
490485
ls,
@@ -539,7 +534,7 @@ end
539534
::Val{UNROLL},
540535
::Val{dontbc},
541536
) where {T<:NativeTypes,N,BC<:Union{Broadcasted,Product},Mod,UNROLL,dontbc}
542-
# 2 + 1
537+
2 + 1
543538
# we have an N dimensional loop.
544539
# need to construct the LoopSet
545540
ls = LoopSet(Mod)

src/condense_loopset.jl

Lines changed: 40 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,10 @@ Base.:|(u::Unsigned, it::IndexType) = u | UInt8(it)
44
Base.:(==)(u::Unsigned, it::IndexType) = (u % UInt8) == UInt8(it)
55

66
function _append_fields!(t::Expr, body::Expr, sym::Symbol, ::Type{T}) where {T}
7-
gf = GlobalRef(Core, :getfield)
87
for f ∈ 1:fieldcount(T)
98
TF = fieldtype(T, f)
109
Base.issingletontype(TF) && continue
11-
gfcall = Expr(:call, gf, sym, f)
10+
gfcall = Expr(:call, getfield, sym, f)
1211
if fieldcount(TF) ≡ 0
1312
push!(t.args, gfcall)
1413
elseif TF <: DataType
@@ -37,16 +36,15 @@ end
3736
body
3837
end
3938
function rebuild_fields(offset::Int, ::Type{T}) where {T}
40-
gf = GlobalRef(Core, :getfield)
4139
call = (T <: Tuple) ? Expr(:tuple) : Expr(:new, T)
4240
for f ∈ 1:fieldcount(T)
4341
TF = fieldtype(T, f)
4442
if Base.issingletontype(TF)
4543
push!(call.args, TF.instance)
4644
elseif fieldcount(TF) ≡ 0
47-
push!(call.args, Expr(:call, gf, :t, (offset += 1), false))
45+
push!(call.args, Expr(:call, getfield, :t, (offset += 1)))
4846
elseif TF <: DataType
49-
push!(call.args, Expr(:call, lv(:gettype), Expr(:call, gf, :t, (offset += 1), false)))
47+
push!(call.args, Expr(:call, lv(:gettype), Expr(:call, getfield, :t, (offset += 1))))
5048
else
5149
arg, offset = rebuild_fields(offset, TF)
5250
push!(call.args, arg)
@@ -58,9 +56,9 @@ end
5856
if Base.issingletontype(T)
5957
return T.instance
6058
elseif fieldcount(T) ≡ 0
61-
call = Expr(:call, GlobalRef(Core, :getfield), :t, 1, false)
59+
call = Expr(:call, getfield, :t, 1)
6260
elseif T <: DataType
63-
call = Expr(:call, lv(:gettype), Expr(:call, GlobalRef(Core, :getfield), :t, 1, false))
61+
call = Expr(:call, lv(:gettype), Expr(:call, getfield, :t, 1))
6462
else
6563
call, _ = rebuild_fields(0, T)
6664
end
@@ -377,10 +375,10 @@ val(x) = Expr(:call, Expr(:curly, :Val, x))
377375
quote
378376
$(Expr(:meta, :inline))
379377
p, li =
380-
VectorizationBase.tdot(x, (vsub_nsw(getfield(i, 1, false), one($I)),), strides(x))
378+
VectorizationBase.tdot(x, (vsub_nsw(getfield(i, 1), one($I)),), strides(x))
381379
ptr = gep(p, li)
382380
si = ArrayInterface.StrideIndex{1,$(R[ri],),$(C === 1 ? 1 : 0)}(
383-
(getfield(strides(x), $ri, false),),
381+
(getfield(strides(x), $ri),),
384382
(Zero(),),
385383
)
386384
stridedpointer(ptr, si, StaticInt{$(B === 1 ? 1 : 0)}())
@@ -394,8 +392,8 @@ end
394392
quote
395393
$(Expr(:meta, :inline))
396394
si = ArrayInterface.StrideIndex{1,$(R[ri],),$(C === 1 ? 1 : 0)}(
397-
(getfield(strides(x), $ri, false),),
398-
(getfield(offsets(x), $ri, false),),
395+
(getfield(strides(x), $ri),),
396+
(getfield(offsets(x), $ri),),
399397
)
400398
stridedpointer(pointer(x), si, StaticInt{$(B == 1 ? 1 : 0)}())
401399
end
@@ -550,7 +548,7 @@ function add_grouped_strided_pointer!(extra_args::Expr, ls::LoopSet)
550548
push!(gsp.args, val(matcheddims))
551549
gsps = gensym!(ls, "#grouped#strided#pointer#")
552550
push!(extra_args.args, gsps)
553-
pushpreamble!(ls, Expr(:(=), gsps, Expr(:call, GlobalRef(Core, :getfield), gsp, 1)))
551+
pushpreamble!(ls, Expr(:(=), gsps, Expr(:call, getfield, gsp, 1)))
554552
preserve, shouldindbyind, roots
555553
end
556554

@@ -802,21 +800,10 @@ function generate_call_types(
802800
argmeta = argmeta_and_consts_description(ls, arraysymbolinds)
803801
loop_bounds = loop_boundaries(ls, shouldindbyind)
804802
loop_syms = tuple_expr(QuoteNode, ls.loopsymbols)
805-
func = debug ? lv(:_turbo_loopset_debug) : lv(:_turbo_!)
806803
lbarg = debug ? Expr(:call, :typeof, loop_bounds) : loop_bounds
807804
configarg = (inline, u₁, u₂, v, ls.isbroadcast, thread, warncheckarg, safe)
808805
unroll_param_tup =
809806
Expr(:call, lv(:avx_config_val), :(Val{$configarg}()), VECTORWIDTHSYMBOL)
810-
q = Expr(
811-
:call,
812-
func,
813-
unroll_param_tup,
814-
val(operation_descriptions),
815-
val(arrayref_descriptions),
816-
val(argmeta),
817-
val(loop_syms),
818-
)
819-
820807
add_reassigned_syms!(extra_args, ls) # counterpart to `add_ops!` constants
821808
for (opid, sym) ∈ ls.preamble_symsym # counterpart to process_metadata! symsym extraction
822809
if instruction(ops[opid]) ≠ DROPPEDCONSTANT
@@ -826,17 +813,42 @@ function generate_call_types(
826813
append!(extra_args.args, arraysymbolinds) # add_array_symbols!
827814
add_external_functions!(extra_args, ls) # extract_external_functions!
828815
add_outerreduct_types!(extra_args, ls) # extract_outerreduct_types!
829-
if debug
830-
vecwidthdefq = Expr(:block)
816+
argcestimate = length(extra_args.args) - 1
817+
for ref = ls.refs_aliasing_syms
818+
argcestimate += length(ref.loopedindex)
819+
end
820+
manyarg = !debug && (argcestimate > 16)
821+
func = debug ? lv(:_turbo_loopset_debug) : (manyarg ? lv(:_turbo_manyarg!) : lv(:_turbo_!))
822+
q = Expr(
823+
:call,
824+
func,
825+
unroll_param_tup,
826+
val(operation_descriptions),
827+
val(arrayref_descriptions),
828+
val(argmeta),
829+
val(loop_syms),
830+
)
831+
vecwidthdefq = if debug
831832
push!(q.args, Expr(:tuple, lbarg, extra_args))
833+
Expr(:block)
832834
else
833835
vargsym = gensym(:vargsym)
834-
vecwidthdefq = Expr(:block, Expr(:(=), vargsym, Expr(:tuple, lbarg, extra_args)))
835836
push!(
836837
q.args,
837-
Expr(:call, GlobalRef(Base, :Val), Expr(:call, GlobalRef(Base, :typeof), vargsym)),
838-
Expr(:(...), Expr(:call, lv(:flatten_to_tuple), vargsym)),
838+
Expr(:call, GlobalRef(Base, :Val), Expr(:call, GlobalRef(Base, :typeof), vargsym))
839839
)
840+
if manyarg
841+
push!(
842+
q.args,
843+
Expr(:call, lv(:flatten_to_tuple), vargsym),
844+
)
845+
else
846+
push!(
847+
q.args,
848+
Expr(:(...), Expr(:call, lv(:flatten_to_tuple), vargsym)),
849+
)
850+
end
851+
Expr(:block, Expr(:(=), vargsym, Expr(:tuple, lbarg, extra_args)))
840852
end
841853
define_eltype_vec_width!(vecwidthdefq, ls, nothing, true)
842854
push!(vecwidthdefq.args, q)

src/reconstruct_loopset.jl

Lines changed: 61 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ function Loop(
121121
end
122122

123123

124-
extract_loop(l) = Expr(:call, GlobalRef(Core, :getfield), Symbol("#loop#bounds#"), l, false)
124+
extract_loop(l) = Expr(:call, getfield, Symbol("#loop#bounds#"), l)
125125

126126
function add_loops!(ls::LoopSet, LPSYM, LB)
127127
n = max(length(LPSYM), length(LB))
@@ -145,7 +145,7 @@ function add_loops!(
145145
ssym = String(sym)
146146
for k = N:-1:1
147147
axisexpr =
148-
:(getfield(getfield(getfield(var"#loop#bounds#", $i, false), :indices), $k, false))
148+
:($getfield($getfield($getfield(var"#loop#bounds#", $i), :indices), $k))
149149
add_loop!(
150150
ls,
151151
Loop(ls, axisexpr, Symbol(ssym * '#' * string(k) * '#'), T.parameters[k])::Loop,
@@ -258,7 +258,7 @@ function ArrayReferenceMeta(
258258
end
259259

260260

261-
extract_varg(i) = :(getfield(var"#vargs#", $i, false))
261+
extract_varg(i) = :($getfield(var"#vargs#", $i))
262262
# _extract(::Type{StaticInt{N}}) where {N} = N
263263
extract_gsp!(sptrs::Expr, name::Symbol) = (push!(sptrs.args, name); nothing)
264264
tupleranks(R::NTuple{8,Int}) = ntuple(n -> sum(R[n] .≥ R), Val{8}())
@@ -319,7 +319,7 @@ function _add_mref!(
319319
extract_gsp!(sptrs, tmpsp)
320320
strd_tup = Expr(:tuple)
321321
offsets_tup = Expr(:tuple)
322-
gf = GlobalRef(Core, :getfield)
322+
gf = getfield
323323
offsets = gensym(:offsets)
324324
strides = gensym(:strides)
325325
pushpreamble!(ls, Expr(:(=), offsets, Expr(:call, lv(:offsets), tmpsp)))
@@ -1019,3 +1019,60 @@ Execute an `@turbo` block. The block's code is represented via the arguments:
10191019
post === ls.preamble ? q : Expr(:block, q, post)
10201020
# @show var"#UNROLL#", var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#", var"#LB#"
10211021
end
1022+
@generated function _turbo_manyarg!(
1023+
::Val{var"#UNROLL#"},
1024+
::Val{var"#OPS#"},
1025+
::Val{var"#ARF#"},
1026+
::Val{var"#AM#"},
1027+
::Val{var"#LPSYM#"},
1028+
::Val{Tuple{var"#LB#",var"#V#"}},
1029+
var"#flattened#var#arguments#"::Tuple{Vararg{Any,var"#num#vargs#"}},
1030+
) where {
1031+
var"#UNROLL#",
1032+
var"#OPS#",
1033+
var"#ARF#",
1034+
var"#AM#",
1035+
var"#LPSYM#",
1036+
var"#LB#",
1037+
var"#V#",
1038+
var"#num#vargs#",
1039+
}
1040+
1 + 1 # Irrelevant line you can comment out/in to force recompilation...
1041+
ls = _turbo_loopset(
1042+
var"#OPS#",
1043+
var"#ARF#",
1044+
var"#AM#",
1045+
var"#LPSYM#",
1046+
var"#LB#".parameters,
1047+
var"#V#".parameters,
1048+
var"#UNROLL#",
1049+
)
1050+
pushfirst!(
1051+
ls.preamble.args,
1052+
:(
1053+
var"#lv#tuple#args#" =
1054+
reassemble_tuple(Tuple{var"#LB#",var"#V#"}, var"#flattened#var#arguments#")
1055+
),
1056+
)
1057+
post = hoist_constant_memory_accesses!(ls)
1058+
# q = @show(avx_body(ls, var"#UNROLL#")); post === ls.preamble ? q : Expr(:block, q, post)
1059+
q = if (var"#UNROLL#"[10] > 1) && length(var"#LPSYM#") == length(ls.loops)
1060+
inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, nt, wca, safe = var"#UNROLL#"
1061+
# wrap in `var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#"` in `Expr` to homogenize types
1062+
avx_threads_expr(
1063+
ls,
1064+
(inline, u₁, u₂, v, isbroadcast, W, rs, rc, cls, one(UInt), wca, safe),
1065+
nt,
1066+
:(Val{$(var"#OPS#")}()),
1067+
:(Val{$(var"#ARF#")}()),
1068+
:(Val{$(var"#AM#")}()),
1069+
:(Val{$(var"#LPSYM#")}()),
1070+
)
1071+
else
1072+
# Main.BODY[] = avx_body(ls, var"#UNROLL#")
1073+
# return @show avx_body(ls, var"#UNROLL#")
1074+
avx_body(ls, var"#UNROLL#")
1075+
end
1076+
post === ls.preamble ? q : Expr(:block, q, post)
1077+
# @show var"#UNROLL#", var"#OPS#", var"#ARF#", var"#AM#", var"#LPSYM#", var"#LB#"
1078+
end

0 commit comments

Comments
 (0)