Skip to content

Commit c1b3842

Browse files
committed
2 parents 678c250 + 691af33 commit c1b3842

File tree

11 files changed

+75
-53
lines changed

11 files changed

+75
-53
lines changed

Project.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.8.6"
4+
version = "0.8.7"
55

66
[deps]
77
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
@@ -15,10 +15,10 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1515
[compat]
1616
DocStringExtensions = "0.8"
1717
OffsetArrays = "1"
18-
SIMDPirates = "0.8.8"
18+
SIMDPirates = "0.8.9"
1919
SLEEFPirates = "0.5"
2020
UnPack = "0,1"
21-
VectorizationBase = "0.12.10"
21+
VectorizationBase = "0.12.11"
2222
julia = "1.1"
2323

2424
[extras]

src/add_compute.jl

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ function add_compute!(
255255
mergesetv!(newreduceddeps, reduceddeps)
256256
deps = newloopdeps; reduceddeps = newreduceddeps
257257
end
258-
if reduction || search_tree(vparents, var)
258+
op = if reduction || search_tree(vparents, var)
259259
parent = ls.opdict[var]
260260
setdiffv!(reduceddeps, deps, loopdependencies(parent))
261261
# parent = getop(ls, var, elementbytes)
@@ -272,6 +272,8 @@ function add_compute!(
272272
op = Operation(length(operations(ls)), var, elementbytes, instr, compute, deps, reduceddeps, vparents)
273273
pushop!(ls, op, var)
274274
end
275+
# maybe_const_compute!(ls, op, elementbytes, position)
276+
op
275277
end
276278

277279
function add_compute!(

src/add_constants.jl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ function add_constant!(
4949
ls::LoopSet, value::Symbol, deps::Vector{Symbol}, assignedsym::Symbol, elementbytes::Int, f::Symbol = Symbol("")
5050
)
5151
retop = get(ls.opdict, value, nothing)
52-
# @show retop, value ls.opdict
5352
if retop !== nothing
5453
op = Operation(length(operations(ls)), assignedsym, elementbytes, :identity, compute, deps, reduceddependencies(retop), [retop])
5554
else

src/costs.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -253,6 +253,7 @@ Base.convert(::Type{Instruction}, instr::Symbol) = Instruction(instr)
253253
# instruction(f::Symbol, m) = f ∈ keys(COST) ? Instruction(f) : Instruction(m, f)
254254
# instruction(f::Symbol) = f ∈ keys(COST) ? Instruction(:LoopVectorization, f) : Instruction(Symbol(""), f)
255255
function instruction(f::Symbol)
256+
f === :ifelse && return Instruction(:LoopVectorization, :vifelse)
256257
# @assert f ∈ keys(COST)
257258
f ∈ keys(COST) ? Instruction(:LoopVectorization, f) : Instruction(Symbol(""), f)
258259
end

src/graphs.jl

Lines changed: 34 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -112,20 +112,6 @@ function vec_looprange(loopmax, UF::Int, mangledname, W)
112112
Expr(:call, :<, mangledname, compexpr)
113113
end
114114

115-
# function looprange(stopcon, incr::Int, mangledname::Symbol, ptrcomp::Bool, verbose)
116-
# if ptrcomp
117-
# looprange(stopcon, Expr(:call, lv(:vsub), staticmulincr(mangledname, incr), 1), callpointer(mangledname), verbose)
118-
# else
119-
# looprange(stopcon, incr - 1, mangledname)
120-
# end
121-
# end
122-
# function looprange(stopcon, incr, mangledname, verbose)
123-
# if verbose
124-
# Expr(:call, :<, :(@show $mangledname), :(@show $(subexpr(stopcon, incr))))
125-
# else
126-
# Expr(:call, :<, mangledname, subexpr(stopcon, incr))
127-
# end
128-
# end
129115
function looprange(stopcon, incr::Int, mangledname)
130116
if iszero(incr)
131117
Expr(:call, :, mangledname, stopcon)
@@ -562,6 +548,25 @@ function instruction!(ls::LoopSet, x::Expr)
562548
end
563549
instruction!(ls::LoopSet, x::Symbol) = instruction(x)
564550

551+
552+
function maybe_const_compute!(ls::LoopSet, op::Operation, elementbytes::Int, position::Int)
553+
if iscompute(op) && iszero(length(loopdependencies(op)))
554+
add_constant!(ls, mangledvar(op), ls.loopsymbols[1:position], gensym(instruction(op).instr), elementbytes, :numericconstant)
555+
else
556+
op
557+
end
558+
end
559+
function strip_op_linenumber_nodes(q::Expr)
560+
non_lnn_ind = 0
561+
for i ∈ eachindex(q.args)
562+
if !(q.args[i] isa LineNumberNode)
563+
@assert iszero(non_lnn_ind) "There should only be one non-LineNumberNode in the expression."
564+
non_lnn_ind = i
565+
end
566+
end
567+
q.args[non_lnn_ind]
568+
end
569+
565570
function add_operation!(
566571
ls::LoopSet, LHS::Symbol, RHS::Expr, elementbytes::Int, position::Int
567572
)
@@ -581,12 +586,16 @@ function add_operation!(
581586
end
582587
op
583588
else
589+
# maybe_const_compute!(ls, add_compute!(ls, LHS, RHS, elementbytes, position), elementbytes, position)
584590
add_compute!(ls, LHS, RHS, elementbytes, position)
585591
end
586592
elseif RHS.head === :if
587593
add_if!(ls, LHS, RHS, elementbytes, position)
594+
elseif RHS.head === :block
595+
add_operation!(ls, LHS, strip_op_linenumber_nodes(RHS), elementbytes, position)
588596
else
589-
throw("Expression not recognized:\n$RHS")
597+
println(RHS)
598+
throw("Expression not recognized.")
590599
end
591600
end
592601
add_operation!(ls::LoopSet, RHS::Expr, elementbytes::Int, position::Int) = add_operation!(ls, gensym(:LHS), RHS, elementbytes, position)
@@ -606,6 +615,7 @@ function add_operation!(
606615
elseif f === :zero || f === :one
607616
c = gensym(f)
608617
op = add_constant!(ls, c, ls.loopsymbols[1:position], LHS_sym, elementbytes, :numericconstant)
618+
# op = add_constant!(ls, c, Symbol[], LHS_sym, elementbytes, :numericconstant)
609619
if f === :zero
610620
push!(ls.preamble_zeros, (identifier(op), IntOrFloat))
611621
else
@@ -617,8 +627,11 @@ function add_operation!(
617627
end
618628
elseif RHS.head === :if
619629
add_if!(ls, LHS_sym, RHS, elementbytes, position, LHS_ref)
630+
elseif RHS.head === :block
631+
add_operation!(ls, LHS, strip_op_linenumber_nodes(RHS), elementbytes, position)
620632
else
621-
throw("Expression not recognized:\n$x")
633+
println(RHS)
634+
throw("Expression not recognized.")
622635
end
623636
end
624637

@@ -652,7 +665,7 @@ function Base.push!(ls::LoopSet, ex::Expr, elementbytes::Int, position::Int)
652665
RHS = ex.args[2]
653666
if LHS isa Symbol
654667
if RHS isa Expr
655-
add_operation!(ls, LHS, RHS, elementbytes, position)
668+
maybe_const_compute!(ls, add_operation!(ls, LHS, RHS, elementbytes, position), elementbytes, position)
656669
else
657670
add_constant!(ls, RHS, ls.loopsymbols[1:position], LHS, elementbytes)
658671
end
@@ -669,7 +682,8 @@ function Base.push!(ls::LoopSet, ex::Expr, elementbytes::Int, position::Int)
669682
add_store_ref!(ls, RHS, LHS, elementbytes)
670683
end
671684
else
672-
throw("LHS not understood:\n$LHS")
685+
println(LHS)
686+
throw("LHS not understood.")
673687
end
674688
elseif ex.head === :block
675689
add_block!(ls, ex, elementbytes, position)
@@ -692,7 +706,8 @@ function Base.push!(ls::LoopSet, ex::Expr, elementbytes::Int, position::Int)
692706
add_compute!(ls, LHS, :identity, [RHS], elementbytes)
693707
end
694708
else
695-
throw("Don't know how to handle expression:\n$ex")
709+
println(ex)
710+
throw("Don't know how to handle expression.")
696711
end
697712
end
698713

src/lower_compute.jl

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,6 @@ function lower_compute!(
166166
newparentname = Symbol(newparentname, suffix_)
167167
end
168168
if isconstant(newparentop)
169-
# @show i, parentstiled[i], newparentname, parentname
170169
push!(q.args, Expr(:(=), newparentname, Symbol(parentname, 0)))
171170
else
172171
for u ∈ 0:u₁-1
@@ -183,11 +182,6 @@ function lower_compute!(
183182
# parentsyms = [opp.variable for opp ∈ parents(op)]
184183
Uiter = opunrolled ? u₁ - 1 : 0
185184
isreduct = isreduction(op)
186-
# @show op opunrolled, optiled, isreduct, unrollsym
187-
# if instr.instr === :vfmadd_fast
188-
# diffdeps = !any(opp -> isload(opp) && all(in(loopdependencies(opp)), loopdependencies(op)), parents(op)) # want to instcombine when parent load's deps are superset
189-
# @show suffix, !isnothing(suffix), isreduct, diffdeps
190-
# end
191185
if !isnothing(suffix) && isreduct# && (iszero(suffix) || (ls.unrollspecification[].u₂ - 1 == suffix))
192186
# instrfid = findfirst(isequal(instr.instr), (:vfmadd, :vfnmadd, :vfmsub, :vfnmsub))
193187
instrfid = findfirst(isequal(instr.instr), (:vfmadd_fast, :vfnmadd_fast, :vfmsub_fast, :vfnmsub_fast))
@@ -199,7 +193,6 @@ function lower_compute!(
199193
instr = Instruction(specific_fmas[instrfid])
200194
end
201195
end
202-
# @show instr.instr
203196
reduceddeps = reduceddependencies(op)
204197
vecinreduceddeps = isreduct && vectorized ∈ reduceddeps
205198
maskreduct = !isnothing(mask) && vecinreduceddeps #any(opp -> opp.variable === var, parents_op)
@@ -234,7 +227,6 @@ function lower_compute!(
234227
add_loopvalue!(instrcall, loopval, ua, u)
235228
else
236229
parent = mangledvar(parents_op[n])
237-
# @show n, tiledouterreduction, parent
238230
if n == tiledouterreduction
239231
parent = Symbol(parent, modsuffix)
240232
else

src/lower_constant.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ function lower_constant!(
5656
instruction = op.instruction
5757
constsym = instruction.instr
5858
reducedchildvectorized = vectorized ∈ reducedchildren(op)
59-
if reducedchildvectorized || vectorized ∈ loopdependencies(op) || vectorized ∈ reduceddependencies(op)
59+
if reducedchildvectorized || vectorized ∈ loopdependencies(op) || vectorized ∈ reduceddependencies(op)
6060
# call = Expr(:call, lv(:vbroadcast), W, Expr(:call, lv(:maybeconvert), typeT, constsym))
6161
call = if reducedchildvectorized && vectorized ∉ loopdependencies(op)
6262
instrclass = getparentsreductzero(ls, op)

src/precompile.jl

Lines changed: 3 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,24 +1,10 @@
11
function _precompile_()
22
ccall(:jl_generating_output, Cint, ()) == 1 || return nothing
33

4-
Base.precompile(Tuple{Core.kwftype(typeof(LoopVectorization.vreduce)),NamedTuple{(:dims,),Tuple{Int}},typeof(vreduce),typeof(+),Array{Float32,3}})
5-
Base.precompile(Tuple{Core.kwftype(typeof(LoopVectorization.vreduce)),NamedTuple{(:dims,),Tuple{Int}},typeof(vreduce),typeof(+),Array{Float64,3}})
6-
Base.precompile(Tuple{Core.kwftype(typeof(LoopVectorization.vreduce)),NamedTuple{(:dims,),Tuple{Int}},typeof(vreduce),typeof(max),Array{Float32,1}})
7-
Base.precompile(Tuple{Core.kwftype(typeof(LoopVectorization.vreduce)),NamedTuple{(:dims,),Tuple{Int}},typeof(vreduce),typeof(max),Array{Float32,3}})
8-
Base.precompile(Tuple{Core.kwftype(typeof(LoopVectorization.vreduce)),NamedTuple{(:dims,),Tuple{Int}},typeof(vreduce),typeof(max),Array{Float64,1}})
9-
Base.precompile(Tuple{Core.kwftype(typeof(LoopVectorization.vreduce)),NamedTuple{(:dims,),Tuple{Int}},typeof(vreduce),typeof(max),Array{Float64,3}})
10-
Base.precompile(Tuple{Core.kwftype(typeof(LoopVectorization.vreduce)),NamedTuple{(:dims,),Tuple{Int}},typeof(vreduce),typeof(min),Array{Float32,1}})
11-
Base.precompile(Tuple{Core.kwftype(typeof(LoopVectorization.vreduce)),NamedTuple{(:dims,),Tuple{Int}},typeof(vreduce),typeof(min),Array{Float32,3}})
12-
Base.precompile(Tuple{Core.kwftype(typeof(LoopVectorization.vreduce)),NamedTuple{(:dims,),Tuple{Int}},typeof(vreduce),typeof(min),Array{Float64,1}})
13-
Base.precompile(Tuple{Core.kwftype(typeof(LoopVectorization.vreduce)),NamedTuple{(:dims,),Tuple{Int}},typeof(vreduce),typeof(min),Array{Float64,3}})
14-
Base.precompile(Tuple{Type{LoopVectorization.ArrayRefStruct},LoopVectorization.LoopSet,LoopVectorization.ArrayReferenceMeta,Array{Symbol,1}})
154
Base.precompile(Tuple{Type{LoopVectorization.LoopSet},Array{Symbol,1},Array{Int,1},Array{LoopVectorization.Loop,1},Dict{Symbol,LoopVectorization.Operation},Array{LoopVectorization.Operation,1},Array{Int,1},Array{Int,1},LoopVectorization.LoopOrder,Expr,Expr,Array{Tuple{Int,Symbol},1},Array{Tuple{Int,Int},1},Array{Tuple{Int,Float64},1},Array{Int,1},Array{Int,1},Array{Symbol,1},Array{Symbol,1},Array{Symbol,1},Array{LoopVectorization.ArrayReferenceMeta,1},Array{Float64,2},Array{Float64,2},Array{Bool,1},Array{Bool,1},Base.RefValue{LoopVectorization.UnrollSpecification},Base.RefValue{Bool},Base.RefValue{LoopVectorization.LoopStartStopManager},Base.RefValue{Bool},Symbol})
165
Base.precompile(Tuple{Type{LoopVectorization.LoopSet},Expr,Symbol})
176
Base.precompile(Tuple{Type{LoopVectorization.LoopSet},Symbol})
18-
Base.precompile(Tuple{typeof(Base.mapreduce_impl),typeof(LoopVectorization.elsize),typeof(max),Array{LoopVectorization.Operation,1},Int,Int})
19-
Base.precompile(Tuple{typeof(LoopVectorization._avx_loopset),Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Any})
207
Base.precompile(Tuple{typeof(LoopVectorization.add_ci_call!),Expr,Any,Array{Any,1},Array{Symbol,1},Int,Symbol})
21-
Base.precompile(Tuple{typeof(LoopVectorization.add_ci_call!),Expr,Any,Array{Any,1},Array{Symbol,1},Int})
228
Base.precompile(Tuple{typeof(LoopVectorization.add_compute!),LoopVectorization.LoopSet,Symbol,Expr,Int,Int,LoopVectorization.ArrayReferenceMetaPosition})
239
Base.precompile(Tuple{typeof(LoopVectorization.add_compute!),LoopVectorization.LoopSet,Symbol,Expr,Int,Int,Nothing})
2410
Base.precompile(Tuple{typeof(LoopVectorization.add_constant!),LoopVectorization.LoopSet,Float64,Array{Symbol,1},Symbol,Int})
@@ -29,17 +15,15 @@ function _precompile_()
2915
Base.precompile(Tuple{typeof(LoopVectorization.array_reference_meta!),LoopVectorization.LoopSet,Symbol,SubArray{Any,1,Array{Any,1},Tuple{UnitRange{Int}},true},Int,Nothing})
3016
Base.precompile(Tuple{typeof(LoopVectorization.avx_loopset),Array{LoopVectorization.Instruction,1},Array{LoopVectorization.OperationStruct,1},Array{LoopVectorization.ArrayRefStruct,1},Core.SimpleVector,Core.SimpleVector,Core.SimpleVector,Any})
3117
Base.precompile(Tuple{typeof(LoopVectorization.check_macro_kwarg),Expr})
18+
Base.precompile(Tuple{typeof(LoopVectorization.choose_order_cost),LoopVectorization.LoopSet})
3219
Base.precompile(Tuple{typeof(LoopVectorization.cost_vec_buf),LoopVectorization.LoopSet})
3320
Base.precompile(Tuple{typeof(LoopVectorization.evaluate_cost_tile),LoopVectorization.LoopSet,Array{Symbol,1},LoopVectorization.UnrollSymbols})
3421
Base.precompile(Tuple{typeof(LoopVectorization.evaluate_cost_unroll),LoopVectorization.LoopSet,Array{Symbol,1},Symbol,Float64})
3522
Base.precompile(Tuple{typeof(LoopVectorization.generate_call),LoopVectorization.LoopSet,Tuple{Int8,Int8,Int8},Bool})
36-
Base.precompile(Tuple{typeof(LoopVectorization.prefetchisagoodidea),LoopVectorization.LoopSet,LoopVectorization.Operation,LoopVectorization.UnrollArgs{Int}})
23+
Base.precompile(Tuple{typeof(LoopVectorization.lower_and_split_loops),LoopVectorization.LoopSet,Int})
24+
Base.precompile(Tuple{typeof(LoopVectorization.matches),LoopVectorization.Operation,LoopVectorization.Operation})
3725
Base.precompile(Tuple{typeof(LoopVectorization.pushop!),LoopVectorization.LoopSet,LoopVectorization.Operation,Symbol})
38-
Base.precompile(Tuple{typeof(LoopVectorization.repeated_index!),LoopVectorization.LoopSet,Array{Symbol,1},Symbol,Int,Int})
39-
Base.precompile(Tuple{typeof(LoopVectorization.stride_penalty),LoopVectorization.LoopSet,Array{Symbol,1}})
4026
Base.precompile(Tuple{typeof(LoopVectorization.substitute_broadcast),Expr,Symbol})
41-
Base.precompile(Tuple{typeof(LoopVectorization.vmap_quote),Int,Type{Float32}})
4227
Base.precompile(Tuple{typeof(println),Base.GenericIOBuffer{Array{UInt8,1}},Array{LoopVectorization.Operation,1}})
4328
Base.precompile(Tuple{typeof(push!),LoopVectorization.LoopSet,Expr,Int,Int})
4429
end
45-

test/ifelsemasks.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,11 @@ T = Float32
8080
end
8181
function addormulavx!(c, a, b)
8282
@avx for i ∈ eachindex(c,a,b)
83-
c[i] = a[i] > b[i] ? a[i] + b[i] : a[i] * b[i]
83+
c[i] = if a[i] > b[i]
84+
a[i] + b[i]
85+
else
86+
a[i] * b[i]
87+
end
8488
end
8589
end
8690
function addormulp1!(c, a, b)

test/mapreduce.jl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,13 @@
88
end
99
s
1010
end
11+
function minimum_avx(x)
12+
s = typemax(eltype(x))
13+
@avx for i in eachindex(x)
14+
s = min(s, x[i])
15+
end
16+
s
17+
end
1118
for T ∈ (Int32, Int64, Float32, Float64)
1219
@show T, @__LINE__
1320
if T <: Integer
@@ -38,6 +45,7 @@
3845
@test vmapreduce(log, +, x) ≈ sum(log, x)
3946
@test vmapreduce(abs2, +, x) ≈ sum(abs2, x)
4047
@test maximum(x) == vreduce(max, x) == maximum_avx(x)
48+
@test minimum(x) == vreduce(min, x) == minimum_avx(x)
4149

4250
@test vreduce(max, vec(x); dims = 1) == maximum(vec(x); dims = 1)
4351
@test vreduce(min, vec(x); dims = 1) == minimum(vec(x); dims = 1)

0 commit comments

Comments
 (0)