Skip to content

Commit 8430e5c

Browse files
committed
Merge branch 'master' of github.com:JuliaSIMD/LoopVectorization.jl
2 parents 3738a05 + 6736394 commit 8430e5c

File tree

7 files changed

+93
-107
lines changed

7 files changed

+93
-107
lines changed

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.36"
4+
version = "0.12.39"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -30,5 +30,5 @@ Static = "0.2"
3030
StrideArraysCore = "0.1.12"
3131
ThreadingUtilities = "0.4.2"
3232
UnPack = "1"
33-
VectorizationBase = "0.20.16"
33+
VectorizationBase = "0.20.17"
3434
julia = "1.5"

src/broadcast.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -464,9 +464,9 @@ end
464464
@inline function vmaterialize(
465465
bc::Broadcasted, ::Val{Mod}, ::Val{UNROLL}
466466
) where {Mod,UNROLL}
467-
ElType = Base.Broadcast.combine_eltypes(bc.f, bc.args)
468-
dest = similar(bc, ElType)
469-
vmaterialize!(dest, bc, Val{Mod}(), Val{UNROLL}())
467+
ElType = Base.Broadcast.combine_eltypes(bc.f, bc.args)
468+
dest = similar(bc, ElType)
469+
vmaterialize!(dest, bc, Val{Mod}(), Val{UNROLL}())
470470
end
471471

472472
vmaterialize!(dest, bc, ::Val, ::Val, ::StaticInt, ::StaticInt, ::StaticInt) = Base.Broadcast.materialize!(dest, bc)

src/codegen/split_loops.jl

Lines changed: 34 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -89,39 +89,41 @@ function returned_ops(ls::LoopSet)
8989
end
9090

9191
function lower_and_split_loops(ls::LoopSet, inline::Int)
92-
split_candidates = returned_ops(ls)
93-
length(split_candidates) > 1 || return lower(ls, inline)
94-
order_fused, unrolled_fused, tiled_fused, vectorized_fused, U_fused, T_fused, cost_fused, shouldinline_fused = choose_order_cost(ls)
95-
remaining_ops = Vector{Int}(undef, length(split_candidates) - 1); split_1 = Int[0];
96-
# for (ind,i) ∈ enumerate(split_candidates)
97-
for (ind,i) ∈ enumerate(split_candidates)
98-
split_1[1] = i
99-
ls_1 = split_loopset(ls, split_1)
100-
order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, cost_1, shouldinline_1 = choose_order_cost(ls_1)
101-
remaining_ops[1:ind-1] .= @view(split_candidates[1:ind-1]); remaining_ops[ind:end] .= @view(split_candidates[ind+1:end])
102-
ls_2 = split_loopset(ls, remaining_ops)
103-
order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = choose_order_cost(ls_2)
104-
# U_1 = T_1 = U_2 = T_2 = 2
105-
#@show cost_1 + cost_2 ≤ cost_fused, cost_1, cost_2, cost_fused
106-
if cost_1 + cost_2 ≤ cost_fused
107-
ls_2_lowered = if length(remaining_ops) > 1
108-
inline = iszero(inline) ? (shouldinline_1 % Int) : inline
109-
lower_and_split_loops(ls_2, inline)
110-
else
111-
doinline = inlinedecision(inline, shouldinline_1 | shouldinline_2)
112-
lower(ls_2, order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, doinline)
113-
end
114-
return Expr(
115-
:block,
116-
ls.preamble,
117-
lower(ls_1, order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, false),
118-
ls_2_lowered,
119-
nothing
120-
)
121-
end
92+
split_candidates = returned_ops(ls)
93+
length(split_candidates) > 1 || return lower(ls, inline)
94+
order_fused, unrolled_fused, tiled_fused, vectorized_fused, U_fused, T_fused, cost_fused, shouldinline_fused = choose_order_cost(ls)
95+
remaining_ops = Vector{Int}(undef, length(split_candidates) - 1); split_1 = Int[0];
96+
# for (ind,i) ∈ enumerate(split_candidates)
97+
for (ind,i) ∈ enumerate(split_candidates)
98+
split_1[1] = i
99+
ls_1 = split_loopset(ls, split_1)
100+
order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, cost_1, shouldinline_1 = choose_order_cost(ls_1)
101+
remaining_ops[1:ind-1] .= @view(split_candidates[1:ind-1]); remaining_ops[ind:end] .= @view(split_candidates[ind+1:end])
102+
ls_2 = split_loopset(ls, remaining_ops)
103+
order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, cost_2, shouldinline_2 = choose_order_cost(ls_2)
104+
# U_1 = T_1 = U_2 = T_2 = 2
105+
# return ls_1, ls_2
106+
# @show cost_1 + cost_2 ≤ cost_fused, cost_1, cost_2, cost_fused
107+
if cost_1 + cost_2 ≤ 0.9cost_fused
108+
ls_2_lowered = if length(remaining_ops) > 1
109+
inline = iszero(inline) ? (shouldinline_1 % Int) : inline
110+
lower_and_split_loops(ls_2, inline)
111+
else
112+
doinline = inlinedecision(inline, shouldinline_1 | shouldinline_2)
113+
lower(ls_2, order_2, unrolled_2, tiled_2, vectorized_2, U_2, T_2, doinline)
114+
end
115+
return Expr(
116+
:block,
117+
ls.preamble,
118+
lower(ls_1, order_1, unrolled_1, tiled_1, vectorized_1, U_1, T_1, false),
119+
ls_2_lowered,
120+
nothing
121+
)
122122
end
123-
doinline = inlinedecision(inline, shouldinline_fused)
124-
lower(ls, order_fused, unrolled_fused, tiled_fused, vectorized_fused, U_fused, T_fused, doinline)
123+
length(split_candidates) == 2 && break
124+
end
125+
doinline = inlinedecision(inline, shouldinline_fused)
126+
lower(ls, order_fused, unrolled_fused, tiled_fused, vectorized_fused, U_fused, T_fused, doinline)
125127
end
126128

127129

src/modeling/determinestrategy.jl

Lines changed: 39 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,4 @@
11

2-
3-
# function indexappearences(op::Operation, s::Symbol)
4-
# s ∉ loopdependencies(op) && return 0
5-
# appearences = 0
6-
# if isloopvalue(op)
7-
# return s === first(loopdependencies(op)) ? 1 : 0
8-
# elseif isload(op)
9-
# return 100
10-
# end
11-
# newapp = 0
12-
# for opp ∈ parents(op)
13-
# newapp += indexappearences(opp, s)
14-
# end
15-
# factor = instruction(op).instr ∈ (:+, :vadd, :add_fast, :vadd_fast) ? 1 : 10
16-
# newapp * factor
17-
# end
182
function check_linear_parents(ls::LoopSet, op::Operation, s::Symbol)
193
(s ∈ loopdependencies(op)) || return true
204
if isload(op) # TODO: handle loading from ranges.
@@ -155,44 +139,44 @@ end
155139
function evaluate_cost_unroll(
156140
ls::LoopSet, order::Vector{Symbol}, vloopsym::Symbol, max_cost = typemax(Float64)
157141
)
158-
included_vars = fill!(resize!(ls.included_vars, length(operations(ls))), false)
159-
nested_loop_syms = Symbol[]#Set{Symbol}()
160-
total_cost = 0.0
161-
iter = 1.0
162-
size_T = biggest_type_size(ls)
163-
W, Wshift = lsvecwidthshift(ls, vloopsym, size_T)
164-
# Need to check if fusion is possible
165-
for itersym ∈ order
166-
cacheunrolled!(ls, itersym, Symbol(""), vloopsym)
167-
# Add to set of defined symbles
168-
push!(nested_loop_syms, itersym)
169-
looplength = length(ls, itersym)
170-
liter = itersym === vloopsym ? num_iterations(looplength, W) : looplength
171-
iter *= liter
172-
# check which vars we can define at this level of loop nest
173-
for (id,op) ∈ enumerate(operations(ls))
174-
# won't define if already defined...
175-
# id = identifier(op)
176-
included_vars[id] && continue
177-
# it must also be a subset of defined symbols
178-
loopdependencies(op) ⊆ nested_loop_syms || continue
179-
# hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf
180-
rd = reduceddependencies(op)
181-
hasintersection(rd, @view(nested_loop_syms[1:end-length(rd)])) && return Inf
182-
if isstore(op) #TODO: DRY (this is repeated in evaluate_cost_tile)
183-
loadstoredeps = store_load_deps(op)
184-
if !(loadstoredeps === nothing)
185-
any(s -> (s loadstoredeps), nested_loop_syms) && return Inf
186-
end
187-
end
188-
included_vars[id] = true
189-
# TODO: use actual unrolls here?
190-
c = first(cost(ls, op, (Symbol(""),Symbol("")), vloopsym, Wshift, size_T))
191-
total_cost += iter * c
192-
total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
142+
included_vars = fill!(resize!(ls.included_vars, length(operations(ls))), false)
143+
nested_loop_syms = Symbol[]#Set{Symbol}()
144+
total_cost = 0.0
145+
iter = 1.0
146+
size_T = biggest_type_size(ls)
147+
W, Wshift = lsvecwidthshift(ls, vloopsym, size_T)
148+
# Need to check if fusion is possible
149+
for itersym ∈ order
150+
cacheunrolled!(ls, itersym, Symbol(""), vloopsym)
151+
# Add to set of defined symbles
152+
push!(nested_loop_syms, itersym)
153+
looplength = length(ls, itersym)
154+
liter = itersym === vloopsym ? num_iterations(looplength, W) : looplength
155+
iter *= liter
156+
# check which vars we can define at this level of loop nest
157+
for (id,op) ∈ enumerate(operations(ls))
158+
# won't define if already defined...
159+
# id = identifier(op)
160+
included_vars[id] && continue
161+
# it must also be a subset of defined symbols
162+
loopdependencies(op) ⊆ nested_loop_syms || continue
163+
# hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf
164+
rd = reduceddependencies(op)
165+
hasintersection(rd, @view(nested_loop_syms[1:end-length(rd)])) && return Inf
166+
if isstore(op) #TODO: DRY (this is repeated in evaluate_cost_tile)
167+
loadstoredeps = store_load_deps(op)
168+
if !(loadstoredeps === nothing)
169+
any(s -> (s loadstoredeps), nested_loop_syms) && return Inf
193170
end
171+
end
172+
included_vars[id] = true
173+
# TODO: use actual unrolls here?
174+
c = first(cost(ls, op, (Symbol(""),Symbol("")), vloopsym, Wshift, size_T))
175+
total_cost += iter * c
176+
0.9total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
194177
end
195-
0.999total_cost + stride_penalty(ls, order) # 0.999 to place finger on scale in its favor
178+
end
179+
0.9total_cost + stride_penalty(ls, order) # 0.999 to place finger on scale in its favor
196180
end
197181

198182
# only covers vectorized ops; everything else considered lifted?
@@ -275,7 +259,7 @@ function unroll_no_reductions(ls, order, vloopsym)
275259
elseif iszero(load_rt)
276260
iszero(store_rt) ? 4 : max(1, min(4, round(Int, 2compute_rt / store_rt)))
277261
else
278-
max(1, min(4, round(Int, 2compute_rt / load_rt)))
262+
max(1, min(4, round(Int, 1.75compute_rt / load_rt)))
279263
end
280264
# u = min(u, max(1, (reg_count(ls) ÷ max(1,round(Int,rp)))))
281265
# commented out here is to decide to align loops
@@ -1062,7 +1046,8 @@ function evaluate_cost_tile!(
10621046
# println("constoffelim")
10631047
continue
10641048
elseif load_elimination_cost_factor!(cost_vec, reg_pressure, choose_to_inline, ls, op, iters[id], unrollsyms, Wshift, size_T)
1065-
# println("loadelim")
1049+
# println("loadelim")
1050+
# A[i,j-1], A[i,j]
10661051
continue
10671052
end
10681053
#elseif isconstant(op)

src/modeling/graphs.jl

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -971,21 +971,20 @@ end
971971
# end
972972
# instruction(ls::LoopSet, f::Symbol) = instruction!(ls, f)
973973
function instruction!(ls::LoopSet, x::Expr)
974-
# x isa Symbol && return x
975-
if x.head === :$
976-
_x = only(x.args)
977-
_x isa Symbol && return instruction!(ls, _x)
978-
@assert _x isa Expr
979-
x = _x
980-
end
974+
# x isa Symbol && return x
975+
if x.head === :$
976+
_x = only(x.args)
977+
_x isa Symbol && return instruction!(ls, _x)
978+
@assert _x isa Expr
979+
x = _x
980+
end
981+
if x.head ≢ :(->)
981982
instr = last(x.args).value
982-
if instr ∉ keys(COST)
983-
instr = gensym!(ls, "f")
984-
pushpreamble!(ls, Expr(:(=), instr, x))
985-
Instruction(Symbol(""), instr)
986-
else
987-
Instruction(:LoopVectorization, instr)
988-
end
983+
instr ∈ keys(COST) && return Instruction(:LoopVectorization, instr)
984+
end
985+
instr = gensym!(ls, "f")
986+
pushpreamble!(ls, Expr(:(=), instr, x))
987+
Instruction(Symbol(""), instr)
989988
end
990989
instruction!(ls::LoopSet, x::Symbol) = instruction(x)
991990
function instruction!(ls::LoopSet, f::F) where {F <: Function}

test/miscellaneous.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -735,7 +735,7 @@ function findreducedparentfornonvecstoreavx!(U::AbstractMatrix{T}, E1::AbstractV
735735
U[i,j] = t
736736
_s += a * (1 - t^2)
737737
end
738-
E1[j] = _s / n
738+
E1[j] = (x -> x / n)(_s)
739739
end
740740
U,E1
741741
end

test/runtests.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ const START_TIME = time()
1414
@time @testset "LoopVectorization.jl" begin
1515

1616
@time if LOOPVECTORIZATION_TEST == "all" || LOOPVECTORIZATION_TEST == "part1"
17-
@time Aqua.test_all(LoopVectorization)
17+
@time Aqua.test_all(LoopVectorization, ambiguities = VERSION ≥ v"1.6")
1818
# @test isempty(detect_unbound_args(LoopVectorization))
1919

2020
@time include("printmethods.jl")

0 commit comments

Comments
 (0)