Skip to content

Commit 500ae18

Browse files
committed
Cost modeling tweaks to make unrolling less aggressive. Bump version
1 parent a97368d commit 500ae18

File tree

3 files changed

+42
-56
lines changed

3 files changed

+42
-56
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.12.37"
4+
version = "0.12.38"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"

src/broadcast.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -464,7 +464,8 @@ end
464464
@inline function vmaterialize(
465465
bc::Broadcasted, ::Val{Mod}, ::Val{UNROLL}
466466
) where {Mod,UNROLL}
467-
ElType = Base.Broadcast.combine_eltypes(bc.f, bc.args)
467+
ElType = Base.Broadcast.combine_eltypes(bc.f, bc.args)
468+
@show ElType
468469
dest = similar(bc, ElType)
469470
vmaterialize!(dest, bc, Val{Mod}(), Val{UNROLL}())
470471
end

src/modeling/determinestrategy.jl

Lines changed: 39 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,4 @@
11

2-
3-
# function indexappearences(op::Operation, s::Symbol)
4-
# s ∉ loopdependencies(op) && return 0
5-
# appearences = 0
6-
# if isloopvalue(op)
7-
# return s === first(loopdependencies(op)) ? 1 : 0
8-
# elseif isload(op)
9-
# return 100
10-
# end
11-
# newapp = 0
12-
# for opp ∈ parents(op)
13-
# newapp += indexappearences(opp, s)
14-
# end
15-
# factor = instruction(op).instr ∈ (:+, :vadd, :add_fast, :vadd_fast) ? 1 : 10
16-
# newapp * factor
17-
# end
182
function check_linear_parents(ls::LoopSet, op::Operation, s::Symbol)
193
(s ∈ loopdependencies(op)) || return true
204
if isload(op) # TODO: handle loading from ranges.
@@ -155,44 +139,44 @@ end
155139
function evaluate_cost_unroll(
156140
ls::LoopSet, order::Vector{Symbol}, vloopsym::Symbol, max_cost = typemax(Float64)
157141
)
158-
included_vars = fill!(resize!(ls.included_vars, length(operations(ls))), false)
159-
nested_loop_syms = Symbol[]#Set{Symbol}()
160-
total_cost = 0.0
161-
iter = 1.0
162-
size_T = biggest_type_size(ls)
163-
W, Wshift = lsvecwidthshift(ls, vloopsym, size_T)
164-
# Need to check if fusion is possible
165-
for itersym ∈ order
166-
cacheunrolled!(ls, itersym, Symbol(""), vloopsym)
167-
# Add to set of defined symbles
168-
push!(nested_loop_syms, itersym)
169-
looplength = length(ls, itersym)
170-
liter = itersym === vloopsym ? num_iterations(looplength, W) : looplength
171-
iter *= liter
172-
# check which vars we can define at this level of loop nest
173-
for (id,op) ∈ enumerate(operations(ls))
174-
# won't define if already defined...
175-
# id = identifier(op)
176-
included_vars[id] && continue
177-
# it must also be a subset of defined symbols
178-
loopdependencies(op) ⊆ nested_loop_syms || continue
179-
# hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf
180-
rd = reduceddependencies(op)
181-
hasintersection(rd, @view(nested_loop_syms[1:end-length(rd)])) && return Inf
182-
if isstore(op) #TODO: DRY (this is repeated in evaluate_cost_tile)
183-
loadstoredeps = store_load_deps(op)
184-
if !(loadstoredeps === nothing)
185-
any(s -> (s loadstoredeps), nested_loop_syms) && return Inf
186-
end
187-
end
188-
included_vars[id] = true
189-
# TODO: use actual unrolls here?
190-
c = first(cost(ls, op, (Symbol(""),Symbol("")), vloopsym, Wshift, size_T))
191-
total_cost += iter * c
192-
total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
142+
included_vars = fill!(resize!(ls.included_vars, length(operations(ls))), false)
143+
nested_loop_syms = Symbol[]#Set{Symbol}()
144+
total_cost = 0.0
145+
iter = 1.0
146+
size_T = biggest_type_size(ls)
147+
W, Wshift = lsvecwidthshift(ls, vloopsym, size_T)
148+
# Need to check if fusion is possible
149+
for itersym ∈ order
150+
cacheunrolled!(ls, itersym, Symbol(""), vloopsym)
151+
# Add to set of defined symbols
152+
push!(nested_loop_syms, itersym)
153+
looplength = length(ls, itersym)
154+
liter = itersym === vloopsym ? num_iterations(looplength, W) : looplength
155+
iter *= liter
156+
# check which vars we can define at this level of loop nest
157+
for (id,op) ∈ enumerate(operations(ls))
158+
# won't define if already defined...
159+
# id = identifier(op)
160+
included_vars[id] && continue
161+
# it must also be a subset of defined symbols
162+
loopdependencies(op) ⊆ nested_loop_syms || continue
163+
# hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf
164+
rd = reduceddependencies(op)
165+
hasintersection(rd, @view(nested_loop_syms[1:end-length(rd)])) && return Inf
166+
if isstore(op) #TODO: DRY (this is repeated in evaluate_cost_tile)
167+
loadstoredeps = store_load_deps(op)
168+
if !(loadstoredeps === nothing)
169+
any(s -> (s loadstoredeps), nested_loop_syms) && return Inf
193170
end
171+
end
172+
included_vars[id] = true
173+
# TODO: use actual unrolls here?
174+
c = first(cost(ls, op, (Symbol(""),Symbol("")), vloopsym, Wshift, size_T))
175+
total_cost += iter * c
176+
0.9total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
194177
end
195-
0.999total_cost + stride_penalty(ls, order) # 0.999 to place finger on scale in its favor
178+
end
179+
0.9total_cost + stride_penalty(ls, order) # 0.9 to place finger on scale in its favor
196180
end
197181

198182
# only covers vectorized ops; everything else considered lifted?
@@ -275,7 +259,7 @@ function unroll_no_reductions(ls, order, vloopsym)
275259
elseif iszero(load_rt)
276260
iszero(store_rt) ? 4 : max(1, min(4, round(Int, 2compute_rt / store_rt)))
277261
else
278-
max(1, min(4, round(Int, 2compute_rt / load_rt)))
262+
max(1, min(4, round(Int, 1.75compute_rt / load_rt)))
279263
end
280264
# u = min(u, max(1, (reg_count(ls) ÷ max(1,round(Int,rp)))))
281265
# commented out here is to decide to align loops
@@ -1062,7 +1046,8 @@ function evaluate_cost_tile!(
10621046
# println("constoffelim")
10631047
continue
10641048
elseif load_elimination_cost_factor!(cost_vec, reg_pressure, choose_to_inline, ls, op, iters[id], unrollsyms, Wshift, size_T)
1065-
# println("loadelim")
1049+
# println("loadelim")
1050+
# A[i,j-1], A[i,j]
10661051
continue
10671052
end
10681053
#elseif isconstant(op)

0 commit comments

Comments
 (0)