|
1 | 1 |
|
2 |
| - |
3 |
| -# function indexappearences(op::Operation, s::Symbol) |
4 |
| -# s ∉ loopdependencies(op) && return 0 |
5 |
| -# appearences = 0 |
6 |
| -# if isloopvalue(op) |
7 |
| -# return s === first(loopdependencies(op)) ? 1 : 0 |
8 |
| -# elseif isload(op) |
9 |
| -# return 100 |
10 |
| -# end |
11 |
| -# newapp = 0 |
12 |
| -# for opp ∈ parents(op) |
13 |
| -# newapp += indexappearences(opp, s) |
14 |
| -# end |
15 |
| -# factor = instruction(op).instr ∈ (:+, :vadd, :add_fast, :vadd_fast) ? 1 : 10 |
16 |
| -# newapp * factor |
17 |
| -# end |
18 | 2 | function check_linear_parents(ls::LoopSet, op::Operation, s::Symbol)
|
19 | 3 | (s ∈ loopdependencies(op)) || return true
|
20 | 4 | if isload(op) # TODO: handle loading from ranges.
|
@@ -155,44 +139,44 @@ end
|
155 | 139 | function evaluate_cost_unroll(
|
156 | 140 | ls::LoopSet, order::Vector{Symbol}, vloopsym::Symbol, max_cost = typemax(Float64)
|
157 | 141 | )
|
158 |
| - included_vars = fill!(resize!(ls.included_vars, length(operations(ls))), false) |
159 |
| - nested_loop_syms = Symbol[]#Set{Symbol}() |
160 |
| - total_cost = 0.0 |
161 |
| - iter = 1.0 |
162 |
| - size_T = biggest_type_size(ls) |
163 |
| - W, Wshift = lsvecwidthshift(ls, vloopsym, size_T) |
164 |
| - # Need to check if fusion is possible |
165 |
| - for itersym ∈ order |
166 |
| - cacheunrolled!(ls, itersym, Symbol(""), vloopsym) |
167 |
| - # Add to set of defined symbles |
168 |
| - push!(nested_loop_syms, itersym) |
169 |
| - looplength = length(ls, itersym) |
170 |
| - liter = itersym === vloopsym ? num_iterations(looplength, W) : looplength |
171 |
| - iter *= liter |
172 |
| - # check which vars we can define at this level of loop nest |
173 |
| - for (id,op) ∈ enumerate(operations(ls)) |
174 |
| - # won't define if already defined... |
175 |
| - # id = identifier(op) |
176 |
| - included_vars[id] && continue |
177 |
| - # it must also be a subset of defined symbols |
178 |
| - loopdependencies(op) ⊆ nested_loop_syms || continue |
179 |
| - # hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf |
180 |
| - rd = reduceddependencies(op) |
181 |
| - hasintersection(rd, @view(nested_loop_syms[1:end-length(rd)])) && return Inf |
182 |
| - if isstore(op) #TODO: DRY (this is repeated in evaluate_cost_tile) |
183 |
| - loadstoredeps = store_load_deps(op) |
184 |
| - if !(loadstoredeps === nothing) |
185 |
| - any(s -> (s ∉ loadstoredeps), nested_loop_syms) && return Inf |
186 |
| - end |
187 |
| - end |
188 |
| - included_vars[id] = true |
189 |
| - # TODO: use actual unrolls here? |
190 |
| - c = first(cost(ls, op, (Symbol(""),Symbol("")), vloopsym, Wshift, size_T)) |
191 |
| - total_cost += iter * c |
192 |
| - total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest |
| 142 | + included_vars = fill!(resize!(ls.included_vars, length(operations(ls))), false) |
| 143 | + nested_loop_syms = Symbol[]#Set{Symbol}() |
| 144 | + total_cost = 0.0 |
| 145 | + iter = 1.0 |
| 146 | + size_T = biggest_type_size(ls) |
| 147 | + W, Wshift = lsvecwidthshift(ls, vloopsym, size_T) |
| 148 | + # Need to check if fusion is possible |
| 149 | + for itersym ∈ order |
| 150 | + cacheunrolled!(ls, itersym, Symbol(""), vloopsym) |
| 151 | + # Add to set of defined symbols |
| 152 | + push!(nested_loop_syms, itersym) |
| 153 | + looplength = length(ls, itersym) |
| 154 | + liter = itersym === vloopsym ? num_iterations(looplength, W) : looplength |
| 155 | + iter *= liter |
| 156 | + # check which vars we can define at this level of loop nest |
| 157 | + for (id,op) ∈ enumerate(operations(ls)) |
| 158 | + # won't define if already defined... |
| 159 | + # id = identifier(op) |
| 160 | + included_vars[id] && continue |
| 161 | + # it must also be a subset of defined symbols |
| 162 | + loopdependencies(op) ⊆ nested_loop_syms || continue |
| 163 | + # hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf |
| 164 | + rd = reduceddependencies(op) |
| 165 | + hasintersection(rd, @view(nested_loop_syms[1:end-length(rd)])) && return Inf |
| 166 | + if isstore(op) #TODO: DRY (this is repeated in evaluate_cost_tile) |
| 167 | + loadstoredeps = store_load_deps(op) |
| 168 | + if !(loadstoredeps === nothing) |
| 169 | + any(s -> (s ∉ loadstoredeps), nested_loop_syms) && return Inf |
193 | 170 | end
|
| 171 | + end |
| 172 | + included_vars[id] = true |
| 173 | + # TODO: use actual unrolls here? |
| 174 | + c = first(cost(ls, op, (Symbol(""),Symbol("")), vloopsym, Wshift, size_T)) |
| 175 | + total_cost += iter * c |
| 176 | + 0.9total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest |
194 | 177 | end
|
195 |
| - 0.999total_cost + stride_penalty(ls, order) # 0.999 to place finger on scale in its favor |
| 178 | + end |
| 179 | + 0.9total_cost + stride_penalty(ls, order) # 0.9 to place finger on scale in its favor |
196 | 180 | end
|
197 | 181 |
|
198 | 182 | # only covers vectorized ops; everything else considered lifted?
|
@@ -275,7 +259,7 @@ function unroll_no_reductions(ls, order, vloopsym)
|
275 | 259 | elseif iszero(load_rt)
|
276 | 260 | iszero(store_rt) ? 4 : max(1, min(4, round(Int, 2compute_rt / store_rt)))
|
277 | 261 | else
|
278 |
| - max(1, min(4, round(Int, 2compute_rt / load_rt))) |
| 262 | + max(1, min(4, round(Int, 1.75compute_rt / load_rt))) |
279 | 263 | end
|
280 | 264 | # u = min(u, max(1, (reg_count(ls) ÷ max(1,round(Int,rp)))))
|
281 | 265 | # commented out here is to decide to align loops
|
@@ -1062,7 +1046,8 @@ function evaluate_cost_tile!(
|
1062 | 1046 | # println("constoffelim")
|
1063 | 1047 | continue
|
1064 | 1048 | elseif load_elimination_cost_factor!(cost_vec, reg_pressure, choose_to_inline, ls, op, iters[id], unrollsyms, Wshift, size_T)
|
1065 |
| - # println("loadelim") |
| 1049 | + # println("loadelim") |
| 1050 | + # A[i,j-1], A[i,j] |
1066 | 1051 | continue
|
1067 | 1052 | end
|
1068 | 1053 | #elseif isconstant(op)
|
|
0 commit comments