@@ -67,7 +67,6 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
67
67
srt, sl, srp = opisvectorized ? vector_cost (instr, Wshift, size_T) : scalar_cost (instr)
68
68
if accesses_memory (op)
69
69
# either vbroadcast/reductionstore, vmov(a/u)pd, or gather/scatter
70
- # @show instr, vectorized, loopdependencies(op), unitstride(op, vectorized)
71
70
if opisvectorized
72
71
if ! unitstride (ls, op, vectorized)# || !isdense(op) # need gather/scatter
73
72
r = (1 << Wshift)
@@ -131,12 +130,11 @@ function evaluate_cost_unroll(
131
130
rd = reduceddependencies (op)
132
131
hasintersection (rd, @view (nested_loop_syms[1 : end - length (rd)])) && return Inf
133
132
included_vars[id] = true
134
- # @show op first(cost(op, vectorized, Wshift, size_T)), iter
135
133
total_cost += iter * first (cost (ls, op, vectorized, Wshift, size_T))
136
134
total_cost > max_cost && return total_cost # abort if more expensive; we only want to know the cheapest
137
135
end
138
136
end
139
- total_cost + stride_penalty (ls, order)
137
+ total_cost + stride_penalty (ls, order) - 1.0 # -1.0 to place finger on scale in its favor
140
138
end
141
139
142
140
# only covers vectorized ops; everything else considered lifted?
@@ -163,13 +161,16 @@ function parentsnotreduction(op::Operation)
163
161
end
164
162
return true
165
163
end
166
- function roundpow2 (i:: Integer )
167
- u = VectorizationBase. nextpow2 (i)
168
- l = u >>> 1
169
- ud = u - i
170
- ld = i - l
171
- ud > ld ? l : u
172
- end
164
+ # function roundpow2(i::Integer)
165
+ # u = VectorizationBase.nextpow2(i)
166
+ # l = u >>> 1
167
+ # ud = u - i
168
+ # ld = i - l
169
+ # ud > ld ? l : u
170
+ # end
171
+ # function roundpow2(x::Float64)
172
+ # 1 << round(Int, log2(x))
173
+ # end
173
174
function unroll_no_reductions (ls, order, vectorized)
174
175
size_T = biggest_type_size (ls)
175
176
W, Wshift = VectorizationBase. pick_vector_width_shift (length (ls, vectorized), size_T):: Tuple{Int,Int}
@@ -190,10 +191,10 @@ function unroll_no_reductions(ls, order, vectorized)
190
191
end
191
192
end
192
193
# heuristic guess
193
- # @show compute_rt, load_rt
194
194
# roundpow2(min(4, round(Int, (compute_rt + load_rt + 1) / compute_rt)))
195
195
rt = max (compute_rt, load_rt)
196
- (iszero (rt) ? 4 : max (1 , roundpow2 ( min ( 4 , round (Int, 16 / rt) ) ))), unrolled
196
+ # (iszero(rt) ? 4 : max(1, roundpow2( min( 4, round(Int, 16 / rt) ) ))), unrolled
197
+ (iszero (rt) ? 4 : max (1 , VectorizationBase. nextpow2 ( min ( 4 , round (Int, 16 / rt) ) ))), unrolled
197
198
end
198
199
function determine_unroll_factor (
199
200
ls:: LoopSet , order:: Vector{Symbol} , unrolled:: Symbol , vectorized:: Symbol
@@ -204,17 +205,24 @@ function determine_unroll_factor(
204
205
# So if num_reductions > 0, we set the unroll factor to be high enough so that the CPU can be kept busy
205
206
# if there are, U = max(1, round(Int, max(latency) * throughput / num_reductions)) = max(1, round(Int, latency / (recip_throughput * num_reductions)))
206
207
# We also make sure register pressure is not too high.
207
- latency = 0
208
+ latency = 1
209
+ # compute_recip_throughput_u = 0.0
208
210
compute_recip_throughput = 0.0
209
211
visited_nodes = fill (false , length (operations (ls)))
210
212
load_recip_throughput = 0.0
211
213
store_recip_throughput = 0.0
212
214
for op ∈ operations (ls)
213
- dependson (op, unrolled) || continue
215
+ # dependson(op, unrolled) || continue
214
216
if isreduction (op)
215
217
rt, sl = depchain_cost! (ls, visited_nodes, op, vectorized, Wshift, size_T)
216
- latency = max (sl, latency)
218
+ if isouterreduction (op) != - 1 || unrolled ∉ reduceddependencies (op)
219
+ latency = max (sl, latency)
220
+ end
221
+ # if unrolled ∈ loopdependencies(op)
222
+ # compute_recip_throughput_u += rt
223
+ # else
217
224
compute_recip_throughput += rt
225
+ # end
218
226
elseif isload (op)
219
227
load_recip_throughput += first (cost (ls, op, vectorized, Wshift, size_T))
220
228
elseif isstore (op)
@@ -247,19 +255,20 @@ function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vectorized:
247
255
# if more than 1 loop, there is some cost. Picking 2 here as a heuristic.
248
256
return unroll_no_reductions (ls, order, vectorized)
249
257
end
250
-
258
+ innermost_loop = last (order)
251
259
rt = Inf ; rtcomp = Inf ; latency = Inf ; best_unrolled = Symbol (" " )
252
260
for unrolled ∈ order
253
261
rttemp, ltemp = determine_unroll_factor (ls, order, unrolled, vectorized)
254
- rtcomptemp = rttemp + (0.01 * (vectorized === unrolled))
262
+ rtcomptemp = rttemp + (0.01 * (( vectorized === unrolled) + (unrolled === innermost_loop) - latency ))
255
263
if rtcomptemp < rtcomp
256
264
rt = rttemp
257
265
rtcomp = rtcomptemp
258
266
latency = ltemp
259
267
best_unrolled = unrolled
260
268
end
261
269
end
262
- min (8 , roundpow2 (max (1 , round (Int, latency / (rt * num_reductions) ) ))), best_unrolled
270
+ # min(8, roundpow2(max(1, round(Int, latency / (rt * num_reductions) ) ))), best_unrolled
271
+ min (8 , VectorizationBase. nextpow2 (max (1 , round (Int, latency / (rt * num_reductions) ) ))), best_unrolled
263
272
end
264
273
265
274
function unroll_cost (X, u₁, u₂, u₁L, u₂L)
273
282
# u₁b = 1; u₂b = 1
274
283
# for u₁ ∈ 1:4, u₂ ∈ 1:4
275
284
# c = unroll_cost(X, u₁, u₂, u₁L, u₂L)
276
- # @show u₁, u₂, c
277
285
# if cb > c
278
286
# cb = c
279
287
# u₁b = u₁; u₂b = u₂
@@ -679,7 +687,6 @@ function evaluate_cost_tile(
679
687
# cost_mat[2] / ( u₂loopsym)
680
688
# cost_mat[3] / ( unrolled)
681
689
# cost_mat[4]
682
- # @show order
683
690
cost_vec = cost_vec_buf (ls)
684
691
reg_pressure = reg_pres_buf (ls)
685
692
# @inbounds reg_pressure[2] = 1
@@ -708,8 +715,6 @@ function evaluate_cost_tile(
708
715
included_vars[id] && continue
709
716
# it must also be a subset of defined symbols
710
717
all (ld -> ld ∈ nested_loop_syms, loopdependencies (op)) || continue
711
- # # @show nested_loop_syms
712
- # # @show reduceddependencies(op)
713
718
rd = reduceddependencies (op)
714
719
hasintersection (rd, @view (nested_loop_syms[1 : end - length (rd)])) && return 0 ,0 ,Inf ,false
715
720
included_vars[id] = true
@@ -720,7 +725,6 @@ function evaluate_cost_tile(
720
725
# reduced_by_unrolling[2,id] = (u₂reached | depends_on_u₁) & !depends_on_u₂
721
726
reduced_by_unrolling[1 ,id] = (u₁reached) & ! depends_on_u₁
722
727
reduced_by_unrolling[2 ,id] = (u₂reached) & ! depends_on_u₂
723
- # @show op iter, unrolledu₂loopsym[:,id]
724
728
iters[id] = iter
725
729
innerloop ∈ loopdependencies (op) && set_upstream_family! (descendentsininnerloop, op, true )
726
730
end
@@ -730,7 +734,6 @@ function evaluate_cost_tile(
730
734
opisininnerloop = descendentsininnerloop[id]
731
735
732
736
u₁reduces, u₂reduces = reduced_by_unrolling[1 ,id], reduced_by_unrolling[2 ,id]
733
- # @show op, u₁reduces, u₂reduces
734
737
if isload (op)
735
738
if add_constant_offset_load_elmination_cost! (cost_vec, reg_pressure, choose_to_inline, ls, op, iters[id], unrollsyms, u₁reduces, u₂reduces, Wshift, size_T, opisininnerloop)
736
739
continue
@@ -743,34 +746,26 @@ function evaluate_cost_tile(
743
746
rt += 0.5 VectorizationBase. REGISTER_SIZE / VectorizationBase. CACHELINE_SIZE
744
747
prefetch_good_idea = true
745
748
end
746
- # @show isunrolled₁, isunrolled₂, op rt, lat, rp
747
749
rp = (opisininnerloop && ! (loadintostore (ls, op))) ? rp : zero (rp) # we only care about register pressure within the inner most loop
748
750
# rp = opisininnerloop ? rp : zero(rp) # we only care about register pressure within the inner most loop
749
751
rto = rt
750
752
rt *= iters[id]
751
753
if u₁reduces & u₂reduces
752
- # @show op 4, rto, iters[id], lat, rp
753
754
cost_vec[4 ] += rt
754
755
reg_pressure[4 ] += rp
755
756
elseif u₂reduces # cost decreased by unrolling u₂loop
756
- # @show op 2, rto, iters[id], lat, rp
757
757
cost_vec[2 ] += rt
758
758
reg_pressure[2 ] += rp
759
759
elseif u₁reduces # cost decreased by unrolling u₁loop
760
- # @show op 3, rto, iters[id], lat, rp
761
760
cost_vec[3 ] += rt
762
761
reg_pressure[3 ] += rp
763
762
else # no cost decrease; cost must be repeated
764
- # @show op 1, rto, iters[id], lat, rp
765
763
cost_vec[1 ] += rt
766
764
reg_pressure[1 ] += rp
767
765
end
768
766
end
769
767
# @inbounds ((cost_vec[4] > 0) || ((cost_vec[2] > 0) & (cost_vec[3] > 0))) || return 0,0,Inf,false
770
- # @show cost_vec reg_pressure
771
768
costpenalty = (sum (reg_pressure) > REGISTER_COUNT) ? 2 : 1
772
- # @show order, vectorized cost_vec reg_pressure
773
- # @show solve_unroll(ls, u₁loopsym, u₂loopsym, cost_vec, reg_pressure)
774
769
u₁v = vectorized === u₁loopsym; u₂v = vectorized === u₂loopsym
775
770
round_uᵢ = prefetch_good_idea ? (u₁v ? 1 : (u₂v ? 2 : 0 )) : 0
776
771
u₁, u₂, ucost = solve_unroll (ls, u₁loopsym, u₂loopsym, cost_vec, reg_pressure, W, vectorized, round_uᵢ)
820
815
# that I could come up with.
821
816
function Base. iterate (lo:: LoopOrders , state)
822
817
advance_state! (state) || return nothing
823
- # # @show state
824
818
syms = copyto! (lo. buff, lo. syms)
825
819
for i ∈ eachindex (state)
826
820
sᵢ = state[i]
0 commit comments