@@ -44,7 +44,6 @@ function unitstride(ls::LoopSet, op::Operation, s::Symbol)
44
44
li = op. ref. loopedindex
45
45
# The first index is allowed to be indexed by `s`
46
46
fi = first (inds)
47
- # @show (fi === DISCONTIGUOUS), (fi === CONSTANTZEROINDEX), (first(getstrides(op)) ≠ 1), unitstep(getloop(ls,s))
48
47
if ((fi === DISCONTIGUOUS) | (fi === CONSTANTZEROINDEX)) || (first (getstrides (op)) ≠ 1 ) || ! unitstep (getloop (ls,s))
49
48
return false
50
49
# elseif !first(li)
@@ -77,27 +76,25 @@ function cost(ls::LoopSet, op::Operation, (u₁,u₂)::Tuple{Symbol,Symbol}, vlo
77
76
end
78
77
elseif iscompute (op) &&
79
78
Base. sym_in (instruction (op). instr, (:(+ ), :(- ), :add_fast , :sub_fast )) &&
80
- all (opp -> (isloopvalue (opp) | isconstant (opp)), parents (op))
79
+ all (opp -> (isloopvalue (opp)), parents (op))
80
+ # all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
81
81
return 0.0 , 0 , 0.0
82
82
end
83
83
opisvectorized = isvectorized (op)
84
84
srt, sl, srp = opisvectorized ? vector_cost (instr, Wshift, size_T) : scalar_cost (instr)
85
85
if accesses_memory (op)
86
86
# either vbroadcast/reductionstore, vmov(a/u)pd, or gather/scatter
87
87
if opisvectorized
88
- # @show unitstride(ls,op,vloopsym), srt,sl,srp
89
88
if ! unitstride (ls, op, vloopsym)# || !isdense(op) # need gather/scatter
90
89
indices = getindices (op)
91
90
contigind = first (indices)
92
- # @show rejectinterleave(op) op
93
91
shifter = max (2 ,Wshift)
94
92
if rejectinterleave (op)
95
93
offset = 0.0 # gather/scatter, alignment doesn't matter
96
94
else
97
95
shifter -= 1
98
96
offset = 0.5 reg_size (ls) / cache_lnsze (ls)
99
97
end
100
- # @show shifter,offset, Wshift
101
98
if shifter > 1 &&
102
99
(! rejectcurly (op) && (((contigind === CONSTANTZEROINDEX) && ((length (indices) > 1 ) && (indices[2 ] === u₁) || (indices[2 ] === u₂))) ||
103
100
((u₁ === contigind) | (u₂ === contigind))))
@@ -118,7 +115,6 @@ function cost(ls::LoopSet, op::Operation, (u₁,u₂)::Tuple{Symbol,Symbol}, vlo
118
115
# this feature is common to all of them.
119
116
srt += 0.5 reg_size (ls) / cache_lnsze (ls)
120
117
end
121
- # @show srt,sl,srp
122
118
elseif isstore (op) # broadcast or reductionstore; if store we want to penalize reduction
123
119
srt *= 3
124
120
sl *= 3
@@ -184,7 +180,7 @@ function evaluate_cost_unroll(
184
180
included_vars[id] && continue
185
181
# it must also be a subset of defined symbols
186
182
loopdependencies (op) ⊆ nested_loop_syms || continue
187
- # hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf
183
+ # hasintersection(reduceddependencies(op), nested_loop_syms) && return Inf
188
184
rd = reduceddependencies (op)
189
185
hasintersection (rd, @view (nested_loop_syms[1 : end - length (rd)])) && return Inf
190
186
if isstore (op) # TODO : DRY (this is repeated in evaluate_cost_tile)
@@ -194,7 +190,6 @@ function evaluate_cost_unroll(
194
190
end
195
191
end
196
192
included_vars[id] = true
197
- # @show op, cost(ls, op, vloopsym, Wshift, size_T)
198
193
# TODO : use actual unrolls here?
199
194
c = first (cost (ls, op, (Symbol (" " ),Symbol (" " )), vloopsym, Wshift, size_T))
200
195
total_cost += iter * c
@@ -213,16 +208,13 @@ function depchain_cost!(
213
208
for opp ∈ parents (op)
214
209
skip[identifier (opp)] && continue
215
210
rt, sl = depchain_cost! (ls, skip, opp, unrolled, vloopsym, Wshift, size_T, rt, sl)
216
- # @show rt,sl, opp
217
211
end
218
212
# Basically assuming memory and compute don't conflict, but everything else does
219
213
# Ie, ignoring the fact that integer and floating point operations likely don't either
220
214
if iscompute (op)
221
215
rtᵢ, slᵢ = cost (ls, op, (unrolled,Symbol (" " )), vloopsym, Wshift, size_T)
222
- # @show rtᵢ, slᵢ, op
223
216
rt += rtᵢ; sl += slᵢ
224
217
end
225
- # @show rt, sl
226
218
rt, sl
227
219
end
228
220
function parentsnotreduction (op:: Operation )
@@ -280,7 +272,6 @@ function unroll_no_reductions(ls, order, vloopsym)
280
272
max (1 , min (4 , round (Int, 2 compute_rt / load_rt)))
281
273
end
282
274
# u = min(u, max(1, (reg_count(ls) ÷ max(1,round(Int,rp)))))
283
- # @show u
284
275
# commented out here is to decide to align loops
285
276
# if memory_rt > compute_rt && isone(u) && (length(order) > 1) && (last(order) === vloopsym) && length(getloop(ls, last(order))) > 8W
286
277
# ls.align_loops[] = findfirst(operations(ls)) do op
@@ -333,7 +324,6 @@ function determine_unroll_factor(
333
324
load_recip_throughput,
334
325
store_recip_throughput
335
326
)
336
- # @show recip_throughput, latency
337
327
recip_throughput, latency
338
328
end
339
329
function count_reductions (ls:: LoopSet )
@@ -393,9 +383,9 @@ function determine_unroll_factor(ls::LoopSet, order::Vector{Symbol}, vloopsym::S
393
383
end
394
384
end
395
385
# min(8, roundpow2(max(1, round(Int, latency / (rt * num_reductions) ) ))), best_unrolled
396
- UF = min ( 8 , VectorizationBase. nextpow2 (max ( 1 , round (Int, latency / (rt * num_reductions) ) )))
386
+ UF = VectorizationBase. nextpow2 (round (Int, clamp ( latency / (rt * num_reductions), 1.0 , 8.0 )))
397
387
if UF == 1 && num_reductions > 1
398
- UF = min ( 8 , VectorizationBase. nextpow2 (max ( 1 , round (Int, latency / (rt * cld (num_reductions, 2 )) ) )))
388
+ UF = VectorizationBase. nextpow2 (round (Int, clamp ( latency / (rt * cld (num_reductions, 2 )), 1.0 , 8.0 )))
399
389
end
400
390
if best_unrolled === vloopsym
401
391
UF = demote_unroll_factor (ls, UF, vloopsym)
406
396
"""
    unroll_cost(X, u₁, u₂, u₁L, u₂L)

Evaluate the four-term cost model `X` for the candidate unroll factors `u₁` and `u₂`.

Each loop contributes a scale factor `num_iterations(uᵢL, uᵢ) / uᵢL`. The result is

    X[1] + X[2] * f₂ + X[3] * f₁ + X[4] * f₁ * f₂

where `f₁` and `f₂` are the scale factors for the `u₁` and `u₂` loops respectively.

# Arguments
- `X`: indexable collection; entries `1` through `4` are read.
- `u₁`, `u₂`: candidate unroll factors, forwarded to `num_iterations`.
- `u₁L`, `u₂L`: per-loop values used both as the first argument of `num_iterations`
  and as the divisor (presumably the loop lengths; confirm against callers).
"""
function unroll_cost(X, u₁, u₂, u₁L, u₂L)
    # The u₂ factor is computed first, matching the original evaluation order.
    scale₂ = num_iterations(u₂L, u₂) / u₂L
    scale₁ = num_iterations(u₁L, u₁) / u₁L
    # X[1] is unscaled, X[2] scales with u₂ only, X[3] with u₁ only, X[4] with both.
    return X[1] + X[2] * scale₂ + X[3] * scale₁ + X[4] * scale₁ * scale₂
end
@@ -433,8 +421,6 @@ function solve_unroll_iter(X, R, u₁L, u₂L, u₁range, u₂range)
433
421
for u₂temp ∈ u₂range
434
422
RR ≥ u₁temp* u₂temp* R₁ + u₁temp* R₂ + u₂temp* R₃ || continue
435
423
tempcost = unroll_cost (X, u₁temp, u₂temp, u₁L, u₂L)
436
- # @show u₁temp, u₂temp, tempcost
437
- # @show u₁temp*u₂temp*R₁ + u₁temp*R₂ + u₂temp*R₃
438
424
if tempcost ≤ bestcost
439
425
bestcost = tempcost
440
426
u₁best, u₂best = u₁temp, u₂temp
@@ -455,10 +441,8 @@ function solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step::Int, u₂step::Int,
455
441
c = X₃* RR^ 2
456
442
discriminant = b^ 2 - 4 a* c
457
443
discriminant < 0 && return - 1 ,- 1 ,Inf
458
- # @show R₁, R₂, R₃, R₄
459
444
u₁float = max ((sqrt (discriminant) + b) / (- 2 a), float (u₁step)) # must be at least 1
460
445
u₂float = (RR - u₁float* R₂)/ (u₁float* R₁)
461
- # @show u₁float, u₂float
462
446
if ! (isfinite (u₂float) & isfinite (u₁float)) # brute force
463
447
u₁low = u₂low = 1
464
448
u₁high = iszero (X₂) ? 2 : (atleast32registers ? 8 : 6 )
@@ -611,7 +595,6 @@ function solve_unroll(
611
595
else
612
596
u₂Lf = Float64 (u₂L)
613
597
end
614
- # @show u₁Lf, u₂Lf, u₁L, length(u₁loop)
615
598
u₁, u₂, cost = solve_unroll (cost_vec, reg_pressure, maxu₁, maxu₂, u₁Lf, u₂Lf, u₁step, u₂step, atleast32registers)
616
599
# heuristic to more evenly divide small numbers of iterations
617
600
if isstaticloop (u₂loop)
@@ -635,7 +618,6 @@ function loopdepindices(ls::LoopSet, op::Operation)
635
618
isdiscontig = first (loopdeps) === DISCONTIGUOUS
636
619
# isdiscontig = isdiscontiguous(op.ref)
637
620
loopedindex = op. ref. loopedindex
638
- # @show loopdeps
639
621
if ! isdiscontig && all (loopedindex) && ! (any (== (CONSTANTZEROINDEX), loopdeps))
640
622
return loopdeps
641
623
end
@@ -654,7 +636,6 @@ function loopdepindices(ls::LoopSet, op::Operation)
654
636
end
655
637
function stride_penalty (ls:: LoopSet , op:: Operation , order:: Vector{Symbol} , loopfreqs)
656
638
loopdeps = loopdepindices (ls, op)
657
- # @show op loopdeps
658
639
opstrides = Vector {Int} (undef, length (loopdeps))
659
640
# very minor stride assumption here, because we don't really want to base optimization decisions on it...
660
641
opstrides[1 ] = 1.0 + (first (loopdependencies (op. ref)) === DISCONTIGUOUS) + (first (loopdependencies (op. ref)) === CONSTANTZEROINDEX)
@@ -815,10 +796,8 @@ function load_elimination_cost_factor!(
815
796
cost_vec, reg_pressure, choose_to_inline, ls:: LoopSet , op:: Operation , iters, unrollsyms:: UnrollSymbols , Wshift, size_T
816
797
)
817
798
@unpack u₁loopsym, u₂loopsym, vloopsym = unrollsyms
818
- # @show isoptranslation(ls, op, unrollsyms)
819
799
if ! iszero (first (isoptranslation (ls, op, unrollsyms)))
820
800
rt, lat, rp = cost (ls, op, (u₁loopsym, u₂loopsym), vloopsym, Wshift, size_T)
821
- # @show rt
822
801
rto = rt
823
802
rt *= iters
824
803
# rt *= factor1; rp *= factor2;
@@ -1086,7 +1065,6 @@ function evaluate_cost_tile!(
1086
1065
if isstore (op) & (! u₁reducesrt) & (! u₂reducesrt)
1087
1066
irreducible_storecosts += rt
1088
1067
end
1089
- # iiter = convert(Int, iters[id]); @show u₁reducesrt, u₂reducesrt, op, rt, rto, rp, iiter
1090
1068
update_cost_vec! (cost_vec, rt, u₁reducesrt, u₂reducesrt)
1091
1069
update_reg_pres! (reg_pressure, rp, u₁reducesrp, u₂reducesrp)
1092
1070
# update_costs!(reg_pressure, rp, u₁reducesrp, u₂reducesrp)
@@ -1104,7 +1082,6 @@ function evaluate_cost_tile!(
1104
1082
else
1105
1083
0
1106
1084
end
1107
- # @show (irreducible_storecosts / sum(cost_vec))
1108
1085
if (irreducible_storecosts / sum (cost_vec) ≥ 0.5 ) && ! any (op -> loadintostore (ls, op), operations (ls))
1109
1086
u₁, u₂ = if visbit
1110
1087
vecsforbyte = 8 ÷ ls. vector_width
0 commit comments