255
255
# end
256
256
# u₁b, u₂b, cb
257
257
# end
258
+
259
+ function solve_unroll_iter (X, R, u₁L, u₂L, u₁range, u₂range)
260
+ R₁, R₂, R₃, R₄, R₅ = R[1 ], R[2 ], R[3 ], R[4 ], R[5 ]
261
+ RR = REGISTER_COUNT - R₃ - R₄
262
+ u₁best, u₂best = 0 , 0
263
+ bestcost = Inf
264
+ for u₁temp ∈ u₁range
265
+ for u₂temp ∈ u₂range
266
+ RR ≥ u₁temp* u₂temp* R₁ + u₁temp* R₂ + u₂temp* R₅ || continue
267
+ tempcost = unroll_cost (X, u₁temp, u₂temp, u₁L, u₂L)
268
+ if tempcost < bestcost
269
+ bestcost = tempcost
270
+ u₁best, u₂best = u₁temp, u₂temp
271
+ end
272
+ end
273
+ end
274
+ u₁best, u₂best, bestcost
275
+ end
276
+
258
277
function solve_unroll (X, R, u₁L, u₂L)
259
278
X₁, X₂, X₃, X₄ = X[1 ], X[2 ], X[3 ], X[4 ]
260
- R₁, R₂, R₃, R₄ = R[1 ], R[2 ], R[3 ], R[4 ]
279
+ R₁, R₂, R₃, R₄, R₅ = R[1 ], R[2 ], R[3 ], R[4 ], R[5 ]
280
+ iszero (R₅) || return solve_unroll_iter (X, R, u₁L, u₂L, 1 : 10 , 1 : 10 )
261
281
RR = REGISTER_COUNT - R₃ - R₄
262
282
a = R₂^ 2 * X₃ - R₁* X₄ * R₂ - R₁* X₂* RR
263
283
b = R₁ * X₄ * RR - R₁ * X₄ * RR - 2 X₃* RR* R₂
@@ -272,50 +292,24 @@ function solve_unroll(X, R, u₁L, u₂L)
272
292
end
273
293
u₁low = floor (Int, u₁float)
274
294
u₂low = max (1 , floor (Int, u₂float)) # must be at least 1
275
- u₁high = u₁low + 1 # ceil(Int, u₁float)
276
- u₂high = u₂low + 1 # ceil(Int, u₂float)
277
-
278
- # RR = REGISTER_COUNT - R[3] - R[4]
279
- u₁, u₂ = u₁low, u₂low
280
- ucost = unroll_cost (X, u₁low, u₂low, u₁L, u₂L)
281
- # @show u₁low*u₂high*R[1] + u₁low*R[2]
282
- if RR ≥ u₁low* u₂high* R[1 ] + u₁low* R[2 ]
283
- ucost_temp = unroll_cost (X, u₁low, u₂high, u₁L, u₂L)
284
- # @show ucost_temp, ucost
285
- if ucost_temp < ucost
286
- ucost = ucost_temp
287
- u₁, u₂ = u₁low, u₂high
288
- end
289
- end
290
- # The RR + 1 is a hack to get it to favor u₁high in more scenarios
291
- u₂l = u₂low
292
- while RR < u₁high* u₂l* R[1 ] + u₁high* R[2 ] && u₂l > 1
293
- u₂l -= 1
294
- end
295
- ucost_temp = unroll_cost (X, u₁high, u₂l, u₁L, u₂L)
296
- if ucost_temp < ucost
297
- ucost = ucost_temp
298
- u₁, u₂ = u₁high, u₂l
299
- end
300
- if RR > u₁high* u₂high* R[1 ] + u₁high* R[2 ]
301
- throw (" Something went wrong when solving for u₂float and u₁float." )
302
- end
303
- u₁, u₂, ucost
295
+ u₁high = solve_unroll_constT (R, u₂low) + 1
296
+ u₂high = solve_unroll_constU (R, u₁low) + 1
297
+ solve_unroll_iter (X, R, u₁L, u₂L, u₁low: u₁high, u₂low: u₂high)
304
298
end
305
299
306
- function solve_unroll_constU (X, R, U )
307
- floor (Int, (REGISTER_COUNT - R[3 ] - R[4 ] - U * R[2 ]) / (U * R[1 ]))
300
+ function solve_unroll_constU (R :: AbstractVector , u₁ :: Int )
301
+ floor (Int, (REGISTER_COUNT - R[3 ] - R[4 ] - u₁ * R[2 ]) / (u₁ * R[1 ] + R[ 5 ]))
308
302
end
309
- function solve_unroll_constT (X, R , u₂)
310
- floor (Int, (REGISTER_COUNT - R[3 ] - R[4 ]) / (u₂ * R[1 ] + R[2 ]))
303
+ function solve_unroll_constT (R :: AbstractVector , u₂:: Int )
304
+ floor (Int, (REGISTER_COUNT - R[3 ] - R[4 ] - u₂ * R[ 5 ] ) / (u₂ * R[1 ] + R[2 ]))
311
305
end
312
- function solve_unroll_constT (ls, u₂)
306
+ function solve_unroll_constT (ls:: LoopSet , u₂:: Int )
313
307
R = @view ls. reg_pres[:,1 ]
314
- floor (Int, (REGISTER_COUNT - R[3 ] - R[4 ]) / (u₂ * R[1 ] + R[2 ]))
308
+ floor (Int, (REGISTER_COUNT - R[3 ] - R[4 ] - u₂ * R[ 5 ] ) / (u₂ * R[1 ] + R[2 ]))
315
309
end
316
310
# Tiling here is about alleviating register pressure for the UxT
317
311
function solve_unroll (X, R, u₁max, u₂max, u₁L, u₂L)
318
- iszero (first (R)) && return - 1 ,- 1 ,Inf # solve_smalltilesize(X, R, u₁max, u₂max)
312
+ # iszero(first(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, u₁max, u₂max)
319
313
u₁, u₂, cost = solve_unroll (X, R, u₁L, u₂L)
320
314
# u₂ -= u₂ & 1
321
315
# u₁ = min(u₁, u₂)
@@ -327,12 +321,12 @@ function solve_unroll(X, R, u₁max, u₂max, u₁L, u₂L)
327
321
u₂ = u₂max
328
322
else # u₁ too large, resolve u₂
329
323
u₁ = u₁max
330
- u₂ = min (u₂max, max (1 ,solve_unroll_constU (X, R, u₁)))
324
+ u₂ = min (u₂max, max (1 ,solve_unroll_constU (R, u₁)))
331
325
end
332
326
cost = unroll_cost (X, u₁, u₂, u₁L, u₂L)
333
327
elseif u₂_too_large
334
328
u₂ = u₂max
335
- u₁ = min (u₁max, max (1 ,solve_unroll_constT (X, R, u₂)))
329
+ u₁ = min (u₁max, max (1 ,solve_unroll_constT (R, u₂)))
336
330
cost = unroll_cost (X, u₁, u₂, u₁L, u₂L)
337
331
end
338
332
u₁, u₂, cost
@@ -376,14 +370,14 @@ function solve_unroll(
376
370
W:: Int , vectorized:: Symbol ,
377
371
u₁loop:: Loop , u₂loop:: Loop
378
372
)
379
- maxu₂base = maxu₁base = VectorizationBase. REGISTER_COUNT == 32 ? 6 : 4 # 8
373
+ maxu₂base = maxu₁base = VectorizationBase. REGISTER_COUNT == 32 ? 10 : 6 # 8
380
374
maxu₂ = maxu₂base# 8
381
375
maxu₁ = maxu₁base# 8
382
376
u₁L = length (u₁loop)
383
377
u₂L = length (u₂loop)
384
378
if isstaticloop (u₂loop)
385
379
if u₂loopsym != = vectorized && u₂L ≤ 4
386
- u₁ = max (1 , solve_unroll_constT (cost_vec, reg_pressure, u₂L))
380
+ u₁ = max (1 , solve_unroll_constT (reg_pressure, u₂L))
387
381
u₁ = isstaticloop (u₁loop) ? min (u₁, u₁L) : u₁
388
382
return u₁, u₂L, unroll_cost (cost_vec, u₁, u₂L, u₁L, u₂L)
389
383
end
@@ -392,7 +386,7 @@ function solve_unroll(
392
386
end
393
387
if isstaticloop (u₁loop)
394
388
if u₁loopsym != = vectorized && u₁L ≤ 4
395
- u₂ = max (1 , solve_unroll_constU (cost_vec, reg_pressure, u₁L))
389
+ u₂ = max (1 , solve_unroll_constU (reg_pressure, u₁L))
396
390
u₂ = isstaticloop (u₂loop) ? min (u₂, u₂L) : u₂
397
391
return u₁L, u₂, unroll_cost (cost_vec, u₁L, u₂, u₁L, u₂L)
398
392
end
@@ -525,23 +519,41 @@ function maxnegativeoffset(ls::LoopSet, op::Operation, unrollsyms::UnrollSymbols
525
519
end
526
520
mno, i
527
521
end
528
- function load_elimination_cost_factor (ls:: LoopSet , op:: Operation , unrollsyms:: UnrollSymbols )
529
- @unpack u₁loopsym, u₂loopsym = unrollsyms
522
+ function load_elimination_cost_factor! (
523
+ cost_vec, reg_pressure, choose_to_inline, ls:: LoopSet , op:: Operation , iters, unrollsyms:: UnrollSymbols , Wshift, size_T
524
+ )
525
+ @unpack u₁loopsym, u₂loopsym, vectorized = unrollsyms
530
526
if ! iszero (first (isoptranslation (ls, op, unrollsyms)))
531
- for loop ∈ ls. loops
532
- # If another loop is short, assume that LLVM will unroll it, in which case
533
- # we want to be a little more conservative in terms of register pressure.
534
- # FIXME : heuristic hack to get some desired behavior.
535
- if isstaticloop (loop) && length (loop) ≤ 4
536
- itersym = loop. itersymbol
537
- if itersym != = u₁loopsym && itersym != = u₂loopsym
538
- return (0.25 , VectorizationBase. REGISTER_COUNT == 32 ? 2.0 : 1.0 )
539
- end
540
- end
541
- end
542
- (0.25 , VectorizationBase. REGISTER_COUNT == 32 ? 1.2 : 1.0 )
527
+ rt, lat, rp = cost (ls, op, vectorized, Wshift, size_T)
528
+ rt *= iters
529
+ # rt *= factor1; rp *= factor2;
530
+ choose_to_inline[] = true
531
+ # for loop ∈ ls.loops
532
+ # # If another loop is short, assume that LLVM will unroll it, in which case
533
+ # # we want to be a little more conservative in terms of register pressure.
534
+ # #FIXME : heuristic hack to get some desired behavior.
535
+ # if isstaticloop(loop) && length(loop) ≤ 4
536
+ # itersym = loop.itersymbol
537
+ # if itersym !== u₁loopsym && itersym !== u₂loopsym
538
+ # return (0.25, VectorizationBase.REGISTER_COUNT == 32 ? 2.0 : 1.0)
539
+ # # return (0.25, 1.0)
540
+ # return true
541
+ # end
542
+ # end
543
+ # end
544
+ # # (0.25, VectorizationBase.REGISTER_COUNT == 32 ? 1.2 : 1.0)
545
+ # (0.25, 1.0)
546
+ cost_vec[1 ] += 0.1 rt
547
+ reg_pressure[1 ] += 0.51 rp
548
+ cost_vec[2 ] += rt
549
+ reg_pressure[2 ] += rp
550
+ cost_vec[3 ] += rt
551
+ # reg_pressure[3] += rp
552
+ reg_pressure[5 ] += rp
553
+ true
543
554
else
544
555
(1.0 , 1.0 )
556
+ false
545
557
end
546
558
end
547
559
function add_constant_offset_load_elmination_cost! (
@@ -624,6 +636,7 @@ function evaluate_cost_tile(
624
636
iter:: Int = 1
625
637
u₁reached = u₂reached = false
626
638
choose_to_inline = Ref (false )
639
+ copyto! (names (ls), order); reverse! (names (ls))
627
640
for n ∈ 1 : N
628
641
itersym = order[n]
629
642
if itersym == u₁loopsym
@@ -664,30 +677,33 @@ function evaluate_cost_tile(
664
677
665
678
u₁reduces, u₂reduces = reduced_by_unrolling[1 ,id], reduced_by_unrolling[2 ,id]
666
679
# @show op, u₁reduces, u₂reduces
667
- if ! (isload (op) && add_constant_offset_load_elmination_cost! (cost_vec, reg_pressure, choose_to_inline, ls, op, iters[id], unrollsyms, u₁reduces, u₂reduces, Wshift, size_T, opisininnerloop))
668
- rt, lat, rp = cost (ls, op, vectorized, Wshift, size_T)
680
+ if isload (op)
681
+ if add_constant_offset_load_elmination_cost! (cost_vec, reg_pressure, choose_to_inline, ls, op, iters[id], unrollsyms, u₁reduces, u₂reduces, Wshift, size_T, opisininnerloop)
682
+ continue
683
+ elseif load_elimination_cost_factor! (cost_vec, reg_pressure, choose_to_inline, ls, op, iters[id], unrollsyms, Wshift, size_T)
684
+ continue
685
+ end
686
+ end
669
687
# @show op rt, lat, rp
670
- if isload (op)
671
- factor1, factor2 = load_elimination_cost_factor (ls, op, unrollsyms)
672
- rt *= factor1; rp *= factor2;
673
- choose_to_inline[] |= factor1 < 1
674
- end
675
- # @show isunrolled₁, isunrolled₂, op rt, lat, rp
676
- rp = opisininnerloop ? rp : zero (rp) # we only care about register pressure within the inner most loop
677
- rt *= iters[id]
678
- if u₁reduces & u₂reduces
679
- cost_vec[4 ] += rt
680
- reg_pressure[4 ] += rp
681
- elseif u₂reduces # cost decreased by unrolling u₂loop
682
- cost_vec[2 ] += rt
683
- reg_pressure[2 ] += rp
684
- elseif u₁reduces # cost decreased by unrolling u₁loop
685
- cost_vec[3 ] += rt
686
- reg_pressure[3 ] += rp
687
- else # no cost decrease; cost must be repeated
688
- cost_vec[1 ] += rt
689
- reg_pressure[1 ] += rp
690
- end
688
+ rt, lat, rp = cost (ls, op, vectorized, Wshift, size_T)
689
+ if isload (op) && ! iszero (prefetchisagoodidea (ls, op, UnrollArgs (4 , unrollsyms, 4 , 0 )))
690
+ rt += 0.5 VectorizationBase. REGISTER_SIZE / VectorizationBase. CACHELINE_SIZE
691
+ end
692
+ # @show isunrolled₁, isunrolled₂, op rt, lat, rp
693
+ rp = opisininnerloop ? rp : zero (rp) # we only care about register pressure within the inner most loop
694
+ rt *= iters[id]
695
+ if u₁reduces & u₂reduces
696
+ cost_vec[4 ] += rt
697
+ reg_pressure[4 ] += rp
698
+ elseif u₂reduces # cost decreased by unrolling u₂loop
699
+ cost_vec[2 ] += rt
700
+ reg_pressure[2 ] += rp
701
+ elseif u₁reduces # cost decreased by unrolling u₁loop
702
+ cost_vec[3 ] += rt
703
+ reg_pressure[3 ] += rp
704
+ else # no cost decrease; cost must be repeated
705
+ cost_vec[1 ] += rt
706
+ reg_pressure[1 ] += rp
691
707
end
692
708
end
693
709
# @show cost_vec reg_pressure
0 commit comments