@@ -428,7 +428,7 @@ function solve_unroll_iter(X, R, u₁L, u₂L, u₁range, u₂range)
428
428
u₁best, u₂best, bestcost
429
429
end
430
430
431
- function solve_unroll_lagrange (X, R, u₁L, u₂L, u₁step:: Int , u₂step:: Int , atleast32registers :: Bool )
431
+ function solve_unroll_lagrange (X, R, u₁L, u₂L, u₁step:: Int , u₂step:: Int , atleast31registers :: Bool )
432
432
X₁, X₂, X₃, X₄ = X[1 ], X[2 ], X[3 ], X[4 ]
433
433
# If we don't have opmask registers, masks probably occupy a vector register (e.g., on CPUs with AVX but not AVX512)
434
434
R₁, R₂, R₃, R₄ = R[1 ], R[2 ], R[3 ], R[4 ]
@@ -443,8 +443,8 @@ function solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step::Int, u₂step::Int,
443
443
u₂float = (RR - u₁float* R₂)/ (u₁float* R₁)
444
444
if ! (isfinite (u₂float) & isfinite (u₁float)) # brute force
445
445
u₁low = u₂low = 1
446
- u₁high = iszero (X₂) ? 2 : (atleast32registers ? 8 : 6 )
447
- u₂high = iszero (X₃) ? 2 : (atleast32registers ? 8 : 6 )
446
+ u₁high = iszero (X₂) ? 2 : (atleast31registers ? 8 : 6 )
447
+ u₂high = iszero (X₃) ? 2 : (atleast31registers ? 8 : 6 )
448
448
return solve_unroll_iter (X, R, u₁L, u₂L, u₁low: u₁step: u₁high, u₂low: u₂step: u₂high)
449
449
end
450
450
u₁low = floor (Int, u₁float)
@@ -457,7 +457,7 @@ function solve_unroll_lagrange(X, R, u₁L, u₂L, u₁step::Int, u₂step::Int,
457
457
if u₂low ≥ u₂high
458
458
u₂low = solve_unroll_constU (R, u₁high)
459
459
end
460
- maxunroll = atleast32registers ? (((X₂ > 0 ) & (X₃ > 0 )) ? 10 : 8 ) : 6
460
+ maxunroll = atleast31registers ? (((X₂ > 0 ) & (X₃ > 0 )) ? 10 : 8 ) : 6
461
461
u₁low = (clamp (u₁low, 1 , maxunroll) ÷ u₁step) * u₁step
462
462
u₂low = (clamp (u₂low, 1 , maxunroll) ÷ u₂step) * u₂step
463
463
u₁high = clamp (u₁high, 1 , maxunroll)
482
482
# floor(Int, (dynamic_register_count() - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
483
483
# end
484
484
# Tiling here is about alleviating register pressure for the UxT
485
- function solve_unroll (X, R, u₁max, u₂max, u₁L, u₂L, u₁step, u₂step, atleast32registers :: Bool )
485
+ function solve_unroll (X, R, u₁max, u₂max, u₁L, u₂L, u₁step, u₂step, atleast31registers :: Bool )
486
486
# iszero(first(R)) && return -1,-1,Inf #solve_smalltilesize(X, R, u₁max, u₂max)
487
- u₁, u₂, cost = solve_unroll_lagrange (X, R, u₁L, u₂L, u₁step, u₂step, atleast32registers )
487
+ u₁, u₂, cost = solve_unroll_lagrange (X, R, u₁L, u₂L, u₁step, u₂step, atleast31registers )
488
488
# u₂ -= u₂ & 1
489
489
# u₁ = min(u₁, u₂)
490
490
u₁_too_large = u₁ > u₁max
@@ -539,7 +539,7 @@ function solve_unroll(
539
539
u₁loop = getloop (ls, u₁loopsym)
540
540
u₂loop = getloop (ls, u₂loopsym)
541
541
solve_unroll (
542
- u₁loopsym, u₂loopsym, cost_vec, reg_pressure, W, vloopsym, u₁loop, u₂loop, u₁step, u₂step, reg_count (ls) ≥ 32
542
+ u₁loopsym, u₂loopsym, cost_vec, reg_pressure, W, vloopsym, u₁loop, u₂loop, u₁step, u₂step, reg_count (ls) ≥ 31
543
543
)
544
544
end
545
545
@@ -550,9 +550,9 @@ function solve_unroll(
550
550
W:: Int , vloopsym:: Symbol ,
551
551
u₁loop:: Loop , u₂loop:: Loop ,
552
552
u₁step:: Int , u₂step:: Int ,
553
- atleast32registers :: Bool
553
+ atleast31registers :: Bool
554
554
)
555
- maxu₂base = maxu₁base = atleast32registers ? 10 : 6 # 8
555
+ maxu₂base = maxu₁base = atleast31registers ? 10 : 6 # 8
556
556
maxu₂ = maxu₂base# 8
557
557
maxu₁ = maxu₁base# 8
558
558
u₁L = length (u₁loop)
@@ -593,7 +593,7 @@ function solve_unroll(
593
593
else
594
594
u₂Lf = Float64 (u₂L)
595
595
end
596
- u₁, u₂, cost = solve_unroll (cost_vec, reg_pressure, maxu₁, maxu₂, u₁Lf, u₂Lf, u₁step, u₂step, atleast32registers )
596
+ u₁, u₂, cost = solve_unroll (cost_vec, reg_pressure, maxu₁, maxu₂, u₁Lf, u₂Lf, u₁step, u₂step, atleast31registers )
597
597
# heuristic to more evenly divide small numbers of iterations
598
598
if isstaticloop (u₂loop)
599
599
u₂ = maybedemotesize (u₂, length (u₂loop), u₁, u₁loop, maxu₂base)
0 commit comments