11
2- const CACHELINE_SIZE = something (VectorizationBase. L₁CACHE. linesize, 64 )
32
43# function indexappearences(op::Operation, s::Symbol)
54# s ∉ loopdependencies(op) && return 0
@@ -105,7 +104,7 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
105104 # would be nice to add a check for this CPU, to see if such a penalty is still appropriate.
106105 # Also, once more SVE (scalable vector extension) CPUs are released, would be nice to know if
107106 # this feature is common to all of them.
108- srt += 0.5 VectorizationBase. REGISTER_SIZE / CACHELINE_SIZE
107+ srt += 0.5 VectorizationBase. dynamic_register_size () / VectorizationBase . cacheline_size ()
109108 end
110109 elseif isstore (op) # broadcast or reductionstore; if store we want to penalize reduction
111110 srt *= 3
388387
389388function solve_unroll_iter (X, R, u₁L, u₂L, u₁range, u₂range)
390389 R₁, R₂, R₃, R₄, R₅ = R[1 ], R[2 ], R[3 ], R[4 ], R[5 ]
391- RR = REGISTER_COUNT - R₃ - R₄
390+ RR = dynamic_register_count () - R₃ - R₄
392391 u₁best, u₂best = 0 , 0
393392 bestcost = Inf
394393 for u₁temp ∈ u₁range
@@ -408,13 +407,11 @@ end
408407
409408function solve_unroll (X, R, u₁L, u₂L, u₁step, u₂step)
410409 X₁, X₂, X₃, X₄ = X[1 ], X[2 ], X[3 ], X[4 ]
411- # If we don't have AVX512, masks occupy a vector register;
412- # AVX512F is currently defined as `false` for non-x86 CPUs, but
413- # should instead define generic constant `HAS_OPMASK_REGISTERS` in VectorizationBase.jl to use here instead.
414- VectorizationBase. AVX512F || (R[3 ] += 1 )
410+ # If we don't have opmask registers, masks probably occupy a vector register (e.g., on CPUs with AVX but not AVX512)
411+ VectorizationBase. dynamic_has_opmask_registers () || (R[3 ] += 1 )
415412 R₁, R₂, R₃, R₄, R₅ = R[1 ], R[2 ], R[3 ], R[4 ], R[5 ]
416413 iszero (R₅) || return solve_unroll_iter (X, R, u₁L, u₂L, u₁step: u₁step: 10 , u₂step: u₂step: 10 )
417- RR = REGISTER_COUNT - R₃ - R₄
414+ RR = dynamic_register_count () - R₃ - R₄
418415 a = R₂^ 2 * X₃ - R₁* X₄ * R₂ - R₁* X₂* RR
419416 b = R₁ * X₄ * RR - R₁ * X₄ * RR - 2 X₃* RR* R₂
420417 c = X₃* RR^ 2
@@ -424,15 +421,15 @@ function solve_unroll(X, R, u₁L, u₂L, u₁step, u₂step)
424421 u₂float = (RR - u₁float* R₂)/ (u₁float* R₁)
425422 if ! (isfinite (u₂float) & isfinite (u₁float)) # brute force
426423 u₁low = u₂low = 1
427- u₁high = iszero (X₂) ? 2 : (REGISTER_COUNT == 32 ? 8 : 6 )
428- u₂high = iszero (X₃) ? 2 : (REGISTER_COUNT == 32 ? 8 : 6 )
424+ u₁high = iszero (X₂) ? 2 : (dynamic_register_count () == 32 ? 8 : 6 )
425+ u₂high = iszero (X₃) ? 2 : (dynamic_register_count () == 32 ? 8 : 6 )
429426 return solve_unroll_iter (X, R, u₁L, u₂L, u₁low: u₁step: u₁high, u₂low: u₂step: u₂high)
430427 end
431428 u₁low = floor (Int, u₁float)
432429 u₂low = max (u₂step, floor (Int, 0.8 u₂float)) # must be at least 1
433430 u₁high = solve_unroll_constT (R, u₂low) + u₁step
434431 u₂high = solve_unroll_constU (R, u₁low) + u₂step
435- maxunroll = REGISTER_COUNT == 32 ? (((X₂ > 0 ) & (X₃ > 0 )) ? 10 : 8 ) : 6
432+ maxunroll = dynamic_register_count () == 32 ? (((X₂ > 0 ) & (X₃ > 0 )) ? 10 : 8 ) : 6
436433 u₁low = (min (u₁low, maxunroll) ÷ u₁step) * u₁step
437434 u₂low = (min (u₂low, maxunroll) ÷ u₂step) * u₂step
438435 u₁high = min (u₁high, maxunroll)
@@ -443,18 +440,18 @@ end
"""
    solve_unroll_constU(R::AbstractVector, u₁::Int)

Holding the first unroll factor `u₁` fixed, return

    floor(Int, (dynamic_register_count() - R[3] - R[4] - u₁*R[2]) / (u₁*R[1] + R[5]))

i.e. (for a positive denominator) the largest integer `u₂` with
`u₁*u₂*R[1] + u₁*R[2] + u₂*R[5] ≤ dynamic_register_count() - R[3] - R[4]`.

`R` must have at least five elements. When the denominator `u₁*R[1] + R[5]`
is zero the quotient is undefined and the fixed fallback `8` is returned.

NOTE(review): `R` is presumably the register-pressure coefficient vector used
by `solve_unroll`; confirm the meaning of each entry against the caller.
"""
function solve_unroll_constU(R::AbstractVector, u₁::Int)
    denom = u₁ * R[1] + R[5]
    # No term depends on the free unroll factor: fall back to a fixed bound.
    iszero(denom) && return 8
    # Registers left after the constant terms and the `u₁`-only term.
    available = dynamic_register_count() - R[3] - R[4] - u₁ * R[2]
    return floor(Int, available / denom)
end
"""
    solve_unroll_constT(R::AbstractVector, u₂::Int)

Holding the second unroll factor `u₂` fixed, return

    floor(Int, (dynamic_register_count() - R[3] - R[4] - u₂*R[5]) / (u₂*R[1] + R[2]))

i.e. (for a positive denominator) the largest integer `u₁` with
`u₁*u₂*R[1] + u₁*R[2] + u₂*R[5] ≤ dynamic_register_count() - R[3] - R[4]`.

`R` must have at least five elements. When the denominator `u₂*R[1] + R[2]`
is zero the quotient is undefined and the fixed fallback `8` is returned.

NOTE(review): `R` is presumably the register-pressure coefficient vector used
by `solve_unroll`; confirm the meaning of each entry against the caller.
"""
function solve_unroll_constT(R::AbstractVector, u₂::Int)
    denom = u₂ * R[1] + R[2]
    # No term depends on the free unroll factor: fall back to a fixed bound.
    iszero(denom) && return 8
    # Registers left after the constant terms and the `u₂`-only term.
    available = dynamic_register_count() - R[3] - R[4] - u₂ * R[5]
    return floor(Int, available / denom)
end
453450# function solve_unroll_constT(ls::LoopSet, u₂::Int)
454451# R = @view ls.reg_pres[:,1]
455452# denom = u₂ * R[1] + R[2]
456453# iszero(denom) && return 8
457- # floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
454+ # floor(Int, (dynamic_register_count() - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
458455# end
459456# Tiling here is about alleviating register pressure for the UxT
460457function solve_unroll (X, R, u₁max, u₂max, u₁L, u₂L, u₁step, u₂step)
@@ -501,9 +498,9 @@ function solve_unroll(
501498 W:: Int , vectorized:: Symbol , rounduᵢ:: Int
502499)
503500 (u₁step, u₂step) = if rounduᵢ == 1 # max is to safeguard against some weird arch I've never heard of.
504- (max (1 ,CACHELINE_SIZE ÷ VectorizationBase. REGISTER_SIZE ), 1 )
501+ (max (1 ,VectorizationBase . cacheline_size () ÷ VectorizationBase. dynamic_register_size () ), 1 )
505502 elseif rounduᵢ == 2
506- (1 , max (1 ,CACHELINE_SIZE ÷ VectorizationBase. REGISTER_SIZE ))
503+ (1 , max (1 ,VectorizationBase . cacheline_size () ÷ VectorizationBase. dynamic_register_size () ))
507504 else
508505 (1 , 1 )
509506 end
@@ -522,7 +519,7 @@ function solve_unroll(
522519 u₁loop:: Loop , u₂loop:: Loop ,
523520 u₁step:: Int , u₂step:: Int
524521)
525- maxu₂base = maxu₁base = REGISTER_COUNT == 32 ? 10 : 6 # 8
522+ maxu₂base = maxu₁base = dynamic_register_count () == 32 ? 10 : 6 # 8
526523 maxu₂ = maxu₂base# 8
527524 maxu₁ = maxu₁base# 8
528525 u₁L = length (u₁loop)
@@ -721,13 +718,13 @@ function load_elimination_cost_factor!(
721718 # if isstaticloop(loop) && length(loop) ≤ 4
722719 # itersym = loop.itersymbol
723720 # if itersym !== u₁loopsym && itersym !== u₂loopsym
724- # return (0.25, REGISTER_COUNT == 32 ? 2.0 : 1.0)
721+ # return (0.25, dynamic_register_count() == 32 ? 2.0 : 1.0)
725722 # # return (0.25, 1.0)
726723 # return true
727724 # end
728725 # end
729726 # end
730- # # (0.25, REGISTER_COUNT == 32 ? 1.2 : 1.0)
727+ # # (0.25, dynamic_register_count() == 32 ? 1.2 : 1.0)
731728 # (0.25, 1.0)
732729 cost_vec[1 ] -= 0.1 looplengthprod (ls)
733730 reg_pressure[1 ] += 0.25 rp
@@ -919,7 +916,7 @@ function evaluate_cost_tile(
919916 rt, lat, rp = cost (ls, op, vectorized, Wshift, size_T)
920917 if isload (op)
921918 if ! iszero (prefetchisagoodidea (ls, op, UnrollArgs (4 , unrollsyms, 4 , 0 )))
922- # rt += 0.5VectorizationBase.REGISTER_SIZE / CACHELINE_SIZE
919+ # rt += 0.5VectorizationBase.dynamic_register_size() / VectorizationBase.cacheline_size()
923920 prefetch_good_idea = true
924921 end
925922 end
@@ -936,7 +933,7 @@ function evaluate_cost_tile(
936933 end
937934 # reg_pressure[1] = max(reg_pressure[1], length(ls.outer_reductions))
938935 # @inbounds ((cost_vec[4] > 0) || ((cost_vec[2] > 0) & (cost_vec[3] > 0))) || return 0,0,Inf,false
939- costpenalty = (sum (reg_pressure) > REGISTER_COUNT ) ? 2 : 1
936+ costpenalty = (sum (reg_pressure) > dynamic_register_count () ) ? 2 : 1
940937 u₁v = vectorized === u₁loopsym; u₂v = vectorized === u₂loopsym
941938 round_uᵢ = prefetch_good_idea ? (u₁v ? 1 : (u₂v ? 2 : 0 )) : 0
942939 if (irreducible_storecosts / sum (cost_vec) ≥ 0.25 ) && ! any (op -> loadintostore (ls, op), operations (ls))
0 commit comments