Skip to content

Commit 6fd39a4

Browse files
committed
Track VectorizationBase 0.16
1 parent 96c9b3e commit 6fd39a4

18 files changed

+263
-200
lines changed

Project.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.9.20"
4+
version = "0.10"
55

66
[deps]
77
ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -22,8 +22,9 @@ IfElse = "0.1"
2222
OffsetArrays = "1.4.1, 1.5"
2323
Requires = "1"
2424
SLEEFPirates = "0.6.6"
25+
ThreadingUtilities = "0.2.2"
2526
UnPack = "1"
26-
VectorizationBase = "0.15.3"
27+
VectorizationBase = "0.16"
2728
julia = "1.5"
2829

2930
[extras]

benchmark/loadsharedlibs.jl

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
using LinearAlgebra, LoopVectorization, Libdl
2-
using LoopVectorization.VectorizationBase: REGISTER_SIZE
2+
3+
const REGISTER_SIZE = LoopVectorization.VectorizationBase.register_size()
34

45
# const LOOPVECBENCHDIR = joinpath(pkgdir(LoopVectorization), "benchmark")
56
include(joinpath(LOOPVECBENCHDIR, "looptests.jl"))
@@ -35,15 +36,15 @@ end
3536
eigenfile = joinpath(LOOPVECBENCHDIR, "looptestseigen.cpp")
3637
if !isfile(LIBEIGENTEST) || mtime(eigenfile) > mtime(LIBEIGENTEST)
3738
# Clang seems to have trouble finding includes
38-
if LoopVectorization.VectorizationBase.AVX512F
39+
if LoopVectorization.VectorizationBase.has_feature("x86_64_avx512f")
3940
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
4041
else
4142
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
4243
end
4344
end
4445
if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
4546
# run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/c++/9 -I/usr/include/c++/9/x86_64-generic-linux -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
46-
if LoopVectorization.VectorizationBase.AVX512F
47+
if LoopVectorization.VectorizationBase.has_feature("x86_64_avx512f")
4748
run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
4849
else
4950
run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)

src/LoopVectorization.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ module LoopVectorization
55
# end
66

77
using VectorizationBase, SLEEFPirates, UnPack, OffsetArrays
8-
using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, data,
8+
using VectorizationBase: register_size, register_count, dynamic_register_size, dynamic_register_count, data,
99
mask, pick_vector_width_val, MM,
1010
maybestaticlength, maybestaticsize, staticm1, staticp1, staticmul, vzero,
1111
Zero, maybestaticrange, offsetprecalc, lazymul,
@@ -18,6 +18,7 @@ using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, data,
1818

1919
using IfElse: ifelse
2020

21+
using ThreadingUtilities
2122
using SLEEFPirates: pow
2223
using Base.Broadcast: Broadcasted, DefaultArrayStyle
2324
using LinearAlgebra: Adjoint, Transpose

src/costs.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
7575
return srt, sl, srp
7676
elseif offsetscaling(ic) # offset scaling
7777
srt *= 1 << (Wshift + VectorizationBase.intlog2(sizeof_T) - 4)
78-
if (sizeof_T << Wshift) == 64 # VectorizationBase.REGISTER_SIZE # These instructions experience double latency with zmm
78+
if (sizeof_T << Wshift) == 64 # VectorizationBase.register_size() # These instructions experience double latency with zmm
7979
sl += sl
8080
end
8181
elseif linearscaling(ic) # linear scaling
@@ -90,7 +90,7 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
9090
srt, sl, srp
9191
end
9292

93-
const OPAQUE_INSTRUCTION = InstructionCost(-1.0, 40, 40.0, REGISTER_COUNT)
93+
const OPAQUE_INSTRUCTION = InstructionCost(-1.0, 40, 40.0, 32)
9494

9595
instruction_cost(instruction::Instruction) = instruction.mod === :LoopVectorization ? COST[instruction.instr] : OPAQUE_INSTRUCTION
9696
instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)

src/determinestrategy.jl

Lines changed: 18 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11

2-
const CACHELINE_SIZE = something(VectorizationBase.L₁CACHE.linesize, 64)
32

43
# function indexappearences(op::Operation, s::Symbol)
54
# s ∉ loopdependencies(op) && return 0
@@ -105,7 +104,7 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
105104
# would be nice to add a check for this CPU, to see if such a penalty is still appropriate.
106105
# Also, once more SVE (scalable vector extension) CPUs are released, would be nice to know if
107106
# this feature is common to all of them.
108-
srt += 0.5VectorizationBase.REGISTER_SIZE / CACHELINE_SIZE
107+
srt += 0.5VectorizationBase.dynamic_register_size() / VectorizationBase.cacheline_size()
109108
end
110109
elseif isstore(op) # broadcast or reductionstore; if store we want to penalize reduction
111110
srt *= 3
@@ -388,7 +387,7 @@ end
388387

389388
function solve_unroll_iter(X, R, u₁L, u₂L, u₁range, u₂range)
390389
R₁, R₂, R₃, R₄, R₅ = R[1], R[2], R[3], R[4], R[5]
391-
RR = REGISTER_COUNT - R₃ - R₄
390+
RR = dynamic_register_count() - R₃ - R₄
392391
u₁best, u₂best = 0, 0
393392
bestcost = Inf
394393
for u₁temp ∈ u₁range
@@ -408,13 +407,11 @@ end
408407

409408
function solve_unroll(X, R, u₁L, u₂L, u₁step, u₂step)
410409
X₁, X₂, X₃, X₄ = X[1], X[2], X[3], X[4]
411-
# If we don't have AVX512, masks occupy a vector register;
412-
# AVX512F is currently defined as `false` for non-x86 CPUs, but
413-
# should instead define generic constant `HAS_OPMASK_REGISTERS` in VectorizationBase.jl to use here instead.
414-
VectorizationBase.AVX512F || (R[3] += 1)
410+
# If we don't have opmask registers, masks probably occupy a vector register (e.g., on CPUs with AVX but not AVX512)
411+
VectorizationBase.dynamic_has_opmask_registers() || (R[3] += 1)
415412
R₁, R₂, R₃, R₄, R₅ = R[1], R[2], R[3], R[4], R[5]
416413
iszero(R₅) || return solve_unroll_iter(X, R, u₁L, u₂L, u₁step:u₁step:10, u₂step:u₂step:10)
417-
RR = REGISTER_COUNT - R₃ - R₄
414+
RR = dynamic_register_count() - R₃ - R₄
418415
a = R₂^2*X₃ -R₁*X₄ * R₂ - R₁*X₂*RR
419416
b = R₁ * X₄ * RR - R₁ * X₄ * RR - 2X₃*RR*R₂
420417
c = X₃*RR^2
@@ -424,15 +421,15 @@ function solve_unroll(X, R, u₁L, u₂L, u₁step, u₂step)
424421
u₂float = (RR - u₁float*R₂)/(u₁float*R₁)
425422
if !(isfinite(u₂float) & isfinite(u₁float)) # brute force
426423
u₁low = u₂low = 1
427-
u₁high = iszero(X₂) ? 2 : (REGISTER_COUNT == 32 ? 8 : 6)
428-
u₂high = iszero(X₃) ? 2 : (REGISTER_COUNT == 32 ? 8 : 6)
424+
u₁high = iszero(X₂) ? 2 : (dynamic_register_count() == 32 ? 8 : 6)
425+
u₂high = iszero(X₃) ? 2 : (dynamic_register_count() == 32 ? 8 : 6)
429426
return solve_unroll_iter(X, R, u₁L, u₂L, u₁low:u₁step:u₁high, u₂low:u₂step:u₂high)
430427
end
431428
u₁low = floor(Int, u₁float)
432429
u₂low = max(u₂step, floor(Int, 0.8u₂float)) # must be at least 1
433430
u₁high = solve_unroll_constT(R, u₂low) + u₁step
434431
u₂high = solve_unroll_constU(R, u₁low) + u₂step
435-
maxunroll = REGISTER_COUNT == 32 ? (((X₂ > 0) & (X₃ > 0)) ? 10 : 8) : 6
432+
maxunroll = dynamic_register_count() == 32 ? (((X₂ > 0) & (X₃ > 0)) ? 10 : 8) : 6
436433
u₁low = (min(u₁low, maxunroll) ÷ u₁step) * u₁step
437434
u₂low = (min(u₂low, maxunroll) ÷ u₂step) * u₂step
438435
u₁high = min(u₁high, maxunroll)
@@ -443,18 +440,18 @@ end
443440
function solve_unroll_constU(R::AbstractVector, u₁::Int)
444441
denom = u₁ * R[1] + R[5]
445442
iszero(denom) && return 8
446-
floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₁*R[2]) / denom)
443+
floor(Int, (dynamic_register_count() - R[3] - R[4] - u₁*R[2]) / denom)
447444
end
448445
function solve_unroll_constT(R::AbstractVector, u₂::Int)
449446
denom = u₂ * R[1] + R[2]
450447
iszero(denom) && return 8
451-
floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / denom)
448+
floor(Int, (dynamic_register_count() - R[3] - R[4] - u₂*R[5]) / denom)
452449
end
453450
# function solve_unroll_constT(ls::LoopSet, u₂::Int)
454451
# R = @view ls.reg_pres[:,1]
455452
# denom = u₂ * R[1] + R[2]
456453
# iszero(denom) && return 8
457-
# floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
454+
# floor(Int, (dynamic_register_count() - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
458455
# end
459456
# Tiling here is about alleviating register pressure for the UxT
460457
function solve_unroll(X, R, u₁max, u₂max, u₁L, u₂L, u₁step, u₂step)
@@ -501,9 +498,9 @@ function solve_unroll(
501498
W::Int, vectorized::Symbol, rounduᵢ::Int
502499
)
503500
(u₁step, u₂step) = if rounduᵢ == 1 # max is to safeguard against some weird arch I've never heard of.
504-
(max(1,CACHELINE_SIZE ÷ VectorizationBase.REGISTER_SIZE), 1)
501+
(max(1,VectorizationBase.cacheline_size() ÷ VectorizationBase.dynamic_register_size()), 1)
505502
elseif rounduᵢ == 2
506-
(1, max(1,CACHELINE_SIZE ÷ VectorizationBase.REGISTER_SIZE))
503+
(1, max(1,VectorizationBase.cacheline_size() ÷ VectorizationBase.dynamic_register_size()))
507504
else
508505
(1, 1)
509506
end
@@ -522,7 +519,7 @@ function solve_unroll(
522519
u₁loop::Loop, u₂loop::Loop,
523520
u₁step::Int, u₂step::Int
524521
)
525-
maxu₂base = maxu₁base = REGISTER_COUNT == 32 ? 10 : 6#8
522+
maxu₂base = maxu₁base = dynamic_register_count() == 32 ? 10 : 6#8
526523
maxu₂ = maxu₂base#8
527524
maxu₁ = maxu₁base#8
528525
u₁L = length(u₁loop)
@@ -721,13 +718,13 @@ function load_elimination_cost_factor!(
721718
# if isstaticloop(loop) && length(loop) ≤ 4
722719
# itersym = loop.itersymbol
723720
# if itersym !== u₁loopsym && itersym !== u₂loopsym
724-
# return (0.25, REGISTER_COUNT == 32 ? 2.0 : 1.0)
721+
# return (0.25, dynamic_register_count() == 32 ? 2.0 : 1.0)
725722
# # return (0.25, 1.0)
726723
# return true
727724
# end
728725
# end
729726
# end
730-
# # (0.25, REGISTER_COUNT == 32 ? 1.2 : 1.0)
727+
# # (0.25, dynamic_register_count() == 32 ? 1.2 : 1.0)
731728
# (0.25, 1.0)
732729
cost_vec[1] -= 0.1looplengthprod(ls)
733730
reg_pressure[1] += 0.25rp
@@ -919,7 +916,7 @@ function evaluate_cost_tile(
919916
rt, lat, rp = cost(ls, op, vectorized, Wshift, size_T)
920917
if isload(op)
921918
if !iszero(prefetchisagoodidea(ls, op, UnrollArgs(4, unrollsyms, 4, 0)))
922-
# rt += 0.5VectorizationBase.REGISTER_SIZE / CACHELINE_SIZE
919+
# rt += 0.5VectorizationBase.dynamic_register_size() / VectorizationBase.cacheline_size()
923920
prefetch_good_idea = true
924921
end
925922
end
@@ -936,7 +933,7 @@ function evaluate_cost_tile(
936933
end
937934
# reg_pressure[1] = max(reg_pressure[1], length(ls.outer_reductions))
938935
# @inbounds ((cost_vec[4] > 0) || ((cost_vec[2] > 0) & (cost_vec[3] > 0))) || return 0,0,Inf,false
939-
costpenalty = (sum(reg_pressure) > REGISTER_COUNT) ? 2 : 1
936+
costpenalty = (sum(reg_pressure) > dynamic_register_count()) ? 2 : 1
940937
u₁v = vectorized === u₁loopsym; u₂v = vectorized === u₂loopsym
941938
round_uᵢ = prefetch_good_idea ? (u₁v ? 1 : (u₂v ? 2 : 0)) : 0
942939
if (irreducible_storecosts / sum(cost_vec) ≥ 0.25) && !any(op -> loadintostore(ls, op), operations(ls))

src/filter.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T <: NativeT
1414
vy = vload(ptr_y, zero_index)
1515
mask = f(vy)
1616
VectorizationBase.compressstore!(gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, mask)
17-
ptr_y = gep(ptr_y, VectorizationBase.REGISTER_SIZE)
17+
ptr_y = gep(ptr_y, register_size())
1818
j = vadd_fast(j, count_ones(mask))
1919
end
2020
rem_mask = VectorizationBase.mask(T, Nrem)

src/lower_constant.jl

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,13 @@ end
1818
@inline sizeequivalentint(::Type{Float32}) = Int32
1919
@inline sizeequivalentfloat(::Type{T}, x) where {T} = sizeequivalentfloat(T)(x)
2020
@inline sizeequivalentint(::Type{T}, x) where {T} = sizeequivalentint(T)(x)
21-
if VectorizationBase.AVX512DQ || !((Sys.ARCH === :x86_64) || (Sys.ARCH === :i686))
22-
@inline sizeequivalentint(::Type{Float64}) = Int64
23-
else
24-
@inline sizeequivalentint(::Type{Float64}) = Int32
21+
@generated function sizeequivalentint(::Type{Float64})
22+
if !((Sys.ARCH === :x86_64) || (Sys.ARCH === :i686)) || VectorizationBase.has_feature("x86_64_avx512dq")
23+
:Int64
24+
else
25+
:Int32
26+
end
2527
end
26-
2728
# @inline onefloat(::Type{T}) where {T} = one(sizeequivalentfloat(T))
2829
# @inline oneinteger(::Type{T}) where {T} = one(sizeequivalentint(T))
2930
@inline zerofloat(::Type{T}) where {T} = zero(sizeequivalentfloat(T))

src/lower_load.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,7 +70,7 @@ function prefetchisagoodidea(ls::LoopSet, op::Operation, td::UnrollArgs)
7070
length(loopdependencies(op)) ≤ 1 && return 0
7171
vectorized ∈ loopdependencies(op) || return 0
7272
u₂loopsym === Symbol("##undefined##") && return 0
73-
dontskip = (CACHELINE_SIZE ÷ VectorizationBase.REGISTER_SIZE) - 1
73+
dontskip = (VectorizationBase.cacheline_size() ÷ VectorizationBase.dynamic_register_size()) - 1
7474
# u₂loopsym is vectorized
7575
# u₁vectorized = vectorized === u₁loopsym
7676
u₂vectorized = vectorized === u₂loopsym
@@ -101,7 +101,7 @@ function prefetchisagoodidea(ls::LoopSet, op::Operation, td::UnrollArgs)
101101
end
102102
function add_prefetches!(q::Expr, ls::LoopSet, op::Operation, td::UnrollArgs, prefetchind::Int, umin::Int)
103103
@unpack u₁, u₁loopsym, u₂loopsym, vectorized, u₂max = td
104-
dontskip = (64 ÷ VectorizationBase.REGISTER_SIZE) - 1
104+
dontskip = (64 ÷ VectorizationBase.dynamic_register_size()) - 1
105105
ptr = vptr(op)
106106
innermostloopsym = first(names(ls))
107107
us = ls.unrollspecification[]

src/lowering.jl

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -845,7 +845,6 @@ end
845845
function lsexpr(ls::LoopSet, q)
846846
Expr(:block, ls.preamble, q)
847847
end
848-
const ISZEN1 = Sys.CPU_NAME === "znver1"
849848
function calc_Ureduct(ls::LoopSet, us::UnrollSpecification)
850849
@unpack u₁loopnum, u₁, u₂, vectorizedloopnum = us
851850
if iszero(length(ls.outer_reductions))
@@ -854,7 +853,7 @@ function calc_Ureduct(ls::LoopSet, us::UnrollSpecification)
854853
loopisstatic = isstaticloop(getloop(ls, names(ls)[u₁loopnum]))
855854
loopisstatic &= ((vectorizedloopnum != u₁loopnum) | (!iszero(ls.vector_width[])))
856855
# loopisstatic ? u₁ : min(u₁, 4) # much worse than the other two options, don't use this one
857-
if ISZEN1
856+
if Sys.CPU_NAME === "znver1"
858857
loopisstatic ? u₁ : 1
859858
else
860859
loopisstatic ? u₁ : (u₁ ≥ 4 ? 2 : 1)

0 commit comments

Comments
 (0)