JuliaSIMD
diff --git a/Project.toml b/Project.toml
Lines changed: 3 additions & 2 deletions
diff --git a/benchmark/loadsharedlibs.jl b/benchmark/loadsharedlibs.jl
Lines changed: 4 additions & 3 deletions
diff --git a/src/LoopVectorization.jl b/src/LoopVectorization.jl
Lines changed: 2 additions & 1 deletion
diff --git a/src/costs.jl b/src/costs.jl
Lines changed: 2 additions & 2 deletions
diff --git a/src/determinestrategy.jl b/src/determinestrategy.jl
Lines changed: 18 additions & 21 deletions
diff --git a/src/filter.jl b/src/filter.jl
Lines changed: 1 addition & 1 deletion
diff --git a/src/lower_constant.jl b/src/lower_constant.jl
Lines changed: 6 additions & 5 deletions
diff --git a/src/lower_load.jl b/src/lower_load.jl
Lines changed: 2 additions & 2 deletions
diff --git a/src/lowering.jl b/src/lowering.jl
Lines changed: 1 addition & 2 deletions
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <[email protected]>"]
-version = "0.9.20"
+version = "0.10"
 
 [deps]
 ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9"
@@ -22,8 +22,9 @@ IfElse = "0.1"
 OffsetArrays = "1.4.1, 1.5"
 Requires = "1"
 SLEEFPirates = "0.6.6"
+ThreadingUtilities = "0.2.2"
 UnPack = "1"
-VectorizationBase = "0.15.3"
+VectorizationBase = "0.16"
 julia = "1.5"
 
 [extras]
 
@@ -1,5 +1,6 @@
 using LinearAlgebra, LoopVectorization, Libdl
-using LoopVectorization.VectorizationBase: REGISTER_SIZE
+
+const REGISTER_SIZE = LoopVectorization.VectorizationBase.register_size()
 
 # const LOOPVECBENCHDIR = joinpath(pkgdir(LoopVectorization), "benchmark")
 include(joinpath(LOOPVECBENCHDIR, "looptests.jl"))
@@ -35,15 +36,15 @@ end
 eigenfile = joinpath(LOOPVECBENCHDIR, "looptestseigen.cpp")
 if !isfile(LIBEIGENTEST) || mtime(eigenfile) > mtime(LIBEIGENTEST)
     # Clang seems to have trouble finding includes
-    if LoopVectorization.VectorizationBase.AVX512F
+    if LoopVectorization.VectorizationBase.has_feature("x86_64_avx512f")
         run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
     else
         run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
     end
 end
 if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
     # run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/c++/9 -I/usr/include/c++/9/x86_64-generic-linux -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
-    if LoopVectorization.VectorizationBase.AVX512F
+    if LoopVectorization.VectorizationBase.has_feature("x86_64_avx512f")
         run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
     else
         run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
 
@@ -5,7 +5,7 @@ module LoopVectorization
 # end
 
 using VectorizationBase, SLEEFPirates, UnPack, OffsetArrays
-using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, data,
+using VectorizationBase: register_size, register_count, dynamic_register_size, dynamic_register_count, data,
     mask, pick_vector_width_val, MM,
     maybestaticlength, maybestaticsize, staticm1, staticp1, staticmul, vzero,
     Zero, maybestaticrange, offsetprecalc, lazymul,
@@ -18,6 +18,7 @@ using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, data,
 
 using IfElse: ifelse
 
+using ThreadingUtilities
 using SLEEFPirates: pow
 using Base.Broadcast: Broadcasted, DefaultArrayStyle
 using LinearAlgebra: Adjoint, Transpose
 
@@ -75,7 +75,7 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
         return srt, sl, srp
     elseif offsetscaling(ic) # offset scaling
         srt *= 1 << (Wshift + VectorizationBase.intlog2(sizeof_T) - 4)
-        if (sizeof_T << Wshift) == 64 # VectorizationBase.REGISTER_SIZE # These instructions experience double latency with zmm
+        if (sizeof_T << Wshift) == 64 # VectorizationBase.register_size() # These instructions experience double latency with zmm
             sl += sl
         end
     elseif linearscaling(ic) # linear scaling
@@ -90,7 +90,7 @@ function vector_cost(ic::InstructionCost, Wshift, sizeof_T)
     srt, sl, srp
 end
 
-const OPAQUE_INSTRUCTION = InstructionCost(-1.0, 40, 40.0, REGISTER_COUNT)
+const OPAQUE_INSTRUCTION = InstructionCost(-1.0, 40, 40.0, 32)
 
 instruction_cost(instruction::Instruction) = instruction.mod === :LoopVectorization ? COST[instruction.instr] : OPAQUE_INSTRUCTION
 instruction_cost(instruction::Symbol) = get(COST, instruction, OPAQUE_INSTRUCTION)
 
@@ -1,5 +1,4 @@
 
-const CACHELINE_SIZE = something(VectorizationBase.L₁CACHE.linesize, 64)
 
 # function indexappearences(op::Operation, s::Symbol)
 #     s ∉ loopdependencies(op) && return 0
@@ -105,7 +104,7 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
                 #       would be nice to add a check for this CPU, to see if such a penalty is still appropriate.
                 #       Also, once more SVE (scalable vector extension) CPUs are released, would be nice to know if
                 #       this feature is common to all of them.
-                srt += 0.5VectorizationBase.REGISTER_SIZE / CACHELINE_SIZE
+                srt += 0.5VectorizationBase.dynamic_register_size() / VectorizationBase.cacheline_size()
             end
         elseif isstore(op) # broadcast or reductionstore; if store we want to penalize reduction
             srt *= 3
@@ -388,7 +387,7 @@ end
 
 function solve_unroll_iter(X, R, u₁L, u₂L, u₁range, u₂range)
     R₁, R₂, R₃, R₄, R₅ = R[1], R[2], R[3], R[4], R[5]
-    RR = REGISTER_COUNT - R₃ - R₄
+    RR = dynamic_register_count() - R₃ - R₄
     u₁best, u₂best = 0, 0
     bestcost = Inf
     for u₁temp ∈ u₁range
@@ -408,13 +407,11 @@ end
 
 function solve_unroll(X, R, u₁L, u₂L, u₁step, u₂step)
     X₁, X₂, X₃, X₄ = X[1], X[2], X[3], X[4]
-    # If we don't have AVX512, masks occupy a vector register;
-    # AVX512F is currently defined as `false` for non-x86 CPUs, but
-    # should instead define generic constant `HAS_OPMASK_REGISTERS` in VectorizationBase.jl to use here instead.
-    VectorizationBase.AVX512F || (R[3] += 1)
+    # If we don't have opmask registers, masks probably occupy a vector register (e.g., on CPUs with AVX but not AVX512)
+    VectorizationBase.dynamic_has_opmask_registers() || (R[3] += 1)
     R₁, R₂, R₃, R₄, R₅ = R[1], R[2], R[3], R[4], R[5]
     iszero(R₅) || return solve_unroll_iter(X, R, u₁L, u₂L, u₁step:u₁step:10, u₂step:u₂step:10)
-    RR = REGISTER_COUNT - R₃ - R₄
+    RR = dynamic_register_count() - R₃ - R₄
     a = R₂^2*X₃ -R₁*X₄ * R₂ - R₁*X₂*RR
     b = R₁ * X₄ * RR - R₁ * X₄ * RR - 2X₃*RR*R₂
     c = X₃*RR^2
@@ -424,15 +421,15 @@ function solve_unroll(X, R, u₁L, u₂L, u₁step, u₂step)
     u₂float = (RR - u₁float*R₂)/(u₁float*R₁)
     if !(isfinite(u₂float) & isfinite(u₁float)) # brute force
         u₁low = u₂low = 1
-        u₁high = iszero(X₂) ? 2 : (REGISTER_COUNT == 32 ? 8 : 6)
-        u₂high = iszero(X₃) ? 2 : (REGISTER_COUNT == 32 ? 8 : 6)
+        u₁high = iszero(X₂) ? 2 : (dynamic_register_count() == 32 ? 8 : 6)
+        u₂high = iszero(X₃) ? 2 : (dynamic_register_count() == 32 ? 8 : 6)
         return solve_unroll_iter(X, R, u₁L, u₂L, u₁low:u₁step:u₁high, u₂low:u₂step:u₂high)
     end
     u₁low = floor(Int, u₁float)
     u₂low = max(u₂step, floor(Int, 0.8u₂float)) # must be at least 1
     u₁high = solve_unroll_constT(R, u₂low) + u₁step
     u₂high = solve_unroll_constU(R, u₁low) + u₂step
-    maxunroll = REGISTER_COUNT == 32 ? (((X₂ > 0) & (X₃ > 0)) ? 10 : 8) : 6
+    maxunroll = dynamic_register_count() == 32 ? (((X₂ > 0) & (X₃ > 0)) ? 10 : 8) : 6
     u₁low = (min(u₁low, maxunroll) ÷ u₁step) * u₁step
     u₂low = (min(u₂low, maxunroll) ÷ u₂step) * u₂step
     u₁high = min(u₁high, maxunroll)
@@ -443,18 +440,18 @@ end
 function solve_unroll_constU(R::AbstractVector, u₁::Int)
     denom = u₁ * R[1] + R[5]
     iszero(denom) && return 8
-    floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₁*R[2]) / denom)
+    floor(Int, (dynamic_register_count() - R[3] - R[4] - u₁*R[2]) / denom)
 end
 function solve_unroll_constT(R::AbstractVector, u₂::Int)
     denom = u₂ * R[1] + R[2]
     iszero(denom) && return 8
-    floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / denom)
+    floor(Int, (dynamic_register_count() - R[3] - R[4] - u₂*R[5]) / denom)
 end
 # function solve_unroll_constT(ls::LoopSet, u₂::Int)
 #     R = @view ls.reg_pres[:,1]
 #     denom = u₂ * R[1] + R[2]
 #     iszero(denom) && return 8
-#     floor(Int, (REGISTER_COUNT - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
+#     floor(Int, (dynamic_register_count() - R[3] - R[4] - u₂*R[5]) / (u₂ * R[1] + R[2]))
 # end
 # Tiling here is about alleviating register pressure for the UxT
 function solve_unroll(X, R, u₁max, u₂max, u₁L, u₂L, u₁step, u₂step)
@@ -501,9 +498,9 @@ function solve_unroll(
     W::Int, vectorized::Symbol, rounduᵢ::Int
 )
     (u₁step, u₂step) = if rounduᵢ == 1 # max is to safeguard against some weird arch I've never heard of.
-        (max(1,CACHELINE_SIZE ÷ VectorizationBase.REGISTER_SIZE), 1)
+        (max(1,VectorizationBase.cacheline_size() ÷ VectorizationBase.dynamic_register_size()), 1)
     elseif rounduᵢ == 2
-        (1, max(1,CACHELINE_SIZE ÷ VectorizationBase.REGISTER_SIZE))
+        (1, max(1,VectorizationBase.cacheline_size() ÷ VectorizationBase.dynamic_register_size()))
     else
         (1, 1)
     end
@@ -522,7 +519,7 @@ function solve_unroll(
     u₁loop::Loop, u₂loop::Loop,
     u₁step::Int, u₂step::Int
 )
-    maxu₂base = maxu₁base = REGISTER_COUNT == 32 ? 10 : 6#8
+    maxu₂base = maxu₁base = dynamic_register_count() == 32 ? 10 : 6#8
     maxu₂ = maxu₂base#8
     maxu₁ = maxu₁base#8
     u₁L = length(u₁loop)
@@ -721,13 +718,13 @@ function load_elimination_cost_factor!(
         #     if isstaticloop(loop) && length(loop) ≤ 4
         #         itersym = loop.itersymbol
         #         if itersym !== u₁loopsym && itersym !== u₂loopsym
-        #             return (0.25, REGISTER_COUNT == 32 ? 2.0 : 1.0)
+        #             return (0.25, dynamic_register_count() == 32 ? 2.0 : 1.0)
         #             # return (0.25, 1.0)
         #             return true
         #         end
         #     end
         # end
-        # # (0.25, REGISTER_COUNT == 32 ? 1.2 : 1.0)
+        # # (0.25, dynamic_register_count() == 32 ? 1.2 : 1.0)
         # (0.25, 1.0)
         cost_vec[1] -= 0.1looplengthprod(ls)
         reg_pressure[1] += 0.25rp
@@ -919,7 +916,7 @@ function evaluate_cost_tile(
         rt, lat, rp = cost(ls, op, vectorized, Wshift, size_T)
         if isload(op)
             if !iszero(prefetchisagoodidea(ls, op, UnrollArgs(4, unrollsyms, 4, 0)))
-                # rt += 0.5VectorizationBase.REGISTER_SIZE / CACHELINE_SIZE
+                # rt += 0.5VectorizationBase.dynamic_register_size() / VectorizationBase.cacheline_size()
                 prefetch_good_idea = true
             end
         end
@@ -936,7 +933,7 @@ function evaluate_cost_tile(
     end
     # reg_pressure[1] = max(reg_pressure[1], length(ls.outer_reductions))
     # @inbounds ((cost_vec[4] > 0) || ((cost_vec[2] > 0) & (cost_vec[3] > 0))) || return 0,0,Inf,false
-    costpenalty = (sum(reg_pressure) > REGISTER_COUNT) ? 2 : 1
+    costpenalty = (sum(reg_pressure) > dynamic_register_count()) ? 2 : 1
     u₁v = vectorized === u₁loopsym; u₂v = vectorized === u₂loopsym
     round_uᵢ = prefetch_good_idea ? (u₁v ? 1 : (u₂v ? 2 : 0)) : 0
     if (irreducible_storecosts / sum(cost_vec) ≥ 0.25) && !any(op -> loadintostore(ls, op), operations(ls))
 
@@ -14,7 +14,7 @@ function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T <: NativeT
             vy = vload(ptr_y, zero_index)
             mask = f(vy)
             VectorizationBase.compressstore!(gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, mask)
-            ptr_y = gep(ptr_y, VectorizationBase.REGISTER_SIZE)
+            ptr_y = gep(ptr_y, register_size())
             j = vadd_fast(j, count_ones(mask))
         end
         rem_mask = VectorizationBase.mask(T, Nrem)
 
@@ -18,12 +18,13 @@ end
 @inline sizeequivalentint(::Type{Float32}) = Int32
 @inline sizeequivalentfloat(::Type{T}, x) where {T} = sizeequivalentfloat(T)(x)
 @inline sizeequivalentint(::Type{T}, x) where {T} = sizeequivalentint(T)(x)
-if VectorizationBase.AVX512DQ || !((Sys.ARCH === :x86_64) || (Sys.ARCH === :i686))
-    @inline sizeequivalentint(::Type{Float64}) = Int64
-else
-    @inline sizeequivalentint(::Type{Float64}) = Int32
+@generated function sizeequivalentint(::Type{Float64})
+    if !((Sys.ARCH === :x86_64) || (Sys.ARCH === :i686)) || VectorizationBase.has_feature("x86_64_avx512dq")
+        :Int64
+    else
+        :Int32
+    end
 end
-
 # @inline onefloat(::Type{T}) where {T} = one(sizeequivalentfloat(T))
 # @inline oneinteger(::Type{T}) where {T} = one(sizeequivalentint(T))
 @inline zerofloat(::Type{T}) where {T} = zero(sizeequivalentfloat(T))
 
@@ -70,7 +70,7 @@ function prefetchisagoodidea(ls::LoopSet, op::Operation, td::UnrollArgs)
     length(loopdependencies(op)) ≤ 1 && return 0
     vectorized ∈ loopdependencies(op) || return 0
     u₂loopsym === Symbol("##undefined##") && return 0
-    dontskip = (CACHELINE_SIZE ÷ VectorizationBase.REGISTER_SIZE) - 1
+    dontskip = (VectorizationBase.cacheline_size() ÷ VectorizationBase.dynamic_register_size()) - 1
     # u₂loopsym is vectorized
     # u₁vectorized = vectorized === u₁loopsym
     u₂vectorized = vectorized === u₂loopsym
@@ -101,7 +101,7 @@ function prefetchisagoodidea(ls::LoopSet, op::Operation, td::UnrollArgs)
 end
 function add_prefetches!(q::Expr, ls::LoopSet, op::Operation, td::UnrollArgs, prefetchind::Int, umin::Int)
     @unpack u₁, u₁loopsym, u₂loopsym, vectorized, u₂max = td
-    dontskip = (64 ÷ VectorizationBase.REGISTER_SIZE) - 1
+    dontskip = (64 ÷ VectorizationBase.dynamic_register_size()) - 1
     ptr = vptr(op)
     innermostloopsym = first(names(ls))
     us = ls.unrollspecification[]
 
@@ -845,7 +845,6 @@ end
 function lsexpr(ls::LoopSet, q)
     Expr(:block, ls.preamble, q)
 end
-const ISZEN1 = Sys.CPU_NAME === "znver1"
 function calc_Ureduct(ls::LoopSet, us::UnrollSpecification)
     @unpack u₁loopnum, u₁, u₂, vectorizedloopnum = us
     if iszero(length(ls.outer_reductions))
@@ -854,7 +853,7 @@ function calc_Ureduct(ls::LoopSet, us::UnrollSpecification)
         loopisstatic = isstaticloop(getloop(ls, names(ls)[u₁loopnum]))
         loopisstatic &= ((vectorizedloopnum != u₁loopnum) | (!iszero(ls.vector_width[])))
         # loopisstatic ? u₁ : min(u₁, 4) # much worse than the other two options, don't use this one
-        if ISZEN1
+        if Sys.CPU_NAME === "znver1"
             loopisstatic ? u₁ : 1
         else
             loopisstatic ? u₁ : (u₁ ≥ 4 ? 2 : 1)