JuliaSIMD
diff --git a/‎.github/workflows/ci.yml
Lines changed: 8 additions & 0 deletions b/‎.github/workflows/ci.yml
Lines changed: 8 additions & 0 deletions
diff --git a/‎Project.toml
Lines changed: 3 additions & 3 deletions b/‎Project.toml
Lines changed: 3 additions & 3 deletions
diff --git a/‎src/LoopVectorization.jl
Lines changed: 4 additions & 8 deletions b/‎src/LoopVectorization.jl
Lines changed: 4 additions & 8 deletions
diff --git a/‎src/add_stores.jl
Lines changed: 4 additions & 6 deletions b/‎src/add_stores.jl
Lines changed: 4 additions & 6 deletions
diff --git a/‎src/condense_loopset.jl
Lines changed: 5 additions & 5 deletions b/‎src/condense_loopset.jl
Lines changed: 5 additions & 5 deletions
diff --git a/‎src/costs.jl
Lines changed: 44 additions & 18 deletions b/‎src/costs.jl
Lines changed: 44 additions & 18 deletions
diff --git a/‎src/determinestrategy.jl
Lines changed: 2 additions & 2 deletions b/‎src/determinestrategy.jl
Lines changed: 2 additions & 2 deletions
diff --git a/‎src/filter.jl
Lines changed: 27 additions & 27 deletions b/‎src/filter.jl
Lines changed: 27 additions & 27 deletions
@@ -3,9 +3,17 @@ on:
   pull_request:
     branches:
       - master
+    paths-ignore:
+      - 'LICENSE.md'
+      - 'README.md'
+      - '.github/workflows/TagBot.yml'
   push:
     branches:
       - master
+    paths-ignore:
+      - 'LICENSE.md'
+      - 'README.md'
+      - '.github/workflows/TagBot.yml'
     tags: '*'
 jobs:
   test:
 
@@ -17,10 +17,10 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
 ArrayInterface = "2.14.12"
 DocStringExtensions = "0.8"
 IfElse = "0.1"
-OffsetArrays = "1.4.1"
-SLEEFPirates = "0.6.3"
+OffsetArrays = "1.4.1, 1.5"
+SLEEFPirates = "0.6.4"
 UnPack = "1"
-VectorizationBase = "0.14.11"
+VectorizationBase = "0.15"
 julia = "1.5"
 
 [extras]
 
@@ -10,27 +10,23 @@ using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, data,
     maybestaticlength, maybestaticsize, staticm1, staticp1, staticmul, vzero,
     Zero, maybestaticrange, offsetprecalc, lazymul,
     maybestaticfirst, maybestaticlast, scalar_less, scalar_greaterequal, gep, gesp, pointerforcomparison, NativeTypes,
-    vfmadd, vfmsub, vfnmadd, vfnmsub, vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, vadd, vsub, vmul,
+    vfmadd, vfmsub, vfnmadd, vfnmsub, vfmadd_fast, vfmsub_fast, vfnmadd_fast, vfnmsub_fast, vfmadd231, vfmsub231, vfnmadd231, vfnmsub231,
+    vfma_fast, vmuladd_fast, vdiv_fast, vadd_fast, vsub_fast, vmul_fast,
     relu, stridedpointer, StridedPointer, StridedBitPointer, AbstractStridedPointer,
     reduced_add, reduced_prod, reduce_to_add, reduce_to_prod, reduced_max, reduced_min, reduce_to_max, reduce_to_min,
     vsum, vprod, vmaximum, vminimum, vstorent!
 
 using IfElse: ifelse
 
-# missing: stridedpointer_for_broadcast, noalias!, gepbyte,
-# using SIMDPirates: VECTOR_SYMBOLS, evadd, evsub, evmul, evfdiv, vrange,
-#     reduced_add, reduced_prod, reduce_to_add, reduced_max, reduced_min, vsum, vprod, vmaximum, vminimum,
-#     sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
-#     vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, sizeequivalentfloat, sizeequivalentint, #prefetch,
-#     vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone, vadd1, relu
 using SLEEFPirates: pow
 using Base.Broadcast: Broadcasted, DefaultArrayStyle
 using LinearAlgebra: Adjoint, Transpose
 using Base.Meta: isexpr
 using DocStringExtensions
 import LinearAlgebra # for check_args
 
-using Base.FastMath: add_fast, sub_fast, mul_fast, div_fast
+using Base.FastMath: add_fast, sub_fast, mul_fast, div_fast, inv_fast, abs2_fast, rem_fast, max_fast, min_fast
+
 
 using ArrayInterface
 using ArrayInterface: OptionallyStaticUnitRange, Zero, One#, static_length
 
@@ -71,12 +71,10 @@ function add_store_ref!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int)
     add_store!(ls, var, array, raw_indices, elementbytes)
 end
 function add_store_ref!(ls::LoopSet, var, ex::Expr, elementbytes::Int)
-    # array, raw_indices = ref_from_ref(ex)
-    # mpref = array_reference_meta!(ls, array, raw_indices, elementbytes)
-    # c = add_constant!(ls, var, loopdependencies(mpref), gensym(:storeconst), elementbytes)
-    # add_store!(ls, name(c), mpref, elementbytes)
-    c = add_constant!(ls, var, elementbytes)
-    add_store_ref!(ls, name(c), ex, elementbytes)
+    array, raw_indices = ref_from_ref!(ls, ex)
+    mpref = array_reference_meta!(ls, array, raw_indices, elementbytes)
+    c = add_constant!(ls, var, loopdependencies(mpref), gensym(:storeconst), elementbytes)
+    add_store!(ls, mpref, elementbytes, c)
 end
 
 # For now, it is illegal to load from a conditional store.
 
@@ -241,10 +241,11 @@ function generate_call(ls::LoopSet, inline_unroll::NTuple{3,Int8}, debug::Bool =
     lbarg = debug ? Expr(:call, :typeof, loop_bounds) : loop_bounds
     q = Expr(
         :call, func, val(Expr(:tuple, inline, u₁, u₂, Expr(:call, lv(:unwrap), VECTORWIDTHSYMBOL))),
-        val(operation_descriptions), val(arrayref_descriptions), val(argmeta), val(loop_syms), lbarg
+        val(operation_descriptions), val(arrayref_descriptions), val(argmeta), val(loop_syms)
     )
     # debug && deleteat!(q.args, 2)
-    vargs_as_tuple = !debug
+    vargs_as_tuple = true#!debug
+    vargs_as_tuple || push!(q.args, lbarg)
     extra_args = vargs_as_tuple ? Expr(:tuple) : q
     foreach(ref -> push!(extra_args.args, vptr(ref)), ls.refs_aliasing_syms)
 
@@ -253,7 +254,7 @@ function generate_call(ls::LoopSet, inline_unroll::NTuple{3,Int8}, debug::Bool =
     add_reassigned_syms!(extra_args, ls)
     add_external_functions!(extra_args, ls)
     # debug && return q
-    vargs_as_tuple && push!(q.args, extra_args)
+    vargs_as_tuple && push!(q.args, Expr(:tuple, lbarg, extra_args))
     vecwidthdefq = Expr(:block)
     define_eltype_vec_width!(vecwidthdefq, ls, nothing)
     Expr(:block, vecwidthdefq, q)
@@ -305,8 +306,7 @@ make_crashy(q) = Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,
 
 function setup_call_inline(ls::LoopSet, inline::Int8 = zero(Int8), U::Int8 = zero(Int8), T::Int8 = zero(Int8))
     call = generate_call(ls, (inline,U,T))
-    noouterreductions = iszero(length(ls.outer_reductions))
-    if noouterreductions
+    if iszero(length(ls.outer_reductions))
         q = Expr(:block,gc_preserve(ls, call))
         append!(ls.preamble.args, q.args)
         return ls.preamble
 
@@ -137,7 +137,9 @@ const COST = Dict{Symbol,InstructionCost}(
     :mul_fast => InstructionCost(4,0.5),
     # :vfdiv => InstructionCost(13,4.0,-2.0),
     # :vfdiv! => InstructionCost(13,4.0,-2.0),
+    :rem_fast => InstructionCost(13,4.0,-2.0), # FIXME
     :div_fast => InstructionCost(13,4.0,-2.0),
+    :vdiv_fast => InstructionCost(20,4.0,-2.0), # FIXME
     # :evadd => InstructionCost(4,0.5),
     # :evsub => InstructionCost(4,0.5),
     # :evmul => InstructionCost(4,0.5),
@@ -152,7 +154,7 @@ const COST = Dict{Symbol,InstructionCost}(
     :reduce_to_prod => InstructionCost(0,0.0,0.0,0),
     :abs => InstructionCost(1, 0.5),
     :abs2 => InstructionCost(4,0.5),
-    # :vabs2 => InstructionCost(4,0.5),
+    :abs2_fast => InstructionCost(4,0.5),
     :(==) => InstructionCost(1, 0.5),
     :(!=) => InstructionCost(1, 0.5),
     :(isnan) => InstructionCost(1, 0.5),
@@ -179,19 +181,26 @@ const COST = Dict{Symbol,InstructionCost}(
     :iseven => InstructionCost(1, 0.5),
     :max => InstructionCost(4,0.5),
     :min => InstructionCost(4,0.5),
+    :max_fast => InstructionCost(4,0.5),
+    :min_fast => InstructionCost(4,0.5),
     :relu => InstructionCost(4,0.5),
     # Instruction(:ifelse) => InstructionCost(1, 0.5),
     :ifelse => InstructionCost(1, 0.5),
     :inv => InstructionCost(13,4.0,-2.0,1),
+    :inv_fast => InstructionCost(10,4.0,-2.0,1), # FIXME
     # :vinv => InstructionCost(13,4.0,-2.0,1),
     :muladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
     :fma => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
-    # :vmuladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
-    # :vfma => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
+    :vmuladd_fast => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
+    :vfma_fast => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
     :vfmadd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
     :vfmsub => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
     :vfnmadd => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
     :vfnmsub => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
+    :vfmadd_fast => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
+    :vfmsub_fast => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
+    :vfnmadd_fast => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
+    :vfnmsub_fast => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
     :vfmadd231 => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
     :vfmsub231 => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
     :vfnmadd231 => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
@@ -289,9 +298,15 @@ const REDUCTION_CLASS = Dict{Symbol,Float64}(
     :* => MULTIPLICATIVE_IN_REDUCTIONS,
     :vadd => ADDITIVE_IN_REDUCTIONS,
     :vsub => ADDITIVE_IN_REDUCTIONS,
+    :add_fast => ADDITIVE_IN_REDUCTIONS,
+    :sub_fast => ADDITIVE_IN_REDUCTIONS,
+    :vadd_fast => ADDITIVE_IN_REDUCTIONS,
+    :vsub_fast => ADDITIVE_IN_REDUCTIONS,
     # :vadd! => ADDITIVE_IN_REDUCTIONS,
     # :vsub! => ADDITIVE_IN_REDUCTIONS,
     :vmul => MULTIPLICATIVE_IN_REDUCTIONS,
+    :mul_fast => MULTIPLICATIVE_IN_REDUCTIONS,
+    :vmul_fast => MULTIPLICATIVE_IN_REDUCTIONS,
     # :vmul! => MULTIPLICATIVE_IN_REDUCTIONS,
     # :evadd => ADDITIVE_IN_REDUCTIONS,
     # :evsub => ADDITIVE_IN_REDUCTIONS,
@@ -300,12 +315,16 @@ const REDUCTION_CLASS = Dict{Symbol,Float64}(
     :| => ANY,
     :muladd => ADDITIVE_IN_REDUCTIONS,
     :fma => ADDITIVE_IN_REDUCTIONS,
-    # :vmuladd => ADDITIVE_IN_REDUCTIONS,
-    # :vfma => ADDITIVE_IN_REDUCTIONS,
+    :vmuladd_fast => ADDITIVE_IN_REDUCTIONS,
+    :vfma_fast => ADDITIVE_IN_REDUCTIONS,
     :vfmadd => ADDITIVE_IN_REDUCTIONS,
     :vfmsub => ADDITIVE_IN_REDUCTIONS,
     :vfnmadd => ADDITIVE_IN_REDUCTIONS,
     :vfnmsub => ADDITIVE_IN_REDUCTIONS,
+    :vfmadd_fast => ADDITIVE_IN_REDUCTIONS,
+    :vfmsub_fast => ADDITIVE_IN_REDUCTIONS,
+    :vfnmadd_fast => ADDITIVE_IN_REDUCTIONS,
+    :vfnmsub_fast => ADDITIVE_IN_REDUCTIONS,
     :vfmadd231 => ADDITIVE_IN_REDUCTIONS,
     :vfmsub231 => ADDITIVE_IN_REDUCTIONS,
     :vfnmadd231 => ADDITIVE_IN_REDUCTIONS,
@@ -314,22 +333,20 @@ const REDUCTION_CLASS = Dict{Symbol,Float64}(
     # :vfnmadd! => ADDITIVE_IN_REDUCTIONS,
     # :vfmsub! => ADDITIVE_IN_REDUCTIONS,
     # :vfnmsub! => ADDITIVE_IN_REDUCTIONS,
-    :vfmadd_fast => ADDITIVE_IN_REDUCTIONS,
-    :vfmsub_fast => ADDITIVE_IN_REDUCTIONS,
-    :vfnmadd_fast => ADDITIVE_IN_REDUCTIONS,
-    :vfnmsub_fast => ADDITIVE_IN_REDUCTIONS,
     :reduced_add => ADDITIVE_IN_REDUCTIONS,
     :reduced_prod => MULTIPLICATIVE_IN_REDUCTIONS,
     :reduced_all => ALL,
     :reduced_any => ANY,
     :max => MAX,
-    :min => MIN
+    :min => MIN,
+    :max_fast => MAX,
+    :min_fast => MIN
 )
 reduction_instruction_class(instr::Symbol) = get(REDUCTION_CLASS, instr, NaN)
 reduction_instruction_class(instr::Instruction) = reduction_instruction_class(instr.instr)
 function reduction_to_single_vector(x::Float64)
     # x == 1.0 ? :evadd : x == 2.0 ? :evmul : x == 3.0 ? :vor : x == 4.0 ? :vand : x == 5.0 ? :max : x == 6.0 ? :min : throw("Reduction not found.")
-    x == ADDITIVE_IN_REDUCTIONS ? :vadd : x == MULTIPLICATIVE_IN_REDUCTIONS ? :vmul : x == MAX ? :max : x == MIN ? :min : throw("Reduction not found.")
+    x == ADDITIVE_IN_REDUCTIONS ? :(+) : x == MULTIPLICATIVE_IN_REDUCTIONS ? :(*) : x == MAX ? :max : x == MIN ? :min : throw("Reduction not found.")
 end
 reduction_to_single_vector(x) = reduction_to_single_vector(reduction_instruction_class(x))
 # function reduction_to_scalar(x::Float64)
@@ -366,21 +383,22 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}(
     typeof(+) => :(+),
     typeof(VectorizationBase.vadd) => :(+),
     # typeof(VectorizationBase.vadd!) => :(+),
-    typeof(Base.FastMath.add_fast) => :(+),
+    typeof(Base.FastMath.add_fast) => :add_fast,
     typeof(-) => :(-),
     typeof(VectorizationBase.vsub) => :(-),
     # typeof(VectorizationBase.vsub!) => :(-),
-    typeof(Base.FastMath.sub_fast) => :(-),
+    typeof(Base.FastMath.sub_fast) => :sub_fast,
     typeof(*) => :(*),
     typeof(VectorizationBase.vmul) => :(*),
     # typeof(VectorizationBase.vmul!) => :(*),
-    typeof(Base.FastMath.mul_fast) => :(*),
+    typeof(Base.FastMath.mul_fast) => :mul_fast,
     typeof(/) => :(/),
     typeof(^) => :(^),
     # typeof(VectorizationBase.vfdiv) => :(/),
     # typeof(VectorizationBase.vfdiv!) => :(/),
     typeof(VectorizationBase.vdiv) => :(/),
-    typeof(Base.FastMath.div_fast) => :(/),
+    typeof(Base.FastMath.div_fast) => :div_fast,
+    typeof(Base.FastMath.rem_fast) => :rem_fast,
     typeof(==) => :(==),
     typeof(!=) => :(!=),
     typeof(isequal) => :isequal,
@@ -389,6 +407,7 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}(
     typeof(isfinite) => :isfinite,
     typeof(abs) => :abs,
     typeof(abs2) => :abs2,
+    typeof(abs2_fast) => :abs2_fast,
     typeof(~) => :(~),
     typeof(!) => :(!),
     typeof(&) => :(&),
@@ -399,14 +418,19 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}(
     typeof(>=) => :(>=),
     typeof(<=) => :(<=),
     typeof(inv) => :inv,
+    typeof(inv_fast) => :inv_fast,
     typeof(muladd) => :muladd,
     typeof(fma) => :fma,
-    # typeof(VectorizationBase.vmuladd) => :vmuladd,
-    # typeof(VectorizationBase.vfma) => :vfma,
-    typeof(VectorizationBase.vfmadd) => :vfmadd,
+    typeof(VectorizationBase.vfma) => :vfma,
+    typeof(VectorizationBase.vmuladd) => :vmuladd,
     typeof(VectorizationBase.vfmsub) => :vfmsub,
     typeof(VectorizationBase.vfnmadd) => :vfnmadd,
     typeof(VectorizationBase.vfnmsub) => :vfnmsub,
+    typeof(VectorizationBase.vfma_fast) => :vfma_fast,
+    typeof(VectorizationBase.vmuladd_fast) => :vmuladd_fast,
+    typeof(VectorizationBase.vfmsub_fast) => :vfmsub_fast,
+    typeof(VectorizationBase.vfnmadd_fast) => :vfnmadd_fast,
+    typeof(VectorizationBase.vfnmsub_fast) => :vfnmsub_fast,
     typeof(VectorizationBase.vfmadd231) => :vfmadd231,
     typeof(VectorizationBase.vfmsub231) => :vfmsub231,
     typeof(VectorizationBase.vfnmadd231) => :vfnmadd231,
@@ -454,6 +478,8 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}(
     # typeof(SLEEFPirates.tanh_fast) => :tanh_fast,
     typeof(max) => :max,
     typeof(min) => :min,
+    typeof(max_fast) => :max_fast,
+    typeof(min_fast) => :min_fast,
     typeof(relu) => :relu,
     typeof(<<) => :<<,
     typeof(>>) => :>>,
 
@@ -13,7 +13,7 @@ const CACHELINE_SIZE = something(VectorizationBase.L₁CACHE.linesize, 64)
 #     for opp ∈ parents(op)
 #         newapp += indexappearences(opp, s)
 #     end
-#     factor = instruction(op).instr ∈ (:+, :vadd, :add_fast, :evadd) ? 1 : 10
+#     factor = instruction(op).instr ∈ (:+, :vadd, :add_fast, :vadd_fast) ? 1 : 10
 #     newapp * factor
 # end
 function check_linear_parents(ls::LoopSet, op::Operation, s::Symbol)
@@ -82,7 +82,7 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
     instr = instruction(op)
     # instr = instruction(op)
     if length(parents(op)) == 1
-        if instr == Instruction(:-) || instr === Instruction(:vsub) || instr == Instruction(:+) || instr == Instruction(:vadd)
+        if instr == Instruction(:-) || instr === Instruction(:sub_fast) || instr == Instruction(:+) || instr == Instruction(:add_fast)
             return 0.0, 0, 0.0
         end
     elseif iscompute(op) && all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))
 
@@ -1,42 +1,42 @@
 
-if (Base.libllvm_version ≥ v"7" && VectorizationBase.AVX512F) || Base.libllvm_version ≥ v"9"
-    function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T <: NativeTypes}
-        W, Wshift = VectorizationBase.pick_vector_width_shift(T)
-        N = length(y)
-        Nrep = N >>> Wshift
-        Nrem = N & (W - 1)
-        j = 0
-        st = VectorizationBase.static_sizeof(T)
-        zero_index = MM{W}(Static(0), st)
-        GC.@preserve x y begin
-            ptr_x = pointer(x)
-            ptr_y = pointer(y)
-            for _ ∈ 1:Nrep
-                vy = vload(ptr_y, zero_index)
-                mask = f(vy)
-                VectorizationBase.compressstore!(gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, mask)
-                ptr_y = gep(ptr_y, VectorizationBase.REGISTER_SIZE)
-                j = vadd(j, count_ones(mask))
-            end
-            rem_mask = VectorizationBase.mask(T, Nrem)
-            vy = vload(ptr_y, zero_index, rem_mask)
-            mask = rem_mask & f(vy)
+function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T <: NativeTypes}
+    W, Wshift = VectorizationBase.pick_vector_width_shift(T)
+    N = length(y)
+    Nrep = N >>> Wshift
+    Nrem = N & (W - 1)
+    j = 0
+    st = VectorizationBase.static_sizeof(T)
+    zero_index = MM{W}(Static(0), st)
+    GC.@preserve x y begin
+        ptr_x = pointer(x)
+        ptr_y = pointer(y)
+        for _ ∈ 1:Nrep
+            vy = vload(ptr_y, zero_index)
+            mask = f(vy)
             VectorizationBase.compressstore!(gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, mask)
-            j = vadd(j, count_ones(mask))
-            Base._deleteend!(x, N-j) # resize!(x, j)
+            ptr_y = gep(ptr_y, VectorizationBase.REGISTER_SIZE)
+            j = vadd_fast(j, count_ones(mask))
         end
-        x
+        rem_mask = VectorizationBase.mask(T, Nrem)
+        vy = vload(ptr_y, zero_index, rem_mask)
+        mask = rem_mask & f(vy)
+        VectorizationBase.compressstore!(gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, mask)
+        j = vadd_fast(j, count_ones(mask))
+        Base._deleteend!(x, N-j) # resize!(x, j)
     end
-    vfilter!(f::F, x::Vector{T}) where {F, T<:NativeTypes} = vfilter!(f, x, x)
-    vfilter(f::F, y::AbstractArray{T}) where {F, T<:NativeTypes} = vfilter!(f, Vector{T}(undef, length(y)), y)
+    x
 end
+vfilter!(f::F, x::Vector{T}) where {F, T<:NativeTypes} = vfilter!(f, x, x)
+vfilter(f::F, y::AbstractArray{T}) where {F, T<:NativeTypes} = vfilter!(f, Vector{T}(undef, length(y)), y)
 vfilter(f::F, y) where {F} = filter(f, y)
 vfilter!(f::F, y) where {F} = filter!(f, y)
 
 """
     vfilter(f, a::AbstractArray)
 
 SIMD-vectorized `filter`, returning an array containing the elements of `a` for which `f` returns `true`.
+
+This function requires AVX512 to be faster than `Base.filter`, because AVX512 adds the compressstore instructions that it uses.
 """
 vfilter