Skip to content

Commit 6a5fb90

Browse files
committed
Track VectorizationBase update.
1 parent 7342b77 commit 6a5fb90

21 files changed

+264
-225
lines changed

.github/workflows/ci.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,17 @@ on:
33
pull_request:
44
branches:
55
- master
6+
paths-ignore:
7+
- 'LICENSE.md'
8+
- 'README.md'
9+
- '.github/workflows/TagBot.yml'
610
push:
711
branches:
812
- master
13+
paths-ignore:
14+
- 'LICENSE.md'
15+
- 'README.md'
16+
- '.github/workflows/TagBot.yml'
917
tags: '*'
1018
jobs:
1119
test:

Project.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,10 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1717
ArrayInterface = "2.14.12"
1818
DocStringExtensions = "0.8"
1919
IfElse = "0.1"
20-
OffsetArrays = "1.4.1"
21-
SLEEFPirates = "0.6.3"
20+
OffsetArrays = "1.4.1, 1.5"
21+
SLEEFPirates = "0.6.4"
2222
UnPack = "1"
23-
VectorizationBase = "0.14.11"
23+
VectorizationBase = "0.15"
2424
julia = "1.5"
2525

2626
[extras]

src/LoopVectorization.jl

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -10,27 +10,23 @@ using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, data,
1010
maybestaticlength, maybestaticsize, staticm1, staticp1, staticmul, vzero,
1111
Zero, maybestaticrange, offsetprecalc, lazymul,
1212
maybestaticfirst, maybestaticlast, scalar_less, scalar_greaterequal, gep, gesp, pointerforcomparison, NativeTypes,
13-
vfmadd, vfmsub, vfnmadd, vfnmsub, vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, vadd, vsub, vmul,
13+
vfmadd, vfmsub, vfnmadd, vfnmsub, vfmadd_fast, vfmsub_fast, vfnmadd_fast, vfnmsub_fast, vfmadd231, vfmsub231, vfnmadd231, vfnmsub231,
14+
vfma_fast, vmuladd_fast, vdiv_fast, vadd_fast, vsub_fast, vmul_fast,
1415
relu, stridedpointer, StridedPointer, StridedBitPointer, AbstractStridedPointer,
1516
reduced_add, reduced_prod, reduce_to_add, reduce_to_prod, reduced_max, reduced_min, reduce_to_max, reduce_to_min,
1617
vsum, vprod, vmaximum, vminimum, vstorent!
1718

1819
using IfElse: ifelse
1920

20-
# missing: stridedpointer_for_broadcast, noalias!, gepbyte,
21-
# using SIMDPirates: VECTOR_SYMBOLS, evadd, evsub, evmul, evfdiv, vrange,
22-
# reduced_add, reduced_prod, reduce_to_add, reduced_max, reduced_min, vsum, vprod, vmaximum, vminimum,
23-
# sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,
24-
# vfmadd231, vfmsub231, vfnmadd231, vfnmsub231, sizeequivalentfloat, sizeequivalentint, #prefetch,
25-
# vmullog2, vmullog10, vdivlog2, vdivlog10, vmullog2add!, vmullog10add!, vdivlog2add!, vdivlog10add!, vfmaddaddone, vadd1, relu
2621
using SLEEFPirates: pow
2722
using Base.Broadcast: Broadcasted, DefaultArrayStyle
2823
using LinearAlgebra: Adjoint, Transpose
2924
using Base.Meta: isexpr
3025
using DocStringExtensions
3126
import LinearAlgebra # for check_args
3227

33-
using Base.FastMath: add_fast, sub_fast, mul_fast, div_fast
28+
using Base.FastMath: add_fast, sub_fast, mul_fast, div_fast, inv_fast, abs2_fast, rem_fast, max_fast, min_fast
29+
3430

3531
using ArrayInterface
3632
using ArrayInterface: OptionallyStaticUnitRange, Zero, One#, static_length

src/add_stores.jl

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -71,12 +71,10 @@ function add_store_ref!(ls::LoopSet, var::Symbol, ex::Expr, elementbytes::Int)
7171
add_store!(ls, var, array, raw_indices, elementbytes)
7272
end
7373
function add_store_ref!(ls::LoopSet, var, ex::Expr, elementbytes::Int)
74-
# array, raw_indices = ref_from_ref(ex)
75-
# mpref = array_reference_meta!(ls, array, raw_indices, elementbytes)
76-
# c = add_constant!(ls, var, loopdependencies(mpref), gensym(:storeconst), elementbytes)
77-
# add_store!(ls, name(c), mpref, elementbytes)
78-
c = add_constant!(ls, var, elementbytes)
79-
add_store_ref!(ls, name(c), ex, elementbytes)
74+
array, raw_indices = ref_from_ref!(ls, ex)
75+
mpref = array_reference_meta!(ls, array, raw_indices, elementbytes)
76+
c = add_constant!(ls, var, loopdependencies(mpref), gensym(:storeconst), elementbytes)
77+
add_store!(ls, mpref, elementbytes, c)
8078
end
8179

8280
# For now, it is illegal to load from a conditional store.

src/condense_loopset.jl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,11 @@ function generate_call(ls::LoopSet, inline_unroll::NTuple{3,Int8}, debug::Bool =
241241
lbarg = debug ? Expr(:call, :typeof, loop_bounds) : loop_bounds
242242
q = Expr(
243243
:call, func, val(Expr(:tuple, inline, u₁, u₂, Expr(:call, lv(:unwrap), VECTORWIDTHSYMBOL))),
244-
val(operation_descriptions), val(arrayref_descriptions), val(argmeta), val(loop_syms), lbarg
244+
val(operation_descriptions), val(arrayref_descriptions), val(argmeta), val(loop_syms)
245245
)
246246
# debug && deleteat!(q.args, 2)
247-
vargs_as_tuple = !debug
247+
vargs_as_tuple = true#!debug
248+
vargs_as_tuple || push!(q.args, lbarg)
248249
extra_args = vargs_as_tuple ? Expr(:tuple) : q
249250
foreach(ref -> push!(extra_args.args, vptr(ref)), ls.refs_aliasing_syms)
250251

@@ -253,7 +254,7 @@ function generate_call(ls::LoopSet, inline_unroll::NTuple{3,Int8}, debug::Bool =
253254
add_reassigned_syms!(extra_args, ls)
254255
add_external_functions!(extra_args, ls)
255256
# debug && return q
256-
vargs_as_tuple && push!(q.args, extra_args)
257+
vargs_as_tuple && push!(q.args, Expr(:tuple, lbarg, extra_args))
257258
vecwidthdefq = Expr(:block)
258259
define_eltype_vec_width!(vecwidthdefq, ls, nothing)
259260
Expr(:block, vecwidthdefq, q)
@@ -305,8 +306,7 @@ make_crashy(q) = Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,
305306

306307
function setup_call_inline(ls::LoopSet, inline::Int8 = zero(Int8), U::Int8 = zero(Int8), T::Int8 = zero(Int8))
307308
call = generate_call(ls, (inline,U,T))
308-
noouterreductions = iszero(length(ls.outer_reductions))
309-
if noouterreductions
309+
if iszero(length(ls.outer_reductions))
310310
q = Expr(:block,gc_preserve(ls, call))
311311
append!(ls.preamble.args, q.args)
312312
return ls.preamble

src/costs.jl

Lines changed: 44 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,9 @@ const COST = Dict{Symbol,InstructionCost}(
137137
:mul_fast => InstructionCost(4,0.5),
138138
# :vfdiv => InstructionCost(13,4.0,-2.0),
139139
# :vfdiv! => InstructionCost(13,4.0,-2.0),
140+
:rem_fast => InstructionCost(13,4.0,-2.0), # FIXME
140141
:div_fast => InstructionCost(13,4.0,-2.0),
142+
:vdiv_fast => InstructionCost(20,4.0,-2.0), # FIXME
141143
# :evadd => InstructionCost(4,0.5),
142144
# :evsub => InstructionCost(4,0.5),
143145
# :evmul => InstructionCost(4,0.5),
@@ -152,7 +154,7 @@ const COST = Dict{Symbol,InstructionCost}(
152154
:reduce_to_prod => InstructionCost(0,0.0,0.0,0),
153155
:abs => InstructionCost(1, 0.5),
154156
:abs2 => InstructionCost(4,0.5),
155-
# :vabs2 => InstructionCost(4,0.5),
157+
:abs2_fast => InstructionCost(4,0.5),
156158
:(==) => InstructionCost(1, 0.5),
157159
:(!=) => InstructionCost(1, 0.5),
158160
:(isnan) => InstructionCost(1, 0.5),
@@ -179,19 +181,26 @@ const COST = Dict{Symbol,InstructionCost}(
179181
:iseven => InstructionCost(1, 0.5),
180182
:max => InstructionCost(4,0.5),
181183
:min => InstructionCost(4,0.5),
184+
:max_fast => InstructionCost(4,0.5),
185+
:min_fast => InstructionCost(4,0.5),
182186
:relu => InstructionCost(4,0.5),
183187
# Instruction(:ifelse) => InstructionCost(1, 0.5),
184188
:ifelse => InstructionCost(1, 0.5),
185189
:inv => InstructionCost(13,4.0,-2.0,1),
190+
:inv_fast => InstructionCost(10,4.0,-2.0,1), # FIXME
186191
# :vinv => InstructionCost(13,4.0,-2.0,1),
187192
:muladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
188193
:fma => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
189-
# :vmuladd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
190-
# :vfma => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
194+
:vmuladd_fast => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
195+
:vfma_fast => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
191196
:vfmadd => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
192197
:vfmsub => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
193198
:vfnmadd => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
194199
:vfnmsub => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
200+
:vfmadd_fast => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
201+
:vfmsub_fast => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
202+
:vfnmadd_fast => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
203+
:vfnmsub_fast => InstructionCost(4,0.5), # - and -* will fuse into this, so much of the time they're not twice as expensive
195204
:vfmadd231 => InstructionCost(4,0.5), # + and * will fuse into this, so much of the time they're not twice as expensive
196205
:vfmsub231 => InstructionCost(4,0.5), # - and * will fuse into this, so much of the time they're not twice as expensive
197206
:vfnmadd231 => InstructionCost(4,0.5), # + and -* will fuse into this, so much of the time they're not twice as expensive
@@ -289,9 +298,15 @@ const REDUCTION_CLASS = Dict{Symbol,Float64}(
289298
:* => MULTIPLICATIVE_IN_REDUCTIONS,
290299
:vadd => ADDITIVE_IN_REDUCTIONS,
291300
:vsub => ADDITIVE_IN_REDUCTIONS,
301+
:add_fast => ADDITIVE_IN_REDUCTIONS,
302+
:sub_fast => ADDITIVE_IN_REDUCTIONS,
303+
:vadd_fast => ADDITIVE_IN_REDUCTIONS,
304+
:vsub_fast => ADDITIVE_IN_REDUCTIONS,
292305
# :vadd! => ADDITIVE_IN_REDUCTIONS,
293306
# :vsub! => ADDITIVE_IN_REDUCTIONS,
294307
:vmul => MULTIPLICATIVE_IN_REDUCTIONS,
308+
:mul_fast => MULTIPLICATIVE_IN_REDUCTIONS,
309+
:vmul_fast => MULTIPLICATIVE_IN_REDUCTIONS,
295310
# :vmul! => MULTIPLICATIVE_IN_REDUCTIONS,
296311
# :evadd => ADDITIVE_IN_REDUCTIONS,
297312
# :evsub => ADDITIVE_IN_REDUCTIONS,
@@ -300,12 +315,16 @@ const REDUCTION_CLASS = Dict{Symbol,Float64}(
300315
:| => ANY,
301316
:muladd => ADDITIVE_IN_REDUCTIONS,
302317
:fma => ADDITIVE_IN_REDUCTIONS,
303-
# :vmuladd => ADDITIVE_IN_REDUCTIONS,
304-
# :vfma => ADDITIVE_IN_REDUCTIONS,
318+
:vmuladd_fast => ADDITIVE_IN_REDUCTIONS,
319+
:vfma_fast => ADDITIVE_IN_REDUCTIONS,
305320
:vfmadd => ADDITIVE_IN_REDUCTIONS,
306321
:vfmsub => ADDITIVE_IN_REDUCTIONS,
307322
:vfnmadd => ADDITIVE_IN_REDUCTIONS,
308323
:vfnmsub => ADDITIVE_IN_REDUCTIONS,
324+
:vfmadd_fast => ADDITIVE_IN_REDUCTIONS,
325+
:vfmsub_fast => ADDITIVE_IN_REDUCTIONS,
326+
:vfnmadd_fast => ADDITIVE_IN_REDUCTIONS,
327+
:vfnmsub_fast => ADDITIVE_IN_REDUCTIONS,
309328
:vfmadd231 => ADDITIVE_IN_REDUCTIONS,
310329
:vfmsub231 => ADDITIVE_IN_REDUCTIONS,
311330
:vfnmadd231 => ADDITIVE_IN_REDUCTIONS,
@@ -314,22 +333,20 @@ const REDUCTION_CLASS = Dict{Symbol,Float64}(
314333
# :vfnmadd! => ADDITIVE_IN_REDUCTIONS,
315334
# :vfmsub! => ADDITIVE_IN_REDUCTIONS,
316335
# :vfnmsub! => ADDITIVE_IN_REDUCTIONS,
317-
:vfmadd_fast => ADDITIVE_IN_REDUCTIONS,
318-
:vfmsub_fast => ADDITIVE_IN_REDUCTIONS,
319-
:vfnmadd_fast => ADDITIVE_IN_REDUCTIONS,
320-
:vfnmsub_fast => ADDITIVE_IN_REDUCTIONS,
321336
:reduced_add => ADDITIVE_IN_REDUCTIONS,
322337
:reduced_prod => MULTIPLICATIVE_IN_REDUCTIONS,
323338
:reduced_all => ALL,
324339
:reduced_any => ANY,
325340
:max => MAX,
326-
:min => MIN
341+
:min => MIN,
342+
:max_fast => MAX,
343+
:min_fast => MIN
327344
)
328345
reduction_instruction_class(instr::Symbol) = get(REDUCTION_CLASS, instr, NaN)
329346
reduction_instruction_class(instr::Instruction) = reduction_instruction_class(instr.instr)
330347
function reduction_to_single_vector(x::Float64)
331348
# x == 1.0 ? :evadd : x == 2.0 ? :evmul : x == 3.0 ? :vor : x == 4.0 ? :vand : x == 5.0 ? :max : x == 6.0 ? :min : throw("Reduction not found.")
332-
x == ADDITIVE_IN_REDUCTIONS ? :vadd : x == MULTIPLICATIVE_IN_REDUCTIONS ? :vmul : x == MAX ? :max : x == MIN ? :min : throw("Reduction not found.")
349+
x == ADDITIVE_IN_REDUCTIONS ? :(+) : x == MULTIPLICATIVE_IN_REDUCTIONS ? :(*) : x == MAX ? :max : x == MIN ? :min : throw("Reduction not found.")
333350
end
334351
reduction_to_single_vector(x) = reduction_to_single_vector(reduction_instruction_class(x))
335352
# function reduction_to_scalar(x::Float64)
@@ -366,21 +383,22 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}(
366383
typeof(+) => :(+),
367384
typeof(VectorizationBase.vadd) => :(+),
368385
# typeof(VectorizationBase.vadd!) => :(+),
369-
typeof(Base.FastMath.add_fast) => :(+),
386+
typeof(Base.FastMath.add_fast) => :add_fast,
370387
typeof(-) => :(-),
371388
typeof(VectorizationBase.vsub) => :(-),
372389
# typeof(VectorizationBase.vsub!) => :(-),
373-
typeof(Base.FastMath.sub_fast) => :(-),
390+
typeof(Base.FastMath.sub_fast) => :sub_fast,
374391
typeof(*) => :(*),
375392
typeof(VectorizationBase.vmul) => :(*),
376393
# typeof(VectorizationBase.vmul!) => :(*),
377-
typeof(Base.FastMath.mul_fast) => :(*),
394+
typeof(Base.FastMath.mul_fast) => :mul_fast,
378395
typeof(/) => :(/),
379396
typeof(^) => :(^),
380397
# typeof(VectorizationBase.vfdiv) => :(/),
381398
# typeof(VectorizationBase.vfdiv!) => :(/),
382399
typeof(VectorizationBase.vdiv) => :(/),
383-
typeof(Base.FastMath.div_fast) => :(/),
400+
typeof(Base.FastMath.div_fast) => :div_fast,
401+
typeof(Base.FastMath.rem_fast) => :rem_fast,
384402
typeof(==) => :(==),
385403
typeof(!=) => :(!=),
386404
typeof(isequal) => :isequal,
@@ -389,6 +407,7 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}(
389407
typeof(isfinite) => :isfinite,
390408
typeof(abs) => :abs,
391409
typeof(abs2) => :abs2,
410+
typeof(abs2_fast) => :abs2_fast,
392411
typeof(~) => :(~),
393412
typeof(!) => :(!),
394413
typeof(&) => :(&),
@@ -399,14 +418,19 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}(
399418
typeof(>=) => :(>=),
400419
typeof(<=) => :(<=),
401420
typeof(inv) => :inv,
421+
typeof(inv_fast) => :inv_fast,
402422
typeof(muladd) => :muladd,
403423
typeof(fma) => :fma,
404-
# typeof(VectorizationBase.vmuladd) => :vmuladd,
405-
# typeof(VectorizationBase.vfma) => :vfma,
406-
typeof(VectorizationBase.vfmadd) => :vfmadd,
424+
typeof(VectorizationBase.vfma) => :vfma,
425+
typeof(VectorizationBase.vmuladd) => :vmuladd,
407426
typeof(VectorizationBase.vfmsub) => :vfmsub,
408427
typeof(VectorizationBase.vfnmadd) => :vfnmadd,
409428
typeof(VectorizationBase.vfnmsub) => :vfnmsub,
429+
typeof(VectorizationBase.vfma_fast) => :vfma_fast,
430+
typeof(VectorizationBase.vmuladd_fast) => :vmuladd_fast,
431+
typeof(VectorizationBase.vfmsub_fast) => :vfmsub_fast,
432+
typeof(VectorizationBase.vfnmadd_fast) => :vfnmadd_fast,
433+
typeof(VectorizationBase.vfnmsub_fast) => :vfnmsub_fast,
410434
typeof(VectorizationBase.vfmadd231) => :vfmadd231,
411435
typeof(VectorizationBase.vfmsub231) => :vfmsub231,
412436
typeof(VectorizationBase.vfnmadd231) => :vfnmadd231,
@@ -454,6 +478,8 @@ const FUNCTIONSYMBOLS = IdDict{Type{<:Function},Instruction}(
454478
# typeof(SLEEFPirates.tanh_fast) => :tanh_fast,
455479
typeof(max) => :max,
456480
typeof(min) => :min,
481+
typeof(max_fast) => :max_fast,
482+
typeof(min_fast) => :min_fast,
457483
typeof(relu) => :relu,
458484
typeof(<<) => :<<,
459485
typeof(>>) => :>>,

src/determinestrategy.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ const CACHELINE_SIZE = something(VectorizationBase.L₁CACHE.linesize, 64)
1313
# for opp ∈ parents(op)
1414
# newapp += indexappearences(opp, s)
1515
# end
16-
# factor = instruction(op).instr ∈ (:+, :vadd, :add_fast, :evadd) ? 1 : 10
16+
# factor = instruction(op).instr ∈ (:+, :vadd, :add_fast, :vadd_fast) ? 1 : 10
1717
# newapp * factor
1818
# end
1919
function check_linear_parents(ls::LoopSet, op::Operation, s::Symbol)
@@ -82,7 +82,7 @@ function cost(ls::LoopSet, op::Operation, vectorized::Symbol, Wshift::Int, size_
8282
instr = instruction(op)
8383
# instr = instruction(op)
8484
if length(parents(op)) == 1
85-
if instr == Instruction(:-) || instr === Instruction(:vsub) || instr == Instruction(:+) || instr == Instruction(:vadd)
85+
if instr == Instruction(:-) || instr === Instruction(:sub_fast) || instr == Instruction(:+) || instr == Instruction(:add_fast)
8686
return 0.0, 0, 0.0
8787
end
8888
elseif iscompute(op) && all(opp -> (isloopvalue(opp) | isconstant(opp)), parents(op))

src/filter.jl

Lines changed: 27 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -1,42 +1,42 @@
11

2-
if (Base.libllvm_version ≥ v"7" && VectorizationBase.AVX512F) || Base.libllvm_version ≥ v"9"
3-
function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T <: NativeTypes}
4-
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
5-
N = length(y)
6-
Nrep = N >>> Wshift
7-
Nrem = N & (W - 1)
8-
j = 0
9-
st = VectorizationBase.static_sizeof(T)
10-
zero_index = MM{W}(Static(0), st)
11-
GC.@preserve x y begin
12-
ptr_x = pointer(x)
13-
ptr_y = pointer(y)
14-
for _ ∈ 1:Nrep
15-
vy = vload(ptr_y, zero_index)
16-
mask = f(vy)
17-
VectorizationBase.compressstore!(gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, mask)
18-
ptr_y = gep(ptr_y, VectorizationBase.REGISTER_SIZE)
19-
j = vadd(j, count_ones(mask))
20-
end
21-
rem_mask = VectorizationBase.mask(T, Nrem)
22-
vy = vload(ptr_y, zero_index, rem_mask)
23-
mask = rem_mask & f(vy)
2+
function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T <: NativeTypes}
3+
W, Wshift = VectorizationBase.pick_vector_width_shift(T)
4+
N = length(y)
5+
Nrep = N >>> Wshift
6+
Nrem = N & (W - 1)
7+
j = 0
8+
st = VectorizationBase.static_sizeof(T)
9+
zero_index = MM{W}(Static(0), st)
10+
GC.@preserve x y begin
11+
ptr_x = pointer(x)
12+
ptr_y = pointer(y)
13+
for _ ∈ 1:Nrep
14+
vy = vload(ptr_y, zero_index)
15+
mask = f(vy)
2416
VectorizationBase.compressstore!(gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, mask)
25-
j = vadd(j, count_ones(mask))
26-
Base._deleteend!(x, N-j) # resize!(x, j)
17+
ptr_y = gep(ptr_y, VectorizationBase.REGISTER_SIZE)
18+
j = vadd_fast(j, count_ones(mask))
2719
end
28-
x
20+
rem_mask = VectorizationBase.mask(T, Nrem)
21+
vy = vload(ptr_y, zero_index, rem_mask)
22+
mask = rem_mask & f(vy)
23+
VectorizationBase.compressstore!(gep(ptr_x, VectorizationBase.lazymul(st, j)), vy, mask)
24+
j = vadd_fast(j, count_ones(mask))
25+
Base._deleteend!(x, N-j) # resize!(x, j)
2926
end
30-
vfilter!(f::F, x::Vector{T}) where {F, T<:NativeTypes} = vfilter!(f, x, x)
31-
vfilter(f::F, y::AbstractArray{T}) where {F, T<:NativeTypes} = vfilter!(f, Vector{T}(undef, length(y)), y)
27+
x
3228
end
29+
vfilter!(f::F, x::Vector{T}) where {F, T<:NativeTypes} = vfilter!(f, x, x)
30+
vfilter(f::F, y::AbstractArray{T}) where {F, T<:NativeTypes} = vfilter!(f, Vector{T}(undef, length(y)), y)
3331
vfilter(f::F, y) where {F} = filter(f, y)
3432
vfilter!(f::F, y) where {F} = filter!(f, y)
3533

3634
"""
3735
vfilter(f, a::AbstractArray)
3836
3937
SIMD-vectorized `filter`, returning an array containing the elements of `a` for which `f` returns `true`.
38+
39+
This function requires AVX512 to be faster than `Base.filter`, as it adds compressstore instructions.
4040
"""
4141
vfilter
4242

0 commit comments

Comments
 (0)