Skip to content

Commit f926fdd

Browse files
committed
Track the change in VectorizationBase and SIMDPirates: indices are now expressed in number of bytes.
1 parent 7cba99f commit f926fdd

File tree

7 files changed

+107
-107
lines changed

7 files changed

+107
-107
lines changed

src/LoopVectorization.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ using VectorizationBase: REGISTER_SIZE, REGISTER_COUNT, extract_data, num_vector
77
Static, StaticUnitRange, StaticLowerUnitRange, StaticUpperUnitRange, unwrap, maybestaticrange,
88
AbstractColumnMajorStridedPointer, AbstractRowMajorStridedPointer, AbstractSparseStridedPointer, AbstractStaticStridedPointer,
99
PackedStridedPointer, SparseStridedPointer, RowMajorStridedPointer, StaticStridedPointer, StaticStridedStruct,
10-
maybestaticfirst, maybestaticlast, scalar_less, scalar_greater, noalias!, gesp
10+
maybestaticfirst, maybestaticlast, scalar_less, scalar_greater, noalias!, gesp, gepbyte
1111
using SIMDPirates: VECTOR_SYMBOLS, evadd, evsub, evmul, evfdiv, vrange,
1212
reduced_add, reduced_prod, reduce_to_add, reduced_max, reduced_min, vsum, vprod, vmaximum, vminimum,
1313
sizeequivalentfloat, sizeequivalentint, vadd!, vsub!, vmul!, vfdiv!, vfmadd!, vfnmadd!, vfmsub!, vfnmsub!,

src/broadcast.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ end
118118
Base.@propagate_inbounds Base.getindex(A::LowDimArray, i...) = getindex(A.data, i...)
119119
Base.size(A::LowDimArray) = Base.size(A.data)
120120
@generated function VectorizationBase.stridedpointer(A::LowDimArray{D,T,N}) where {D,T,N}
121-
s = Expr(:tuple, [Expr(:ref, :strideA, n) for n ∈ 1+D[1]:N if D[n]]...)
121+
s = Expr(:call, Expr(:(.), Expr(:(.), :LoopVectorization, QuoteNode(:VectorizationBase)), QuoteNode(:staticmul)), T, Expr(:tuple, [Expr(:ref, :strideA, n) for n ∈ 1+D[1]:N if D[n]]...))
122122
f = D[1] ? :PackedStridedPointer : :SparseStridedPointer
123123
Expr(:block, Expr(:meta,:inline), Expr(:(=), :strideA, Expr(:call, :strides, Expr(:(.), :A, QuoteNode(:data)))),
124124
Expr(:call, Expr(:(.), :VectorizationBase, QuoteNode(f)), Expr(:call, :pointer, Expr(:(.), :A, QuoteNode(:data))), s))

src/filter.jl

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,23 +5,22 @@ if (Base.libllvm_version ≥ v"7" && VectorizationBase.AVX512F) || Base.libllvm_
55
N = length(y)
66
Nrep = N >>> Wshift
77
Nrem = N & (W - 1)
8-
i = 0
98
j = 0
109
GC.@preserve x y begin
1110
ptr_x = pointer(x)
1211
ptr_y = pointer(y)
1312
for _ ∈ 1:Nrep
14-
vy = vload(Vec{W,T}, ptr_y, i)
13+
vy = vload(Vec{W,T}, ptr_y)
1514
mask = f(SVec(vy))
1615
SIMDPirates.compressstore!(gep(ptr_x, j), vy, mask)
17-
i += W
18-
j += count_ones(mask)
16+
ptr_y = gepbyte(ptr_y, VectorizationBase.REGISTER_SIZE)
17+
j = vadd(j, count_ones(mask))
1918
end
2019
rem_mask = VectorizationBase.mask(T, Nrem)
21-
vy = vload(Vec{W,T}, gep(ptr_y, i), rem_mask)
20+
vy = vload(Vec{W,T}, ptr_y, rem_mask)
2221
mask = rem_mask & f(SVec(vy))
2322
SIMDPirates.compressstore!(gep(ptr_x, j), vy, mask)
24-
j += count_ones(mask)
23+
j = vadd(j, count_ones(mask))
2524
Base._deleteend!(x, N-j) # resize!(x, j)
2625
end
2726
x

src/map.jl

Lines changed: 17 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@ function vmap_quote(N, ::Type{T}) where {T}
44
val = Expr(:call, Expr(:curly, :Val, W))
55
q = Expr(:block, Expr(:(=), :M, Expr(:call, :length, :dest)), Expr(:(=), :vdest, Expr(:call, :pointer, :dest)), Expr(:(=), :m, 0))
66
fcall = Expr(:call, :f)
7-
loopbody = Expr(:block, Expr(:call, :vstore!, :vdest, fcall, :m), Expr(:(+=), :m, W))
7+
loopbody = Expr(:block, Expr(:call, :vstore!, Expr(:call, :gep, :vdest, :m), fcall), Expr(:(+=), :m, W))
88
fcallmask = Expr(:call, :f)
9-
bodymask = Expr(:block, Expr(:(=), :__mask__, Expr(:call, :mask, val, Expr(:call, :&, :M, W-1))), Expr(:call, :vstore!, :vdest, fcallmask, :m, :__mask__))
9+
bodymask = Expr(:block, Expr(:(=), :__mask__, Expr(:call, :mask, val, Expr(:call, :&, :M, W-1))), Expr(:call, :vstore!, Expr(:call, :gep, :vdest, :m), fcallmask, :__mask__))
1010
for n ∈ 1:N
1111
arg_n = Symbol(:varg_,n)
1212
push!(q.args, Expr(:(=), arg_n, Expr(:macrocall, Symbol("@inbounds"), LineNumberNode(@__LINE__,Symbol(@__FILE__)), Expr(:call, :pointer, Expr(:ref, :args, n)))))
13-
push!(fcall.args, Expr(:call, :vload, val, arg_n, :m))
14-
push!(fcallmask.args, Expr(:call, :vload, val, arg_n, :m, :__mask__))
13+
push!(fcall.args, Expr(:call, :vload, val, Expr(:call, :gep, arg_n, :m)))
14+
push!(fcallmask.args, Expr(:call, :vload, val, Expr(:call, :gep, arg_n, :m), :__mask__))
1515
end
1616
loop = Expr(:for, Expr(:(=), :_, Expr(:call, :(:), 0, Expr(:call, :-, Expr(:call, :(>>>), :M, Wshift), 1))), loopbody)
1717
push!(q.args, loop)
@@ -114,17 +114,17 @@ function vmapnt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,A
114114
W = VectorizationBase.pick_vector_width(T)
115115
V = VectorizationBase.pick_vector_width_val(T)
116116
while i < N - ((W << 2) - 1)
117-
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
118-
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
119-
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
120-
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
117+
vstorent!(gep(ptry, i), extract_data(f(vload.(V, gep.(ptrargs, i))...))); i += W
118+
vstorent!(gep(ptry, i), extract_data(f(vload.(V, gep.(ptrargs, i))...))); i += W
119+
vstorent!(gep(ptry, i), extract_data(f(vload.(V, gep.(ptrargs, i))...))); i += W
120+
vstorent!(gep(ptry, i), extract_data(f(vload.(V, gep.(ptrargs, i))...))); i += W
121121
end
122122
while i < N - (W - 1) # stops at 16 when
123-
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
123+
vstorent!(gep(ptry, i), extract_data(f(vload.(V, gep.(ptrargs, i))...))); i += W
124124
end
125-
if i < N
125+
if i < N
126126
m = mask(T, N & (W - 1))
127-
vstore!(ptry, extract_data(f(vload.(V, ptrargs, i, m)...)), i, m)
127+
vstore!(gep(ptry, i), extract_data(f(vload.(V, gep.(ptrargs, i), m)...)), m)
128128
end
129129
y
130130
end
@@ -143,18 +143,18 @@ function vmapntt!(f::F, y::AbstractVector{T}, args::Vararg{<:Any,A}) where {F,T,
143143
Niter = N >>> Wsh
144144
Base.Threads.@threads for j ∈ 0:Niter-1
145145
i = j << Wsh
146-
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
147-
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
148-
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i); i += W
149-
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, i)...)), i)
146+
vstorent!(gep(ptry, i), extract_data(f(vload.(V, gep.(ptrargs, i))...))); i += W
147+
vstorent!(gep(ptry, i), extract_data(f(vload.(V, gep.(ptrargs, i))...))); i += W
148+
vstorent!(gep(ptry, i), extract_data(f(vload.(V, gep.(ptrargs, i))...))); i += W
149+
vstorent!(gep(ptry, i), extract_data(f(vload.(V, gep.(ptrargs, i))...)))
150150
end
151151
ii = Niter << Wsh
152152
while ii < N - (W - 1) # stops at 16 when
153-
vstorent!(ptry, extract_data(f(vload.(V, ptrargs, ii)...)), ii); ii += W
153+
vstorent!(gep(ptry, ii), extract_data(f(vload.(V, gep.(ptrargs, ii))...))); ii += W
154154
end
155155
if ii < N
156156
m = mask(T, N & (W - 1))
157-
vstore!(ptry, extract_data(f(vload.(V, ptrargs, ii, m)...)), ii, m)
157+
vstore!(gep(ptry, ii), extract_data(f(vload.(V, gep.(ptrargs, ii), m)...)), m)
158158
end
159159
y
160160
end

src/mapreduce.jl

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ function mapreduce_simple(f::F, op::OP, args::Vararg{DenseArray{T},A}) where {F,
2020
iszero(N) && throw("Length of vector is 0!")
2121
a_0 = f(vload.(ptrargs)...); i = 1
2222
while i < N
23-
a_0 = op(a_0, f(vload.(ptrargs, i)...)); i += 1
23+
a_0 = op(a_0, f(vload.(gep.(ptrargs, i))...)); i += 1
2424
end
2525
a_0
2626
end
@@ -38,28 +38,28 @@ function vmapreduce(f::F, op::OP, args::Vararg{DenseArray{T},A}) where {F,OP,T<:
3838
V = VectorizationBase.pick_vector_width_val(T)
3939
N < W && return mapreduce_simple(f, op, args...)
4040
ptrargs = pointer.(args)
41-
41+
4242
a_0 = f(vload.(V, ptrargs)...); i = W
4343
if N ≥ 4W
44-
a_1 = f(vload.(V, ptrargs, i)...); i += W
45-
a_2 = f(vload.(V, ptrargs, i)...); i += W
46-
a_3 = f(vload.(V, ptrargs, i)...); i += W
44+
a_1 = f(vload.(V, gep.(ptrargs, i))...); i += W
45+
a_2 = f(vload.(V, gep.(ptrargs, i))...); i += W
46+
a_3 = f(vload.(V, gep.(ptrargs, i))...); i += W
4747
while i < N - ((W << 2) - 1)
48-
a_0 = op(a_0, f(vload.(V, ptrargs, i)...)); i += W
49-
a_1 = op(a_1, f(vload.(V, ptrargs, i)...)); i += W
50-
a_2 = op(a_2, f(vload.(V, ptrargs, i)...)); i += W
51-
a_3 = op(a_3, f(vload.(V, ptrargs, i)...)); i += W
48+
a_0 = op(a_0, f(vload.(V, gep.(ptrargs, i))...)); i += W
49+
a_1 = op(a_1, f(vload.(V, gep.(ptrargs, i))...)); i += W
50+
a_2 = op(a_2, f(vload.(V, gep.(ptrargs, i))...)); i += W
51+
a_3 = op(a_3, f(vload.(V, gep.(ptrargs, i))...)); i += W
5252
end
5353
a_0 = op(a_0, a_1)
5454
a_2 = op(a_2, a_3)
5555
a_0 = op(a_0, a_2)
5656
end
5757
while i < N - (W - 1)
58-
a_0 = op(a_0, f(vload.(V, ptrargs, i)...)); i += W
58+
a_0 = op(a_0, f(vload.(V, gep.(ptrargs, i))...)); i += W
5959
end
6060
if i < N
6161
m = mask(T, N & (W - 1))
62-
a_0 = vifelse(m, op(a_0, f(vload.(V, ptrargs, i)...)), a_0)
62+
a_0 = vifelse(m, op(a_0, f(vload.(V, gep.(ptrargs, i))...)), a_0)
6363
end
6464
vreduce(op, a_0)
6565
end

src/reconstruct_loopset.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -472,7 +472,7 @@ Execute an `@avx` block. The block's code is represented via the arguments:
472472
- `vargs...` holds the encoded pointers of all the arrays (see `VectorizationBase`'s various pointer types).
473473
"""
474474
@generated function _avx_!(::Val{UNROLL}, ::Type{OPS}, ::Type{ARF}, ::Type{AM}, ::Type{LPSYM}, lb::LB, vargs...) where {UNROLL, OPS, ARF, AM, LPSYM, LB}
475-
# 1 + 1 # Irrelevant line you can comment out/in to force recompilation...
475+
1 + 1 # Irrelevant line you can comment out/in to force recompilation...
476476
ls = _avx_loopset(OPS.parameters, ARF.parameters, AM.parameters, LPSYM.parameters, LB.parameters, vargs)
477477
# @show avx_body(ls, UNROLL)
478478
avx_body(ls, UNROLL)

0 commit comments

Comments
 (0)