Skip to content

Commit 80ca38b

Browse files
committed
Various fixes.
1 parent 76a3613 commit 80ca38b

File tree

6 files changed

+39
-5
lines changed

6 files changed

+39
-5
lines changed

benchmark/loadsharedlibs.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,15 +36,15 @@ end
3636
eigenfile = joinpath(LOOPVECBENCHDIR, "looptestseigen.cpp")
3737
if !isfile(LIBEIGENTEST) || mtime(eigenfile) > mtime(LIBEIGENTEST)
3838
# Clang seems to have trouble finding includes
39-
if LoopVectorization.VectorizationBase.has_feature("x86_64_avx512f")
39+
if Bool(LoopVectorization.VectorizationBase.has_feature(Val(:x86_64_avx512f)))
4040
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
4141
else
4242
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
4343
end
4444
end
4545
if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
4646
# run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/c++/9 -I/usr/include/c++/9/x86_64-generic-linux -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
47-
if LoopVectorization.VectorizationBase.has_feature("x86_64_avx512f")
47+
if Bool(LoopVectorization.VectorizationBase.has_feature(Val(:x86_64_avx512f)))
4848
run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
4949
else
5050
run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)

src/filter.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ function vfilter!(f::F, x::Vector{T}, y::AbstractArray{T}) where {F,T <: NativeT
66
Nrem = N & (W - 1)
77
j = 0
88
st = VectorizationBase.static_sizeof(T)
9-
zero_index = MM{W}(Static(0), st)
9+
zero_index = MM(W, Static(0), st)
1010
GC.@preserve x y begin
1111
ptr_x = pointer(x)
1212
ptr_y = pointer(y)

src/map.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -196,7 +196,8 @@ function vmap_multithread!(
196196
# nt = min(Threads.nthreads(), VectorizationBase.SYS_CPU_THREADS, N >> (Wshift + 3))
197197
nt = min(Threads.nthreads(), VectorizationBase.num_cores(), N >> (Wshift + 5))
198198

199-
if !((nt > 1) && iszero(ccall(:jl_in_threaded_region, Cint, ())))
199+
# if !((nt > 1) && iszero(ccall(:jl_in_threaded_region, Cint, ())))
200+
if nt < 2
200201
vmap_singlethread!(f, ptry, Zero(), N, Val{NonTemporal}(), ptrargs)
201202
return
202203
end

src/user_api_conveniences.jl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ function matmul_params(rs::Int, rc::Int, cls::Int)
1818
order[5], last(order)
1919
end
2020
@generated function matmul_params(::StaticInt{RS}, ::StaticInt{RC}, ::StaticInt{CLS}) where {RS,RC,CLS}
21+
mᵣ, nᵣ = matmul_params(RS, RC, CLS)
2122
Expr(:tuple, Expr(:call, Expr(:curly, :StaticInt, mᵣ)), Expr(:call, Expr(:curly, :StaticInt, nᵣ)))
2223
end
2324
matmul_params() = matmul_params(register_size(), register_count(), cache_linesize())

test/dot.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,7 @@ using Test
298298
@test πest == pi_avx_u4(a, b)
299299
end
300300

301-
if !(!LoopVectorization.VectorizationBase.has_feature("x86_64_avx2") && T === Int32)
301+
if !(!Bool(LoopVectorization.VectorizationBase.has_feature(Val(:x86_64_avx2))) && T === Int32)
302302
@test dotloopinductvarpow(a) ≈ dotloopinductvarpowavx(a)
303303
end
304304
@test dot_from_n_to_100(a, b, 33) == @views mydotavx(a[33:100], b[33:100])

test/gemv.jl

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,27 @@ using Test
206206
return out
207207
end
208208

209+
210+
function multiple_muls!(Y, dY, A, dA, b, db)
211+
mul!(dY, dA, b)
212+
# much of the cost is in memory bandwidth for traversing `A`, so we group the two together
213+
mul!(Y, A, b)
214+
mul!(dY, A, db, true, true)
215+
nothing
216+
end
217+
function multiple_muls_avx!(Y, dY, A, dA, b, db)
218+
@avx for m ∈ axes(A,1)
219+
dy = 0.0
220+
y = 0.0
221+
for n ∈ axes(A,2)
222+
dy += dA[m,n] * b[n] + A[m,n] * db[n]
223+
y += A[m,n] * b[n]
224+
end
225+
dY[m] = dy
226+
Y[m] = y
227+
end
228+
end
229+
209230
M, K, N = 51, 49, 61
210231
for T ∈ (Float32, Float64, Int32, Int64)
211232
@show T, @__LINE__
@@ -286,5 +307,16 @@ using Test
286307
out1 = similar(A, 11); out2 = similar(out1);
287308
@test reinterpret(T,tuplemul!(out1, A, b)) ≈ reinterpret(T,tuplemulavx!(out2, A, b))
288309

310+
311+
A = rand(R, N, N); dA = rand(R, N, N);
312+
b = rand(R, N); db = rand(R, N);
313+
Y0 = Vector{TC}(undef, N); Y1 = similar(Y0);
314+
dY0 = similar(Y0); dY1 = similar(Y0);
315+
316+
multiple_muls!(Y0, dY0, A, dA, b, db)
317+
multiple_muls_avx!(Y1, dY1, A, dA, b, db)
318+
@test Y0 ≈ Y1
319+
@test dY0 ≈ dY1
320+
289321
end
290322
end

0 commit comments

Comments
 (0)