Skip to content

Commit d63ebff

Browse files
committed
Reran benchmarks after switching to use OpenBLAS_jll and MKL_jll.
1 parent f645381 commit d63ebff

24 files changed

+41
-35
lines changed

benchmark/Manifest.toml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -440,9 +440,7 @@ uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
440440

441441
[[LoopVectorization]]
442442
deps = ["DocStringExtensions", "LinearAlgebra", "OffsetArrays", "SIMDPirates", "SLEEFPirates", "UnPack", "VectorizationBase"]
443-
git-tree-sha1 = "1a9a648f0aa612fdc36acb784ba47d77e8707383"
444-
repo-rev = "master"
445-
repo-url = "https://github.com/chriselrod/LoopVectorization.jl.git"
443+
path = "/home/chriselrod/.julia/dev/LoopVectorization"
446444
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
447445
version = "0.8.2"
448446

benchmark/benchmarkflops.jl

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -157,10 +157,10 @@ function dot_bench!(br, s, i)
157157
@assert edot(a,b) ≈ dotblas "eigen dot wrong?"
158158
br[8,i] = n_gflop / @belapsed iedot($a, $b)
159159
@assert iedot(a,b) ≈ dotblas "i-eigen dot wrong?"
160-
br[9,i] = n_gflop / @belapsed dot($a, $b)
160+
# br[9,i] = n_gflop / @belapsed dot($a, $b)
161161
end
162162
function benchmark_dot(sizes)
163-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "OpenBLAS"]
163+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3"]#, "OpenBLAS"]
164164
br = BenchmarkResult(tests, sizes)
165165
sm = br.sizedresults.results
166166
pmap(is -> dot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -186,10 +186,10 @@ function selfdot_bench!(br, s, i)
186186
@assert eselfdot(a) ≈ dotblas "eigen dot wrong?"
187187
br[8,i] = n_gflop / @belapsed ieselfdot($a)
188188
@assert ieselfdot(a) ≈ dotblas "i-eigen dot wrong?"
189-
br[9,i] = n_gflop / @belapsed dot($a, $a)
189+
# br[9,i] = n_gflop / @belapsed dot($a, $a)
190190
end
191191
function benchmark_selfdot(sizes)
192-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "OpenBLAS"]
192+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3"]#, "OpenBLAS"]
193193
br = BenchmarkResult(tests, sizes)
194194
sm = br.sizedresults.results
195195
pmap(is -> selfdot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -284,6 +284,7 @@ function benchmark_dot3(sizes)
284284
pmap(is -> dot3_bench!(sm, is[2], is[1]), enumerate(sizes))
285285
br
286286
end
287+
BLAS.set_num_threads(1)
287288
function sse!(Xβ, y, X, β)
288289
mul!(copyto!(Xβ, y), X, β, 1.0, -1.0)
289290
dot(Xβ, Xβ)
@@ -317,7 +318,7 @@ function sse_bench!(br, s, i)
317318
br[9,i] = n_gflop / @belapsed sse!($Xβ, $y, $X, $β)
318319
end
319320
function benchmark_sse(sizes)
320-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "OpenBLAS"]
321+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "MKL" : "OpenBLAS"]
321322
br = BenchmarkResult(tests, sizes)
322323
sm = br.sizedresults.results
323324
pmap(is -> sse_bench!(sm, is[2], is[1]), enumerate(sizes))

benchmark/loadsharedlibs.jl

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,13 @@ using LoopVectorization.VectorizationBase: REGISTER_SIZE
44
# const LOOPVECBENCHDIR = joinpath(pkgdir(LoopVectorization), "benchmark")
55
include(joinpath(LOOPVECBENCHDIR, "looptests.jl"))
66

7+
const LIBCTEST = joinpath(LOOPVECBENCHDIR, "libctests.so")
8+
const LIBFTEST = joinpath(LOOPVECBENCHDIR, "libftests.so")
9+
const LIBICTEST = joinpath(LOOPVECBENCHDIR, "libictests.so")
10+
const LIBIFTEST = joinpath(LOOPVECBENCHDIR, "libiftests.so")
11+
const LIBEIGENTEST = joinpath(LOOPVECBENCHDIR, "libetest.so")
12+
const LIBIEIGENTEST = joinpath(LOOPVECBENCHDIR, "libietest.so")
13+
714

815
# requires Clang with polly to build
916
cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
@@ -28,12 +35,19 @@ end
2835
eigenfile = joinpath(LOOPVECBENCHDIR, "looptestseigen.cpp")
2936
if !isfile(LIBEIGENTEST) || mtime(eigenfile) > mtime(LIBEIGENTEST)
3037
# Clang seems to have trouble finding includes
31-
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
32-
38+
if LoopVectorization.VectorizationBase.AVX512F
39+
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
40+
else
41+
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
42+
end
3343
end
3444
if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
3545
# run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/c++/9 -I/usr/include/c++/9/x86_64-generic-linux -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
36-
run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
46+
if LoopVectorization.VectorizationBase.AVX512F
47+
run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
48+
else
49+
run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
50+
end
3751
# run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
3852
end
3953

@@ -47,13 +61,6 @@ end
4761
# end
4862

4963

50-
const LIBCTEST = joinpath(LOOPVECBENCHDIR, "libctests.so")
51-
const LIBFTEST = joinpath(LOOPVECBENCHDIR, "libftests.so")
52-
const LIBICTEST = joinpath(LOOPVECBENCHDIR, "libictests.so")
53-
const LIBIFTEST = joinpath(LOOPVECBENCHDIR, "libiftests.so")
54-
const LIBEIGENTEST = joinpath(LOOPVECBENCHDIR, "libetest.so")
55-
const LIBIEIGENTEST = joinpath(LOOPVECBENCHDIR, "libietest.so")
56-
5764
using MKL_jll, OpenBLAS_jll
5865

5966
const libMKL = Libdl.dlopen(MKL_jll.libmkl_rt)
@@ -105,9 +112,9 @@ function dgemmopenblas!(C::AbstractMatrix{Float64}, A::AbstractMatrix{Float64},
105112
transA, transB, M, N, K, α, pA, ldA, pB, ldB, β, C, ldC
106113
)
107114
end
108-
mkl_set_num_threads(N::Integer) = ccall(MKL_SET_NUM_THREADS, Cvoid, (Ref{UInt32},), Ref(N % UInt32))
115+
mkl_set_num_threads(N::Integer) = ccall(MKL_SET_NUM_THREADS, Cvoid, (Int32,), N % Int32)
109116
mkl_set_num_threads(1)
110-
openblas_set_num_threads(N::Integer) = ccall(OPENBLAS_SET_NUM_THREADS, Cvoid, (Ref{Int64},), Ref(N))
117+
openblas_set_num_threads(N::Integer) = ccall(OPENBLAS_SET_NUM_THREADS, Cvoid, (Int64,), N)
111118
openblas_set_num_threads(1)
112119
function dgemvmkl!(y::AbstractVector{Float64}, A::AbstractMatrix{Float64}, x::AbstractVector{Float64})
113120
transA = istransposed(A)
@@ -123,7 +130,7 @@ function dgemvmkl!(y::AbstractVector{Float64}, A::AbstractMatrix{Float64}, x::Ab
123130
ccall(
124131
DGEMV_MKL, Cvoid,
125132
(Ref{UInt8}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}),
126-
transA, M32, N32, α, A, ldA, x, incx, β, y, incy
133+
transA, M32, N32, α, pA, ldA, x, incx, β, y, incy
127134
)
128135
end
129136
function dgemvopenblas!(y::AbstractVector{Float64}, A::AbstractMatrix{Float64}, x::AbstractVector{Float64})
@@ -138,7 +145,7 @@ function dgemvopenblas!(y::AbstractVector{Float64}, A::AbstractMatrix{Float64},
138145
ccall(
139146
DGEMV_OpenBLAS, Cvoid,
140147
(Ref{UInt8}, Ref{Int64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}),
141-
transA, M, N, α, A, ldA, x, incx, β, y, incy
148+
transA, M, N, α, pA, ldA, x, incx, β, y, incy
142149
)
143150
end
144151

benchmark/looptests.c

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -248,8 +248,8 @@ void filter2d(double* restrict B, double* restrict A, double* restrict K, long M
248248
for (long na = offset; na < N-offset; na++){
249249
for (long ma = offset; ma < M-offset; ma++){
250250
double tmp = 0.0;
251-
for (long mk = -offset; mk < offset + 1; mk++){
252-
for (long nk = -offset; nk < offset + 1; nk++){
251+
for (long nk = -offset; nk < offset + 1; nk++){
252+
for (long mk = -offset; mk < offset + 1; mk++){
253253
tmp += A[(ma+mk) + (na+nk)*M] * K[(mk+offset) + (nk+offset)*(2*offset+1)];
254254
}
255255
}
@@ -262,8 +262,8 @@ void filter2d3x3(double* restrict B, double* restrict A, double* restrict K, lon
262262
for (long na = offset; na < N-offset; na++){
263263
for (long ma = offset; ma < M-offset; ma++){
264264
double tmp = 0.0;
265-
for (long mk = -offset; mk < offset + 1; mk++){
266-
for (long nk = -offset; nk < offset + 1; nk++){
265+
for (long nk = -offset; nk < offset + 1; nk++){
266+
for (long mk = -offset; mk < offset + 1; mk++){
267267
tmp += A[(ma+mk) + (na+nk)*M] * K[(mk+offset) + (nk+offset)*(2*offset+1)];
268268
}
269269
}

benchmark/looptests.f90

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -324,10 +324,8 @@ subroutine filter2d(B, A, K, Ma, Na, offset) BIND(C, name="filter2d")
324324
real(C_double) :: tmp
325325
do concurrent(mma = 1+offset:Ma-offset, nna = 1+offset:Na-offset)
326326
tmp = 0
327-
do mmk = -offset,offset
328-
do nnk = -offset,offset
329-
tmp = tmp + A(mma + mmk, nna + nnk) * K(mmk, nnk)
330-
end do
327+
do concurrent(mmk = -offset:offset, nnk = -offset:offset)
328+
tmp = tmp + A(mma + mmk, nna + nnk) * K(mmk, nnk)
331329
end do
332330
B(mma,nna) = tmp
333331
end do
@@ -342,10 +340,8 @@ subroutine filter2d3x3(B, A, K, Ma, Na) BIND(C, name="filter2d3x3")
342340
real(C_double) :: tmp
343341
do concurrent(mma = 1+offset:Ma-offset, nna = 1+offset:Na-offset)
344342
tmp = 0
345-
do mmk = -offset,offset
346-
do nnk = -offset,offset
347-
tmp = tmp + A(mma + mmk, nna + nnk) * K(mmk, nnk)
348-
end do
343+
do concurrent(mmk = -offset:offset, nnk = -offset:offset)
344+
tmp = tmp + A(mma + mmk, nna + nnk) * K(mmk, nnk)
349345
end do
350346
B(mma,nna) = tmp
351347
end do

benchmark/plotbenchmarks.jl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,11 @@ function Gadfly.plot(br::BenchmarkResult)
3535
# sizes = Vector{eltype(brsizes)}(undef, length(res))
3636
tests = replace_and.(@view(br.tests[2:end]))
3737
colors = getcolor.(tests)
38+
addlabel = false
3839

39-
xt = 0:20:260
40+
maxxval, maxxind = findmax(sizes)
41+
maxxtick = 10cld(maxxval,10) + (addlabel ? 20 : 0)
42+
xt = 0:20:maxxtick
4043
maxres = maximum(res)
4144
maxtick = 10round(Int, 0.1maxres)
4245
yt = if iszero(maxtick)
@@ -59,6 +62,7 @@ function Gadfly.plot(br::BenchmarkResult)
5962
for i ∈ eachindex(tests)
6063
push!(p, layer(x = sizes, y = res[i,:], Geom.line, Theme(default_color=colors[i])))
6164
end
65+
addlabel && push!(p, layer(x = fill(maxxtick - 10, length(tests)), y = res[:,maxxind], label=tests, Geom.label(position=:centered)))
6266
p
6367
end
6468

docs/src/assets/bench_AmulB_v1.png

-1.89 KB
Loading

docs/src/assets/bench_AmulBt_v1.png

10.2 KB
Loading

docs/src/assets/bench_Amulvb_v1.png

-5.02 KB
Loading

docs/src/assets/bench_AplusAt_v1.png

142 KB
Loading

0 commit comments

Comments
 (0)