Skip to content

Commit 922b065

Browse files
committed
Run benchmarks.
1 parent c1c8236 commit 922b065

28 files changed

+70
-54
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
1515
[compat]
1616
DocStringExtensions = "0.8"
1717
OffsetArrays = "1"
18-
SIMDPirates = "0.8"
18+
SIMDPirates = "0.8.1"
1919
SLEEFPirates = "0.5"
2020
UnPack = "0,1"
2121
VectorizationBase = "0.12"

benchmark/benchmarkflops.jl

Lines changed: 15 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@ blastests() = [
107107
"LoopVectorization",
108108
"Julia", "Clang",
109109
"GFortran", "icc", "ifort",
110-
"g++ & Eigen-3", "icpc & Eigen-3",
110+
"g++ & Eigen-3", "clang++ & Eigen-3",
111111
"GFortran-builtin", "ifort-builtin",
112112
BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "MKL"
113113
]
@@ -160,7 +160,7 @@ function dot_bench!(br, s, i)
160160
br[9,i] = n_gflop / @belapsed dot($a, $b)
161161
end
162162
function benchmark_dot(sizes)
163-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
163+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
164164
br = BenchmarkResult(tests, sizes)
165165
sm = br.sizedresults.results
166166
pmap(is -> dot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -189,7 +189,7 @@ function selfdot_bench!(br, s, i)
189189
br[9,i] = n_gflop / @belapsed dot($a, $a)
190190
end
191191
function benchmark_selfdot(sizes)
192-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
192+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
193193
br = BenchmarkResult(tests, sizes)
194194
sm = br.sizedresults.results
195195
pmap(is -> selfdot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -258,9 +258,9 @@ function dot3_bench!(br, s, i)
258258
x = rand(M); A = rand(M, N); y = rand(N);
259259
dotblas = dot(x, A, y)
260260
n_gflop = M*N * 3e-9
261-
br[1,i] = n_gflop / @belapsed jdot3v2avx($x, $A, $y)
261+
br[1,i] = n_gflop / @belapsed jdot3avx($x, $A, $y)
262262
@assert jdot3avx(x, A, y) ≈ dotblas "LoopVec dot wrong?"
263-
br[2,i] = n_gflop / @belapsed jdot3v2($x, $A, $y)
263+
br[2,i] = n_gflop / @belapsed jdot3($x, $A, $y)
264264
@assert jdot3(x, A, y) ≈ dotblas "Julia dot wrong?"
265265
br[3,i] = n_gflop / @belapsed cdot3($x, $A, $y)
266266
@assert cdot3(x, A, y) ≈ dotblas "Clang dot wrong?"
@@ -273,11 +273,11 @@ function dot3_bench!(br, s, i)
273273
br[7,i] = n_gflop / @belapsed edot3($x, $A, $y)
274274
@assert edot3(x, A, y) ≈ dotblas "eigen dot wrong?"
275275
br[8,i] = n_gflop / @belapsed iedot3($x, $A, $y)
276-
@assert iedot3(x, A, y) ≈ dotblas "i-eigen dot wrong?"
276+
@assert iedot3(x, A, y) ≈ dotblas "c-eigen dot wrong?"
277277
br[9,i] = n_gflop / @belapsed dot($x, $A, $y)
278278
end
279279
function benchmark_dot3(sizes)
280-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LinearAlgebra" ]
280+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "LinearAlgebra" ]
281281
br = BenchmarkResult(tests, sizes)
282282
sm = br.sizedresults.results
283283
pmap(is -> dot3_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -316,7 +316,7 @@ function sse_bench!(br, s, i)
316316
br[9,i] = n_gflop / @belapsed sse!($Xβ, $y, $X, $β)
317317
end
318318
function benchmark_sse(sizes)
319-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
319+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS"]
320320
br = BenchmarkResult(tests, sizes)
321321
sm = br.sizedresults.results
322322
pmap(is -> sse_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -370,7 +370,7 @@ function aplusBc_bench!(br, s, i)
370370
@assert D ≈ Dcopy "i-eigen wrong?"; fill!(D, NaN);
371371
end
372372
function benchmark_aplusBc(sizes)
373-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3"]
373+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3"]
374374
br = BenchmarkResult(tests, sizes)
375375
sm = br.sizedresults.results
376376
pmap(is -> aplusBc_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -402,7 +402,7 @@ function AplusAt_bench!(br, s, i)
402402
@assert B ≈ baseB "ifort-builtin wrong?"; fill!(B, NaN);
403403
end
404404
function benchmark_AplusAt(sizes)
405-
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "GFortran-builtin", "ifort-builtin"]
405+
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "GFortran-builtin", "ifort-builtin"]
406406
br = BenchmarkResult(tests, sizes)
407407
sm = br.sizedresults.results
408408
pmap(is -> AplusAt_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -453,9 +453,14 @@ function logdettriangle_bench!(br, s, i)
453453
@assert ld ≈ iclogdettriangle(U) "icc wrong?"
454454
br[6,i] = n_gflop / @belapsed iflogdettriangle($U)
455455
@assert ld ≈ iflogdettriangle(U) "ifort wrong?"
456+
# br[7,i] = n_gflop / @belapsed elogdettriangle($U)
457+
# @assert ld ≈ elogdettriangle(U) "eigen wrong?"; fill!(B, NaN);
458+
# br[8,i] = n_gflop / @belapsed ielogdettriangle($U)
459+
# @assert ld ≈ ielogdettriangle(U) "i-eigen wrong?"; fill!(B, NaN);
456460
br[7,i] = n_gflop / @belapsed logdet($U)
457461
end
458462
function benchmark_logdettriangle(sizes)
463+
# tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "clang++ & Eigen-3", "LinearAlgebra"]
459464
tests = ["LoopVectorization", "Julia", "Clang", "GFortran", "icc", "ifort", "LinearAlgebra"]
460465
br = BenchmarkResult(tests, sizes)
461466
sm = br.sizedresults.results

benchmark/driver.jl

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmarks")
33
# includet(joinpath(LOOPVECBENCHDIR, "driver.jl"))
44

5-
using Distributed, LoopVectorization
5+
using Distributed, LoopVectorization, JLD2
66

77
const LOOPVECBENCHDIR = joinpath(pkgdir(LoopVectorization), "benchmark")
88
include(joinpath(LOOPVECBENCHDIR, "benchmarkflops.jl"))
@@ -22,6 +22,9 @@ end
2222
# sizes = 23:23
2323
sizes = 256:-1:2
2424

25+
logdettriangle_bench = benchmark_logdettriangle(sizes); println("logdet(LowerTriangular(A)) benchmark results:"); println(logdettriangle_bench)
26+
dot3_bench = benchmark_dot3(sizes); println("x' * A * y benchmark results:"); println(dot3_bench)
27+
2528
AmulB_bench = benchmark_AmulB(sizes); println("A * B benchmark results:"); println(AmulB_bench)
2629
AmulBt_bench = benchmark_AmulBt(sizes); println("A * B' benchmark results:"); println(AmulBt_bench)
2730
AtmulBt_bench = benchmark_AtmulBt(sizes); println("A' * B' benchmark results:"); println(AtmulBt_bench)
@@ -34,15 +37,13 @@ filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes); println("Benchmark re
3437
filter2d_3x3_bench = benchmark_filter2d3x3(sizes); println("Benchmark results for statically sized 3x3 convolution:"); println(filter2d_3x3_bench)
3538
filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes); println("Benchmark results for unrolled 3x3 convolution:"); println(filter2d_unrolled_bench)
3639

37-
dot3_bench = benchmark_dot3(sizes); println("x' * A * y benchmark results:"); println(dot3_bench)
3840
dot_bench = benchmark_dot(sizes); println("a' * b benchmark results:"); println(dot_bench)
3941
selfdot_bench = benchmark_selfdot(sizes); println("a' * a benchmark results:"); println(selfdot_bench)
4042
sse_bench = benchmark_sse(sizes); println("Benchmark resutls of summing squared error:"); println(sse_bench)
4143
aplusBc_bench = benchmark_aplusBc(sizes); println("Benchmark results of a .+ B .* c':"); println(aplusBc_bench)
4244
AplusAt_bench = benchmark_AplusAt(sizes); println("Benchmark results of A * A':"); println(AplusAt_bench)
4345
vexp_bench = benchmark_exp(sizes); println("Benchmark results of exponentiating a vector:"); println(vexp_bench)
4446
randomaccess_bench = benchmark_random_access(sizes); println("Benchmark results from using a vector of indices:"); println(randomaccess_bench)
45-
logdettriangle_bench = benchmark_logdettriangle(sizes); println("logdet(LowerTriangular(A)) benchmark results:"); println(logdettriangle_bench)
4647

4748
const v = 1
4849
using Cairo, Fontconfig
@@ -68,3 +69,5 @@ saveplot("bench_AtmulBt_v", AtmulBt_bench);
6869
saveplot("bench_Amulvb_v", Amulvb_bench);
6970
saveplot("bench_Atmulvb_v", Atmulvb_bench);
7071

72+
@save "benchmarkresults.jld2" logdettriangle_bench filter2d_dynamic_bench filter2d_3x3_bench filter2d_unrolled_bench dot_bench selfdot_bench dot3_bench sse_bench aplusBc_bench AplusAt_bench vexp_bench randomaccess_bench AmulB_bench AmulBt_bench AtmulB_bench AtmulBt_bench Amulvb_bench Atmulvb_bench
73+

benchmark/loadsharedlibs.jl

Lines changed: 26 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -35,11 +35,13 @@ end
3535
eigenfile = joinpath(LOOPVECBENCHDIR, "looptestseigen.cpp")
3636
if !isfile(LIBEIGENTEST) || mtime(eigenfile) > mtime(LIBEIGENTEST)
3737
# Clang seems to have trouble finding includes
38-
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
39-
# run(`clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -I/usr/include/c++/9 -I/usr/include/c++/9/x86_64-generic-linux -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
38+
run(`g++ -O3 -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
39+
4040
end
4141
if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
42-
run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
42+
# run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/c++/9 -I/usr/include/c++/9/x86_64-generic-linux -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
43+
run(`/usr/bin/clang++ -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -DEIGEN_VECTORIZE_AVX512 -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBEIGENTEST`)
44+
# run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
4345
end
4446

4547
MKL_ROOT = "/home/chriselrod/intel"
@@ -238,7 +240,7 @@ for (p,s) ∈ [(:c,Cshared) (:e,Eshared)]
238240
@eval function $(Symbol(prefix,p,:dot3))(x, A, y)
239241
M, N = size(A)
240242
ccall(
241-
(:dot3v2, $s), Float64,
243+
(:dot3, $s), Float64,
242244
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong),
243245
x, A, y, M, N
244246
)
@@ -247,7 +249,7 @@ end
247249
@eval function $(Symbol(prefix,:fdot3))(x, A, y)
248250
M, N = size(A)
249251
ccall(
250-
(:dot3v2, $Fshared), Float64,
252+
(:dot3, $Fshared), Float64,
251253
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}),
252254
x, A, y, Ref(M), Ref(N)
253255
)
@@ -382,16 +384,16 @@ end
382384
B, A, Ref(N)
383385
)
384386
end
385-
for (p,s) ∈ [(:c,Cshared) (:e,Eshared)]
386-
@eval function $(Symbol(prefix,p,:AplusAt!))(B, A)
387-
N = size(B,1)
388-
ccall(
389-
(:AplusAt, $s), Cvoid,
390-
(Ptr{Float64}, Ptr{Float64}, Clong),
391-
B, A, N
392-
)
387+
for (p,s) ∈ [(:c,Cshared) (:e,Eshared)]
388+
@eval function $(Symbol(prefix,p,:AplusAt!))(B, A)
389+
N = size(B,1)
390+
ccall(
391+
(:AplusAt, $s), Cvoid,
392+
(Ptr{Float64}, Ptr{Float64}, Clong),
393+
B, A, N
394+
)
395+
end
393396
end
394-
end
395397
@eval function $(Symbol(prefix,:crandomaccess))(P, basis, coefs)
396398
A, C = size(P)
397399
ccall(
@@ -408,14 +410,16 @@ end
408410
P, basis, coefs, Ref(A), Ref(C)
409411
)
410412
end
411-
@eval function $(Symbol(prefix,:clogdettriangle))(T::Union{LowerTriangular,UpperTriangular})
412-
N = size(T,1)
413-
Tp = parent(T)
414-
ccall(
415-
(:logdettriangle, $Cshared), Float64,
416-
(Ptr{Float64}, Clong),
417-
Tp, N
418-
)
413+
for (p,s) ∈ [(:c,Cshared) (:e,Eshared)]
414+
@eval function $(Symbol(prefix,p,:logdettriangle))(T::Union{LowerTriangular,UpperTriangular})
415+
N = size(T,1)
416+
Tp = parent(T)
417+
ccall(
418+
(:logdettriangle, $s), Float64,
419+
(Ptr{Float64}, Clong),
420+
Tp, N
421+
)
422+
end
419423
end
420424
@eval function $(Symbol(prefix,:flogdettriangle))(T::Union{LowerTriangular,UpperTriangular})
421425
N = size(T,1)

benchmark/looptests.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,7 +131,7 @@ double selfdot(double* restrict a, long N){
131131
}
132132
return s;
133133
}
134-
double dot3(double* restrict x, double* restrict A, double* restrict y, long M, long N){
134+
double dot3v2(double* restrict x, double* restrict A, double* restrict y, long M, long N){
135135
double s = 0.0;
136136
for (long n = 0; n < N; n++){
137137
for (long m = 0; m < M; m++){
@@ -140,7 +140,7 @@ double dot3(double* restrict x, double* restrict A, double* restrict y, long M,
140140
}
141141
return s;
142142
}
143-
double dot3v2(double* restrict x, double* restrict A, double* restrict y, long M, long N){
143+
double dot3(double* restrict x, double* restrict A, double* restrict y, long M, long N){
144144
double s = 0.0;
145145
for (long n = 0; n < N; n++){
146146
double t = 0.0;

benchmark/looptests.f90

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -151,30 +151,30 @@ real(C_double) function selfdot(a, N) BIND(C, name="selfdot")
151151
selfdot = selfdot + a(i) * a(i)
152152
end do
153153
end function selfdot
154-
real(C_double) function dot3(x, A, y, M, N) BIND(C, name="dot3")
154+
real(C_double) function dot3v2(x, A, y, M, N) BIND(C, name="dot3v2")
155155
integer(C_long), intent(in) :: M, N
156156
real(C_double), intent(in) :: x(M), A(M,N), y(N)
157157
real(C_double) :: t
158158
integer(C_long) :: mm, nn
159-
dot3 = 0.0d0
159+
dot3v2 = 0.0d0
160160
do concurrent(nn = 1:N, mm = 1:M)
161-
dot3 = dot3 + x(mm) * A(mm, nn) * y(nn)
161+
dot3v2 = dot3v2 + x(mm) * A(mm, nn) * y(nn)
162162
end do
163-
end function dot3
164-
real(C_double) function dot3v2(x, A, y, M, N) BIND(C, name="dot3v2")
163+
end function dot3v2
164+
real(C_double) function dot3(x, A, y, M, N) BIND(C, name="dot3")
165165
integer(C_long), intent(in) :: M, N
166166
real(C_double), intent(in) :: x(M), A(M,N), y(N)
167167
real(C_double) :: t
168168
integer(C_long) :: mm, nn
169-
dot3v2 = 0.0d0
169+
dot3 = 0.0d0
170170
do concurrent(nn = 1:N)
171171
t = 0.0d0
172172
do concurrent(mm = 1:M)
173173
t = t + x(mm) * A(mm, nn)
174174
end do
175-
dot3v2 = dot3v2 + t * y(nn)
175+
dot3 = dot3 + t * y(nn)
176176
end do
177-
end function dot3v2
177+
end function dot3
178178
real(C_double) function dot3builtin(x, A, y, M, N) BIND(C, name="dot3builtin")
179179
integer(C_long), intent(in) :: M, N
180180
real(C_double), intent(in) :: x(M), A(M,N), y(N)

benchmark/looptests.jl

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -99,23 +99,23 @@ function jselfdotavx(a)
9999
end
100100
s
101101
end
102-
function jdot3(x, A, y)
102+
function jdot3v2(x, A, y)
103103
M, N = size(A)
104104
s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
105105
@inbounds @fastmath for n ∈ 1:N, m ∈ 1:M
106106
s += x[m] * A[m,n] * y[n]
107107
end
108108
s
109109
end
110-
function jdot3avx(x, A, y)
110+
function jdot3v2avx(x, A, y)
111111
M, N = size(A)
112112
s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
113113
@avx for n ∈ 1:N, m ∈ 1:M
114114
s += x[m] * A[m,n] * y[n]
115115
end
116116
s
117117
end
118-
function jdot3v2(x, A, y)
118+
function jdot3(x, A, y)
119119
s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
120120
@inbounds @fastmath for n ∈ axes(A,2)
121121
t = zero(s)
@@ -126,7 +126,7 @@ function jdot3v2(x, A, y)
126126
end
127127
s
128128
end
129-
function jdot3v2avx(x, A, y)
129+
function jdot3avx(x, A, y)
130130
s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
131131
@avx for n ∈ axes(A,2)
132132
t = zero(s)

benchmark/looptestseigen.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ extern "C" {
1515
void aplusBc(double*, double*, double*, double*, long, long);
1616
double OLSlp(double*, double*, double*, long, long);
1717
void AplusAt(double*, double*, long);
18+
double logdettriangle(double*, long);
1819
}
1920

2021
typedef Map<MatrixXd> mMatrix;
@@ -108,3 +109,8 @@ void AplusAt(double* pB, double* pA, long N){
108109
return;
109110
}
110111

112+
// double logdettriangle(double* pA, long N){
113+
// mMatrix A(pA, N, N);
114+
// return log(A.diagonal()).sum();
115+
// }
116+

docs/src/assets/bench_AmulB_v1.png

13.7 KB
Loading

docs/src/assets/bench_AmulBt_v1.png

4.18 KB
Loading

0 commit comments

Comments
 (0)