Skip to content

Commit 50af712

Browse files
committed
Update documentation and benchmarks.
1 parent be39207 commit 50af712

31 files changed

+616
-441
lines changed

.travis.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ after_success:
1616
jobs:
1717
include:
1818
- stage: Documentation
19-
julia: 1.0
19+
julia: 1.3
2020
script: julia --project=docs -e '
2121
using Pkg;
2222
Pkg.develop(PackageSpec(path=pwd()));

benchmark/benchmarkflops.jl

Lines changed: 311 additions & 328 deletions
Large diffs are not rendered by default.

benchmark/driver.jl

Lines changed: 22 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -10,59 +10,48 @@ include(joinpath(LOOPVECBENCHDIR, "plotbenchmarks.jl"))
1010

1111
using Distributed
1212

13-
addprocs(13);
13+
addprocs((Sys.CPU_THREADS >> 1)-1); nprocs()
1414

1515
@everywhere begin
1616
pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
1717
const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmark")
1818
include(joinpath(LOOPVECBENCHDIR, "benchmarkflops.jl"))
19-
BenchmarkTools.DEFAULT_PARAMETERS.seconds = 1
19+
# BenchmarkTools.DEFAULT_PARAMETERS.seconds = 1
2020
end
2121

22-
gemm_future = @spawnat 2 benchmark_gemm(2:256);
23-
AtmulB_future = @spawnat 3 benchmark_AtmulB(2:256);
24-
dot_future = @spawnat 4 benchmark_dot(2:256);
25-
selfdot_future = @spawnat 5 benchmark_selfdot(2:256);
26-
gemv_future = @spawnat 6 benchmark_gemv(2:256);
27-
dot3_future = @spawnat 7 benchmark_dot3(2:256);
28-
sse_future = @spawnat 8 benchmark_sse(2:256);
29-
exp_future = @spawnat 9 benchmark_exp(2:256);
30-
aplusBc_future = @spawnat 10 benchmark_aplusBc(2:256);
31-
AplusAt_future = @spawnat 11 benchmark_AplusAt(2:256);
32-
randomaccess_future = @spawnat 12 benchmark_random_access(2:256);
33-
AmulBt_future = @spawnat 13 benchmark_AmulBt(2:256);
34-
Atmulvb_future = @spawnat 14 benchmark_Atmulvb(2:256);
35-
36-
dot_bench = fetch(dot_future)
37-
selfdot_bench = fetch(selfdot_future)
38-
gemv_bench = fetch(gemv_future)
39-
randomaccess_bench = fetch(randomaccess_future)
40-
dot3_bench = fetch(dot3_future)
41-
sse_bench = fetch(sse_future)
42-
exp_bench = fetch(exp_future)
43-
AplusAt_bench = fetch(AplusAt_future)
44-
aplusBc_bench = fetch(aplusBc_future)
45-
gemm_bench = fetch(gemm_future)
46-
AtmulB_bench = fetch(AtmulB_future)
47-
AmulBt_bench = fetch(AmulBt_future)
48-
Atmulvb_bench = fetch(Atmulvb_future)
49-
22+
AmulB_bench = benchmark_AmulB(2:256)
23+
AmulBt_bench = benchmark_AmulBt(2:256)
24+
AtmulB_bench = benchmark_AtmulB(2:256)
25+
AtmulBt_bench = benchmark_AtmulBt(2:256)
26+
dot_bench = benchmark_dot(2:256)
27+
selfdot_bench = benchmark_selfdot(2:256)
28+
Amulvb_bench = benchmark_Amulvb(2:256)
29+
Atmulvb_bench = benchmark_Atmulvb(2:256)
30+
dot3_bench = benchmark_dot3(2:256)
31+
sse_bench = benchmark_sse(2:256)
32+
exp_bench = benchmark_exp(2:256)
33+
aplusBc_bench = benchmark_aplusBc(2:256)
34+
AplusAt_bench = benchmark_AplusAt(2:256)
35+
randomaccess_bench = benchmark_random_access(2:256)
36+
logdettriangle_bench = benchmark_logdettriangle(2:256)
5037

5138
v = 1
5239
filetype = "svg"
5340
const PICTURES = joinpath(pkgdir("LoopVectorization"), "docs", "src", "assets")
5441
save(joinpath(PICTURES, "bench_exp_v$v.$filetype"), plot(exp_bench));
55-
save(joinpath(PICTURES, "bench_gemm_v$v.$filetype"), plot(gemm_bench));
42+
save(joinpath(PICTURES, "bench_logdettriangle_v$v.$filetype"), plot(logdettriangle_bench));
43+
save(joinpath(PICTURES, "bench_AmulB_v$v.$filetype"), plot(AmulB_bench));
44+
save(joinpath(PICTURES, "bench_AmulBt_v$v.$filetype"), plot(AmulBt_bench));
5645
save(joinpath(PICTURES, "bench_AtmulB_v$v.$filetype"), plot(AtmulB_bench));
46+
save(joinpath(PICTURES, "bench_AtmulBt_v$v.$filetype"), plot(AtmulBt_bench));
5747
save(joinpath(PICTURES, "bench_dot_v$v.$filetype"), plot(dot_bench));
5848
save(joinpath(PICTURES, "bench_selfdot_v$v.$filetype"), plot(selfdot_bench));
59-
save(joinpath(PICTURES, "bench_gemv_v$v.$filetype"), plot(gemv_bench));
6049
save(joinpath(PICTURES, "bench_dot3_v$v.$filetype"), plot(dot3_bench));
6150
save(joinpath(PICTURES, "bench_sse_v$v.$filetype"), plot(sse_bench));
6251
save(joinpath(PICTURES, "bench_aplusBc_v$v.$filetype"), plot(aplusBc_bench));
6352
save(joinpath(PICTURES, "bench_AplusAt_v$v.$filetype"), plot(AplusAt_bench));
6453
save(joinpath(PICTURES, "bench_random_access_v$v.$filetype"), plot(randomaccess_bench));
65-
save(joinpath(PICTURES, "bench_AmulBt_v$v.$filetype"), plot(AmulBt_bench));
54+
save(joinpath(PICTURES, "bench_Amulvb_v$v.$filetype"), plot(Amulvb_bench));
6655
save(joinpath(PICTURES, "bench_Atmulvb_v$v.$filetype"), plot(Atmulvb_bench));
6756

6857

benchmark/loadsharedlibs.jl

Lines changed: 46 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,15 +16,15 @@ if !isfile(LIBCTEST) || mtime(cfile) > mtime(LIBCTEST)
1616
run(`clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
1717
end
1818
if !isfile(LIBICTEST) || mtime(cfile) > mtime(LIBICTEST)
19-
run(`icc -fast -qopt-zmm-usage=high -shared -fPIC $cfile -o $LIBICTEST`)
19+
run(`icc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -shared -fPIC $cfile -o $LIBICTEST`)
2020
end
2121
ffile = joinpath(LOOPVECBENCHDIR, "looptests.f90")
2222
if !isfile(LIBFTEST) || mtime(ffile) > mtime(LIBFTEST)
2323
# --param max-unroll-times defaults to ≥8, which is generally excessive
2424
run(`gfortran -Ofast -march=native -funroll-loops --param max-unroll-times=4 -floop-nest-optimize -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $ffile -o $LIBFTEST`)
2525
end
2626
if !isfile(LIBIFTEST) || mtime(ffile) > mtime(LIBIFTEST)
27-
run(`ifort -fast -qopt-zmm-usage=high -shared -fPIC $ffile -o $LIBIFTEST`)
27+
run(`ifort -fast -qopt-zmm-usage=high -qoverride-limits -shared -fPIC $ffile -o $LIBIFTEST`)
2828
end
2929

3030
for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,LIBIFTEST))
@@ -105,6 +105,30 @@ for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,
105105
C, A, parent(B), Ref(M), Ref(K), Ref(N)
106106
)
107107
end
108+
@eval @inline function $(Symbol(prefix,:cgemm!))(C, A::Adjoint, B::Adjoint)
109+
M, N = size(C); K = size(B, 1)
110+
ccall(
111+
(:AtmulBt, $Cshared), Cvoid,
112+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong, Clong),
113+
C, parent(A), parent(B), M, K, N
114+
)
115+
end
116+
@eval @inline function $(Symbol(prefix,:fgemm!))(C, A::Adjoint, B::Adjoint)
117+
M, N = size(C); K = size(B, 1)
118+
ccall(
119+
(:AtmulBt, $Fshared), Cvoid,
120+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
121+
C, parent(A), parent(B), Ref(M), Ref(K), Ref(N)
122+
)
123+
end
124+
@eval @inline function $(Symbol(prefix,:fgemm_builtin!))(C, A::Adjoint, B::Adjoint)
125+
M, N = size(C); K = size(B, 1)
126+
ccall(
127+
(:AtmulBtbuiltin, $Fshared), Cvoid,
128+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
129+
C, parent(A), parent(B), Ref(M), Ref(K), Ref(N)
130+
)
131+
end
108132
@eval function $(Symbol(prefix,:cdot))(a, b)
109133
N = length(a)
110134
ccall(
@@ -312,5 +336,24 @@ for (prefix,Cshared,Fshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST), (:i,LIBICTEST,
312336
)
313337
p[]
314338
end
315-
339+
@eval function $(Symbol(prefix,:clogdettriangle))(T::Union{LowerTriangular,UpperTriangular})
340+
N = size(T,1)
341+
Tp = parent(T)
342+
ccall(
343+
(:logdettriangle, $Cshared), Float64,
344+
(Ptr{Float64}, Clong),
345+
Tp, N
346+
)
347+
end
348+
@eval function $(Symbol(prefix,:flogdettriangle))(T::Union{LowerTriangular,UpperTriangular})
349+
N = size(T,1)
350+
Tp = parent(T)
351+
ld = Ref{Float64}()
352+
ccall(
353+
(:logdettriangle, $Fshared), Cvoid,
354+
(Ref{Float64}, Ptr{Float64}, Ref{Clong}),
355+
ld, Tp, Ref(N)
356+
)
357+
ld[]
358+
end
316359
end

benchmark/looptests.c

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,19 @@ void AmulBt(double* restrict C, double* restrict A, double* restrict Bt, long M,
104104
}
105105
return;
106106
}
107+
void AtmulBt(double* restrict C, double* restrict A, double* restrict Bt, long M, long K, long N){
108+
for (long i = 0; i < M*N; i++){
109+
C[i] = 0.0;
110+
}
111+
for (long n = 0; n < N; n++){
112+
for (long k = 0; k < K; k++){
113+
for (long m = 0; m < M; m++){
114+
C[m + n*M] += A[k + K*m] * Bt[n + N*k];
115+
}
116+
}
117+
}
118+
return;
119+
}
107120
double dot(double* restrict a, double* restrict b, long N){
108121
double s = 0.0;
109122
for (long n = 0; n < N; n++){
@@ -212,5 +225,11 @@ double randomaccess(double* restrict P, long* restrict basis, double* restrict c
212225
}
213226
return p;
214227
}
215-
228+
double logdettriangle(double* T, long N){
229+
double ld = 0;
230+
for (long n = 0; n < N; n++){
231+
ld += log(T[n + n*N]);
232+
}
233+
return ld;
234+
}
216235

benchmark/looptests.f90

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,28 @@ subroutine AmulBtbuiltin(C, A, B, M, K, N) BIND(C, name="AmulBtbuiltin")
147147
real(C_double), dimension(N, K), intent(in) :: B
148148
C = matmul(A, transpose(B))
149149
end subroutine AmulBtbuiltin
150+
subroutine AtmulBt(C, A, B, M, K, N) BIND(C, name="AtmulBt")
151+
integer(C_long), intent(in) :: M, K, N
152+
real(C_double), dimension(M, N), intent(out) :: C
153+
real(C_double), dimension(K, M), intent(in) :: A
154+
real(C_double), dimension(N, K), intent(in) :: B
155+
integer(C_long) :: mm, kk, nn
156+
C = 0.0
157+
do concurrent(nn = 1:N)
158+
do concurrent(kk = 1:K)
159+
do concurrent(mm = 1:M)
160+
C(mm,nn) = C(mm,nn) + A(kk,mm) * B(nn,kk)
161+
end do
162+
end do
163+
end do
164+
end subroutine AtmulBt
165+
subroutine AtmulBtbuiltin(C, A, B, M, K, N) BIND(C, name="AtmulBtbuiltin")
166+
integer(C_long), intent(in) :: M, K, N
167+
real(C_double), dimension(M, N), intent(out) :: C
168+
real(C_double), dimension(K, M), intent(in) :: A
169+
real(C_double), dimension(N, K), intent(in) :: B
170+
C = transpose(matmul(B, A))
171+
end subroutine AtmulBtbuiltin
150172
subroutine dot(s, a, b, N) BIND(C, name="dot")
151173
integer(C_long), intent(in) :: N
152174
real(C_double), dimension(N), intent(in) :: a, b
@@ -309,4 +331,14 @@ subroutine randomaccess(pp, P, basis, coefs, A, C) BIND(C, name="randomaccess")
309331
pp = pp + pc
310332
end do
311333
end subroutine randomaccess
334+
subroutine logdettriangle(ld, T, N) BIND(C, name="logdettriangle")
335+
integer(C_long), intent(in) :: N
336+
real(C_double), intent(in) :: T(N,N)
337+
real(C_double), intent(out) :: ld
338+
integer(C_long) :: nn
339+
ld = 0
340+
do concurrent(nn = 1:N)
341+
ld = ld + log(T(nn,nn))
342+
end do
343+
end subroutine logdettriangle
312344
end module looptests

benchmark/looptests.jl

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ function jgemm!(𝐂, 𝐀, 𝐁)
66
M, N = size(𝐂); K = size(𝐁,1)
77
@inbounds for n ∈ 1:N, k ∈ 1:K
88
@simd ivdep for m ∈ 1:M
9-
𝐂[m,n] += 𝐀[m,k] * 𝐁[k,n]
9+
@fastmath 𝐂[m,n] += 𝐀[m,k] * 𝐁[k,n]
1010
end
1111
end
1212
end
@@ -15,18 +15,29 @@ end
1515
@inbounds for n ∈ 1:size(𝐂,2), m ∈ 1:size(𝐂,1)
1616
𝐂ₘₙ = zero(eltype(𝐂))
1717
@simd ivdep for k ∈ 1:size(𝐀,1)
18-
𝐂ₘₙ += 𝐀[k,m] * 𝐁[k,n]
18+
@fastmath 𝐂ₘₙ += 𝐀[k,m] * 𝐁[k,n]
1919
end
2020
𝐂[m,n] = 𝐂ₘₙ
2121
end
2222
end
2323
@inline function jgemm!(𝐂, 𝐀, 𝐁ᵀ::Adjoint)
2424
𝐂 .= 0
2525
𝐁 = parent(𝐁ᵀ)
26-
M, N = size(𝐂); K = size(𝐁,1)
26+
M, N = size(𝐂); K = size(𝐁ᵀ,1)
2727
@inbounds for k ∈ 1:K, n ∈ 1:N
2828
@simd ivdep for m ∈ 1:M
29-
𝐂[m,n] += 𝐀[m,k] * 𝐁[n,k]
29+
@fastmath 𝐂[m,n] += 𝐀[m,k] * 𝐁[n,k]
30+
end
31+
end
32+
end
33+
@inline function jgemm!(𝐂, 𝐀ᵀ::Adjoint, 𝐁ᵀ::Adjoint)
34+
𝐂 .= 0
35+
𝐀 = parent(𝐀ᵀ)
36+
𝐁 = parent(𝐁ᵀ)
37+
M, N = size(𝐂); K = size(𝐁ᵀ,1)
38+
@inbounds for n ∈ 1:N, k ∈ 1:K
39+
@simd ivdep for m ∈ 1:M
40+
@fastmath 𝐂[m,n] += 𝐀[k,m] * 𝐁[n,k]
3041
end
3142
end
3243
end
@@ -72,7 +83,7 @@ function jdot3(x, A, y)
7283
s = zero(promote_type(eltype(x), eltype(A), eltype(y)))
7384
@inbounds for n ∈ 1:N
7485
@simd ivdep for m ∈ 1:M
75-
s += x[m] * A[m,n] * y[n]
86+
@fastmath s += x[m] * A[m,n] * y[n]
7687
end
7788
end
7889
s
@@ -113,7 +124,7 @@ function jgemv!(y, 𝐀, x)
113124
y .= zero(eltype(y))
114125
@inbounds for j ∈ eachindex(x)
115126
@simd ivdep for i ∈ eachindex(y)
116-
y[i] += 𝐀[i,j] * x[j]
127+
@fastmath y[i] += 𝐀[i,j] * x[j]
117128
end
118129
end
119130
end
@@ -122,7 +133,7 @@ end
122133
@inbounds for i ∈ eachindex(𝐲)
123134
𝐲ᵢ = zero(eltype(𝐲))
124135
@simd ivdep for j ∈ eachindex(𝐱)
125-
𝐲ᵢ += 𝐀[j,i] * 𝐱[j]
136+
@fastmath 𝐲ᵢ += 𝐀[j,i] * 𝐱[j]
126137
end
127138
𝐲[i] = 𝐲ᵢ
128139
end
@@ -138,7 +149,7 @@ end
138149
end
139150
function jvar!(𝐬², 𝐀, x̄)
140151
@. 𝐬² = zero(eltype(𝐬²))
141-
@inbounds for i ∈ 1:size(𝐀,2)
152+
@inbounds @fastmath for i ∈ 1:size(𝐀,2)
142153
@simd for j ∈ eachindex(𝐬²)
143154
δ = 𝐀[j,i] - x̄[j]
144155
𝐬²[j] += δ*δ
@@ -207,6 +218,19 @@ function randomaccessavx(P, basis, coeffs::Vector{T}) where {T}
207218
end
208219
return p
209220
end
210-
221+
function jlogdettriangle(T::Union{LowerTriangular,UpperTriangular})
222+
ld = 0.0
223+
@inbounds for n ∈ 1:size(T,1)
224+
ld += log(T[n,n])
225+
end
226+
ld
227+
end
228+
function jlogdettriangleavx(T::Union{LowerTriangular,UpperTriangular})
229+
ld = 0.0
230+
@avx for n ∈ 1:size(T,1)
231+
ld += log(T[n,n])
232+
end
233+
ld
234+
end
211235

212236

docs/make.jl

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,10 @@ makedocs(;
1515
"Vectorized Convenience Functions" => "vectorized_convenience_functions.md",
1616
"Future Work" => "future_work.md"
1717
],
18-
repo="https://github.com/chriselrod/LoopVectorization.jl/blob/{commit}{path}#L{line}",
18+
# repo="https://github.com/chriselrod/LoopVectorization.jl/blob/{commit}{path}#L{line}",
1919
sitename="LoopVectorization.jl",
20-
authors="Chris Elrod",
21-
assets=[],
20+
authors="Chris Elrod"
21+
# assets=[],
2222
)
2323

2424
deploydocs(;

docs/src/assets/bench_AmulBt_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

docs/src/assets/bench_AplusAt_v1.svg

Lines changed: 1 addition & 1 deletion
Loading

0 commit comments

Comments
 (0)