Skip to content

Commit 77aa28b

Browse files
committed
Added benchmarks folder, made a few tweeks to unrolling behavior.
1 parent 9b004cc commit 77aa28b

File tree

9 files changed

+872
-111
lines changed

9 files changed

+872
-111
lines changed

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,8 @@
77
*~
88
src/#*#
99
tests/#*#
10+
benchmarks/#*#
1011
*.mem
12+
*.mod
13+
*.mod0
14+
*.so

benchmarks/driver.jl

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
2+
const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmarks")
3+
include(joinpath(LOOPVECBENCHDIR, "looptests.jl"))
4+
include(joinpath(LOOPVECBENCHDIR, "loadsharedlibs.jl"))
5+
6+
using PrettyTables, BenchmarkTools
7+
struct SizedResults{V <: AbstractVector} <: AbstractMatrix{String}
8+
results::Matrix{Float64}
9+
sizes::V
10+
end
11+
function Base.size(sr::SizedResults)
12+
M, N = size(sr.results)
13+
N, M+1
14+
end
15+
struct BenchmarkResult{V}
16+
tests::Vector{String}
17+
sizedresults::SizedResults{V}
18+
end
19+
function BenchmarkResult(tests, sizes)
20+
ntests = length(tests); nsizes = length(sizes)
21+
BenchmarkResult(
22+
append!(["Size"], tests),
23+
SizedResults(Matrix{Float64}(undef, ntests, nsizes), sizes)
24+
)
25+
end
26+
function Base.getindex(br::SizedResults, row, col)
27+
col == 1 ? string(br.sizes[row]) : string(br.results[col - 1, row])
28+
end
29+
Base.setindex!(br::BenchmarkResult, v, i...) = br.sizedresults.results[i...] = v
30+
function Base.show(io::IO, br::BenchmarkResult)
31+
pretty_table(io, br.sizedresults, br.tests)
32+
end
33+
function alloc_matrices(s::NTuple{3,Int})
34+
M, K, N = s
35+
C = Matrix{Float64}(undef, M, N)
36+
A = rand(M, K)
37+
B = rand(K, N)
38+
C, A, B
39+
end
40+
alloc_matrices(s::Int) = alloc_matrices((s,s,s))
41+
gflop(s::Int) = s^3 * 1e-9
42+
gflop(s::NTuple{3,Int}) = prod(s) * 1e-9
43+
function benchmark_gemm(sizes)
44+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "GFort-intrinsic", "LoopVectorization"]
45+
br = BenchmarkResult(tests, sizes)
46+
for (i,s) enumerate(sizes)
47+
C, A, B = alloc_matrices(s)
48+
n_gflop = gflop(s)
49+
br[1,i] = n_gflop / @belapsed mul!($C, $A, $B)
50+
Cblas = copy(C)
51+
br[2,i] = n_gflop / @belapsed jgemm_nkm!($C, $A, $B)
52+
@assert C Cblas "Julia gemm wrong?"
53+
br[3,i] = n_gflop / @belapsed cgemm_nkm!($C, $A, $B)
54+
@assert C Cblas "Polly gemm wrong?"
55+
br[4,i] = n_gflop / @belapsed fgemm_nkm!($C, $A, $B)
56+
@assert C Cblas "Fort gemm wrong?"
57+
br[5,i] = n_gflop / @belapsed fgemm_builtin!($C, $A, $B)
58+
@assert C Cblas "Fort intrinsic gemm wrong?"
59+
br[6,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
60+
@assert C Cblas "LoopVec gemm wrong?"
61+
end
62+
br
63+
end
64+
65+
using VegaLite, IndexedTables
66+
function plot(br::BenchmarkResult)
67+
res = vec(br.sizedresults.results)
68+
brsizes = br.sizedresults.sizes
69+
sizes = Vector{eltype(brsizes)}(undef, length(res))
70+
ntests = length(br.tests) - 1
71+
for i 0:length(brsizes)-1
72+
si = brsizes[i+1]
73+
for j 1:ntests
74+
sizes[j + i*ntests] = si
75+
end
76+
end
77+
tests = vcat((@view(br.tests[2:end]) for _ eachindex(brsizes))...)
78+
t = table((GFLOPS = res, Size = sizes, Method = tests))
79+
t |> @vlplot(
80+
:line,
81+
x = :Size,
82+
y = :GFLOPS,
83+
color = :Method
84+
)
85+
end
86+
87+

benchmarks/loadsharedlibs.jl

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
2+
using VectorizationBase: REGISTER_SIZE
3+
# run(`gfortran `)
4+
5+
pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
6+
const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmarks")
7+
include(joinpath(LOOPVECBENCHDIR, "looptests.jl"))
8+
9+
const LIBCTEST = joinpath(LOOPVECBENCHDIR, "libctests.so")
10+
const LIBFTEST = joinpath(LOOPVECBENCHDIR, "libftests.so")
11+
12+
# requires Clang with polly to build
13+
if !isfile(LIBCTEST)
14+
cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
15+
run(`clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
16+
end
17+
if !isfile(LIBFTEST)
18+
ffile = joinpath(LOOPVECBENCHDIR, "looptests.f90") # --param max-unroll-times defaults to ≥8, which is generally excessive
19+
run(`gfortran -Ofast -march=native -funroll-loops --param max-unroll-times=4 -floop-nest-optimize -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $ffile -o $LIBFTEST`)
20+
end
21+
22+
for order (:kmn, :knm, :mkn, :mnk, :nkm, :nmk)
23+
gemm = Symbol(:gemm_, order)
24+
@eval function $(Symbol(:c, gemm, :!))(C, A, B)
25+
M, N = size(C); K = size(B, 1)
26+
ccall(
27+
($(QuoteNode(gemm)), LIBCTEST), Cvoid,
28+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong, Clong),
29+
C, A, B, M, K, N
30+
)
31+
end
32+
@eval function $(Symbol(:f, gemm, :!))(C, A, B)
33+
M, N = size(C); K = size(B, 1)
34+
ccall(
35+
($(QuoteNode(gemm)), LIBFTEST), Cvoid,
36+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
37+
C, A, B, Ref(M), Ref(K), Ref(N)
38+
)
39+
end
40+
end
41+
42+
function fgemm_builtin!(C, A, B)
43+
M, N = size(C); K = size(B, 1)
44+
ccall(
45+
(:gemmbuiltin, LIBFTEST), Cvoid,
46+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
47+
C, A, B, Ref(M), Ref(K), Ref(N)
48+
)
49+
end
50+
51+
function cdot(a, b)
52+
N = length(a)
53+
ccall(
54+
(:dot, LIBCTEST), Float64,
55+
(Ptr{Float64}, Ptr{Float64}, Clong),
56+
a, b, N
57+
)
58+
end
59+
function fdot(a, b)
60+
N = length(a)
61+
d = Ref{Float64}()
62+
ccall(
63+
(:dot, LIBFTEST), Cvoid,
64+
(Ref{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}),
65+
d, a, b, Ref(N)
66+
)
67+
d[]
68+
end
69+
function cselfdot(a)
70+
N = length(a)
71+
ccall(
72+
(:selfdot, LIBCTEST), Float64,
73+
(Ptr{Float64}, Clong),
74+
a, N
75+
)
76+
end
77+
function fselfdot(a)
78+
N = length(a)
79+
d = Ref{Float64}()
80+
ccall(
81+
(:selfdot, LIBFTEST), Cvoid,
82+
(Ref{Float64}, Ptr{Float64}, Ref{Clong}),
83+
d, a, Ref(N)
84+
)
85+
d[]
86+
end
87+
88+
function cgemv!(y, A, x)
89+
M, K = size(A)
90+
ccall(
91+
(:gemv, LIBCTEST), Cvoid,
92+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong),
93+
y, A, x, M, K
94+
)
95+
end
96+
function fgemv!(y, A, x)
97+
M, K = size(A)
98+
ccall(
99+
(:gemv, LIBFTEST), Cvoid,
100+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}),
101+
y, A, x, Ref(M), Ref(K)
102+
)
103+
end
104+
function fgemv_builtin!(y, A, x)
105+
M, K = size(A)
106+
ccall(
107+
(:gemvbuiltin, LIBFTEST), Cvoid,
108+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}),
109+
y, A, x, Ref(M), Ref(K)
110+
)
111+
end
112+
113+
function caplusBc!(D, a, B, c)
114+
M, K = size(B)
115+
ccall(
116+
(:aplusBc, LIBCTEST), Cvoid,
117+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong),
118+
y, A, x, M, K
119+
)
120+
end
121+
function faplusBc!(D, a, B, c)
122+
M, K = size(B)
123+
ccall(
124+
(:aplusBc, LIBFTEST), Cvoid,
125+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}),
126+
y, A, x, Ref(M), Ref(K)
127+
)
128+
end
129+
function cOLSlp(y, X, β)
130+
N, P = size(X)
131+
ccall(
132+
(:OLSlp, LIBCTEST), Float64,
133+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong),
134+
y, X, β, N, P
135+
)
136+
end
137+
function fOLSlp(y, X, β)
138+
N, P = size(X)
139+
lp = Ref{Float64}()
140+
ccall(
141+
(:OLSlp, LIBFTEST), Cvoid,
142+
(Ref{Float64}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}),
143+
lp, y, X, β, Ref(N), Ref(P)
144+
)
145+
lp[]
146+
end
147+
148+

benchmarks/looptests.c

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
2+
3+
void gemm_mnk(double* restrict C, double* restrict A, double* restrict B, long M, long K, long N){
4+
for (long i = 0; i < M*N; i++){
5+
C[i] = 0.0;
6+
}
7+
for (long m = 0; m < M; m++){
8+
for (long n = 0; n < N; n++){
9+
for (long k = 0; k < K; k++){
10+
C[m + n*M] += A[m + k*M] * B[k + n*K];
11+
}
12+
}
13+
}
14+
return;
15+
}
16+
void gemm_mkn(double* restrict C, double* restrict A, double* restrict B, long M, long K, long N){
17+
for (long i = 0; i < M*N; i++){
18+
C[i] = 0.0;
19+
}
20+
for (long m = 0; m < M; m++){
21+
for (long k = 0; k < K; k++){
22+
for (long n = 0; n < N; n++){
23+
C[m + n*M] += A[m + k*M] * B[k + n*K];
24+
}
25+
}
26+
}
27+
return;
28+
}
29+
void gemm_nmk(double* restrict C, double* restrict A, double* restrict B, long M, long K, long N){
30+
for (long i = 0; i < M*N; i++){
31+
C[i] = 0.0;
32+
}
33+
for (long n = 0; n < N; n++){
34+
for (long m = 0; m < M; m++){
35+
for (long k = 0; k < K; k++){
36+
C[m + n*M] += A[m + k*M] * B[k + n*K];
37+
}
38+
}
39+
}
40+
return;
41+
}
42+
void gemm_nkm(double* restrict C, double* restrict A, double* restrict B, long M, long K, long N){
43+
for (long i = 0; i < M*N; i++){
44+
C[i] = 0.0;
45+
}
46+
for (long n = 0; n < N; n++){
47+
for (long k = 0; k < K; k++){
48+
for (long m = 0; m < M; m++){
49+
C[m + n*M] += A[m + k*M] * B[k + n*K];
50+
}
51+
}
52+
}
53+
return;
54+
}
55+
void gemm_kmn(double* restrict C, double* restrict A, double* restrict B, long M, long K, long N){
56+
for (long i = 0; i < M*N; i++){
57+
C[i] = 0.0;
58+
}
59+
for (long k = 0; k < K; k++){
60+
for (long m = 0; m < M; m++){
61+
for (long n = 0; n < N; n++){
62+
C[m + n*M] += A[m + k*M] * B[k + n*K];
63+
}
64+
}
65+
}
66+
return;
67+
}
68+
void gemm_knm(double* restrict C, double* restrict A, double* restrict B, long M, long K, long N){
69+
for (long i = 0; i < M*N; i++){
70+
C[i] = 0.0;
71+
}
72+
for (long k = 0; k < K; k++){
73+
for (long n = 0; n < N; n++){
74+
for (long m = 0; m < M; m++){
75+
C[m + n*M] += A[m + k*M] * B[k + n*K];
76+
}
77+
}
78+
}
79+
return;
80+
}
81+
double dot(double* restrict a, double* restrict b, long N){
82+
double s = 0.0;
83+
for (long n = 0; n < N; n++){
84+
s += a[n]*b[n];
85+
}
86+
return s;
87+
}
88+
double selfdot(double* restrict a, long N){
89+
double s = 0.0;
90+
for (long n = 0; n < N; n++){
91+
s += a[n]*a[n];
92+
}
93+
return s;
94+
}
95+
96+
void gemv(double* restrict y, double* restrict A, double* restrict x, long M, long K){
97+
for (long m = 0; m < M; m++){
98+
y[m] = 0.0;
99+
}
100+
for (long k = 0; k < K; k++){
101+
for (long m = 0; m < M; m++){
102+
y[m] += A[m + k*M] * x[k];
103+
}
104+
}
105+
return;
106+
}
107+
108+
void unscaledvar(double* restrict s, double* restrict A, double* restrict xb, long M, long N){
109+
for (long m = 0; m < M; m++){
110+
s[m] = 0.0;
111+
}
112+
for (long n = 0; n < N; n++){
113+
for (long m = 0; m < M; m++){
114+
double d = A[m + n*M] - xb[m];
115+
s[m] += d*d;
116+
}
117+
}
118+
return;
119+
}
120+
121+
void aplucBc(double* restrict D, double* restrict a, double* restrict B, double* restrict c, long M, long N){
122+
for (long n = 0; n < N; n++){
123+
for (long m = 0; m < M; m++){
124+
D[m + n*M] = a[m] + B[m + n*M] * c[n];
125+
}
126+
}
127+
return;
128+
}
129+
130+
double OLSlp(double* restrict y, double* restrict X, double* restrict b, long N, long P){
131+
double lp = 0.0;
132+
for (long n = 0; n < N; n++){
133+
double d = y[n];
134+
for (long p = 0; p < P; p++){
135+
d -= X[n + p*N] * b[p];
136+
}
137+
lp += d*d;
138+
}
139+
return lp;
140+
}
141+
142+

0 commit comments

Comments
 (0)