Skip to content

Commit 309750f

Browse files
committed
2 parents 7a58714 + 081c373 commit 309750f

File tree

3 files changed

+325
-293
lines changed

3 files changed

+325
-293
lines changed

benchmarks/benchmarkflops.jl

Lines changed: 270 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,270 @@
1+
include(joinpath(LOOPVECBENCHDIR, "looptests.jl"))
2+
include(joinpath(LOOPVECBENCHDIR, "loadsharedlibs.jl"))
3+
4+
using PrettyTables, BenchmarkTools
5+
"""
    SizedResults{V<:AbstractVector} <: AbstractMatrix{String}

Table-shaped view over a matrix of benchmark results, presented as a matrix of
strings.

# Fields
- `results::Matrix{Float64}`: raw measurements; the accompanying `size`/`getindex`
  methods treat the first dimension as the test index and the second as the
  problem-size index.
- `sizes::V`: the problem sizes, one per column of `results`.
"""
struct SizedResults{V <: AbstractVector} <: AbstractMatrix{String}
    results::Matrix{Float64}
    sizes::V
end
9+
"""
    Base.size(sr::SizedResults)

Return the displayed table shape: one row per column of `sr.results`, and one
column per row of `sr.results` plus one extra leading column.
"""
function Base.size(sr::SizedResults)
    ntests, nsizes = size(sr.results)
    # The table is the transpose of `results` with an extra first column.
    return (nsizes, ntests + 1)
end
13+
"""
    BenchmarkResult{V}

Benchmark table: column headers plus the measured values.

# Fields
- `tests::Vector{String}`: header labels handed to `pretty_table` when the result
  is shown.
- `sizedresults::SizedResults{V}`: the problem sizes and raw measurements.
"""
struct BenchmarkResult{V}
    tests::Vector{String}
    sizedresults::SizedResults{V}
end
17+
"""
    BenchmarkResult(tests, sizes)

Create an empty `BenchmarkResult` for the given test names and problem sizes.

The header is `"Size"` followed by `tests`. The result matrix is allocated
uninitialized with `length(tests)` rows and `length(sizes)` columns, so every
entry must be assigned before it is read.
"""
function BenchmarkResult(tests, sizes)
    ntests = length(tests)
    nsizes = length(sizes)
    header = append!(["Size"], tests)
    storage = Matrix{Float64}(undef, ntests, nsizes)
    return BenchmarkResult(header, SizedResults(storage, sizes))
end
24+
"""
    Base.getindex(br::SizedResults, row, col)

Return the table cell at (`row`, `col`) as a `String`.

Column 1 is `br.sizes[row]`; any other column `col` is
`br.results[col - 1, row]`.
"""
function Base.getindex(br::SizedResults, row, col)
    if col == 1
        return string(br.sizes[row])
    else
        # Columns after the first are offset by one and transposed into `results`.
        return string(br.results[col - 1, row])
    end
end
27+
"""
    Base.setindex!(br::BenchmarkResult, v, i...)

Store `v` into the raw result matrix `br.sizedresults.results` at indices `i...`.
"""
function Base.setindex!(br::BenchmarkResult, v, i...)
    br.sizedresults.results[i...] = v
end
28+
29+
# Highlighter that prints a cell in green when it lies past the first column and its
# underlying value equals the maximum of `results[:, i]` for table row `i`.
# The predicate's first argument is whatever table data PrettyTables passes in; it
# must expose a `results` field (a `SizedResults` does).
const HIGHLIGHT_BEST = Highlighter(
    (data, i, j) -> (j > 1 && maximum(@view(data.results[:, i])) == data.results[j-1, i]),
    foreground = :green
);
33+
"""
    Base.show(io::IO, br::BenchmarkResult)

Print `br` to `io` as a table via `pretty_table`, using `br.tests` as the header,
no cropping, and `HIGHLIGHT_BEST` as the only highlighter.
"""
function Base.show(io::IO, br::BenchmarkResult)
    return pretty_table(
        io,
        br.sizedresults,
        br.tests,
        crop = :none,
        highlighters = (HIGHLIGHT_BEST,),
    )
end
38+
39+
"""
    tothreetuple(i)

Normalize a problem size to a 3-tuple: an integer `i` becomes `(i, i, i)`, and a
3-tuple of integers is returned unchanged.
"""
tothreetuple(i::Integer) = (i, i, i)
tothreetuple(i::NTuple{3,Integer}) = i
41+
"""
    benchmark_gemm(sizes)

Benchmark matrix multiplication `C = A * B` across several implementations and
return a `BenchmarkResult`.

Each element of `sizes` goes through `tothreetuple`, giving `(M, K, N)` with
`A` of size `M×K`, `B` of size `K×N` and `C` of size `M×N`. For every
implementation the stored value is `2e-9 * M * K * N` divided by the time
reported by `@belapsed`.

After each non-BLAS kernel, `C` is compared (`≈`) with the BLAS result; a
mismatch raises an `AssertionError`.
"""
function benchmark_gemm(sizes)
    tests = [
        BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS",
        "Julia", "Clang-Polly", "GFort-loops", "GFort-intrinsic", "LoopVectorization",
    ]
    br = BenchmarkResult(tests, sizes)
    for (i, s) in enumerate(sizes)
        M, K, N = tothreetuple(s)
        C = Matrix{Float64}(undef, M, N)
        A = rand(M, K)
        B = rand(K, N)
        n_gflop = M*K*N*2e-9
        br[1,i] = n_gflop / @belapsed mul!($C, $A, $B)
        # Keep the BLAS product as the reference for the kernels below.
        Cblas = copy(C)
        br[2,i] = n_gflop / @belapsed jgemm_nkm!($C, $A, $B)
        @assert C ≈ Cblas "Julia gemm wrong?"
        br[3,i] = n_gflop / @belapsed cgemm_nkm!($C, $A, $B)
        @assert C ≈ Cblas "Polly gemm wrong?"
        br[4,i] = n_gflop / @belapsed fgemm_nkm!($C, $A, $B)
        @assert C ≈ Cblas "Fort gemm wrong?"
        br[5,i] = n_gflop / @belapsed fgemm_builtin!($C, $A, $B)
        @assert C ≈ Cblas "Fort intrinsic gemm wrong?"
        br[6,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
        @assert C ≈ Cblas "LoopVec gemm wrong?"
        # if i % 10 == 0
        #     percent_complete = round(100i/ length(sizes), sigdigits = 4)
        #     @show percent_complete
        # end
    end
    return br
end
69+
"""
    benchmark_AtmulB(sizes)

Benchmark the transposed product `C = At' * B` across several implementations
and return a `BenchmarkResult`.

Each element of `sizes` goes through `tothreetuple`, giving `(M, K, N)` with
`At` of size `K×M`, `B` of size `K×N` and `C` of size `M×N`. For every
implementation the stored value is `2e-9 * M * K * N` divided by the time
reported by `@belapsed`.

After each non-BLAS kernel, `C` is compared (`≈`) with the BLAS result; a
mismatch raises an `AssertionError`.
"""
function benchmark_AtmulB(sizes)
    tests = [
        BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS",
        "Julia", "Clang-Polly", "GFort-loops", "GFort-intrinsic", "LoopVectorization",
    ]
    br = BenchmarkResult(tests, sizes)
    for (i, s) in enumerate(sizes)
        M, K, N = tothreetuple(s)
        C = Matrix{Float64}(undef, M, N)
        At = rand(K, M)
        B = rand(K, N)
        n_gflop = M*K*N*2e-9
        br[1,i] = n_gflop / @belapsed mul!($C, $At', $B)
        # Keep the BLAS product as the reference for the kernels below.
        Cblas = copy(C)
        br[2,i] = n_gflop / @belapsed jAtmulB!($C, $At, $B)
        @assert C ≈ Cblas "Julia gemm wrong?"
        br[3,i] = n_gflop / @belapsed cAtmulB!($C, $At, $B)
        @assert C ≈ Cblas "Polly gemm wrong?"
        br[4,i] = n_gflop / @belapsed fAtmulB!($C, $At, $B)
        @assert C ≈ Cblas "Fort gemm wrong?"
        br[5,i] = n_gflop / @belapsed fAtmulB_builtin!($C, $At, $B)
        @assert C ≈ Cblas "Fort intrinsic gemm wrong?"
        br[6,i] = n_gflop / @belapsed jAtmulBavx!($C, $At, $B)
        @assert C ≈ Cblas "LoopVec gemm wrong?"
        # if i % 10 == 0
        #     percent_complete = round(100i/ length(sizes), sigdigits = 4)
        #     @show percent_complete
        # end
    end
    return br
end
97+
98+
"""
    benchmark_dot(sizes)

Benchmark the dot product of two random vectors across several implementations
and return a `BenchmarkResult`.

Each element `s` of `sizes` is used as the vector length. For every
implementation the stored value is `2e-9 * s` divided by the time reported by
`@belapsed`.

Each non-BLAS implementation is called once more outside the timing and its
result compared (`≈`) with `dot(a, b)`; a mismatch raises an `AssertionError`.
"""
function benchmark_dot(sizes)
    tests = [
        BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS",
        "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization",
    ]
    br = BenchmarkResult(tests, sizes)
    for (i, s) in enumerate(sizes)
        a = rand(s)
        b = rand(s)
        n_gflop = s * 2e-9
        br[1,i] = n_gflop / @belapsed dot($a, $b)
        dotblas = dot(a, b)
        br[2,i] = n_gflop / @belapsed jdot($a, $b)
        @assert jdot(a,b) ≈ dotblas "Julia dot wrong?"
        br[3,i] = n_gflop / @belapsed cdot($a, $b)
        @assert cdot(a,b) ≈ dotblas "Polly dot wrong?"
        br[4,i] = n_gflop / @belapsed fdot($a, $b)
        @assert fdot(a,b) ≈ dotblas "Fort dot wrong?"
        br[5,i] = n_gflop / @belapsed jdotavx($a, $b)
        @assert jdotavx(a,b) ≈ dotblas "LoopVec dot wrong?"
        # if i % 10 == 0
        #     percent_complete = round(100i/ length(sizes), sigdigits = 4)
        #     @show percent_complete
        # end
    end
    return br
end
121+
"""
    benchmark_selfdot(sizes)

Benchmark the dot product of a random vector with itself across several
implementations and return a `BenchmarkResult`.

Each element `s` of `sizes` is used as the vector length. For every
implementation the stored value is `2e-9 * s` divided by the time reported by
`@belapsed`.

Each non-BLAS implementation is called once more outside the timing and its
result compared (`≈`) with `dot(a, a)`; a mismatch raises an `AssertionError`.
"""
function benchmark_selfdot(sizes)
    tests = [
        BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS",
        "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization",
    ]
    br = BenchmarkResult(tests, sizes)
    for (i, s) in enumerate(sizes)
        a = rand(s)
        n_gflop = s * 2e-9
        br[1,i] = n_gflop / @belapsed dot($a, $a)
        dotblas = dot(a, a)
        br[2,i] = n_gflop / @belapsed jselfdot($a)
        @assert jselfdot(a) ≈ dotblas "Julia dot wrong?"
        br[3,i] = n_gflop / @belapsed cselfdot($a)
        @assert cselfdot(a) ≈ dotblas "Polly dot wrong?"
        br[4,i] = n_gflop / @belapsed fselfdot($a)
        @assert fselfdot(a) ≈ dotblas "Fort dot wrong?"
        br[5,i] = n_gflop / @belapsed jselfdotavx($a)
        @assert jselfdotavx(a) ≈ dotblas "LoopVec dot wrong?"
        # if i % 10 == 0
        #     percent_complete = round(100i/ length(sizes), sigdigits = 4)
        #     @show percent_complete
        # end
    end
    return br
end
144+
"""
    totwotuple(i)

Normalize a problem size to a 2-tuple: an integer `i` becomes `(i, i)`, and a
2-tuple of integers is returned unchanged.
"""
totwotuple(i::Integer) = (i, i)
totwotuple(i::Tuple{Integer,Integer}) = i
146+
"""
    benchmark_gemv(sizes)

Benchmark the matrix-vector product `x = A * y` across several implementations
and return a `BenchmarkResult`.

Each element of `sizes` goes through `totwotuple`, giving `(M, N)` with `A` of
size `M×N`, `y` of length `N` and `x` of length `M`. For every implementation
the stored value is `2e-9 * M * N` divided by the time reported by `@belapsed`.

After each non-BLAS kernel, `x` is compared (`≈`) with the BLAS result; a
mismatch raises an `AssertionError`.
"""
function benchmark_gemv(sizes)
    tests = [
        BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS",
        "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization",
    ]
    br = BenchmarkResult(tests, sizes)
    for (i, s) in enumerate(sizes)
        M, N = totwotuple(s)
        x = Vector{Float64}(undef, M)
        A = rand(M, N)
        y = rand(N)
        n_gflop = M*N * 2e-9
        br[1,i] = n_gflop / @belapsed mul!($x, $A, $y)
        # Keep the BLAS product as the reference for the kernels below.
        xblas = copy(x)
        br[2,i] = n_gflop / @belapsed jgemv!($x, $A, $y)
        @assert x ≈ xblas "Julia wrong?"
        br[3,i] = n_gflop / @belapsed cgemv!($x, $A, $y)
        @assert x ≈ xblas "Polly wrong?"
        br[4,i] = n_gflop / @belapsed fgemv!($x, $A, $y)
        @assert x ≈ xblas "Fort wrong?"
        br[5,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
        @assert x ≈ xblas "LoopVec wrong?"
        # if i % 10 == 0
        #     percent_complete = round(100i/ length(sizes), sigdigits = 4)
        #     @show percent_complete
        # end
    end
    return br
end
170+
"""
    benchmark_dot3(sizes)

Benchmark the three-argument dot product `dot(x, A, y)` across several
implementations and return a `BenchmarkResult`.

Each element of `sizes` goes through `totwotuple`, giving `(M, N)` with `x` of
length `M`, `A` of size `M×N` and `y` of length `N`. For every implementation
the stored value is `3e-9 * M * N` divided by the time reported by `@belapsed`.

Each non-reference implementation is called once more outside the timing and its
result compared (`≈`) with `dot(x, A, y)`; a mismatch raises an `AssertionError`.
"""
function benchmark_dot3(sizes)
    tests = [
        BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS",
        "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization",
    ]
    br = BenchmarkResult(tests, sizes)
    for (i, s) in enumerate(sizes)
        M, N = totwotuple(s)
        x = rand(M)
        A = rand(M, N)
        y = rand(N)
        n_gflop = M*N * 3e-9
        br[1,i] = n_gflop / @belapsed dot($x, $A, $y)
        dotblas = dot(x, A, y)
        br[2,i] = n_gflop / @belapsed jdot3($x, $A, $y)
        @assert jdot3(x, A, y) ≈ dotblas "Julia dot wrong?"
        br[3,i] = n_gflop / @belapsed cdot3($x, $A, $y)
        @assert cdot3(x, A, y) ≈ dotblas "Polly dot wrong?"
        br[4,i] = n_gflop / @belapsed fdot3($x, $A, $y)
        @assert fdot3(x, A, y) ≈ dotblas "Fort dot wrong?"
        br[5,i] = n_gflop / @belapsed jdot3avx($x, $A, $y)
        @assert jdot3avx(x, A, y) ≈ dotblas "LoopVec dot wrong?"
        # if i % 10 == 0
        #     percent_complete = round(100i/ length(sizes), sigdigits = 4)
        #     @show percent_complete
        # end
    end
    return br
end
194+
"""
    sse!(Xβ, y, X, β)

Return the sum of squared residuals of `X * β` against `y`, using `Xβ` as the
work buffer.

`y` is copied into `Xβ`, then the five-argument `mul!` overwrites `Xβ` with
`X * β - y`. The returned value is `dot(Xβ, Xβ)`. `Xβ` is left holding the
residual vector.
"""
function sse!(Xβ, y, X, β)
    # mul!(C, A, B, 1.0, -1.0) computes C = A*B - C, with C pre-filled with y.
    mul!(copyto!(Xβ, y), X, β, 1.0, -1.0)
    return dot(Xβ, Xβ)
end
198+
"""
    benchmark_sse(sizes)

Benchmark the sum of squared errors of `X * β` against `y` across several
implementations and return a `BenchmarkResult`.

Each element of `sizes` goes through `totwotuple`, giving `(N, P)` with `y` of
length `N`, `β` of length `P` and `X` of size `N×P`. For every implementation
the stored value is `2e-9 * (P*N + 2N)` divided by the time reported by
`@belapsed`.

Each non-BLAS implementation is called once more outside the timing and its
result compared (`≈`) with the `sse!` reference; a mismatch raises an
`AssertionError`.
"""
function benchmark_sse(sizes)
    tests = [
        BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS",
        "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization",
    ]
    br = BenchmarkResult(tests, sizes)
    for (i, s) in enumerate(sizes)
        N, P = totwotuple(s)
        y = rand(N)
        β = rand(P)
        X = randn(N, P)
        # Work buffer for the BLAS reference `sse!`.
        Xβ = similar(y)
        n_gflop = 2e-9*(P*N + 2N)
        br[1,i] = n_gflop / @belapsed sse!($Xβ, $y, $X, $β)
        lpblas = sse!(Xβ, y, X, β)
        br[2,i] = n_gflop / @belapsed jOLSlp($y, $X, $β)
        @assert jOLSlp(y, X, β) ≈ lpblas "Julia wrong?"
        br[3,i] = n_gflop / @belapsed cOLSlp($y, $X, $β)
        @assert cOLSlp(y, X, β) ≈ lpblas "Polly wrong?"
        br[4,i] = n_gflop / @belapsed fOLSlp($y, $X, $β)
        @assert fOLSlp(y, X, β) ≈ lpblas "Fort wrong?"
        br[5,i] = n_gflop / @belapsed jOLSlp_avx($y, $X, $β)
        @assert jOLSlp_avx(y, X, β) ≈ lpblas "LoopVec wrong?"
        # if i % 10 == 0
        #     percent_complete = round(100i/ length(sizes), sigdigits = 4)
        #     @show percent_complete
        # end
    end
    return br
end
224+
225+
"""
    benchmark_exp(sizes)

Benchmark elementwise `exp` of a random vector across several implementations
and return a `BenchmarkResult`.

Each element `s` of `sizes` is used as the vector length. For every
implementation the stored value is `1e-9 * s` divided by the time reported by
`@belapsed`, i.e. elements processed per nanosecond rather than true GFLOPS.

After each non-reference kernel, `b` is compared (`≈`) with the broadcast result;
a mismatch raises an `AssertionError`.
"""
function benchmark_exp(sizes)
    tests = ["Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
    br = BenchmarkResult(tests, sizes)
    for (i, s) in enumerate(sizes)
        a = rand(s)
        b = similar(a)
        n_gflop = 1e-9*s # not really gflops
        br[1,i] = n_gflop / @belapsed @. $b = exp($a)
        # Keep the plain broadcast result as the reference for the kernels below.
        baseb = copy(b)
        br[2,i] = n_gflop / @belapsed cvexp!($b, $a)
        @assert b ≈ baseb "Clang wrong?"
        br[3,i] = n_gflop / @belapsed fvexp!($b, $a)
        @assert b ≈ baseb "Fort wrong?"
        br[4,i] = n_gflop / @belapsed @avx @. $b = exp($a)
        @assert b ≈ baseb "LoopVec wrong?"
        # if i % 10 == 0
        #     percent_complete = round(100i/ length(sizes), sigdigits = 4)
        #     @show percent_complete
        # end
    end
    return br
end
246+
247+
"""
    benchmark_aplusBc(sizes)

Benchmark the broadcast `D = a .+ B .* c'` across several implementations and
return a `BenchmarkResult`.

Each element of `sizes` goes through `totwotuple`, giving `(M, N)` with `a` of
length `M`, `B` and `D` of size `M×N` and `c` of length `N`. For every
implementation the stored value is `2e-9 * M * N` divided by the time reported
by `@belapsed`.

After each non-reference kernel, `D` is compared (`≈`) with the plain broadcast
result; a mismatch raises an `AssertionError`.
"""
function benchmark_aplusBc(sizes)
    tests = ["Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
    br = BenchmarkResult(tests, sizes)
    for (i, s) in enumerate(sizes)
        M, N = totwotuple(s)
        a = rand(M)
        B = rand(M,N)
        c = rand(N)
        # The broadcast versions take the adjoint of `c`; the C/Fortran kernels get `c`.
        c′ = c'
        D = similar(B)
        n_gflop = 2e-9 * M*N
        br[1,i] = n_gflop / @belapsed @. $D = $a + $B * $c′
        # Keep the plain broadcast result as the reference for the kernels below.
        Dcopy = copy(D)
        br[2,i] = n_gflop / @belapsed caplusBc!($D, $a, $B, $c)
        @assert D ≈ Dcopy "Polly wrong?"
        br[3,i] = n_gflop / @belapsed faplusBc!($D, $a, $B, $c)
        @assert D ≈ Dcopy "Fort wrong?"
        br[4,i] = n_gflop / @belapsed @avx @. $D = $a + $B * $c′
        @assert D ≈ Dcopy "LoopVec wrong?"
        # if i % 10 == 0
        #     percent_complete = round(100i/ length(sizes), sigdigits = 4)
        #     @show percent_complete
        # end
    end
    return br
end
270+

0 commit comments

Comments
 (0)