Skip to content

Commit e8f621a

Browse files
committed
Breaking change: dropped the old @vectorize macro to reduce the amount of code. Also added
1 parent d3a23f5 commit e8f621a

File tree

9 files changed

+310
-889
lines changed

9 files changed

+310
-889
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.2.4"
4+
version = "0.3.0"
55

66
[deps]
77
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

benchmarks/driver.jl

Lines changed: 121 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,15 @@ function Base.getindex(br::SizedResults, row, col)
3131
col == 1 ? string(br.sizes[row]) : string(br.results[col - 1, row])
3232
end
3333
Base.setindex!(br::BenchmarkResult, v, i...) = br.sizedresults.results[i...] = v
34+
35+
const HIGHLIGHT_BEST = Highlighter(
36+
(br,i,j) -> (j > 1 && maximum(@view(br.results[:, i])) == br.results[j-1,i]),
37+
foreground = :green
38+
);
3439
function Base.show(io::IO, br::BenchmarkResult)
35-
pretty_table(io, br.sizedresults, br.tests)
40+
pretty_table(
41+
io, br.sizedresults, br.tests, crop = :none, highlighters = (HIGHLIGHT_BEST,)
42+
)
3643
end
3744

3845
using VegaLite, IndexedTables
@@ -57,22 +64,17 @@ function plot(br::BenchmarkResult)
5764
)
5865
end
5966

60-
function alloc_matrices(s::NTuple{3,Int})
61-
M, K, N = s
62-
C = Matrix{Float64}(undef, M, N)
63-
A = rand(M, K)
64-
B = rand(K, N)
65-
C, A, B
66-
end
67-
alloc_matrices(s::Int) = alloc_matrices((s,s,s))
68-
gflop(s::Int) = s^3 * 2e-9
69-
gflop(s::NTuple{3,Int}) = prod(s) * 2e-9
67+
tothreetuple(i::Int) = (i,i,i)
68+
tothreetuple(i::NTuple{3,Int}) = i
7069
function benchmark_gemm(sizes)
7170
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "GFort-intrinsic", "LoopVectorization"]
7271
br = BenchmarkResult(tests, sizes)
7372
for (i,s) ∈ enumerate(sizes)
74-
C, A, B = alloc_matrices(s)
75-
n_gflop = gflop(s)
73+
M, K, N = tothreetuple(s)
74+
C = Matrix{Float64}(undef, M, N)
75+
A = rand(M, K)
76+
B = rand(K, N)
77+
n_gflop = M*K*N*2e-9
7678
br[1,i] = n_gflop / @belapsed mul!($C, $A, $B)
7779
Cblas = copy(C)
7880
br[2,i] = n_gflop / @belapsed jgemm_nkm!($C, $A, $B)
@@ -85,9 +87,42 @@ function benchmark_gemm(sizes)
8587
@assert C ≈ Cblas "Fort intrinsic gemm wrong?"
8688
br[6,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
8789
@assert C ≈ Cblas "LoopVec gemm wrong?"
90+
if i % 10 == 0
91+
percent_complete = round(100i/ length(sizes), sigdigits = 4)
92+
@show percent_complete
93+
end
94+
end
95+
br
96+
end
97+
function benchmark_AtmulB(sizes)
98+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "GFort-intrinsic", "LoopVectorization"]
99+
br = BenchmarkResult(tests, sizes)
100+
for (i,s) ∈ enumerate(sizes)
101+
M, K, N = tothreetuple(s)
102+
C = Matrix{Float64}(undef, M, N)
103+
At = rand(K, M)
104+
B = rand(K, N)
105+
n_gflop = M*K*N*2e-9
106+
br[1,i] = n_gflop / @belapsed mul!($C, $At', $B)
107+
Cblas = copy(C)
108+
br[2,i] = n_gflop / @belapsed jAtmulB!($C, $At, $B)
109+
@assert C ≈ Cblas "Julia gemm wrong?"
110+
br[3,i] = n_gflop / @belapsed cAtmulB!($C, $At, $B)
111+
@assert C ≈ Cblas "Polly gemm wrong?"
112+
br[4,i] = n_gflop / @belapsed fAtmulB!($C, $At, $B)
113+
@assert C ≈ Cblas "Fort gemm wrong?"
114+
br[5,i] = n_gflop / @belapsed fAtmulB_builtin!($C, $At, $B)
115+
@assert C ≈ Cblas "Fort intrinsic gemm wrong?"
116+
br[6,i] = n_gflop / @belapsed jAtmulBavx!($C, $At, $B)
117+
@assert C ≈ Cblas "LoopVec gemm wrong?"
118+
if i % 10 == 0
119+
percent_complete = round(100i/ length(sizes), sigdigits = 4)
120+
@show percent_complete
121+
end
88122
end
89123
br
90124
end
125+
91126
function benchmark_dot(sizes)
92127
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
93128
br = BenchmarkResult(tests, sizes)
@@ -104,6 +139,10 @@ function benchmark_dot(sizes)
104139
@assert fdot(a,b) ≈ dotblas "Fort dot wrong?"
105140
br[5,i] = n_gflop / @belapsed jdotavx($a, $b)
106141
@assert jdotavx(a,b) ≈ dotblas "LoopVec dot wrong?"
142+
if i % 10 == 0
143+
percent_complete = round(100i/ length(sizes), sigdigits = 4)
144+
@show percent_complete
145+
end
107146
end
108147
br
109148
end
@@ -123,11 +162,63 @@ function benchmark_selfdot(sizes)
123162
@assert fselfdot(a) ≈ dotblas "Fort dot wrong?"
124163
br[5,i] = n_gflop / @belapsed jselfdotavx($a)
125164
@assert jselfdotavx(a) ≈ dotblas "LoopVec dot wrong?"
165+
if i % 10 == 0
166+
percent_complete = round(100i/ length(sizes), sigdigits = 4)
167+
@show percent_complete
168+
end
126169
end
127170
br
128171
end
129172
totwotuple(i::Int) = (i,i)
130173
totwotuple(i::Tuple{Int,Int}) = i
174+
function benchmark_gemv(sizes)
175+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
176+
br = BenchmarkResult(tests, sizes)
177+
for (i,s) ∈ enumerate(sizes)
178+
M, N = totwotuple(s)
179+
x = Vector{Float64}(undef, M); A = rand(M, N); y = rand(N);
180+
n_gflop = M*N * 2e-9
181+
br[1,i] = n_gflop / @belapsed mul!($x, $A, $y)
182+
xblas = copy(x)
183+
br[2,i] = n_gflop / @belapsed jgemv!($x, $A, $y)
184+
@assert x ≈ xblas "Julia wrong?"
185+
br[3,i] = n_gflop / @belapsed cgemv!($x, $A, $y)
186+
@assert x ≈ xblas "Polly wrong?"
187+
br[4,i] = n_gflop / @belapsed fgemv!($x, $A, $y)
188+
@assert x ≈ xblas "Fort wrong?"
189+
br[5,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
190+
@assert x ≈ xblas "LoopVec wrong?"
191+
if i % 10 == 0
192+
percent_complete = round(100i/ length(sizes), sigdigits = 4)
193+
@show percent_complete
194+
end
195+
end
196+
br
197+
end
198+
function benchmark_dot3(sizes)
199+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
200+
br = BenchmarkResult(tests, sizes)
201+
for (i,s) ∈ enumerate(sizes)
202+
M, N = totwotuple(s)
203+
x = rand(M); A = rand(M, N); y = rand(N);
204+
n_gflop = M*N * 3e-9
205+
br[1,i] = n_gflop / @belapsed dot($x, $A, $y)
206+
dotblas = dot(x, A, y)
207+
br[2,i] = n_gflop / @belapsed jdot3($x, $A, $y)
208+
@assert jdot3(x, A, y) ≈ dotblas "Julia dot wrong?"
209+
br[3,i] = n_gflop / @belapsed cdot3($x, $A, $y)
210+
@assert cdot3(x, A, y) ≈ dotblas "Polly dot wrong?"
211+
br[4,i] = n_gflop / @belapsed fdot3($x, $A, $y)
212+
@assert fdot3(x, A, y) ≈ dotblas "Fort dot wrong?"
213+
br[5,i] = n_gflop / @belapsed jdot3avx($x, $A, $y)
214+
@assert jdot3avx(x, A, y) ≈ dotblas "LoopVec dot wrong?"
215+
if i % 10 == 0
216+
percent_complete = round(100i/ length(sizes), sigdigits = 4)
217+
@show percent_complete
218+
end
219+
end
220+
br
221+
end
131222
function sse!(Xβ, y, X, β)
132223
mul!(copyto!(Xβ, y), X, β, 1.0, -1.0)
133224
dot(Xβ, Xβ)
@@ -151,22 +242,32 @@ function benchmark_sse(sizes)
151242
@assert fOLSlp(y, X, β) ≈ lpblas "Fort wrong?"
152243
br[5,i] = n_gflop / @belapsed jOLSlp_avx($y, $X, $β)
153244
@assert jOLSlp_avx(y, X, β) ≈ lpblas "LoopVec wrong?"
245+
if i % 10 == 0
246+
percent_complete = round(100i/ length(sizes), sigdigits = 4)
247+
@show percent_complete
248+
end
154249
end
155250
br
156251
end
157252

158253
function benchmark_exp(sizes)
159-
tests = ["Julia", "GFort-loops", "LoopVectorization"]
254+
tests = ["Julia", "Clang-Polly", "GFort-loops", "LoopVectorization"]
160255
br = BenchmarkResult(tests, sizes)
161256
for (i,s) ∈ enumerate(sizes)
162257
a = rand(s); b = similar(a)
163258
n_gflop = 1e-9*s # not really gflops
164259
br[1,i] = n_gflop / @belapsed @. $b = exp($a)
165260
baseb = copy(b)
166-
br[2,i] = n_gflop / @belapsed fvexp!($b, $a)
261+
br[2,i] = n_gflop / @belapsed cvexp!($b, $a)
262+
@assert b ≈ baseb "Clang wrong?"
263+
br[3,i] = n_gflop / @belapsed fvexp!($b, $a)
167264
@assert b ≈ baseb "Fort wrong?"
168-
br[3,i] = n_gflop / @belapsed @avx @. $b = exp($a)
265+
br[4,i] = n_gflop / @belapsed @avx @. $b = exp($a)
169266
@assert b ≈ baseb "LoopVec wrong?"
267+
if i % 10 == 0
268+
percent_complete = round(100i/ length(sizes), sigdigits = 4)
269+
@show percent_complete
270+
end
170271
end
171272
br
172273
end
@@ -187,6 +288,10 @@ function benchmark_aplusBc(sizes)
187288
@assert D ≈ Dcopy "Fort wrong?"
188289
br[4,i] = n_gflop / @belapsed @avx @. $D = $a + $B * $c′
189290
@assert D ≈ Dcopy "LoopVec wrong?"
291+
if i % 10 == 0
292+
percent_complete = round(100i/ length(sizes), sigdigits = 4)
293+
@show percent_complete
294+
end
190295
end
191296
br
192297
end

benchmarks/loadsharedlibs.jl

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,13 @@ const LIBCTEST = joinpath(LOOPVECBENCHDIR, "libctests.so")
99
const LIBFTEST = joinpath(LOOPVECBENCHDIR, "libftests.so")
1010

1111
# requires Clang with polly to build
12-
if !isfile(LIBCTEST)
13-
cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
14-
run(`clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
12+
cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
13+
if !isfile(LIBCTEST) || mtime(cfile) > mtime(LIBCTEST)
14+
run(`clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
1515
end
16-
if !isfile(LIBFTEST)
17-
ffile = joinpath(LOOPVECBENCHDIR, "looptests.f90") # --param max-unroll-times defaults to ≥8, which is generally excessive
16+
ffile = joinpath(LOOPVECBENCHDIR, "looptests.f90")
17+
if !isfile(LIBFTEST) || mtime(ffile) > mtime(LIBFTEST)
18+
# --param max-unroll-times defaults to ≥8, which is generally excessive
1819
run(`gfortran -Ofast -march=native -funroll-loops --param max-unroll-times=4 -floop-nest-optimize -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $ffile -o $LIBFTEST`)
1920
end
2021

@@ -46,6 +47,30 @@ function fgemm_builtin!(C, A, B)
4647
C, A, B, Ref(M), Ref(K), Ref(N)
4748
)
4849
end
50+
function cAtmulB!(C, A, B)
51+
M, N = size(C); K = size(B, 1)
52+
ccall(
53+
(:AtmulB, LIBCTEST), Cvoid,
54+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong, Clong),
55+
C, A, B, M, K, N
56+
)
57+
end
58+
function fAtmulB!(C, A, B)
59+
M, N = size(C); K = size(B, 1)
60+
ccall(
61+
(:AtmulB, LIBFTEST), Cvoid,
62+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
63+
C, A, B, Ref(M), Ref(K), Ref(N)
64+
)
65+
end
66+
function fAtmulB_builtin!(C, A, B)
67+
M, N = size(C); K = size(B, 1)
68+
ccall(
69+
(:AtmulBbuiltin, LIBFTEST), Cvoid,
70+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}, Ref{Clong}),
71+
C, A, B, Ref(M), Ref(K), Ref(N)
72+
)
73+
end
4974

5075
function cdot(a, b)
5176
N = length(a)
@@ -83,6 +108,24 @@ function fselfdot(a)
83108
)
84109
d[]
85110
end
111+
function cdot3(x, A, y)
112+
M, N = size(A)
113+
ccall(
114+
(:dot3, LIBCTEST), Float64,
115+
(Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Clong, Clong),
116+
x, A, y, M, N
117+
)
118+
end
119+
function fdot3(x, A, y)
120+
M, N = size(A)
121+
d = Ref{Float64}()
122+
ccall(
123+
(:dot3, LIBFTEST), Cvoid,
124+
(Ref{Float64}, Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Clong}, Ref{Clong}),
125+
d, x, A, y, Ref(M), Ref(N)
126+
)
127+
d[]
128+
end
86129

87130
function cgemv!(y, A, x)
88131
M, K = size(A)
@@ -143,6 +186,14 @@ function fOLSlp(y, X, β)
143186
)
144187
lp[]
145188
end
189+
function cvexp!(b, a)
190+
N = length(b)
191+
ccall(
192+
(:vexp, LIBCTEST), Cvoid,
193+
(Ptr{Float64}, Ptr{Float64}, Clong),
194+
b, a, N
195+
)
196+
end
146197
function fvexp!(b, a)
147198
N = length(b)
148199
ccall(

benchmarks/looptests.c

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
1+
#include<math.h>
22

33
void gemm_mnk(double* restrict C, double* restrict A, double* restrict B, long M, long K, long N){
44
for (long i = 0; i < M*N; i++){
@@ -78,6 +78,19 @@ void gemm_knm(double* restrict C, double* restrict A, double* restrict B, long M
7878
}
7979
return;
8080
}
81+
void AtmulB(double* restrict C, double* restrict At, double* restrict B, long M, long K, long N){
82+
for (long i = 0; i < M*N; i++){
83+
C[i] = 0.0;
84+
}
85+
for (long n = 0; n < N; n++){
86+
for (long m = 0; m < M; m++){
87+
for (long k = 0; k < K; k++){
88+
C[m + n*M] += At[k + m*K] * B[k + n*K];
89+
}
90+
}
91+
}
92+
return;
93+
}
8194
double dot(double* restrict a, double* restrict b, long N){
8295
double s = 0.0;
8396
for (long n = 0; n < N; n++){
@@ -92,7 +105,15 @@ double selfdot(double* restrict a, long N){
92105
}
93106
return s;
94107
}
95-
108+
double dot3(double* restrict x, double* restrict A, double* restrict y, long M, long N){
109+
double s = 0.0;
110+
for (long n = 0; n < N; n++){
111+
for (long m = 0; m < M; m++){
112+
s += x[m] * A[m + n*M] * y[n];
113+
}
114+
}
115+
return s;
116+
}
96117
void gemv(double* restrict y, double* restrict A, double* restrict x, long M, long K){
97118
for (long m = 0; m < M; m++){
98119
y[m] = 0.0;
@@ -104,7 +125,19 @@ void gemv(double* restrict y, double* restrict A, double* restrict x, long M, l
104125
}
105126
return;
106127
}
107-
128+
double svexp(double* restrict a, long N){
129+
double s = 0.0;
130+
for (long n = 0; n < N; n++){
131+
s += exp(a[n]);
132+
}
133+
return s;
134+
}
135+
void vexp(double* restrict b, double* restrict a, long N){
136+
for (long n = 0; n < N; n++){
137+
b[n] = exp(a[n]);
138+
}
139+
return;
140+
}
108141
void unscaledvar(double* restrict s, double* restrict A, double* restrict xb, long M, long N){
109142
for (long m = 0; m < M; m++){
110143
s[m] = 0.0;
@@ -117,7 +150,6 @@ void unscaledvar(double* restrict s, double* restrict A, double* restrict xb, lo
117150
}
118151
return;
119152
}
120-
121153
void aplusBc(double* restrict D, double* restrict a, double* restrict B, double* restrict c, long M, long N){
122154
for (long n = 0; n < N; n++){
123155
for (long m = 0; m < M; m++){

0 commit comments

Comments
 (0)