Skip to content

Commit 5fa93cc

Browse files
committed
Some progress.
2 parents 15d5c66 + 3bd2fdf commit 5fa93cc

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

43 files changed

+795
-388
lines changed

Manifest.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
7676

7777
[[VectorizationBase]]
7878
deps = ["CpuId", "LinearAlgebra"]
79-
git-tree-sha1 = "1e8a90888ec61405ea345c1ac2bdc7d86b99bd69"
79+
git-tree-sha1 = "b68b3234127d7839280f39bd668fd0025633aa01"
8080
uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
81-
version = "0.8.2"
81+
version = "0.8.5"

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ OffsetArrays = "1"
1616
Parameters = "0"
1717
SIMDPirates = "0.7"
1818
SLEEFPirates = "0.4"
19-
VectorizationBase = "0.8"
19+
VectorizationBase = "0.8.5"
2020
julia = "1.1"
2121

2222
[extras]

benchmark/benchmarkflops.jl

Lines changed: 94 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,15 @@ function Base.getindex(br::SizedResults, row, col)
2525
col == 1 ? string(br.sizes[row]) : string(br.results[col - 1, row])
2626
end
2727
Base.setindex!(br::BenchmarkResult, v, i...) = br.sizedresults.results[i...] = v
28-
28+
function Base.vcat(br1::BenchmarkResult, br2::BenchmarkResult)
29+
BenchmarkResult(
30+
br1.tests,
31+
SizedResults(
32+
SharedMatrix(hcat(br1.sizedresults.results, br2.sizedresults.results)),
33+
vcat(br1.sizedresults.sizes, br2.sizedresults.sizes)
34+
)
35+
)
36+
end
2937

3038
tothreetuple(i::Int) = (i,i,i)
3139
tothreetuple(i::NTuple{3,Int}) = i
@@ -52,9 +60,10 @@ function matmul_bench!(br, C, A, B, i)
5260
@assert C ≈ Cblas "eigen gemm wrong?"; fill!(C, NaN)
5361
br[10,i] = n_gflop / @belapsed iegemm!($C, $A, $B)
5462
@assert C ≈ Cblas "i-eigen gemm wrong?"; fill!(C, NaN)
55-
br[11,i] = n_gflop / @belapsed dgemmjit!($C, $A, $B)
56-
@assert C ≈ Cblas "MKL JIT gemm wrong?"; fill!(C, NaN)
57-
br[12,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
63+
# br[11,i] = n_gflop / @belapsed dgemmjit!($C, $A, $B)
64+
# @assert C ≈ Cblas "MKL JIT gemm wrong?"; fill!(C, NaN)
65+
# br[12,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
66+
br[end,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
5867
@assert C ≈ Cblas "LoopVec gemm wrong?"
5968
end
6069
function A_mul_B_bench!(br, s, i)
@@ -93,35 +102,36 @@ function At_mul_Bt_bench!(br, s, i)
93102
matmul_bench!(br, C, A, B, i)
94103
end
95104

96-
const BLASTESTS = [
105+
blastests() = [
97106
BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS",
98107
"Julia", "Clang-Polly",
99108
"GFortran", "GFort-intrinsic",
100109
"icc", "ifort", "ifort-intrinsic",
101-
"Clang++ & Eigen-3", "icpc & Eigen-3",
102-
"MKL JIT", "LoopVectorization"
110+
"g++ & Eigen-3", "icpc & Eigen-3",
111+
"LoopVectorization"
112+
# "MKL JIT", "LoopVectorization"
103113
]
104114

105115
function benchmark_AmulB(sizes)
106-
br = BenchmarkResult(BLASTESTS, sizes)
116+
br = BenchmarkResult(blastests(), sizes)
107117
sm = br.sizedresults.results
108118
pmap(is -> A_mul_B_bench!(sm, is[2], is[1]), enumerate(sizes))
109119
br
110120
end
111121
function benchmark_AmulBt(sizes)
112-
br = BenchmarkResult(BLASTESTS, sizes)
122+
br = BenchmarkResult(blastests(), sizes)
113123
sm = br.sizedresults.results
114124
pmap(is -> A_mul_Bt_bench!(sm, is[2], is[1]), enumerate(sizes))
115125
br
116126
end
117127
function benchmark_AtmulB(sizes)
118-
br = BenchmarkResult(BLASTESTS, sizes)
128+
br = BenchmarkResult(blastests(), sizes)
119129
sm = br.sizedresults.results
120130
pmap(is -> At_mul_B_bench!(sm, is[2], is[1]), enumerate(sizes))
121131
br
122132
end
123133
function benchmark_AtmulBt(sizes)
124-
br = BenchmarkResult(BLASTESTS, sizes)
134+
br = BenchmarkResult(blastests(), sizes)
125135
sm = br.sizedresults.results
126136
pmap(is -> At_mul_Bt_bench!(sm, is[2], is[1]), enumerate(sizes))
127137
br
@@ -150,7 +160,7 @@ function dot_bench!(br, s, i)
150160
@assert jdotavx(a,b) ≈ dotblas "LoopVec dot wrong?"
151161
end
152162
function benchmark_dot(sizes)
153-
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "Clang++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
163+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
154164
br = BenchmarkResult(tests, sizes)
155165
sm = br.sizedresults.results
156166
pmap(is -> dot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -179,7 +189,7 @@ function selfdot_bench!(br, s, i)
179189
@assert jselfdotavx(a) ≈ dotblas "LoopVec dot wrong?"
180190
end
181191
function benchmark_selfdot(sizes)
182-
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "Clang++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
192+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
183193
br = BenchmarkResult(tests, sizes)
184194
sm = br.sizedresults.results
185195
pmap(is -> selfdot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -211,9 +221,9 @@ function gemv_bench!(br, x, A, y, i)
211221
@assert x ≈ xblas "eigen wrong?"; fill!(x, NaN);
212222
br[10,i] = n_gflop / @belapsed iegemv!($x, $A, $y)
213223
@assert x ≈ xblas "i-eigen wrong?"; fill!(x, NaN);
214-
br[11,i] = n_gflop / @belapsed dgemmjit!($x, $A, $y)
215-
@assert x ≈ xblas "gemmjit wrong?"; fill!(x, NaN);
216-
br[12,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
224+
# br[11,i] = n_gflop / @belapsed dgemmjit!($x, $A, $y)
225+
# @assert x ≈ xblas "gemmjit wrong?"; fill!(x, NaN);
226+
br[end,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
217227
@assert x ≈ xblas "LoopVec wrong?"
218228
end
219229
function A_mul_vb_bench!(br, s, i)
@@ -231,13 +241,13 @@ function At_mul_vb_bench!(br, s, i)
231241
gemv_bench!(br, x, A, y, i)
232242
end
233243
function benchmark_Amulvb(sizes)
234-
br = BenchmarkResult(BLASTESTS, sizes)
244+
br = BenchmarkResult(blastests(), sizes)
235245
sm = br.sizedresults.results
236246
pmap(is -> A_mul_vb_bench!(sm, is[2], is[1]), enumerate(sizes))
237247
br
238248
end
239249
function benchmark_Atmulvb(sizes)
240-
br = BenchmarkResult(BLASTESTS, sizes)
250+
br = BenchmarkResult(blastests(), sizes)
241251
sm = br.sizedresults.results
242252
pmap(is -> At_mul_vb_bench!(sm, is[2], is[1]), enumerate(sizes))
243253
br
@@ -267,7 +277,7 @@ function dot3_bench!(br, s, i)
267277
@assert jdot3avx(x, A, y) ≈ dotblas "LoopVec dot wrong?"
268278
end
269279
function benchmark_dot3(sizes)
270-
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "Clang++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
280+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
271281
br = BenchmarkResult(tests, sizes)
272282
sm = br.sizedresults.results
273283
pmap(is -> dot3_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -306,7 +316,7 @@ function sse_bench!(br, s, i)
306316
@assert jOLSlp_avx(y, X, β) ≈ lpblas "LoopVec wrong?"
307317
end
308318
function benchmark_sse(sizes)
309-
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "Clang++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
319+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
310320
br = BenchmarkResult(tests, sizes)
311321
sm = br.sizedresults.results
312322
pmap(is -> sse_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -360,7 +370,7 @@ function aplusBc_bench!(br, s, i)
360370
@assert D ≈ Dcopy "LoopVec wrong?"
361371
end
362372
function benchmark_aplusBc(sizes)
363-
tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "Clang++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
373+
tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
364374
br = BenchmarkResult(tests, sizes)
365375
sm = br.sizedresults.results
366376
pmap(is -> aplusBc_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -392,7 +402,7 @@ function AplusAt_bench!(br, s, i)
392402
@assert B ≈ baseB "LoopVec wrong?"
393403
end
394404
function benchmark_AplusAt(sizes)
395-
tests = ["Julia", "Clang-Polly", "GFortran", "GFortran-builtin", "icc", "ifort", "ifort-builtin", "Clang++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
405+
tests = ["Julia", "Clang-Polly", "GFortran", "GFortran-builtin", "icc", "ifort", "ifort-builtin", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
396406
br = BenchmarkResult(tests, sizes)
397407
sm = br.sizedresults.results
398408
pmap(is -> AplusAt_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -453,3 +463,65 @@ function benchmark_logdettriangle(sizes)
453463
br
454464
end
455465

466+
467+
function filter2d_bench_run!(br, s, i, K)
468+
A = rand(s + 2, s + 2)
469+
B = OffsetArray(similar(A, (s,s)), 1, 1)
470+
Mk, Nk = size(K)
471+
n_gflop = 1e-9 * (2Mk * Nk - 1) * s^2
472+
br[1,i] = n_gflop / @belapsed filter2d!($B, $A, $K)
473+
Bcopy = copy(B); fill!(B, NaN);
474+
br[2,i] = n_gflop / @belapsed cfilter2d!($B, $A, $K)
475+
@assert B ≈ Bcopy "Clang wrong?"
476+
br[3,i] = n_gflop / @belapsed ffilter2d!($B, $A, $K)
477+
@assert B ≈ Bcopy "Fort wrong?"
478+
br[4,i] = n_gflop / @belapsed icfilter2d!($B, $A, $K)
479+
@assert B ≈ Bcopy "icc wrong?"
480+
br[5,i] = n_gflop / @belapsed iffilter2d!($B, $A, $K)
481+
@assert B ≈ Bcopy "ifort wrong?"
482+
br[6,i] = n_gflop / @belapsed filter2davx!($B, $A, $K)
483+
@assert B ≈ Bcopy "LoopVec wrong?"
484+
end
485+
function benchmark_filter2d(sizes, K)
486+
tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
487+
br = BenchmarkResult(tests, sizes)
488+
sm = br.sizedresults.results
489+
pmap(is -> filter2d_bench_run!(sm, is[2], is[1], K), enumerate(sizes))
490+
br
491+
end
492+
493+
function benchmark_filter2ddynamic(sizes)
494+
K = OffsetArray(rand(Float64, 3, 3), -1:1, -1:1)
495+
benchmark_filter2d(sizes, K)
496+
end
497+
function benchmark_filter2d3x3(sizes)
498+
K = SizedOffsetMatrix{Float64,-1,1,-1,1}(rand(3,3))
499+
benchmark_filter2d(sizes, K)
500+
end
501+
502+
function filter2dunrolled_bench_run!(br, s, i, K)
503+
A = rand(s + 2, s + 2)
504+
B = OffsetArray(similar(A, (s,s)), 1, 1)
505+
Mk, Nk = size(K)
506+
n_gflop = 1e-9 * (2Mk * Nk - 1) * s^2
507+
br[1,i] = n_gflop / @belapsed filter2dunrolled!($B, $A, $K)
508+
Bcopy = copy(B); fill!(B, NaN);
509+
br[2,i] = n_gflop / @belapsed cfilter2dunrolled!($B, $A, $K)
510+
@assert B ≈ Bcopy "Clang wrong?"
511+
br[3,i] = n_gflop / @belapsed ffilter2dunrolled!($B, $A, $K)
512+
@assert B ≈ Bcopy "Fort wrong?"
513+
br[4,i] = n_gflop / @belapsed icfilter2dunrolled!($B, $A, $K)
514+
@assert B ≈ Bcopy "icc wrong?"
515+
br[5,i] = n_gflop / @belapsed iffilter2dunrolled!($B, $A, $K)
516+
@assert B ≈ Bcopy "ifort wrong?"
517+
br[6,i] = n_gflop / @belapsed filter2dunrolledavx!($B, $A, $K)
518+
@assert B ≈ Bcopy "LoopVec wrong?"
519+
end
520+
function benchmark_filter2dunrolled(sizes)
521+
tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
522+
br = BenchmarkResult(tests, sizes)
523+
sm = br.sizedresults.results
524+
K = SizedOffsetMatrix{Float64,-1,1,-1,1}(rand(3,3))
525+
pmap(is -> filter2dunrolled_bench_run!(sm, is[2], is[1], K), enumerate(sizes))
526+
br
527+
end

benchmark/driver.jl

Lines changed: 28 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,14 @@
22
# const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmarks")
33
# includet(joinpath(LOOPVECBENCHDIR, "driver.jl"))
44

5+
using Distributed
6+
57
pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
68
const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmark")
79
include(joinpath(LOOPVECBENCHDIR, "benchmarkflops.jl"))
810
include(joinpath(LOOPVECBENCHDIR, "plotbenchmarks.jl"))
911

1012

11-
using Distributed
12-
1313
addprocs((Sys.CPU_THREADS >> 1)-1); nprocs()
1414

1515
@everywhere begin
@@ -19,25 +19,36 @@ addprocs((Sys.CPU_THREADS >> 1)-1); nprocs()
1919
# BenchmarkTools.DEFAULT_PARAMETERS.seconds = 1
2020
end
2121

22-
AmulB_bench = benchmark_AmulB(2:256)
23-
AmulBt_bench = benchmark_AmulBt(2:256)
24-
AtmulB_bench = benchmark_AtmulB(2:256)
25-
AtmulBt_bench = benchmark_AtmulBt(2:256)
26-
dot_bench = benchmark_dot(2:256)
27-
selfdot_bench = benchmark_selfdot(2:256)
28-
Amulvb_bench = benchmark_Amulvb(2:256)
29-
Atmulvb_bench = benchmark_Atmulvb(2:256)
30-
dot3_bench = benchmark_dot3(2:256)
31-
sse_bench = benchmark_sse(2:256)
32-
aplusBc_bench = benchmark_aplusBc(2:256)
33-
AplusAt_bench = benchmark_AplusAt(2:256)
34-
exp_bench = benchmark_exp(2:256)
35-
randomaccess_bench = benchmark_random_access(2:256)
36-
logdettriangle_bench = benchmark_logdettriangle(2:256)
22+
23+
# sizes = 23:23
24+
sizes = 256:-1:2
25+
26+
filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes)#512:-1:2)
27+
filter2d_3x3_bench = benchmark_filter2d3x3(sizes)#512:-1:2)
28+
filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes)#512:-1:2)
29+
30+
AmulB_bench = benchmark_AmulB(sizes)
31+
AmulBt_bench = benchmark_AmulBt(sizes)
32+
AtmulB_bench = benchmark_AtmulB(sizes)
33+
AtmulBt_bench = benchmark_AtmulBt(sizes)
34+
dot_bench = benchmark_dot(sizes)
35+
selfdot_bench = benchmark_selfdot(sizes)
36+
Amulvb_bench = benchmark_Amulvb(sizes)
37+
Atmulvb_bench = benchmark_Atmulvb(sizes)
38+
dot3_bench = benchmark_dot3(sizes)
39+
sse_bench = benchmark_sse(sizes)
40+
aplusBc_bench = benchmark_aplusBc(sizes)
41+
AplusAt_bench = benchmark_AplusAt(sizes)
42+
exp_bench = benchmark_exp(sizes)
43+
randomaccess_bench = benchmark_random_access(sizes)
44+
logdettriangle_bench = benchmark_logdettriangle(sizes)
3745

3846
v = 1
3947
filetype = "svg"
4048
const PICTURES = joinpath(pkgdir("LoopVectorization"), "docs", "src", "assets")
49+
save(joinpath(PICTURES, "bench_filter2d_dynamic_v$v.$filetype"), plot(filter2d_dynamic_bench));
50+
save(joinpath(PICTURES, "bench_filter2d_3x3_v$v.$filetype"), plot(filter2d_3x3_bench));
51+
save(joinpath(PICTURES, "bench_filter2d_unrolled_v$v.$filetype"), plot(filter2d_unrolled_bench));
4152
save(joinpath(PICTURES, "bench_AmulB_v$v.$filetype"), plot(AmulB_bench));
4253
save(joinpath(PICTURES, "bench_AmulBt_v$v.$filetype"), plot(AmulBt_bench));
4354
save(joinpath(PICTURES, "bench_AtmulB_v$v.$filetype"), plot(AtmulB_bench));

0 commit comments

Comments
 (0)