JuliaSIMD
diff --git a/‎Manifest.toml
Lines changed: 2 additions & 2 deletions b/‎Manifest.toml
Lines changed: 2 additions & 2 deletions
diff --git a/‎Project.toml
Lines changed: 1 addition & 1 deletion b/‎Project.toml
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmark/benchmarkflops.jl
Lines changed: 94 additions & 22 deletions b/‎benchmark/benchmarkflops.jl
Lines changed: 94 additions & 22 deletions
diff --git a/‎benchmark/driver.jl
Lines changed: 28 additions & 17 deletions b/‎benchmark/driver.jl
Lines changed: 28 additions & 17 deletions
@@ -76,6 +76,6 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [[VectorizationBase]]
 deps = ["CpuId", "LinearAlgebra"]
-git-tree-sha1 = "1e8a90888ec61405ea345c1ac2bdc7d86b99bd69"
+git-tree-sha1 = "b68b3234127d7839280f39bd668fd0025633aa01"
 uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
-version = "0.8.2"
+version = "0.8.5"
@@ -16,7 +16,7 @@ OffsetArrays = "1"
 Parameters = "0"
 SIMDPirates = "0.7"
 SLEEFPirates = "0.4"
-VectorizationBase = "0.8"
+VectorizationBase = "0.8.5"
 julia = "1.1"
 
 [extras]
 
@@ -25,7 +25,15 @@ function Base.getindex(br::SizedResults, row, col)
     col == 1 ? string(br.sizes[row]) : string(br.results[col - 1, row])
 end
 Base.setindex!(br::BenchmarkResult, v, i...) = br.sizedresults.results[i...] = v
-
+# Combine two benchmark runs into one `BenchmarkResult`.
+# The result matrices are joined with `hcat` and the size lists with `vcat`;
+# the test labels are taken from `br1` only.
+# NOTE(review): `br2.tests` is not compared with `br1.tests` -- presumably both
+# runs come from the same benchmark; confirm at the call site.
+function Base.vcat(br1::BenchmarkResult, br2::BenchmarkResult)
+    first_run, second_run = br1.sizedresults, br2.sizedresults
+    merged_results = SharedMatrix(hcat(first_run.results, second_run.results))
+    merged_sizes = vcat(first_run.sizes, second_run.sizes)
+    BenchmarkResult(br1.tests, SizedResults(merged_results, merged_sizes))
+end
 
 tothreetuple(i::Int) = (i,i,i)
 tothreetuple(i::NTuple{3,Int}) = i
@@ -52,9 +60,10 @@ function matmul_bench!(br, C, A, B, i)
     @assert C ≈ Cblas "eigen gemm wrong?"; fill!(C, NaN)
     br[10,i] = n_gflop / @belapsed iegemm!($C, $A, $B)
     @assert C ≈ Cblas "i-eigen gemm wrong?"; fill!(C, NaN)
-    br[11,i] = n_gflop / @belapsed dgemmjit!($C, $A, $B)
-    @assert C ≈ Cblas "MKL JIT gemm wrong?"; fill!(C, NaN)
-    br[12,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
+    # br[11,i] = n_gflop / @belapsed dgemmjit!($C, $A, $B)
+    # @assert C ≈ Cblas "MKL JIT gemm wrong?"; fill!(C, NaN)
+    # br[12,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
+    br[end,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
     @assert C ≈ Cblas "LoopVec gemm wrong?"
 end
 function A_mul_B_bench!(br, s, i)
@@ -93,35 +102,36 @@ function At_mul_Bt_bench!(br, s, i)
     matmul_bench!(br, C, A, B, i)
 end
 
-const BLASTESTS = [
+blastests() = [
     BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS",
     "Julia", "Clang-Polly",
     "GFortran", "GFort-intrinsic",
     "icc", "ifort", "ifort-intrinsic",
-    "Clang++ & Eigen-3", "icpc & Eigen-3",
-    "MKL JIT", "LoopVectorization"
+    "g++ & Eigen-3", "icpc & Eigen-3",
+    "LoopVectorization"
+    # "MKL JIT", "LoopVectorization"
 ]
 
 function benchmark_AmulB(sizes)
-    br = BenchmarkResult(BLASTESTS, sizes)
+    br = BenchmarkResult(blastests(), sizes)
     sm = br.sizedresults.results
     pmap(is -> A_mul_B_bench!(sm, is[2], is[1]), enumerate(sizes))
     br
 end
 function benchmark_AmulBt(sizes)
-    br = BenchmarkResult(BLASTESTS, sizes)
+    br = BenchmarkResult(blastests(), sizes)
     sm = br.sizedresults.results
     pmap(is -> A_mul_Bt_bench!(sm, is[2], is[1]), enumerate(sizes))
     br
 end
 function benchmark_AtmulB(sizes)
-    br = BenchmarkResult(BLASTESTS, sizes)
+    br = BenchmarkResult(blastests(), sizes)
     sm = br.sizedresults.results
     pmap(is -> At_mul_B_bench!(sm, is[2], is[1]), enumerate(sizes))
     br
 end
 function benchmark_AtmulBt(sizes)
-    br = BenchmarkResult(BLASTESTS, sizes)
+    br = BenchmarkResult(blastests(), sizes)
     sm = br.sizedresults.results
     pmap(is -> At_mul_Bt_bench!(sm, is[2], is[1]), enumerate(sizes))
     br
@@ -150,7 +160,7 @@ function dot_bench!(br, s, i)
     @assert jdotavx(a,b) ≈ dotblas "LoopVec dot wrong?"
 end
 function benchmark_dot(sizes)
-    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "Clang++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
+    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> dot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -179,7 +189,7 @@ function selfdot_bench!(br, s, i)
     @assert jselfdotavx(a) ≈ dotblas "LoopVec dot wrong?"
 end
 function benchmark_selfdot(sizes)
-    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "Clang++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
+    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> selfdot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -211,9 +221,9 @@ function gemv_bench!(br, x, A, y, i)
     @assert x ≈ xblas "eigen wrong?"; fill!(x, NaN);
     br[10,i] = n_gflop / @belapsed iegemv!($x, $A, $y)
     @assert x ≈ xblas "i-eigen wrong?"; fill!(x, NaN);
-    br[11,i] = n_gflop / @belapsed dgemmjit!($x, $A, $y)
-    @assert x ≈ xblas "gemmjit wrong?"; fill!(x, NaN);
-    br[12,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
+    # br[11,i] = n_gflop / @belapsed dgemmjit!($x, $A, $y)
+    # @assert x ≈ xblas "gemmjit wrong?"; fill!(x, NaN);
+    br[end,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
     @assert x ≈ xblas "LoopVec wrong?"
 end
 function A_mul_vb_bench!(br, s, i)
@@ -231,13 +241,13 @@ function At_mul_vb_bench!(br, s, i)
     gemv_bench!(br, x, A, y, i)
 end
 function benchmark_Amulvb(sizes)
-    br = BenchmarkResult(BLASTESTS, sizes)
+    br = BenchmarkResult(blastests(), sizes)
     sm = br.sizedresults.results
     pmap(is -> A_mul_vb_bench!(sm, is[2], is[1]), enumerate(sizes))
     br
 end
 function benchmark_Atmulvb(sizes)
-    br = BenchmarkResult(BLASTESTS, sizes)
+    br = BenchmarkResult(blastests(), sizes)
     sm = br.sizedresults.results
     pmap(is -> At_mul_vb_bench!(sm, is[2], is[1]), enumerate(sizes))
     br
@@ -267,7 +277,7 @@ function dot3_bench!(br, s, i)
     @assert jdot3avx(x, A, y) ≈ dotblas "LoopVec dot wrong?"
 end
 function benchmark_dot3(sizes)
-    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "Clang++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
+    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> dot3_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -306,7 +316,7 @@ function sse_bench!(br, s, i)
     @assert jOLSlp_avx(y, X, β) ≈ lpblas "LoopVec wrong?"
 end
 function benchmark_sse(sizes)
-    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "Clang++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
+    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> sse_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -360,7 +370,7 @@ function aplusBc_bench!(br, s, i)
     @assert D ≈ Dcopy "LoopVec wrong?"
 end
 function benchmark_aplusBc(sizes)
-    tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "Clang++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
+    tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> aplusBc_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -392,7 +402,7 @@ function AplusAt_bench!(br, s, i)
     @assert B ≈ baseB "LoopVec wrong?"
 end
 function benchmark_AplusAt(sizes)
-    tests = ["Julia", "Clang-Polly", "GFortran", "GFortran-builtin", "icc", "ifort", "ifort-builtin", "Clang++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
+    tests = ["Julia", "Clang-Polly", "GFortran", "GFortran-builtin", "icc", "ifort", "ifort-builtin", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> AplusAt_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -453,3 +463,65 @@ function benchmark_logdettriangle(sizes)
     br
 end
 
+
+# Benchmark each 2-d filter implementation on an `s`-by-`s` output and store the
+# measured GFLOPS in column `i` of `br`, one row per implementation.
+#
+# Arguments:
+#   br - indexable result storage written as `br[row, i]`
+#   s  - side length of the square output
+#   i  - column of `br` that receives the results for this size
+#   K  - filter kernel; only `size(K)` is read here, the rest is up to the kernels
+#
+# Every implementation after the first is checked against the output of
+# `filter2d!`. `B` is refilled with NaN before each subsequent run so that an
+# implementation that fails to write `B` cannot pass the check on the previous
+# implementation's output (NaN never compares `≈` to anything).
+function filter2d_bench_run!(br, s, i, K)
+    # The input is 2 larger than the output in each dimension and the output is
+    # offset by 1, which fits the 3x3 kernels built by the callers in this file.
+    A = rand(s + 2, s + 2)
+    B = OffsetArray(similar(A, (s,s)), 1, 1)
+    Mk, Nk = size(K)
+    # Per output element: Mk*Nk multiplies plus Mk*Nk - 1 additions.
+    n_gflop = 1e-9 * (2Mk * Nk - 1) * s^2
+    br[1,i] = n_gflop / @belapsed filter2d!($B, $A, $K)
+    # The first implementation's output is the reference for all later checks.
+    Bcopy = copy(B); fill!(B, NaN);
+    br[2,i] = n_gflop / @belapsed cfilter2d!($B, $A, $K)
+    @assert B ≈ Bcopy "Clang wrong?"; fill!(B, NaN);
+    br[3,i] = n_gflop / @belapsed ffilter2d!($B, $A, $K)
+    @assert B ≈ Bcopy "Fort wrong?"; fill!(B, NaN);
+    br[4,i] = n_gflop / @belapsed icfilter2d!($B, $A, $K)
+    @assert B ≈ Bcopy "icc wrong?"; fill!(B, NaN);
+    br[5,i] = n_gflop / @belapsed iffilter2d!($B, $A, $K)
+    @assert B ≈ Bcopy "ifort wrong?"; fill!(B, NaN);
+    br[6,i] = n_gflop / @belapsed filter2davx!($B, $A, $K)
+    @assert B ≈ Bcopy "LoopVec wrong?"
+end
+# Run the 2-d filter benchmark with kernel `K` for every entry of `sizes`,
+# distributing the sizes over the available workers with `pmap`.
+# Returns the `BenchmarkResult` whose result matrix the workers fill in.
+function benchmark_filter2d(sizes, K)
+    labels = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
+    result = BenchmarkResult(labels, sizes)
+    shared = result.sizedresults.results
+    # Each work item is (column index, size); the column index selects where
+    # the measurements for that size are stored.
+    pmap(enumerate(sizes)) do (col, s)
+        filter2d_bench_run!(shared, s, col, K)
+    end
+    result
+end
+
+# Benchmark the 2-d filter with a random 3x3 `Float64` kernel held in an
+# `OffsetArray` indexed -1:1 along both axes.
+function benchmark_filter2ddynamic(sizes)
+    kernel = OffsetArray(rand(Float64, 3, 3), -1:1, -1:1)
+    return benchmark_filter2d(sizes, kernel)
+end
+# Benchmark the 2-d filter with a random 3x3 kernel wrapped in a
+# `SizedOffsetMatrix`.
+# NOTE(review): the type parameters -1,1,-1,1 presumably encode the index
+# bounds of each axis -- confirm against the `SizedOffsetMatrix` definition.
+function benchmark_filter2d3x3(sizes)
+    kernel = SizedOffsetMatrix{Float64,-1,1,-1,1}(rand(3,3))
+    return benchmark_filter2d(sizes, kernel)
+end
+
+# Benchmark each "unrolled" 2-d filter implementation on an `s`-by-`s` output
+# and store the measured GFLOPS in column `i` of `br`, one row per
+# implementation.
+#
+# Arguments:
+#   br - indexable result storage written as `br[row, i]`
+#   s  - side length of the square output
+#   i  - column of `br` that receives the results for this size
+#   K  - filter kernel; only `size(K)` is read here, the rest is up to the kernels
+#
+# Every implementation after the first is checked against the output of
+# `filter2dunrolled!`. `B` is refilled with NaN before each subsequent run so
+# that an implementation that fails to write `B` cannot pass the check on the
+# previous implementation's output (NaN never compares `≈` to anything).
+function filter2dunrolled_bench_run!(br, s, i, K)
+    # The input is 2 larger than the output in each dimension and the output is
+    # offset by 1, which fits the 3x3 kernel built by the caller in this file.
+    A = rand(s + 2, s + 2)
+    B = OffsetArray(similar(A, (s,s)), 1, 1)
+    Mk, Nk = size(K)
+    # Per output element: Mk*Nk multiplies plus Mk*Nk - 1 additions.
+    n_gflop = 1e-9 * (2Mk * Nk - 1) * s^2
+    br[1,i] = n_gflop / @belapsed filter2dunrolled!($B, $A, $K)
+    # The first implementation's output is the reference for all later checks.
+    Bcopy = copy(B); fill!(B, NaN);
+    br[2,i] = n_gflop / @belapsed cfilter2dunrolled!($B, $A, $K)
+    @assert B ≈ Bcopy "Clang wrong?"; fill!(B, NaN);
+    br[3,i] = n_gflop / @belapsed ffilter2dunrolled!($B, $A, $K)
+    @assert B ≈ Bcopy "Fort wrong?"; fill!(B, NaN);
+    br[4,i] = n_gflop / @belapsed icfilter2dunrolled!($B, $A, $K)
+    @assert B ≈ Bcopy "icc wrong?"; fill!(B, NaN);
+    br[5,i] = n_gflop / @belapsed iffilter2dunrolled!($B, $A, $K)
+    @assert B ≈ Bcopy "ifort wrong?"; fill!(B, NaN);
+    br[6,i] = n_gflop / @belapsed filter2dunrolledavx!($B, $A, $K)
+    @assert B ≈ Bcopy "LoopVec wrong?"
+end
+# Run the unrolled 2-d filter benchmark for every entry of `sizes`, using one
+# random 3x3 `SizedOffsetMatrix` kernel shared by all sizes, and distribute the
+# sizes over the available workers with `pmap`.
+# Returns the `BenchmarkResult` whose result matrix the workers fill in.
+function benchmark_filter2dunrolled(sizes)
+    labels = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
+    result = BenchmarkResult(labels, sizes)
+    shared = result.sizedresults.results
+    kernel = SizedOffsetMatrix{Float64,-1,1,-1,1}(rand(3,3))
+    # Each work item is (column index, size); the column index selects where
+    # the measurements for that size are stored.
+    pmap(enumerate(sizes)) do (col, s)
+        filter2dunrolled_bench_run!(shared, s, col, kernel)
+    end
+    result
+end
@@ -2,14 +2,14 @@
 # const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmarks")
 # includet(joinpath(LOOPVECBENCHDIR, "driver.jl"))
 
+using Distributed
+
 pkgdir(pkg::String) = abspath(joinpath(dirname(Base.find_package(pkg)), ".."))
 const LOOPVECBENCHDIR = joinpath(pkgdir("LoopVectorization"), "benchmark")
 include(joinpath(LOOPVECBENCHDIR, "benchmarkflops.jl"))
 include(joinpath(LOOPVECBENCHDIR, "plotbenchmarks.jl"))
 
 
-using Distributed
-
 addprocs((Sys.CPU_THREADS >> 1)-1); nprocs()
 
 @everywhere begin
@@ -19,25 +19,36 @@ addprocs((Sys.CPU_THREADS >> 1)-1); nprocs()
     # BenchmarkTools.DEFAULT_PARAMETERS.seconds = 1
 end
 
-AmulB_bench = benchmark_AmulB(2:256)
-AmulBt_bench = benchmark_AmulBt(2:256)
-AtmulB_bench = benchmark_AtmulB(2:256)
-AtmulBt_bench = benchmark_AtmulBt(2:256)
-dot_bench = benchmark_dot(2:256)
-selfdot_bench = benchmark_selfdot(2:256)
-Amulvb_bench = benchmark_Amulvb(2:256)
-Atmulvb_bench = benchmark_Atmulvb(2:256)
-dot3_bench = benchmark_dot3(2:256)
-sse_bench = benchmark_sse(2:256)
-aplusBc_bench = benchmark_aplusBc(2:256)
-AplusAt_bench = benchmark_AplusAt(2:256)
-exp_bench = benchmark_exp(2:256)
-randomaccess_bench = benchmark_random_access(2:256)
-logdettriangle_bench = benchmark_logdettriangle(2:256)
+
+# sizes = 23:23
+sizes = 256:-1:2
+
+filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes)#512:-1:2)
+filter2d_3x3_bench = benchmark_filter2d3x3(sizes)#512:-1:2)
+filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes)#512:-1:2)
+
+AmulB_bench = benchmark_AmulB(sizes)
+AmulBt_bench = benchmark_AmulBt(sizes)
+AtmulB_bench = benchmark_AtmulB(sizes)
+AtmulBt_bench = benchmark_AtmulBt(sizes)
+dot_bench = benchmark_dot(sizes)
+selfdot_bench = benchmark_selfdot(sizes)
+Amulvb_bench = benchmark_Amulvb(sizes)
+Atmulvb_bench = benchmark_Atmulvb(sizes)
+dot3_bench = benchmark_dot3(sizes)
+sse_bench = benchmark_sse(sizes)
+aplusBc_bench = benchmark_aplusBc(sizes)
+AplusAt_bench = benchmark_AplusAt(sizes)
+exp_bench = benchmark_exp(sizes)
+randomaccess_bench = benchmark_random_access(sizes)
+logdettriangle_bench = benchmark_logdettriangle(sizes)
 
 v = 1
 filetype = "svg"
 const PICTURES = joinpath(pkgdir("LoopVectorization"), "docs", "src", "assets")
+save(joinpath(PICTURES, "bench_filter2d_dynamic_v$v.$filetype"), plot(filter2d_dynamic_bench));
+save(joinpath(PICTURES, "bench_filter2d_3x3_v$v.$filetype"), plot(filter2d_3x3_bench));
+save(joinpath(PICTURES, "bench_filter2d_unrolled_v$v.$filetype"), plot(filter2d_unrolled_bench));
 save(joinpath(PICTURES, "bench_AmulB_v$v.$filetype"), plot(AmulB_bench));
 save(joinpath(PICTURES, "bench_AmulBt_v$v.$filetype"), plot(AmulBt_bench));
 save(joinpath(PICTURES, "bench_AtmulB_v$v.$filetype"), plot(AtmulB_bench));