Skip to content

Commit 6f299fa

Browse files
committed
Rerun benchmarks, and replace tile=(U1,U2) with unroll=(U1,U2), bumping version to 0.7.0.
1 parent 04e48b4 commit 6f299fa

36 files changed

+161
-115
lines changed

Project.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "LoopVectorization"
22
uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
33
authors = ["Chris Elrod <[email protected]>"]
4-
version = "0.6.31"
4+
version = "0.7.0"
55

66
[deps]
77
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"

benchmark/benchmarkflops.jl

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ function matmul_bench!(br, C, A, B, i)
4545
br[2,i] = n_gflop / @belapsed jgemm!($C, $A, $B)
4646
@assert C Cblas "Julia gemm wrong?"; fill!(C, NaN)
4747
br[3,i] = n_gflop / @belapsed cgemm!($C, $A, $B)
48-
@assert C Cblas "Polly gemm wrong?"; fill!(C, NaN)
48+
@assert C Cblas "Clang gemm wrong?"; fill!(C, NaN)
4949
br[4,i] = n_gflop / @belapsed fgemm!($C, $A, $B)
5050
@assert C Cblas "Fort gemm wrong?"; fill!(C, NaN)
5151
br[5,i] = n_gflop / @belapsed fgemm_builtin!($C, $A, $B)
@@ -60,8 +60,8 @@ function matmul_bench!(br, C, A, B, i)
6060
@assert C Cblas "eigen gemm wrong?"; fill!(C, NaN)
6161
br[10,i] = n_gflop / @belapsed iegemm!($C, $A, $B)
6262
@assert C Cblas "i-eigen gemm wrong?"; fill!(C, NaN)
63-
# br[11,i] = n_gflop / @belapsed dgemmjit!($C, $A, $B)
64-
# @assert C ≈ Cblas "MKL JIT gemm wrong?"; fill!(C, NaN)
63+
br[11,i] = n_gflop / @belapsed dgemmmkl!($C, $A, $B)
64+
@assert C Cblas "MKL JIT gemm wrong?"; fill!(C, NaN)
6565
# br[12,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
6666
br[end,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
6767
@assert C Cblas "LoopVec gemm wrong?"
@@ -104,10 +104,10 @@ end
104104

105105
blastests() = [
106106
BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS",
107-
"Julia", "Clang-Polly",
107+
"Julia", "Clang",
108108
"GFortran", "GFort-intrinsic",
109109
"icc", "ifort", "ifort-intrinsic",
110-
"g++ & Eigen-3", "icpc & Eigen-3",
110+
"g++ & Eigen-3", "icpc & Eigen-3", "MKL",
111111
"LoopVectorization"
112112
# "MKL JIT", "LoopVectorization"
113113
]
@@ -145,7 +145,7 @@ function dot_bench!(br, s, i)
145145
br[2,i] = n_gflop / @belapsed jdot($a, $b)
146146
@assert jdot(a,b) dotblas "Julia dot wrong?"
147147
br[3,i] = n_gflop / @belapsed cdot($a, $b)
148-
@assert cdot(a,b) dotblas "Polly dot wrong?"
148+
@assert cdot(a,b) dotblas "Clang dot wrong?"
149149
br[4,i] = n_gflop / @belapsed fdot($a, $b)
150150
@assert fdot(a,b) dotblas "Fort dot wrong?"
151151
br[5,i] = n_gflop / @belapsed icdot($a, $b)
@@ -160,7 +160,7 @@ function dot_bench!(br, s, i)
160160
@assert jdotavx(a,b) dotblas "LoopVec dot wrong?"
161161
end
162162
function benchmark_dot(sizes)
163-
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
163+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
164164
br = BenchmarkResult(tests, sizes)
165165
sm = br.sizedresults.results
166166
pmap(is -> dot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -174,7 +174,7 @@ function selfdot_bench!(br, s, i)
174174
br[2,i] = n_gflop / @belapsed jselfdot($a)
175175
@assert jselfdot(a) dotblas "Julia dot wrong?"
176176
br[3,i] = n_gflop / @belapsed cselfdot($a)
177-
@assert cselfdot(a) dotblas "Polly dot wrong?"
177+
@assert cselfdot(a) dotblas "Clang dot wrong?"
178178
br[4,i] = n_gflop / @belapsed fselfdot($a)
179179
@assert fselfdot(a) dotblas "Fort dot wrong?"
180180
br[5,i] = n_gflop / @belapsed icselfdot($a)
@@ -189,7 +189,7 @@ function selfdot_bench!(br, s, i)
189189
@assert jselfdotavx(a) dotblas "LoopVec dot wrong?"
190190
end
191191
function benchmark_selfdot(sizes)
192-
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
192+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
193193
br = BenchmarkResult(tests, sizes)
194194
sm = br.sizedresults.results
195195
pmap(is -> selfdot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -206,7 +206,7 @@ function gemv_bench!(br, x, A, y, i)
206206
br[2,i] = n_gflop / @belapsed jgemv!($x, $A, $y)
207207
@assert x xblas "Julia wrong?"; fill!(x, NaN);
208208
br[3,i] = n_gflop / @belapsed cgemv!($x, $A, $y)
209-
@assert x xblas "Polly wrong?"; fill!(x, NaN);
209+
@assert x xblas "Clang wrong?"; fill!(x, NaN);
210210
br[4,i] = n_gflop / @belapsed fgemv!($x, $A, $y)
211211
@assert x xblas "Fort wrong?"; fill!(x, NaN);
212212
br[5,i] = n_gflop / @belapsed fgemv_builtin!($x, $A, $y)
@@ -221,8 +221,8 @@ function gemv_bench!(br, x, A, y, i)
221221
@assert x xblas "eigen wrong?"; fill!(x, NaN);
222222
br[10,i] = n_gflop / @belapsed iegemv!($x, $A, $y)
223223
@assert x xblas "i-eigen wrong?"; fill!(x, NaN);
224-
# br[11,i] = n_gflop / @belapsed dgemmjit!($x, $A, $y)
225-
# @assert x ≈ xblas "gemmjit wrong?"; fill!(x, NaN);
224+
br[11,i] = n_gflop / @belapsed dgemvmkl!($x, $A, $y)
225+
@assert x xblas "gemvmkl wrong?"; fill!(x, NaN);
226226
br[end,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
227227
@assert x xblas "LoopVec wrong?"
228228
end
@@ -262,7 +262,7 @@ function dot3_bench!(br, s, i)
262262
br[2,i] = n_gflop / @belapsed jdot3($x, $A, $y)
263263
@assert jdot3(x, A, y) dotblas "Julia dot wrong?"
264264
br[3,i] = n_gflop / @belapsed cdot3($x, $A, $y)
265-
@assert cdot3(x, A, y) dotblas "Polly dot wrong?"
265+
@assert cdot3(x, A, y) dotblas "Clang dot wrong?"
266266
br[4,i] = n_gflop / @belapsed fdot3($x, $A, $y)
267267
@assert fdot3(x, A, y) dotblas "Fort dot wrong?"
268268
br[5,i] = n_gflop / @belapsed icdot3($x, $A, $y)
@@ -277,7 +277,7 @@ function dot3_bench!(br, s, i)
277277
@assert jdot3avx(x, A, y) dotblas "LoopVec dot wrong?"
278278
end
279279
function benchmark_dot3(sizes)
280-
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
280+
tests = ["LinearAlgebra", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
281281
br = BenchmarkResult(tests, sizes)
282282
sm = br.sizedresults.results
283283
pmap(is -> dot3_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -301,7 +301,7 @@ function sse_bench!(br, s, i)
301301
br[2,i] = n_gflop / @belapsed jOLSlp($y, $X, $β)
302302
@assert jOLSlp(y, X, β) lpblas "Julia wrong?"
303303
br[3,i] = n_gflop / @belapsed cOLSlp($y, $X, $β)
304-
@assert cOLSlp(y, X, β) lpblas "Polly wrong?"
304+
@assert cOLSlp(y, X, β) lpblas "Clang wrong?"
305305
br[4,i] = n_gflop / @belapsed fOLSlp($y, $X, $β)
306306
@assert fOLSlp(y, X, β) lpblas "Fort wrong?"
307307
br[5,i] = n_gflop / @belapsed icOLSlp($y, $X, $β)
@@ -316,7 +316,7 @@ function sse_bench!(br, s, i)
316316
@assert jOLSlp_avx(y, X, β) lpblas "LoopVec wrong?"
317317
end
318318
function benchmark_sse(sizes)
319-
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
319+
tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
320320
br = BenchmarkResult(tests, sizes)
321321
sm = br.sizedresults.results
322322
pmap(is -> sse_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -340,7 +340,7 @@ function exp_bench!(br, s, i)
340340
@assert b baseb "LoopVec wrong?"
341341
end
342342
function benchmark_exp(sizes)
343-
tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
343+
tests = ["Julia", "Clang", "GFortran", "icc", "ifort", "LoopVectorization"]
344344
br = BenchmarkResult(tests, sizes)
345345
sm = br.sizedresults.results
346346
pmap(is -> exp_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -355,7 +355,7 @@ function aplusBc_bench!(br, s, i)
355355
br[1,i] = n_gflop / @belapsed @. $D = $a + $B * $c′
356356
Dcopy = copy(D); fill!(D, NaN);
357357
br[2,i] = n_gflop / @belapsed caplusBc!($D, $a, $B, $c)
358-
@assert D Dcopy "Polly wrong?"; fill!(D, NaN);
358+
@assert D Dcopy "Clang wrong?"; fill!(D, NaN);
359359
br[3,i] = n_gflop / @belapsed faplusBc!($D, $a, $B, $c)
360360
@assert D Dcopy "Fort wrong?"; fill!(D, NaN);
361361
br[4,i] = n_gflop / @belapsed icaplusBc!($D, $a, $B, $c)
@@ -370,7 +370,7 @@ function aplusBc_bench!(br, s, i)
370370
@assert D Dcopy "LoopVec wrong?"
371371
end
372372
function benchmark_aplusBc(sizes)
373-
tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
373+
tests = ["Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
374374
br = BenchmarkResult(tests, sizes)
375375
sm = br.sizedresults.results
376376
pmap(is -> aplusBc_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -402,7 +402,7 @@ function AplusAt_bench!(br, s, i)
402402
@assert B baseB "LoopVec wrong?"
403403
end
404404
function benchmark_AplusAt(sizes)
405-
tests = ["Julia", "Clang-Polly", "GFortran", "GFortran-builtin", "icc", "ifort", "ifort-builtin", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
405+
tests = ["Julia", "Clang", "GFortran", "GFortran-builtin", "icc", "ifort", "ifort-builtin", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
406406
br = BenchmarkResult(tests, sizes)
407407
sm = br.sizedresults.results
408408
pmap(is -> AplusAt_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -429,7 +429,7 @@ function randomaccess_bench!(br, s, i)
429429
@assert p randomaccessavx(P, basis, coefs) "LoopVec wrong?"
430430
end
431431
function benchmark_random_access(sizes)
432-
tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
432+
tests = ["Julia", "Clang", "GFortran", "icc", "ifort", "LoopVectorization"]
433433
br = BenchmarkResult(tests, sizes)
434434
sm = br.sizedresults.results
435435
pmap(is -> randomaccess_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -456,7 +456,7 @@ function logdettriangle_bench!(br, s, i)
456456
@assert ld jlogdettriangleavx(U) "LoopVec wrong?"
457457
end
458458
function benchmark_logdettriangle(sizes)
459-
tests = ["Julia-builtin", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
459+
tests = ["Julia-builtin", "Julia", "Clang", "GFortran", "icc", "ifort", "LoopVectorization"]
460460
br = BenchmarkResult(tests, sizes)
461461
sm = br.sizedresults.results
462462
pmap(is -> logdettriangle_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -483,7 +483,7 @@ function filter2d_bench_run!(br, s, i, K)
483483
@assert B Bcopy "LoopVec wrong?"
484484
end
485485
function benchmark_filter2d(sizes, K)
486-
tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
486+
tests = ["Julia", "Clang", "GFortran", "icc", "ifort", "LoopVectorization"]
487487
br = BenchmarkResult(tests, sizes)
488488
sm = br.sizedresults.results
489489
pmap(is -> filter2d_bench_run!(sm, is[2], is[1], K), enumerate(sizes))
@@ -518,7 +518,7 @@ function filter2dunrolled_bench_run!(br, s, i, K)
518518
@assert B Bcopy "LoopVec wrong?"
519519
end
520520
function benchmark_filter2dunrolled(sizes)
521-
tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
521+
tests = ["Julia", "Clang", "GFortran", "icc", "ifort", "LoopVectorization"]
522522
br = BenchmarkResult(tests, sizes)
523523
sm = br.sizedresults.results
524524
K = SizedOffsetMatrix{Float64,-1,1,-1,1}(rand(3,3))

benchmark/directcalljit.f90

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@ module jitmul
44
use mkl_service
55
implicit none
66

7-
include "/opt/intel/mkl/include/mkl_direct_call.fi"
7+
! include "/opt/intel/mkl/include/mkl_direct_call.fi"
8+
include "/home/chriselrod/intel/mkl/include/mkl_direct_call.fi"
9+
! include "/home/chriselrod/intel/mkl/include/mkl_service.fi"
810
! include "/opt/intel/mkl/include/mkl_service.fi"
911
! include "/opt/intel/mkl/include/mkl.fi"
1012

@@ -59,5 +61,22 @@ subroutine dgemmjit(C,A,B,M,K,N,At,Bt) bind(C, name = "dgemmjit")
5961
end if
6062
call dgemm(Atc, Btc, M, N, K, alpha, A, M, B, K, beta, C, M)
6163
end subroutine dgemmjit
64+
! C-callable wrapper (exported as "dgemvjit") around BLAS dgemv.
! Computes y = op(A) * x with alpha = 1 and beta = 0, where op(A) is A
! when At /= 1 and transpose(A) when At == 1.
!   y  : output vector (M elements for 'N', N elements for 'T')
!   A  : M-by-N matrix as stored, leading dimension M
!   x  : input vector  (N elements for 'N', M elements for 'T')
!   M,N: stored dimensions of A
!   At : 1 selects the transposed product, anything else the plain one
subroutine dgemvjit(y,A,x,M,N,At) bind(C, name = "dgemvjit")
  integer(C_int32_t), intent(in) :: M, N
  integer(C_int8_t), intent(in) :: At
  real(C_double), parameter :: alpha = 1.0d0, beta = 0.0d0
  real(C_double), dimension(M,N), intent(in) :: A
  ! The lengths of x and y depend on At, so they are declared assumed-size
  ! rather than with fixed extents that are only right for one of the cases.
  real(C_double), dimension(*), intent(in) :: x
  real(C_double), dimension(*), intent(out) :: y
  character :: Atc
  if (At == 1_C_int8_t) then
     Atc = 'T'
  else
     Atc = 'N'
  end if
  call dgemv(Atc, M, N, alpha, A, M, x, 1, beta, y, 1)
end subroutine dgemvjit
6281

6382
end module jitmul

benchmark/driver.jl

Lines changed: 13 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,18 +23,19 @@ end
2323
# sizes = 23:23
2424
sizes = 256:-1:2
2525

26-
filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes)#512:-1:2)
27-
filter2d_3x3_bench = benchmark_filter2d3x3(sizes)#512:-1:2)
28-
filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes)#512:-1:2)
29-
3026
AmulB_bench = benchmark_AmulB(sizes)
3127
AmulBt_bench = benchmark_AmulBt(sizes)
3228
AtmulB_bench = benchmark_AtmulB(sizes)
3329
AtmulBt_bench = benchmark_AtmulBt(sizes)
34-
dot_bench = benchmark_dot(sizes)
35-
selfdot_bench = benchmark_selfdot(sizes)
3630
Amulvb_bench = benchmark_Amulvb(sizes)
3731
Atmulvb_bench = benchmark_Atmulvb(sizes)
32+
33+
filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes)#512:-1:2)
34+
filter2d_3x3_bench = benchmark_filter2d3x3(sizes)#512:-1:2)
35+
filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes)#512:-1:2)
36+
37+
dot_bench = benchmark_dot(sizes)
38+
selfdot_bench = benchmark_selfdot(sizes)
3839
dot3_bench = benchmark_dot3(sizes)
3940
sse_bench = benchmark_sse(sizes)
4041
aplusBc_bench = benchmark_aplusBc(sizes)
@@ -49,21 +50,21 @@ const PICTURES = joinpath(pkgdir("LoopVectorization"), "docs", "src", "assets")
4950
save(joinpath(PICTURES, "bench_filter2d_dynamic_v$v.$filetype"), plot(filter2d_dynamic_bench));
5051
save(joinpath(PICTURES, "bench_filter2d_3x3_v$v.$filetype"), plot(filter2d_3x3_bench));
5152
save(joinpath(PICTURES, "bench_filter2d_unrolled_v$v.$filetype"), plot(filter2d_unrolled_bench));
52-
save(joinpath(PICTURES, "bench_AmulB_v$v.$filetype"), plot(AmulB_bench));
53-
save(joinpath(PICTURES, "bench_AmulBt_v$v.$filetype"), plot(AmulBt_bench));
54-
save(joinpath(PICTURES, "bench_AtmulB_v$v.$filetype"), plot(AtmulB_bench));
55-
save(joinpath(PICTURES, "bench_AtmulBt_v$v.$filetype"), plot(AtmulBt_bench));
5653
save(joinpath(PICTURES, "bench_dot_v$v.$filetype"), plot(dot_bench));
5754
save(joinpath(PICTURES, "bench_selfdot_v$v.$filetype"), plot(selfdot_bench));
5855
save(joinpath(PICTURES, "bench_dot3_v$v.$filetype"), plot(dot3_bench));
5956
save(joinpath(PICTURES, "bench_sse_v$v.$filetype"), plot(sse_bench));
6057
save(joinpath(PICTURES, "bench_aplusBc_v$v.$filetype"), plot(aplusBc_bench));
6158
save(joinpath(PICTURES, "bench_AplusAt_v$v.$filetype"), plot(AplusAt_bench));
62-
save(joinpath(PICTURES, "bench_Amulvb_v$v.$filetype"), plot(Amulvb_bench));
63-
save(joinpath(PICTURES, "bench_Atmulvb_v$v.$filetype"), plot(Atmulvb_bench));
6459
save(joinpath(PICTURES, "bench_exp_v$v.$filetype"), plot(exp_bench));
6560
save(joinpath(PICTURES, "bench_random_access_v$v.$filetype"), plot(randomaccess_bench));
6661
save(joinpath(PICTURES, "bench_logdettriangle_v$v.$filetype"), plot(logdettriangle_bench));
62+
save(joinpath(PICTURES, "bench_AmulB_v$v.$filetype"), plot(AmulB_bench));
63+
save(joinpath(PICTURES, "bench_AmulBt_v$v.$filetype"), plot(AmulBt_bench));
64+
save(joinpath(PICTURES, "bench_AtmulB_v$v.$filetype"), plot(AtmulB_bench));
65+
save(joinpath(PICTURES, "bench_AtmulBt_v$v.$filetype"), plot(AtmulBt_bench));
66+
save(joinpath(PICTURES, "bench_Amulvb_v$v.$filetype"), plot(Amulvb_bench));
67+
save(joinpath(PICTURES, "bench_Atmulvb_v$v.$filetype"), plot(Atmulvb_bench));
6768

6869

6970

benchmark/loadsharedlibs.jl

Lines changed: 40 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,8 @@ const LIBDIRECTCALLJIT = joinpath(LOOPVECBENCHDIR, "libdcjtest.so")
1616
# builds with plain Clang; Polly is no longer required (the old Polly invocation is kept below, commented out)
1717
cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
1818
if !isfile(LIBCTEST) || mtime(cfile) > mtime(LIBCTEST)
19-
run(`/usr/local/bin/clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
19+
run(`clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -shared -fPIC $cfile -o $LIBCTEST`)
20+
# run(`/usr/local/bin/clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
2021
end
2122
if !isfile(LIBICTEST) || mtime(cfile) > mtime(LIBICTEST)
2223
run(`icc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -shared -fPIC $cfile -o $LIBICTEST`)
@@ -42,25 +43,45 @@ if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
4243
run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
4344
end
4445

45-
# directcalljitfile = joinpath(LOOPVECBENCHDIR, "directcalljit.f90")
46-
# if !isfile(LIBDIRECTCALLJIT) || mtime(directcalljitfile) > mtime(LIBDIRECTCALLJIT)
47-
# # run(`ifort -fast -DMKL_DIRECT_CALL_SEQ_JIT -fpp -qopt-zmm-usage=high -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
48-
# run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
49-
# end
46+
MKL_ROOT = "/home/chriselrod/intel"
directcalljitfile = joinpath(LOOPVECBENCHDIR, "directcalljit.f90")
# (Re)build the direct-call shared library with ifort when it is missing or
# older than its Fortran source. The static MKL archives are linked inside a
# --start-group/--end-group pair, and both MKL include directories are passed.
if !(isfile(LIBDIRECTCALLJIT) && mtime(directcalljitfile) <= mtime(LIBDIRECTCALLJIT))
    let mkllibdir = joinpath(MKL_ROOT, "mkl/lib/intel64"),
        mklarchives = [
            joinpath(mkllibdir, "libmkl_intel_lp64.a"),
            joinpath(mkllibdir, "libmkl_sequential.a"),
            joinpath(mkllibdir, "libmkl_core.a"),
        ],
        mklinclude = joinpath(MKL_ROOT, "mkl/include"),
        lp64include = joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")

        run(`ifort -fast -DMKL_DIRECT_CALL_SEQ_JIT -fpp -qopt-zmm-usage=high -Wl,--start-group $mklarchives -Wl,--end-group -I$mklinclude -I$lp64include -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
    end
end
5054

51-
# istransposed(x) = false
52-
# istransposed(x::Adjoint) = true
53-
# istransposed(x::Transpose) = true
54-
# function dgemmjit!(C::AbstractVecOrMat{Float64}, A::AbstractVecOrMat{Float64}, B::AbstractVecOrMat{Float64})
55-
# M, N = size(C); K = size(B, 1)
56-
# ccall(
57-
# (:dgemmjit, LIBDIRECTCALLJIT), Cvoid,
58-
# (Ptr{Float64},Ptr{Float64},Ptr{Float64},Ref{Int},Ref{Int},Ref{Int},Ref{Bool},Ref{Bool}),
59-
# parent(C), parent(A), parent(B),
60-
# Ref(M), Ref(K), Ref(N),
61-
# Ref(istransposed(A)), Ref(istransposed(B))
62-
# )
63-
# end
55+
# Report whether the argument is a lazy `Adjoint` or `Transpose` wrapper;
# every other value counts as not transposed.
istransposed(::Any) = false
istransposed(::Union{Adjoint,Transpose}) = true
58+
"""
59+
If transposed, requires them to be square
60+
"""
61+
function dgemmmkl!(C::AbstractMatrix{Float64}, A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})
    # Forward to the `dgemmjit` symbol in LIBDIRECTCALLJIT. The parent arrays
    # are passed, with transposition of A and B signalled by separate flags.
    nrows, ncols = size(C)
    ninner = size(B, 1)
    a_is_transposed = Ref(istransposed(A))
    b_is_transposed = Ref(istransposed(B))
    # NOTE(review): the sibling Fortran routine `dgemvjit` declares its size
    # arguments as C_int32_t; confirm that `dgemmjit` really takes `Int`-sized
    # integers as passed here.
    ccall(
        (:dgemmjit, LIBDIRECTCALLJIT), Cvoid,
        (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Int}, Ref{Int}, Ref{Int}, Ref{Bool}, Ref{Bool}),
        parent(C), parent(A), parent(B),
        Ref(nrows), Ref(ninner), Ref(ncols),
        a_is_transposed, b_is_transposed
    )
end
71+
# Call the `set_num_threads` symbol of LIBDIRECTCALLJIT with `N` wrapped to a
# UInt32 (presumably the MKL thread count -- confirm against the Fortran side),
# then request a single thread at load time.
function mkl_set_num_threads(N::Integer)
    nthreads = Ref(N % UInt32)
    ccall((:set_num_threads, LIBDIRECTCALLJIT), Cvoid, (Ref{UInt32},), nthreads)
end
mkl_set_num_threads(1)
73+
"""
74+
If transposed, requires them to be square
75+
"""
76+
function dgemvmkl!(y::AbstractVector{Float64}, A::AbstractMatrix{Float64}, x::AbstractVector{Float64})
    # Forward to the Fortran `dgemvjit` wrapper, which calls `dgemv` with
    # alpha = 1 and beta = 0, i.e. it overwrites `y` with op(A) * x.
    #
    # The dimensions come from the (possibly lazily transposed) wrapper while
    # the parent's memory is what gets passed, and M doubles as the leading
    # dimension on the Fortran side. That is why a transposed `A` must be square.
    M, N = size(A)
    # `dgemvjit` declares M and N as `integer(C_int32_t)`, so pass 32-bit
    # integers rather than `Int`. `Int32(...)` throws on overflow instead of
    # silently truncating. The transpose flag is a single byte and is compared
    # against 1 on the Fortran side.
    ccall(
        (:dgemvjit, LIBDIRECTCALLJIT), Cvoid,
        (Ptr{Float64}, Ptr{Float64}, Ptr{Float64}, Ref{Int32}, Ref{Int32}, Ref{Bool}),
        parent(y), parent(A), parent(x),
        Ref(Int32(M)), Ref(Int32(N)), Ref(istransposed(A))
    )
end
6485

6586
for (prefix,Cshared,Fshared,Eshared) ((Symbol(""),LIBCTEST,LIBFTEST,LIBEIGENTEST), (:i,LIBICTEST,LIBIFTEST,LIBIEIGENTEST))
6687
for order (:kmn, :knm, :mkn, :mnk, :nkm, :nmk)

0 commit comments

Comments
 (0)