JuliaSIMD
diff --git a/‎Project.toml
Lines changed: 1 addition & 1 deletion b/‎Project.toml
Lines changed: 1 addition & 1 deletion
diff --git a/‎benchmark/benchmarkflops.jl
Lines changed: 24 additions & 24 deletions b/‎benchmark/benchmarkflops.jl
Lines changed: 24 additions & 24 deletions
diff --git a/‎benchmark/directcalljit.f90
Lines changed: 20 additions & 1 deletion b/‎benchmark/directcalljit.f90
Lines changed: 20 additions & 1 deletion
diff --git a/‎benchmark/driver.jl
Lines changed: 13 additions & 12 deletions b/‎benchmark/driver.jl
Lines changed: 13 additions & 12 deletions
diff --git a/‎benchmark/loadsharedlibs.jl
Lines changed: 40 additions & 19 deletions b/‎benchmark/loadsharedlibs.jl
Lines changed: 40 additions & 19 deletions
@@ -1,7 +1,7 @@
 name = "LoopVectorization"
 uuid = "bdcacae8-1622-11e9-2a5c-532679323890"
 authors = ["Chris Elrod <[email protected]>"]
-version = "0.6.31"
+version = "0.7.0"
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 
@@ -45,7 +45,7 @@ function matmul_bench!(br, C, A, B, i)
     br[2,i] = n_gflop / @belapsed jgemm!($C, $A, $B)
     @assert C ≈ Cblas "Julia gemm wrong?"; fill!(C, NaN)
     br[3,i] = n_gflop / @belapsed cgemm!($C, $A, $B)
-    @assert C ≈ Cblas "Polly gemm wrong?"; fill!(C, NaN)
+    @assert C ≈ Cblas "Clang gemm wrong?"; fill!(C, NaN)
     br[4,i] = n_gflop / @belapsed fgemm!($C, $A, $B)
     @assert C ≈ Cblas "Fort gemm wrong?"; fill!(C, NaN)
     br[5,i] = n_gflop / @belapsed fgemm_builtin!($C, $A, $B)
@@ -60,8 +60,8 @@ function matmul_bench!(br, C, A, B, i)
     @assert C ≈ Cblas "eigen gemm wrong?"; fill!(C, NaN)
     br[10,i] = n_gflop / @belapsed iegemm!($C, $A, $B)
     @assert C ≈ Cblas "i-eigen gemm wrong?"; fill!(C, NaN)
-    # br[11,i] = n_gflop / @belapsed dgemmjit!($C, $A, $B)
-    # @assert C ≈ Cblas "MKL JIT gemm wrong?"; fill!(C, NaN)
+    br[11,i] = n_gflop / @belapsed dgemmmkl!($C, $A, $B)
+    @assert C ≈ Cblas "MKL JIT gemm wrong?"; fill!(C, NaN)
     # br[12,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
     br[end,i] = n_gflop / @belapsed gemmavx!($C, $A, $B)
     @assert C ≈ Cblas "LoopVec gemm wrong?"
@@ -104,10 +104,10 @@ end
 
 blastests() = [
     BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS",
-    "Julia", "Clang-Polly",
+    "Julia", "Clang",
     "GFortran", "GFort-intrinsic",
     "icc", "ifort", "ifort-intrinsic",
-    "g++ & Eigen-3", "icpc & Eigen-3",
+    "g++ & Eigen-3", "icpc & Eigen-3", "MKL",
     "LoopVectorization"
     # "MKL JIT", "LoopVectorization"
 ]
@@ -145,7 +145,7 @@ function dot_bench!(br, s, i)
     br[2,i] = n_gflop / @belapsed jdot($a, $b)
     @assert jdot(a,b) ≈ dotblas "Julia dot wrong?"
     br[3,i] = n_gflop / @belapsed cdot($a, $b)
-    @assert cdot(a,b) ≈ dotblas "Polly dot wrong?"
+    @assert cdot(a,b) ≈ dotblas "Clang dot wrong?"
     br[4,i] = n_gflop / @belapsed fdot($a, $b)
     @assert fdot(a,b) ≈ dotblas "Fort dot wrong?"
     br[5,i] = n_gflop / @belapsed icdot($a, $b)
@@ -160,7 +160,7 @@ function dot_bench!(br, s, i)
     @assert jdotavx(a,b) ≈ dotblas "LoopVec dot wrong?"
 end
 function benchmark_dot(sizes)
-    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
+    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> dot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -174,7 +174,7 @@ function selfdot_bench!(br, s, i)
     br[2,i] = n_gflop / @belapsed jselfdot($a)
     @assert jselfdot(a) ≈ dotblas "Julia dot wrong?"
     br[3,i] = n_gflop / @belapsed cselfdot($a)
-    @assert cselfdot(a) ≈ dotblas "Polly dot wrong?"
+    @assert cselfdot(a) ≈ dotblas "Clang dot wrong?"
     br[4,i] = n_gflop / @belapsed fselfdot($a)
     @assert fselfdot(a) ≈ dotblas "Fort dot wrong?"
     br[5,i] = n_gflop / @belapsed icselfdot($a)
@@ -189,7 +189,7 @@ function selfdot_bench!(br, s, i)
     @assert jselfdotavx(a) ≈ dotblas "LoopVec dot wrong?"
 end
 function benchmark_selfdot(sizes)
-    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
+    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> selfdot_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -206,7 +206,7 @@ function gemv_bench!(br, x, A, y, i)
     br[2,i] = n_gflop / @belapsed jgemv!($x, $A, $y)
     @assert x ≈ xblas "Julia wrong?"; fill!(x, NaN);
     br[3,i] = n_gflop / @belapsed cgemv!($x, $A, $y)
-    @assert x ≈ xblas "Polly wrong?"; fill!(x, NaN);
+    @assert x ≈ xblas "Clang wrong?"; fill!(x, NaN);
     br[4,i] = n_gflop / @belapsed fgemv!($x, $A, $y)
     @assert x ≈ xblas "Fort wrong?"; fill!(x, NaN);
     br[5,i] = n_gflop / @belapsed fgemv_builtin!($x, $A, $y)
@@ -221,8 +221,8 @@ function gemv_bench!(br, x, A, y, i)
     @assert x ≈ xblas "eigen wrong?"; fill!(x, NaN);
     br[10,i] = n_gflop / @belapsed iegemv!($x, $A, $y)
     @assert x ≈ xblas "i-eigen wrong?"; fill!(x, NaN);
-    # br[11,i] = n_gflop / @belapsed dgemmjit!($x, $A, $y)
-    # @assert x ≈ xblas "gemmjit wrong?"; fill!(x, NaN);
+    br[11,i] = n_gflop / @belapsed dgemvmkl!($x, $A, $y)
+    @assert x ≈ xblas "gemvmkl wrong?"; fill!(x, NaN);
     br[end,i] = n_gflop / @belapsed jgemvavx!($x, $A, $y)
     @assert x ≈ xblas "LoopVec wrong?"
 end
@@ -262,7 +262,7 @@ function dot3_bench!(br, s, i)
     br[2,i] = n_gflop / @belapsed jdot3($x, $A, $y)
     @assert jdot3(x, A, y) ≈ dotblas "Julia dot wrong?"
     br[3,i] = n_gflop / @belapsed cdot3($x, $A, $y)
-    @assert cdot3(x, A, y) ≈ dotblas "Polly dot wrong?"
+    @assert cdot3(x, A, y) ≈ dotblas "Clang dot wrong?"
     br[4,i] = n_gflop / @belapsed fdot3($x, $A, $y)
     @assert fdot3(x, A, y) ≈ dotblas "Fort dot wrong?"
     br[5,i] = n_gflop / @belapsed icdot3($x, $A, $y)
@@ -277,7 +277,7 @@ function dot3_bench!(br, s, i)
     @assert jdot3avx(x, A, y) ≈ dotblas "LoopVec dot wrong?"
 end
 function benchmark_dot3(sizes)
-    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
+    tests = ["LinearAlgebra", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> dot3_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -301,7 +301,7 @@ function sse_bench!(br, s, i)
     br[2,i] = n_gflop / @belapsed jOLSlp($y, $X, $β)
     @assert jOLSlp(y, X, β) ≈ lpblas "Julia wrong?"
     br[3,i] = n_gflop / @belapsed cOLSlp($y, $X, $β)
-    @assert cOLSlp(y, X, β) ≈ lpblas "Polly wrong?"
+    @assert cOLSlp(y, X, β) ≈ lpblas "Clang wrong?"
     br[4,i] = n_gflop / @belapsed fOLSlp($y, $X, $β)
     @assert fOLSlp(y, X, β) ≈ lpblas "Fort wrong?"
     br[5,i] = n_gflop / @belapsed icOLSlp($y, $X, $β)
@@ -316,7 +316,7 @@ function sse_bench!(br, s, i)
     @assert jOLSlp_avx(y, X, β) ≈ lpblas "LoopVec wrong?"
 end
 function benchmark_sse(sizes)
-    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
+    tests = [BLAS.vendor() === :mkl ? "IntelMKL" : "OpenBLAS", "Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> sse_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -340,7 +340,7 @@ function exp_bench!(br, s, i)
     @assert b ≈ baseb "LoopVec wrong?"
 end
 function benchmark_exp(sizes)
-    tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
+    tests = ["Julia", "Clang", "GFortran", "icc", "ifort", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> exp_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -355,7 +355,7 @@ function aplusBc_bench!(br, s, i)
     br[1,i] = n_gflop / @belapsed @. $D = $a + $B * $c′
     Dcopy = copy(D); fill!(D, NaN);
     br[2,i] = n_gflop / @belapsed caplusBc!($D, $a, $B, $c)
-    @assert D ≈ Dcopy "Polly wrong?"; fill!(D, NaN);
+    @assert D ≈ Dcopy "Clang wrong?"; fill!(D, NaN);
     br[3,i] = n_gflop / @belapsed faplusBc!($D, $a, $B, $c)
     @assert D ≈ Dcopy "Fort wrong?"; fill!(D, NaN);
     br[4,i] = n_gflop / @belapsed icaplusBc!($D, $a, $B, $c)
@@ -370,7 +370,7 @@ function aplusBc_bench!(br, s, i)
     @assert D ≈ Dcopy "LoopVec wrong?"
 end
 function benchmark_aplusBc(sizes)
-    tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
+    tests = ["Julia", "Clang", "GFortran", "icc", "ifort", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> aplusBc_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -402,7 +402,7 @@ function AplusAt_bench!(br, s, i)
     @assert B ≈ baseB "LoopVec wrong?"
 end
 function benchmark_AplusAt(sizes)
-    tests = ["Julia", "Clang-Polly", "GFortran", "GFortran-builtin", "icc", "ifort", "ifort-builtin", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
+    tests = ["Julia", "Clang", "GFortran", "GFortran-builtin", "icc", "ifort", "ifort-builtin", "g++ & Eigen-3", "icpc & Eigen-3", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> AplusAt_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -429,7 +429,7 @@ function randomaccess_bench!(br, s, i)
     @assert p ≈ randomaccessavx(P, basis, coefs) "LoopVec wrong?"
 end
 function benchmark_random_access(sizes)
-    tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
+    tests = ["Julia", "Clang", "GFortran", "icc", "ifort", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> randomaccess_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -456,7 +456,7 @@ function logdettriangle_bench!(br, s, i)
     @assert ld ≈ jlogdettriangleavx(U) "LoopVec wrong?"
 end
 function benchmark_logdettriangle(sizes)
-    tests = ["Julia-builtin", "Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
+    tests = ["Julia-builtin", "Julia", "Clang", "GFortran", "icc", "ifort", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> logdettriangle_bench!(sm, is[2], is[1]), enumerate(sizes))
@@ -483,7 +483,7 @@ function filter2d_bench_run!(br, s, i, K)
     @assert B ≈ Bcopy "LoopVec wrong?"
 end
 function benchmark_filter2d(sizes, K)
-    tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
+    tests = ["Julia", "Clang", "GFortran", "icc", "ifort", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     pmap(is -> filter2d_bench_run!(sm, is[2], is[1], K), enumerate(sizes))
@@ -518,7 +518,7 @@ function filter2dunrolled_bench_run!(br, s, i, K)
     @assert B ≈ Bcopy "LoopVec wrong?"
 end
 function benchmark_filter2dunrolled(sizes)
-    tests = ["Julia", "Clang-Polly", "GFortran", "icc", "ifort", "LoopVectorization"]
+    tests = ["Julia", "Clang", "GFortran", "icc", "ifort", "LoopVectorization"]
     br = BenchmarkResult(tests, sizes)
     sm = br.sizedresults.results
     K = SizedOffsetMatrix{Float64,-1,1,-1,1}(rand(3,3))
 
@@ -4,7 +4,9 @@ module jitmul
 use mkl_service
 implicit none
 
-include "/opt/intel/mkl/include/mkl_direct_call.fi"
+! include "/opt/intel/mkl/include/mkl_direct_call.fi"
+include "/home/chriselrod/intel/mkl/include/mkl_direct_call.fi"
+! include "/home/chriselrod/intel/mkl/include/mkl_service.fi"
 ! include "/opt/intel/mkl/include/mkl_service.fi"
 ! include "/opt/intel/mkl/include/mkl.fi"
 
@@ -59,5 +61,22 @@ subroutine dgemmjit(C,A,B,M,K,N,At,Bt) bind(C, name = "dgemmjit")
     end if
     call dgemm(Atc, Btc, M, N, K, alpha, A, M, B, K, beta, C, M)
   end subroutine dgemmjit
+  ! Matrix-vector product y = op(A) * x, computed by BLAS dgemv with
+  ! alpha = 1 and beta = 0, and exported to C under the name "dgemvjit".
+  !   y   : result vector (length M when op(A) = A, N when op(A) = A**T)
+  !   A   : M-by-N matrix as stored (column-major, leading dimension M)
+  !   x   : input vector  (length N when op(A) = A, M when op(A) = A**T)
+  !   M,N : number of rows and columns of A as stored
+  !   At  : 1 selects op(A) = A**T; any other value selects op(A) = A
+  subroutine dgemvjit(y,A,x,M,N,At) bind(C, name = "dgemvjit")
+    integer(C_int32_t),  intent(in)  :: M, N
+    integer(C_int8_t),   intent(in)  :: At
+    real(C_double), parameter :: alpha = 1.0d0, beta = 0.0d0
+    real(C_double), dimension(M,N), intent(in)  :: A
+    ! The lengths of x and y swap with At, so both are assumed-size rather
+    ! than carrying fixed extents that only match the transposed call.
+    real(C_double), dimension(*), intent(in)  :: x
+    real(C_double), dimension(*), intent(out) :: y
+    character :: Atc
+    ! call mkl_set_threading_layer(MKL_THREADING_SEQUENTIAL)
+    if (At == 1_C_int8_t) then
+       Atc = 'T'
+    else
+       Atc = 'N'
+    end if
+    call dgemv(Atc, M, N, alpha, A, M, x, 1, beta, y, 1)
+  end subroutine dgemvjit
 
 end module jitmul
@@ -23,18 +23,19 @@ end
 # sizes = 23:23
 sizes = 256:-1:2
 
-filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes)#512:-1:2)
-filter2d_3x3_bench = benchmark_filter2d3x3(sizes)#512:-1:2)
-filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes)#512:-1:2)
-
 AmulB_bench = benchmark_AmulB(sizes)
 AmulBt_bench = benchmark_AmulBt(sizes)
 AtmulB_bench = benchmark_AtmulB(sizes)
 AtmulBt_bench = benchmark_AtmulBt(sizes)
-dot_bench = benchmark_dot(sizes)
-selfdot_bench = benchmark_selfdot(sizes)
 Amulvb_bench = benchmark_Amulvb(sizes)
 Atmulvb_bench = benchmark_Atmulvb(sizes)
+
+filter2d_dynamic_bench = benchmark_filter2ddynamic(sizes)#512:-1:2)
+filter2d_3x3_bench = benchmark_filter2d3x3(sizes)#512:-1:2)
+filter2d_unrolled_bench = benchmark_filter2dunrolled(sizes)#512:-1:2)
+
+dot_bench = benchmark_dot(sizes)
+selfdot_bench = benchmark_selfdot(sizes)
 dot3_bench = benchmark_dot3(sizes)
 sse_bench = benchmark_sse(sizes)
 aplusBc_bench = benchmark_aplusBc(sizes)
@@ -49,21 +50,21 @@ const PICTURES = joinpath(pkgdir("LoopVectorization"), "docs", "src", "assets")
 save(joinpath(PICTURES, "bench_filter2d_dynamic_v$v.$filetype"), plot(filter2d_dynamic_bench));
 save(joinpath(PICTURES, "bench_filter2d_3x3_v$v.$filetype"), plot(filter2d_3x3_bench));
 save(joinpath(PICTURES, "bench_filter2d_unrolled_v$v.$filetype"), plot(filter2d_unrolled_bench));
-save(joinpath(PICTURES, "bench_AmulB_v$v.$filetype"), plot(AmulB_bench));
-save(joinpath(PICTURES, "bench_AmulBt_v$v.$filetype"), plot(AmulBt_bench));
-save(joinpath(PICTURES, "bench_AtmulB_v$v.$filetype"), plot(AtmulB_bench));
-save(joinpath(PICTURES, "bench_AtmulBt_v$v.$filetype"), plot(AtmulBt_bench));
 save(joinpath(PICTURES, "bench_dot_v$v.$filetype"), plot(dot_bench));
 save(joinpath(PICTURES, "bench_selfdot_v$v.$filetype"), plot(selfdot_bench));
 save(joinpath(PICTURES, "bench_dot3_v$v.$filetype"), plot(dot3_bench));
 save(joinpath(PICTURES, "bench_sse_v$v.$filetype"), plot(sse_bench));
 save(joinpath(PICTURES, "bench_aplusBc_v$v.$filetype"), plot(aplusBc_bench));
 save(joinpath(PICTURES, "bench_AplusAt_v$v.$filetype"), plot(AplusAt_bench));
-save(joinpath(PICTURES, "bench_Amulvb_v$v.$filetype"), plot(Amulvb_bench));
-save(joinpath(PICTURES, "bench_Atmulvb_v$v.$filetype"), plot(Atmulvb_bench));
 save(joinpath(PICTURES, "bench_exp_v$v.$filetype"), plot(exp_bench));
 save(joinpath(PICTURES, "bench_random_access_v$v.$filetype"), plot(randomaccess_bench));
 save(joinpath(PICTURES, "bench_logdettriangle_v$v.$filetype"), plot(logdettriangle_bench));
+save(joinpath(PICTURES, "bench_AmulB_v$v.$filetype"), plot(AmulB_bench));
+save(joinpath(PICTURES, "bench_AmulBt_v$v.$filetype"), plot(AmulBt_bench));
+save(joinpath(PICTURES, "bench_AtmulB_v$v.$filetype"), plot(AtmulB_bench));
+save(joinpath(PICTURES, "bench_AtmulBt_v$v.$filetype"), plot(AtmulBt_bench));
+save(joinpath(PICTURES, "bench_Amulvb_v$v.$filetype"), plot(Amulvb_bench));
+save(joinpath(PICTURES, "bench_Atmulvb_v$v.$filetype"), plot(Atmulvb_bench));
 
 
 
 
@@ -16,7 +16,8 @@ const LIBDIRECTCALLJIT = joinpath(LOOPVECBENCHDIR, "libdcjtest.so")
 # builds with plain Clang; the Polly flags are no longer passed (the old Polly command is kept below, commented out)
 cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
 if !isfile(LIBCTEST) || mtime(cfile) > mtime(LIBCTEST)    
-    run(`/usr/local/bin/clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
+    run(`clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -shared -fPIC $cfile -o $LIBCTEST`)
+    # run(`/usr/local/bin/clang -Ofast -march=native -mprefer-vector-width=$(8REGISTER_SIZE) -lm -mllvm -polly -mllvm -polly-vectorizer=stripmine -shared -fPIC $cfile -o $LIBCTEST`)
 end
 if !isfile(LIBICTEST) || mtime(cfile) > mtime(LIBICTEST)
     run(`icc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -shared -fPIC $cfile -o $LIBICTEST`)
@@ -42,25 +43,45 @@ if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
     run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
 end
 
-# directcalljitfile = joinpath(LOOPVECBENCHDIR, "directcalljit.f90")
-# if !isfile(LIBDIRECTCALLJIT) || mtime(directcalljitfile) > mtime(LIBDIRECTCALLJIT)
-#     # run(`ifort -fast -DMKL_DIRECT_CALL_SEQ_JIT -fpp -qopt-zmm-usage=high -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
-#     run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
-# end
+MKL_ROOT = "/home/chriselrod/intel" # NOTE(review): looks machine-specific; every MKL library and include path below is derived from this prefix
+directcalljitfile = joinpath(LOOPVECBENCHDIR, "directcalljit.f90") # Fortran source compiled into LIBDIRECTCALLJIT
+if !isfile(LIBDIRECTCALLJIT) || mtime(directcalljitfile) > mtime(LIBDIRECTCALLJIT) # rebuild only when the library is missing or older than its source
+    run(`ifort -fast -DMKL_DIRECT_CALL_SEQ_JIT -fpp -qopt-zmm-usage=high -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
+    # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
+    
+    # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`)
+end
 
-# istransposed(x) = false
-# istransposed(x::Adjoint) = true
-# istransposed(x::Transpose) = true
-# function dgemmjit!(C::AbstractVecOrMat{Float64}, A::AbstractVecOrMat{Float64}, B::AbstractVecOrMat{Float64})
-#     M, N = size(C); K = size(B, 1)
-#     ccall(
-#         (:dgemmjit, LIBDIRECTCALLJIT), Cvoid,
-#         (Ptr{Float64},Ptr{Float64},Ptr{Float64},Ref{Int},Ref{Int},Ref{Int},Ref{Bool},Ref{Bool}),
-#         parent(C), parent(A), parent(B),
-#         Ref(M), Ref(K), Ref(N),
-#         Ref(istransposed(A)), Ref(istransposed(B))
-#     )
-# end
+# Predicate used by the MKL wrappers: `true` only for lazy `Adjoint`/`Transpose` wrappers.
+istransposed(::Any) = false
+istransposed(::Union{Adjoint,Transpose}) = true
+"""
+    dgemmmkl!(C, A, B)
+
+Call the `dgemmjit` routine of `LIBDIRECTCALLJIT` on the parent arrays of `C`,
+`A` and `B`, passing `M, N = size(C)`, `K = size(B, 1)` and one transposition
+flag each for `A` and `B`.
+
+A transposed `A` or `B` must be square.
+"""
+function dgemmmkl!(C::AbstractMatrix{Float64}, A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})
+    nrows = size(C, 1)
+    ncols = size(C, 2)
+    ninner = size(B, 1)
+    # Flags are boxed up front; the foreign routine receives every scalar by reference.
+    transA = Ref(istransposed(A))
+    transB = Ref(istransposed(B))
+    return ccall(
+        (:dgemmjit, LIBDIRECTCALLJIT), Cvoid,
+        (Ptr{Float64},Ptr{Float64},Ptr{Float64},Ref{Int},Ref{Int},Ref{Int},Ref{Bool},Ref{Bool}),
+        parent(C), parent(A), parent(B),
+        Ref(nrows), Ref(ninner), Ref(ncols),
+        transA, transB
+    )
+end
+# Pass `N`, reduced to a `UInt32` with `%`, by reference to the `set_num_threads` symbol of LIBDIRECTCALLJIT.
+function mkl_set_num_threads(N::Integer)
+    nthreads = Ref(N % UInt32)
+    return ccall((:set_num_threads, LIBDIRECTCALLJIT), Cvoid, (Ref{UInt32},), nthreads)
+end
+mkl_set_num_threads(1)
+"""
+    dgemvmkl!(y, A, x)
+
+Compute `y = A * x` in place through the `dgemvjit` routine of `LIBDIRECTCALLJIT`.
+
+`A` may be a plain matrix or a lazy `Adjoint`/`Transpose` wrapper. For a wrapper,
+the parent's storage is passed together with a transposition flag and the
+dimensions of the stored matrix, so transposed inputs need not be square.
+Returns `nothing`.
+
+# Throws
+- `DimensionMismatch` if `length(y) != size(A, 1)` or `length(x) != size(A, 2)`.
+"""
+function dgemvmkl!(y::AbstractVector{Float64}, A::AbstractMatrix{Float64}, x::AbstractVector{Float64})
+    # The foreign routine writes through raw pointers, so validate lengths first.
+    length(y) == size(A, 1) ||
+        throw(DimensionMismatch("y has length $(length(y)), but A has $(size(A, 1)) rows"))
+    length(x) == size(A, 2) ||
+        throw(DimensionMismatch("x has length $(length(x)), but A has $(size(A, 2)) columns"))
+    trans = istransposed(A)
+    # `dgemvjit` expects the dimensions of the matrix as stored in memory (its
+    # leading dimension is M), which are the reversed sizes of a lazy transpose.
+    M, N = trans ? reverse(size(A)) : size(A)
+    # `dgemvjit` declares M and N as 32-bit integers and the flag as an 8-bit
+    # integer; `ccall` converts the plain values to the declared `Ref` types.
+    return ccall(
+        (:dgemvjit, LIBDIRECTCALLJIT), Cvoid,
+        (Ptr{Float64},Ptr{Float64},Ptr{Float64},Ref{Int32},Ref{Int32},Ref{Int8}),
+        parent(y), parent(A), parent(x),
+        M, N, trans
+    )
+end
 
 for (prefix,Cshared,Fshared,Eshared) ∈ ((Symbol(""),LIBCTEST,LIBFTEST,LIBEIGENTEST), (:i,LIBICTEST,LIBIFTEST,LIBIEIGENTEST))
     for order ∈ (:kmn, :knm, :mkn, :mnk, :nkm, :nmk)