|
1 |
| -using LinearAlgebra, LoopVectorization |
| 1 | +using LinearAlgebra, LoopVectorization, Libdl |
2 | 2 | using LoopVectorization.VectorizationBase: REGISTER_SIZE
|
3 | 3 |
|
4 | 4 | # const LOOPVECBENCHDIR = joinpath(pkgdir(LoopVectorization), "benchmark")
|
5 | 5 | include(joinpath(LOOPVECBENCHDIR, "looptests.jl"))
|
6 | 6 |
|
7 |
| -const LIBCTEST = joinpath(LOOPVECBENCHDIR, "libctests.so") |
8 |
| -const LIBFTEST = joinpath(LOOPVECBENCHDIR, "libftests.so") |
9 |
| -const LIBICTEST = joinpath(LOOPVECBENCHDIR, "libictests.so") |
10 |
| -const LIBIFTEST = joinpath(LOOPVECBENCHDIR, "libiftests.so") |
11 |
| -const LIBEIGENTEST = joinpath(LOOPVECBENCHDIR, "libetest.so") |
12 |
| -const LIBIEIGENTEST = joinpath(LOOPVECBENCHDIR, "libietest.so") |
13 |
| -const LIBDIRECTCALLJIT = joinpath(LOOPVECBENCHDIR, "libdcjtest.so") |
14 | 7 |
|
15 | 8 | # requires Clang with polly to build
|
16 | 9 | cfile = joinpath(LOOPVECBENCHDIR, "looptests.c")
|
@@ -44,43 +37,108 @@ if !isfile(LIBIEIGENTEST) || mtime(eigenfile) > mtime(LIBIEIGENTEST)
|
44 | 37 | # run(`icpc -fast -qopt-zmm-usage=high -fargument-noalias-global -qoverride-limits -I/usr/include/eigen3 -shared -fPIC $eigenfile -o $LIBIEIGENTEST`)
|
45 | 38 | end
|
46 | 39 |
|
47 |
| -MKL_ROOT = "/home/chriselrod/intel" |
48 |
| -directcalljitfile = joinpath(LOOPVECBENCHDIR, "directcalljit.f90") |
49 |
| -if !isfile(LIBDIRECTCALLJIT) || mtime(directcalljitfile) > mtime(LIBDIRECTCALLJIT) |
50 |
| - run(`ifort -fast -DMKL_DIRECT_CALL_SEQ_JIT -fpp -qopt-zmm-usage=high -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`) |
51 |
| - # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`) |
| 40 | +# MKL_ROOT = "/home/chriselrod/intel" |
| 41 | +# directcalljitfile = joinpath(LOOPVECBENCHDIR, "directcalljit.f90") |
| 42 | +# if !isfile(LIBDIRECTCALLJIT) || mtime(directcalljitfile) > mtime(LIBDIRECTCALLJIT) |
| 43 | +# run(`ifort -fast -DMKL_DIRECT_CALL_SEQ_JIT -fpp -qopt-zmm-usage=high -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`) |
| 44 | +# # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -Wl,--start-group $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_intel_lp64.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_sequential.a")) $(joinpath(MKL_ROOT,"mkl/lib/intel64/libmkl_core.a")) -Wl,--end-group -I$(joinpath(MKL_ROOT, "mkl/include")) -I$(joinpath(MKL_ROOT, "compilers_and_libraries_2020.1.217/linux/mkl/include/intel64/lp64")) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`) |
52 | 45 |
|
53 |
| - # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`) |
54 |
| -end |
| 46 | +# # run(`gfortran -Ofast -march=native -DMKL_DIRECT_CALL_SEQ_JIT -cpp -mprefer-vector-width=$(8REGISTER_SIZE) -shared -fPIC $directcalljitfile -o $LIBDIRECTCALLJIT`) |
| 47 | +# end |
55 | 48 |
|
56 |
| -istransposed(x) = false |
57 |
| -istransposed(x::Adjoint) = true |
58 |
| -istransposed(x::Transpose) = true |
59 |
| -""" |
60 |
| -If transposed, requires them to be square |
61 |
| -""" |
| 49 | + |
| 50 | +const LIBCTEST = joinpath(LOOPVECBENCHDIR, "libctests.so") |
| 51 | +const LIBFTEST = joinpath(LOOPVECBENCHDIR, "libftests.so") |
| 52 | +const LIBICTEST = joinpath(LOOPVECBENCHDIR, "libictests.so") |
| 53 | +const LIBIFTEST = joinpath(LOOPVECBENCHDIR, "libiftests.so") |
| 54 | +const LIBEIGENTEST = joinpath(LOOPVECBENCHDIR, "libetest.so") |
| 55 | +const LIBIEIGENTEST = joinpath(LOOPVECBENCHDIR, "libietest.so") |
| 56 | + |
| 57 | +using MKL_jll, OpenBLAS_jll |
| 58 | + |
| 59 | +const libMKL = Libdl.dlopen(MKL_jll.libmkl_rt) |
| 60 | +const DGEMM_MKL = Libdl.dlsym(libMKL, :dgemm) |
| 61 | +const DGEMV_MKL = Libdl.dlsym(libMKL, :dgemv) |
| 62 | +const MKL_SET_NUM_THREADS = Libdl.dlsym(libMKL, :MKL_Set_Num_Threads) |
| 63 | + |
| 64 | +const libOpenBLAS = Libdl.dlopen(OpenBLAS_jll.libopenblas) |
| 65 | +const DGEMM_OpenBLAS = Libdl.dlsym(libOpenBLAS, :dgemm_64_) |
| 66 | +const DGEMV_OpenBLAS = Libdl.dlsym(libOpenBLAS, :dgemv_64_) |
| 67 | +const OPENBLAS_SET_NUM_THREADS = Libdl.dlsym(libOpenBLAS, :openblas_set_num_threads64_) |
| 68 | + |
| 69 | +istransposed(x) = 'N' |
| 70 | +istransposed(x::Adjoint{<:Real}) = 'T' |
| 71 | +istransposed(x::Adjoint) = 'C' |
| 72 | +istransposed(x::Transpose) = 'T' |
62 | 73 | function dgemmmkl!(C::AbstractMatrix{Float64}, A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64})
|
| 74 | + transA = istransposed(A) |
| 75 | + transB = istransposed(B) |
| 76 | + M, N = size(C); K = size(B, 1) |
| 77 | + M32 = M % Int32 |
| 78 | + K32 = K % Int32 |
| 79 | + N32 = N % Int32 |
| 80 | + pA = parent(A); pB = parent(B) |
| 81 | + ldA = stride(pA, 2) % Int32 |
| 82 | + ldB = stride(pB, 2) % Int32 |
| 83 | + ldC = stride(C, 2) % Int32 |
| 84 | + α = 1.0 |
| 85 | + β = 0.0 |
| 86 | + ccall( |
| 87 | + DGEMM_MKL, Cvoid, |
| 88 | + (Ref{UInt8}, Ref{UInt8}, Ref{Int32}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}), |
| 89 | + transA, transB, M32, N32, K32, α, pA, ldA, pB, ldB, β, C, ldC |
| 90 | + ) |
| 91 | +end |
| 92 | +function dgemmopenblas!(C::AbstractMatrix{Float64}, A::AbstractMatrix{Float64}, B::AbstractMatrix{Float64}) |
| 93 | + transA = istransposed(A) |
| 94 | + transB = istransposed(B) |
63 | 95 | M, N = size(C); K = size(B, 1)
|
| 96 | + pA = parent(A); pB = parent(B) |
| 97 | + ldA = stride(pA, 2) |
| 98 | + ldB = stride(pB, 2) |
| 99 | + ldC = stride(C, 2) |
| 100 | + α = 1.0 |
| 101 | + β = 0.0 |
64 | 102 | ccall(
|
65 |
| - (:dgemmjit, LIBDIRECTCALLJIT), Cvoid, |
66 |
| - (Ptr{Float64},Ptr{Float64},Ptr{Float64},Ref{Int},Ref{Int},Ref{Int},Ref{Bool},Ref{Bool}), |
67 |
| - parent(C), parent(A), parent(B), |
68 |
| - Ref(M), Ref(K), Ref(N), |
69 |
| - Ref(istransposed(A)), Ref(istransposed(B)) |
| 103 | + DGEMM_OpenBLAS, Cvoid, |
| 104 | + (Ref{UInt8}, Ref{UInt8}, Ref{Int64}, Ref{Int64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}), |
| 105 | + transA, transB, M, N, K, α, pA, ldA, pB, ldB, β, C, ldC |
70 | 106 | )
|
71 | 107 | end
|
72 |
| -mkl_set_num_threads(N::Integer) = ccall((:set_num_threads, LIBDIRECTCALLJIT), Cvoid, (Ref{UInt32},), Ref(N % UInt32)) |
| 108 | +mkl_set_num_threads(N::Integer) = ccall(MKL_SET_NUM_THREADS, Cvoid, (Ref{UInt32},), Ref(N % UInt32)) |
73 | 109 | mkl_set_num_threads(1)
|
74 |
| -""" |
75 |
| -If transposed, requires them to be square |
76 |
| -""" |
| 110 | +openblas_set_num_threads(N::Integer) = ccall(OPENBLAS_SET_NUM_THREADS, Cvoid, (Ref{Int64},), Ref(N)) |
| 111 | +openblas_set_num_threads(1) |
77 | 112 | function dgemvmkl!(y::AbstractVector{Float64}, A::AbstractMatrix{Float64}, x::AbstractVector{Float64})
|
78 |
| - M, N = size(A); |
| 113 | + transA = istransposed(A) |
| 114 | + pA = parent(A) |
| 115 | + M, N = size(pA) |
| 116 | + M32 = M % Int32 |
| 117 | + N32 = N % Int32 |
| 118 | + ldA = stride(pA, 2) % Int32 |
| 119 | + incx = LinearAlgebra.stride1(x) % Int32 |
| 120 | + incy = LinearAlgebra.stride1(y) % Int32 |
| 121 | + α = 1.0 |
| 122 | + β = 0.0 |
| 123 | + ccall( |
| 124 | + DGEMV_MKL, Cvoid, |
| 125 | + (Ref{UInt8}, Ref{Int32}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Int32}, Ref{Float64}, Ref{Float64}, Ref{Int32}), |
| 126 | + transA, M32, N32, α, A, ldA, x, incx, β, y, incy |
| 127 | + ) |
| 128 | +end |
| 129 | +function dgemvopenblas!(y::AbstractVector{Float64}, A::AbstractMatrix{Float64}, x::AbstractVector{Float64}) |
| 130 | + transA = istransposed(A) |
| 131 | + pA = parent(A) |
| 132 | + M, N = size(pA) |
| 133 | + ldA = stride(pA, 2) |
| 134 | + incx = LinearAlgebra.stride1(x) |
| 135 | + incy = LinearAlgebra.stride1(y) |
| 136 | + α = 1.0 |
| 137 | + β = 0.0 |
79 | 138 | ccall(
|
80 |
| - (:dgemvjit, LIBDIRECTCALLJIT), Cvoid, |
81 |
| - (Ptr{Float64},Ptr{Float64},Ptr{Float64},Ref{Int},Ref{Int},Ref{Bool}), |
82 |
| - parent(y), parent(A), parent(x), |
83 |
| - Ref(M), Ref(N), Ref(istransposed(A)) |
| 139 | + DGEMV_OpenBLAS, Cvoid, |
| 140 | + (Ref{UInt8}, Ref{Int64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Int64}, Ref{Float64}, Ref{Float64}, Ref{Int64}), |
| 141 | + transA, M, N, α, A, ldA, x, incx, β, y, incy |
84 | 142 | )
|
85 | 143 | end
|
86 | 144 |
|
|
0 commit comments