diff --git a/.gitignore b/.gitignore index 06fb031..7b23463 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # Temporary +results.csv *.DS_Store *.swp *.jl.cov diff --git a/README.md b/README.md index 887bc23..43d0ef2 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,7 @@ There are three classes of benchmarks: | | linked | list.jl | Small pointer-heavy data structure. | | | | tree.jl | Small pointer-heavy data structure. | | | strings | strings.jl | | +| | forwarddiff | fdiff.jl| ForwardDiff matrix benchmark | | Multithreaded | binary_tree | tree_immutable.jl | Small pointer-heavy data structure. | | | | tree_mutable.jl | Small pointer-heavy data structure. | | Slow | rb\_tree | rb\_tree.jl | Pointer graph whose minimum linear arrangement has cost Θ(n²). | diff --git a/benches/serial/forwarddiff/Project.toml b/benches/serial/forwarddiff/Project.toml new file mode 100644 index 0000000..3de82aa --- /dev/null +++ b/benches/serial/forwarddiff/Project.toml @@ -0,0 +1,4 @@ +[deps] +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" diff --git a/benches/serial/forwarddiff/fdiff.jl b/benches/serial/forwarddiff/fdiff.jl new file mode 100644 index 0000000..b17ae4b --- /dev/null +++ b/benches/serial/forwarddiff/fdiff.jl @@ -0,0 +1,244 @@ +include(joinpath("..", "..", "..", "utils.jl")) + +using LinearAlgebra, Statistics, ForwardDiff + +const BENCH_OPNORMS = (66.0, 33.0, 22.0, 11.0, 6.0, 3.0, 2.0, 0.5, 0.03, 0.001) + +function randmatrices(n) + A = randn(n, n) + op = opnorm(A, 1) + map(BENCH_OPNORMS) do x + (x / op) .* A + end +end + +function _matevalpoly!(B, C, D, A::AbstractMatrix{T}, t::NTuple{1}) where {T} + M = size(A, 1) + te = T(last(t)) + @inbounds for n = 1:M, m = 1:M + B[m, n] = ifelse(m == n, te, zero(te)) + end + @inbounds for n = 1:M, k = 1:M, m = 1:M + B[m, n] = muladd(A[m, k], D[k, n], B[m, n]) + end + return B +end +function _matevalpoly!(B, C, D, A::AbstractMatrix{T}, t::NTuple) where {T} + M = size(A, 1) + te = T(last(t)) + @inbounds for n = 1:M, m = 1:M + C[m, n] = ifelse(m == n, te, zero(te)) + end + @inbounds for n = 1:M, k = 1:M, m = 1:M + C[m, n] = muladd(A[m, k], D[k, n], C[m, n]) + end + _matevalpoly!(B, D, C, A, Base.front(t)) +end +function matevalpoly!(B, C, D, A::AbstractMatrix{T}, t::NTuple) where {T} + t1 = Base.front(t) + te = T(last(t)) + tp = T(last(t1)) + @inbounds for j in axes(A, 2), i in axes(A, 1) + D[i, j] = muladd(A[i, j], te, ifelse(i == j, tp, zero(tp))) + end + _matevalpoly!(B, C, D, A, Base.front(t1)) +end +function matevalpoly!(B, C, D, A::AbstractMatrix{T}, t::NTuple{2}) where {T} + t1 = Base.front(t) + te = T(last(t)) + tp = T(last(t1)) + @inbounds for j in axes(A, 2), i in axes(A, 1) + B[i, j] = muladd(A[i, j], te, ifelse(i == j, tp, zero(tp))) + end + return B +end +ceillog2(x::Float64) = + (reinterpret(Int, x) - 1) >> Base.significand_bits(Float64) - 1022 + +naive_matmul!(C, A, B) = @inbounds for n in axes(C, 2), m in axes(C, 1) + Cmn = zero(eltype(C)) + for k in axes(A, 2) + Cmn = muladd(A[m, k], B[k, n], Cmn) + end + C[m, n] = Cmn +end +naive_matmuladd!(C, A, B) = @inbounds for n in axes(C, 2), m in axes(C, 1) + Cmn = zero(eltype(C)) + for k in axes(A, 2) + Cmn = muladd(A[m, k], B[k, n], Cmn) + end + C[m, n] += Cmn +end +_deval(x) = x +_deval(x::ForwardDiff.Dual) = _deval(ForwardDiff.value(x)) + +function opnorm1(A) + n = _deval(zero(eltype(A))) + @inbounds for j in axes(A, 2) + s = _deval(zero(eltype(A))) + @fastmath for i in axes(A, 1) + s += abs(_deval(A[i, j])) + end + n = max(n, s) + end + return n +end + +function expm!( + Z::AbstractMatrix, + A::AbstractMatrix, + (matmul!)=naive_matmul!, + (matmuladd!)=naive_matmuladd! +) + # omitted: matrix balancing, i.e., LAPACK.gebal! + # nA = maximum(sum(abs.(A); dims=Val(1))) # marginally more performant than norm(A, 1) + nA = opnorm1(A) + N = LinearAlgebra.checksquare(A) + # B and C are temporaries + ## For sufficiently small nA, use lower order Padé-Approximations + A2 = similar(A) + matmul!(A2, A, A) + if nA <= 2.1 + U = Z + V = similar(A) + if nA <= 0.015 + matevalpoly!(V, nothing, nothing, A2, (60, 1)) + matmul!(U, A, V) + matevalpoly!(V, nothing, nothing, A2, (120, 12)) + else + B = similar(A) + if nA <= 0.25 + matevalpoly!(V, nothing, U, A2, (15120, 420, 1)) + matmul!(U, A, V) + matevalpoly!(V, nothing, B, A2, (30240, 3360, 30)) + else + C = similar(A) + if nA <= 0.95 + matevalpoly!(V, C, U, A2, (8648640, 277200, 1512, 1)) + matmul!(U, A, V) + matevalpoly!(V, B, C, A2, (17297280, 1995840, 25200, 56)) + else + matevalpoly!(V, C, U, A2, (8821612800, 302702400, 2162160, 3960, 1)) + matmul!(U, A, V) + matevalpoly!( + V, + B, + C, + A2, + (17643225600, 2075673600, 30270240, 110880, 90) + ) + end + end + end + @inbounds for m = 1:N*N + u = U[m] + v = V[m] + U[m] = v + u + V[m] = v - u + end + ldiv!(lu!(V), U) + expA = U + # expA = (V - U) \ (V + U) + else + si = ceillog2(nA / 5.4) # power of 2 later reversed by squaring + if si > 0 + A = A / exp2(si) + end + A4 = similar(A) + A6 = similar(A) + matmul!(A4, A2, A2) + matmul!(A6, A2, A4) + + U = Z + B = zero(A) + @inbounds for m = 1:N + B[m, m] = 32382376266240000 + end + @inbounds for m = 1:N*N + a6 = A6[m] + a4 = A4[m] + a2 = A2[m] + B[m] = muladd( + 33522128640, + a6, + muladd(10559470521600, a4, muladd(1187353796428800, a2, B[m])) + ) + U[m] = muladd(16380, a4, muladd(40840800, a2, a6)) + end + matmuladd!(B, A6, U) + matmul!(U, A, B) + + V = si > 0 ? fill!(A, 0) : zero(A) + @inbounds for m = 1:N + V[m, m] = 64764752532480000 + end + @inbounds for m = 1:N*N + a6 = A6[m] + a4 = A4[m] + a2 = A2[m] + B[m] = muladd(182, a6, muladd(960960, a4, 1323241920 * a2)) + V[m] = muladd( + 670442572800, + a6, + muladd(129060195264000, a4, muladd(7771770303897600, a2, V[m])) + ) + end + matmuladd!(V, A6, B) + + @inbounds for m = 1:N*N + u = U[m] + v = V[m] + U[m] = v + u + V[m] = v - u + end + ldiv!(lu!(V), U) + expA = U + # expA = (V - U) \ (V + U) + + if si > 0 # squaring to reverse dividing by power of 2 + for _ = 1:si + matmul!(V, expA, expA) + expA, V = V, expA + end + if Z !== expA + copyto!(Z, expA) + expA = Z + end + end + end + expA +end +expm_bad!(Z, A) = expm!(Z, A, mul!, (C, A, B) -> mul!(C, A, B, 1.0, 1.0)) + +d(x, n) = ForwardDiff.Dual(x, ntuple(_ -> randn(), n)) +function dualify(A, n, j) + if n > 0 + A = d.(A, n) + if (j > 0) + A = d.(A, j) + end + end + A +end +struct ForEach{A,B,F} + f::F + a::A + b::B +end +ForEach(f, b) = ForEach(f, nothing, b) +(f::ForEach)() = foreach(Base.Fix1(f.f, f.a), f.b) +(f::ForEach{Nothing})() = foreach(f.f, f.b) + +l = 2; +j = 2; +n = 7; +const As = map(x -> dualify(x, n, j), randmatrices(l)); +const B = similar(As[1]); + +function foo() + for i in 1:400000 + ForEach(expm_bad!, B, As)() + end +end + +@gctime foo()