threads

Michael Abbott · Michael Abbott · commit 69269259e6da · 2019-06-10T14:48:15.000+02:00
diff --git a/README.md b/README.md
@@ -7,13 +7,21 @@ with gradients for [Flux](https://github.com/FluxML/Flux.jl) and [Zygote](https:
 
 ```julia
 mapcols(f, M) ≈ mapreduce(f, hcat, eachcol(M))
-MapCols{d}(f, M) # where d=size(M,1), for StaticArrays
+MapCols{d}(f, M)         # where d=size(M,1), for SVector slices
+ThreadMapCols{d}(f, M)   # using Threads.@threads
 
-maprows(f, M) ≈ mapreduce(f, vcat, eachrow(M))
+maprows(f, M) ≈ mapslices(f, M, dims=2)
 
-slicemap(f, A; dims) ≈ mapslices(f, A, dims)
+slicemap(f, A; dims) ≈ mapslices(f, A, dims=dims) # only Zygote
 ```
 
+<!--
+It also defines Zygote gradients for the Slice/Align functions in 
+[JuliennedArrays](https://github.com/bramtayl/JuliennedArrays.jl), 
+and the slice/glue functions in [TensorCast](https://github.com/mcabbott/TensorCast.jl), 
+both of which are good ways to roll-your-own `mapslices`-like behaviour.
+-->
+
 ### Simple example
 
 ```julia
@@ -25,7 +33,7 @@ using SliceMap
 mapcols(fun, mat)     # eachcol(m)
 MapCols{3}(fun, mat)  # reinterpret(SArray,...)
 
-using Tracker, Zygote, ForwardDiff
+using ForwardDiff, Tracker, Zygote
 ForwardDiff.gradient(m -> sum(sin, mapslices(fun, m, dims=1)), mat)
 
 Tracker.gradient(m -> sum(sin, mapcols(fun, m)), mat)[1]     # Tracker.forward per slice
@@ -88,20 +96,59 @@ Zygote.gradient(m -> sum(sin, jumap(fun, m)), mat)[1]
 @btime Zygote.gradient(m -> sum(sin, jumap(fun, m)), $mat1k); # 18.638 ms
 ```
 
+That's a 2-line gradient definition, so borrowing it may be easier than depending on this package. 
+
+The original motivation for `MapCols`, which uses ForwardDiff on each slice, was that this works
+well when the function being mapped integrates some differential equation.
+
+```julia
+using DifferentialEquations, ParameterizedFunctions
+ode = @ode_def begin
+  du = ( - k2 * u )/(k1 + u) # an equation with 2 parameters
+end k1 k2
+
+function g(k::AbstractVector{T}, times) where T
+    u0 = T[ 1.0 ] # NB convert initial values to eltype(k)
+    prob = ODEProblem(ode, u0, (0.0, 0.0+maximum(times)), k)
+    Array(solve(prob, saveat=times))::Matrix{T}
+end
+
+kay = rand(2,50);
+MapCols{2}(g, kay, 1:5) # 5 time steps, for each column of parameters
+
+Tracker.gradient(k -> sum(sin, MapCols{2}(g, k, 1:5)), kay)[1]
+```
+
+This is quite efficient, and it also seems to work well with multi-threading:
+
+```julia
+@btime MapCols{2}(g, $kay, 1:5)        # 1.369 ms
+@btime ThreadMapCols{2}(g, $kay, 1:5)  #   670.384 μs
+
+@btime Tracker.gradient(k -> sum(sin, MapCols{2}(g, k, 1:5)), $kay)[1]       # 2.438 ms
+@btime Tracker.gradient(k -> sum(sin, ThreadMapCols{2}(g, k, 1:5)), $kay)[1] # 1.229 ms
+
+Threads.nthreads() == 4
+```
+
 ### Elsewhere
 
-About mapslices:
+Issues about mapslices:
 * https://github.com/FluxML/Zygote.jl/issues/92
 * https://github.com/FluxML/Flux.jl/issues/741
 * https://github.com/JuliaLang/julia/issues/29146
 
+Differential equations:
+* https://arxiv.org/abs/1812.01892 "DSAAD"
+* http://docs.juliadiffeq.org/latest/analysis/sensitivity.html
+
 Other packages which define gradients of possible interest:
 * https://github.com/GiggleLiu/LinalgBackwards.jl
 * https://github.com/mcabbott/ArrayAllez.jl
 
-AD packages this could perhaps support, quite the zoo:
-* https://github.com/invenia/Nabla.jl
+Differentiation packages this could perhaps support, quite the zoo:
 * https://github.com/dfdx/Yota.jl
+* https://github.com/invenia/Nabla.jl
 * https://github.com/denizyuret/AutoGrad.jl
 * https://github.com/Roger-luo/YAAD.jl
 * And perhaps one day, just https://github.com/JuliaDiff/ChainRules.jl
diff --git a/src/SliceMap.jl b/src/SliceMap.jl
@@ -1,7 +1,7 @@
 
 module SliceMap
 
-export mapcols, MapCols, maprows, slicemap
+export mapcols, MapCols, maprows, slicemap, ThreadMapCols
 
 using MacroTools, Requires, WeightedArrays, TensorCast, JuliennedArrays
 
@@ -98,25 +98,29 @@ MapCols(f::Function, M::AT, args...) where {AT<:WeightedArrays.MaybeWeightedMatr
 MapCols{d}(f::Function, M::WeightedMatrix, args...) where {d} =
     Weighted(MapCols{d}(f, M.array, args...), M.weights, M.opt)
 
-MapCols{d}(f::Function, M::AbstractMatrix, args...) where {d} = _MapCols(f, M, Val(d), args...)
+MapCols{d}(f::Function, M::AbstractMatrix, args...) where {d} =
+    _MapCols(f, M, Val(d), Val(false), args...)
 
-function _MapCols(f::Function, M::Matrix{T}, ::Val{d}, args...) where {T,d}
+function _MapCols(f::Function, M::Matrix{T}, ::Val{d}, tval::Val, args...) where {T,d}
     d == size(M,1) || error("expected M with $d columns")
     A = reinterpret(SArray{Tuple{d}, T, 1, d}, vec(M))
-    B = map(col -> surevec(f(col, args...)), A)
+    B = maybethreadmap(col -> surevec(f(col, args...)), A, tval)
     reduce(hcat, B)
 end
 
-_MapCols(f::Function, M::TrackedMatrix, dval, args...) = track(_MapCols, f, M, dval, args...)
+_MapCols(f::Function, M::TrackedMatrix, dval, tval, args...) =
+    track(_MapCols, f, M, dval, tval, args...)
 
-@grad _MapCols(f::Function, M::TrackedMatrix, dval, args...) = ∇MapCols(f, M, dval, args...)
+@grad _MapCols(f::Function, M::TrackedMatrix, dval, tval, args...) =
+    ∇MapCols(f, M, dval, tval, args...)
+
+function ∇MapCols(f::Function, M::AbstractMatrix{T}, dval::Val{d}, tval::Val, args...) where {T,d}
 
-function ∇MapCols(f::Function, M::AbstractMatrix{T}, dval::Val{d}, args...) where {T,d}
     d == size(M,1) || error("expected M with $d columns")
     A = reinterpret(SArray{Tuple{d}, T, 1, d}, vec(data(M)))
 
     dualcol = SVector(ntuple(j->ForwardDiff.Dual(0, ntuple(i->i==j ? 1 : 0, dval)...), dval))
-    C = map(col -> surevec(f(col + dualcol, args...)), A)
+    C = maybethreadmap(col -> surevec(f(col + dualcol, args...)), A, tval)
 
     Z = reduce(hcat, map(col -> ForwardDiff.value.(col), C))
 
@@ -130,7 +134,7 @@ function ∇MapCols(f::Function, M::AbstractMatrix{T}, dval::Val{d}, args...) wh
                 end
             end
         end
-        (nothing, ∇M, nothing, map(_->nothing, args)...)
+        (nothing, ∇M, nothing, nothing, map(_->nothing, args)...)
     end
     Z, back
 end
@@ -210,5 +214,45 @@ end
 # Following a suggestion? Doesn't help.
 # @adjoint Base.collect(x) = collect(x), Δ -> (Δ,)
 
+#========== Threaded Map ==========#
+
+# What KissThreading does is much more complicated, perhaps worth investigating:
+# https://github.com/mohamed82008/KissThreading.jl/blob/master/src/KissThreading.jl
+
+function threadmap(f::Function, v::AbstractVector)
+    length(v)==0 && error("can't map over empty vector, sorry")
+    out1 = f(first(v))
+    _threadmap(out1, f, v)
+end
+# NB function barrier: `_threadmap` specialises on `typeof(out1)`, used as the output eltype
+function _threadmap(out1, f, v)
+    out = Vector{typeof(out1)}(undef, length(v))
+    out[1] = out1
+    Threads.@threads for i=2:length(v)
+        @inbounds out[i] = f(v[i])
+    end
+    out
+end
+
+# This switch, done by dispatch on Val{true} / Val{false}, is fast inside ∇MapCols (after many attempts!)
+maybethreadmap(f, v, ::Val{true}) = threadmap(f, v)
+maybethreadmap(f, v, ::Val{false}) = map(f, v)
+
+struct ThreadMapCols{d} end
+
+"""
+    ThreadMapCols{d}(f, m::Matrix, args...)
+
+Like `MapCols` but with multi-threading!
+"""
+ThreadMapCols(f::Function, M::AT, args...) where {AT<:WeightedArrays.MaybeWeightedMatrix} =
+    ThreadMapCols{size(M,1)}(f, M, args...)
+
+ThreadMapCols{d}(f::Function, M::WeightedMatrix, args...) where {d} =
+    Weighted(ThreadMapCols{d}(f, M.array, args...), M.weights, M.opt)
+
+ThreadMapCols{d}(f::Function, M::AbstractMatrix, args...) where {d} =
+    _MapCols(f, M, Val(d), Val(true), args...)
+
 
 end # module
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -13,14 +13,17 @@ Zygote.refresh()
 
     @test res ≈ mapcols(fun, mat)
     @test res ≈ MapCols{3}(fun, mat)
+    @test res ≈ ThreadMapCols{3}(fun, mat)
 
     grad = ForwardDiff.gradient(m -> sum(sin, mapslices(fun, m, dims=1)), mat)
 
     @test grad ≈ Tracker.gradient(m -> sum(sin, mapcols(fun, m)), mat)[1]
     @test grad ≈ Tracker.gradient(m -> sum(sin, MapCols{3}(fun, m)), mat)[1]
+    @test grad ≈ Tracker.gradient(m -> sum(sin, ThreadMapCols{3}(fun, m)), mat)[1]
 
     @test grad ≈ Zygote.gradient(m -> sum(sin, mapcols(fun, m)), mat)[1]
     @test grad ≈ Zygote.gradient(m -> sum(sin, MapCols{3}(fun, m)), mat)[1]
+    @test grad ≈ Zygote.gradient(m -> sum(sin, ThreadMapCols{3}(fun, m)), mat)[1]
 
     tcm(mat) = @cast out[i,j] := fun(mat[:,j])[i]
     @test res ≈ tcm(mat)