day two

Michael Abbott · Michael Abbott · commit dd3b0a83ce32 · 2019-06-06T15:37:31.000+02:00
diff --git a/Project.toml b/Project.toml
@@ -5,12 +5,16 @@ version = "0.1.0"
 
 [deps]
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 TensorCast = "02d47bb6-7ce6-556a-be16-bb1710789e2b"
 Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
 WeightedArrays = "379a43df-f81c-573e-83a6-069eb6c11a71"
 Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
+[compat]
+julia = "1"
+
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
diff --git a/README.md b/README.md
@@ -19,7 +19,7 @@ ForwardDiff.gradient(m -> sum(sin, mapslices(fun, m, dims=1)), mat)
 Tracker.gradient(m -> sum(sin, mapcols(fun, m)), mat)[1]     # Tracker.forward per slice
 Tracker.gradient(m -> sum(sin, MapCols{3}(fun, m)), mat)[1]  # ForwardDiff on slices
 
-# Zygote.gradient(m -> sum(sin, mapslices(fun, m, dims=1)), mat)
+# Zygote.gradient(m -> sum(sin, mapslices(fun, m, dims=1)), mat) # errors
 Zygote.gradient(m -> sum(sin, mapcols(fun, m)), mat)[1]      # Zygote.forward 
 Zygote.gradient(m -> sum(sin, MapCols{3}(fun, m)), mat)[1]
 ```
@@ -36,15 +36,28 @@ mat1k = rand(3,1000);
 
 @btime ForwardDiff.gradient(m -> sum(sin, mapslices(fun, m, dims=1)), $mat1k); # 372.705 ms
 @btime Tracker.gradient(m -> sum(sin, mapcols(fun, m)), $mat1k);               #  70.203 ms
-@btime Tracker.gradient(m -> sum(sin, MapCols{3}(fun, m)), $mat1k);            #     255.032 μs
-@btime Zygote.gradient(m -> sum(sin, mapcols(fun, m)), $mat1k);                #  20.018 ms
+@btime Tracker.gradient(m -> sum(sin, MapCols{3}(fun, m)), $mat1k);            #     255.032 μs, 690.09 KiB
+@btime Zygote.gradient(m -> sum(sin, mapcols(fun, m)), $mat1k);                #  20.018 ms, 3.82 MiB
 @btime Zygote.gradient(m -> sum(sin, MapCols{3}(fun, m)), $mat1k);             #     354.112 μs
 ```
 
 Of course `mapslices()` does things other than columns of matrices. 
 Most of which can be done better with `eachslice()` and `reduce(hcat,...)`, 
 maybe with some thought one could just write gradients for those. 
 
+Perhaps this is now done. Gradients through the column views that `eachcol()` produces are quite
+inefficient, but `collecteachcol()` (which has its own gradient definition) is efficient:
+
+```julia
+@btime Zygote.gradient(m -> sum(sin, mapcols4(fun, m)), $mat1k);  # 45.616 ms, 49.49 MiB
+@btime Zygote.gradient(m -> sum(sin, mapcols6(fun, m)), $mat1k);  # 18.655 ms,  3.37 MiB
+```
+
+<!--
 Or for the slice/glue functions in [TensorCast](https://github.com/mcabbott/TensorCast.jl),
 which now does some mapslices things (and will soon do many more) by chaining such functions.
+-->
 
+Issues about mapslices:
+* https://github.com/FluxML/Zygote.jl/issues/92
+* https://github.com/FluxML/Flux.jl/issues/741
diff --git a/src/SliceMap.jl b/src/SliceMap.jl
@@ -3,22 +3,49 @@ module SliceMap
 
 export MapCols, mapcols
 
+#========== Gradient Macro ==========#
+
+using MacroTools, Tracker, Zygote
+using Tracker: TrackedMatrix, track, @grad, data
+using Zygote: @adjoint, _zero
+
+macro gradadjoint(ex)  # presumably meant to emit Tracker and Zygote rules from one definition
+    quote
+        # $(Zygote.gradm(ex)) # this doesn't work
+        $(trackergrad(ex))  # so only the Tracker._forward method from trackergrad is emitted
+    end
+end
+
+# Copied from https://github.com/FluxML/Tracker.jl/blob/master/src/Tracker.jl#L55
+function trackergrad(ex)  # rewrite `f(args...) = body` into a `Tracker._forward` method for `f`
+  @capture(shortdef(ex), (name_(args__) = body_) |
+                         (name_(args__) where {T__} = body_)) || error("Need a function definition")
+  T === nothing && (T = [])  # no `where` clause captured; `===` is the identity test for nothing
+  isexpr(name, :(::)) || (name = :(::typeof($name)))  # dispatch on the function's own type
+  insert!(args, 1+isexpr(args[1], :parameters) , name)  # placed after a leading keyword block
+  MacroTools.@q(Tracker._forward($(args...)) where $(T...) = $body) |> esc
+end
+
 #========== Reverse, Eachslice ==========#
 
+using WeightedArrays
+
 """
     mapcols(f, m::Matrix, args...) = reduce(hcat, f(c, args...) for c in eachcol(M))
 
 When `m::TrackedMatrix`, it saves the backward function for each slice.
+All further arguments are treated as scalar constants: they are not sliced or iterated over
+(unlike in `map`), and their gradients are not tracked.
 """
-mapcols(f::Function, M::Matrix, args...) =
+mapcols(f::Function, M::AbstractMatrix, args...) =
     reduce(hcat, [ rvec(f(col, args...)) for col in eachcol(M) ])
 
-using Tracker
-using Tracker: TrackedMatrix, track, @grad, data
+mapcols(f::Function, M::WeightedMatrix, args...) =  # map over M.array, re-attach weights and opt
+    Weighted(mapcols(f, M.array, args...), M.weights, M.opt)
 
 mapcols(f::Function, M::TrackedMatrix, args...) = track(mapcols, f, M, args...)
 
-@grad function mapcols(f::Function, M::TrackedMatrix, args...)
+@gradadjoint function mapcols(f::Function, M::AbstractMatrix, args...)
     res = [ Tracker.forward(x -> rvec(f(x, args...)), col) for col in eachcol(data(M)) ]
     fwd = reduce(hcat, data.(first.(res)))
     function back(Δ)
@@ -29,7 +56,7 @@ mapcols(f::Function, M::TrackedMatrix, args...) = track(mapcols, f, M, args...)
     fwd, back
 end
 
-using Zygote
+# @gradadjoint not yet working
 Zygote.@adjoint function mapcols(f::Function, M::Matrix, args...)
     res = [ Zygote.forward(x -> rvec(f(x, args...)), col) for col in eachcol(data(M)) ]
     fwd = reduce(hcat, data.(first.(res)))
@@ -41,6 +68,23 @@ Zygote.@adjoint function mapcols(f::Function, M::Matrix, args...)
     fwd, back
 end
 
+maprows(f::Function, M::AbstractMatrix, args...) =  # each f(row, args...) becomes one output row
+    reduce(vcat, [ tvec(f(row, args...)) for row in eachrow(M) ])
+
+maprows(f::Function, M::TrackedMatrix, args...) = track(maprows, f, M, args...)  # tracked input
+
+@gradadjoint function maprows(f::Function, M::AbstractMatrix, args...)  # Tracker rule, row by row
+    res = [ Tracker.forward(x -> rvec(f(x, args...)), row) for row in eachrow(data(M)) ]
+    fwd = reduce(vcat, tvec.(data.(first.(res))))  # each vector output becomes one row of the result
+    function back(Δ)  # Δrow below is a vector, matching the vector output of each forward pass
+        rows = [ tvec(data(last(r)(Δrow)[1])) for (r, Δrow) in zip(res, eachrow(data(Δ))) ]
+        ∇M = reduce(vcat, rows)  # slice gradients are vectors; stacked as rows they match M's shape
+        (nothing, ∇M, map(_->nothing, args)...)  # no gradient for f or for the extra args
+    end
+    fwd, back
+end
+
+
 #========== Forward, Static ==========#
 
 using TensorCast, StaticArrays, WeightedArrays
@@ -80,6 +124,7 @@ rvec(x::Number) = [x] # to allow for f vector -> scalar, as mapslices does
 rvec(x::StaticArray) = vec(Array(x)) # to avoid creating a giant staticarray, as reduce(hcat would otherwise do
 rvec(A) = vec(A) # LinearAlgebra.
 
+tvec(x) = transpose(rvec(x)) # transposed counterpart of rvec, used by maprows
 
 using ForwardDiff
 
@@ -111,6 +156,8 @@ MapCols{d}(f::Function, M::TrackedMatrix, args...) where {d} = track(MapCols, f,
     Z, back
 end
 
+# TODO make a _MapCols which always takes Val(d), then unite these
+
 Zygote.@adjoint function MapCols{d}(f::Function, M::Matrix, args...) where {d} # no dval!
 
     @cast A[c]{r:d} := M[r,c]
@@ -137,4 +184,94 @@ Zygote.@adjoint function MapCols{d}(f::Function, M::Matrix, args...) where {d} #
     Z, back
 end
 
+#========== Gradient for eachslice / reduce ==========#
+
+export gluecol, mapcols2, mapcols4, mapcols5, mapcols6  # mapcols7 is commented out, so not exported
+
+gluecol(V::AbstractVector{<:AbstractVector}) = reduce(hcat, V)  # vector of columns -> matrix
+
+gluecol(V::AbstractVector{<:TrackedVector}) = track(gluecol, V)  # tracked columns go via @grad
+
+@grad function gluecol(V::AbstractVector)  # Tracker rule: split ΔM back into its columns
+    gluecol(data.(V)), ΔM -> (collect(eachcol(data(ΔM))),) # doesn't work
+end  # NOTE(review): the cause of the failure is not recorded here — TODO investigate
+
+Zygote.@adjoint function gluecol(V::AbstractVector)  # Zygote rule: split ΔM back into its columns
+    gluecol(V), ΔM -> (collect(eachcol(ΔM)),) # does work!
+end
+
+function mapcols2(f, A)  # copy out each column, broadcast f over them, glue with gluecol
+    slices = [A[:, j] for j in 1:size(A, 2)]
+    outputs = f.(slices)
+    return gluecol(outputs)
+end
+
+# Apply that straight to reduce(hcat,...)
+
+Zygote.@adjoint function Base.reduce(::typeof(hcat), V::AbstractVector{<:AbstractVector})
+    reduce(hcat, V), dV -> (nothing, collect(eachcol(dV)),)  # nothing for hcat, columns of dV for V
+end  # NOTE(review): rule on a Base function with Base types (piracy) — confirm this is intended
+
+function mapcols4(f, A)  # array of column views, map f over them, then plain reduce(hcat, ...)
+    slices = [view(A, :, j) for j in 1:size(A, 2)]
+    outputs = map(f, slices)
+    return reduce(hcat, outputs)
+end
+
+# Zygote doesn't understand views, but easy to fix:
+# https://github.com/FluxML/Zygote.jl/issues/52
+# now https://github.com/FluxML/Zygote.jl/pull/219
+
+Zygote.@adjoint function view(x::AbstractArray, inds...; kwargs...)
+    view(x, inds...; kwargs...), dy -> begin
+        dx = _zero(x)  # presumably a zero array shaped like x (Zygote internal) — TODO confirm
+        copyto!(view(dx, inds...; kwargs...), dy)  # write dy into the region the view selected
+        (dx, map(_->nothing, inds)...)  # no gradient for the indices
+    end
+end
+
+# Surprisingly, the dy passed back for eachcol already seems to contain the answer:
+# typeof(dy) = NamedTuple{(:f, :iter),Tuple{NamedTuple{(:A,),Tuple{Array{Float64,2}}},Array{Nothing,1}}}
+# dy = (f = (A = [47.9325 51.3781
+# So the adjoint below works... but it uses as much memory as the gradient of an array of views:
+
+Zygote.@adjoint function eachcol(x::AbstractMatrix)  # returns the matrix stored in dy.f.A as dx
+    eachcol(x), dy -> (dy.f.A,) #= begin
+        @show typeof(dy) dy
+        dx = zero(x) .+ 0.0  # zeros(eltype(dy), size(x))
+        foreach(copyto!, eachcol(dx), dy)
+        (dx,)
+    end =#
+end  # the #= ... =# part is a disabled alternative pullback that fills dx column by column
+
+# @adjoint eachcol(x) = eachcol(x), dy -> (dy.f.A,)
+
+function mapcols5(f, A)  # columns from collect(eachcol(A)), map f over them, then reduce(hcat, ...)
+    slices = collect(eachcol(A))
+    outputs = map(f, slices)
+    return reduce(hcat, outputs)
+end
+
+collecteachcol(x) = collect(eachcol(x))  # a named function, so it can carry its own adjoint
+
+Zygote.@adjoint function collecteachcol(x)
+    collecteachcol(x), dy -> begin  # dy is iterated alongside the columns of dx
+        dx = _zero(x)  # presumably a zero array shaped like x (Zygote internal) — TODO confirm
+        foreach(copyto!, collecteachcol(dx), dy)  # copy each entry of dy into a column of dx
+        (dx,)
+    end
+end
+
+function mapcols6(f, A)  # columns from collecteachcol(A), map f over them, then reduce(hcat, ...)
+    slices = collecteachcol(A)
+    outputs = map(f, slices)
+    return reduce(hcat, outputs)
+end
+
+# function mapcols7(f, A)
+#     cols = eachcol(A) # without collect. Zygote.gradient -> StackOverflowError
+#     res = map(f, cols)
+#     reduce(hcat, res)
+# end
+
 end # module