
Commit 002092a

Author: Michael Abbott
Message: two plus
1 parent dd3b0a8 · commit 002092a

File tree: 2 files changed, +112 −74 lines


README.md

Lines changed: 23 additions & 10 deletions
@@ -4,7 +4,7 @@ It would be nice if [Flux](https://github.com/FluxML/Flux.jl) worked with `mapsl
 or with something generalising that. This package has some quick attempts:
 
 ```julia
-mat = rand(1:99, 3,10)
+mat = rand(1:9, 3,10)
 fun(x) = 2 .+ x.^2
 mapslices(fun, mat, dims=1)
 
@@ -31,32 +31,45 @@ mat1k = rand(3,1000);
 
 @btime mapslices(fun, $mat1k, dims=1) # 1.017 ms
 @btime mapcols(fun, $mat1k) # 399.016 μs
-@btime MapCols{3}(fun, $mat1k) # 46.733 μs
-@btime MapCols(fun, $mat1k) # 59.471 μs without size
+@btime MapCols{3}(fun, $mat1k) # 15.564 μs
+@btime MapCols(fun, $mat1k) # 16.774 μs without size
 
 @btime ForwardDiff.gradient(m -> sum(sin, mapslices(fun, m, dims=1)), $mat1k); # 372.705 ms
 @btime Tracker.gradient(m -> sum(sin, mapcols(fun, m)), $mat1k); # 70.203 ms
-@btime Tracker.gradient(m -> sum(sin, MapCols{3}(fun, m)), $mat1k); # 255.032 μs, 690.09 KiB
+@btime Tracker.gradient(m -> sum(sin, MapCols{3}(fun, m)), $mat1k); # 146.561 μs, 330.51 KiB
 @btime Zygote.gradient(m -> sum(sin, mapcols(fun, m)), $mat1k); # 20.018 ms, 3.82 MiB
-@btime Zygote.gradient(m -> sum(sin, MapCols{3}(fun, m)), $mat1k); # 354.112 μs
+@btime Zygote.gradient(m -> sum(sin, MapCols{3}(fun, m)), $mat1k); # 245.550 μs
 ```
 
 Of course `mapslices()` does things other than columns of matrices.
 Most of which can be done better with `eachslice()` and `reduce(hcat,...)`,
-maybe with some thought one could just write gradients for those.
+maybe with some thought one could just write gradients for those...
 
-Perhaps done. The views of `eachcol()` have quite inefficient gradients,
-but `collecteachcol()` is efficient:
+Perhaps this is done. The views of `eachcol()` have quite inefficient gradients,
+because for each `view()` they make a fresh `zero(A)`, but `collecteachcol()` is efficient:
 
 ```julia
 @btime Zygote.gradient(m -> sum(sin, mapcols4(fun, m)), $mat1k); # 45.616 ms, 49.49 MiB
 @btime Zygote.gradient(m -> sum(sin, mapcols6(fun, m)), $mat1k); # 18.655 ms, 3.37 MiB
 ```
 
-<!--
 Or for the slice/glue functions in [TensorCast](https://github.com/mcabbott/TensorCast.jl),
 which now does some mapslices things (and will soon do many more) by chaining such functions.
--->
+
+```julia
+using TensorCast
+@cast [i,j] := fun(mat[:,j])[i] # same as mapcols
+
+tcm(mat) = @cast out[i,j] := fun(mat[:,j])[i]
+Zygote.gradient(m -> sum(sin, tcm(m)), mat)[1]
+
+@btime tcm($mat1k) # 407.176 μs
+@btime Zygote.gradient(m -> sum(sin, tcm(m)), $mat1k) # 19.086 ms
+
+ten = rand(1:9, 3,10,2)
+@cast zed[i,j,k] := fun(ten[i,:,k])[j]
+Zygote.gradient(m -> sum(sin, @cast zed[i,j,k] := fun(m[i,:,k])[j] nolazy), ten)[1]
+```
 
 Issues about mapslices:
 * https://github.com/FluxML/Zygote.jl/issues/92
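
The `MapCols` timings above rest on the forward-mode trick visible in the source diff below: each length-d column is perturbed by a static vector of `ForwardDiff.Dual` numbers, one partial slot per row, so a single call to `fun` returns both the value and that column's Jacobian. A minimal self-contained sketch of the seeding, outside the package (the names `d`, `col`, `dualcol` are illustrative, mirroring the diff):

```julia
using ForwardDiff, StaticArrays

fun(x) = 2 .+ x.^2  # the README's example function

d = Val(3)          # column length, known at compile time
# One Dual per row, each carrying a unit perturbation in its own slot:
dualcol = SVector(ntuple(j -> ForwardDiff.Dual(0, ntuple(i -> i==j ? 1 : 0, d)...), d))

col = SVector(1.0, 2.0, 3.0)
y = fun(col .+ dualcol)

ForwardDiff.value.(y)     # the plain result fun(col)
ForwardDiff.partials.(y)  # slot i of row r holds ∂y[r]/∂col[i]; here diagonal, 2 .* col
```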

src/SliceMap.jl

Lines changed: 89 additions & 64 deletions
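The central change in this file is that the forward pass now reinterprets the matrix's memory as a vector of static columns instead of reshaping via TensorCast's `@cast`, which avoids a copy. A small sketch of that pattern on its own (the 3×4 `M` and the squaring function are illustrative, not the package's API):

```julia
using StaticArrays

M = rand(3, 4)
# View the same memory as 4 static length-3 columns, without copying:
A = reinterpret(SVector{3, Float64}, vec(M))
B = map(col -> col .^ 2, A)  # each result is again an SVector{3}
reduce(hcat, B)              # glue back into a 3×4 matrix
```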
@@ -1,7 +1,8 @@
 
 module SliceMap
 
-export MapCols, mapcols
+export MapCols, mapcols, maprows
+
 
 #========== Gradient Macro ==========#
 
@@ -26,6 +27,7 @@ function trackergrad(ex)
     MacroTools.@q(Tracker._forward($(args...)) where $(T...) = $body) |> esc
 end
 
+
 #========== Reverse, Eachslice ==========#
 
 using WeightedArrays
@@ -38,15 +40,18 @@ All further arguments are scalar constants, i.e. they do not get sliced/iterated
 nor are their gradients tracked.
 """
 mapcols(f::Function, M::AbstractMatrix, args...) =
-    reduce(hcat, [ rvec(f(col, args...)) for col in eachcol(M) ])
+    reduce(hcat, [ surevec(f(col, args...)) for col in eachcol(M) ])
 
 mapcols(f::Function, M::WeightedMatrix, args...) =
     Weighted(mapcols(f, M.array, args...), M.weights, M.opt)
 
+surevec(x::Number) = [x] # to allow f vector -> scalar, as mapslices does
+surevec(A) = vec(A) # to allow f vector -> matrix, by reshaping
+
 mapcols(f::Function, M::TrackedMatrix, args...) = track(mapcols, f, M, args...)
 
-@gradadjoint function mapcols(f::Function, M::AbstractMatrix, args...)
-    res = [ Tracker.forward(x -> rvec(f(x, args...)), col) for col in eachcol(data(M)) ]
+@grad function mapcols(f::Function, M::AbstractMatrix, args...)
+    res = [ Tracker.forward(x -> surevec(f(x, args...)), col) for col in eachcol(data(M)) ]
     fwd = reduce(hcat, data.(first.(res)))
     function back(Δ)
         cols = [ data((last(res[c]))(Δcol)[1]) for (c, Δcol) in enumerate(eachcol(data(Δ))) ]
@@ -56,25 +61,26 @@ mapcols(f::Function, M::TrackedMatrix, args...) = track(mapcols, f, M, args...)
     fwd, back
 end
 
-# @gradadjoint not yet working
-Zygote.@adjoint function mapcols(f::Function, M::Matrix, args...)
-    res = [ Zygote.forward(x -> rvec(f(x, args...)), col) for col in eachcol(data(M)) ]
-    fwd = reduce(hcat, data.(first.(res)))
+@adjoint function mapcols(f::Function, M::Matrix, args...)
+    res = [ Zygote.forward(x -> surevec(f(x, args...)), col) for col in eachcol(M) ]
+    fwd = reduce(hcat, first.(res))
     function back(Δ)
-        cols = [ data((last(res[c]))(Δcol)[1]) for (c, Δcol) in enumerate(eachcol(data(Δ))) ]
+        cols = [ (last(res[c]))(Δcol)[1] for (c, Δcol) in enumerate(eachcol(Δ)) ]
         ∇M = reduce(hcat, cols)
         (nothing, ∇M, map(_->nothing, args)...)
     end
     fwd, back
 end
 
 maprows(f::Function, M::AbstractMatrix, args...) =
-    reduce(vcat, [ tvec(f(col, args...)) for col in eachrow(M) ])
+    reduce(vcat, [ surerow(f(col, args...)) for col in eachrow(M) ])
+
+surerow(x) = transpose(surevec(x))
 
 maprows(f::Function, M::TrackedMatrix, args...) = track(maprows, f, M, args...)
 
-@gradadjoint function maprows(f::Function, M::AbstractMatrix, args...)
-    res = [ Tracker.forward(x -> tvec(f(x, args...)), row) for row in eachrow(data(M)) ]
+@grad function maprows(f::Function, M::AbstractMatrix, args...)
+    res = [ Tracker.forward(x -> surerow(f(x, args...)), row) for row in eachrow(data(M)) ]
    fwd = reduce(vcat, data.(first.(res)))
    function back(Δ)
        rows = [ data((last(res[r]))(Δrow)[1]) for (r, Δrow) in enumerate(eachrow(data(Δ))) ]
@@ -87,7 +93,7 @@ end
 
 #========== Forward, Static ==========#
 
-using TensorCast, StaticArrays, WeightedArrays
+using StaticArrays, ForwardDiff, WeightedArrays
 
 struct MapCols{d} end
 
@@ -106,48 +112,72 @@ Takes `m.weights` along for the ride.
 MapCols(f::Function, M::WeightedArrays.MaybeWeightedMatrix, args...) =
     MapCols{size(M,1)}(f, M, args...)
 
-MapCols{d}(f::Function, M::WeightedMatrix, args...) where {d} = 
+MapCols{d}(f::Function, M::WeightedMatrix, args...) where {d} =
     Weighted(MapCols{d}(f, M.array, args...), M.weights, M.opt)
 
-function MapCols{d}(f::Function, M::Matrix, args...) where {d}
-    @cast A[c]{r:d} := M[r,c] assert
-    reduce(hcat, [ rvec(f(acol, args...)) for acol in A ])
+MapCols{d}(f::Function, M::AbstractMatrix, args...) where {d} = _MapCols(f, M, Val(d), args...)
+
+function _MapCols(f::Function, M::Matrix{T}, ::Val{d}, args...) where {T,d}
+    d == size(M,1) || error("expected M with $d columns")
+    # @cast A[c]{r:d} := M[r,c] assert
+    A = reinterpret(SArray{Tuple{d}, T, 1, d}, vec(M))
+    B = map(col -> surevec(f(col, args...)), A)
+    reduce(hcat, B)
+    # maybestaticgluecols(B)
+end
 
-# TODO: call some function which static-glues if possible...
-# TensorCast.auto_glue(map(col -> rvec(f(col, args...)), A), (:,*))
+# surevec(x::MArray) = Array(x) # avoid making a huge MArray, ad
 
-# TODO: can I thread this? Is it even safe to do so?
-# https://github.com/mohamed82008/KissThreading.jl
+function maybestaticgluecols(B)
+    TB = eltype(B)
+    if TB <: SArray
+        C = collect(reshape(reinterpret(eltype(TB), B),:,length(B)))
+    elseif TB <: MArray
+        C = reduce(hcat, Array.(B))
+    else
+        C = reduce(hcat, B)
+    end
 end
 
-rvec(x::Number) = [x] # to allow for f vector -> scalar, as mapslices does
-rvec(x::StaticArray) = vec(Array(x)) # to avoid creating a giant staticarray, as reduce(hcat would otherwise do
-rvec(A) = vec(A) # LinearAlgebra.
+# surevecS(x::Number) = @SVector [x]
+# surevecS(A) = vec(A) # like surevec
+
+_MapCols(f::Function, M::TrackedMatrix, dval, args...) = track(_MapCols, f, M, dval, args...)
 
-tvec(x) = transpose(rvec(x))
+@grad _MapCols(f::Function, M::TrackedMatrix, dval, args...) = ∇MapCols(f, M, dval, args...)
 
-using ForwardDiff
+@adjoint _MapCols(f::Function, M::Matrix, dval, args...) = ∇MapCols(f, M, dval, args...)
 
-MapCols{d}(f::Function, M::TrackedMatrix, args...) where {d} = track(MapCols, f, M, Val(d), args...)
+function ∇MapCols(f::Function, M::AbstractMatrix{T}, dval::Val{d}, args...) where {T,d}
 
-@grad function MapCols(f::Function, M::TrackedMatrix, dval::Val{d}, args...) where {d}
+    d == size(M,1) || error("expected M with $d columns")
+    # @cast A[c]{r:d} := data(M)[r,c]
+    A = reinterpret(SArray{Tuple{d}, T, 1, d}, vec(data(M)))
 
-    @cast A[c]{r:d} := M.data[r,c]
     dualcol = SVector(ntuple(j->ForwardDiff.Dual(0, ntuple(i->i==j ? 1 : 0, dval)...), dval))
 
-    C = [ rvec(f(acol .+ dualcol, args...)) for acol in A ]
+    # C = [ surevec(f(col .+ dualcol, args...)) for col in A ]
+    C = map(col -> surevec(f(col .+ dualcol, args...)), A)
 
-    Z = reduce(hcat, [ ForwardDiff.value.(full) for full in C ]) # full is not an SVector here
+    # Z = reduce(hcat, [ ForwardDiff.value.(full) for full in C ])
+    Z = reduce(hcat, map(col -> ForwardDiff.value.(col), C))
 
     function back(ΔZ)
-        ∇M = similar(data(M)) .+ zero(first(data(ΔZ)))
+        # accum = zero(eltype(data(ΔZ)))
+        # ∇M = similar(data(M)) .+ zero(first(data(ΔZ)))
+        ∇M = zeros(eltype(data(ΔZ)), size(M))
        @inbounds for c=1:size(M,2)
            part = ForwardDiff.partials.(C[c])
            for r=1:d
-                ∇M[r,c] = 0
+                # ∇M[r,c] = 0
+                # accum = 0
                for i=1:size(ΔZ,1)
                    ∇M[r,c] += data(ΔZ)[i,c] * part[i].values[r]
+                    # parti = ForwardDiff.partials(C[c][i])
+                    # ∇M[r,c] += data(ΔZ)[i,c] * parti.values[r]
+                    # accum += data(ΔZ)[i,c] * part[i].values[r]
                end
+                # ∇M[r,c] = accum
            end
        end
        (nothing, ∇M, nothing, map(_->nothing, args)...)
@@ -156,37 +186,11 @@ MapCols{d}(f::Function, M::TrackedMatrix, args...) where {d} = track(MapCols, f,
     Z, back
 end
 
-# TODO make a _MapCols which always takes Val(d), then unite these
-
-Zygote.@adjoint function MapCols{d}(f::Function, M::Matrix, args...) where {d} # no dval!
-
-    @cast A[c]{r:d} := M[r,c]
-    dualcol = SVector(ntuple(j->ForwardDiff.Dual(0, ntuple(i->i==j ? 1 : 0, Val(d))...), Val(d)))
-
-    C = [ rvec(f(acol .+ dualcol, args...)) for acol in A ]
-
-    Z = reduce(hcat, [ ForwardDiff.value.(full) for full in C ])
-
-    function back(ΔZ)
-        ∇M = similar(data(M)) .+ zero(first(data(ΔZ)))
-        @inbounds for c=1:size(M,2)
-            part = ForwardDiff.partials.(C[c])
-            for r=1:d
-                ∇M[r,c] = 0
-                for i=1:size(ΔZ,1)
-                    ∇M[r,c] += data(ΔZ)[i,c] * part[i].values[r]
-                end
-            end
-        end
-        (nothing, ∇M, map(_->nothing, args)...) # changed!
-    end
-
-    Z, back
-end
 
 #========== Gradient for eachslice / reduce ==========#
 
-export gluecol, mapcols2, mapcols4, mapcols5, mapcols6, mapcols7
+export gluecol, collecteachcol
+export mapcols2, mapcols4, mapcols5, mapcols6, mapcols7
 
 gluecol(V::AbstractVector{<:AbstractVector}) = reduce(hcat, V)
 
@@ -235,14 +239,14 @@ end
 # dy = (f = (A = [47.9325 51.3781
 # Which means this works... but uses as much memory as gradient of array of views:
 
-Zygote.@adjoint function eachcol(x::AbstractMatrix)
+#=Zygote.@adjoint function eachcol(x::AbstractMatrix)
     eachcol(x), dy -> (dy.f.A,) #= begin
         @show typeof(dy) dy
         dx = zero(x) .+ 0.0 # zeros(eltype(dy), size(x))
         foreach(copyto!, eachcol(dx), dy)
        (dx,)
    end =#
-end
+end=#
 
 # @adjoint eachcol(x) = eachcol(x), dy -> (dy.f.A,)
 
@@ -254,7 +258,7 @@ end
 
 collecteachcol(x) = collect(eachcol(x))
 
-Zygote.@adjoint function collecteachcol(x)
+@adjoint function collecteachcol(x)
     collecteachcol(x), dy -> begin
         dx = _zero(x)
         foreach(copyto!, collecteachcol(dx), dy)
@@ -274,4 +278,25 @@ end
 # reduce(hcat, res)
 # end
 
+# Following a suggestion? Doesn't help.
+# @adjoint Base.collect(x) = collect(x), Δ -> (Δ,)
+
+
+#========== Gradients for TensorCast's functions ==========#
+
+using TensorCast
+
+@adjoint function TensorCast.sliceview(A::AbstractArray, code::Tuple)
+    TensorCast.sliceview(A, code), Δ -> begin
+        dA = _zero(A)
+        foreach(copyto!, TensorCast.sliceview(dA, code), Δ)
+        (dA, nothing)
+    end
+end
+
+@adjoint function TensorCast.red_glue(A::AbstractArray, code::Tuple)
+    TensorCast.red_glue(A, code), Δ -> (TensorCast.sliceview(Δ, code), nothing)
+end
+
+
 end # module
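
The TensorCast adjoints added at the end of the file encode one symmetry: gluing slices and slicing the glued array are adjoint operations, so the pullback of `red_glue` simply slices Δ, and the pullback of `sliceview` copies Δ's slices into a zeroed array. A toy sketch of the same pattern, with a hypothetical `glue` standing in for TensorCast's function:

```julia
using Zygote

# Hypothetical stand-in for a glue-columns function:
glue(cols) = reduce(hcat, cols)

# Pullback of gluing = slicing the cotangent Δ back into columns:
Zygote.@adjoint glue(cols) = glue(cols), Δ -> ([Δ[:, j] for j in axes(Δ, 2)],)

cols = [rand(3) for _ in 1:4]
# Each gradient entry is cos.(cols[j]), since the derivative of sum(sin, x) is cos.(x):
Zygote.gradient(cs -> sum(sin, glue(cs)), cols)[1]
```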
