@@ -3,29 +3,63 @@ export softmax, softmax!, ∇softmax, ∇softmax!,

"""
    softmax(xs) = exp.(xs) ./ sum(exp.(xs))
-
[Softmax](https://en.wikipedia.org/wiki/Softmax_function) takes
log-probabilities (any real vector) and returns a probability distribution that
sums to 1.
-
If given a matrix it will treat it as a batch of vectors, with each column
independent.
-
    julia> softmax([1,2,3.])
    3-element Array{Float64,1}:
     0.0900306
     0.244728
     0.665241
"""
+softmax(xs) = softmax!(similar(xs), xs)
+
function softmax(xs::AbstractArray{T}; dims=1) where {T}
    max = maximum(xs, dims=dims)
    out = exp.(xs .- max)
    out = out ./ sum(out, dims=dims)
end

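As a sanity check on the column-wise behavior the docstring describes, here is a hypothetical REPL session (not part of the diff; it assumes the definitions in this file are loaded):

    julia> xs = [1.0 2.0; 3.0 4.0];

    julia> softmax(xs)    # each column is normalized independently
    2×2 Array{Float64,2}:
     0.119203  0.119203
     0.880797  0.880797

    julia> all(sum(softmax(xs), dims=1) .≈ 1)
    true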
+function softmax!(out::AbstractVecOrMat{T}, xs::AbstractVecOrMat{T}) where {T}
+    @inbounds for j = 1:size(xs, 2)
+        # First, store the column-wise maximum in the last element of `out`
+        out[end, j] = xs[end, j]
+        @inbounds for i = 1:(size(xs, 1) - 1)
+            out[end, j] = max(out[end, j], xs[i, j])
+        end
+
+        # Subtract the column-wise maximum for numerical stability, then take exp()
+        # out .= exp.(xs .- out[end, :])
+        @inbounds for i = 1:size(out, 1)
+            out[i, j] = exp(xs[i, j] - out[end, j])
+        end
+
+        # Normalize each column by its sum
+        # out ./= sum(out, dims=1)
+        s = T(0)
+        @inbounds for i = 1:size(out, 1)
+            s += out[i, j]
+        end
+        @inbounds for i = 1:size(out, 1)
+            out[i, j] /= s
+        end
+    end
+    return out
+end
+
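The point of the new in-place method is that a caller can preallocate one output buffer and reuse it across calls, e.g. inside a training loop. A usage sketch, assuming the definitions in this diff (the buffer name `out` is illustrative):

    julia> xs = rand(10, 32);    # batch of 32 column vectors

    julia> out = similar(xs);

    julia> softmax!(out, xs);    # fills `out` in place, no per-call allocation

    julia> out ≈ softmax(xs)
    true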
+function ∇softmax!(out::AbstractVecOrMat, Δ::AbstractVecOrMat, xs::AbstractVecOrMat)
+    sf = softmax(xs)
+    out .= sf .* (Δ .- sum(Δ .* sf, dims = 1))
+end
+
+∇softmax(Δ, xs) = ∇softmax!(similar(Δ), Δ, xs)
+∇softmax!(Δ, xs) = ∇softmax!(Δ, Δ, xs)
+
+
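Since every output column sums to 1, the softmax Jacobian annihilates constant vectors, so a constant upstream gradient Δ should give a gradient of (numerically) zero. A quick check, again assuming the definitions above:

    julia> xs = [1, 2, 3.];

    julia> isapprox(∇softmax(ones(3), xs), zeros(3), atol=1e-12)
    true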
"""
    logsoftmax(xs) = log.(exp.(xs) ./ sum(exp.(xs)))
-
`logsoftmax(xs)` computes the log of `softmax(xs)`, but in a more numerically stable
way than directly taking the log of the softmax function, which is commonly used in
computing cross entropy loss.
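The stability claim is easiest to see when one probability underflows to zero: taking `log.(softmax(xs))` then yields `-Inf`, while the fused form stays finite. A sketch of the difference, assuming the `logsoftmax` defined below this hunk uses the same max-subtraction trick as `softmax!`:

    julia> xs = [0.0, 1000.0];

    julia> log.(softmax(xs))    # softmax underflows to [0.0, 1.0], so log gives -Inf
    2-element Array{Float64,1}:
     -Inf
       0.0

    julia> logsoftmax(xs)       # xs .- maximum(xs) .- log(sum(exp.(xs .- maximum(xs))))
    2-element Array{Float64,1}:
     -1000.0
         0.0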