
Commit 1b4192c

Use new capabilities in TimerOutputs.jl to instrument NNlib for performance monitoring
This uses the new zero-overhead instrumentation capabilities of `TimerOutputs.jl` to embed instrumentation that is compiled out by default, but can be trivially enabled (triggering recompilation of all instrumented methods) by running `TimerOutputs.enable_debug_timings(NNlib)`.
1 parent 494711e commit 1b4192c
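
For reference, a minimal sketch of how these timings would be collected and inspected, assuming the module-level timer `NNlib.to` added in `src/NNlib.jl` below and the `enable_debug_timings`/`print_timer` API from TimerOutputs.jl (input sizes are purely illustrative):

```julia
using TimerOutputs, NNlib

# Debug timings are compiled out by default; enabling them forces
# recompilation of every `@timeit_debug`-annotated method in NNlib.
TimerOutputs.enable_debug_timings(NNlib)

x = randn(Float32, 32, 32, 3, 4)   # hypothetical WHCN input
w = randn(Float32, 3, 3, 3, 8)     # hypothetical kernel
y = NNlib.conv(x, w)               # instrumented calls record into NNlib.to

print_timer(NNlib.to)              # show accumulated timing/allocation data
```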

File tree: 10 files changed, +307 −323 lines

src/NNlib.jl

Lines changed: 4 additions & 1 deletion

```diff
@@ -1,5 +1,5 @@
 module NNlib
-using Requires
+using Requires, TimerOutputs
 
 # Include APIs
 include("dim_helpers.jl")
@@ -21,4 +21,7 @@ include("impl/depthwiseconv_im2col.jl")
 
 # Direct implementations of pooling
 include("impl/pooling_direct.jl")
+
+to = TimerOutput()
+
 end # module NNlib
```

src/conv.jl

Lines changed: 36 additions & 40 deletions

```diff
@@ -45,9 +45,9 @@ for (front_name, backend) in (
     # We only define 3d conv primitives, we reshape lower down to get 1d and 2d convolution
     @eval begin
         # im2col-accelerated function forwarding definition
-        function $(Symbol("$(front_name)!"))(
-                        out::AbstractArray{T,5}, in1::AbstractArray{T,5},
-                        in2::AbstractArray{T,5}, cdims::ConvDims; kwargs...) where {T <: $G}
+        @timeit_debug to function $(Symbol("$(front_name)!"))(
+                        out::AbstractArray{T,5}, in1::AbstractArray{T,5},
+                        in2::AbstractArray{T,5}, cdims::ConvDims; kwargs...) where {T <: $G}
             $(Symbol("$(front_name)_$(backend)!"))(out, in1, in2, cdims; kwargs...)
         end
     end
@@ -62,9 +62,9 @@ for front_name in (:conv, :∇conv_data, :∇conv_filter,
     for N in (3, 4)
         @eval begin
             function $(Symbol("$(front_name)$(backend)!"))(
-                        y::AbstractArray{yT,$N}, x::AbstractArray{xT,$N},
-                        w::AbstractArray{wT,$N}, cdims::ConvDims;
-                        kwargs...) where {yT, xT, wT}
+                            y::AbstractArray{yT,$N}, x::AbstractArray{xT,$N},
+                            w::AbstractArray{wT,$N}, cdims::ConvDims;
+                            kwargs...) where {yT, xT, wT}
                 $(Symbol("$(front_name)$(backend)!"))(
                     insert_singleton_spatial_dimension(y, $(5 - N)),
                     insert_singleton_spatial_dimension(x, $(5 - N)),
@@ -88,43 +88,41 @@ end
 for front_name in (:conv, :∇conv_data, :∇conv_filter,
                    :depthwiseconv, :∇depthwiseconv_data, :∇depthwiseconv_filter)
     @eval begin
-        function $(Symbol("$(front_name)!"))(out::AbstractArray, in1::AbstractArray,
-                        in2::AbstractArray, cdims::ConvDims; kwargs...)
-            @debug "Slow fallback implementation invoked for $(front_name)! You probably don't want this; check your datatypes."
-            $(Symbol("$(front_name)_direct!"))(out, in1, in2, cdims; kwargs...)
+        function $(Symbol("$(front_name)!"))(
+                        y::AbstractArray{yT,N}, in1::AbstractArray{T1,N},
+                        in2::AbstractArray{T2,N}, cdims::ConvDims;
+                        kwargs...) where {yT, T1, T2, N}
+            @debug string("Slow fallback implementation invoked for $(front_name)! ",
+                          "You probably don't want this; check your datatypes.")
+            $(Symbol("$(front_name)_direct!"))(y, in1, in2, cdims; kwargs...)
         end
     end
 end
 
-# Finally, let's generate auto-allocating versions of all our functions, for all backends:
+# Finally, let's generate auto-allocating versions of all our functions, for all backends.
+# We `@timeit` these methods separately, as we want to know how much time is spent in
+# allocation. :P
 for backend in (Symbol(), :_direct, :_im2col)
     # First make auto-allocating versions of the conv()-like calls:
     for name in (:conv, :depthwiseconv)
         @eval begin
-            function $(Symbol("$(name)$(backend)"))(
-                            x::AbstractArray{xT,N}, w::AbstractArray{wT,N},
-                            cdims::ConvDims; kwargs...) where {xT, wT, N}
-                yT = promote_type(xT, wT)
-                # Annoyingly, we must allocate with `zeros()` because if we were to use
-                # the faster `similar()`, it may have NaNs within it, which will poison
-                # the output because we support accumulation (even with `beta = 0` the
-                # NaNs poison us as NaN * 0 == NaN). This is a bit of a shame, but it's
-                # not really that bad as if you're truly interested in performance, you
-                # should be allocating your own `y` and calling the non-allocating
-                # variant of this method anyway.
-                y = zeros(yT, output_size(cdims)..., channels_out(cdims), size(x, N))
+            @timeit_debug to function $(Symbol("$(name)$(backend)"))(
+                            x::AbstractArray{xT,N}, w::AbstractArray{wT,N},
+                            cdims::ConvDims; kwargs...) where {xT, wT, N}
+                y = similar(x, promote_type(xT, wT), output_size(cdims)...,
+                            channels_out(cdims), size(x,N))
                 return $(Symbol("$(name)$(backend)!"))(y, x, w, cdims; kwargs...)
             end
         end
     end
 
     for name in (:∇conv_data, :∇depthwiseconv_data)
         @eval begin
-            function $(Symbol("$(name)$(backend)"))(
-                            dy::AbstractArray{yT,N}, w::AbstractArray{wT,N},
-                            cdims::cdT; kwargs...) where {yT, wT, N, cdT <: ConvDims}
-                # Again, allocate with zeros
-                dx = zeros(yT, input_size(cdims)..., channels_in(cdims), size(dy, N))
+            @timeit_debug to function $(Symbol("$(name)$(backend)"))(
+                            dy::AbstractArray{yT,N}, w::AbstractArray{wT,N},
+                            cdims::ConvDims; kwargs...) where {yT, wT, N}
+                dx = similar(dy, input_size(cdims)..., channels_in(cdims),
+                             size(dy, N))
                 return $(Symbol("$(name)$(backend)!"))(dx, dy, w, cdims; kwargs...)
             end
         end
@@ -133,23 +131,21 @@ for backend in (Symbol(), :_direct, :_im2col)
     # We do the conv/depthwiseconv filter backprops separately, as the shape calculation
     # for `w` is slightly different for depthwise than for normal dense convolution.
     @eval begin
-        function $(Symbol("∇conv_filter$(backend)"))(
-                        x::AbstractArray{xT,N}, dy::AbstractArray{yT,N},
-                        cdims::cdT; kwargs...) where {xT, yT, N, cdT <: ConvDims}
-            # Again, allocate with zeros
-            dw = zeros(yT, kernel_size(cdims)..., channels_in(cdims),
-                       channels_out(cdims))
+        @timeit_debug to function $(Symbol("∇conv_filter$(backend)"))(
+                        x::AbstractArray{xT,N}, dy::AbstractArray{yT,N},
+                        cdims::ConvDims; kwargs...) where {xT, yT, N}
+            dw = similar(dy, kernel_size(cdims)..., channels_in(cdims),
+                         channels_out(cdims))
             return $(Symbol("∇conv_filter$(backend)!"))(dw, x, dy, cdims; kwargs...)
         end
     end
 
     @eval begin
-        function $(Symbol("∇depthwiseconv_filter$(backend)"))(
-                        x::AbstractArray{xT,N}, dy::AbstractArray{yT,N},
-                        cdims::cdT; kwargs...) where {xT, yT, N, cdT <: ConvDims}
-            # Again, allocate with zeros
-            dw = zeros(yT, kernel_size(cdims)..., channel_multiplier(cdims),
-                       channels_in(cdims))
+        @timeit_debug to function $(Symbol("∇depthwiseconv_filter$(backend)"))(
+                        x::AbstractArray{xT,N}, dy::AbstractArray{yT,N},
+                        cdims::ConvDims; kwargs...) where {xT, yT, N}
+            dw = similar(dy, kernel_size(cdims)..., channel_multiplier(cdims),
+                         channels_in(cdims))
             return $(Symbol("∇depthwiseconv_filter$(backend)!"))(dw, x, dy, cdims;
                                                                  kwargs...)
        end
```
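
As a usage note, here is a hedged sketch of the auto-allocating entry points this diff rewrites; array sizes are illustrative, and `DenseConvDims` is the dims type used elsewhere in this commit:

```julia
using NNlib

x = randn(Float32, 28, 28, 3, 1)   # WHCN input (illustrative sizes)
w = randn(Float32, 3, 3, 3, 16)    # kernel: kw × kh × C_in × C_out
cdims = DenseConvDims(x, w)

# The auto-allocating wrapper builds `y` with `similar()` (uninitialized) and
# forwards to the in-place `conv!`; performance-sensitive callers can allocate
# `y` themselves and call `conv!(y, x, w, cdims)` directly.
y = conv(x, w, cdims)
size(y)   # (26, 26, 16, 1) with the default padding/stride
```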

src/gemm.jl

Lines changed: 22 additions & 19 deletions

```diff
@@ -1,9 +1,23 @@
 ## Low level gemm! call with pointers
-## Borrowed from Knet.jl
+## Borrowed from Knet.jl, adapted for compile-time constants
 
 using LinearAlgebra
 using LinearAlgebra.BLAS: libblas, BlasInt, @blasfunc
 
+"""
+    gemm!()
+
+Low-level gemm!() call with pointers, borrowed from Knet.jl
+
+Calculates `C = alpha*op(A)*op(B) + beta*C`, where:
+  - `transA` and `transB` set `op(X)` to be either `identity()` or `transpose()`
+  - alpha and beta are scalars
+  - op(A) is an (M, K) matrix
+  - op(B) is a (K, N) matrix
+  - C is an (M, N) matrix.
+"""
+gemm!
+
 # These are the datatypes we have fast GEMM for
 gemm_datatype_mappings = (
     (:dgemm_, Float64),
@@ -13,34 +27,23 @@ gemm_datatype_mappings = (
 )
 for (gemm, elt) in gemm_datatype_mappings
     @eval begin
-        """
-            gemm!()
-
-        Low-level gemm!() call with pointers, borrowed from Knet.jl
-
-        Calculates `C = alpha*op(A)*op(B) + beta*C`, where:
-          - `transA` and `transB` set `op(X)` to be either `identity()` or `transpose()`
-          - alpha and beta are scalars
-          - op(A) is an (M, K) matrix
-          - op(B) is a (K, N) matrix
-          - C is an (M, N) matrix.
-        """
-        @inline function gemm!(transA::Val, transB::Val, M::Int, N::Int, K::Int,
+        @inline @timeit_debug to function gemm!(transA::Val, transB::Val,
+                                                M::Int, N::Int, K::Int,
                                alpha::$(elt), A::Ptr{$elt}, B::Ptr{$elt},
                                beta::$(elt), C::Ptr{$elt})
             # Convert our compile-time transpose marker to a char for BLAS
             convtrans(V::Val{false}) = 'N'
             convtrans(V::Val{true}) = 'T'
 
-            if transA==Val(false)
-                lda=M
+            if transA == Val(false)
+                lda = M
             else
-                lda=K
+                lda = K
             end
             if transB == Val(false)
-                ldb=K
+                ldb = K
             else
-                ldb=N
+                ldb = N
             end
             ldc = M
             ccall((@blasfunc($(gemm)), libblas), Nothing,
```
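
To illustrate the operation described in the docstring, here is a small sketch using the standard `LinearAlgebra.BLAS.gemm!` wrapper (not NNlib's pointer-based variant above), which performs the same `C = alpha*op(A)*op(B) + beta*C` update:

```julia
using LinearAlgebra

M, K, N = 4, 3, 5
A = randn(M, K)          # op(A) is (M, K) with transA = 'N'
B = randn(K, N)          # op(B) is (K, N) with transB = 'N'
C = zeros(M, N)

# Computes C = 2.0 * A * B + 0.0 * C, updating C in place
LinearAlgebra.BLAS.gemm!('N', 'N', 2.0, A, B, 0.0, C)

C ≈ 2.0 .* (A * B)       # true
```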

src/impl/conv_direct.jl

Lines changed: 30 additions & 15 deletions

```diff
@@ -33,13 +33,20 @@ calculates `y = alpha * x * w + beta * y`, therefore by setting `beta` to a nonz
 value, the user is able to accumulate values into a preallocated `y` buffer, or by
 setting `alpha` to a nonunitary value, an arbitrary gain factor can be applied.
 
+By defaulting `beta` to `false`, we make use of the Bradbury promotion trick to override
+`NaN`'s that may pre-exist within our output buffer, as `false*NaN == 0.0`, whereas
+`0.0*NaN == NaN`. Only set `beta` if you are certain that none of the elements within
+`y` are `NaN`.
+
 The basic implementation performs 3-dimensional convolution; 1-dimensional and 2-
 dimensional casesa are supported by simply reshaping `y`, `x` and `w`, for which
 wrapper methods are available.
 """
-function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5},
+conv_direct!
+
+@timeit_debug to function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5},
                       w::AbstractArray{wT,5}, cdims::DenseConvDims;
-                      alpha::yT = yT(1), beta::yT = yT(0)) where {yT, xT, wT}
+                      alpha::yT = yT(1), beta = false) where {yT, xT, wT}
     check_dims(size(x), size(w), size(y), cdims)
 
     width, height, depth = input_size(cdims)
@@ -50,12 +57,13 @@ function conv_direct!(y::AbstractArray{yT,5}, x::AbstractArray{xT,5},
     stride_w, stride_h, stride_d = stride(cdims)
     out_width, out_height, out_depth = output_size(cdims)
 
-    project(idx, s, p) = (idx - 1)*s - p + 1
-
     # If we're doing crosscorr instead of conv, then don't bother to flip `w`
     if !flipkernel(cdims)
         w = w[end:-1:1, end:-1:1, end:-1:1, :, :]
     end
+
+    # A helper function to project from output (w, h) to input (input_w, input_h)
+    @inline project(idx, stride, pad) = (idx - 1)*stride - pad + 1
 
     # explicit formulation of convolution. Oh hoisting gods, hear my plea.
     @inbounds for batch in 1:size(x)[end],
@@ -94,7 +102,7 @@
         y[w_idx, h_idx, d_idx, c_out, batch] = alpha*convert(yT, dotprod) +
                                                beta*y[w_idx, h_idx, d_idx, c_out, batch]
     end
-    
+
     return y
 end
 
@@ -104,27 +112,34 @@
 
 Calculate the gradient imposed upon `x` in the convolution `y = x * w`.
 """
-function ∇conv_data_direct!(dx::AbstractArray{xT,5}, dy::AbstractArray{yT,5},
+∇conv_data_direct!
+
+@timeit_debug to function ∇conv_data_direct!(dx::AbstractArray{xT,5}, dy::AbstractArray{yT,5},
                             w::AbstractArray{wT,5}, cdims::DenseConvDims;
-                            alpha::xT=xT(1), beta::xT=xT(0)) where {xT, yT, wT}
-    w = transpose_flipbatch(w[end:-1:1, end:-1:1, end:-1:1, :, :])
+                            alpha::xT=xT(1), beta=false) where {xT, yT, wT}
+    w = transpose_swapbatch(w[end:-1:1, end:-1:1, end:-1:1, :, :])
     dy = predilate(dy, stride(cdims))
     ctdims = DenseConvDims(dy, w; padding=transpose_pad(cdims),
-                           dilation=dilation(cdims), flipkernel=flipkernel(cdims))
-    return transpose_flipbatch(conv_direct!(dx, dy, w, ctdims; alpha=alpha, beta=beta))
+                           dilation=dilation(cdims),
+                           flipkernel=flipkernel(cdims))
+    dx = conv_direct!(dx, dy, w, ctdims; alpha=alpha, beta=beta)
+    return transpose_swapbatch(dx)
 end
 
 """
     ∇conv_filter_direct!(dw, x, dy, cdims; alpha=1, beta=0)
 
 Calculate the gradient imposed upon `w` in the convolution `y = x * w`.
 """
-function ∇conv_filter_direct!(dw::AbstractArray{wT,5}, x::AbstractArray{xT,5},
+∇conv_filter_direct!
+
+@timeit_debug to function ∇conv_filter_direct!(dw::AbstractArray{wT,5}, x::AbstractArray{xT,5},
                               dy::AbstractArray{yT,5}, cdims::DenseConvDims;
-                              alpha::wT=wT(1), beta::wT=wT(0)) where {xT, yT, wT}
-    x = transpose_flipbatch(x[end:-1:1, end:-1:1, end:-1:1, :, :])
-    dy = transpose_flipbatch(predilate(dy, stride(cdims)))
-    ctdims = DenseConvDims(dy, x; padding=transpose_pad(cdims), stride=dilation(cdims))
+                              alpha::wT=wT(1), beta=false) where {xT, yT, wT}
+    x = transpose_swapbatch(x[end:-1:1, end:-1:1, end:-1:1, :, :])
+    dy = transpose_swapbatch(predilate(dy, stride(cdims)))
+    ctdims = DenseConvDims(dy, x; padding=transpose_pad(cdims),
+                           stride=dilation(cdims))
     conv_direct!(dw, dy, x, ctdims; alpha=alpha, beta=beta)
     if flipkernel(cdims)
         dw .= dw[end:-1:1, end:-1:1, end:-1:1, :, :]
```
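
A quick sketch of the `false`-as-strong-zero behavior the new docstring relies on; this is plain Julia semantics, nothing NNlib-specific:

```julia
# An uninitialized buffer from `similar()` may contain NaNs; with a numeric
# zero they poison the accumulation, but `false` acts as a strong zero:
0.0 * NaN      # NaN
false * NaN    # 0.0

# Hence `beta = false` lets `y = alpha*x*w + beta*y` ignore whatever garbage
# was already in `y`, while still supporting accumulation when the caller
# passes a numeric `beta`.
```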
