
Commit bc19012

Author: Avik Pal (committed)
Fix merge conflicts with master
2 parents 8ced0c0 + 6b987ee, commit bc19012

File tree

9 files changed: +97, -44 lines


.codecov.yml

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+comment: false

.travis.yml

Lines changed: 9 additions & 2 deletions

@@ -5,6 +5,11 @@ os:
   - osx
 julia:
   - 1.0
+  - 1.1
+  - nightly
+matrix:
+  allow_failures:
+    - julia: nightly
 notifications:
   email: false
 git:
@@ -13,6 +18,8 @@ env:
   # Disable test fuzzing for the moment, as we're a little too slow for Travis
   - NNLIB_TEST_FUZZING=false

-# Submit to Codecov
+# Submit to Codecov
 after_success:
-  - julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())'
+  - if [[ $TRAVIS_JULIA_VERSION = 1.1 ]] && [[ $TRAVIS_OS_NAME = linux ]]; then
+      julia -e 'using Pkg; Pkg.add("Coverage"); using Coverage; Codecov.submit(process_folder())';
+    fi

Project.toml

Lines changed: 6 additions & 1 deletion

@@ -8,5 +8,10 @@ Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
-Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test"]

REQUIRE

Lines changed: 1 addition & 0 deletions

@@ -2,3 +2,4 @@ julia 1.0
 Requires
 MacroTools
 BinaryProvider
+TimerOutputs

src/NNlib.jl

Lines changed: 2 additions & 0 deletions

@@ -1,6 +1,8 @@
 module NNlib
 using Requires, TimerOutputs

+const to = TimerOutput()
+
 # Include APIs
 include("dim_helpers.jl")
 include("activation.jl")

src/activation.jl

Lines changed: 27 additions & 13 deletions

@@ -1,17 +1,17 @@
 export σ, sigmoid, relu, leakyrelu, elu, gelu, swish, selu, softplus, softsign, logσ,
-       logsigmoid
+       logsigmoid, logcosh

 """
     σ(x) = 1 / (1 + exp(-x))

 Classic [sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function) activation
 function.
 """
-σ(x) = one(x) / (one(x) + exp(-x))
+σ(x::Real) = one(x) / (one(x) + exp(-x))
 const sigmoid = σ

 # ForwardDiff numerical stability hack
-σ_stable(x) = ifelse(x < -80, zero(x), one(x) / (one(x) + exp(-x)))
+σ_stable(x::Real) = ifelse(x < -80, zero(x), one(x) / (one(x) + exp(-x)))
 σ(x::Float32) = σ_stable(x)
 @init @require ForwardDiff="f6369f11-7733-5829-9624-2563aa707210" begin
     σ(x::ForwardDiff.Dual{T,Float32}) where T = σ_stable(x)
@@ -27,11 +27,11 @@ Return `log(σ(x))` which is computed in a numerically stable way.
 -0.6931471805599453
 julia> logσ.([-100, -10, 100])
 3-element Array{Float64,1}:
--100.0
--10.000045398899218
+ -100.0
+ -10.000045398899218
  -3.720075976020836e-44
 """
-logσ(x) = -softplus(-x)
+logσ(x::Real) = -softplus(-x)
 const logsigmoid = logσ


@@ -41,7 +41,7 @@ const logsigmoid = logσ
 [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))
 activation function.
 """
-relu(x) = max(zero(x), x)
+relu(x::Real) = max(zero(x), x)


 """
@@ -51,7 +51,7 @@ Leaky [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))
 activation function.
 You can also specify the coefficient explicitly, e.g. `leakyrelu(x, 0.01)`.
 """
-leakyrelu(x, a = oftype(x/1, 0.01)) = max(a*x, x/1)
+leakyrelu(x::Real, a = oftype(x/1, 0.01)) = max(a*x, x/1)


 """
@@ -71,7 +71,7 @@ elu(x, α = one(x)) = ifelse(x ≥ 0, x/1, α * (exp(x) - one(x)))
 [Gaussian Error Linear Unit](https://arxiv.org/pdf/1606.08415.pdf)
 activation function.
 """
-function gelu(x)
+function gelu(x::Real)
     λ = oftype(x/1, √(2/π))
     α = oftype(x/1, 0.044715)
     h = oftype(x/1, 0.5)
@@ -85,7 +85,7 @@ end
 Self-gated activation function.
 See [Swish: a Self-Gated Activation Function](https://arxiv.org/pdf/1710.05941.pdf).
 """
-swish(x) = x * σ(x)
+swish(x::Real) = x * σ(x)

 """
     selu(x) = λ * (x ≥ 0 ? x : α * (exp(x) - 1))
@@ -96,7 +96,7 @@ swish(x) = x * σ(x)
 Scaled exponential linear units.
 See [Self-Normalizing Neural Networks](https://arxiv.org/pdf/1706.02515.pdf).
 """
-function selu(x)
+function selu(x::Real)
     λ = oftype(x/1, 1.0507009873554804934193349852946)
     α = oftype(x/1, 1.6732632423543772848170429916717)
     λ * ifelse(x > 0, x/1, α * (exp(x) - 1))
@@ -108,12 +108,26 @@ end

 See [Quadratic Polynomials Learn Better Image Features](http://www.iro.umontreal.ca/~lisa/publications2/index.php/attachments/single/205).
 """
-softsign(x) = x / (one(x) + abs(x))
+softsign(x::Real) = x / (one(x) + abs(x))


 """
     softplus(x) = log(exp(x) + 1)

 See [Deep Sparse Rectifier Neural Networks](http://proceedings.mlr.press/v15/glorot11a/glorot11a.pdf).
 """
-softplus(x) = ifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x)))
+softplus(x::Real) = ifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x)))
+
+
+"""
+    logcosh(x)
+
+Return `log(cosh(x))` which is computed in a numerically stable way.
+"""
+logcosh(x::T) where T = x + softplus(-2x) - log(convert(T, 2))
+
+# Provide an informative error message if activation functions are called with an array
+for f in (:σ, :σ_stable, :logσ, :relu, :leakyrelu, :elu, :gelu, :swish, :selu, :softsign, :softplus, :logcosh)
+    @eval $(f)(x::AbstractArray, args...) =
+        error("Use broadcasting (`", $(string(f)), ".(x)`) to apply activation functions to arrays.")
+end
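Taken together, the changes to this file boil down to: scalar-only methods (`::Real`), a new numerically stable `logcosh`, and a clear error for array arguments. A quick REPL-style check of that behavior (the values follow directly from the definitions above):

    using NNlib

    logcosh(0.0)             # 0.0
    logcosh(1_000.0)         # finite, equals 1_000.0 - log(2); cosh(1_000.0) itself overflows

    x = randn(Float32, 5)
    relu.(x)                 # broadcasting over arrays still works
    relu(x)                  # now throws: Use broadcasting (`relu.(x)`) to apply activation functions to arrays.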

src/impl/conv_im2col.jl

Lines changed: 18 additions & 12 deletions

@@ -50,10 +50,12 @@ which should eliminate any need for large allocations within this method.
         # We invoke `@timeit_debug` on the outside of `im2col!()` because inference
         # doesn't like us putting it on the inside.
         @timeit_debug to "im2col!" im2col!(col, view(x, :, :, :, :, batch_idx), cdims)
-        col_ptr = pointer(col)
-        w_ptr = pointer(w)
-        y_ptr = pointer(y, (batch_idx - 1)*M*N + 1)
-        @timeit_debug to "gemm!" gemm!(Val(false), Val(false), M, N, K, alpha, col_ptr, w_ptr, beta, y_ptr)
+        GC.@preserve col, w, y, begin
+            col_ptr = pointer(col)
+            w_ptr = pointer(w)
+            y_ptr = pointer(y, (batch_idx - 1)*M*N + 1)
+            @timeit_debug to "gemm!" gemm!(Val(false), Val(false), M, N, K, alpha, col_ptr, w_ptr, beta, y_ptr)
+        end
     end
     return y
 end
@@ -96,10 +98,12 @@ See the documentation for `conv_im2col!()` for explanation of optional parameters.
         # We invoke `@timeit_debug` on the outside of `im2col!()` because inference
         # doesn't like us putting it on the inside.
         @timeit_debug to "im2col!" im2col!(col, view(x, :, :, :, :, batch_idx), cdims)
-        col_ptr = pointer(col)
-        dy_ptr = pointer(dy,(batch_idx - 1)*K*N + 1)
-        dw_ptr = pointer(dw)
-        @timeit_debug to "gemm!" gemm!(Val(true), Val(false), M, N, K, alpha, col_ptr, dy_ptr, beta, dw_ptr)
+        GC.@preserve col, dw, dy, begin
+            col_ptr = pointer(col)
+            dy_ptr = pointer(dy,(batch_idx - 1)*K*N + 1)
+            dw_ptr = pointer(dw)
+            @timeit_debug to "gemm!" gemm!(Val(true), Val(false), M, N, K, alpha, col_ptr, dy_ptr, beta, dw_ptr)
+        end

         # Because we accumulate over batches in this loop, we must set `beta` equal
         # to `1.0` from this point on.
@@ -141,10 +145,12 @@ See the documentation for `conv_im2col!()` for explanation of other parameters.
     K = channels_out(cdims)

     @inbounds for batch_idx in 1:size(dx, 5)
-        dy_ptr = pointer(dy, (batch_idx - 1)*M*K + 1)
-        w_ptr = pointer(w)
-        col_ptr = pointer(col)
-        @timeit_debug to "gemm!" gemm!(Val(false), Val(true), M, N, K, alpha, dy_ptr, w_ptr, T(0), col_ptr)
+        GC.@preserve col, w, dy, begin
+            dy_ptr = pointer(dy, (batch_idx - 1)*M*K + 1)
+            w_ptr = pointer(w)
+            col_ptr = pointer(col)
+            @timeit_debug to "gemm!" gemm!(Val(false), Val(true), M, N, K, alpha, dy_ptr, w_ptr, T(0), col_ptr)
+        end
         @timeit_debug to "col2im!" col2im!(view(dx, :, :, :, :, batch_idx), col, cdims)
     end
     return dx
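The `GC.@preserve` blocks added above guard a general hazard: a `Ptr` obtained from `pointer(A)` does not keep `A` rooted, so without the block the GC is free to reclaim the buffer while `gemm!` is still writing through the raw pointer. A minimal standalone sketch of the pattern (illustrative code, not part of NNlib):

    function add_one!(y::Vector{Float64})
        GC.@preserve y begin      # keep `y` alive while we work through the raw pointer
            y_ptr = pointer(y)
            for i in 1:length(y)
                unsafe_store!(y_ptr, unsafe_load(y_ptr, i) + 1.0, i)
            end
        end
        return y
    end

    add_one!([1.0, 2.0, 3.0])     # returns [2.0, 3.0, 4.0]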

src/impl/depthwiseconv_im2col.jl

Lines changed: 20 additions & 15 deletions

@@ -35,10 +35,12 @@ depthwiseconv_im2col!
         # We do a separate convolution for each channel in x, as we must
         for c_in in 1:channels_in(cdims)
             # Walk each pointer forward as we process each input channel
-            col_ptr = pointer(col, (c_in-1)*M*K+1)
-            w_ptr = pointer(w, (c_in-1)*K*N+1)
-            y_ptr = pointer(y, ((batch_idx - 1)*channels_in(cdims) + c_in - 1)*M*N + 1)
-            gemm!(Val(false), Val(false), M, N, K, alpha, col_ptr, w_ptr, beta, y_ptr)
+            GC.@preserve col, w, y, begin
+                col_ptr = pointer(col, (c_in-1)*M*K+1)
+                w_ptr = pointer(w, (c_in-1)*K*N+1)
+                y_ptr = pointer(y, ((batch_idx - 1)*channels_in(cdims) + c_in - 1)*M*N + 1)
+                gemm!(Val(false), Val(false), M, N, K, alpha, col_ptr, w_ptr, beta, y_ptr)
+            end
         end
     end
     return y
@@ -71,11 +73,12 @@ See the documentation for `conv_im2col!()` for explanation of optional parameters.
         # We do a separate convolution for each channel in x, as we must
         for c_in in 1:channels_in(cdims)
             # Walk each pointer forward as we process each input channel
-            col_ptr = pointer(col, (c_in - 1)*M*K + 1)
-            dy_ptr = pointer(dy, (batch_idx - 1)*N*K*channels_in(cdims) + (c_in - 1)*K*N + 1)
-            dw_ptr = pointer(dw, (c_in - 1)*M*N + 1)
-
-            gemm!(Val(true), Val(false), M, N, K, alpha, col_ptr, dy_ptr, beta, dw_ptr)
+            GC.@preserve col, dw, dy, begin
+                col_ptr = pointer(col, (c_in - 1)*M*K + 1)
+                dy_ptr = pointer(dy, (batch_idx - 1)*N*K*channels_in(cdims) + (c_in - 1)*K*N + 1)
+                dw_ptr = pointer(dw, (c_in - 1)*M*N + 1)
+                gemm!(Val(true), Val(false), M, N, K, alpha, col_ptr, dy_ptr, beta, dw_ptr)
+            end
         end

         # Because we accumulate over batches in this loop, we must set `beta` equal
@@ -107,13 +110,15 @@ See the documentation for `conv_im2col!()` for explanation of optional parameters.
     @inbounds for batch_idx in 1:size(dx)[end]
         # We do a separate convolution for each channel in x, as we must
         for cidx in 1:channels_in(cdims)
-            # Walk each pointer forward as we process each input channel
-            dy_ptr = pointer(dy, (batch_idx - 1)*M*K*channels_in(cdims)+(cidx - 1)*K*M + 1)
-            w_ptr = pointer(w, (cidx - 1)*K*N + 1)
-            col_ptr = pointer(col, (cidx - 1)*M*N + 1)
-            gemm!(Val(false), Val(true), M, N, K, alpha, dy_ptr, w_ptr, T(0), col_ptr)
+            GC.@preserve col, w, dy, begin
+                # Walk each pointer forward as we process each input channel
+                dy_ptr = pointer(dy, (batch_idx - 1)*M*K*channels_in(cdims)+(cidx - 1)*K*M + 1)
+                w_ptr = pointer(w, (cidx - 1)*K*N + 1)
+                col_ptr = pointer(col, (cidx - 1)*M*N + 1)
+                gemm!(Val(false), Val(true), M, N, K, alpha, dy_ptr, w_ptr, T(0), col_ptr)
+            end
         end
         @timeit_debug to "col2im!" col2im!(view(dx, :, :, :, :, batch_idx), col, cdims)
     end
     return dx
-end
+end

test/activation.jl

Lines changed: 13 additions & 1 deletion

@@ -1,6 +1,6 @@
 using NNlib, Test

-ACTIVATION_FUNCTIONS = [σ, relu, leakyrelu, elu, gelu, swish, selu, softplus, softsign];
+ACTIVATION_FUNCTIONS = [σ, relu, leakyrelu, elu, gelu, swish, selu, softplus, softsign, logcosh];

 function test_value_float_precision_preserving(a)
     @testset "$(a): " begin
@@ -36,6 +36,7 @@ end
     @test softplus(-1e8) ≈ 0.0
     @test softsign(0.0) == 0.0
     @test selu(0.0) == 0.0
+    @test logcosh(0.0) == log(cosh(0.0))

     @test σ(1.0) == 1.0 / (1.0 + exp(-1.0))
     @test relu(1.0) == 1.0
@@ -46,6 +47,7 @@ end
     @test softplus(1.0) ≈ log(exp(1.0) + 1.0)
     @test softsign(1.0) == 0.5
     @test selu(1.0) == 1.0507009873554804934193349852946
+    @test logcosh(1.0) ≈ log(cosh(1.0))

     @test σ(-1.0) == 1.0 / (1.0 + exp(1.0))
     @test relu(-1.0) == 0.0
@@ -56,11 +58,19 @@ end
     @test softplus(-1.0) ≈ log(exp(-1.0) + 1.0)
     @test softsign(-1.0) == -0.5
     @test selu(-1.0) == 1.0507009873554804934193349852946 * 1.6732632423543772848170429916717 * (exp(-1.0) - 1.0)
+    @test logcosh(-1.0) ≈ log(cosh(-1.0))

     @testset "Float inference" begin
         test_value_float_precision_preserving.(ACTIVATION_FUNCTIONS)
     end

+    @testset "Array input" begin
+        x = rand(5)
+        for a in ACTIVATION_FUNCTIONS
+            @test_throws ErrorException a(x)
+        end
+    end
+
     @testset "Test Integer64 and Integer32 inputs will force Float64 outputs" begin
         test_value_int_input_forces_float64.(filter(x -> x != relu, ACTIVATION_FUNCTIONS))

@@ -125,4 +135,6 @@ end
             @eval @test logsigmoid.($T[-100_000, 100_000.]) ≈ $T[-100_000, 0.]
         end
     end
+
+    @test logcosh(1_000.0) + log(2) == 1_000.0
 end
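The final assertion leans on the softplus-based definition of `logcosh`; the naive `log(cosh(x))` cannot even be evaluated at that magnitude because `cosh` overflows `Float64`. A REPL check of exactly what the test asserts:

    julia> cosh(1_000.0)                          # overflows
    Inf

    julia> logcosh(1_000.0) + log(2) == 1_000.0   # the identity the test relies on
    true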
