diff --git a/src/activation.jl b/src/activation.jl
index 0f95824c1..a075335a6 100644
--- a/src/activation.jl
+++ b/src/activation.jl
@@ -7,10 +7,10 @@ export σ, sigmoid, hardσ, hardsigmoid, hardtanh, relu, leakyrelu, relu6, rrelu
 # https://github.com/JuliaGPU/CuArrays.jl/issues/614

 """
-    σ(x) = 1 / (1 + exp(-x))
+    σ(x)

 Classic [sigmoid](https://en.wikipedia.org/wiki/Sigmoid_function) activation
-function.
+function. Return `1 / (1 + exp(-x))`.
 """
 σ(x::Real) = one(x) / (one(x) + exp(-x))
 const sigmoid = σ
@@ -23,15 +23,14 @@ const sigmoid = σ
 end

 """
-    hardσ(x, a=0.2) = max(0, min(1.0, a * x + 0.5))
+    hardσ(x, a=0.2)

-Segment-wise linear approximation of sigmoid.
+Segment-wise linear approximation of sigmoid. Return `max(0, min(1.0, a * x + 0.5))`.
 See [BinaryConnect: Training Deep Neural Networks withbinary weights during propagations](https://arxiv.org/pdf/1511.00363.pdf).
 """
-hardσ(x::Real, a=0.2) = oftype(x/1, max(zero(x/1), min(one(x/1), oftype(x/1,a) * x + oftype(x/1,0.5))))
+hardσ(x::Real, a=0.2) = oftype(x / 1, max(zero(x / 1), min(one(x / 1), oftype(x / 1, a) * x + oftype(x / 1, 0.5))))
 const hardsigmoid = hardσ

-
 """
     logσ(x)

@@ -48,50 +47,46 @@ Return `log(σ(x))` which is computed in a numerically stable way.
 logσ(x::Real) = -softplus(-x)
 const logsigmoid = logσ

-
 """
-    hardtanh(x) = max(-1, min(1, x))
+    hardtanh(x)

-Segment-wise linear approximation of tanh. Cheaper and more computational efficient version of tanh.
+Segment-wise linear approximation of tanh. Return `max(-1, min(1, x))`.
+Cheaper and more computationally efficient version of tanh.
 See [Large Scale Machine Learning](http://ronan.collobert.org/pub/matos/2004_phdthesis_lip6.pdf).
 """
 hardtanh(x::Real) = max(-one(x), min( one(x), x))

-
 """
-    relu(x) = max(0, x)
+    relu(x)

 [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))
-activation function.
+activation function. Return `max(0, x)`.
 """
 relu(x::Real) = max(zero(x), x)

-
 """
-    leakyrelu(x, a=0.01) = max(a*x, x)
+    leakyrelu(x, a=0.01)

 Leaky [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))
-activation function.
+activation function. Return `max(a*x, x)`.
 You can also specify the coefficient explicitly, e.g. `leakyrelu(x, 0.01)`.
 """
-leakyrelu(x::Real, a = oftype(x / 1, 0.01)) = max(a * x, x / one(x))
+leakyrelu(x::Real, a=0.01) = max(oftype(x / 1, a) * x, x / 1)

 """
-    relu6(x) = min(max(0, x), 6)
+    relu6(x)

 [Rectified Linear Unit](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))
-activation function capped at 6.
+activation function capped at 6. Return `min(max(0, x), 6)`.
 See [Convolutional Deep Belief Networks on CIFAR-10](http://www.cs.utoronto.ca/%7Ekriz/conv-cifar10-aug2010.pdf)
 """
 relu6(x::Real) = min(relu(x), oftype(x, 6))

 """
-    rrelu(x, l=1/8, u=1/3) = max(a*x, x)
-
-    a = randomly sampled from uniform distribution U(l, u)
+    rrelu(x, l=1/8, u=1/3)

 Randomized Leaky [Rectified Linear Unit](https://arxiv.org/pdf/1505.00853.pdf)
-activation function.
+activation function. Return `max(a*x, x)` where `a` is randomly sampled from the uniform distribution U(l, u).
 You can also specify the bound explicitly, e.g. `rrelu(x, 0.0, 1.0)`.
 """
 function rrelu(x::Real, l::Real = 1 / 8.0, u::Real = 1 / 3.0)
@@ -100,21 +95,19 @@ function rrelu(x::Real, l::Real = 1 / 8.0, u::Real = 1 / 3.0)
 end

 """
-    elu(x, α=1) =
-      x > 0 ? x : α * (exp(x) - 1)
+    elu(x, α=1)

-Exponential Linear Unit activation function.
+Exponential Linear Unit activation function. Return `x > 0 ? x : α * (exp(x) - 1)`.
 See [Fast and Accurate Deep Network Learning by Exponential Linear Units](https://arxiv.org/abs/1511.07289).
 You can also specify the coefficient explicitly, e.g. `elu(x, 1)`.
 """
-elu(x::Real, α = one(x)) = ifelse(x ≥ 0, x / one(x), α * (exp(x) - one(x)))
-
+elu(x::Real, α=one(x)) = ifelse(x ≥ 0, x / 1, α * (exp(x) - one(x)))

 """
-    gelu(x) = 0.5x * (1 + tanh(√(2/π) * (x + 0.044715x^3)))
+    gelu(x)

 [Gaussian Error Linear Unit](https://arxiv.org/pdf/1606.08415.pdf)
-activation function.
+activation function. Return `0.5x * (1 + tanh(√(2/π) * (x + 0.044715x^3)))`.
 """
 function gelu(x::Real)
     p = oftype(x / 1, π)
@@ -124,106 +117,100 @@ function gelu(x::Real)
     h * x * (one(x) + tanh(λ * (x + α * x^3)))
 end

-
 """
-    swish(x) = x * σ(x)
+    swish(x)

-Self-gated activation function.
+Self-gated activation function. Return `x * σ(x)`.
 See [Swish: a Self-Gated Activation Function](https://arxiv.org/pdf/1710.05941.pdf).
 """
 swish(x::Real) = x * σ(x)

-
 """
-    lisht(x) = x * tanh(x)
+    lisht(x)

-Non-Parametric Linearly Scaled Hyperbolic Tangent Activation Function.
+Non-Parametric Linearly Scaled Hyperbolic Tangent Activation Function. Return `x * tanh(x)`.
 See [LiSHT](https://arxiv.org/abs/1901.05894)
 """
 lisht(x::Real) = x * tanh(x)

-
 """
-    selu(x) = λ * (x ≥ 0 ? x : α * (exp(x) - 1))
-
+    selu(x)
+
     λ ≈ 1.0507
     α ≈ 1.6733

-Scaled exponential linear units.
+Scaled exponential linear units. Return `λ * (x ≥ 0 ? x : α * (exp(x) - 1))`.
 See [Self-Normalizing Neural Networks](https://arxiv.org/pdf/1706.02515.pdf).
 """
 function selu(x::Real)
     λ = oftype(x / 1, 1.0507009873554804934193349852946)
     α = oftype(x / 1, 1.6732632423543772848170429916717)
-    λ * ifelse(x > 0, x / one(x), α * (exp(x) - one(x)))
+    λ * ifelse(x > 0, x / 1, α * (exp(x) - one(x)))
 end

 """
-    celu(x, α=1) =
-        (x ≥ 0 ? x : α * (exp(x/α) - 1))
+    celu(x, α=1)

-Continuously Differentiable Exponential Linear Units
+Continuously Differentiable Exponential Linear Unit activation function. Return `(x ≥ 0 ? x : α * (exp(x/α) - 1))`.
 See [Continuously Differentiable Exponential Linear Units](https://arxiv.org/pdf/1704.07483.pdf).
 """
-celu(x::Real, α::Real = one(x)) = ifelse(x ≥ 0, x / one(x), α * (exp(x/α) - one(x)))
-
+celu(x::Real, α::Real=one(x)) = ifelse(x ≥ 0, x / 1, α * (exp(x/α) - one(x)))

 """
-    trelu(x, theta = 1.0) = x > theta ? x : 0
+    trelu(x, θ=1.0)

-Threshold Gated Rectified Linear.
+Threshold Gated Rectified Linear. Return `x > θ ? x : 0`.
 See [ThresholdRelu](https://arxiv.org/pdf/1402.3337.pdf)
 """
-trelu(x::Real,theta = one(x)) = ifelse(x> theta, x, zero(x))
+trelu(x::Real, θ=one(x)) = ifelse(x > θ, x, zero(x))
 const thresholdrelu = trelu

-
 """
-    softsign(x) = x / (1 + |x|)
+    softsign(x)

+Return `x / (1 + |x|)`.
 See [Quadratic Polynomials Learn Better Image Features](http://www.iro.umontreal.ca/~lisa/publications2/index.php/attachments/single/205).
 """
 softsign(x::Real) = x / (one(x) + abs(x))

-
 """
-    softplus(x) = log(exp(x) + 1)
+    softplus(x)

+Return `log(exp(x) + 1)`.
 See [Deep Sparse Rectifier Neural Networks](http://proceedings.mlr.press/v15/glorot11a/glorot11a.pdf).
 """
 softplus(x::Real) = ifelse(x > 0, x + log1p(exp(-x)), log1p(exp(x)))

-
 """
     logcosh(x)

-Return `log(cosh(x))` which is computed in a numerically stable way.
+Return `log(cosh(x))`, computed in a numerically stable way as `x + softplus(-2x) - log(2)`.
 """
 logcosh(x::Real) = x + softplus(-2x) - log(oftype(x, 2))

-
 """
-    mish(x) = x * tanh(softplus(x))
+    mish(x)

-Self Regularized Non-Monotonic Neural Activation Function.
+Self Regularized Non-Monotonic Neural Activation Function. Return `x * tanh(softplus(x))`.
 See [Mish: A Self Regularized Non-Monotonic Neural Activation Function](https://arxiv.org/abs/1908.08681).
 """
 mish(x::Real) = x * tanh(softplus(x))

 """
-    tanhshrink(x) = x - tanh(x)
+    tanhshrink(x)

+Return `x - tanh(x)`.
 See [Tanhshrink Activation Function](https://www.gabormelli.com/RKB/Tanhshrink_Activation_Function).
 """
 tanhshrink(x::Real) = x - tanh(x)

 """
-    softshrink(x, λ=0.5) =
-        (x ≥ λ ? x - λ : (-λ ≥ x ? x + λ : 0))
+    softshrink(x, λ=0.5)

+Return `(x ≥ λ ? x - λ : (-λ ≥ x ? x + λ : 0))`.
 See [Softshrink Activation Function](https://www.gabormelli.com/RKB/Softshrink_Activation_Function).
 """
-softshrink(x::Real, λ = oftype(x/1, 0.5)) = min(max(zero(x), x - λ), x + λ)
+softshrink(x::Real, λ = oftype(x / 1, 0.5)) = min(max(zero(x), x - λ), x + λ)

 # Provide an informative error message if activation functions are called with an array
 for f in (:σ, :σ_stable, :hardσ, :logσ, :hardtanh, :relu, :leakyrelu, :relu6, :rrelu, :elu, :gelu, :swish,
           :lisht, :selu, :celu, :trelu, :softsign, :softplus, :logcosh, :mish, :tanhshrink, :softshrink)
diff --git a/src/conv.jl b/src/conv.jl
index 3a5d83d56..880ed5e6c 100644
--- a/src/conv.jl
+++ b/src/conv.jl
@@ -29,7 +29,7 @@ export conv, conv!, ∇conv_data, ∇conv_data!, ∇conv_filter, ∇conv_filter!

 # First, we will define mappings from the generic API names to our accelerated backend
-# implementations. For homogeneous-datatype 1, 2 and 3d convolutions, we default to using
+# implementations. For homogeneous-datatype 1d, 2d and 3d convolutions, we default to using
 # im2col + GEMM. Do so in a loop, here:
 for (front_name, backend) in (
     # This maps from public, front-facing name, to internal backend name
@@ -86,7 +86,7 @@ end

 # We always support a fallback, non-accelerated path, where we use the direct, but
 # slow, implementations. These should not typically be used, hence the `@debug`,
-# but let's ggo ahead and define them first:
+# but let's go ahead and define them first:
 for front_name in (:conv, :∇conv_data, :∇conv_filter,
                    :depthwiseconv, :∇depthwiseconv_data, :∇depthwiseconv_filter)
     @eval begin
@@ -179,8 +179,6 @@ function conv(x, w::AbstractArray{T, N}; stride=1, pad=0, dilation=1, flipped=fa
 end

-
-
 """
     depthwiseconv(x, w; stride=1, pad=0, dilation=1, flipped=false)

diff --git a/src/impl/conv_direct.jl b/src/impl/conv_direct.jl
index 2e2dada2f..617d69103 100644
--- a/src/impl/conv_direct.jl
+++ b/src/impl/conv_direct.jl
@@ -30,9 +30,9 @@ kernel, storing the result in a `Float32` output, there is at least a function c
 for that madness.

 The keyword arguments `alpha` and `beta` control accumulation behavior; this function
-calculates `y = alpha * x * w + beta * y`, therefore by setting `beta` to a nonzero
-value, the user is able to accumulate values into a preallocated `y` buffer, or by
-setting `alpha` to a nonunitary value, an arbitrary gain factor can be applied.
+calculates `y = alpha * x * w + beta * y`, therefore by setting `beta` to a non-zero
+value, the user is able to accumulate values into a pre-allocated `y` buffer, or by
+setting `alpha` to a non-unitary value, an arbitrary gain factor can be applied.

 By defaulting `beta` to `false`, we make use of the Bradbury promotion trick to override
 `NaN`'s that may pre-exist within our output buffer, as `false*NaN == 0.0`, whereas
diff --git a/src/impl/conv_im2col.jl b/src/impl/conv_im2col.jl
index e06231325..eb8f36ad5 100644
--- a/src/impl/conv_im2col.jl
+++ b/src/impl/conv_im2col.jl
@@ -16,8 +16,8 @@ end

 Perform a convolution using im2col and GEMM, store the result in `y`.
 The kwargs `alpha` and `beta` control accumulation behavior; internally this operation is
 implemented as a matrix multiply that boils down to `y = alpha * x * w + beta * y`, thus
-by setting `beta` to a nonzero value, multiple results can be accumulated into `y`, or
-by setting `alpha` to a nonunitary value, various gain factors can be applied.
+by setting `beta` to a non-zero value, multiple results can be accumulated into `y`, or
+by setting `alpha` to a non-unitary value, various gain factors can be applied.

 Note for the particularly performance-minded, you can provide a pre-allocated `col`,
 which should eliminate any need for large allocations within this method.
@@ -39,7 +39,7 @@ function conv_im2col!(
     # In english, we're grabbing each input patch and laying them out along
     # the M dimension in `col`, so that the GEMM call below multiplies each
     # kernel (which is kernel_h * kernel_w * channels_in elments long) is
-    # dotproducted with that input patch, effectively computing a convolution
+    # dot-producted with that input patch, effectively computing a convolution
     # in a somewhat memory-wasteful but easily-computed way (since we already
     # have an extremely highly-optimized GEMM call available in BLAS).
     M = prod(output_size(cdims))
@@ -162,9 +162,6 @@ function ∇conv_data_im2col!(
 end

-
-
-
 """
     im2col!(col, x, cdims)

@@ -233,7 +230,7 @@ function im2col!(col::AbstractArray{T,2}, x::AbstractArray{T,4},
         end
     end

-
+
     # For each "padded region", we run the fully general version
     @inbounds for (w_region, h_region, d_region) in padded_regions
         for c in 1:C_in,
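A note on the `oftype(x / 1, ...)` pattern that the rewritten activation definitions lean on (e.g. `hardσ`, `leakyrelu`, `gelu`, `selu`): dividing by `1` promotes integer inputs to floating point while leaving `Float32`/`Float64` inputs at their own precision, so literal constants such as the leak coefficient are converted to the working element type. A quick REPL sketch of the expected behaviour of the new `leakyrelu` (assuming NNlib with this patch is loaded):

```julia
julia> leakyrelu(-2, 0.01)      # Int input: x / 1 promotes the result to Float64
-0.02

julia> leakyrelu(-2.0f0, 0.01)  # Float32 input stays Float32; 0.01 is converted via oftype
-0.02f0
```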
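To make the `y = alpha * x * w + beta * y` accumulation rule in the `conv_direct!`/`conv_im2col!` docstrings concrete, here is a minimal sketch using plain matrices as a stand-in for the convolution backends; it also shows why defaulting `beta` to `false` (the Bradbury promotion trick mentioned in conv_direct.jl) is safer than `0.0` for an uninitialized output buffer:

```julia
x = ones(2, 2)
w = ones(2, 2)
y = fill(NaN, 2, 2)                   # pre-allocated output buffer, may hold NaNs

alpha, beta = 1.0, false
y .= alpha .* (x * w) .+ beta .* y    # false is a "strong zero": false * NaN == 0.0,
                                      # so y is now fill(2.0, 2, 2); with beta = 0.0
                                      # instead, 0.0 * NaN == NaN would poison the result

y .= 2.0 .* (x * w) .+ true .* y      # non-unitary alpha applies a gain; beta = true
                                      # accumulates into the existing y, giving 6.0
```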
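Since the conv_im2col.jl comments above describe the patch-layout idea only in prose, here is a self-contained sketch of im2col + GEMM for the simplest case: a single-channel, unpadded, stride-1, 2-d kernel with no flipping. The helper `im2col_conv` and its row-per-patch layout are illustrative assumptions, not NNlib's actual `im2col!`/`conv_im2col!` API, which handles 3-d kernels, channels, batching, padding, stride and dilation.

```julia
# Illustrative sketch: each input patch becomes one row of `col`, and a single
# matrix-vector product (the "GEMM") dot-products every patch with the kernel.
function im2col_conv(x::Matrix{T}, w::Matrix{T}) where {T}
    kh, kw = size(w)
    oh, ow = size(x, 1) - kh + 1, size(x, 2) - kw + 1
    col = Matrix{T}(undef, oh * ow, kh * kw)
    for j in 1:ow, i in 1:oh
        patch = @view x[i:i+kh-1, j:j+kw-1]
        col[(j - 1) * oh + i, :] = vec(patch)    # lay the patch out along one row
    end
    y = col * vec(w)                             # one dot product per output pixel
    return reshape(y, oh, ow)                    # memory-wasteful, but BLAS does the heavy lifting
end

x = Float64.(reshape(1:16, 4, 4))
w = ones(2, 2)
im2col_conv(x, w)   # 3×3 output; each entry is the sum over a 2×2 patch
```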