
Commit e61a963: Resolve conflicts
2 parents 6a0032b + 57beb23

11 files changed: +322 additions, -203 deletions

docs/make.jl

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@ using Documenter, Flux, NNlib, Functors, MLUtils, BSON
 
 DocMeta.setdocmeta!(Flux, :DocTestSetup, :(using Flux); recursive = true)
 makedocs(modules = [Flux, NNlib, Functors, MLUtils, BSON],
-         doctest = VERSION == v"1.5",
+         doctest = false,
          sitename = "Flux",
          pages = ["Home" => "index.md",
                   "Building Models" =>

docs/src/utilities.md

Lines changed: 31 additions & 10 deletions
@@ -3,37 +3,58 @@
 Flux provides utility functions which can be used to initialize your layers
 or to regularly execute callback functions.
 
-## Layer Initialization
+## Layer Initialisation
 
-These are primarily useful if you are planning to write your own layers.
-Flux initializes convolutional layers and recurrent cells with `glorot_uniform`
-by default.
-To change the default on an applicable layer, pass the desired function with the
-`init` keyword. For example:
+Flux initialises convolutional layers and recurrent cells with `glorot_uniform` by default.
+Most layers accept a function as an `init` keyword, which replaces this default. For example:
 
 ```jldoctest; setup = :(using Flux)
-julia> conv = Conv((3, 3), 1 => 8, relu; init=Flux.glorot_normal)
-Conv((3, 3), 1 => 8, relu)  # 80 parameters
+julia> conv = Conv((3, 3), 3 => 2, relu; init=Flux.glorot_normal)
+Conv((3, 3), 3 => 2, relu)  # 56 parameters
+
+julia> conv.bias
+2-element Vector{Float32}:
+ 0.0
+ 0.0
+```
+
+Note that `init` creates the weight array, but not the bias vector.
+
+Many of the initialisation functions accept keywords such as `gain`,
+and a random number generator. To make it easy to pass these to layers,
+there are methods which return a function:
+
+```jldoctest; setup = :(using Flux, Random)
+julia> Dense(4 => 5, tanh; init=Flux.glorot_uniform(gain=2))
+Dense(4 => 5, tanh)  # 25 parameters
+
+julia> Dense(4 => 5, tanh; init=Flux.randn32(MersenneTwister(1)))
+Dense(4 => 5, tanh)  # 25 parameters
 ```
 
 ```@docs
 Flux.glorot_uniform
 Flux.glorot_normal
 Flux.kaiming_uniform
 Flux.kaiming_normal
+Flux.truncated_normal
 Flux.orthogonal
 Flux.sparse_init
+Flux.identity_init
+Flux.ones32
+Flux.rand32
 ```
 
 ## Changing the type of model parameters
 
+The default `eltype` for models is `Float32` since models are often trained/run on GPUs.
+The `eltype` of model `m` can be changed to `Float64` by `f64(m)`:
+
 ```@docs
 Flux.f64
 Flux.f32
 ```
 
-The default `eltype` for models is `Float32` since models are often trained/run on GPUs. The `eltype` of model `m` can be changed to `Float64` by `f64(m)`, or to `Float32` by `f32(m)`.
-
 ## Model Building
 
 Flux provides some utility functions to help you generate models in an automated fashion.

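The new doc text above says that initialisers called with only keyword arguments (or an RNG) return a function. A minimal sketch of that mechanism, written for illustration rather than taken from the diff; the dimensions here are arbitrary:

```julia
using Flux

# Calling glorot_uniform with only a keyword returns a function of the array
# dimensions; layers later call it to build their weight arrays.
init = Flux.glorot_uniform(gain = 2)

W = init(5, 4)      # 5×4 Float32 matrix, as a Conv/Dense layer would create internally
size(W), eltype(W)  # ((5, 4), Float32)
```
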
perf/bench_utils.jl

Lines changed: 5 additions & 4 deletions
@@ -1,13 +1,14 @@
 using BenchmarkTools
 using Flux
 using CUDA
-using Zygote: pullback
+using Zygote: pullback, ignore
 
 
 fw(m, x) = m(x)
 bw(back) = back(1f0)
-fwbw(m, ps, x) = gradient(() -> sum(m(x)), ps)
-
+fwbw(m, ps, x) = gradient(() -> sum(fw(m, x)), ps)
+pb(m, ps, x) = pullback(() -> sum(fw(m, x)), ps)
+
 function run_benchmark(model, x; cuda=true)
 
   if cuda
@@ -16,7 +17,7 @@ function run_benchmark(model, x; cuda=true)
   end
 
   ps = Flux.params(model)
-  y, back = pullback(() -> sum(model(x)), ps)
+  y, back = pb(model, ps, x)
 
 
   if cuda

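For readers unfamiliar with the `pullback` pattern that `pb` wraps above, here is a self-contained sketch using a toy `Dense` model (my own example, not the benchmark's actual workloads): the forward call returns the loss plus a closure, and seeding that closure with `1f0`, exactly as `bw` does, yields the gradients.

```julia
using Flux
using Zygote: pullback

m  = Dense(10 => 5)
x  = randn(Float32, 10, 8)
ps = Flux.params(m)

# Forward pass reduced to a scalar, plus the closure for the reverse pass.
loss, back = pullback(() -> sum(m(x)), ps)

gs = back(1f0)   # same seeding as bw(back) in the benchmark helpers
gs[m.weight]     # gradient array, same shape as m.weight
```
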
perf/recurrent.jl

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
+
+
+struct RNNWrapper{T}
+  rnn::T
+end
+Flux.@functor RNNWrapper
+
+# Need to specialize for RNNWrapper.
+fw(r::RNNWrapper, X::Vector{<:AbstractArray}) = begin
+  Flux.reset!(r.rnn)
+  [r.rnn(x) for x in X]
+end
+
+fw(r::RNNWrapper, X) = begin
+  Flux.reset!(r.rnn)
+  r.rnn(X)
+end
+
+fwbw(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = gradient(ps) do
+  y = fw(r, X)
+  sum(sum(y))
+end
+
+pb(r::RNNWrapper, ps, X::Vector{<:AbstractArray}) = pullback(ps) do
+  y = fw(r, X)
+  sum(sum(y))
+end
+
+function rnn_benchmark_sweep(data_creator::Function, rnn_type)
+  for n in [2, 20, 200, 1000], ts in [1, 4, 16, 64]
+    x, x_n = data_creator(n, ts)
+    model = RNNWrapper(rnn_type(n, n))
+
+    println("$rnn_type $x_n CPU n=$n, ts=$ts")
+    run_benchmark(model, x, cuda=false)
+
+    println("$rnn_type $x_n CUDA n=$n, ts=$ts")
+    try
+      run_benchmark(model, x, cuda=true)
+    catch ex
+      @show typeof(ex)
+      if ex isa OutOfGPUMemoryError
+        @warn "Not enough GPU memory to run test"
+      else
+        rethrow(ex)
+      end
+    end
+  end
+end
+
+for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM]
+  rnn_benchmark_sweep(rnn_type) do n, ts
+    [randn(Float32, n, n) for _ in 1:ts], "Vec"
+  end
+end
+
+for rnn_type in [Flux.RNN, Flux.GRU, Flux.LSTM]
+  rnn_benchmark_sweep(rnn_type) do n, ts
+    randn(Float32, n, n, ts), "Block"
+  end
+end
+

perf/runbenchmarks.jl

Lines changed: 3 additions & 0 deletions
@@ -11,3 +11,6 @@ include("conv.jl")
 
 @info "Benchmark VGG"
 include("vgg.jl")
+
+@info "Benchmark Recurrent"
+include("recurrent.jl")

src/functor.jl

Lines changed: 4 additions & 2 deletions
@@ -213,14 +213,16 @@ paramtype(T::Type{<:Real}, m) = fmap(x -> adapt(T, x), m)
 """
     f32(m)
 
-Convert the `eltype` of model's parameters to `Float32`.
+Converts the `eltype` of model's parameters to `Float32` (which is Flux's default).
+Recurses into structs marked with [`@functor`](@ref).
 """
 f32(m) = paramtype(Float32, m)
 
 """
     f64(m)
 
-Convert the `eltype` of model's parameters to `Float64`.
+Converts the `eltype` of model's parameters to `Float64`.
+Recurses into structs marked with [`@functor`](@ref).
 """
 f64(m) = paramtype(Float64, m)
 

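As a quick illustration of the updated docstrings (my own sketch, not part of the diff), `f64` and `f32` recurse through `@functor`-marked structs such as `Chain`, converting every parameter array they reach:

```julia
using Flux

m = Chain(Dense(2 => 3, relu), Dense(3 => 1))
eltype(m[1].weight)    # Float32, Flux's default

m64 = f64(m)           # recurses into the Chain and both Dense layers
eltype(m64[1].weight)  # Float64

m32 = f32(m64)         # converts back to the Float32 default
eltype(m32[1].weight)  # Float32
```
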
src/layers/normalise.jl

Lines changed: 6 additions & 7 deletions
@@ -164,22 +164,21 @@ struct LayerNorm{F,D,T,N}
   affine::Bool
 end
 
-function LayerNorm(sz, λ=identity; affine=true, ϵ=1f-5)
-  sz = sz isa Integer ? (sz,) : sz
-  diag = affine ? Diagonal(sz...) : nothing
-  return LayerNorm(λ, diag, ϵ, sz, affine)
+function LayerNorm(sz, λ=identity; affine::Bool=true, ϵ::Real=1f-5)
+  diag = affine ? Diagonal(sz...) : identity
+  return LayerNorm(λ, diag, ϵ, Tuple(sz), affine)
 end
 
 @functor LayerNorm
 
 function (a::LayerNorm)(x)
-  x = normalise(x, dims=1:length(a.size), ϵ=a.ϵ)
-  a.diag === nothing ? a.λ.(x) : a.λ.(a.diag(x))
+  x = a.diag(normalise(x, dims=1:length(a.size), ϵ=a.ϵ))
+  return a.λ === identity ? x : a.λ.(x)
 end
 
 function Base.show(io::IO, l::LayerNorm)
   print(io, "LayerNorm($(l.size)")
-  l.λ == identity || print(io, ", $(l.λ)")
+  l.λ === identity || print(io, ", ", l.λ)
   hasaffine(l) || print(io, ", affine=false")
   print(io, ")")
 end

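A small usage sketch of the rewritten forward pass (my addition, assuming the post-change behaviour): `normalise` runs first, then the stored `diag`, which starts out as an identity-like scale-and-shift, then the activation.

```julia
using Flux, Statistics

ln = LayerNorm(5)           # affine=true by default; scale starts at 1, bias at 0
x  = randn(Float32, 5, 16)  # 16 samples with 5 features each

y = ln(x)

# At initialisation the affine part changes nothing, so each column comes out
# with roughly zero mean and unit standard deviation.
mean(y, dims = 1)                    # ≈ 0 for every column
std(y, dims = 1, corrected = false)  # ≈ 1 for every column
```
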
src/layers/stateless.jl

Lines changed: 3 additions & 4 deletions
@@ -33,9 +33,8 @@ Normalise `x` to mean 0 and standard deviation 1 across the dimension(s) given b
 Per default, `dims` is the last dimension.
 `ϵ` is a small additive factor added to the denominator for numerical stability.
 """
-function normalise(x::AbstractArray; dims=ndims(x), ϵ=ofeltype(x, 1e-5))
+@inline function normalise(x::AbstractArray; dims=ndims(x), ϵ=ofeltype(x, 1e-5))
   μ = mean(x, dims=dims)
-  # σ = std(x, dims=dims, mean=μ, corrected=false) # use this when Zygote#478 gets merged
-  σ = std(x, dims=dims, corrected=false)
-  return (x .- μ) ./ (σ .+ ϵ)
+  σ = std(x, dims=dims, mean=μ, corrected=false)
+  return @. (x - μ) / (σ + ϵ)
 end

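To make the reconstructed return line concrete, here is a standalone sketch (mirroring the formula rather than calling Flux's internal `normalise`) of the computation over the chosen `dims`:

```julia
using Statistics

x = randn(Float32, 4, 10)
ϵ = 1f-5
dims = 2    # the last dimension, matching the function's default

μ = mean(x, dims = dims)
σ = std(x, dims = dims, mean = μ, corrected = false)

y = @. (x - μ) / (σ + ϵ)    # the same expression as the new return line

mean(y, dims = dims)                    # ≈ 0
std(y, dims = dims, corrected = false)  # ≈ 1
```
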
src/optimise/optimisers.jl

Lines changed: 3 additions & 3 deletions
@@ -51,7 +51,7 @@ Gradient descent optimizer with learning rate `η` and momentum `ρ`.
 - Learning rate (`η`): Amount by which gradients are discounted before updating
   the weights.
 - Momentum (`ρ`): Controls the acceleration of gradient descent in the
-  prominent direction, in effect dampening oscillations.
+  prominent direction, in effect damping oscillations.
 
 # Examples
 ```julia
@@ -84,7 +84,7 @@ Gradient descent optimizer with learning rate `η` and Nesterov momentum `ρ`.
 - Learning rate (`η`): Amount by which gradients are discounted before updating
   the weights.
 - Nesterov momentum (`ρ`): Controls the acceleration of gradient descent in the
-  prominent direction, in effect dampening oscillations.
+  prominent direction, in effect damping oscillations.
 
 # Examples
 ```julia
@@ -121,7 +121,7 @@ generally don't need tuning.
 - Learning rate (`η`): Amount by which gradients are discounted before updating
   the weights.
 - Momentum (`ρ`): Controls the acceleration of gradient descent in the
-  prominent direction, in effect dampening oscillations.
+  prominent direction, in effect damping oscillations.
 
 # Examples
 ```julia

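On the "damping oscillations" wording fixed above: a minimal sketch of classical momentum (written from the docstring's description of `η` and `ρ`, not copied from Flux's optimiser code) showing why an oscillating gradient component is damped while a consistent one is accelerated.

```julia
function momentum_demo(grads; η = 0.1, ρ = 0.9)
    v, w = 0.0, 0.0
    for g in grads
        v = ρ * v - η * g   # velocity: old gradients decay with factor ρ
        w += v              # parameter step follows the velocity
    end
    return w
end

# A gradient that flips sign every step mostly cancels inside v, so the weight
# barely moves; a constant gradient of the same size builds v up and moves it far.
momentum_demo([(-1.0)^t for t in 1:20])   # ≈ 0: oscillation damped
momentum_demo(ones(20))                   # large negative: steady direction accelerated
```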