
Commit 37a03a8

Resolve conflicts
Merge commit 37a03a8 (2 parents: 1b8b1bf + 674527e)

File tree: 19 files changed (+396, -169 lines)


NEWS.md

Lines changed: 2 additions & 0 deletions

@@ -11,6 +11,8 @@ been removed in favour of MLDatasets.jl.
 * Many utily functions and the `DataLoader` are [now provided by MLUtils.jl](https://github.com/FluxML/Flux.jl/pull/1874).
 * The DataLoader is now compatible with generic dataset types implementing `MLUtils.numobs` and `MLUtils.getobs`.
 * Added [truncated normal initialisation](https://github.com/FluxML/Flux.jl/pull/1877) of weights.
+* The `Flux.Diagonal` layer is now called `Scale`, and accepts an activation function.
+* `loadparams!` is replaced by [`loadmodel!`](https://github.com/FluxML/Flux.jl/pull/1875) which copies trainable + non-trainable parameters and performs more thorough structural checking

 ## v0.12.10
 * `Dropout`/`AlphaDropout` now supports [user-specified RNGs](https://github.com/FluxML/Flux.jl/pull/1838)
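Taken together, the two new entries above read roughly as follows in user code. This is a minimal sketch assuming the v0.13 API described in these notes; the layer sizes are illustrative only.

```julia
using Flux

# Flux.Diagonal is now Flux.Scale, and it accepts an activation function:
s = Flux.Scale(5, relu)     # element-wise y = relu.(scale .* x .+ bias)

# loadmodel! copies trainable and non-trainable state (e.g. BatchNorm statistics)
# from one model into another, checking that the two structures match:
m1 = Chain(Dense(10 => 5, relu), BatchNorm(5), Dense(5 => 2))
m2 = Chain(Dense(10 => 5, relu), BatchNorm(5), Dense(5 => 2))
Flux.loadmodel!(m2, m1)     # m2 now carries m1's parameters and running statistics
```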

Project.toml

Lines changed: 2 additions & 2 deletions

@@ -1,6 +1,6 @@
 name = "Flux"
 uuid = "587475ba-b771-5e3f-ad9e-33799f191a9c"
-version = "0.13.0-DEV"
+version = "0.13.0"

 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
@@ -29,7 +29,7 @@ Adapt = "3.0"
 ArrayInterface = "3.1, 4, 5"
 CUDA = "3"
 ChainRulesCore = "1.12"
-Functors = "0.2.1"
+Functors = "0.2.8"
 MLUtils = "0.2"
 MacroTools = "0.5"
 NNlib = "0.8.2"
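For downstream users this is an ordinary package upgrade. A hypothetical sketch of picking up the release from the REPL (nothing here is prescribed by the commit itself):

```julia
using Pkg

# Move from the 0.13.0-DEV prerelease to the released 0.13 series.
Pkg.add(name = "Flux", version = "0.13")

# The tightened Functors = "0.2.8" bound is resolved automatically.
Pkg.status("Flux")
```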

docs/src/models/layers.md

Lines changed: 1 addition & 2 deletions

@@ -25,7 +25,6 @@ CrossCor
 SamePad
 Flux.flatten
 Flux.convfilter
-Flux.depthwiseconvfilter
 ```

 ## Upsampling Layers
@@ -57,7 +56,7 @@ Maxout
 SkipConnection
 Parallel
 Flux.Bilinear
-Flux.Diagonal
+Flux.Scale
 Flux.Embedding
 ```

docs/src/saving.md

Lines changed: 22 additions & 24 deletions

@@ -2,7 +2,7 @@

 You may wish to save models so that they can be loaded and run in a later
 session. The easiest way to do this is via
-[BSON.jl](https://github.com/MikeInnes/BSON.jl).
+[BSON.jl](https://github.com/JuliaIO/BSON.jl).

 Save a model:

@@ -34,7 +34,6 @@ Chain(
   Dense(5 => 2),                        # 12 parameters
   NNlib.softmax,
 )                   # Total: 4 arrays, 67 parameters, 524 bytes.
-
 ```

 Models are just normal Julia structs, so it's fine to use any Julia storage
@@ -44,15 +43,17 @@ versions of Flux).

 !!! note

-    If a saved model's weights are stored on the GPU, the model will not load
+    If a saved model's parameters are stored on the GPU, the model will not load
     later on if there is no GPU support available. It's best to [move your model
     to the CPU](gpu.md) with `cpu(model)` before saving it.

-## Saving Model Weights
+!!! warning

-In some cases it may be useful to save only the model parameters themselves, and
-rebuild the model architecture in your code. You can use `params(model)` to get
-model parameters.
+    Previous versions of Flux suggested saving only the model weights using
+    `@save "mymodel.bson" params(model)`.
+    This is no longer recommended and even strongly discouraged.
+    Saving models this way will only store the trainable parameters which
+    will result in incorrect behavior for layers like `BatchNorm`.

 ```jldoctest saving
 julia> model = Chain(Dense(10 => 5,relu),Dense(5 => 2),softmax)
@@ -64,29 +65,26 @@ Chain(

 julia> weights = Flux.params(model);

-julia> using BSON: @save
-
-julia> @save "mymodel.bson" weights
-```
-
-You can easily load parameters back into a model with `Flux.loadparams!`.
+Loading the model as shown above will return a new model with the stored parameters.
+But sometimes you already have a model, and you want to load stored parameters into it.
+This can be done as

 ```julia
-julia> model = Chain(Dense(10 => 5,relu),Dense(5 => 2),softmax)
-Chain(
-  Dense(10 => 5, relu),                 # 55 parameters
-  Dense(5 => 2),                        # 12 parameters
-  NNlib.softmax,
-)                   # Total: 4 arrays, 67 parameters, 524 bytes.
+using Flux: loadmodel!
+using BSON: @load

-julia> using BSON: @load
+# some predefined model
+model = Chain(Dense(10 => 5, relu), Dense(5 => 2), softmax)

-julia> @load "mymodel.bson" weights
-
-julia> Flux.loadparams!(model, weights)
+# load one model into another
+model = loadmodel!(model, @load("mymodel.bson"))
 ```

-The new `model` we created will now be identical to the one we saved parameters for.
+This ensures that the model loaded from `"mymodel.bson"` matches the structure of `model`. [`Flux.loadmodel!`](@ref) is also convenient for copying parameters between models in memory.
+
+```@docs
+Flux.loadmodel!
+```

 ## Checkpointing
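As a quick end-to-end check of the workflow these docs now recommend, a round trip through disk might look like the sketch below. The file name is illustrative and this is not part of the committed docs; it assumes BSON.jl is installed alongside Flux v0.13.

```julia
using Flux, BSON

model = Chain(Dense(10 => 5, relu), Dense(5 => 2), softmax)
BSON.@save "mymodel.bson" model        # save the whole struct, not just params(model)

model2 = Chain(Dense(10 => 5, relu), Dense(5 => 2), softmax)  # freshly initialised
BSON.@load "mymodel.bson" model        # brings the saved `model` back into scope
Flux.loadmodel!(model2, model)         # copy the stored state into the existing model2

@assert model2(ones(Float32, 10)) == model(ones(Float32, 10))
```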

src/Flux.jl

Lines changed: 2 additions & 0 deletions

@@ -46,6 +46,8 @@ include("layers/normalise.jl")
 include("layers/upsample.jl")
 include("layers/show.jl")

+include("loading.jl")
+
 include("outputsize.jl")

 include("data/Data.jl")

src/deprecations.jl

Lines changed: 21 additions & 2 deletions

@@ -1,13 +1,13 @@
 # v0.12 deprecations

 function ones(dims...)
-  Base.depwarn("Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", :ones)
+  Base.depwarn("Flux.ones(size...) is deprecated, please use Flux.ones32(size...) or Base.ones(Float32, size...)", :ones, force=true)
   Base.ones(Float32, dims...)
 end
 ones(T::Type, dims...) = Base.ones(T, dims...)

 function zeros(dims...)
-  Base.depwarn("Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)", :zeros)
+  Base.depwarn("Flux.zeros(size...) is deprecated, please use Flux.zeros32(size...) or Base.zeros(Float32, size...)", :zeros, force=true)
   Base.zeros(Float32, dims...)
 end
 zeros(T::Type, dims...) = Base.zeros(T, dims...)
@@ -39,6 +39,25 @@ function Optimise.update!(x::AbstractArray, x̄)
   x .-= x̄
 end

+function Diagonal(size::Integer...; kw...)
+  Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", :Diagonal)
+  Scale(size...; kw...)
+end
+function Diagonal(size::Tuple; kw...)
+  Base.depwarn("Flux.Diagonal is now Flux.Scale, and also allows an activation function.", :Diagonal)
+  Scale(size...; kw...)
+end
+
+# Deprecate this eventually once saving models w/o structure is no more
+function loadparams!(m, xs)
+  Base.depwarn("loadparams! will be deprecated eventually. Use loadmodel! instead.", :loadparams!)
+  for (p, x) in zip(params(m), xs)
+    size(p) == size(x) ||
+      error("Expected param size $(size(p)), got $(size(x))")
+    copyto!(p, x)
+  end
+end
+
 # Channel notation: Changed to match Conv, but very softly deprecated!
 # Perhaps change to @deprecate for v0.14, but there is no plan to remove these.
 Dense(in::Integer, out::Integer, σ = identity; kw...) =
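With these forwarding methods in place, existing user code keeps working while pointing at the new names. A small sketch of the expected behaviour under the definitions above (the warnings are only printed when Julia is started with --depwarn=yes, since these depwarns do not set force=true):

```julia
using Flux

d = Flux.Diagonal(3)          # forwards to Scale(3) and emits a depwarn
d isa Flux.Scale              # true

m = Chain(Dense(2 => 2))
ps = collect(Flux.params(m))
Flux.loadparams!(m, ps)       # still copies size-matched arrays, but suggests loadmodel!
```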

src/functor.jl

Lines changed: 0 additions & 8 deletions

@@ -85,14 +85,6 @@ function params(m...)
   return ps
 end

-function loadparams!(m, xs)
-  for (p, x) in zip(params(m), xs)
-    size(p) == size(x) ||
-      error("Expected param size $(size(p)), got $(size(x))")
-    copyto!(p, x)
-  end
-end
-
 struct FluxCUDAAdaptor end
 adapt_storage(to::FluxCUDAAdaptor, x) = CUDA.cu(x)
 adapt_storage(to::FluxCUDAAdaptor, x::Zygote.FillArrays.AbstractFill) = CUDA.cu(collect(x))

src/layers/basic.jl

Lines changed: 49 additions & 19 deletions

@@ -156,9 +156,8 @@ end
 @functor Dense

 function (a::Dense)(x::AbstractVecOrMat)
-  W, b = a.weight, a.bias
   σ = NNlib.fast_act(a.σ, x) # replaces tanh => tanh_fast, etc
-  return σ.(W*x .+ b)
+  return σ.(a.weight * x .+ a.bias)
 end

 (a::Dense)(x::AbstractArray) =
@@ -172,38 +171,69 @@ function Base.show(io::IO, l::Dense)
 end

 """
-    Diagonal(size::Integer...; bias=true, init=ones32)
-    Diagonal(scale::AbstractArray, [bias])
+    Scale(size::Integer..., σ=identity; bias=true, init=ones32)
+    Scale(scale::AbstractArray, [bias, σ])

-Create an element-wise linear layer, which performs
+Create an element-wise layer, whose forward pass is given by:

-    y = scale .* x .+ bias
+    y = σ.(scale .* x .+ bias)

-with no activation function.
-
+This uses `.*` instead of matrix multiplication `*` of [`Dense`](@ref).
+
 The learnable scale & bias are initialised `init(size...)` and `zeros32(size...)`,
 with `init=ones32` by default. You may specify the function `init`,
 turn off trainable bias with `bias=false`, or provide the array(s) explicitly.

-Used by [`LayerNorm`](@ref).
+Used by [`LayerNorm`](@ref) with `affine=true`.
+
+# Examples
+```jldoctest
+julia> a = Flux.Scale(2)
+Scale(2)  # 4 parameters
+
+julia> Flux.params(a)
+Params([Float32[1.0, 1.0], Float32[0.0, 0.0]])
+
+julia> a([1 2 3])
+2×3 Matrix{Float32}:
+ 1.0  2.0  3.0
+ 1.0  2.0  3.0
+
+julia> b = Flux.Scale([1 2 3 4], false, abs2)
+Scale(1, 4, abs2; bias=false)  # 4 parameters
+
+julia> b([1, 10])
+2×4 Matrix{Int64}:
+   1    4    9    16
+ 100  400  900  1600
+
+julia> Flux.params(b)
+Params([[1 2 3 4]])
+```
 """
-struct Diagonal{A<:AbstractArray, B}
+struct Scale{F, A<:AbstractArray, B}
   scale::A
   bias::B
-  function Diagonal(W::M, bias = true) where M<:AbstractArray
-    b = create_bias(W, bias, size(W)...)
-    new{M, typeof(b)}(W, b)
+  σ::F
+  function Scale(scale::A, bias::B = true, σ::F = identity) where {A<:AbstractArray, B<:Union{Bool, AbstractArray}, F}
+    b = create_bias(scale, bias, size(scale)...)
+    new{F, A, typeof(b)}(scale, b, σ)
   end
 end

-Diagonal(sz::Integer...; bias = true, init = ones32) = Diagonal(init(sz...), bias)
+Scale(s1::Integer, s23::Integer...; bias = true, init = ones32, _act = identity) = Scale(init(s1, s23...), bias, _act)
+Scale(size_act...; bias = true, init = ones32) = Scale(size_act[1:end-1]...; bias, init, _act = size_act[end])

-@functor Diagonal
+@functor Scale

-(a::Diagonal)(x) = a.scale .* x .+ a.bias
+function (a::Scale)(x::AbstractArray)
+  σ = NNlib.fast_act(a.σ, x) # replaces tanh => tanh_fast, etc
+  σ.(a.scale .* x .+ a.bias)
+end

-function Base.show(io::IO, l::Diagonal)
-  print(io, "Diagonal(", join(size(l.scale), ", "))
+function Base.show(io::IO, l::Scale)
+  print(io, "Scale(", join(size(l.scale), ", "))
+  l.σ == identity || print(io, ", ", l.σ)
   l.bias == false && print(io, "; bias=false")
   print(io, ")")
 end
@@ -212,7 +242,7 @@ end
    Maxout(layers...)
    Maxout(f, n_alts)

-This contains a number of internal layes, each of which receives the same input.
+This contains a number of internal layers, each of which receives the same input.
 Its output is the elementwise maximum of the the internal layers' outputs.

 Instead of defining layers individually, you can provide a zero-argument function