"""
    Chain(layers...)
-
Chain multiple layers / functions together, so that they are called in sequence
on a given input.
-
`Chain` also supports indexing and slicing, e.g. `m[2]` or `m[1:end-1]`.
`m[1:3](x)` will calculate the output of the first three layers.
-
# Examples
```jldoctest
julia> m = Chain(x -> x^2, x -> x+1);
-
julia> m(5) == 26
true
-
julia> m = Chain(Dense(10, 5), Dense(5, 2));
-
julia> x = rand(10);
-
julia> m(x) == m[2](m[1](x))
true
```
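The indexing and slicing behaviour mentioned in the docstring is not exercised by the doctest above; a minimal sketch of what it implies, not part of the diff and assuming the `Chain` indexing methods described there:

```julia
using Flux

m = Chain(Dense(10, 5, relu), Dense(5, 3), softmax)
x = rand(Float32, 10)

# slicing returns another Chain, so it can be called like any layer
m[1:2](x) == m[2](m[1](x))     # output of the first two layers
m[end](m[1:end-1](x)) == m(x)  # last layer applied to the truncated chain
```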
@@ -52,7 +45,6 @@
# only slightly changed to better handle interaction with Zygote @dsweber2
"""
    activations(c::Chain, input)
-
Calculate the forward results of each layer in Chain `c` with `input` as model input.
"""
function activations(c::Chain, input)
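For context, a rough sketch of how `activations` is typically called; this is illustrative only (not part of the diff) and assumes the behaviour described in the docstring, i.e. one output per layer:

```julia
using Flux

m = Chain(Dense(3, 4, relu), Dense(4, 2))
x = rand(Float32, 3)

acts = Flux.activations(m, x)  # collection of per-layer outputs
length(acts) == 2              # one entry per layer
acts[end] == m(x)              # the last entry is the model output
```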
@@ -69,81 +61,75 @@ extraChain(::Tuple{}, x) = ()


"""
-    Dense(in, out, σ = identity; bias = true, init = glorot_uniform)
+    Dense(in, out, σ=identity; bias=true, init=glorot_uniform)
    Dense(W::AbstractMatrix, [bias, σ])
-
Create a traditional `Dense` layer, whose forward pass is given by:
-
    y = σ.(W * x .+ bias)
-
The input `x` should be a vector of length `in`, or batch of vectors represented
as an `in × N` matrix, or any array with `size(x,1) == in`.
The output `y` will be a vector of length `out`, or a batch with
`size(y) == (out, size(x)[2:end]...)`
-
-Keyword `bias = false` will switch off trainable bias for the layer.
+Keyword `bias=false` will switch off trainable bias for the layer.
The initialisation of the weight matrix is `W = init(out, in)`, calling the function
given to keyword `init`, with default [`glorot_uniform`](@doc Flux.glorot_uniform).
The weight matrix and/or the bias vector (of length `out`) may also be provided explicitly.
-
# Examples
```jldoctest
julia> d = Dense(5, 2)
Dense(5, 2)
-
julia> d(rand(Float32, 5, 64)) |> size
(2, 64)
-
julia> d(rand(Float32, 5, 1, 1, 64)) |> size  # treated as three batch dimensions
(2, 1, 1, 64)
-
julia> d1 = Dense(ones(2, 5), false, tanh)  # using provided weight matrix
Dense(5, 2, tanh; bias=false)
-
julia> d1(ones(5))
-2-element Vector{Float64}:
+2-element Array{Float64,1}:
 0.9999092042625951
 0.9999092042625951
-
julia> Flux.params(d1)  # no trainable bias
Params([[1.0 1.0 … 1.0 1.0; 1.0 1.0 … 1.0 1.0]])
```
"""
-struct Dense{F,S<:AbstractArray,T}
-  weight::S
-  bias::T
+struct Dense{F, M<:AbstractMatrix, B}
+  weight::M
+  bias::B
  σ::F
+  function Dense(W::M, bias = true, σ::F = identity) where {M<:AbstractMatrix, F}
+    b = create_bias(W, bias, size(W,1))
+    new{F,M,typeof(b)}(W, b, σ)
+  end
end

-Dense(W, b) = Dense(W, b, identity)
-
-Dense(W::AbstractArray, b::Bool = true, σ = identity) =
-  Dense(W, create_bias(W, b, size(W,1)), σ)
+function Dense(in::Integer, out::Integer, σ = identity;
+               initW = nothing, initb = nothing,
+               init = glorot_uniform, bias=true)

-function Dense(in::Integer, out::Integer, σ = identity; initW = nothing,
-               init = glorot_uniform, initb = nothing, bias::Bool = true)
-  if initW !== nothing
-    Base.depwarn("initW is deprecated, please use the `init` keyword instead", :Dense)
-    init = initW
+  W = if initW !== nothing
+    Base.depwarn("keyword initW is deprecated, please use init (which similarly accepts a function like randn)", :Dense)
+    initW(out, in)
+  else
+    init(out, in)
  end

-  if initb !== nothing
-    Base.depwarn("initb is deprecated, please use the array based constructors instead", :Dense)
-    initb = initb
+  b = if bias === true && initb !== nothing
+    Base.depwarn("keyword initb is deprecated, please simply supply the bias vector, bias=initb(out)", :Dense)
+    initb(out)
  else
-    initb = zeros
+    bias
  end
-  Dense(init(out, in), bias ? initb(out) : Zeros(), σ)
+
+  return Dense(W, b, σ)
end
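A short sketch of what the reworked constructors accept, based on the docstring and the code above (illustrative only, not part of the diff):

```julia
using Flux

d = Dense(4, 3, relu)                     # weight is 3×4 from `init`, trainable bias by default
d_nobias = Dense(4, 3, relu; bias=false)  # no bias term at all

# explicit weight matrix, and optionally an explicit bias vector
d_explicit = Dense(randn(Float32, 3, 4), zeros(Float32, 3), tanh)

d(rand(Float32, 4)) |> size               # (3,)
```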

@functor Dense

function (a::Dense)(x::AbstractVecOrMat)
  W, b, σ = a.weight, a.bias, a.σ
-  σ.(W * x .+ b)
+  return σ.(W*x .+ b)
end

-(a::Dense)(x) =
+(a::Dense)(x::AbstractArray) =
  reshape(a(reshape(x, size(x,1), :)), :, size(x)[2:end]...)

function Base.show(io::IO, l::Dense)
@@ -156,14 +142,10 @@ end
"""
    Diagonal(α, β)
    Diagonal(size::Integer...)
-
Create an element-wise linear layer, which performs
-
    y = α .* x .+ β
-
The learnable arrays are initialised `α = ones(Float32, size)` and
`β = zeros(Float32, size)`.
-
Used by [`LayerNorm`](@ref).
"""
struct Diagonal{T}
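For reference, a minimal sketch of the behaviour the `Diagonal` docstring describes; the field names `α` and `β` are taken from the docstring and are an assumption here, since the struct body is not shown in the diff:

```julia
using Flux

d = Flux.Diagonal(3)      # α = ones(Float32, 3), β = zeros(Float32, 3)
x = Float32[1, 2, 3]

d(x) == d.α .* x .+ d.β   # element-wise scale and shift
```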
@@ -197,11 +179,9 @@

"""
    Maxout(over)
-
The [Maxout](https://arxiv.org/abs/1302.4389) layer has a number of
internal layers which all receive the same input. It returns the elementwise
maximum of the internal layers' outputs.
-
Maxout over linear dense layers satisfies the universal approximation theorem.
"""
struct Maxout{FS<:Tuple}
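A minimal sketch of the `Maxout(over)` behaviour described above (not part of the diff); the field name `over` is an assumption taken from the constructor signature, since the struct body is not shown here:

```julia
using Flux

# `over` is a tuple of layers that all see the same input
m = Maxout((Dense(3, 4), Dense(3, 4)))
x = rand(Float32, 3)

m(x) == max.(m.over[1](x), m.over[2](x))  # element-wise maximum of the branches
```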
@@ -210,20 +190,15 @@ end

"""
    Maxout(f, n_alts)
-
Construct a Maxout layer over `n_alts` instances of the layer given by `f`.
The function takes no arguments and should return some callable layer.
Conventionally, this is a linear dense layer.
-
# Examples
-
This constructs a `Maxout` layer over 4 internal dense linear layers, each
identical in structure (784 inputs, 128 outputs):
```jldoctest
julia> insize = 784;
-
julia> outsize = 128;
-
julia> Maxout(()->Dense(insize, outsize), 4);
```
"""
@@ -240,25 +215,19 @@ end

"""
    SkipConnection(layer, connection)
-
Create a skip connection which consists of a layer or `Chain` of consecutive
layers and a shortcut connection linking the block's input to the output
through a user-supplied 2-argument callable. The first argument to the callable
will be propagated through the given `layer` while the second is the unchanged,
"skipped" input.
-
The simplest "ResNet"-type connection is just `SkipConnection(layer, +)`.
Here is a more complicated example:
```jldoctest
julia> m = Conv((3,3), 4 => 7, pad=(1,1));
-
julia> x = ones(Float32, 5, 5, 4, 10);
-
julia> size(m(x)) == (5, 5, 7, 10)
true
-
julia> sm = SkipConnection(m, (mx, x) -> cat(mx, x, dims=3));
-
julia> size(sm(x)) == (5, 5, 11, 10)
true
```
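The "simplest ResNet-type connection" mentioned in the docstring, spelled out as a small sketch (illustrative, not part of the diff):

```julia
using Flux

layer = Dense(5, 5, relu)
res = SkipConnection(layer, +)  # output is layer(x) + x

x = rand(Float32, 5)
res(x) == layer(x) + x
```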
@@ -281,45 +250,32 @@ end
"""
    Bilinear(in1, in2, out, σ=identity; bias=true, init=glorot_uniform)
    Bilinear(W::AbstractArray, [bias, σ])
-
Creates a Bilinear layer, which operates on two inputs at the same time.
Its output, given vectors `x` & `y`, is another vector `z` with,
for all `i ∈ 1:out`:
-
    z[i] = σ(x' * W[i,:,:] * y + bias[i])
-
If `x` and `y` are matrices, then each column of the output `z = B(x, y)` is of this form,
with `B` a Bilinear layer.
-
If `y` is not given, it is taken to be equal to `x`, i.e. `B(x) == B(x, x)`
The two inputs may also be provided as a tuple, `B((x, y)) == B(x, y)`,
which is accepted as the input to a `Chain`.
-
The initialisation works as for [`Dense`](@ref) layer, with `W = init(out, in1, in2)`.
By default the bias vector is `zeros(Float32, out)`, option `bias=false` will switch off
trainable bias. Either of these may be provided explicitly.
-
# Examples
-
```jldoctest
julia> x, y = randn(Float32, 5, 32), randn(Float32, 5, 32);
-
julia> B = Flux.Bilinear(5, 5, 7);
-
julia> B(x) |> size  # interactions based on one input
(7, 32)
-
julia> B(x,y) == B((x,y))  # two inputs, may be given as a tuple
true
-
julia> sc = SkipConnection(
         Chain(Dense(5, 20, tanh), Dense(20, 9, tanh)),
         Flux.Bilinear(9, 5, 3, bias=false),
       );  # used as the recombinator, with skip as the second input
-
julia> sc(x) |> size
(3, 32)
-
julia> Flux.Bilinear(rand(4,8,16), false, tanh)  # first dim of weight is the output
Bilinear(8, 16, 4, tanh, bias=false)
```
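To make the `z[i]` formula concrete, here is a small check using the explicit-weight constructor from the docstring (illustrative, not part of the diff):

```julia
using Flux

W, b = randn(Float32, 2, 3, 4), zeros(Float32, 2)
B = Flux.Bilinear(W, b, tanh)            # first weight dimension is the output size

x, y = randn(Float32, 3), randn(Float32, 4)
z = B(x, y)

z[1] ≈ tanh(x' * W[1, :, :] * y + b[1])  # matches z[i] = σ(x' * W[i,:,:] * y + bias[i])
```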
@@ -373,26 +329,19 @@ end

"""
    Parallel(connection, layers...)
-
Create a `Parallel` layer that passes an input array to each path in
`layers`, reducing the output with `connection`.
-
Called with one input `x`, this is equivalent to `reduce(connection, [l(x) for l in layers])`.
If called with multiple inputs, they are `zip`ped with the layers, thus `Parallel(+, f, g)(x, y) = f(x) + g(y)`.
-
# Examples
-
```jldoctest
julia> model = Chain(Dense(3, 5),
                     Parallel(vcat, Dense(5, 4), Chain(Dense(5, 7), Dense(7, 4))),
                     Dense(8, 17));
-
julia> size(model(rand(3)))
(17,)
-
julia> model = Parallel(+, Dense(10, 2), Dense(5, 2))
Parallel(+, Dense(10, 2), Dense(5, 2))
-
julia> size(model(rand(10), rand(5)))
(2,)
```
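A small sketch of the `reduce(connection, ...)` semantics stated above (illustrative, not part of the diff):

```julia
using Flux

f, g = Dense(4, 2), Dense(4, 3)
p = Parallel(vcat, f, g)
x = rand(Float32, 4)

p(x) == vcat(f(x), g(x))  # one input is broadcast to every branch

# with several inputs, they are zipped with the branches
q = Parallel(+, Dense(4, 2), Dense(6, 2))
q(rand(Float32, 4), rand(Float32, 6)) |> size  # (2,)
```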