
Commit 2d357ad

Add friendly size check (#2176)
* add _size_check
* fix _channels_in(::CrossCor)
* outputsize
* fix GroupNorm not to re-use the same name for different things, dammit
* friendly error for ndims too
* is LayerNorm(1) allowed?
* rm outputsize(::Chain)
* doctest
1 parent d511d7a commit 2d357ad

File tree: 7 files changed, +45 −40 lines
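The commit replaces bare `@assert`s and low-level NNlib errors with a shared `_size_check` helper, so a wrongly sized input now produces an error that names the layer and summarises the input. As a rough illustration (not part of the diff; the exact rendering of `DimensionMismatch` varies by Julia version), passing a 4-element vector to `Dense(2 => 3)` now fails like this:

```julia
julia> using Flux

julia> Dense(2 => 3)(rand(Float32, 4))
ERROR: DimensionMismatch: layer Dense(2 => 3) expects size(input, 1) == 2, but got 4-element Vector{Float32}
```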

src/layers/basic.jl

Lines changed: 12 additions & 1 deletion
@@ -168,13 +168,16 @@ end
 @functor Dense
 
 function (a::Dense)(x::AbstractVecOrMat)
+  _size_check(a, x, 1 => size(a.weight, 2))
   σ = NNlib.fast_act(a.σ, x) # replaces tanh => tanh_fast, etc
   xT = _match_eltype(a, x)  # fixes Float64 input, etc.
   return σ.(a.weight * xT .+ a.bias)
 end
 
-(a::Dense)(x::AbstractArray) =
+function (a::Dense)(x::AbstractArray)
+  _size_check(a, x, 1 => size(a.weight, 2))
   reshape(a(reshape(x, size(x,1), :)), :, size(x)[2:end]...)
+end
 
 function Base.show(io::IO, l::Dense)
   print(io, "Dense(", size(l.weight, 2), " => ", size(l.weight, 1))
@@ -186,6 +189,14 @@ end
 Dense(W::LinearAlgebra.Diagonal, bias = true, σ = identity) =
   Scale(W.diag, bias, σ)
 
+function _size_check(layer, x::AbstractArray, (d, n)::Pair)
+  d > 0 || throw(DimensionMismatch(string("layer ", layer,
+    " expects ndims(input) > ", ndims(x)-d, ", but got ", summary(x))))
+  size(x, d) == n || throw(DimensionMismatch(string("layer ", layer,
+    " expects size(input, $d) == $n, but got ", summary(x))))
+end
+ChainRulesCore.@non_differentiable _size_check(::Any...)
+
 """
     Scale(size::Integer..., σ=identity; bias=true, init=ones32)
     Scale(scale::AbstractArray, [bias, σ])
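The helper takes a `dim => expected_length` pair: the first branch fires when `d <= 0`, i.e. the input has too few dimensions for the layer to index its target axis at all, and the second reports the mismatched axis via `summary(x)`. A minimal sketch of calling it directly (it is an internal helper, not public API; the quoted error text is illustrative):

```julia
using Flux

d = Dense(2 => 3)   # weight is 3×2, so inputs need size(x, 1) == 2
Flux._size_check(d, rand(Float32, 2, 5), 1 => size(d.weight, 2))   # no error for a matching first axis

# A mismatched first axis throws, naming the layer and summarising the input:
# Flux._size_check(d, rand(Float32, 4, 5), 1 => size(d.weight, 2))
# ERROR: DimensionMismatch: layer Dense(2 => 3) expects size(input, 1) == 2, but got 4×5 Matrix{Float32}
```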

src/layers/conv.jl

Lines changed: 5 additions & 0 deletions
@@ -195,6 +195,7 @@ conv_dims(c::Conv, x::AbstractArray) =
 ChainRulesCore.@non_differentiable conv_dims(::Any, ::Any)
 
 function (c::Conv)(x::AbstractArray)
+  _size_check(c, x, ndims(x)-1 => _channels_in(c))
   σ = NNlib.fast_act(c.σ, x)
   cdims = conv_dims(c, x)
   xT = _match_eltype(c, x)
@@ -329,6 +330,7 @@ end
 ChainRulesCore.@non_differentiable conv_transpose_dims(::Any, ::Any)
 
 function (c::ConvTranspose)(x::AbstractArray)
+  _size_check(c, x, ndims(x)-1 => _channels_in(c))
   σ = NNlib.fast_act(c.σ, x)
   cdims = conv_transpose_dims(c, x)
   xT = _match_eltype(c, x)
@@ -418,6 +420,8 @@ struct CrossCor{N,M,F,A,V}
   dilation::NTuple{N,Int}
 end
 
+_channels_in(l::CrossCor) = size(l.weight, ndims(l.weight)-1)
+
 """
     CrossCor(weight::AbstractArray, [bias, activation; stride, pad, dilation])
 
@@ -468,6 +472,7 @@ crosscor_dims(c::CrossCor, x::AbstractArray) =
 ChainRulesCore.@non_differentiable crosscor_dims(::Any, ::Any)
 
 function (c::CrossCor)(x::AbstractArray)
+  _size_check(c, x, ndims(x)-1 => _channels_in(c))
   σ = NNlib.fast_act(c.σ, x)
   cdims = crosscor_dims(c, x)
   xT = _match_eltype(c, x)
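All three convolution layers now check the channel axis, `ndims(x)-1`, against `_channels_in` before calling into NNlib, and the new `_channels_in(::CrossCor)` method reads the channel count from the next-to-last axis of the weight so `CrossCor` gets the same check. A hedged sketch of the resulting behaviour (error text follows `_size_check` above; exact printing may differ by Julia version):

```julia
julia> using Flux

julia> layer = Conv((3, 3), 3 => 16);

julia> layer(rand(Float32, 10, 10, 7, 64))   # 7 channels where 3 are expected
ERROR: DimensionMismatch: layer Conv((3, 3), 3 => 16) expects size(input, 3) == 3, but got 10×10×7×64 Array{Float32, 4}
```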

src/layers/normalise.jl

Lines changed: 19 additions & 17 deletions
@@ -188,7 +188,14 @@ LayerNorm(size_act...; kw...) = LayerNorm(Int.(size_act[1:end-1]), size_act[end]
 
 @functor LayerNorm
 
-(a::LayerNorm)(x) = a.diag(normalise(x, dims=1:length(a.size), ϵ=a.ϵ))
+function (a::LayerNorm)(x::AbstractArray)
+  ChainRulesCore.@ignore_derivatives if a.diag isa Scale
+    for d in 1:ndims(a.diag.scale)
+      _size_check(a, x, d => size(a.diag.scale, d))
+    end
+  end
+  a.diag(normalise(x, dims=1:length(a.size), ϵ=a.ϵ))
+end
 
 function Base.show(io::IO, l::LayerNorm)
   print(io, "LayerNorm(", join(l.size, ", "))
@@ -318,9 +325,8 @@ end
 @functor BatchNorm
 trainable(bn::BatchNorm) = hasaffine(bn) ? (β = bn.β, γ = bn.γ) : (;)
 
-function (BN::BatchNorm)(x)
-  @assert size(x, ndims(x)-1) == BN.chs
-  N = ndims(x)
+function (BN::BatchNorm)(x::AbstractArray{T,N}) where {T,N}
+  _size_check(BN, x, N-1 => BN.chs)
   reduce_dims = [1:N-2; N]
   affine_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)
   return _norm_layer_forward(BN, x; reduce_dims, affine_shape)
@@ -408,10 +414,8 @@ end
 @functor InstanceNorm
 trainable(in::InstanceNorm) = hasaffine(in) ? (β = in.β, γ = in.γ) : (;)
 
-function (l::InstanceNorm)(x)
-  @assert ndims(x) > 2
-  @assert size(x, ndims(x)-1) == l.chs
-  N = ndims(x)
+function (l::InstanceNorm)(x::AbstractArray{T,N}) where {T,N}
+  _size_check(l, x, N-1 => l.chs)
   reduce_dims = 1:N-2
   affine_shape = ntuple(i -> i == N-1 ? size(x, N-1) : 1, N)
   return _norm_layer_forward(l, x; reduce_dims, affine_shape)
@@ -511,17 +515,15 @@ end
     nothing, chs)
 end
 
-function (gn::GroupNorm)(x)
-  @assert ndims(x) > 2
-  @assert size(x, ndims(x)-1) == gn.chs
-  N = ndims(x)
+function (gn::GroupNorm)(x::AbstractArray)
+  _size_check(gn, x, ndims(x)-1 => gn.chs)
   sz = size(x)
-  x = reshape(x, sz[1:N-2]..., sz[N-1]÷gn.G, gn.G, sz[N])
-  N = ndims(x)
+  x2 = reshape(x, sz[1:end-2]..., sz[end-1]÷gn.G, gn.G, sz[end])
+  N = ndims(x2)  # == ndims(x)+1
   reduce_dims = 1:N-2
-  affine_shape = ntuple(i -> i ∈ (N-1, N-2) ? size(x, i) : 1, N)
-  x = _norm_layer_forward(gn, x; reduce_dims, affine_shape)
-  return reshape(x, sz)
+  affine_shape = ntuple(i -> i ∈ (N-1, N-2) ? size(x2, i) : 1, N)
+  x3 = _norm_layer_forward(gn, x2; reduce_dims, affine_shape)
+  return reshape(x3, sz)
 end
 
 testmode!(m::GroupNorm, mode = true) =
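For `LayerNorm`, each leading axis of the input is checked against the corresponding axis of the learned `Scale`, while `BatchNorm`, `InstanceNorm` and `GroupNorm` check only the channel axis; the renaming to `x2`/`x3` in `GroupNorm` stops reusing one name for different arrays without changing the computation. A rough illustration of the `LayerNorm` check (error rendering varies by Julia version):

```julia
julia> using Flux

julia> ln = LayerNorm(5);   # diag is Scale(5), so size(input, 1) must be 5

julia> ln(rand(Float32, 5, 16)) |> size
(5, 16)

julia> ln(rand(Float32, 3, 16))
ERROR: DimensionMismatch: layer LayerNorm(5) expects size(input, 1) == 5, but got 3×16 Matrix{Float32}
```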

src/layers/recurrent.jl

Lines changed: 4 additions & 0 deletions
@@ -202,6 +202,7 @@ RNNCell((in, out)::Pair, σ=tanh; init=Flux.glorot_uniform, initb=zeros32, init_
 
 function (m::RNNCell{F,I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{<:AbstractFloat},OneHotArray}) where {F,I,H,V,T}
   Wi, Wh, b = m.Wi, m.Wh, m.b
+  _size_check(m, x, 1 => size(Wi,2))
   σ = NNlib.fast_act(m.σ, x)
   xT = _match_eltype(m, T, x)
   h = σ.(Wi*xT .+ Wh*h .+ b)
@@ -307,6 +308,7 @@ function LSTMCell((in, out)::Pair;
 end
 
 function (m::LSTMCell{I,H,V,<:NTuple{2,AbstractMatrix{T}}})((h, c), x::Union{AbstractVecOrMat{<:AbstractFloat},OneHotArray}) where {I,H,V,T}
+  _size_check(m, x, 1 => size(m.Wi,2))
   b, o = m.b, size(h, 1)
   xT = _match_eltype(m, T, x)
   g = muladd(m.Wi, xT, muladd(m.Wh, h, b))
@@ -379,6 +381,7 @@ GRUCell((in, out)::Pair; init = glorot_uniform, initb = zeros32, init_state = ze
   GRUCell(init(out * 3, in), init(out * 3, out), initb(out * 3), init_state(out,1))
 
 function (m::GRUCell{I,H,V,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{<:AbstractFloat},OneHotArray}) where {I,H,V,T}
+  _size_check(m, x, 1 => size(m.Wi,2))
   Wi, Wh, b, o = m.Wi, m.Wh, m.b, size(h, 1)
   xT = _match_eltype(m, T, x)
   gxs, ghs, bs = multigate(Wi*xT, o, Val(3)), multigate(Wh*h, o, Val(3)), multigate(b, o, Val(3))
@@ -448,6 +451,7 @@ GRUv3Cell((in, out)::Pair; init = glorot_uniform, initb = zeros32, init_state =
     init(out, out), init_state(out,1))
 
 function (m::GRUv3Cell{I,H,V,HH,<:AbstractMatrix{T}})(h, x::Union{AbstractVecOrMat{<:AbstractFloat},OneHotArray}) where {I,H,V,HH,T}
+  _size_check(m, x, 1 => size(m.Wi,2))
   Wi, Wh, b, Wh_h̃, o = m.Wi, m.Wh, m.b, m.Wh_h̃, size(h, 1)
   xT = _match_eltype(m, T, x)
   gxs, ghs, bs = multigate(Wi*xT, o, Val(3)), multigate(Wh*h, o, Val(2)), multigate(b, o, Val(3))
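Each recurrent cell now checks the feature axis of `x` against `size(m.Wi, 2)`, i.e. the `in` half of `in => out`, before the matrix multiply. A hedged example through the `LSTM` wrapper (output shown is illustrative; error rendering varies by Julia version):

```julia
julia> using Flux

julia> m = LSTM(3 => 5);

julia> m(rand(Float32, 3, 7)) |> size   # 3 features, batch of 7
(5, 7)

julia> m(rand(Float32, 4, 7))           # wrong feature count is caught in the cell
ERROR: DimensionMismatch: layer LSTMCell(3 => 5) expects size(input, 1) == 3, but got 4×7 Matrix{Float32}
```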

src/outputsize.jl

Lines changed: 4 additions & 20 deletions
@@ -62,7 +62,7 @@ which should work out of the box for custom layers.
 If `m` is a `Tuple` or `Vector`, its elements are applied in sequence, like `Chain(m...)`.
 
 # Examples
-```julia-repl
+```jldoctest
 julia> using Flux: outputsize
 
 julia> outputsize(Dense(10 => 4), (10,); padbatch=true)
@@ -80,9 +80,7 @@ julia> outputsize(m, (10, 10, 3, 64))
 (6, 6, 32, 64)
 
 julia> try outputsize(m, (10, 10, 7, 64)) catch e println(e) end
-┌ Error: layer Conv((3, 3), 3=>16), index 1 in Chain, gave an error with input of size (10, 10, 7, 64)
-└ @ Flux ~/.julia/dev/Flux/src/outputsize.jl:114
-DimensionMismatch("Input channels must match! (7 vs. 3)")
+DimensionMismatch("layer Conv((3, 3), 3 => 16) expects size(input, 3) == 3, but got 10×10×7×64 Array{Flux.NilNumber.Nil, 4}")
 
 julia> outputsize([Dense(10 => 4), Dense(4 => 2)], (10, 1))  # Vector of layers becomes a Chain
 (2, 1)
@@ -97,19 +95,6 @@ nil_input(pad::Bool, s::Tuple{Vararg{Integer}}) = pad ? fill(nil, (s...,1)) : fi
 nil_input(pad::Bool, multi::Tuple{Vararg{Integer}}...) = nil_input.(pad, multi)
 nil_input(pad::Bool, tup::Tuple{Vararg{Tuple}}) = nil_input(pad, tup...)
 
-function outputsize(m::Chain, inputsizes::Tuple{Vararg{Integer}}...; padbatch=false)
-  x = nil_input(padbatch, inputsizes...)
-  for (i,lay) in enumerate(m.layers)
-    try
-      x = lay(x)
-    catch err
-      str = x isa AbstractArray ? "with input of size $(size(x))" : ""
-      @error "layer $lay, index $i in Chain, gave an error $str"
-      rethrow(err)
-    end
-  end
-  return size(x)
-end
 
 """
     outputsize(m, x_size, y_size, ...; padbatch=false)
@@ -148,9 +133,8 @@ outputsize(m::AbstractVector, input::Tuple...; padbatch=false) = outputsize(Chai
 ## bypass statistics in normalization layers
 
 for layer in (:BatchNorm, :InstanceNorm, :GroupNorm)  # LayerNorm works fine
-  @eval function (l::$layer)(x::AbstractArray{Nil})
-    l.chs == size(x, ndims(x)-1) || throw(DimensionMismatch(
-      string($layer, " expected ", l.chs, " channels, but got size(x) == ", size(x))))
+  @eval function (l::$layer)(x::AbstractArray{Nil,N}) where N
+    _size_check(l, x, N-1 => l.chs)
     x
   end
 end
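With the per-layer checks in place, the special `outputsize(::Chain, ...)` method that wrapped each layer call in `try`/`catch` and logged an `@error` is no longer needed: the plain `Nil`-array forward pass throws the friendly `DimensionMismatch` directly, and the norm-layer bypass reuses `_size_check` too. A rough illustration using that bypass (exact error rendering may differ):

```julia
julia> using Flux

julia> using Flux: outputsize

julia> outputsize(BatchNorm(3), (32, 32, 3, 16))   # Nil bypass, statistics never run
(32, 32, 3, 16)

julia> outputsize(BatchNorm(3), (32, 32, 5, 16))
ERROR: DimensionMismatch: layer BatchNorm(3) expects size(input, 3) == 3, but got 32×32×5×16 Array{Flux.NilNumber.Nil, 4}
```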

test/cuda/layers.jl

Lines changed: 1 addition & 1 deletion
@@ -115,7 +115,7 @@ dropout_layers = [Dropout, AlphaDropout]
 gpu_gradtest("Dropout", dropout_layers, r, 0.5f0; test_cpu = false) # dropout is not deterministic
 
 layer_norm = [LayerNorm]
-gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 1, test_cpu = false) #TODO fix errors
+gpu_gradtest("LayerNorm 1", layer_norm, rand(Float32, 28,28,3,4), 28, test_cpu = false) #TODO fix errors
gpu_gradtest("LayerNorm 2", layer_norm, rand(Float32, 5,4), 5)
 
 upsample = [x -> Upsample(scale=x)]
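The GPU test previously passed `1` as the `LayerNorm` size over a `28×28×3×4` input; under the new per-axis check a `Scale` of length 1 demands `size(x, 1) == 1`, so the test now uses `28`. Roughly (illustrative, CPU shown; error rendering may differ by Julia version):

```julia
julia> using Flux

julia> x = rand(Float32, 28, 28, 3, 4);

julia> LayerNorm(28)(x) |> size
(28, 28, 3, 4)

julia> LayerNorm(1)(x)
ERROR: DimensionMismatch: layer LayerNorm(1) expects size(input, 1) == 1, but got 28×28×3×4 Array{Float32, 4}
```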

test/outputsize.jl

Lines changed: 0 additions & 1 deletion
@@ -144,7 +144,6 @@ end
   @test outputsize(m, (32, 32, 3); padbatch=true) == (32, 32, 3, 1)
   m2 = LayerNorm(3, 2)
   @test outputsize(m2, (3, 2)) == (3, 2) == size(m2(randn(3, 2)))
-  @test outputsize(m2, (3,)) == (3, 2) == size(m2(randn(3, 2)))
 
   m = BatchNorm(3)
   @test outputsize(m, (32, 32, 3, 16)) == (32, 32, 3, 16)
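The deleted line exercised `outputsize` on a vector input to `LayerNorm(3, 2)`, relying on broadcasting to supply the missing second axis; the per-axis check now rejects that shape, which is presumably why the test was dropped. A rough sketch (exact error rendering may differ):

```julia
julia> using Flux

julia> using Flux: outputsize

julia> m2 = LayerNorm(3, 2);

julia> outputsize(m2, (3, 2))
(3, 2)

julia> outputsize(m2, (3,))
ERROR: DimensionMismatch: layer LayerNorm(3, 2) expects size(input, 2) == 2, but got 3-element Vector{Flux.NilNumber.Nil}
```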
