
Commit 3c8cceb

Let AlphaDropout join the RNG fun
1 parent 742341c commit 3c8cceb

3 files changed: +47 -34 lines changed

src/layers/normalise.jl

Lines changed: 7 additions & 5 deletions
```diff
@@ -93,7 +93,7 @@ function Base.show(io::IO, d::Dropout)
 end
 
 """
-    AlphaDropout(p)
+    AlphaDropout(p; rng = default_rng())
 
 A dropout layer. Used in
 [Self-Normalizing Neural Networks](https://arxiv.org/abs/1706.02515).
@@ -102,14 +102,16 @@ remain the same as before.
 
 Does nothing to the input once [`testmode!`](@ref) is true.
 """
-mutable struct AlphaDropout{F}
+mutable struct AlphaDropout{F,R<:AbstractRNG}
   p::F
   active::Union{Bool, Nothing}
-  function AlphaDropout(p, active = nothing)
+  rng::R
+  function AlphaDropout(p, active = nothing, rng = Random.default_rng())
     @assert 0 ≤ p ≤ 1
-    new{typeof(p)}(p, active)
+    new{typeof(p)}(p, active, rng)
   end
 end
+AlphaDropout(p; rng = Random.default_rng()) = AlphaDropout(p, nothing, rng)
 
 function (a::AlphaDropout)(x::AbstractArray{T}) where T
   _isactive(a) || return x
@@ -121,7 +123,7 @@ function (a::AlphaDropout)(x::AbstractArray{T}) where T
   A = T(inv(sqrt((1 - p) * (1 + p * α′^2))))
   B = T(-A * α′ * p)
 
-  noise = rand!(similar(x))
+  noise = rand!(a.rng, similar(x))
   return A .* ifelse.(noise .> p, x, α′) .+ B
 end
```
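With this change, `AlphaDropout` stores an RNG and draws its noise from `a.rng` rather than the global default, mirroring what `Dropout` already supports. A minimal usage sketch (the seed is arbitrary; `trainmode!` is used to force the dropout path outside of a training loop):

```julia
using Flux, Random

# Without the keyword, behaviour is unchanged: noise comes from
# Random.default_rng().
m = AlphaDropout(0.5)

# With an explicit RNG, the noise mask is reproducible.
m1 = AlphaDropout(0.5; rng = MersenneTwister(123))
m2 = AlphaDropout(0.5; rng = MersenneTwister(123))
trainmode!(m1); trainmode!(m2)  # activate dropout outside of training

x = randn(Float32, 10)
m1(x) == m2(x)  # true: identical seeds draw identical noise masks
```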

test/cuda/layers.jl

Lines changed: 6 additions & 4 deletions
```diff
@@ -282,8 +282,10 @@ end
 end
 
 @testset "Dropout RNGs" begin
-  m = Dropout(0.1; rng = MersenneTwister(123))
-  @test_throws ErrorException gpu(m)
-  m = Dropout(0.1; rng = CUDA.default_rng())
-  @test gpu(m).rng === CUDA.default_rng()
+  @testset for layer in (Dropout, AlphaDropout)
+    m = layer(0.1; rng = MersenneTwister(123))
+    @test_throws ErrorException gpu(m)
+    m = layer(0.1; rng = CUDA.default_rng())
+    @test gpu(m).rng === CUDA.default_rng()
+  end
 end
```
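The testset above encodes the intended `gpu` semantics for RNG-carrying layers: a CPU-only RNG such as `MersenneTwister` cannot generate noise on the device, so moving the layer raises an error, while a CUDA-native RNG is carried across by reference. A sketch of the same behaviour outside the test harness (assumes a CUDA-capable setup):

```julia
using Flux, CUDA, Random

# A CPU-only RNG is rejected when the layer is moved to the GPU.
m_cpu = AlphaDropout(0.1; rng = MersenneTwister(123))
# gpu(m_cpu)  # throws ErrorException, as the testset checks

# A CUDA-native RNG survives the move unchanged.
m_gpu = AlphaDropout(0.1; rng = CUDA.default_rng())
gpu(m_gpu).rng === CUDA.default_rng()  # true
```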

test/layers/normalisation.jl

Lines changed: 34 additions & 25 deletions
```diff
@@ -67,31 +67,40 @@ evalwgrad(f, x...) = pullback(f, x...)[1]
 end
 
 @testset "AlphaDropout" begin
-  x = [1., 2., 3.]
-  @test x == AlphaDropout(0.1)(x)
-  @test x == evalwgrad(AlphaDropout(0), x)
-  @test zero(x) == evalwgrad(AlphaDropout(1), x)
-
-  x = randn(1000) # large enough to prevent flaky test
-  m = AlphaDropout(0.5)
-
-  y = evalwgrad(m, x)
-  # Should preserve unit mean and variance
-  @test mean(y) ≈ 0 atol=0.1
-  @test var(y) ≈ 1 atol=0.1
-
-  testmode!(m, true) # should override istraining
-  @test evalwgrad(m, x) == x
-
-  testmode!(m, false)
-  y = evalwgrad(m, x)
-  @test mean(y) ≈ 0 atol=0.1
-  @test var(y) ≈ 1 atol=0.1
-
-  # Known good value ranges
-  # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338
-  x = ones(100)
-  @test 40 < sum(evalwgrad(m, x)) < 130
+  @testset for rng_kwargs in ((), (; rng = MersenneTwister(123)))
+    x = [1., 2., 3.]
+    @test x == AlphaDropout(0.1; rng_kwargs...)(x)
+    @test x == evalwgrad(AlphaDropout(0; rng_kwargs...), x)
+    @test zero(x) == evalwgrad(AlphaDropout(1; rng_kwargs...), x)
+
+    x = randn(1000) # large enough to prevent flaky test
+    m = AlphaDropout(0.5; rng_kwargs...)
+
+    y = evalwgrad(m, x)
+    # Should preserve unit mean and variance
+    @test mean(y) ≈ 0 atol=0.1
+    @test var(y) ≈ 1 atol=0.1
+
+    testmode!(m, true) # should override istraining
+    @test evalwgrad(m, x) == x
+
+    testmode!(m, false)
+    y = evalwgrad(m, x)
+    @test mean(y) ≈ 0 atol=0.1
+    @test var(y) ≈ 1 atol=0.1
+
+    # Known good value ranges
+    # Values taken from https://github.com/pytorch/pytorch/blob/v1.10.0/test/cpp/api/modules.cpp#L1337-L1338
+    x = ones(100)
+    @test 40 < sum(evalwgrad(m, x)) < 130
+
+    # CPU RNGs map onto CPU ok
+    if isempty(rng_kwargs)
+      @test cpu(m).rng === Random.default_rng()
+    else
+      @test cpu(m).rng === only(values(rng_kwargs))
+    end
+  end
 end
 
 @testset "BatchNorm" begin
```
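One idiom worth noting in the updated testset: iterating over `((), (; rng = MersenneTwister(123)))` and splatting `rng_kwargs...` into each constructor runs the whole body twice, once with no keyword arguments (default RNG) and once with an explicit seed, while `only(values(rng_kwargs))` recovers the RNG that was passed in. A standalone illustration of the splatting trick, with hypothetical names (`f`, `kwargs_a`, `kwargs_b` are not from the test file):

```julia
using Random

f(; rng = Random.default_rng()) = rng  # stand-in for a keyword-taking constructor

kwargs_a = ()                            # splats to no keyword arguments
kwargs_b = (; rng = MersenneTwister(1))  # splats to rng = MersenneTwister(1)

f(; kwargs_a...) === Random.default_rng()   # true: the default applies
f(; kwargs_b...) isa MersenneTwister        # true: the seeded RNG is passed through
only(values(kwargs_b)) isa MersenneTwister  # true: recovers the single value
```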
