
Commit 68b165c

Add f16 (#2184)
* add f16
* rm duplicate ones32
* add ones16, rand16, etc
* better docstrings
* make _match_eltype noisy about 32 -> 16
* add a few tests
* add f16 to docs
* fixes
* more tests
* news
* also remove some adapt piracy
* fixes & tests
* Revert "add ones16, rand16, etc" (this reverts commit 7d2e8f1)
* rm a test
* fixup
* fix promotion in BatchNorm
1 parent 2d357ad commit 68b165c

File tree: 13 files changed, +178 -50 lines


NEWS.md

Lines changed: 3 additions & 0 deletions

````diff
@@ -1,5 +1,8 @@
 # Flux Release Notes
 
+## v0.13.13
+* Added `f16` which changes precision to `Float16`, recursively.
+
 ## v0.13.12
 * CUDA.jl 4.0 compatibility.
 
````
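
As a quick orientation for the entry above, here is a minimal sketch of how the new `f16` is meant to be used; the model and sizes are arbitrary, chosen only for illustration:

```julia
using Flux

m = Dense(3 => 2)              # Flux's default: Float32 parameters
m16 = f16(m)                   # recursively converts every parameter to Float16

eltype(m16.weight)             # Float16
eltype(m16(rand(Float16, 3)))  # Float16 output for Float16 input
```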

docs/src/utilities.md

Lines changed: 1 addition & 0 deletions

````diff
@@ -61,4 +61,5 @@ The `eltype` of model `m` can be changed to `Float64` by `f64(m)`:
 ```@docs
 Flux.f64
 Flux.f32
+Flux.f16
 ```
````

src/Flux.jl

Lines changed: 1 addition & 1 deletion

````diff
@@ -24,7 +24,7 @@ export Chain, Dense, Embedding, Maxout, SkipConnection, Parallel, PairwiseFusion
   AdaptiveMaxPool, AdaptiveMeanPool, GlobalMaxPool, GlobalMeanPool, MaxPool, MeanPool,
   Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
   Upsample, PixelShuffle,
-  fmap, cpu, gpu, f32, f64, rand32, randn32, zeros32, ones32,
+  fmap, cpu, gpu, f32, f64, f16, rand32, randn32, zeros32, ones32,
   testmode!, trainmode!
 
 include("optimise/Optimise.jl")
````

src/deprecations.jl

Lines changed: 2 additions & 0 deletions

````diff
@@ -84,6 +84,8 @@ Base.@deprecate_binding ADADelta AdaDelta
 # Remove sub-module Data, while making sure Flux.Data.DataLoader keeps working
 Base.@deprecate_binding Data Flux false "Sub-module Flux.Data has been removed. The only thing it contained may be accessed as Flux.DataLoader"
 
+@deprecate paramtype(T,m) _paramtype(T,m) false  # internal method, renamed to make this clear
+
 @deprecate rng_from_array() default_rng_value()
 
 function istraining()
````
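
For context, `@deprecate old new false` keeps the old name callable but does not export it: calls emit a deprecation warning (when depwarns are enabled) and forward to the new name. A hypothetical session, assuming some model `m`:

```julia
using Flux

m = Dense(2 => 2)

Flux.paramtype(Float32, m)   # still works, but warns and forwards to the renamed helper
Flux._paramtype(Float32, m)  # the internal method that f32/f64/f16 now call
```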

src/functor.jl

Lines changed: 67 additions & 27 deletions

````diff
@@ -146,24 +146,26 @@ ChainRulesCore.rrule(::typeof(adapt), a::FluxCUDAAdaptor, x::AbstractArray) =
 """
     cpu(m)
 
-Moves `m` onto the CPU, the opposite of [`gpu`](@ref).
+Copies `m` onto the CPU, the opposite of [`gpu`](@ref).
 Recurses into structs marked [`@functor`](@ref).
 
+# Example
 ```julia-repl
-julia> m = Dense(1,2)
-Dense(1, 2)
+julia> m_gpu = Dense(CUDA.randn(2, 5))
+Dense(5 => 2)  # 12 parameters
 
-julia> m_gpu = gpu(m)
-Dense(1, 2)
+julia> m_gpu.bias  # matches the given weight matrix
+2-element CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}:
+ 0.0
+ 0.0
 
-julia> typeof(m_gpu.W)
-CuArray{Float32, 2}
+julia> m = m_gpu |> cpu
+Dense(5 => 2)  # 12 parameters
 
-julia> m_cpu = cpu(m_gpu)
-Dense(1, 2)
-
-julia> typeof(m_cpu.W)
-Matrix{Float32}
+julia> m.bias
+2-element Vector{Float32}:
+ 0.0
+ 0.0
 ```
 """
 cpu(x) = fmap(x -> adapt(FluxCPUAdaptor(), x), x, exclude = _isleaf)
@@ -178,24 +180,32 @@ _isleaf(x) = _isbitsarray(x) || Functors.isleaf(x)
 """
     gpu(x)
 
-Moves `m` to the current GPU device, if available. It is a no-op otherwise.
+Copies `m` to the current GPU device, if one is available.
+If no GPU is available, it does nothing (but prints a warning the first time).
+
+On arrays, this calls CUDA's `cu`, which also changes arrays
+with Float64 elements to Float32 while copying them to the device.
+To act on arrays within a struct, the struct type must be marked with [`@functor`](@ref).
+
+Use [`cpu`](@ref) to copy back to ordinary `Array`s.
+See also [`f32`](@ref) and [`f16`](@ref) to change element type only.
+
 See the [CUDA.jl docs](https://juliagpu.github.io/CUDA.jl/stable/usage/multigpu/)
 to help identify the current device.
 
-This works for functions, and any struct marked with [`@functor`](@ref).
-
+# Example
 ```julia-repl
-julia> m = Dense(1,2)
-Dense(1, 2)
+julia> m = Dense(rand(2, 3))  # constructed with Float64 weight matrix
+Dense(3 => 2)  # 8 parameters
 
-julia> typeof(m.W)
-Matrix{Float32}
+julia> typeof(m.weight)
+Matrix{Float64} (alias for Array{Float64, 2})
 
-julia> m_gpu = gpu(m)
-Dense(1, 2)
+julia> m_gpu = gpu(m)  # can equivalently be written m_gpu = m |> gpu
+Dense(3 => 2)  # 8 parameters
 
-julia> typeof(m_gpu.W)  # notice the type of the array changed to a CuArray
-CuArray{Float32, 2}
+julia> typeof(m_gpu.weight)
+CUDA.CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}
 ```
 """
 function gpu(x)
@@ -216,25 +226,55 @@ ChainRulesCore.@non_differentiable check_use_cuda()
 
 # Precision
 
-adapt_storage(T::Type{<:Real}, xs::AbstractArray{<:Real}) = convert.(T, xs) # piracy
+struct FluxEltypeAdaptor{T} end
+
+Adapt.adapt_storage(::FluxEltypeAdaptor{T}, x::AbstractArray{<:Number}) where T = convert(AbstractArray{T}, x)
 
-paramtype(T::Type{<:Real}, m) = fmap(x -> adapt(T, x), m)
+_paramtype(::Type{T}, m) where T = fmap(adapt(FluxEltypeAdaptor{T}()), m)
+_paramtype(::Type{T}, x::AbstractArray{<:Real}) where {T} = convert(AbstractArray{T}, x)
 
 """
     f32(m)
 
 Converts the `eltype` of model's parameters to `Float32` (which is Flux's default).
 Recurses into structs marked with [`@functor`](@ref).
+See also [`f64`](@ref) and [`f16`](@ref).
 """
-f32(m) = paramtype(Float32, m)
+f32(m) = _paramtype(Float32, m)
 
 """
     f64(m)
 
 Converts the `eltype` of model's parameters to `Float64`.
 Recurses into structs marked with [`@functor`](@ref).
"""
-f64(m) = paramtype(Float64, m)
+f64(m) = _paramtype(Float64, m)
+
+"""
+    f16(m)
+
+Converts the `eltype` of model's parameters to `Float16`.
+Recurses into structs marked with [`@functor`](@ref).
+
+Support for `Float16` is limited on many CPUs. Julia may
+convert to `Float32` for each operation, which is slow.
+
+# Example
+```jldoctest
+julia> m = Chain(Dense(784, 2048, relu), Dense(2048, 10))  # all Float32
+Chain(
+  Dense(784 => 2048, relu),  # 1_607_680 parameters
+  Dense(2048 => 10),         # 20_490 parameters
+)  # Total: 4 arrays, 1_628_170 parameters, 6.211 MiB.
+
+julia> m |> f16  # takes half the memory
+Chain(
+  Dense(784 => 2048, relu),  # 1_607_680 parameters
+  Dense(2048 => 10),         # 20_490 parameters
+)  # Total: 4 arrays, 1_628_170 parameters, 3.106 MiB.
+```
+"""
+f16(m) = _paramtype(Float16, m)
 
 # Functors for certain Julia data structures
 @functor Cholesky
````
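
The adaptor-based rewrite above is what lets `f16`, `f32` and `f64` walk arbitrary models: `fmap` recurses through anything marked with `@functor`, and `adapt(FluxEltypeAdaptor{T}())` converts each numeric array it reaches. A small sketch under those assumptions; the `Autoencoder` container is hypothetical, used only to show the recursion:

```julia
using Flux

# A custom container, marked with @functor so that fmap (and hence f16) can see inside it.
struct Autoencoder{E,D}
    encoder::E
    decoder::D
end
Flux.@functor Autoencoder

model = Autoencoder(Dense(8 => 4, relu), Dense(4 => 8))

model16 = f16(model)                 # same structure, half-precision arrays throughout
eltype(model16.encoder.weight)       # Float16
eltype(model16.decoder.bias)         # Float16

eltype(f32(model16).encoder.weight)  # Float32; each call simply re-adapts every array
```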

src/layers/normalise.jl

Lines changed: 4 additions & 2 deletions

````diff
@@ -194,7 +194,8 @@ function (a::LayerNorm)(x::AbstractArray)
       _size_check(a, x, d => size(a.diag.scale, d))
     end
   end
-  a.diag(normalise(x, dims=1:length(a.size), ϵ=a.ϵ))
+  eps = convert(float(eltype(x)), a.ϵ)  # avoids promotion for Float16 data, but should ε change too?
+  a.diag(normalise(x, dims=1:length(a.size), ϵ=eps))
 end
 
 function Base.show(io::IO, l::LayerNorm)
@@ -223,7 +224,8 @@ function _norm_layer_forward(
     end
   end
 
-  o = _norm_layer_forward(x, μ, σ², l.ϵ)
+  eps = convert(float(T), l.ϵ)
+  o = _norm_layer_forward(x, μ, σ², eps)
   hasaffine(l) || return l.λ.(o)
 
   γ = reshape(l.γ, affine_shape)
````
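
The point of converting `ϵ` in both places is promotion: the layer stores `ϵ` as a wider float, and mixing a `Float32` or `Float64` scalar into `Float16` arithmetic promotes the whole result, which would silently undo `f16`. A standalone illustration of Julia's promotion rule (not Flux code):

```julia
x = Float16[1.0, 2.0, 3.0]

eltype(x .+ 1f-5)           # Float32: the Float32 epsilon promotes the result
eltype(x .+ Float16(1f-5))  # Float16: converting the epsilon first keeps half precision
```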

src/layers/stateless.jl

Lines changed: 7 additions & 0 deletions

````diff
@@ -79,6 +79,13 @@ function _match_eltype(layer, ::Type{Float32}, x::AbstractArray{Float64})
   convert(AbstractArray{Float32}, x)
 end
 
+# Bug in Float16 use?
+function _match_eltype(layer, ::Type{Float16}, x::AbstractArray{Float32})
+  @warn "Layer with Float16 parameters got Float32 input.
+    The input will be converted, but may indicate a problem in earlier layers." layer summary(x) maxlog=1
+  convert(AbstractArray{Float16}, x)
+end
+
 # Allow OneHot to reach specialisation of * etc:
 _match_eltype(layer, ::Type, x::OneHotLike) = x
 
````
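
A sketch of the behaviour this new method adds, assuming Flux at this commit: a layer whose parameters are `Float16` still accepts `Float32` input, but converts it and logs a one-time warning so the eltype mismatch is visible.

```julia
using Flux

m16 = f16(Dense(3 => 2))
x32 = rand(Float32, 3)

y = m16(x32)  # works: the input is converted by _match_eltype, with a one-time @warn
eltype(y)     # Float16, i.e. computation happens at the parameters' precision
```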

src/utils.jl

Lines changed: 0 additions & 3 deletions

````diff
@@ -468,9 +468,6 @@ identity_init(rng::AbstractRNG=default_rng_value(); init_kwargs...) = (args...;k
 
 ChainRulesCore.@non_differentiable identity_init(::Any...)
 
-ones32(dims::Integer...) = Base.ones(Float32, dims...)
-zeros32(dims::Integer...) = Base.zeros(Float32, dims...)
-
 """
     ones32(size...) = ones(Float32, size...)
````

test/cuda/cudnn.jl

Lines changed: 10 additions & 6 deletions

````diff
@@ -2,24 +2,28 @@ using Flux, CUDA, Test
 using Flux: pullback
 
 @testset "CUDNN BatchNorm" begin
-  @testset "4D Input" begin
-    x = rand(Float32, 2, 2, 3, 4)
-    m = BatchNorm(3)
+  @testset "4D Input, $T" for (T,f) in [(Float32, identity), (Float16, f16)]
+    x = randn(T, 2, 2, 3, 4)
+    m = f(BatchNorm(3))
     gx = gpu(x)
     gm = gpu(m)
 
     y, back = pullback((m, x) -> m(x), m, x)
     gy, gback = pullback((m, x) -> m(x), gm, gx)
 
-    @test cpu(gy) ≈ y
+    @test cpu(gy) ≈ y  rtol=1e-3
+    @test eltype(gy) == T
+    @test eltype(gm(gx)) == T
 
-    Δ = randn(Float32, size(y))
+    Δ = randn(T, size(y))
     dm, dx = back(Δ)
-    gdm, gdx = gback(gpu(Δ))
+    gdm, gdx = gback(f(gpu(Δ)))
 
     @test dm[].γ ≈ cpu(gdm[].γ)
     @test dm[].β ≈ cpu(gdm[].β)
     @test dx ≈ cpu(gdx)
+    @test eltype(gdm[].γ) == T
+    @test eltype(gdx) == T
   end
 
   @testset "2D Input" begin
````

test/cuda/layers.jl

Lines changed: 48 additions & 0 deletions

````diff
@@ -290,3 +290,51 @@ end
     @test gpu(m).rng isa CUDA.RNG
   end
 end
+
+@testset "Misc. Float16" begin
+  # These tests are very far from exhaustive!
+
+  x = randn(Float16, 3, 4)
+  gx = gpu(x)
+
+  # Dense
+  m1 = f16(Dense(3 => 4, tanh))
+  gm1 = gpu(m1)
+
+  y1, back1 = Zygote.pullback(|>, x, m1)
+  gy1, gback1 = Zygote.pullback(|>, gx, gm1)
+
+  @test y1 ≈ m1(x) ≈ cpu(gy1)
+  @test eltype(y1) == eltype(m1(x)) == eltype(gy1) == Float16
+
+  @test back1(one.(y1))[2].weight ≈ cpu(gback1(one.(gy1))[2].weight)
+  @test eltype(gback1(one.(gy1))[2].bias) == Float16
+
+  # A fake loss with Float32
+  f1(x) = sum((Float32.(x) .- 1).^2)
+  @test gradient(f1, x)[1] ≈ cpu(gradient(f1, gx)[1])
+  @test eltype(gradient(f1, gx)[1]) == Float16
+
+  # Normalisation
+  m2 = Chain(LayerNorm(3), Dropout(0.1)) |> f16
+  gm2 = m2 |> gpu
+  @test m2(x) ≈ cpu(gm2(gx))
+  @test eltype(m2(x)) == Float16
+  @test eltype(gm2(gx)) == Float16
+
+  # Conv
+  x3 = randn(Float16, 7, 2, 1)
+  m3 = Conv((3,), 2=>1, sigmoid, pad=1, stride=2) |> f16
+  @test m3(x3) ≈ f16(f32(m3)(f32(x3))) ≈ cpu(gpu(m3)(gpu(x3)))
+  @test eltype(m3(x3)) == Float16
+  dw = gradient((m,x) -> sum(abs2, m(x)), m3, x3)[1].weight
+  @test dw ≈ f16(gradient((m,x) -> sum(abs2, m(x)), f32(m3), f32(x3))[1].weight)
+  @test dw ≈ cpu(gradient((m,x) -> sum(abs2, m(x)), gpu(m3), gpu(x3))[1].weight)
+  @test eltype(dw) == Float16
+
+  # Pooling
+  for pool in [MaxPool((2,)), MeanPool((2,))]
+    pool(reshape(x,3,4,1)) ≈ cpu(pool(reshape(gx,3,4,1)))
+    @test eltype(pool(reshape(gx,3,4,1))) == Float16
+  end
+end
````
