log1pexp (#37)

cossio · devmotion · web-flow · commit 4e50c5441e18 · 2022-03-12T01:38:40.000+01:00
* log1pexp(x) for x < -37 Based on https://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf * generic log1pexp * simpler * generic threshold * support Julia 1.0 expm1(::Float16) was not defined in Julia 1.0 So it's better (and more accurate) to compute the threshold in BigFloat, and convert to the appropriate float type in the end. Since this is generated it doesn't cost in terms of performance. * special thresholds * add exp branch * simplify thresholds * more comments * more tests * inline * generic x2 * comment * unnecessary broadcast * oftype -> float since x0 is of type float(x) anyway * comment * < instead of <= * x = float(_x) * hard-coded bounds for FLoat32, Float64 * typo * comment * typo * comment * compiler is smart enough we don't need generated thresholds! * special case log1pexp(x::BigFloat) dynamic thresholds for log1pexp(x::BigFloat) are slow, so use slower but accurate implementation in this case * rewrite comment * don't need float(x) * rewrite _log1pexp_thresholds (more readable?) * Float16 and typo * test at +/- 1 * comment tests * typo Co-authored-by: David Widmann <devmotion@users.noreply.github.com> * one-line Co-authored-by: David Widmann <devmotion@users.noreply.github.com> * better comments * test log1pexp with multiple precisions * bump version * hard-code same thresholds as given by generic fallback * Final fixes Co-authored-by: David Widmann <devmotion@users.noreply.github.com>
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "LogExpFunctions"
 uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
 authors = ["StatsFun.jl contributors, Tamas K. Papp <tkpapp@gmail.com>"]
-version = "0.3.7"
+version = "0.3.8"
 
 [deps]
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
diff --git a/src/basicfuns.jl b/src/basicfuns.jl
@@ -152,9 +152,59 @@ Return `log(1+exp(x))` evaluated carefully for largish `x`.
 
 This is also called the ["softplus"](https://en.wikipedia.org/wiki/Rectifier_(neural_networks))
 transformation, being a smooth approximation to `max(0,x)`. Its inverse is [`logexpm1`](@ref).
+
+See:
+ * Martin Maechler (2012) [“Accurately Computing log(1 − exp(− |a|))”](http://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf)
 """
-log1pexp(x::Real) = x < 18.0 ? log1p(exp(x)) : x < 33.3 ? x + exp(-x) : oftype(exp(-x), x)
-log1pexp(x::Float32) = x < 9.0f0 ? log1p(exp(x)) : x < 16.0f0 ? x + exp(-x) : oftype(exp(-x), x)
+log1pexp(x::Real) = _log1pexp(float(x)) # ensures that BigInt/BigFloat, Int/Float64 etc. dispatch to the same algorithm
+
+# Piecewise evaluation of log(1 + exp(x)), with approximations based on Maechler (2012).
+# Argument `x` is a floating point number due to the definition of `log1pexp` above.
+function _log1pexp(x::Real)
+    x0, x1, x2 = _log1pexp_thresholds(x) # cut-offs between the four branches below
+    if x < x0
+        return exp(x) # exp(x) is tiny here, so log1p(exp(x)) ≈ exp(x)
+    elseif x < x1
+        return log1p(exp(x)) # direct formula
+    elseif x < x2
+        return x + exp(-x) # log1p(exp(x)) == x + log1p(exp(-x)) ≈ x + exp(-x)
+    else
+        return x # x ≥ x2; a NaN input also ends up here because every `<` test is false
+    end
+end
+
+#= The precision of BigFloat cannot be computed from the type only and computing
+thresholds is slow. Therefore prefer the version without thresholds in this case. =#
+_log1pexp(x::BigFloat) = x > 0 ? x + log1p(exp(-x)) : log1p(exp(x)) # exp is evaluated at -|x| in both branches
+
+#=
+Returns thresholds x0, x1, x2 such that (matching the `<` comparisons in `_log1pexp`):
+
+    * log1pexp(x) ≈ exp(x) for x < x0
+    * log1pexp(x) ≈ log1p(exp(x)) for x0 ≤ x < x1
+    * log1pexp(x) ≈ x + exp(-x) for x1 ≤ x < x2
+    * log1pexp(x) ≈ x for x ≥ x2
+
+where the tolerances of the approximations are on the order of eps(typeof(x)).
+For types for which `precision(x)` depends only on the type of `x`, the compiler
+should optimize away all computations done here.
+=#
+@inline function _log1pexp_thresholds(x::Real)
+    prec = precision(x) # number of significand bits of x
+    logtwo = oftype(x, IrrationalConstants.logtwo) # log(2) converted to the type of x
+    x0 = -prec * logtwo # exp(x0) == 2^-prec
+    x1 = (prec - 1) * logtwo / 2 # exp(-2 * x1) == 2^-(prec - 1)
+    x2 = -x0 - log(-x0) * (1 + 1 / x0) # approximate root of e^-x == x * ϵ/2 via asymptotics of Lambert's W function
+    return (x0, x1, x2)
+end
+
+#=
+For common types we hard-code the thresholds to make absolutely sure they are not recomputed
+each time. Also, the generic _log1pexp_thresholds is not elided by the compiler in Julia 1.0 / 1.6.
+=#
+@inline _log1pexp_thresholds(::Float64) = (-36.7368005696771, 18.021826694558577, 33.23111882352963) # generic formula at precision 53
+@inline _log1pexp_thresholds(::Float32) = (-16.635532f0, 7.9711924f0, 13.993f0) # generic formula at precision 24
+@inline _log1pexp_thresholds(::Float16) = (Float16(-7.625), Float16(3.467), Float16(5.86)) # generic formula at precision 11
 
 """
 $(SIGNATURES)
diff --git a/test/basicfuns.jl b/test/basicfuns.jl
@@ -110,15 +110,37 @@ end
 # log1pexp, log1mexp, log2mexp & logexpm1
 
 @testset "log1pexp" begin
-    @test log1pexp(2.0)    ≈ log(1.0 + exp(2.0))
-    @test log1pexp(-2.0)   ≈ log(1.0 + exp(-2.0))
-    @test log1pexp(10000)  ≈ 10000.0
-    @test log1pexp(-10000) ≈ 0.0
-
-    @test log1pexp(2f0)      ≈ log(1f0 + exp(2f0))
-    @test log1pexp(-2f0)     ≈ log(1f0 + exp(-2f0))
-    @test log1pexp(10000f0)  ≈ 10000f0
-    @test log1pexp(-10000f0) ≈ 0f0
+    for T in (Float16, Float32, Float64, BigFloat), x in 1:40 # identities: log1pexp(±log(x)) == log1p(x^±1)
+        @test (@inferred log1pexp(+log(T(x)))) ≈ T(log1p(big(x)))
+        @test (@inferred log1pexp(-log(T(x)))) ≈ T(log1p(1/big(x)))
+    end
+
+    # special values: log1pexp(0) == log(2), log1pexp(1) == log1p(ℯ), log1pexp(-1) == log1p(ℯ) - 1
+    @test (@inferred log1pexp(0)) ≈ log(2)
+    @test (@inferred log1pexp(0f0)) ≈ log(2)
+    @test (@inferred log1pexp(big(0))) ≈ log(2)
+    @test (@inferred log1pexp(+1)) ≈ log1p(ℯ)
+    @test (@inferred log1pexp(-1)) ≈ log1p(ℯ) - 1
+
+    # large arguments: result is ≈ x for large positive x and exactly zero for large negative x
+    @test (@inferred log1pexp(1e4)) ≈ 1e4
+    @test (@inferred log1pexp(1f4)) ≈ 1f4
+    @test iszero(@inferred log1pexp(-1e4))
+    @test iszero(@inferred log1pexp(-1f4))
+
+    # compare to accurate but slower implementation (threshold-free reference formula)
+    correct_log1pexp(x::Real) = x > 0 ? x + log1p(exp(-x)) : log1p(exp(x))
+    # large range needed to cover all branches, for all floats (from Float16 to BigFloat)
+    for T in (Int, Float16, Float32, Float64, BigInt, BigFloat), x in -300:300
+        @test (@inferred log1pexp(T(x))) ≈ float(T)(correct_log1pexp(big(x)))
+    end
+    # test BigFloat with multiple precisions
+    for prec in (10, 20, 50, 100), x in -300:300
+        setprecision(prec) do
+            y = big(float(x)) # BigFloat created at the precision set by `setprecision` above
+            @test @inferred(log1pexp(y)) ≈ correct_log1pexp(y)
+        end
+    end
 end
 
 @testset "log1mexp" begin
diff --git a/test/chainrules.jl b/test/chainrules.jl
@@ -57,14 +57,11 @@
         test_rrule(logcosh, x)
     end
 
-    # test all branches of `log1pexp`
-    for x in (-20.9, 15.4, 41.5)
-        test_frule(log1pexp, x)
-        test_rrule(log1pexp, x)
-    end
-    for x in (8.3f0, 12.5f0, 21.2f0)
-        test_frule(log1pexp, x; rtol=1f-3, atol=1f-3)
-        test_rrule(log1pexp, x; rtol=1f-3, atol=1f-3)
+    @testset "log1pexp" begin
+        for absx in (0, 1, 2, 10, 15, 20, 40), x in (-absx, absx) # hits every branch of `log1pexp` for Float64 and Float32
+            test_scalar(log1pexp, Float64(x))
+            test_scalar(log1pexp, Float32(x); rtol=1f-3, atol=1f-3)
+        end
     end
 
     for x in (-10.2, -3.3, -0.3)