@@ -384,3 +384,80 @@ else
384384 ifelse (abs (b. divisor) == 1 , a* b. divisor, (signbit (x) + (x >> b. shift)) % Int64)
385385 end
386386end
387+
388+ # From: https://forums.developer.nvidia.com/t/a-faster-and-more-accurate-implementation-of-expm1f/48085/2
389+ # Original license copied below:
390+ # Copyright (c) 2015-2023 Norbert Juffa
391+ # All rights reserved.
392+ #
393+ # Redistribution and use in source and binary forms, with or without
394+ # modification, are permitted provided that the following conditions
395+ # are met:
396+ #
397+ # 1. Redistributions of source code must retain the above copyright
398+ # notice, this list of conditions and the following disclaimer.
399+ #
400+ # 2. Redistributions in binary form must reproduce the above copyright
401+ # notice, this list of conditions and the following disclaimer in the
402+ # documentation and/or other materials provided with the distribution.
403+ #
404+ # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
405+ # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
406+ # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
407+ # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
408+ # HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
409+ # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
410+ # LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
411+ # DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
412+ # THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
413+ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
414+ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
415+
416+ @device_override function Base. expm1 (a:: Float32 )
417+ # exp(a) = 2**i * exp(f); i = rintf (a / log(2))
418+ j = fma (1.442695f0 , a, 12582912.0f0 )
419+ j = j - 12582912.0f0
420+ i = unsafe_trunc (Int32, j)
421+ f = fma (j, - 6.93145752f-1 , a) # log_2_hi
422+ f = fma (j, - 1.42860677f-6 , f) # log_2_lo
423+
424+ # approximate r = exp(f)-1 on interval [-log(2)/2, +log(2)/2]
425+ s = f * f
426+ if a == 0.0f0
427+ s = a # ensure -0 is passed through
428+ end
429+
430+ # err = 0.997458 ulp1 = 11094007
431+ r = fma (1.98423862f-4 , f, 1.39347673f-3 )
432+ t = fma (8.33342969f-3 , f, 4.16667424f-2 )
433+ r = fma (r, s, t)
434+ r = fma (r, f, 1.66666701f-1 )
435+ r = fma (r, f, 4.99999970f-1 )
436+ # if i == 0, expm1(a) = z
437+ # if i == 1, expm1(a) = 2*(r*(f*f)+f+0.5)
438+ # if (i < 0) || (i > 1) expm1(a) = 2*((r*(f*f)+f)*t-0.5+t)
439+ u = (j == 1 ) ? (f + 0.5f0 ) : f
440+ v = fma (r, s, u)
441+ s = 0.5f0
442+ t = ldexp (s, i)
443+ y = t - s
444+ x = (t - y) - s # double-float canonicalization of difference
445+ r = fma (v, t, x) + y
446+ r = r + r
447+
448+ if j == 0
449+ r = v
450+ end
451+
452+ if j == 1
453+ r = v + v
454+ end
455+
456+ # handle severe overflow and underflow
457+ if abs (a - 1.0f0 ) > 88.0f0
458+ r = 2 ^ a
459+ r = fma (r, r, - 1.0f0 )
460+ end
461+
462+ return r
463+ end
0 commit comments