Skip to content

Commit 82883a0

Browse files
authored
Fix rounding of Float32->Float16 near eps(Float16(0.0)) (#32444)
* Move F16 table to a better place * Fix rounding of Float32->Float16 near eps(Float16(0.0)) The originally described algorithm worked only for round-to-zero, and our adjustment to make it round-to-nearest was incorrect for values that result in Float16 subnormals. Fix this at the expense of an extra `|` and `&`.
2 parents 0eed27e + df9eb7d commit 82883a0

File tree

3 files changed

+66
-45
lines changed

3 files changed

+66
-45
lines changed

base/float.jl

Lines changed: 53 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -137,16 +137,61 @@ function Float32(x::Int128)
137137
reinterpret(Float32, s | d + y)
138138
end
139139

140+
# Float32 -> Float16 algorithm from:
141+
# "Fast Half Float Conversion" by Jeroen van der Zijp
142+
# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
143+
#
144+
# With adjustments for round-to-nearest, ties to even.
145+
#
146+
let _basetable = Vector{UInt16}(undef, 512),
147+
_shifttable = Vector{UInt8}(undef, 512)
148+
for i = 0:255
149+
e = i - 127
150+
if e < -25 # Very small numbers map to zero
151+
_basetable[i|0x000+1] = 0x0000
152+
_basetable[i|0x100+1] = 0x8000
153+
_shifttable[i|0x000+1] = 25
154+
_shifttable[i|0x100+1] = 25
155+
elseif e < -14 # Small numbers map to denorms
156+
_basetable[i|0x000+1] = 0x0000
157+
_basetable[i|0x100+1] = 0x8000
158+
_shifttable[i|0x000+1] = -e-1
159+
_shifttable[i|0x100+1] = -e-1
160+
elseif e <= 15 # Normal numbers just lose precision
161+
_basetable[i|0x000+1] = ((e+15)<<10)
162+
_basetable[i|0x100+1] = ((e+15)<<10) | 0x8000
163+
_shifttable[i|0x000+1] = 13
164+
_shifttable[i|0x100+1] = 13
165+
elseif e < 128 # Large numbers map to Infinity
166+
_basetable[i|0x000+1] = 0x7C00
167+
_basetable[i|0x100+1] = 0xFC00
168+
_shifttable[i|0x000+1] = 24
169+
_shifttable[i|0x100+1] = 24
170+
else # Infinity and NaN's stay Infinity and NaN's
171+
_basetable[i|0x000+1] = 0x7C00
172+
_basetable[i|0x100+1] = 0xFC00
173+
_shifttable[i|0x000+1] = 13
174+
_shifttable[i|0x100+1] = 13
175+
end
176+
end
177+
global const shifttable = (_shifttable...,)
178+
global const basetable = (_basetable...,)
179+
end
180+
140181
function Float16(val::Float32)
141182
f = reinterpret(UInt32, val)
142183
if isnan(val)
143184
t = 0x8000 (0x8000 & ((f >> 0x10) % UInt16))
144185
return reinterpret(Float16, t ((f >> 0xd) % UInt16))
145186
end
146-
i = (f >> 23) & 0x1ff + 1
187+
i = ((f & ~significand_mask(Float32)) >> significand_bits(Float32)) + 1
147188
@inbounds sh = shifttable[i]
148-
f &= 0x007fffff
149-
@inbounds h = (basetable[i] + (f >> sh)) % UInt16
189+
f &= significand_mask(Float32)
190+
# If `val` is subnormal, the tables are set up to force the
191+
# result to 0, so the significand has an implicit `1` in the
192+
# cases we care about.
193+
f |= significand_mask(Float32) + 0x1
194+
@inbounds h = (basetable[i] + (f >> sh) & significand_mask(Float16)) % UInt16
150195
# round
151196
# NOTE: we maybe should ignore NaNs here, but the payload is
152197
# getting truncated anyway so "rounding" it might not matter
@@ -202,45 +247,6 @@ function Float32(val::Float16)
202247
return reinterpret(Float32, ret)
203248
end
204249

205-
# Float32 -> Float16 algorithm from:
206-
# "Fast Half Float Conversion" by Jeroen van der Zijp
207-
# ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
208-
209-
let _basetable = Vector{UInt16}(undef, 512),
210-
_shifttable = Vector{UInt8}(undef, 512)
211-
for i = 0:255
212-
e = i - 127
213-
if e < -24 # Very small numbers map to zero
214-
_basetable[i|0x000+1] = 0x0000
215-
_basetable[i|0x100+1] = 0x8000
216-
_shifttable[i|0x000+1] = 24
217-
_shifttable[i|0x100+1] = 24
218-
elseif e < -14 # Small numbers map to denorms
219-
_basetable[i|0x000+1] = (0x0400>>(-e-14))
220-
_basetable[i|0x100+1] = (0x0400>>(-e-14)) | 0x8000
221-
_shifttable[i|0x000+1] = -e-1
222-
_shifttable[i|0x100+1] = -e-1
223-
elseif e <= 15 # Normal numbers just lose precision
224-
_basetable[i|0x000+1] = ((e+15)<<10)
225-
_basetable[i|0x100+1] = ((e+15)<<10) | 0x8000
226-
_shifttable[i|0x000+1] = 13
227-
_shifttable[i|0x100+1] = 13
228-
elseif e < 128 # Large numbers map to Infinity
229-
_basetable[i|0x000+1] = 0x7C00
230-
_basetable[i|0x100+1] = 0xFC00
231-
_shifttable[i|0x000+1] = 24
232-
_shifttable[i|0x100+1] = 24
233-
else # Infinity and NaN's stay Infinity and NaN's
234-
_basetable[i|0x000+1] = 0x7C00
235-
_basetable[i|0x100+1] = 0xFC00
236-
_shifttable[i|0x000+1] = 13
237-
_shifttable[i|0x100+1] = 13
238-
end
239-
end
240-
global const shifttable = (_shifttable...,)
241-
global const basetable = (_basetable...,)
242-
end
243-
244250
#convert(::Type{Float16}, x::Float32) = fptrunc(Float16, x)
245251
Float32(x::Float64) = fptrunc(Float32, x)
246252
Float16(x::Float64) = Float16(Float32(x))
@@ -867,6 +873,11 @@ exponent_one(::Type{Float16}) = 0x3c00
867873
exponent_half(::Type{Float16}) = 0x3800
868874
significand_mask(::Type{Float16}) = 0x03ff
869875

876+
for T in (Float16, Float32, Float64)
877+
@eval significand_bits(::Type{$T}) = $(trailing_ones(significand_mask(T)))
878+
@eval exponent_bits(::Type{$T}) = $(sizeof(T)*8 - significand_bits(T) - 1)
879+
end
880+
870881
# integer size of float
871882
uinttype(::Type{Float64}) = UInt64
872883
uinttype(::Type{Float32}) = UInt32

base/math.jl

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,8 @@ import .Base: log, exp, sin, cos, tan, sinh, cosh, tanh, asin,
2121
exp10, expm1, log1p
2222

2323
using .Base: sign_mask, exponent_mask, exponent_one,
24-
exponent_half, uinttype, significand_mask
24+
exponent_half, uinttype, significand_mask,
25+
significand_bits, exponent_bits
2526

2627
using Core.Intrinsics: sqrt_llvm
2728

@@ -38,8 +39,6 @@ end
3839
end
3940

4041
for T in (Float16, Float32, Float64)
41-
@eval significand_bits(::Type{$T}) = $(trailing_ones(significand_mask(T)))
42-
@eval exponent_bits(::Type{$T}) = $(sizeof(T)*8 - significand_bits(T) - 1)
4342
@eval exponent_bias(::Type{$T}) = $(Int(exponent_one(T) >> significand_bits(T)))
4443
# maximum float exponent
4544
@eval exponent_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)) - exponent_bias(T))

test/float16.jl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,3 +166,14 @@ end
166166

167167
# issue #17148
168168
@test rem(Float16(1.2), Float16(one(1.2))) == 0.20019531f0
169+
170+
# issue #32441
171+
const f16eps2 = Float32(eps(Float16(0.0)))/2
172+
const minsubf16 = nextfloat(Float16(0.0))
173+
const minsubf16_32 = Float32(minsubf16)
174+
@test Float16(f16eps2) == Float16(0.0)
175+
@test Float16(nextfloat(f16eps2)) == minsubf16
176+
@test Float16(prevfloat(minsubf16_32)) == minsubf16
177+
# Ties to even, in this case up
178+
@test Float16(minsubf16_32 + f16eps2) == nextfloat(minsubf16)
179+
@test Float16(prevfloat(minsubf16_32 + f16eps2)) == minsubf16

0 commit comments

Comments
 (0)