@@ -137,16 +137,61 @@ function Float32(x::Int128)
137137 reinterpret (Float32, s | d + y)
138138end
139139
140+ # Float32 -> Float16 algorithm from:
141+ # "Fast Half Float Conversion" by Jeroen van der Zijp
142+ # ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
143+ #
144+ # With adjustments for round-to-nearest, ties to even.
145+ #
146+ let _basetable = Vector {UInt16} (undef, 512 ),
147+ _shifttable = Vector {UInt8} (undef, 512 )
148+ for i = 0 : 255
149+ e = i - 127
150+ if e < - 25 # Very small numbers map to zero
151+ _basetable[i| 0x000 + 1 ] = 0x0000
152+ _basetable[i| 0x100 + 1 ] = 0x8000
153+ _shifttable[i| 0x000 + 1 ] = 25
154+ _shifttable[i| 0x100 + 1 ] = 25
155+ elseif e < - 14 # Small numbers map to denorms
156+ _basetable[i| 0x000 + 1 ] = 0x0000
157+ _basetable[i| 0x100 + 1 ] = 0x8000
158+ _shifttable[i| 0x000 + 1 ] = - e- 1
159+ _shifttable[i| 0x100 + 1 ] = - e- 1
160+ elseif e <= 15 # Normal numbers just lose precision
161+ _basetable[i| 0x000 + 1 ] = ((e+ 15 )<< 10 )
162+ _basetable[i| 0x100 + 1 ] = ((e+ 15 )<< 10 ) | 0x8000
163+ _shifttable[i| 0x000 + 1 ] = 13
164+ _shifttable[i| 0x100 + 1 ] = 13
165+ elseif e < 128 # Large numbers map to Infinity
166+ _basetable[i| 0x000 + 1 ] = 0x7C00
167+ _basetable[i| 0x100 + 1 ] = 0xFC00
168+ _shifttable[i| 0x000 + 1 ] = 24
169+ _shifttable[i| 0x100 + 1 ] = 24
170+ else # Infinity and NaN's stay Infinity and NaN's
171+ _basetable[i| 0x000 + 1 ] = 0x7C00
172+ _basetable[i| 0x100 + 1 ] = 0xFC00
173+ _shifttable[i| 0x000 + 1 ] = 13
174+ _shifttable[i| 0x100 + 1 ] = 13
175+ end
176+ end
177+ global const shifttable = (_shifttable... ,)
178+ global const basetable = (_basetable... ,)
179+ end
180+
140181function Float16 (val:: Float32 )
141182 f = reinterpret (UInt32, val)
142183 if isnan (val)
143184 t = 0x8000 ⊻ (0x8000 & ((f >> 0x10 ) % UInt16))
144185 return reinterpret (Float16, t ⊻ ((f >> 0xd ) % UInt16))
145186 end
146- i = (f >> 23 ) & 0x1ff + 1
187+ i = ((f & ~ significand_mask (Float32)) >> significand_bits (Float32)) + 1
147188 @inbounds sh = shifttable[i]
148- f &= 0x007fffff
149- @inbounds h = (basetable[i] + (f >> sh)) % UInt16
189+ f &= significand_mask (Float32)
190+ # If `val` is subnormal, the tables are set up to force the
191+ # result to 0, so the significand has an implicit `1` in the
192+ # cases we care about.
193+ f |= significand_mask (Float32) + 0x1
194+ @inbounds h = (basetable[i] + (f >> sh) & significand_mask (Float16)) % UInt16
150195 # round
151196 # NOTE: we maybe should ignore NaNs here, but the payload is
152197 # getting truncated anyway so "rounding" it might not matter
@@ -202,45 +247,6 @@ function Float32(val::Float16)
202247 return reinterpret (Float32, ret)
203248end
204249
205- # Float32 -> Float16 algorithm from:
206- # "Fast Half Float Conversion" by Jeroen van der Zijp
207- # ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
208-
209- let _basetable = Vector {UInt16} (undef, 512 ),
210- _shifttable = Vector {UInt8} (undef, 512 )
211- for i = 0 : 255
212- e = i - 127
213- if e < - 24 # Very small numbers map to zero
214- _basetable[i| 0x000 + 1 ] = 0x0000
215- _basetable[i| 0x100 + 1 ] = 0x8000
216- _shifttable[i| 0x000 + 1 ] = 24
217- _shifttable[i| 0x100 + 1 ] = 24
218- elseif e < - 14 # Small numbers map to denorms
219- _basetable[i| 0x000 + 1 ] = (0x0400 >> (- e- 14 ))
220- _basetable[i| 0x100 + 1 ] = (0x0400 >> (- e- 14 )) | 0x8000
221- _shifttable[i| 0x000 + 1 ] = - e- 1
222- _shifttable[i| 0x100 + 1 ] = - e- 1
223- elseif e <= 15 # Normal numbers just lose precision
224- _basetable[i| 0x000 + 1 ] = ((e+ 15 )<< 10 )
225- _basetable[i| 0x100 + 1 ] = ((e+ 15 )<< 10 ) | 0x8000
226- _shifttable[i| 0x000 + 1 ] = 13
227- _shifttable[i| 0x100 + 1 ] = 13
228- elseif e < 128 # Large numbers map to Infinity
229- _basetable[i| 0x000 + 1 ] = 0x7C00
230- _basetable[i| 0x100 + 1 ] = 0xFC00
231- _shifttable[i| 0x000 + 1 ] = 24
232- _shifttable[i| 0x100 + 1 ] = 24
233- else # Infinity and NaN's stay Infinity and NaN's
234- _basetable[i| 0x000 + 1 ] = 0x7C00
235- _basetable[i| 0x100 + 1 ] = 0xFC00
236- _shifttable[i| 0x000 + 1 ] = 13
237- _shifttable[i| 0x100 + 1 ] = 13
238- end
239- end
240- global const shifttable = (_shifttable... ,)
241- global const basetable = (_basetable... ,)
242- end
243-
244250# convert(::Type{Float16}, x::Float32) = fptrunc(Float16, x)
245251Float32 (x:: Float64 ) = fptrunc (Float32, x)
246252Float16 (x:: Float64 ) = Float16 (Float32 (x))
@@ -867,6 +873,11 @@ exponent_one(::Type{Float16}) = 0x3c00
867873exponent_half (:: Type{Float16} ) = 0x3800
868874significand_mask (:: Type{Float16} ) = 0x03ff
869875
876+ for T in (Float16, Float32, Float64)
877+ @eval significand_bits (:: Type{$T} ) = $ (trailing_ones (significand_mask (T)))
878+ @eval exponent_bits (:: Type{$T} ) = $ (sizeof (T)* 8 - significand_bits (T) - 1 )
879+ end
880+
870881# integer size of float
871882uinttype (:: Type{Float64} ) = UInt64
872883uinttype (:: Type{Float32} ) = UInt32
0 commit comments