@@ -137,6 +137,45 @@ function Float32(x::Int128)
137137 reinterpret (Float32, s | d + y)
138138end
139139
140+ # Float32 -> Float16 algorithm from:
141+ # "Fast Half Float Conversion" by Jeroen van der Zijp
142+ # ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
143+
# Precompute the two 512-entry lookup tables used by the table-driven
# Float32 -> Float16 conversion. Each table is indexed (1-based) by the 9-bit
# value formed from the Float32 sign bit and its 8-bit biased exponent:
# slots 1:256 hold the sign-clear entries, slots 257:512 the sign-set ones.
#   basetable[k]  -- UInt16 bit pattern (sign + exponent part) of the result
#   shifttable[k] -- right-shift amount stored for that exponent class
let bases = Vector{UInt16}(undef, 512),
    shifts = Vector{UInt8}(undef, 512)
    for biased = 0:255
        unbiased = biased - 127
        pos = biased + 1      # slot for sign bit clear
        neg = biased + 257    # slot for sign bit set (biased | 0x100, 1-based)

        # Select the unsigned base pattern and the shift for this exponent.
        base, shift = if unbiased < -24
            # Very small numbers map to zero
            (0x0000, 24)
        elseif unbiased < -14
            # Small numbers map to denorms
            (0x0400 >> (-unbiased - 14), -unbiased - 1)
        elseif unbiased <= 15
            # Normal numbers just lose precision
            ((unbiased + 15) << 10, 13)
        elseif unbiased < 128
            # Large numbers map to Infinity
            (0x7C00, 24)
        else
            # Infinity and NaN's stay Infinity and NaN's
            (0x7C00, 13)
        end

        bases[pos] = base
        bases[neg] = base | 0x8000   # same pattern with the sign bit set
        shifts[pos] = shift
        shifts[neg] = shift
    end
    # Freeze the tables as immutable tuples exposed as global constants.
    global const shifttable = (shifts...,)
    global const basetable = (bases...,)
end
178+
140179function Float16 (val:: Float32 )
141180 f = reinterpret (UInt32, val)
142181 if isnan (val)
@@ -202,45 +241,6 @@ function Float32(val::Float16)
202241 return reinterpret (Float32, ret)
203242end
204243
205- # Float32 -> Float16 algorithm from:
206- # "Fast Half Float Conversion" by Jeroen van der Zijp
207- # ftp://ftp.fox-toolkit.org/pub/fasthalffloatconversion.pdf
208-
209- let _basetable = Vector {UInt16} (undef, 512 ),
210- _shifttable = Vector {UInt8} (undef, 512 )
211- for i = 0 : 255
212- e = i - 127
213- if e < - 24 # Very small numbers map to zero
214- _basetable[i| 0x000 + 1 ] = 0x0000
215- _basetable[i| 0x100 + 1 ] = 0x8000
216- _shifttable[i| 0x000 + 1 ] = 24
217- _shifttable[i| 0x100 + 1 ] = 24
218- elseif e < - 14 # Small numbers map to denorms
219- _basetable[i| 0x000 + 1 ] = (0x0400 >> (- e- 14 ))
220- _basetable[i| 0x100 + 1 ] = (0x0400 >> (- e- 14 )) | 0x8000
221- _shifttable[i| 0x000 + 1 ] = - e- 1
222- _shifttable[i| 0x100 + 1 ] = - e- 1
223- elseif e <= 15 # Normal numbers just lose precision
224- _basetable[i| 0x000 + 1 ] = ((e+ 15 )<< 10 )
225- _basetable[i| 0x100 + 1 ] = ((e+ 15 )<< 10 ) | 0x8000
226- _shifttable[i| 0x000 + 1 ] = 13
227- _shifttable[i| 0x100 + 1 ] = 13
228- elseif e < 128 # Large numbers map to Infinity
229- _basetable[i| 0x000 + 1 ] = 0x7C00
230- _basetable[i| 0x100 + 1 ] = 0xFC00
231- _shifttable[i| 0x000 + 1 ] = 24
232- _shifttable[i| 0x100 + 1 ] = 24
233- else # Infinity and NaN's stay Infinity and NaN's
234- _basetable[i| 0x000 + 1 ] = 0x7C00
235- _basetable[i| 0x100 + 1 ] = 0xFC00
236- _shifttable[i| 0x000 + 1 ] = 13
237- _shifttable[i| 0x100 + 1 ] = 13
238- end
239- end
240- global const shifttable = (_shifttable... ,)
241- global const basetable = (_basetable... ,)
242- end
243-
244244# convert(::Type{Float16}, x::Float32) = fptrunc(Float16, x)
# Narrow a Float64 to Float32 by calling `fptrunc`.
function Float32(x::Float64)
    return fptrunc(Float32, x)
end

# Narrow a Float64 to Float16 in two steps: first to Float32, then to Float16.
# NOTE(review): the value is rounded twice on this path; confirm whether that
# can differ from a single direct Float64 -> Float16 rounding for callers that care.
function Float16(x::Float64)
    single = Float32(x)
    return Float16(single)
end
0 commit comments