40
40
function rem_pio2_sum (xs:: Vararg{Float64} )
41
41
n = 0
42
42
hi, lo = 0.0 , 0.0
43
- small_start = length (xs)+ 1
44
- for i in eachindex (xs)
45
- x = xs[i]
43
+ for x in xs
46
44
if abs (x) <= pi / 4
47
- small_start = i
48
- break
45
+ s = x + hi
46
+ lo += (x - (s - hi))
47
+ else
48
+ n_i, y = rem_pio2_kernel (x)
49
+ n += n_i
50
+ s = y. hi + hi
51
+ lo += (y. hi - (s - hi)) + y. lo
49
52
end
50
- n_i, y = rem_pio2_kernel (x)
51
- n += n_i
52
- s = y. hi + hi
53
- lo += (y. hi - (s - hi)) + y. lo
54
- hi = s
55
- end
56
- for i in small_start: length (xs)
57
- x = xs[i]
58
- s = x + hi
59
- lo += (x - (s - hi))
60
53
hi = s
61
54
end
62
55
while hi > pi / 4
@@ -72,7 +65,28 @@ function rem_pio2_sum(xs::Vararg{Float64})
72
65
return n, DoubleFloat64 (hi, lo)
73
66
end
74
67
75
- function rem_pio2_sum (xs:: Vararg{Union{Float32,Float64}} )
76
- n, y = rem_pio2_kernel (sum (Float64, xs))
68
+ function rem_pio2_sum (xs:: Vararg{Float32} )
69
+ y = 0.0
70
+ n = 0
71
+ # The minimum cosine or sine of any Float32 that gets reduced is 1.6e-9
72
+ # so reducing at 2^22 prevents catastrophic loss of precision.
73
+ # There probably is a case where this loses some digits but it is a decent
74
+ # tradeoff between accuracy and speed.
75
+ @fastmath for x in xs
76
+ if x > 0x1 p22
77
+ n_i, y_i = rem_pio2_kernel (Float32 (x))
78
+ n += n_i
79
+ y += y_i. hi
80
+ else
81
+ y += x
82
+ end
83
+ end
84
+ n_i, y = rem_pio2_kernel (y)
85
+ return n + n_i, DoubleFloat32 (y. hi)
86
+ end
87
+
88
+ function rem_pio2_sum (xs:: Vararg{Float16} )
89
+ y = sum (Float64, xs) # Float16 can be losslessly accumulated in Float64
90
+ n, y = rem_pio2_kernel (y)
77
91
return n, DoubleFloat32 (y. hi)
78
92
end
0 commit comments