|
30 | 30 | #include <clc/shared/clc_max.h> |
31 | 31 | #include <math/clc_remainder.h> |
32 | 32 |
|
33 | | -_CLC_DEF _CLC_OVERLOAD float __clc_fmod(float x, float y) |
34 | | -{ |
35 | | - int ux = as_int(x); |
36 | | - int ax = ux & EXSIGNBIT_SP32; |
37 | | - float xa = as_float(ax); |
38 | | - int sx = ux ^ ax; |
39 | | - int ex = ax >> EXPSHIFTBITS_SP32; |
40 | | - |
41 | | - int uy = as_int(y); |
42 | | - int ay = uy & EXSIGNBIT_SP32; |
43 | | - float ya = as_float(ay); |
44 | | - int ey = ay >> EXPSHIFTBITS_SP32; |
45 | | - |
46 | | - float xr = as_float(0x3f800000 | (ax & 0x007fffff)); |
47 | | - float yr = as_float(0x3f800000 | (ay & 0x007fffff)); |
48 | | - int c; |
49 | | - int k = ex - ey; |
50 | | - |
51 | | - while (k > 0) { |
52 | | - c = xr >= yr; |
53 | | - xr -= c ? yr : 0.0f; |
54 | | - xr += xr; |
55 | | - --k; |
56 | | - } |
57 | | - |
| 33 | +_CLC_DEF _CLC_OVERLOAD float __clc_fmod(float x, float y) { |
| 34 | + int ux = as_int(x); |
| 35 | + int ax = ux & EXSIGNBIT_SP32; |
| 36 | + float xa = as_float(ax); |
| 37 | + int sx = ux ^ ax; |
| 38 | + int ex = ax >> EXPSHIFTBITS_SP32; |
| 39 | + |
| 40 | + int uy = as_int(y); |
| 41 | + int ay = uy & EXSIGNBIT_SP32; |
| 42 | + float ya = as_float(ay); |
| 43 | + int ey = ay >> EXPSHIFTBITS_SP32; |
| 44 | + |
| 45 | + float xr = as_float(0x3f800000 | (ax & 0x007fffff)); |
| 46 | + float yr = as_float(0x3f800000 | (ay & 0x007fffff)); |
| 47 | + int c; |
| 48 | + int k = ex - ey; |
| 49 | + |
| 50 | + while (k > 0) { |
58 | 51 | c = xr >= yr; |
59 | 52 | xr -= c ? yr : 0.0f; |
| 53 | + xr += xr; |
| 54 | + --k; |
| 55 | + } |
60 | 56 |
|
61 | | - int lt = ex < ey; |
62 | | - |
63 | | - xr = lt ? xa : xr; |
64 | | - yr = lt ? ya : yr; |
| 57 | + c = xr >= yr; |
| 58 | + xr -= c ? yr : 0.0f; |
65 | 59 |
|
| 60 | + int lt = ex < ey; |
66 | 61 |
|
67 | | - float s = as_float(ey << EXPSHIFTBITS_SP32); |
68 | | - xr *= lt ? 1.0f : s; |
| 62 | + xr = lt ? xa : xr; |
| 63 | + yr = lt ? ya : yr; |
69 | 64 |
|
70 | | - c = ax == ay; |
71 | | - xr = c ? 0.0f : xr; |
| 65 | + float s = as_float(ey << EXPSHIFTBITS_SP32); |
| 66 | + xr *= lt ? 1.0f : s; |
72 | 67 |
|
73 | | - xr = as_float(sx ^ as_int(xr)); |
| 68 | + c = ax == ay; |
| 69 | + xr = c ? 0.0f : xr; |
74 | 70 |
|
75 | | - c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | ay == 0; |
76 | | - xr = c ? as_float(QNANBITPATT_SP32) : xr; |
| 71 | + xr = as_float(sx ^ as_int(xr)); |
77 | 72 |
|
78 | | - return xr; |
| 73 | + c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | |
| 74 | + ay == 0; |
| 75 | + xr = c ? as_float(QNANBITPATT_SP32) : xr; |
79 | 76 |
|
| 77 | + return xr; |
80 | 78 | } |
81 | 79 | _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_fmod, float, float); |
82 | 80 |
|
83 | 81 | #ifdef cl_khr_fp64 |
84 | | -_CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) |
85 | | -{ |
86 | | - ulong ux = as_ulong(x); |
87 | | - ulong ax = ux & ~SIGNBIT_DP64; |
88 | | - ulong xsgn = ux ^ ax; |
89 | | - double dx = as_double(ax); |
90 | | - int xexp = convert_int(ax >> EXPSHIFTBITS_DP64); |
91 | | - int xexp1 = 11 - (int) __clc_clz(ax & MANTBITS_DP64); |
92 | | - xexp1 = xexp < 1 ? xexp1 : xexp; |
93 | | - |
94 | | - ulong uy = as_ulong(y); |
95 | | - ulong ay = uy & ~SIGNBIT_DP64; |
96 | | - double dy = as_double(ay); |
97 | | - int yexp = convert_int(ay >> EXPSHIFTBITS_DP64); |
98 | | - int yexp1 = 11 - (int) __clc_clz(ay & MANTBITS_DP64); |
99 | | - yexp1 = yexp < 1 ? yexp1 : yexp; |
100 | | - |
101 | | - // First assume |x| > |y| |
102 | | - |
103 | | - // Set ntimes to the number of times we need to do a |
104 | | - // partial remainder. If the exponent of x is an exact multiple |
105 | | - // of 53 larger than the exponent of y, and the mantissa of x is |
106 | | - // less than the mantissa of y, ntimes will be one too large |
107 | | - // but it doesn't matter - it just means that we'll go round |
108 | | - // the loop below one extra time. |
109 | | - int ntimes = __clc_max(0, (xexp1 - yexp1) / 53); |
110 | | - double w = ldexp(dy, ntimes * 53); |
111 | | - w = ntimes == 0 ? dy : w; |
112 | | - double scale = ntimes == 0 ? 1.0 : 0x1.0p-53; |
113 | | - |
114 | | - // Each time round the loop we compute a partial remainder. |
115 | | - // This is done by subtracting a large multiple of w |
116 | | - // from x each time, where w is a scaled up version of y. |
117 | | - // The subtraction must be performed exactly in quad |
118 | | - // precision, though the result at each stage can |
119 | | - // fit exactly in a double precision number. |
120 | | - int i; |
121 | | - double t, v, p, pp; |
122 | | - |
123 | | - for (i = 0; i < ntimes; i++) { |
124 | | - // Compute integral multiplier |
125 | | - t = __clc_trunc(dx / w); |
126 | | - |
127 | | - // Compute w * t in quad precision |
128 | | - p = w * t; |
129 | | - pp = fma(w, t, -p); |
130 | | - |
131 | | - // Subtract w * t from dx |
132 | | - v = dx - p; |
133 | | - dx = v + (((dx - v) - p) - pp); |
134 | | - |
135 | | - // If t was one too large, dx will be negative. Add back one w. |
136 | | - dx += dx < 0.0 ? w : 0.0; |
137 | | - |
138 | | - // Scale w down by 2^(-53) for the next iteration |
139 | | - w *= scale; |
140 | | - } |
141 | | - |
142 | | - // One more time |
143 | | - // Variable todd says whether the integer t is odd or not |
144 | | - t = __clc_floor(dx / w); |
145 | | - long lt = (long)t; |
146 | | - int todd = lt & 1; |
147 | | - |
| 82 | +_CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) { |
| 83 | + ulong ux = as_ulong(x); |
| 84 | + ulong ax = ux & ~SIGNBIT_DP64; |
| 85 | + ulong xsgn = ux ^ ax; |
| 86 | + double dx = as_double(ax); |
| 87 | + int xexp = convert_int(ax >> EXPSHIFTBITS_DP64); |
| 88 | + int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64); |
| 89 | + xexp1 = xexp < 1 ? xexp1 : xexp; |
| 90 | + |
| 91 | + ulong uy = as_ulong(y); |
| 92 | + ulong ay = uy & ~SIGNBIT_DP64; |
| 93 | + double dy = as_double(ay); |
| 94 | + int yexp = convert_int(ay >> EXPSHIFTBITS_DP64); |
| 95 | + int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64); |
| 96 | + yexp1 = yexp < 1 ? yexp1 : yexp; |
| 97 | + |
| 98 | + // First assume |x| > |y| |
| 99 | + |
| 100 | + // Set ntimes to the number of times we need to do a |
| 101 | + // partial remainder. If the exponent of x is an exact multiple |
| 102 | + // of 53 larger than the exponent of y, and the mantissa of x is |
| 103 | + // less than the mantissa of y, ntimes will be one too large |
| 104 | + // but it doesn't matter - it just means that we'll go round |
| 105 | + // the loop below one extra time. |
| 106 | + int ntimes = __clc_max(0, (xexp1 - yexp1) / 53); |
| 107 | + double w = ldexp(dy, ntimes * 53); |
| 108 | + w = ntimes == 0 ? dy : w; |
| 109 | + double scale = ntimes == 0 ? 1.0 : 0x1.0p-53; |
| 110 | + |
| 111 | + // Each time round the loop we compute a partial remainder. |
| 112 | + // This is done by subtracting a large multiple of w |
| 113 | + // from x each time, where w is a scaled up version of y. |
| 114 | + // The subtraction must be performed exactly in quad |
| 115 | + // precision, though the result at each stage can |
| 116 | + // fit exactly in a double precision number. |
| 117 | + int i; |
| 118 | + double t, v, p, pp; |
| 119 | + |
| 120 | + for (i = 0; i < ntimes; i++) { |
| 121 | + // Compute integral multiplier |
| 122 | + t = __clc_trunc(dx / w); |
| 123 | + |
| 124 | + // Compute w * t in quad precision |
148 | 125 | p = w * t; |
149 | 126 | pp = fma(w, t, -p); |
| 127 | + |
| 128 | + // Subtract w * t from dx |
150 | 129 | v = dx - p; |
151 | 130 | dx = v + (((dx - v) - p) - pp); |
152 | | - i = dx < 0.0; |
153 | | - todd ^= i; |
154 | | - dx += i ? w : 0.0; |
155 | 131 |
|
156 | | - // At this point, dx lies in the range [0,dy) |
157 | | - double ret = as_double(xsgn ^ as_ulong(dx)); |
158 | | - dx = as_double(ax); |
| 132 | + // If t was one too large, dx will be negative. Add back one w. |
| 133 | + dx += dx < 0.0 ? w : 0.0; |
| 134 | + |
| 135 | + // Scale w down by 2^(-53) for the next iteration |
| 136 | + w *= scale; |
| 137 | + } |
| 138 | + |
| 139 | + // One more time |
| 140 | + // Variable todd says whether the integer t is odd or not |
| 141 | + t = __clc_floor(dx / w); |
| 142 | + long lt = (long)t; |
| 143 | + int todd = lt & 1; |
| 144 | + |
| 145 | + p = w * t; |
| 146 | + pp = fma(w, t, -p); |
| 147 | + v = dx - p; |
| 148 | + dx = v + (((dx - v) - p) - pp); |
| 149 | + i = dx < 0.0; |
| 150 | + todd ^= i; |
| 151 | + dx += i ? w : 0.0; |
| 152 | + |
| 153 | + // At this point, dx lies in the range [0,dy) |
| 154 | + double ret = as_double(xsgn ^ as_ulong(dx)); |
| 155 | + dx = as_double(ax); |
159 | 156 |
|
160 | | - // Now handle |x| == |y| |
161 | | - int c = dx == dy; |
162 | | - t = as_double(xsgn); |
163 | | - ret = c ? t : ret; |
| 157 | + // Now handle |x| == |y| |
| 158 | + int c = dx == dy; |
| 159 | + t = as_double(xsgn); |
| 160 | + ret = c ? t : ret; |
164 | 161 |
|
165 | | - // Next, handle |x| < |y| |
166 | | - c = dx < dy; |
167 | | - ret = c ? x : ret; |
| 162 | + // Next, handle |x| < |y| |
| 163 | + c = dx < dy; |
| 164 | + ret = c ? x : ret; |
168 | 165 |
|
169 | | - // We don't need anything special for |x| == 0 |
| 166 | + // We don't need anything special for |x| == 0 |
170 | 167 |
|
171 | | - // |y| is 0 |
172 | | - c = dy == 0.0; |
173 | | - ret = c ? as_double(QNANBITPATT_DP64) : ret; |
| 168 | + // |y| is 0 |
| 169 | + c = dy == 0.0; |
| 170 | + ret = c ? as_double(QNANBITPATT_DP64) : ret; |
174 | 171 |
|
175 | | - // y is +-Inf, NaN |
176 | | - c = yexp > BIASEDEMAX_DP64; |
177 | | - t = y == y ? x : y; |
178 | | - ret = c ? t : ret; |
| 172 | + // y is +-Inf, NaN |
| 173 | + c = yexp > BIASEDEMAX_DP64; |
| 174 | + t = y == y ? x : y; |
| 175 | + ret = c ? t : ret; |
179 | 176 |
|
180 | | - // x is +=Inf, NaN |
181 | | - c = xexp > BIASEDEMAX_DP64; |
182 | | - ret = c ? as_double(QNANBITPATT_DP64) : ret; |
| 177 | + // x is +=Inf, NaN |
| 178 | + c = xexp > BIASEDEMAX_DP64; |
| 179 | + ret = c ? as_double(QNANBITPATT_DP64) : ret; |
183 | 180 |
|
184 | | - return ret; |
| 181 | + return ret; |
185 | 182 | } |
186 | | -_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_fmod, double, double); |
| 183 | +_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_fmod, double, |
| 184 | + double); |
187 | 185 | #endif |
0 commit comments