@@ -199,14 +199,13 @@ LLVM_LIBC_FUNCTION(float16, powf16, (float16 x, float16 y)) {
199199 int m = -FPBits::EXP_BIAS;
200200
201201 // When x is subnormal, normalize it by multiplying by 2^FRACTION_LEN.
202- if ((x_u_log & FPBits::EXP_MASK) == 0U ) {
203- constexpr float NORMALIZE_EXP =
204- static_cast < float >( 1U << FPBits::FRACTION_LEN);
205- x_bits = FPBits (x_bits. get_val () * fputil::cast<float16>( NORMALIZE_EXP));
202+ if ((x_u_log & FPBits::EXP_MASK) == 0U ) { // Subnormal x
203+ constexpr double NORMALIZE_EXP = 1.0 * ( 1U << FPBits::FRACTION_LEN);
204+ x_bits = FPBits (fputil::cast<float16>(
205+ fputil::cast<double >(x_bits. get_val ()) * NORMALIZE_EXP));
206206 x_u_log = x_bits.uintval ();
207207 m -= FPBits::FRACTION_LEN;
208208 }
209-
210209 // Extract the mantissa and index into small lookup tables.
211210 uint16_t mant = x_bits.get_mantissa ();
212211 // Use the highest 5 fractional bits of the mantissa as the index f.
@@ -217,39 +216,37 @@ LLVM_LIBC_FUNCTION(float16, powf16, (float16 x, float16 y)) {
217216 // Add the hidden bit to the mantissa.
218217 // 1 <= m_x < 2
219218 x_bits.set_biased_exponent (FPBits::EXP_BIAS);
220- float mant_f = x_bits.get_val ();
219+ double mant_d = x_bits.get_val ();
221220
222221 // Range reduction for log2(m_x):
223222 // v = r * m_x - 1, where r is a power of 2 from a lookup table.
224223 // The computation is exact for half-precision, and -2^-5 <= v < 2^-4.
225224 // Then m_x = (1 + v) / r, and log2(m_x) = log2(1 + v) - log2(r).
226225
227- float v = fputil::multiply_add (mant_f, ONE_OVER_F_F[f], - 1 . 0f );
228-
226+ double v =
227+ fputil::multiply_add (mant_d, fputil::cast< double >(ONE_OVER_F_F[f]), - 1.0 );
229228 // For half-precision accuracy, we use a degree-2 polynomial approximation:
230229 // P(v) ~ log2(1 + v) / v
231230 // Generated by Sollya with:
232231 // > P = fpminimax(log2(1+x)/x, 2, [|D...|], [-2^-5, 2^-4]);
233232 // The coefficients are rounded from the Sollya output.
234- float log2p1_d_over_f =
235- v * fputil::polyeval (v, 0x1 .715476p+0f , -0x1 .71771ap-1f , 0x1 .ecb38ep -2f );
233+
234+ double log2p1_d_over_f =
235+ v * fputil::polyeval (v, 0x1 .715476p+0 , -0x1 .71771ap-1 , 0x1 .ecb38ep -2 );
236236
237237 // log2(1.mant) = log2(f) + log2(1 + v)
238- float log2_1_mant = LOG2F_F[f] + log2p1_d_over_f;
238+ double log2_1_mant = LOG2F_F[f] + log2p1_d_over_f;
239239
240240 // Complete log2(x) = e_x + log2(m_x)
241- float log2_x = static_cast <float >(m) + log2_1_mant;
241+ double log2_x = static_cast <double >(m) + log2_1_mant;
242242
243243 // z = y * log2(x)
244244 // Now compute 2^z = 2^(n + r), with n integer and r in [-0.5, 0.5].
245245 double z = fputil::cast<double >(y) * log2_x;
246246
247247 // Check for overflow/underflow for half-precision.
248248 // Half-precision range is approximately 2^-24 to 2^15.
249- if (z > 15.0 ) {
250- fputil::raise_except_if_required (FE_OVERFLOW);
251- return FPBits::inf ().get_val ();
252- }
249+ //
253250 if (z < -24.0 ) {
254251 fputil::raise_except_if_required (FE_UNDERFLOW);
255252 return fputil::cast<float16>(0 .0f );
@@ -282,7 +279,11 @@ LLVM_LIBC_FUNCTION(float16, powf16, (float16 x, float16 y)) {
282279 uint64_t exp_bits = static_cast <uint64_t >(n_int + 1023 ) << 52 ;
283280 double pow2_n = cpp::bit_cast<double >(exp_bits);
284281
285- float16 result = fputil::cast<float16>(pow2_n * exp2_r);
282+
283+ double result_d = (pow2_n * exp2_r);
284+ float16 result = fputil::cast<float16>(result_d);
285+ if (result_d==65504.0 )
286+ return (65504 .f16 );
286287
287288 if (result_sign) {
288289 FPBits result_bits (result);
0 commit comments