diff --git a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake index 03db38fa4cdc1..3bcb0f7e8e6ce 100644 --- a/compiler-rt/cmake/Modules/CompilerRTUtils.cmake +++ b/compiler-rt/cmake/Modules/CompilerRTUtils.cmake @@ -452,10 +452,14 @@ function(filter_builtin_sources inout_var name) # and ensure that it is removed from the file list. get_filename_component(_name ${_file} NAME) string(REGEX REPLACE "\\.S$" ".c" _cname "${_name}") - if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${_cname}") - message(STATUS "For ${name} builtins preferring ${_file} to ${_cname}") - list(REMOVE_ITEM intermediate ${_cname}) - endif() + get_property(_cnames SOURCE ${_file} PROPERTY crt_supersedes) + set(_cnames ${_cname} ${_cnames}) + foreach(_cname ${_cnames}) + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${_cname}") + message(STATUS "For ${name} builtins preferring ${_file} to ${_cname}") + list(REMOVE_ITEM intermediate ${_cname}) + endif() + endforeach() endif() endforeach() set(${inout_var} ${intermediate} PARENT_SCOPE) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 1dadb6a810efb..ca4c5d3e67146 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -452,8 +452,11 @@ set(thumb1_base_SOURCES arm/udivsi3.S arm/comparesf2.S arm/addsf3.S + arm/fnan2.c ${GENERIC_SOURCES} ) +# arm/addsf3.S implements both addition and subtraction via cross-branching +set_property(SOURCE arm/addsf3.S PROPERTY crt_supersedes subsf3.c) set(arm_EABI_RT_SOURCES arm/aeabi_cdcmp.S diff --git a/compiler-rt/lib/builtins/arm/addsf3.S b/compiler-rt/lib/builtins/arm/addsf3.S index aa4d40473edb6..64d8504327529 100644 --- a/compiler-rt/lib/builtins/arm/addsf3.S +++ b/compiler-rt/lib/builtins/arm/addsf3.S @@ -1,4 +1,4 @@ -//===-- addsf3.S - Adds two single precision floating pointer numbers-----===// +//===-- addsf3.S - Adds two single precision floating point numbers--------===// // // 
Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// // -// This file implements the __addsf3 (single precision floating pointer number +// This file implements the __addsf3 (single precision floating point number // addition with the IEEE-754 default rounding (to nearest, ties to even) // function for the ARM Thumb1 ISA. // @@ -24,253 +24,829 @@ DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fadd, __addsf3) DEFINE_COMPILERRT_THUMB_FUNCTION(__addsf3) - push {r4, r5, r6, r7, lr} - // Get the absolute value of a and b. - lsls r2, r0, #1 - lsls r3, r1, #1 - lsrs r2, r2, #1 // aAbs - beq LOCAL_LABEL(a_zero_nan_inf) - lsrs r3, r3, #1 // bAbs - beq LOCAL_LABEL(zero_nan_inf) - - // Detect if a or b is infinity or Nan. - lsrs r6, r2, #(significandBits) - lsrs r7, r3, #(significandBits) - cmp r6, #0xFF - beq LOCAL_LABEL(zero_nan_inf) - cmp r7, #0xFF - beq LOCAL_LABEL(zero_nan_inf) - - // Swap Rep and Abs so that a and aAbs has the larger absolute value. - cmp r2, r3 - bhs LOCAL_LABEL(no_swap) - movs r4, r0 - movs r5, r2 - movs r0, r1 - movs r2, r3 - movs r1, r4 - movs r3, r5 -LOCAL_LABEL(no_swap): - - // Get the significands and shift them to give us round, guard and sticky. - lsls r4, r0, #(typeWidth - significandBits) - lsrs r4, r4, #(typeWidth - significandBits - 3) // aSignificand << 3 - lsls r5, r1, #(typeWidth - significandBits) - lsrs r5, r5, #(typeWidth - significandBits - 3) // bSignificand << 3 - - // Get the implicitBit. - movs r6, #1 - lsls r6, r6, #(significandBits + 3) - - // Get aExponent and set implicit bit if necessary. - lsrs r2, r2, #(significandBits) - beq LOCAL_LABEL(a_done_implicit_bit) - orrs r4, r6 -LOCAL_LABEL(a_done_implicit_bit): - - // Get bExponent and set implicit bit if necessary. 
- lsrs r3, r3, #(significandBits) - beq LOCAL_LABEL(b_done_implicit_bit) - orrs r5, r6 -LOCAL_LABEL(b_done_implicit_bit): - - // Get the difference in exponents. - subs r6, r2, r3 - beq LOCAL_LABEL(done_align) - - // If b is denormal, then a must be normal as align > 0, and we only need to - // right shift bSignificand by (align - 1) bits. - cmp r3, #0 - bne 1f - subs r6, r6, #1 -1: - - // No longer needs bExponent. r3 is dead here. - // Set sticky bits of b: sticky = bSignificand << (typeWidth - align). - movs r3, #(typeWidth) - subs r3, r3, r6 - movs r7, r5 - lsls r7, r3 - beq 1f - movs r7, #1 -1: - - // bSignificand = bSignificand >> align | sticky; - lsrs r5, r6 - orrs r5, r7 - bne LOCAL_LABEL(done_align) - movs r5, #1 // sticky; b is known to be non-zero. - -LOCAL_LABEL(done_align): - // isSubtraction = (aRep ^ bRep) >> 31; - movs r7, r0 - eors r7, r1 - lsrs r7, #31 - bne LOCAL_LABEL(do_substraction) - - // Same sign, do Addition. - - // aSignificand += bSignificand; - adds r4, r4, r5 - - // Check carry bit. - movs r6, #1 - lsls r6, r6, #(significandBits + 3 + 1) - movs r7, r4 - ands r7, r6 - beq LOCAL_LABEL(form_result) - // If the addition carried up, we need to right-shift the result and - // adjust the exponent. - movs r7, r4 - movs r6, #1 - ands r7, r6 // sticky = aSignificand & 1; - lsrs r4, #1 - orrs r4, r7 // result Significand - adds r2, #1 // result Exponent - // If we have overflowed the type, return +/- infinity. - cmp r2, 0xFF - beq LOCAL_LABEL(ret_inf) - -LOCAL_LABEL(form_result): - // Shift the sign, exponent and significand into place. - lsrs r0, #(typeWidth - 1) - lsls r0, #(typeWidth - 1) // Get Sign. - lsls r2, #(significandBits) - orrs r0, r2 - movs r1, r4 - lsls r4, #(typeWidth - significandBits - 3) - lsrs r4, #(typeWidth - significandBits) - orrs r0, r4 - - // Final rounding. The result may overflow to infinity, but that is the - // correct result in that case. 
- // roundGuardSticky = aSignificand & 0x7; - movs r2, #0x7 - ands r1, r2 - // if (roundGuardSticky > 0x4) result++; - - cmp r1, #0x4 - blt LOCAL_LABEL(done_round) - beq 1f - adds r0, #1 - pop {r4, r5, r6, r7, pc} -1: - - // if (roundGuardSticky == 0x4) result += result & 1; - movs r1, r0 - lsrs r1, #1 - bcc LOCAL_LABEL(done_round) - adds r0, r0, #1 -LOCAL_LABEL(done_round): - pop {r4, r5, r6, r7, pc} - -LOCAL_LABEL(do_substraction): - subs r4, r4, r5 // aSignificand -= bSignificand; - beq LOCAL_LABEL(ret_zero) - movs r6, r4 - cmp r2, 0 - beq LOCAL_LABEL(form_result) // if a's exp is 0, no need to normalize. - // If partial cancellation occured, we need to left-shift the result - // and adjust the exponent: - lsrs r6, r6, #(significandBits + 3) - bne LOCAL_LABEL(form_result) - - push {r0, r1, r2, r3} - movs r0, r4 - bl SYMBOL_NAME(__clzsi2) - movs r5, r0 - pop {r0, r1, r2, r3} - // shift = rep_clz(aSignificand) - rep_clz(implicitBit << 3); - subs r5, r5, #(typeWidth - significandBits - 3 - 1) - // aSignificand <<= shift; aExponent -= shift; - lsls r4, r5 - subs r2, r2, r5 - bgt LOCAL_LABEL(form_result) - - // Do normalization if aExponent <= 0. - movs r6, #1 - subs r6, r6, r2 // 1 - aExponent; - movs r2, #0 // aExponent = 0; - movs r3, #(typeWidth) // bExponent is dead. - subs r3, r3, r6 - movs r7, r4 - lsls r7, r3 // stickyBit = (bool)(aSignificant << (typeWidth - align)) - beq 1f - movs r7, #1 -1: - lsrs r4, r6 // aSignificand >> shift - orrs r4, r7 - b LOCAL_LABEL(form_result) - -LOCAL_LABEL(ret_zero): - movs r0, #0 - pop {r4, r5, r6, r7, pc} - - -LOCAL_LABEL(a_zero_nan_inf): - lsrs r3, r3, #1 - -LOCAL_LABEL(zero_nan_inf): - // Here r2 has aAbs, r3 has bAbs - movs r4, #0xFF - lsls r4, r4, #(significandBits) // Make +inf. - - cmp r2, r4 - bhi LOCAL_LABEL(a_is_nan) - cmp r3, r4 - bhi LOCAL_LABEL(b_is_nan) - - cmp r2, r4 - bne LOCAL_LABEL(a_is_rational) - // aAbs is INF. - eors r1, r0 // aRep ^ bRep. - movs r6, #1 - lsls r6, r6, #(typeWidth - 1) // get sign mask. 
- cmp r1, r6 // if they only differ on sign bit, it's -INF + INF - beq LOCAL_LABEL(a_is_nan) - pop {r4, r5, r6, r7, pc} - -LOCAL_LABEL(a_is_rational): - cmp r3, r4 - bne LOCAL_LABEL(b_is_rational) - movs r0, r1 - pop {r4, r5, r6, r7, pc} - -LOCAL_LABEL(b_is_rational): - // either a or b or both are zero. - adds r4, r2, r3 - beq LOCAL_LABEL(both_zero) - cmp r2, #0 // is absA 0 ? - beq LOCAL_LABEL(ret_b) - pop {r4, r5, r6, r7, pc} - -LOCAL_LABEL(both_zero): - ands r0, r1 // +0 + -0 = +0 - pop {r4, r5, r6, r7, pc} - -LOCAL_LABEL(ret_b): - movs r0, r1 - -LOCAL_LABEL(ret): - pop {r4, r5, r6, r7, pc} - -LOCAL_LABEL(b_is_nan): - movs r0, r1 -LOCAL_LABEL(a_is_nan): - movs r1, #1 - lsls r1, r1, #(significandBits -1) // r1 is quiet bit. - orrs r0, r1 - pop {r4, r5, r6, r7, pc} - -LOCAL_LABEL(ret_inf): - movs r4, #0xFF - lsls r4, r4, #(significandBits) - orrs r0, r4 - lsrs r0, r0, #(significandBits) - lsls r0, r0, #(significandBits) - pop {r4, r5, r6, r7, pc} - - + push {r4,r5,r6,lr} + + movs r5, #1 + lsls r5, r5, #31 // all cross-branches will expect to have r5==0x80000000 + + // Extract the exponents into r2 and r3. In the process, test for all + // uncommon values (infinities, NaNs, denormals and zeroes) and branch out of + // line if any are found. + // + // Uncommon operands with exponent 0xFF (NaNs and infinities) "win" over + // those with exponent 0 (zeroes and denormals), in the sense that if there's + // one of each, the 0xFF one determines the result. But we check for exponent + // 0 first, because that way we get it as a by-product of extracting the + // exponents in the first place without needing a separate compare + // instruction. So the zero/denorm handler will have to finish up the NaN + // check as its first task. 
+ lsls r2, r0, #1 + lsls r3, r1, #1 + lsrs r2, r2, #24 + beq LOCAL_LABEL(fadd_zerodenorm_x) + lsrs r3, r3, #24 + beq LOCAL_LABEL(fadd_zerodenorm_y) + cmp r2, #255 + beq LOCAL_LABEL(fadd_naninf) + cmp r3, #255 + beq LOCAL_LABEL(fadd_naninf) + + // Now we have two normalised numbers. If their signs are opposite, we should + // be subtracting their magnitudes rather than adding, so cross-jump to fsub + // (via a trampoline that negates y). + movs r4, r0 + eors r4, r4, r1 // set N if signs are unequal + bmi LOCAL_LABEL(fadd_sub) +LOCAL_LABEL(fadd_magnitude): + // If we get here, we're adding operands with equal signs (i.e. a magnitude + // addition). First thing to do is put the operands in magnitude order, so + // that x >= y. + subs r4, r0, r1 + bhs LOCAL_LABEL(fadd_swapped) + subs r0, r0, r4 + adds r1, r1, r4 + // We must also swap the pre-extracted exponents here. + eors r2, r2, r3 + eors r3, r3, r2 + eors r2, r2, r3 +LOCAL_LABEL(fadd_swapped): + // Keep the sign and exponent of the larger input, to use as the sign and + // exponent of the output (up to carries and overflows). Also calculate the + // exponent difference, which tells us how far we'll need to shift y's + // mantissa right to add it to x's. + lsrs r6, r0, #23 + subs r3, r2, r3 + + // Extract both mantissas, moved up to the top of the word, with the leading + // 1 made explicit. We put y's extracted mantissa in a different register + // (r4), because we'll want to keep the original y for use in fadd_check_rte. + lsls r0, r0, #8 + lsls r4, r1, #8 + orrs r0, r0, r5 + orrs r4, r4, r5 + +LOCAL_LABEL(fadd_doadd): + // Here we perform the actual addition. We either fell through from the code + // above, or jumped back to here after handling an input denormal. 
+ // + // We get here with: + // Operands known to be numeric rather than zero/infinity/NaN; + // r0 = mantissa of larger operand (in high 24 bits); + // r4 = mantissa of smaller operand (in high 24 bits); + // r1 = original (or nearly so) smaller operand; + // r6 = result sign and exponent (in low 9 bits); + // r2 = exponent of x + // r3 = exponent difference. + // + // For normal inputs, the mantissa registers (r0,r4) will have the top bit + // set. Denormals will leave that bit clear, treating the number as + // 0.[mantissa] x 2^(fixed exponent) instead of renormalising to 1.[mantissa] + // x 2^(variable exponent) as a multiplication would want. + + // Actually shift the smaller mantissa downwards and add them together. + lsrs r4, r4, r3 + adds r5, r0, r4 + + // If that addition carried off the top of r5, then the number has increased + // its exponent. Diverge into a completely separate code path for that case, + // because there we must check for overflow. We'll return to the label below + // if no overflow. + bcs LOCAL_LABEL(fadd_carry) +LOCAL_LABEL(fadd_renormed): + // Now we have the output mantissa in r5, with the leading bit at position + // 31. The precise sum may be slightly more than that, if r4 != (y << r3). + // + // Shift the mantissa down to its final position, and use the carry flag (bit + // shifted off the bottom) to see if we need to round. + lsrs r0, r5, #8 + bcc LOCAL_LABEL(fadd_rounded) + + // If we fall through to here, then we need to round up, and also check if we + // need to round to even. This occurs if all the bits of y's mantissa shifted + // off the bottom are zero except for the round bit. + // + // Some of those bits are in r5 (the 32-bit version of the sum's mantissa). + // It's cheap to check those, and should exclude _most_ cases where + // round-to-even isn't needed. 
+ adds r0, r0, #1 // simple round up + lsls r5, r5, #(32-7) // check top 7 bits + beq LOCAL_LABEL(fadd_check_rte) // if those are zero, go to full RTE check +LOCAL_LABEL(fadd_rounded): + // Put the sign+exponent back on. The leading bit of the mantissa increments + // the exponent field unwantedly, so we must decrement r6 first to compensate + // for that. + subs r6, r6, #1 + lsls r6, r6, #23 + adds r0, r0, r6 + // If we haven't overflowed, it's now safe to return. + cmp r2, #255 + bge LOCAL_LABEL(fadd_overflow) + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fadd_overflow): + // We have overflow, so we need to return an infinity of the correct sign. r0 + // already has the correct sign and exponent, so all we need to do is clear + // its mantissa. + lsrs r0, r0, #23 + lsls r0, r0, #23 + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fadd_sub): + // We come here when fadd discovered it needed to subtract. Negate the second + // operand and cross-jump into fsub. + // + // The cross-jump is done using BL, for greater branch range. That clobbers + // lr, but that's OK, we weren't keeping anything in it at this point. + eors r1, r1, r5 + bl LOCAL_LABEL(fsub_magnitude) + +LOCAL_LABEL(fadd_carry): + // We come here if we carried a 1 bit off the top of r5 where we computed the + // sum's mantissa. Shift back down by one and put a 1 bit in at the top. + // + // That would be easy with the RRX instruction from general AArch32, but we + // don't have that here. Instead we OR in a 1 at the bottom, and move it to + // the top by rotating right. + // + // A danger of shifting r5 down by a bit is that we lose the bit at the very + // bottom, which might be important if it's the only nonzero bit below the + // output mantissa, because then it determines whether we do RTE or not. + // Fortunately, another copy of the same bit is still at the bottom of r4 + // (the shifted version of y's mantissa which we added to x's to make the + // version of r5 _before_ we shifted it down). 
So the full RTE check will + // have to remember to check that bit. + movs r0, #1 + orrs r5, r5, r0 // set low bit of r5 + rors r5, r5, r0 // and rotate right so that's now the high bit + + // Carrying off the top of the mantissa means that the output exponent must + // be increased by 1. Increment both copies: the exponent by itself in r2 + // (used for overflow checking) and the exponent + sign in r6. + adds r2, r2, #1 + adds r6, r6, #1 + + // Now go back to the common code path for rounding and overflow checking. + b LOCAL_LABEL(fadd_renormed) + +LOCAL_LABEL(fadd_check_rte): + // We come here to do the full (and therefore expensive) check for round-to- + // even: is our output number exactly on a rounding boundary, half way + // between two representable numbers? That is, of the bits _not_ included in + // the output mantissa, is the topmost bit 1 and all the rest 0? + // + // We only come here at all if we have already rounded the number up. So we + // already know the topmost one of the lost bits is 1, and all we have to + // check is whether the rest are 0. + // + // Also, we've already checked all the bits that were still in the 32-bit + // version of the output mantissa, so we don't need to check those again ... + // + // ... well, _nearly_ all, because in the fadd_carry case, we shifted r5 down + // by a bit _before_ that check. So we do need to re-check that one bit. + // + // The basic strategy is: r4 still contains the version of y's mantissa that + // we shifted down before adding it to x. And r1 contains more or less the + // original version of all of y, including the same mantissa. So if we shift + // r4 back up again and XOR it with r1, we clear all the bits that we've + // already checked, and leave only the ones we haven't. + + // Start by deliberately throwing away the low bit of r4, in case that + // corresponded to the bit we lost off the bottom of r5 in fadd_carry. This + // means we won't clear it in the XOR, and therefore, _will_ check it. 
+ lsrs r4, r4, #1 + + // Shift r4 back up by the same amount we shifted it down, and shift r1 to + // the corresponding position, so that we can XOR them. The most convenient + // way to do this is not to modify the variable shift count in r3, and + // compensate for it by selecting the shift of r1 appropriately. + // + // As it happens, we end up with the implicit leading 1 bit of the mantissa + // in bit 30 of the result - or rather, it would be if we'd set it, which in + // r1 we haven't, because that's still the whole original input float. + lsls r4, r4, r3 + lsls r1, r1, #7 + eors r1, r1, r4 + + // But r1 wasn't just the mantissa of y; it also had the exponent, and its + // leading bit was implicit. So the topmost two bits of r1 are useless: in r1 + // they're part of the exponent field. Exclude them from consideration. + // + // This doesn't lead to dropping any bit we really care about, because we're + // never interested in the actual leading 1 bit of y's mantissa for round-to- + // even purposes. Why not? Because we already know the round bit (the one + // just off the bottom of the output mantissa) is a 1, which must have come + // from y (it's too low down to come from x), and we only care about checking + // all the bits below _that_. So y's leading 1 must be at least as high up as + // the round bit, and therefore, isn't one of the bits we currently need to + // check. + lsls r1, r1, #2 + + // Now if all those bits are zero, we're rounding to even. If _not_, we're + // finished rounding, so go back to fadd_rounded to continue the main code + // path. + bne LOCAL_LABEL(fadd_rounded) + + // Clear the low bit of the output (rounding to even) and go back to the main + // code path. + movs r4, #1 + bics r0, r0, r4 + b LOCAL_LABEL(fadd_rounded) + +LOCAL_LABEL(fadd_naninf): + // We come here if at least one input is a NaN or infinity. If either or both + // inputs are NaN then we hand off to fnan2 which will propagate a NaN from + // the input. 
+ // + // On entry, we know r5 = 0x80000000 from the initial uncommon check. Also, + // we already extracted the exponents of x and y into r2 and r3. + asrs r4, r5, #7 // so r4 = 0xFF000000 + lsls r6, r0, #1 // r6 > r4 iff x is NaN + cmp r6, r4 + bhi LOCAL_LABEL(fadd_nan) + lsls r6, r1, #1 // r6 > r4 iff y is NaN + cmp r6, r4 + bhi LOCAL_LABEL(fadd_nan) + + // No NaNs, so we have at least one infinity. Almost all additions involving + // an infinity return the input infinity unchanged. The only exception is if + // there are two infinities that have opposite signs (which can happen even + // in fadd, since on this code path we haven't cross-jumped into fsub), + // where we return NaN. + cmp r2, r3 // at least one exponent is 0xFF, so if EQ, both are + beq LOCAL_LABEL(fadd_infinf) // and therefore we're adding infinity to infinity + + // With one infinity, we just find which register it's in, and return it. + cmp r2, #255 + beq LOCAL_LABEL(fadd_ret_exact) // just return x +LOCAL_LABEL(fadd_retb): // we reuse this code in the denormal handler + movs r0, r1 // otherwise, return y +LOCAL_LABEL(fadd_ret_exact): + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fadd_infinf): + // With two infinities, we must check their relative sign. If they're the + // same sign, we have no problem. + movs r4, r0 + eors r4, r4, r1 + bpl LOCAL_LABEL(fadd_ret_exact) // identical infinities, so just return one + + // But if we're adding two infinities of opposite sign, make a default quiet + // NaN and return that. + ldr r0, =0x7fc00000 + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fadd_nan): + bl SYMBOL_NAME(__compiler_rt_fnan2) + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fadd_zerodenorm_x): + // We come here if we found x was 0 or a denormal. We haven't set up r3 as + // the exponent of y yet. + lsrs r3, r3, #24 + + // Also, we checked for zero/denorm before checking for infinities and NaNs. + // We know x isn't an infinity or NaN, but we must check y. 
+ cmp r3, #255 + beq LOCAL_LABEL(fadd_naninf) + + // Fall through to the next section. This repeats a pointless check for x + // being NaN or infinity, but it would cost more cycles to branch round it. + +LOCAL_LABEL(fadd_zerodenorm_y): + // We come here if we found y was 0 or a denormal, but also by falling + // through from above. So we may not yet have checked x for infinity/NaN. But + // we have checked that y isn't. + cmp r2, #255 + beq LOCAL_LABEL(fadd_naninf) + + // Now at least one of x,y is zero or denormal, and neither is infinite or + // NaN. We haven't yet checked the signs and cross-jumped to fsub, but we can + // handle all the zero cases without having to: + // + // - if x = -y (including both being zero), return 0 of the appropriate sign + // - if x = 0, return y (including the case of same-signed zeroes) + // - if y = 0, return x + subs r6, r0, r1 // are x and y equal + cmp r6, r5 // except for opposite sign bits? (r5 = 0x80000000) + beq LOCAL_LABEL(fadd_diffsame) + lsls r6, r1, #1 // is y zero? + beq LOCAL_LABEL(fadd_ret_exact) // if so, return x + lsls r6, r0, #1 // is x zero? + beq LOCAL_LABEL(fadd_retb) // if so, return y + + // Now we've dealt with all the possibilities involving zeroes, so we have + // either one denormal or two denormals. These cases are harder, and we don't + // want to handle both signs at once, so check the signs and cross-branch + // into fsub if they're different. + movs r6, r1 + eors r6, r6, r0 + bpl LOCAL_LABEL(fadd_denorm) + eors r1, r1, r5 + bl LOCAL_LABEL(fsub_denorm) +LOCAL_LABEL(fadd_denorm): + // Sort the operands into magnitude order. Now we know they have the same + // sign, unsigned comparison is good enough for that. + subs r6, r0, r1 + bhs LOCAL_LABEL(fadd_denorm_noswap) + subs r0, r0, r6 + adds r1, r1, r6 +LOCAL_LABEL(fadd_denorm_noswap): + + // We know one exponent is 0, so check if the other is too. 
We do this by + // adding the two exponents together, achieving two things in one + // instruction: it gets the nonzero exponent (if any) into r2 (saving us + // swapping r2 with r3 in the sorting step above), and it sets Z if both were + // zero. + adds r2, r2, r3 + beq LOCAL_LABEL(fadd_denorm2) + + // Now exactly one operand is denormal, and it's y. We must go back to + // fadd_doadd with all the registers appropriately set up. + lsrs r6, r0, #23 // r6 == sign and exponent of x + lsls r4, r1, #8 // r4 == mantissa of y, with leading bit clear + lsls r0, r0, #8 + orrs r0, r0, r5 // set high bit on mantissa of x + subs r3, r2, #1 // denormals are shifted as if they had exponent 1 + b LOCAL_LABEL(fadd_doadd) + +LOCAL_LABEL(fadd_diffsame): + // Here we only support round-to-nearest mode, so the difference of two + // identical things always returns +0. + movs r0, #0 + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fadd_denorm2): + // Here, x,y are both denormal, and we know we're doing magnitude addition. + // So we can add the mantissas like ordinary integers, and if they carry into + // the exponent, that's still the correct answer. But we have to avoid adding + // two copies of the sign bit, so we clear that from y first. + bics r1, r1, r5 // clear sign bit of y + adds r0, r0, r1 // add mantissas + pop {r4,r5,r6,pc} END_COMPILERRT_FUNCTION(__addsf3) +DEFINE_COMPILERRT_THUMB_FUNCTION(__aeabi_frsub) + // Reversed subtraction, that is, compute y-x, where x is in r0 and y in r1. + // + // We could implement this by simply swapping r0 with r1. But the point of + // having a reversed-subtract in the first place is to avoid the caller + // having to do that, so if we do it ourselves, it wastes all the time they + // saved. So instead, on the fast path, we redo the sign check our own way + // and branch to fadd_magnitude or fsub_magnitude. 
+ + push {r4,r5,r6,lr} + + movs r5, #1 + lsls r5, r5, #31 // all cross-branches will expect to have r5 = 0x80000000 + + // Extract the exponents and test for uncommon values. Note that we do the + // zero/denormal tests the opposite way round from fsub, because we swap the + // operands before branching to the corresponding fsub code, so this way our + // first branch will enter fsub with the first of _its_ operands checked. + lsls r2, r0, #1 + lsls r3, r1, #1 + lsrs r3, r3, #24 + beq LOCAL_LABEL(frsb_zerodenorm_y) + lsrs r2, r2, #24 + beq LOCAL_LABEL(frsb_zerodenorm_x) + cmp r2, #255 + beq LOCAL_LABEL(frsb_naninf) + cmp r3, #255 + beq LOCAL_LABEL(frsb_naninf) + + // Decide which of fadd_magnitude and fsub_magnitude to branch to, and do so. + eors r0, r0, r5 + movs r4, r0 + eors r4, r4, r1 + bpl LOCAL_LABEL(frsb_add) + eors r1, r1, r5 + bl LOCAL_LABEL(fsub_magnitude) +LOCAL_LABEL(frsb_add): + bl LOCAL_LABEL(fadd_magnitude) + + // Any uncommon operands to frsub are handled by just swapping the two + // operands and going to fsub's handler. We're off the main fast path now, so + // there's no need to try to optimise it any harder. +LOCAL_LABEL(frsb_zerodenorm_y): + push {r0,r2} + push {r1,r3} + pop {r0,r2} + pop {r1,r3} + bl LOCAL_LABEL(fsub_zerodenorm_x) // we just swapped x and y, so now x is 0/denorm +LOCAL_LABEL(frsb_zerodenorm_x): + push {r0,r2} + push {r1,r3} + pop {r0,r2} + pop {r1,r3} + bl LOCAL_LABEL(fsub_zerodenorm_y) // similarly, now we know y is +LOCAL_LABEL(frsb_naninf): + push {r0,r2} + push {r1,r3} + pop {r0,r2} + pop {r1,r3} + bl LOCAL_LABEL(fsub_naninf) +END_COMPILERRT_FUNCTION(__aeabi_frsub) + +DEFINE_AEABI_FUNCTION_ALIAS(__aeabi_fsub, __subsf3) + +DEFINE_COMPILERRT_THUMB_FUNCTION(__subsf3) + // Main entry point for subtraction. + push {r4,r5,r6,lr} + + movs r5, #1 + lsls r5, r5, #31 + + // Extract the exponents into r2 and r3 and test for all uncommon values, + // similarly to fadd. 
+ lsls r2, r0, #1 + lsls r3, r1, #1 + lsrs r2, r2, #24 + beq LOCAL_LABEL(fsub_zerodenorm_x) + lsrs r3, r3, #24 + beq LOCAL_LABEL(fsub_zerodenorm_y) + cmp r2, #255 + beq LOCAL_LABEL(fsub_naninf) + cmp r3, #255 + beq LOCAL_LABEL(fsub_naninf) + + // Check the signs, and if they're unequal, cross-jump into fadd to do + // magnitude addition. (Now we've excluded NaNs, it's safe to flip the sign + // of y.) + movs r4, r0 + eors r4, r4, r1 + bmi LOCAL_LABEL(fsub_add) +LOCAL_LABEL(fsub_magnitude): + // If we get here, we're subtracting operands with equal signs (i.e. a + // magnitude subtraction). First thing to do is put operands in magnitude + // order, so that x >= y. However, if they are swapped, we must also negate + // both of them, since A - B = (-B) - (-A). + subs r4, r0, r1 + bhs LOCAL_LABEL(fsub_swapped) + eors r4, r4, r5 + subs r0, r0, r4 + adds r1, r1, r4 + // We must also swap the pre-extracted exponents here. + eors r2, r2, r3 + eors r3, r3, r2 + eors r2, r2, r3 +LOCAL_LABEL(fsub_swapped): + // Save the sign and exponent of the larger operand to use for the result (up + // to renormalisation), and calculate the exponent difference for shifting + // one mantissa relative to the other. + lsrs r6, r0, #23 + subs r3, r2, r3 + + // Shift the mantissas up to the top of the words. In the process we put y's + // shifted mantissa into a separate register, keeping the original for later + // reference. Also, although we set the leading bit of y, we _clear_ the + // leading bit of x, which is just as quick and saves us having to decrement + // the output exponent later to compensate. 
+ lsls r0, r0, #8 + lsls r4, r1, #8 + bics r0, r0, r5 + orrs r4, r4, r5 + +LOCAL_LABEL(fsub_dosub): // we may come back here after sorting out denorms + + // We get here with: + // Operands known to be numeric rather than zero/infinity/NaN; + // r0 = mantissa of larger operand (in top 24 bits, with high bit clear) + // r4 = mantissa of smaller operand (in top 24 bits, with high bit set) + // r1 = original smaller operand (up to maybe a sign flip) + // r6 = result sign/exponent (in low 9 bits) + // r2 = plain result exponent (in low 8 bits, i.e. r6 & 0xFF) + // r3 = exponent difference. + // + // Begin calculating the output mantissa by shifting y's mantissa right and + // subtracting. This may leave the mantissa too large by one, if the bits + // shifted out of y are nonzero. We correct this during rounding if + // necessary. + lsrs r4, r4, r3 + subs r5, r0, r4 + + // This may have cleared the high bit of the output mantissa, in which case + // we must renormalise. Our strategy is to split into three code paths, on + // two of which an awkward case is known not to arise: + // * no need to renormalise at all => underflow can't happen + // * shift up by exactly 1 bit + // * shift up by more than 1 bit => rounding can't happen (result is exact) + // + // First branch out of line for the first case, which we can detect because + // the N flag tells us whether the top mantissa bit is still set. + bpl LOCAL_LABEL(fsub_renormed) + + // Renormalise by one bit, and check the new top bit to see if we need to + // renormalise by more than that. + lsls r5, r5, #1 + bpl LOCAL_LABEL(fsub_renorm_big) // if new top bit still clear, renormalise by more + // Decrement both exponent registers (r6 with the sign, r2 without). We + // decrement r6 by 2 instead of 1, because now the output mantissa has the + // top bit set, so we must compensate when we put the sign and exponent back + // on. + // + // The extra decrement of r6 might carry into the sign bit. 
This doesn't + // matter on the fast path, because the leading bit in the mantissa will undo + // it. But we need to account for it in the underflow handler for this path. + subs r6, r6, #2 + subs r2, r2, #1 + // The decrement of the pure exponent value also doubles as a check for + // underflow, because we underflowed precisely if the exponent went to 0. + beq LOCAL_LABEL(fsub_underflow_1) +LOCAL_LABEL(fsub_renormed): + // Now we have the output mantissa in r5. It may or may not have the high bit + // set, depending on which branch of the code we've come through. But r6 has + // been adjusted appropriately, so that we can make a basically right output + // value (before rounding) by adding r6 << 23 to r5 >> 8. + // + // If any nonzero bits were shifted off the bottom of y, then the true value + // of the output mantissa might be slightly _less_ than the value in r5. + // However the maximum difference is about 2^{-7} ULP relative to the final + // result (because it's at most one ULP of the 32-bit output mantissa in r5). + // So it doesn't affect the result in round-to-nearest mode unless it puts us + // just below a rounding boundary, which means we can ignore it until the + // full round-to-even check. + lsls r6, r6, #23 // prepare sign and exponent + lsrs r0, r5, #8 // shift down, and put the round bit into C + bcs LOCAL_LABEL(fsub_round) // diverge based on round bit + // If the round bit shifted off the bottom of r5 was clear, then we're not + // rounding up, so we can make the output value and finish immediately. + adds r0, r0, r6 // reconstitute output value without rounding + pop {r4,r5,r6,pc} +LOCAL_LABEL(fsub_round): + // Otherwise, we're rounding, in three stages. First round up; then cheaply + // check the low bits of r5 (the 32-bit version of the mantissa) so that we + // can rule out round-to-even if any of those is nonzero; finally, in as few + // cases as possible, check the rest of y's mantissa to check for RTE fully. 
+ adcs r0, r0, r6 // reconstitute output value while rounding up + lsls r5, r5, #(32-7) // check first 7 guard bits + beq LOCAL_LABEL(fsub_check_rte) // if they're all 0, do the full check for RTE + pop {r4,r5,r6,pc} // otherwise we're done + +LOCAL_LABEL(fsub_add): + // Trampoline to cross-jump to fadd, because a 16-bit branch won't reach that + // far. Also a convenient place to flip y's sign, so we only have to do it + // once. + eors r1, r1, r5 // we know r5 = 0x80000000 + bl LOCAL_LABEL(fadd_magnitude) // clobbers lr, which doesn't matter + +LOCAL_LABEL(fsub_check_rte): + // Full check for round-to-even, in the same style as fadd_check_rte: r4 + // still contains the version of y's mantissa that we shifted down before + // subtracting from x, and r1 contains the original version of that mantissa. + // So if we shift r4 back up again and XOR it with r1, we clear all the bits + // that we've already checked, and leave only the ones we haven't. The only + // exception is the leading mantissa bit, which is implicit in r1, but this + // can never affect round-to-even, because if we rounded at all then the + // round bit must have come from y, so the leading bit of y is at the round + // bit or above, hence not one of the bits we're checking for RTE. + lsls r4, r4, r3 // undo the shift of y's mantissa + lsls r1, r1, #8 // shift y's original mantissa back to the same place + eors r1, r1, r4 // find any differences + lsls r1, r1, #1 // but ignore the leading mantissa bit + beq LOCAL_LABEL(fsub_rte) // if all bits now clear, we're rounding to even + + // If we're not RTEing, we must undo the simplistic rounding we've already + // done. (We incremented the result based on the belief that the shifted-off + // data started 0x80xxx, but it turns out that xxx is slightly negative, so + // actually we had 0x7Fyyy.) + subs r0, r0, #1 + pop {r4,r5,r6,pc} +LOCAL_LABEL(fsub_rte): + // Actually round to even, by clearing the low bit of the output.
+ movs r4, #1 + bics r0, r0, r4 + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fsub_renorm_big): + // Now we know that we must renormalise by at least 2 bits, which may also + // give a denormal or zero result. + // + // This means no rounding can possibly be needed: if the subtraction cleared + // the top two bits of the mantissa, it means we computed A-B and found it + // was less than A/2, so B > A/2, so the exponent difference was at most 1. + // Hence the result mantissa fits in 24 bits even before renormalisation, and + // the top bit is clear, so it fits in 23 bits, i.e. it is exact. + + // Detect an actual zero result, and go and return it. + beq LOCAL_LABEL(fsub_diffsame) + + // Renormalise by binary search. (16-bit Thumb has no CLZ instruction.) We'll + // accumulate the total exponent adjustment in r0. It starts at 1 rather than + // 0, because we've shifted the mantissa left by one bit already. + movs r0, #1 + + // If the top 16 bits of r5 are clear, shift up by 16 and adjust r0 to match. + lsrs r3, r5, #(32-16) + bne LOCAL_LABEL(fsub_denorm_noshift16) + lsls r5, r5, #16 + adds r0, r0, #16 +LOCAL_LABEL(fsub_denorm_noshift16): + // Same for 8 bits + lsrs r3, r5, #(32-8) + bne LOCAL_LABEL(fsub_denorm_noshift8) + lsls r5, r5, #8 + adds r0, r0, #8 +LOCAL_LABEL(fsub_denorm_noshift8): + // 4 bits + lsrs r3, r5, #(32-4) + bne LOCAL_LABEL(fsub_denorm_noshift4) + lsls r5, r5, #4 + adds r0, r0, #4 +LOCAL_LABEL(fsub_denorm_noshift4): + // 2 bits + lsrs r3, r5, #(32-2) + bne LOCAL_LABEL(fsub_denorm_noshift2) + lsls r5, r5, #2 + adds r0, r0, #2 +LOCAL_LABEL(fsub_denorm_noshift2): + // 1 bit + lsrs r3, r5, #(32-1) + bne LOCAL_LABEL(fsub_denorm_noshift1) + lsls r5, r5, #1 + adds r0, r0, #1 +LOCAL_LABEL(fsub_denorm_noshift1): + + // Update our two copies of the exponent (with sign in r6, without in r2). + subs r6, r6, r0 + subs r2, r2, r0 + // Shift the mantissa and exponent into the right places to combine them. 
+ lsls r4, r5, #1 // clear leading bit of mantissa + lsrs r0, r4, #9 // and shift it down + lsls r4, r6, #23 // shift sign and exponent up + adds r0, r0, r4 // put them together + // Check for underflow, which occurs if the output exponent is less than 1 + // (including having gone negative). + cmp r2, #1 + blt LOCAL_LABEL(fsub_underflow_2) + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fsub_diffsame): + // Here we only support round-to-nearest mode, so the difference of two + // identical things always returns +0. + movs r0, #0 + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fsub_underflow_1): + // We come here if renormalising by one bit reduced the output exponent to + // zero. In other words, the output value in x is denormal (hence exact) and + // wants shifting down by exactly 9 bits (8 bits of exponent plus the bit we + // already shifted it by), and then the sign bit putting back on. + // + // Also, before we get the sign bit from r6, we must add 1 to it, because of + // the possibility that decrementing it carried into the sign bit. + adds r6, r6, #1 // undo potential sign-flipping carry + lsrs r6, r6, #8 // isolate the sign bit + lsls r6, r6, #31 // and shift it up to the top + lsrs r0, r5, #9 // construct the output mantissa + orrs r0, r0, r6 // and combine with the sign bit + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fsub_underflow_2): + // We come here if multi-bit renormalisation found a denormal. The mantissa + // has its leading bit set at the top of r5, so it needs shifting down 8 bits + // to where it would be in a normalised number, and then further: if the + // output exponent is 0 (meaning the exponent just below a normalised number) + // then we shift one extra bit, if it's -1 then we shift two extra bits, and + // so on. So in total we shift down by 8 + (1 - exp) = 9 - exp. + rsbs r4, r6, #0 + adds r4, r4, #9 + lsrs r5, r5, r4 // shift mantissa into place + + // Extract the sign bit from r6 and combine it with that denormal. 
r6 could + // be 0 or could be negative, so we must add enough to it to make it reliably + // positive. Any offset that works is fine; we'll use 0xc0, which is the + // offset used by IEEE 754:1985 underflow intermediate values. + adds r6, r6, #0xc0 // rebias to correct sign bit + lsrs r6, r6, #8 // isolate the sign bit + lsls r0, r6, #31 // and shift it up to the top + adds r0, r0, r5 // combine with the denormalised mantissa + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fsub_naninf): + // We come here if at least one of x,y is a NaN or infinity. Their exponents + // are reliably always in r2 and r3 respectively. + // + // If either or both inputs are NaN then we hand off to fnan2, which will + // propagate a NaN from the input. Otherwise we sort out the infinities + // ourselves. + asrs r4, r5, #7 // so r4 = 0xFF000000 + lsls r6, r0, #1 // r6 > r4 iff x is NaN + cmp r6, r4 + bhi LOCAL_LABEL(fsub_nan) + lsls r6, r1, #1 // r6 > r4 iff y is NaN + cmp r6, r4 + bhi LOCAL_LABEL(fsub_nan) + + // No NaNs, so we have at least one infinity. Almost all subtractions give + // an infinite result: x if x is infinite, otherwise -y. The only exception is + // subtracting two infinities that have the same sign, where we return NaN. + cmp r2, r3 // at least one exponent is 0xFF, so if EQ, both are + beq LOCAL_LABEL(fsub_infinf) + + // If x is infinite and y is finite, return x. + cmp r2, #255 + beq LOCAL_LABEL(fsub_ret_exact) +LOCAL_LABEL(fsub_retminusy): + // If x is finite and y is infinite, return -y. + movs r0, r1 + eors r0, r0, r5 // negate y +LOCAL_LABEL(fsub_retx): +LOCAL_LABEL(fsub_ret_exact): + pop {r4,r5,r6,pc} +LOCAL_LABEL(fsub_infinf): + // With two infinities, we must check their relative sign. If they have + // opposite sign, we just return x (which is the one with the same sign as + // the output).
+ movs r4, r0 + eors r4, r4, r1 + bmi LOCAL_LABEL(fsub_ret_exact) + + // But if we're subtracting two infinities of the same sign, make a default + // quiet NaN and return that. + ldr r0, =0x7fc00000 + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fsub_nan): + bl SYMBOL_NAME(__compiler_rt_fnan2) + pop {r4,r5,r6,pc} + +LOCAL_LABEL(fsub_zerodenorm_x): + // We come here if we found x was 0 or a denormal. We haven't set up r3 as + // the exponent of y yet. + lsrs r3, r3, #24 + + // Also, we checked for zero/denorm before checking for infinities and NaNs. + // We know x isn't an infinity or NaN, but we must check y. + cmp r3, #255 + beq LOCAL_LABEL(fsub_naninf) + + // Fall through to the next section. This repeats a pointless check for x + // being NaN or infinity, but it would cost more cycles to branch round it. + +LOCAL_LABEL(fsub_zerodenorm_y): + // We come here if we found y was 0 or a denormal, but also by falling + // through from above. So we may not yet have checked x for infinity/NaN. But + // we have checked that y isn't. + cmp r2, #255 + beq LOCAL_LABEL(fsub_naninf) + + // Now at least one of x,y is zero or denormal, and neither is infinite or + // NaN. We haven't yet checked the signs and cross-jumped to fadd, but we can + // handle all the zero cases without having to: + // + // - if x = y (including identically signed zeroes), return +0 + // - if y = 0, return x (including the case of oppositely signed zeroes) + // - if x = 0 and y != 0, return -y + cmp r0, r1 // are x and y equal? + beq LOCAL_LABEL(fsub_diffsame) + lsls r6, r1, #1 // is y zero? + beq LOCAL_LABEL(fsub_retx) // if so, return x + lsls r6, r0, #1 // is x zero? + beq LOCAL_LABEL(fsub_retminusy) // if so, return -y + + // Now we've dealt with all the possibilities involving zeroes, so we have + // either one denormal or two denormals. These cases are harder, and we don't + // want to handle both signs at once, so check the signs and cross-branch + // into fadd if they're different.
+ movs r6, r1 + eors r6, r6, r0 + bpl LOCAL_LABEL(fsub_denorm) + eors r1, r1, r5 + bl LOCAL_LABEL(fadd_denorm) +LOCAL_LABEL(fsub_denorm): + // Sort the operands into magnitude order. Now we know they have the same + // sign, unsigned comparison is good enough for that. + subs r6, r0, r1 + bhs LOCAL_LABEL(fsub_denorm_noswap) + eors r6, r6, r5 // flip the signs in the process + subs r0, r0, r6 + adds r1, r1, r6 +LOCAL_LABEL(fsub_denorm_noswap): + + // We know one exponent is 0, so check if the other is too. We do this by + // adding the two exponents together, achieving two things in one + // instruction: it gets the nonzero exponent (if any) into r2 (saving us + // swapping r2 with r3 in the sorting step above), and it sets Z if both were + // zero. + adds r2, r2, r3 + beq LOCAL_LABEL(fsub_denorm2) + + // Now exactly one operand is denormal, and it's y. We must go back to + // fsub_dosub with all the registers appropriately set up. + lsrs r6, r0, #23 // r6 == sign and exponent of x + lsls r4, r1, #8 // r4 == mantissa of y, with leading bit clear + lsls r0, r0, #8 + bics r0, r0, r5 // clear high bit on mantissa of x + subs r3, r2, #1 // denormals are shifted as if they had exponent 1 + b LOCAL_LABEL(fsub_dosub) + +LOCAL_LABEL(fsub_denorm2): + // Here, x,y are both denormal, and we know we're doing magnitude subtraction. + // So we can subtract the mantissas like ordinary integers. But we have to + // avoid subtracting y's sign bit from x's.
+ bics r1, r1, r5 // clear sign bit of y + subs r0, r0, r1 // subtract mantissas + pop {r4,r5,r6,pc} +END_COMPILERRT_FUNCTION(__subsf3) + NO_EXEC_STACK_DIRECTIVE diff --git a/compiler-rt/lib/builtins/arm/fnan2.c b/compiler-rt/lib/builtins/arm/fnan2.c new file mode 100644 index 0000000000000..c2fbfa3974d6e --- /dev/null +++ b/compiler-rt/lib/builtins/arm/fnan2.c @@ -0,0 +1,37 @@ +//===-- fnan2.c - Handle single-precision NaN inputs to binary operation --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This helper function is available for use by single-precision float +// arithmetic implementations to handle propagating NaNs from the input +// operands to the output, in a way that matches Arm hardware FP. +// +// On input, a and b are floating-point numbers in IEEE 754 encoding, and at +// least one of them must be a NaN. The return value is the correct output NaN. +// +//===----------------------------------------------------------------------===// + +#include <stdint.h> + +uint32_t __compiler_rt_fnan2(uint32_t a, uint32_t b) { + // Make shifted-left copies of a and b to discard the sign bit. Then add 1 at + // the bit position where the quiet vs signalling bit ended up. This squashes + // all the signalling NaNs to the top of the range of 32-bit values, from + // 0xff800001 to 0xffffffff inclusive; meanwhile, all the quiet NaN values + // wrap round to the bottom, from 0 to 0x007fffff inclusive. So we can detect + // a signalling NaN by asking if it's greater than 0xff800000, and a quiet + // one by asking if it's less than 0x00800000. + uint32_t aadj = (a << 1) + 0x00800000; + uint32_t badj = (b << 1) + 0x00800000; + if (aadj > 0xff800000) // a is a signalling NaN? + return a | 0x00400000; // if so, return it with the quiet bit set + if (badj > 0xff800000) // b is a signalling NaN? + return b | 0x00400000; // if so, return it with the quiet bit set + if (aadj < 0x00800000) // a is a quiet NaN? + return a; // if so, return it + return b; // otherwise we expect b must be a quiet NaN +} diff --git a/compiler-rt/test/builtins/Unit/addsf3_test.c b/compiler-rt/test/builtins/Unit/addsf3_test.c new file mode 100644 index 0000000000000..f6ec215bbd724 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/addsf3_test.c @@ -0,0 +1,382 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_addsf3 + +#include "int_lib.h" +#include <inttypes.h> +#include <stdio.h> + +#include "fp_test.h" + +// By default this test uses compareResultF to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7fc00000. For the Thumb1 assembler FP implementation, which commits to a +// more detailed handling of NaNs, we tighten up the check and include some +// extra test cases specific to that NaN policy.
+#if __thumb__ && !__thumb2__ +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + +// Returns: a + b +COMPILER_RT_ABI float __addsf3(float a, float b); + +int test__addsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { + float a = fromRep32(a_rep), b = fromRep32(b_rep); + float x = __addsf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep32(x) == expected_rep; +#else + int ret = compareResultF(x, expected_rep); +#endif + + if (ret) { + printf("error in test__addsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 + ", expected %08" PRIx32 "\n", + a_rep, b_rep, toRep32(x), expected_rep); + } + return ret; +} + +int main() { + int status = 0; + + status |= test__addsf3(0x00000000, 0x00000000, 0x00000000); + status |= test__addsf3(0x00000000, 0x007fffff, 0x007fffff); + status |= test__addsf3(0x00000000, 0x3f800000, 0x3f800000); + status |= test__addsf3(0x00000000, 0x7f000000, 0x7f000000); + status |= test__addsf3(0x00000000, 0x7f800000, 0x7f800000); + status |= test__addsf3(0x00000000, 0x80000000, 0x00000000); + status |= test__addsf3(0x00000000, 0x807fffff, 0x807fffff); + status |= test__addsf3(0x00000000, 0x80800000, 0x80800000); + status |= test__addsf3(0x00000000, 0xff800000, 0xff800000); + status |= test__addsf3(0x00000001, 0x00000001, 0x00000002); + status |= test__addsf3(0x00000001, 0x3f7fffff, 0x3f7fffff); + status |= test__addsf3(0x00000001, 0x3f800000, 0x3f800000); + status |= test__addsf3(0x00000001, 0x3ffffffe, 0x3ffffffe); + status |= test__addsf3(0x00000001, 0x3fffffff, 0x3fffffff); + status |= test__addsf3(0x00000001, 0x7effffff, 0x7effffff); + status |= test__addsf3(0x00000001, 0x7f000000, 0x7f000000); + status |= test__addsf3(0x00000001, 0x7f7ffffe, 0x7f7ffffe); + status |= test__addsf3(0x00000001, 0x7f7fffff, 0x7f7fffff); + status |= test__addsf3(0x00000001, 0x80000001, 0x00000000); + status |= test__addsf3(0x00000002, 0x80000001, 0x00000001); + status |= test__addsf3(0x00000003, 0x00000000, 0x00000003); + status |= 
test__addsf3(0x00000003, 0x7f800000, 0x7f800000); + status |= test__addsf3(0x00000003, 0x80000000, 0x00000003); + status |= test__addsf3(0x00000003, 0x80000002, 0x00000001); + status |= test__addsf3(0x00000003, 0xc0a00000, 0xc0a00000); + status |= test__addsf3(0x00000003, 0xff000000, 0xff000000); + status |= test__addsf3(0x00000003, 0xff800000, 0xff800000); + status |= test__addsf3(0x00000004, 0x00000004, 0x00000008); + status |= test__addsf3(0x007ffffc, 0x807ffffc, 0x00000000); + status |= test__addsf3(0x007ffffd, 0x807ffffe, 0x80000001); + status |= test__addsf3(0x007fffff, 0x007fffff, 0x00fffffe); + status |= test__addsf3(0x007fffff, 0x807ffffe, 0x00000001); + status |= test__addsf3(0x007fffff, 0x80800000, 0x80000001); + status |= test__addsf3(0x00800000, 0x00000000, 0x00800000); + status |= test__addsf3(0x00800000, 0x00800000, 0x01000000); + status |= test__addsf3(0x00800000, 0x80800000, 0x00000000); + status |= test__addsf3(0x00800001, 0x80800000, 0x00000001); + status |= test__addsf3(0x00800001, 0x80800002, 0x80000001); + status |= test__addsf3(0x00ffffff, 0x81000000, 0x80000001); + status |= test__addsf3(0x00ffffff, 0x81000002, 0x80000005); + status |= test__addsf3(0x00ffffff, 0x81000004, 0x80000009); + status |= test__addsf3(0x01000000, 0x80ffffff, 0x00000001); + status |= test__addsf3(0x01000001, 0x80800001, 0x00800001); + status |= test__addsf3(0x01000001, 0x80ffffff, 0x00000003); + status |= test__addsf3(0x01000002, 0x80800001, 0x00800003); + status |= test__addsf3(0x017fffff, 0x81800000, 0x80000002); + status |= test__addsf3(0x01800000, 0x817fffff, 0x00000002); + status |= test__addsf3(0x01800001, 0x817fffff, 0x00000006); + status |= test__addsf3(0x01800002, 0x81000003, 0x01000001); + status |= test__addsf3(0x3f7fffff, 0x80000001, 0x3f7fffff); + status |= test__addsf3(0x3f800000, 0x3f800000, 0x40000000); + status |= test__addsf3(0x3f800000, 0x3f800003, 0x40000002); + status |= test__addsf3(0x3f800000, 0x40000000, 0x40400000); + status |= 
test__addsf3(0x3f800000, 0x40e00000, 0x41000000); + status |= test__addsf3(0x3f800000, 0x80000000, 0x3f800000); + status |= test__addsf3(0x3f800000, 0xbf800000, 0x00000000); + status |= test__addsf3(0x3f800001, 0x3f800000, 0x40000000); + status |= test__addsf3(0x3f800001, 0xbf800000, 0x34000000); + status |= test__addsf3(0x3f800001, 0xbf800002, 0xb4000000); + status |= test__addsf3(0x3ffffffc, 0xbffffffd, 0xb4000000); + status |= test__addsf3(0x3fffffff, 0xc0000000, 0xb4000000); + status |= test__addsf3(0x40000000, 0x34000000, 0x40000000); + status |= test__addsf3(0x40000000, 0x3f800000, 0x40400000); + status |= test__addsf3(0x40000000, 0x40000000, 0x40800000); + status |= test__addsf3(0x40000000, 0x40000001, 0x40800000); + status |= test__addsf3(0x40000000, 0xbfffffff, 0x34000000); + status |= test__addsf3(0x40000000, 0xc0000000, 0x00000000); + status |= test__addsf3(0x40000000, 0xc0000001, 0xb4800000); + status |= test__addsf3(0x40000000, 0xc0a00000, 0xc0400000); + status |= test__addsf3(0x40000001, 0x34000000, 0x40000002); + status |= test__addsf3(0x40000001, 0x40000002, 0x40800002); + status |= test__addsf3(0x40000001, 0xbf800001, 0x3f800001); + status |= test__addsf3(0x40000002, 0xbf800001, 0x3f800003); + status |= test__addsf3(0x40000002, 0xbf800003, 0x3f800001); + status |= test__addsf3(0x40000004, 0xc0000003, 0x34800000); + status |= test__addsf3(0x40400000, 0x40400000, 0x40c00000); + status |= test__addsf3(0x407fffff, 0x33ffffff, 0x407fffff); + status |= test__addsf3(0x407fffff, 0x34000000, 0x40800000); + status |= test__addsf3(0x407fffff, 0xc07ffffe, 0x34800000); + status |= test__addsf3(0x407fffff, 0xc0800002, 0xb5a00000); + status |= test__addsf3(0x40800001, 0xc07fffff, 0x35400000); + status |= test__addsf3(0x40a00000, 0x00000000, 0x40a00000); + status |= test__addsf3(0x40a00000, 0x80000000, 0x40a00000); + status |= test__addsf3(0x40a00000, 0xbf800000, 0x40800000); + status |= test__addsf3(0x40a00000, 0xc0a00000, 0x00000000); + status |= 
test__addsf3(0x7d800001, 0xfd7fffff, 0x72400000); + status |= test__addsf3(0x7e7fffff, 0xfe7ffffe, 0x72800000); + status |= test__addsf3(0x7e7fffff, 0xfe800002, 0xf3a00000); + status |= test__addsf3(0x7e800000, 0x7e800000, 0x7f000000); + status |= test__addsf3(0x7e800000, 0xfe7fffff, 0x72800000); + status |= test__addsf3(0x7e800000, 0xfe800001, 0xf3000000); + status |= test__addsf3(0x7e800001, 0x7e800000, 0x7f000000); + status |= test__addsf3(0x7e800001, 0xff000001, 0xfe800001); + status |= test__addsf3(0x7e800002, 0xfe000003, 0x7e000001); + status |= test__addsf3(0x7e800004, 0xfe800003, 0x73000000); + status |= test__addsf3(0x7efffffe, 0x7efffffe, 0x7f7ffffe); + status |= test__addsf3(0x7efffffe, 0x7effffff, 0x7f7ffffe); + status |= test__addsf3(0x7effffff, 0x3f800000, 0x7effffff); + status |= test__addsf3(0x7effffff, 0x7f000000, 0x7f800000); + status |= test__addsf3(0x7effffff, 0xbf800000, 0x7effffff); + status |= test__addsf3(0x7effffff, 0xff000000, 0xf3000000); + status |= test__addsf3(0x7f000000, 0x3f800000, 0x7f000000); + status |= test__addsf3(0x7f000000, 0x7f000000, 0x7f800000); + status |= test__addsf3(0x7f000000, 0x7f800000, 0x7f800000); + status |= test__addsf3(0x7f000000, 0xbf800000, 0x7f000000); + status |= test__addsf3(0x7f000000, 0xff000000, 0x00000000); + status |= test__addsf3(0x7f000000, 0xff800000, 0xff800000); + status |= test__addsf3(0x7f000001, 0x7f000000, 0x7f800000); + status |= test__addsf3(0x7f000001, 0xff000000, 0x73800000); + status |= test__addsf3(0x7f000001, 0xff000002, 0xf3800000); + status |= test__addsf3(0x7f000002, 0xfe800001, 0x7e800003); + status |= test__addsf3(0x7f7ffffe, 0x3f800000, 0x7f7ffffe); + status |= test__addsf3(0x7f7ffffe, 0x7f7ffffe, 0x7f800000); + status |= test__addsf3(0x7f7ffffe, 0x7f7fffff, 0x7f800000); + status |= test__addsf3(0x7f7ffffe, 0xbf800000, 0x7f7ffffe); + status |= test__addsf3(0x7f7ffffe, 0xff7fffff, 0xf3800000); + status |= test__addsf3(0x7f7fffff, 0x3f800000, 0x7f7fffff); + status |= 
test__addsf3(0x7f7fffff, 0x80000001, 0x7f7fffff); + status |= test__addsf3(0x7f7fffff, 0xbf800000, 0x7f7fffff); + status |= test__addsf3(0x7f7fffff, 0xff7fffff, 0x00000000); + status |= test__addsf3(0x7f800000, 0x00000000, 0x7f800000); + status |= test__addsf3(0x7f800000, 0x007fffff, 0x7f800000); + status |= test__addsf3(0x7f800000, 0x7f000000, 0x7f800000); + status |= test__addsf3(0x7f800000, 0x7f800000, 0x7f800000); + status |= test__addsf3(0x7f800000, 0x80000000, 0x7f800000); + status |= test__addsf3(0x7f800000, 0x807fffff, 0x7f800000); + status |= test__addsf3(0x7f800000, 0xff000000, 0x7f800000); + status |= test__addsf3(0x80000000, 0x00000000, 0x00000000); + status |= test__addsf3(0x80000000, 0x007fffff, 0x007fffff); + status |= test__addsf3(0x80000000, 0x7f000000, 0x7f000000); + status |= test__addsf3(0x80000000, 0x7f800000, 0x7f800000); + status |= test__addsf3(0x80000000, 0x80000000, 0x80000000); + status |= test__addsf3(0x80000000, 0x807fffff, 0x807fffff); + status |= test__addsf3(0x80000000, 0x80800000, 0x80800000); + status |= test__addsf3(0x80000000, 0xbf800000, 0xbf800000); + status |= test__addsf3(0x80000000, 0xff800000, 0xff800000); + status |= test__addsf3(0x80000001, 0x00000001, 0x00000000); + status |= test__addsf3(0x80000001, 0x80000001, 0x80000002); + status |= test__addsf3(0x80000001, 0xbf7fffff, 0xbf7fffff); + status |= test__addsf3(0x80000001, 0xbf800000, 0xbf800000); + status |= test__addsf3(0x80000001, 0xbffffffe, 0xbffffffe); + status |= test__addsf3(0x80000001, 0xbfffffff, 0xbfffffff); + status |= test__addsf3(0x80000001, 0xfeffffff, 0xfeffffff); + status |= test__addsf3(0x80000001, 0xff000000, 0xff000000); + status |= test__addsf3(0x80000001, 0xff7ffffe, 0xff7ffffe); + status |= test__addsf3(0x80000001, 0xff7fffff, 0xff7fffff); + status |= test__addsf3(0x80000002, 0x00000001, 0x80000001); + status |= test__addsf3(0x80000003, 0x00000000, 0x80000003); + status |= test__addsf3(0x80000003, 0x00000002, 0x80000001); + status |= 
test__addsf3(0x80000003, 0x40400000, 0x40400000); + status |= test__addsf3(0x80000003, 0x7f000000, 0x7f000000); + status |= test__addsf3(0x80000003, 0x7f800000, 0x7f800000); + status |= test__addsf3(0x80000003, 0x80000000, 0x80000003); + status |= test__addsf3(0x80000003, 0xff800000, 0xff800000); + status |= test__addsf3(0x80000004, 0x80000004, 0x80000008); + status |= test__addsf3(0x807ffffd, 0x007ffffe, 0x00000001); + status |= test__addsf3(0x807fffff, 0x007ffffe, 0x80000001); + status |= test__addsf3(0x807fffff, 0x007fffff, 0x00000000); + status |= test__addsf3(0x807fffff, 0x00800000, 0x00000001); + status |= test__addsf3(0x807fffff, 0x807fffff, 0x80fffffe); + status |= test__addsf3(0x80800000, 0x00000000, 0x80800000); + status |= test__addsf3(0x80800000, 0x00800000, 0x00000000); + status |= test__addsf3(0x80800001, 0x00800000, 0x80000001); + status |= test__addsf3(0x80800001, 0x00800002, 0x00000001); + status |= test__addsf3(0x80ffffff, 0x01000000, 0x00000001); + status |= test__addsf3(0x80ffffff, 0x01000002, 0x00000005); + status |= test__addsf3(0x80ffffff, 0x01000004, 0x00000009); + status |= test__addsf3(0x81000000, 0x00ffffff, 0x80000001); + status |= test__addsf3(0x81000001, 0x00800001, 0x80800001); + status |= test__addsf3(0x81000001, 0x00ffffff, 0x80000003); + status |= test__addsf3(0x81000002, 0x00800001, 0x80800003); + status |= test__addsf3(0x817fffff, 0x01800000, 0x00000002); + status |= test__addsf3(0x81800000, 0x017fffff, 0x80000002); + status |= test__addsf3(0x81800001, 0x017fffff, 0x80000006); + status |= test__addsf3(0x81800002, 0x01000003, 0x81000001); + status |= test__addsf3(0xbf800000, 0x80000000, 0xbf800000); + status |= test__addsf3(0xbf800000, 0xbf800003, 0xc0000002); + status |= test__addsf3(0xbf800001, 0x3f800000, 0xb4000000); + status |= test__addsf3(0xbf800001, 0x3f800002, 0x34000000); + status |= test__addsf3(0xbf800001, 0xbf800000, 0xc0000000); + status |= test__addsf3(0xbffffffc, 0x3ffffffd, 0x34000000); + status |= 
test__addsf3(0xbfffffff, 0x00000001, 0xbfffffff); + status |= test__addsf3(0xbfffffff, 0x40000000, 0x34000000); + status |= test__addsf3(0xc0000000, 0x3fffffff, 0xb4000000); + status |= test__addsf3(0xc0000000, 0x40000001, 0x34800000); + status |= test__addsf3(0xc0000000, 0xc0000001, 0xc0800000); + status |= test__addsf3(0xc0000001, 0x3f800001, 0xbf800001); + status |= test__addsf3(0xc0000001, 0xc0000002, 0xc0800002); + status |= test__addsf3(0xc0000002, 0x3f800001, 0xbf800003); + status |= test__addsf3(0xc0000002, 0x3f800003, 0xbf800001); + status |= test__addsf3(0xc0000004, 0x40000003, 0xb4800000); + status |= test__addsf3(0xc0400000, 0x40400000, 0x00000000); + status |= test__addsf3(0xc07fffff, 0x407ffffe, 0xb4800000); + status |= test__addsf3(0xc07fffff, 0x40800002, 0x35a00000); + status |= test__addsf3(0xc07fffff, 0xb3ffffff, 0xc07fffff); + status |= test__addsf3(0xc07fffff, 0xb4000000, 0xc0800000); + status |= test__addsf3(0xc0800001, 0x407fffff, 0xb5400000); + status |= test__addsf3(0xfd800001, 0x7d7fffff, 0xf2400000); + status |= test__addsf3(0xfe7fffff, 0x7e7ffffe, 0xf2800000); + status |= test__addsf3(0xfe7fffff, 0x7e800002, 0x73a00000); + status |= test__addsf3(0xfe800000, 0x7e7fffff, 0xf2800000); + status |= test__addsf3(0xfe800000, 0x7e800001, 0x73000000); + status |= test__addsf3(0xfe800001, 0x7f000001, 0x7e800001); + status |= test__addsf3(0xfe800001, 0xfe800000, 0xff000000); + status |= test__addsf3(0xfe800002, 0x7e000003, 0xfe000001); + status |= test__addsf3(0xfe800004, 0x7e800003, 0xf3000000); + status |= test__addsf3(0xfefffffe, 0x7efffffe, 0x00000000); + status |= test__addsf3(0xfefffffe, 0xfefffffe, 0xff7ffffe); + status |= test__addsf3(0xfefffffe, 0xfeffffff, 0xff7ffffe); + status |= test__addsf3(0xfeffffff, 0x3f800000, 0xfeffffff); + status |= test__addsf3(0xfeffffff, 0x7f000000, 0x73000000); + status |= test__addsf3(0xfeffffff, 0xbf800000, 0xfeffffff); + status |= test__addsf3(0xfeffffff, 0xff000000, 0xff800000); + status |= 
test__addsf3(0xff000000, 0x00000000, 0xff000000); + status |= test__addsf3(0xff000000, 0x3f800000, 0xff000000); + status |= test__addsf3(0xff000000, 0x7f800000, 0x7f800000); + status |= test__addsf3(0xff000000, 0x80000000, 0xff000000); + status |= test__addsf3(0xff000000, 0xbf800000, 0xff000000); + status |= test__addsf3(0xff000000, 0xff000000, 0xff800000); + status |= test__addsf3(0xff000000, 0xff800000, 0xff800000); + status |= test__addsf3(0xff000001, 0x7f000000, 0xf3800000); + status |= test__addsf3(0xff000001, 0x7f000002, 0x73800000); + status |= test__addsf3(0xff000001, 0xff000000, 0xff800000); + status |= test__addsf3(0xff000002, 0x7e800001, 0xfe800003); + status |= test__addsf3(0xff7ffffe, 0x3f800000, 0xff7ffffe); + status |= test__addsf3(0xff7ffffe, 0x7f7fffff, 0x73800000); + status |= test__addsf3(0xff7ffffe, 0xbf800000, 0xff7ffffe); + status |= test__addsf3(0xff7ffffe, 0xff7ffffe, 0xff800000); + status |= test__addsf3(0xff7ffffe, 0xff7fffff, 0xff800000); + status |= test__addsf3(0xff7fffff, 0x00000001, 0xff7fffff); + status |= test__addsf3(0xff7fffff, 0x3f800000, 0xff7fffff); + status |= test__addsf3(0xff7fffff, 0xbf800000, 0xff7fffff); + status |= test__addsf3(0xff800000, 0x00000000, 0xff800000); + status |= test__addsf3(0xff800000, 0x007fffff, 0xff800000); + status |= test__addsf3(0xff800000, 0x7f000000, 0xff800000); + status |= test__addsf3(0xff800000, 0x80000000, 0xff800000); + status |= test__addsf3(0xff800000, 0x807fffff, 0xff800000); + status |= test__addsf3(0xff800000, 0xff000000, 0xff800000); + status |= test__addsf3(0xff800000, 0xff800000, 0xff800000); + status |= test__addsf3(0x7f7fffff, 0x74ffffff, 0x7f800000); + status |= test__addsf3(0x3f7fffff, 0x34004000, 0x3f800001); + status |= test__addsf3(0x3f800001, 0x23800000, 0x3f800001); + status |= test__addsf3(0xbbebe66d, 0x3b267c1f, 0xbb98a85e); + status |= test__addsf3(0x01f5b166, 0x81339a37, 0x019be44a); + + // Test that the result of an operation is a NaN at all when it should be. 
+ // + // In most configurations these tests' results are checked using + // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000, + // which causes compareResultF to accept any NaN encoding. We also use the + // same value as the input NaN in tests that have one, so that even in + // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is + // still the exact expected NaN. + status |= test__addsf3(0x7f800000, 0xff800000, 0x7fc00000); + status |= test__addsf3(0xff800000, 0x7f800000, 0x7fc00000); + status |= test__addsf3(0x3f800000, 0x7fc00000, 0x7fc00000); + status |= test__addsf3(0x7fc00000, 0x3f800000, 0x7fc00000); + status |= test__addsf3(0x7fc00000, 0x7fc00000, 0x7fc00000); + +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by + // arm/addsf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7fc00000.
+ + status |= test__addsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); + status |= test__addsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); + status |= test__addsf3(0x00000001, 0x7f970eba, 0x7fd70eba); + status |= test__addsf3(0x00000001, 0x7fc35716, 0x7fc35716); + status |= test__addsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); + status |= test__addsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); + status |= test__addsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); + status |= test__addsf3(0x3f800000, 0x7fc50124, 0x7fc50124); + status |= test__addsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); + status |= test__addsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); + status |= test__addsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); + status |= test__addsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); + status |= test__addsf3(0x7f800000, 0xff800000, 0x7fc00000); + status |= test__addsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); + status |= test__addsf3(0x7f93541e, 0x00000001, 0x7fd3541e); + status |= test__addsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); + status |= test__addsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); + status |= test__addsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); + status |= test__addsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); + status |= test__addsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); + status |= test__addsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); + status |= test__addsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); + status |= test__addsf3(0x7f951a78, 0x80000001, 0x7fd51a78); + status |= test__addsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); + status |= test__addsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); + status |= test__addsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); + status |= test__addsf3(0x7f90886e, 0xff800000, 0x7fd0886e); + status |= test__addsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); + status |= test__addsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); + status |= test__addsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); + status |= test__addsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); + status |= test__addsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); + status |= 
test__addsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); + status |= test__addsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); + status |= test__addsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); + status |= test__addsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); + status |= test__addsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); + status |= test__addsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); + status |= test__addsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); + status |= test__addsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); + status |= test__addsf3(0x7fc55329, 0xff800000, 0x7fc55329); + status |= test__addsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); + status |= test__addsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); + status |= test__addsf3(0x80000001, 0x7f98827d, 0x7fd8827d); + status |= test__addsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); + status |= test__addsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); + status |= test__addsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); + status |= test__addsf3(0xbf800000, 0x7fa95487, 0x7fe95487); + status |= test__addsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); + status |= test__addsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); + status |= test__addsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); + status |= test__addsf3(0xff800000, 0x7f800000, 0x7fc00000); + status |= test__addsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); + status |= test__addsf3(0xff800000, 0x7fde0397, 0x7fde0397); +#endif // ARM_NAN_HANDLING + + return status; +} diff --git a/compiler-rt/test/builtins/Unit/subsf3_test.c b/compiler-rt/test/builtins/Unit/subsf3_test.c new file mode 100644 index 0000000000000..9cdcddbb905fa --- /dev/null +++ b/compiler-rt/test/builtins/Unit/subsf3_test.c @@ -0,0 +1,380 @@ +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +// RUN: %clang_builtins %s %librt -o %t && %run %t +// REQUIRES: librt_has_addsf3 + +#include "int_lib.h" +#include +#include + +#include "fp_test.h" + +// By default this test uses compareResultF to check the returned floats, which +// accepts any returned NaN if the expected result is the canonical NaN value +// 0x7fc00000. For the Thumb1 optimized FP implementation, which commits to a +// more detailed handling of NaNs, we tighten up the check and include some +// extra test cases specific to that NaN policy. +#if __thumb__ && !__thumb2__ +# define EXPECT_EXACT_RESULTS +# define ARM_NAN_HANDLING +#endif + +// Returns: a - b +COMPILER_RT_ABI float __subsf3(float a, float b); + +int test__subsf3(uint32_t a_rep, uint32_t b_rep, uint32_t expected_rep) { + float a = fromRep32(a_rep), b = fromRep32(b_rep); + float x = __subsf3(a, b); +#ifdef EXPECT_EXACT_RESULTS + int ret = toRep32(x) == expected_rep; +#else + int ret = compareResultF(x, expected_rep); +#endif + + if (ret) { + printf("error in test__subsf3(%08" PRIx32 ", %08" PRIx32 ") = %08" PRIx32 + ", expected %08" PRIx32 "\n", + a_rep, b_rep, toRep32(x), expected_rep); + } + return ret; +} + +int main() { + int status = 0; + + status |= test__subsf3(0x00000000, 0x00000000, 0x00000000); + status |= test__subsf3(0x00000000, 0x007fffff, 0x807fffff); + status |= test__subsf3(0x00000000, 0x00800000, 0x80800000); + status |= test__subsf3(0x00000000, 0x7f800000, 0xff800000); + status |= test__subsf3(0x00000000, 0x80000000, 0x00000000); + status |= test__subsf3(0x00000000, 0x807fffff, 0x007fffff); + status |= test__subsf3(0x00000000, 0xbf800000, 0x3f800000); + status |= test__subsf3(0x00000000, 0xff000000, 0x7f000000); + status |= test__subsf3(0x00000000, 0xff800000, 0x7f800000); + status |= test__subsf3(0x00000001, 0x00000001, 0x00000000); + status |= test__subsf3(0x00000001, 0x80000001, 0x00000002); + status |= test__subsf3(0x00000001, 0xbf7fffff, 
0x3f7fffff); + status |= test__subsf3(0x00000001, 0xbf800000, 0x3f800000); + status |= test__subsf3(0x00000001, 0xbffffffe, 0x3ffffffe); + status |= test__subsf3(0x00000001, 0xbfffffff, 0x3fffffff); + status |= test__subsf3(0x00000001, 0xfeffffff, 0x7effffff); + status |= test__subsf3(0x00000001, 0xff000000, 0x7f000000); + status |= test__subsf3(0x00000001, 0xff7ffffe, 0x7f7ffffe); + status |= test__subsf3(0x00000001, 0xff7fffff, 0x7f7fffff); + status |= test__subsf3(0x00000002, 0x00000001, 0x00000001); + status |= test__subsf3(0x00000003, 0x00000000, 0x00000003); + status |= test__subsf3(0x00000003, 0x00000002, 0x00000001); + status |= test__subsf3(0x00000003, 0x40a00000, 0xc0a00000); + status |= test__subsf3(0x00000003, 0x7f000000, 0xff000000); + status |= test__subsf3(0x00000003, 0x7f800000, 0xff800000); + status |= test__subsf3(0x00000003, 0x80000000, 0x00000003); + status |= test__subsf3(0x00000003, 0xff800000, 0x7f800000); + status |= test__subsf3(0x00000004, 0x80000004, 0x00000008); + status |= test__subsf3(0x007ffffc, 0x007ffffc, 0x00000000); + status |= test__subsf3(0x007ffffd, 0x007ffffe, 0x80000001); + status |= test__subsf3(0x007fffff, 0x007ffffe, 0x00000001); + status |= test__subsf3(0x007fffff, 0x00800000, 0x80000001); + status |= test__subsf3(0x007fffff, 0x807fffff, 0x00fffffe); + status |= test__subsf3(0x00800000, 0x00800000, 0x00000000); + status |= test__subsf3(0x00800000, 0x80000000, 0x00800000); + status |= test__subsf3(0x00800000, 0x80800000, 0x01000000); + status |= test__subsf3(0x00800001, 0x00800000, 0x00000001); + status |= test__subsf3(0x00800001, 0x00800002, 0x80000001); + status |= test__subsf3(0x00ffffff, 0x01000000, 0x80000001); + status |= test__subsf3(0x00ffffff, 0x01000002, 0x80000005); + status |= test__subsf3(0x00ffffff, 0x01000004, 0x80000009); + status |= test__subsf3(0x01000000, 0x00ffffff, 0x00000001); + status |= test__subsf3(0x01000001, 0x00800001, 0x00800001); + status |= test__subsf3(0x01000001, 0x00ffffff, 0x00000003); + 
status |= test__subsf3(0x01000002, 0x00800001, 0x00800003); + status |= test__subsf3(0x017fffff, 0x01800000, 0x80000002); + status |= test__subsf3(0x01800000, 0x017fffff, 0x00000002); + status |= test__subsf3(0x01800001, 0x017fffff, 0x00000006); + status |= test__subsf3(0x01800002, 0x01000003, 0x01000001); + status |= test__subsf3(0x3f7fffff, 0x00000001, 0x3f7fffff); + status |= test__subsf3(0x3f800000, 0x00000000, 0x3f800000); + status |= test__subsf3(0x3f800000, 0x3f800000, 0x00000000); + status |= test__subsf3(0x3f800000, 0xbf800000, 0x40000000); + status |= test__subsf3(0x3f800000, 0xbf800003, 0x40000002); + status |= test__subsf3(0x3f800000, 0xc0000000, 0x40400000); + status |= test__subsf3(0x3f800000, 0xc0e00000, 0x41000000); + status |= test__subsf3(0x3f800001, 0x3f800000, 0x34000000); + status |= test__subsf3(0x3f800001, 0x3f800002, 0xb4000000); + status |= test__subsf3(0x3f800001, 0xbf800000, 0x40000000); + status |= test__subsf3(0x3ffffffc, 0x3ffffffd, 0xb4000000); + status |= test__subsf3(0x3fffffff, 0x40000000, 0xb4000000); + status |= test__subsf3(0x40000000, 0x3fffffff, 0x34000000); + status |= test__subsf3(0x40000000, 0x40000000, 0x00000000); + status |= test__subsf3(0x40000000, 0x40000001, 0xb4800000); + status |= test__subsf3(0x40000000, 0x40a00000, 0xc0400000); + status |= test__subsf3(0x40000000, 0xb4000000, 0x40000000); + status |= test__subsf3(0x40000000, 0xbf800000, 0x40400000); + status |= test__subsf3(0x40000000, 0xc0000000, 0x40800000); + status |= test__subsf3(0x40000000, 0xc0000001, 0x40800000); + status |= test__subsf3(0x40000001, 0x3f800001, 0x3f800001); + status |= test__subsf3(0x40000001, 0xb4000000, 0x40000002); + status |= test__subsf3(0x40000001, 0xc0000002, 0x40800002); + status |= test__subsf3(0x40000002, 0x3f800001, 0x3f800003); + status |= test__subsf3(0x40000002, 0x3f800003, 0x3f800001); + status |= test__subsf3(0x40000004, 0x40000003, 0x34800000); + status |= test__subsf3(0x40400000, 0xc0400000, 0x40c00000); + status |= 
test__subsf3(0x407fffff, 0x407ffffe, 0x34800000); + status |= test__subsf3(0x407fffff, 0x40800002, 0xb5a00000); + status |= test__subsf3(0x407fffff, 0xb3ffffff, 0x407fffff); + status |= test__subsf3(0x407fffff, 0xb4000000, 0x40800000); + status |= test__subsf3(0x40800001, 0x407fffff, 0x35400000); + status |= test__subsf3(0x40a00000, 0x00000000, 0x40a00000); + status |= test__subsf3(0x40a00000, 0x3f800000, 0x40800000); + status |= test__subsf3(0x40a00000, 0x40a00000, 0x00000000); + status |= test__subsf3(0x40a00000, 0x80000000, 0x40a00000); + status |= test__subsf3(0x7d800001, 0x7d7fffff, 0x72400000); + status |= test__subsf3(0x7e7fffff, 0x7e7ffffe, 0x72800000); + status |= test__subsf3(0x7e7fffff, 0x7e800002, 0xf3a00000); + status |= test__subsf3(0x7e800000, 0x7e7fffff, 0x72800000); + status |= test__subsf3(0x7e800000, 0x7e800001, 0xf3000000); + status |= test__subsf3(0x7e800000, 0xfe800000, 0x7f000000); + status |= test__subsf3(0x7e800001, 0x7f000001, 0xfe800001); + status |= test__subsf3(0x7e800001, 0xfe800000, 0x7f000000); + status |= test__subsf3(0x7e800002, 0x7e000003, 0x7e000001); + status |= test__subsf3(0x7e800004, 0x7e800003, 0x73000000); + status |= test__subsf3(0x7efffffe, 0xfefffffe, 0x7f7ffffe); + status |= test__subsf3(0x7efffffe, 0xfeffffff, 0x7f7ffffe); + status |= test__subsf3(0x7effffff, 0x3f800000, 0x7effffff); + status |= test__subsf3(0x7effffff, 0x7f000000, 0xf3000000); + status |= test__subsf3(0x7effffff, 0xbf800000, 0x7effffff); + status |= test__subsf3(0x7effffff, 0xff000000, 0x7f800000); + status |= test__subsf3(0x7f000000, 0x3f800000, 0x7f000000); + status |= test__subsf3(0x7f000000, 0x7f000000, 0x00000000); + status |= test__subsf3(0x7f000000, 0x7f800000, 0xff800000); + status |= test__subsf3(0x7f000000, 0xbf800000, 0x7f000000); + status |= test__subsf3(0x7f000000, 0xff000000, 0x7f800000); + status |= test__subsf3(0x7f000000, 0xff800000, 0x7f800000); + status |= test__subsf3(0x7f000001, 0x7f000000, 0x73800000); + status |= 
test__subsf3(0x7f000001, 0x7f000002, 0xf3800000); + status |= test__subsf3(0x7f000001, 0xff000000, 0x7f800000); + status |= test__subsf3(0x7f000002, 0x7e800001, 0x7e800003); + status |= test__subsf3(0x7f7ffffe, 0x3f800000, 0x7f7ffffe); + status |= test__subsf3(0x7f7ffffe, 0x7f7fffff, 0xf3800000); + status |= test__subsf3(0x7f7ffffe, 0xbf800000, 0x7f7ffffe); + status |= test__subsf3(0x7f7ffffe, 0xff7ffffe, 0x7f800000); + status |= test__subsf3(0x7f7ffffe, 0xff7fffff, 0x7f800000); + status |= test__subsf3(0x7f7fffff, 0x00000001, 0x7f7fffff); + status |= test__subsf3(0x7f7fffff, 0x3f800000, 0x7f7fffff); + status |= test__subsf3(0x7f7fffff, 0x7f7fffff, 0x00000000); + status |= test__subsf3(0x7f7fffff, 0xbf800000, 0x7f7fffff); + status |= test__subsf3(0x7f800000, 0x00000000, 0x7f800000); + status |= test__subsf3(0x7f800000, 0x007fffff, 0x7f800000); + status |= test__subsf3(0x7f800000, 0x7f000000, 0x7f800000); + status |= test__subsf3(0x7f800000, 0x80000000, 0x7f800000); + status |= test__subsf3(0x7f800000, 0x807fffff, 0x7f800000); + status |= test__subsf3(0x7f800000, 0xff000000, 0x7f800000); + status |= test__subsf3(0x7f800000, 0xff800000, 0x7f800000); + status |= test__subsf3(0x80000000, 0x00000000, 0x80000000); + status |= test__subsf3(0x80000000, 0x007fffff, 0x807fffff); + status |= test__subsf3(0x80000000, 0x00800000, 0x80800000); + status |= test__subsf3(0x80000000, 0x3f800000, 0xbf800000); + status |= test__subsf3(0x80000000, 0x7f800000, 0xff800000); + status |= test__subsf3(0x80000000, 0x80000000, 0x00000000); + status |= test__subsf3(0x80000000, 0x807fffff, 0x007fffff); + status |= test__subsf3(0x80000000, 0xff000000, 0x7f000000); + status |= test__subsf3(0x80000000, 0xff800000, 0x7f800000); + status |= test__subsf3(0x80000001, 0x00000001, 0x80000002); + status |= test__subsf3(0x80000001, 0x3f7fffff, 0xbf7fffff); + status |= test__subsf3(0x80000001, 0x3f800000, 0xbf800000); + status |= test__subsf3(0x80000001, 0x3ffffffe, 0xbffffffe); + status |= 
test__subsf3(0x80000001, 0x3fffffff, 0xbfffffff); + status |= test__subsf3(0x80000001, 0x7effffff, 0xfeffffff); + status |= test__subsf3(0x80000001, 0x7f000000, 0xff000000); + status |= test__subsf3(0x80000001, 0x7f7ffffe, 0xff7ffffe); + status |= test__subsf3(0x80000001, 0x7f7fffff, 0xff7fffff); + status |= test__subsf3(0x80000001, 0x80000001, 0x00000000); + status |= test__subsf3(0x80000002, 0x80000001, 0x80000001); + status |= test__subsf3(0x80000003, 0x00000000, 0x80000003); + status |= test__subsf3(0x80000003, 0x7f800000, 0xff800000); + status |= test__subsf3(0x80000003, 0x80000000, 0x80000003); + status |= test__subsf3(0x80000003, 0x80000002, 0x80000001); + status |= test__subsf3(0x80000003, 0xc0400000, 0x40400000); + status |= test__subsf3(0x80000003, 0xff000000, 0x7f000000); + status |= test__subsf3(0x80000003, 0xff800000, 0x7f800000); + status |= test__subsf3(0x80000004, 0x00000004, 0x80000008); + status |= test__subsf3(0x807ffffd, 0x807ffffe, 0x00000001); + status |= test__subsf3(0x807fffff, 0x007fffff, 0x80fffffe); + status |= test__subsf3(0x807fffff, 0x807ffffe, 0x80000001); + status |= test__subsf3(0x807fffff, 0x807fffff, 0x00000000); + status |= test__subsf3(0x807fffff, 0x80800000, 0x00000001); + status |= test__subsf3(0x80800000, 0x80000000, 0x80800000); + status |= test__subsf3(0x80800000, 0x80800000, 0x00000000); + status |= test__subsf3(0x80800001, 0x80800000, 0x80000001); + status |= test__subsf3(0x80800001, 0x80800002, 0x00000001); + status |= test__subsf3(0x80ffffff, 0x81000000, 0x00000001); + status |= test__subsf3(0x80ffffff, 0x81000002, 0x00000005); + status |= test__subsf3(0x80ffffff, 0x81000004, 0x00000009); + status |= test__subsf3(0x81000000, 0x80ffffff, 0x80000001); + status |= test__subsf3(0x81000001, 0x80800001, 0x80800001); + status |= test__subsf3(0x81000001, 0x80ffffff, 0x80000003); + status |= test__subsf3(0x81000002, 0x80800001, 0x80800003); + status |= test__subsf3(0x817fffff, 0x81800000, 0x00000002); + status |= 
test__subsf3(0x81800000, 0x817fffff, 0x80000002); + status |= test__subsf3(0x81800001, 0x817fffff, 0x80000006); + status |= test__subsf3(0x81800002, 0x81000003, 0x81000001); + status |= test__subsf3(0xbf800000, 0x00000000, 0xbf800000); + status |= test__subsf3(0xbf800000, 0x3f800003, 0xc0000002); + status |= test__subsf3(0xbf800001, 0x3f800000, 0xc0000000); + status |= test__subsf3(0xbf800001, 0xbf800000, 0xb4000000); + status |= test__subsf3(0xbf800001, 0xbf800002, 0x34000000); + status |= test__subsf3(0xbffffffc, 0xbffffffd, 0x34000000); + status |= test__subsf3(0xbfffffff, 0x80000001, 0xbfffffff); + status |= test__subsf3(0xbfffffff, 0xc0000000, 0x34000000); + status |= test__subsf3(0xc0000000, 0x40000001, 0xc0800000); + status |= test__subsf3(0xc0000000, 0xbfffffff, 0xb4000000); + status |= test__subsf3(0xc0000000, 0xc0000001, 0x34800000); + status |= test__subsf3(0xc0000001, 0x40000002, 0xc0800002); + status |= test__subsf3(0xc0000001, 0xbf800001, 0xbf800001); + status |= test__subsf3(0xc0000002, 0xbf800001, 0xbf800003); + status |= test__subsf3(0xc0000002, 0xbf800003, 0xbf800001); + status |= test__subsf3(0xc0000004, 0xc0000003, 0xb4800000); + status |= test__subsf3(0xc0400000, 0xc0400000, 0x00000000); + status |= test__subsf3(0xc07fffff, 0x33ffffff, 0xc07fffff); + status |= test__subsf3(0xc07fffff, 0x34000000, 0xc0800000); + status |= test__subsf3(0xc07fffff, 0xc07ffffe, 0xb4800000); + status |= test__subsf3(0xc07fffff, 0xc0800002, 0x35a00000); + status |= test__subsf3(0xc0800001, 0xc07fffff, 0xb5400000); + status |= test__subsf3(0xfd800001, 0xfd7fffff, 0xf2400000); + status |= test__subsf3(0xfe7fffff, 0xfe7ffffe, 0xf2800000); + status |= test__subsf3(0xfe7fffff, 0xfe800002, 0x73a00000); + status |= test__subsf3(0xfe800000, 0xfe7fffff, 0xf2800000); + status |= test__subsf3(0xfe800000, 0xfe800001, 0x73000000); + status |= test__subsf3(0xfe800001, 0x7e800000, 0xff000000); + status |= test__subsf3(0xfe800001, 0xff000001, 0x7e800001); + status |= 
test__subsf3(0xfe800002, 0xfe000003, 0xfe000001); + status |= test__subsf3(0xfe800004, 0xfe800003, 0xf3000000); + status |= test__subsf3(0xfefffffe, 0x7efffffe, 0xff7ffffe); + status |= test__subsf3(0xfefffffe, 0x7effffff, 0xff7ffffe); + status |= test__subsf3(0xfefffffe, 0xfefffffe, 0x00000000); + status |= test__subsf3(0xfeffffff, 0x3f800000, 0xfeffffff); + status |= test__subsf3(0xfeffffff, 0x7f000000, 0xff800000); + status |= test__subsf3(0xfeffffff, 0xbf800000, 0xfeffffff); + status |= test__subsf3(0xfeffffff, 0xff000000, 0x73000000); + status |= test__subsf3(0xff000000, 0x00000000, 0xff000000); + status |= test__subsf3(0xff000000, 0x3f800000, 0xff000000); + status |= test__subsf3(0xff000000, 0x7f000000, 0xff800000); + status |= test__subsf3(0xff000000, 0x7f800000, 0xff800000); + status |= test__subsf3(0xff000000, 0x80000000, 0xff000000); + status |= test__subsf3(0xff000000, 0xbf800000, 0xff000000); + status |= test__subsf3(0xff000000, 0xff800000, 0x7f800000); + status |= test__subsf3(0xff000001, 0x7f000000, 0xff800000); + status |= test__subsf3(0xff000001, 0xff000000, 0xf3800000); + status |= test__subsf3(0xff000001, 0xff000002, 0x73800000); + status |= test__subsf3(0xff000002, 0xfe800001, 0xfe800003); + status |= test__subsf3(0xff7ffffe, 0x3f800000, 0xff7ffffe); + status |= test__subsf3(0xff7ffffe, 0x7f7ffffe, 0xff800000); + status |= test__subsf3(0xff7ffffe, 0x7f7fffff, 0xff800000); + status |= test__subsf3(0xff7ffffe, 0xbf800000, 0xff7ffffe); + status |= test__subsf3(0xff7ffffe, 0xff7fffff, 0x73800000); + status |= test__subsf3(0xff7fffff, 0x3f800000, 0xff7fffff); + status |= test__subsf3(0xff7fffff, 0x80000001, 0xff7fffff); + status |= test__subsf3(0xff7fffff, 0xbf800000, 0xff7fffff); + status |= test__subsf3(0xff800000, 0x00000000, 0xff800000); + status |= test__subsf3(0xff800000, 0x007fffff, 0xff800000); + status |= test__subsf3(0xff800000, 0x7f000000, 0xff800000); + status |= test__subsf3(0xff800000, 0x7f800000, 0xff800000); + status |= 
test__subsf3(0xff800000, 0x80000000, 0xff800000); + status |= test__subsf3(0xff800000, 0x807fffff, 0xff800000); + status |= test__subsf3(0xff800000, 0xff000000, 0xff800000); + status |= test__subsf3(0x46f99cee, 0x4656466d, 0x468e79b8); + status |= test__subsf3(0x007ffff7, 0x00f7ffff, 0x80780008); + status |= test__subsf3(0x80ffffbf, 0x80800000, 0x807fffbf); + + // Test that the result of an operation is a NaN at all when it should be. + // + // In most configurations these tests' results are checked using + // compareResultF, so we set all the answers to the canonical NaN 0x7fc00000, + // which causes compareResultF to accept any NaN encoding. We also use the + // same value as the input NaN in tests that have one, so that even in + // EXPECT_EXACT_RESULTS mode these tests should pass, because 0x7fc00000 is + // still the exact expected NaN. + status |= test__subsf3(0x7f800000, 0x7f800000, 0x7fc00000); + status |= test__subsf3(0xff800000, 0xff800000, 0x7fc00000); + status |= test__subsf3(0x3f800000, 0x7fc00000, 0x7fc00000); + status |= test__subsf3(0x7fc00000, 0x3f800000, 0x7fc00000); + status |= test__subsf3(0x7fc00000, 0x7fc00000, 0x7fc00000); + +#ifdef ARM_NAN_HANDLING + // Tests specific to the NaN handling of Arm hardware, mimicked by the + // subtraction function in arm/addsf3.S: + // + // - a quiet NaN is distinguished by the top mantissa bit being 1 + // + // - if a signalling NaN appears in the input, the output quiet NaN is + // obtained by setting its top mantissa bit and leaving everything else + // unchanged + // + // - if both operands are signalling NaNs then the output NaN is derived + // from the first operand + // + // - if both operands are quiet NaNs then the output NaN is the first + // operand + // + // - invalid operations not involving an input NaN return the quiet + // NaN with fewest bits set, 0x7fc00000.
+ + status |= test__subsf3(0x00000000, 0x7fad4be3, 0x7fed4be3); + status |= test__subsf3(0x00000000, 0x7fdf48c7, 0x7fdf48c7); + status |= test__subsf3(0x00000001, 0x7f970eba, 0x7fd70eba); + status |= test__subsf3(0x00000001, 0x7fc35716, 0x7fc35716); + status |= test__subsf3(0x007fffff, 0x7fbf52d6, 0x7fff52d6); + status |= test__subsf3(0x007fffff, 0x7fc7a2df, 0x7fc7a2df); + status |= test__subsf3(0x3f800000, 0x7f987a85, 0x7fd87a85); + status |= test__subsf3(0x3f800000, 0x7fc50124, 0x7fc50124); + status |= test__subsf3(0x7f7fffff, 0x7f95fd6f, 0x7fd5fd6f); + status |= test__subsf3(0x7f7fffff, 0x7ffc28dc, 0x7ffc28dc); + status |= test__subsf3(0x7f800000, 0x7f800000, 0x7fc00000); + status |= test__subsf3(0x7f800000, 0x7f8dd790, 0x7fcdd790); + status |= test__subsf3(0x7f800000, 0x7fd2ef2b, 0x7fd2ef2b); + status |= test__subsf3(0x7f99b09d, 0x00000000, 0x7fd9b09d); + status |= test__subsf3(0x7f93541e, 0x00000001, 0x7fd3541e); + status |= test__subsf3(0x7f9fc002, 0x007fffff, 0x7fdfc002); + status |= test__subsf3(0x7fb5db77, 0x3f800000, 0x7ff5db77); + status |= test__subsf3(0x7f9f5d92, 0x7f7fffff, 0x7fdf5d92); + status |= test__subsf3(0x7fac7a36, 0x7f800000, 0x7fec7a36); + status |= test__subsf3(0x7fb42008, 0x7fb0ee07, 0x7ff42008); + status |= test__subsf3(0x7f8bd740, 0x7fc7aaf1, 0x7fcbd740); + status |= test__subsf3(0x7f9bb57b, 0x80000000, 0x7fdbb57b); + status |= test__subsf3(0x7f951a78, 0x80000001, 0x7fd51a78); + status |= test__subsf3(0x7f9ba63b, 0x807fffff, 0x7fdba63b); + status |= test__subsf3(0x7f89463c, 0xbf800000, 0x7fc9463c); + status |= test__subsf3(0x7fb63563, 0xff7fffff, 0x7ff63563); + status |= test__subsf3(0x7f90886e, 0xff800000, 0x7fd0886e); + status |= test__subsf3(0x7fe8c15e, 0x00000000, 0x7fe8c15e); + status |= test__subsf3(0x7fe915ae, 0x00000001, 0x7fe915ae); + status |= test__subsf3(0x7ffa9b42, 0x007fffff, 0x7ffa9b42); + status |= test__subsf3(0x7fdad0f5, 0x3f800000, 0x7fdad0f5); + status |= test__subsf3(0x7fd10dcb, 0x7f7fffff, 0x7fd10dcb); + status |= 
test__subsf3(0x7fd08e8a, 0x7f800000, 0x7fd08e8a); + status |= test__subsf3(0x7fc3a9e6, 0x7f91a816, 0x7fd1a816); + status |= test__subsf3(0x7fdb229c, 0x7fc26c68, 0x7fdb229c); + status |= test__subsf3(0x7fc9f6bb, 0x80000000, 0x7fc9f6bb); + status |= test__subsf3(0x7ffa178b, 0x80000001, 0x7ffa178b); + status |= test__subsf3(0x7fef2a0b, 0x807fffff, 0x7fef2a0b); + status |= test__subsf3(0x7ffc885b, 0xbf800000, 0x7ffc885b); + status |= test__subsf3(0x7fd26e8c, 0xff7fffff, 0x7fd26e8c); + status |= test__subsf3(0x7fc55329, 0xff800000, 0x7fc55329); + status |= test__subsf3(0x80000000, 0x7fa833ae, 0x7fe833ae); + status |= test__subsf3(0x80000000, 0x7fc4df63, 0x7fc4df63); + status |= test__subsf3(0x80000001, 0x7f98827d, 0x7fd8827d); + status |= test__subsf3(0x80000001, 0x7fd7acc5, 0x7fd7acc5); + status |= test__subsf3(0x807fffff, 0x7fad19c0, 0x7fed19c0); + status |= test__subsf3(0x807fffff, 0x7ffe1907, 0x7ffe1907); + status |= test__subsf3(0xbf800000, 0x7fa95487, 0x7fe95487); + status |= test__subsf3(0xbf800000, 0x7fd2bbee, 0x7fd2bbee); + status |= test__subsf3(0xff7fffff, 0x7f86ba21, 0x7fc6ba21); + status |= test__subsf3(0xff7fffff, 0x7feb00d7, 0x7feb00d7); + status |= test__subsf3(0xff800000, 0x7f857fdc, 0x7fc57fdc); + status |= test__subsf3(0xff800000, 0x7fde0397, 0x7fde0397); + status |= test__subsf3(0xff800000, 0xff800000, 0x7fc00000); +#endif // ARM_NAN_HANDLING + + return status; +}