Skip to content

Commit 6e3b7ab

Browse files
committed
[compiler-rt][ARM] Optimized mulsf3 and divsf3
This commit adds optimized assembly versions of single-precision float multiplication and division. Both functions are implemented in a style that can be assembled as either Arm or Thumb2; for multiplication, a separate implementation is provided for Thumb1. Also, extensive new tests are added for multiplication and division. These implementations can be removed from the build by defining the cmake variable COMPILER_RT_ARM_OPTIMIZED_FP=OFF. Outlying parts of the functionality which are not on the fast path, such as NaN handling and underflow, are handled in helper functions written in C. These can be shared between the Arm/Thumb2 and Thumb1 implementations, and also reused by other optimized assembly functions we hope to add in future.
1 parent a6bf271 commit 6e3b7ab

File tree

9 files changed

+2326
-96
lines changed

9 files changed

+2326
-96
lines changed

compiler-rt/lib/builtins/CMakeLists.txt

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -422,6 +422,22 @@ set(arm_or_thumb2_base_SOURCES
422422
${GENERIC_SOURCES}
423423
)
424424

425+
# User-facing switch (default ON) selecting the optimized 32-bit Arm FP
# routines; configure with -DCOMPILER_RT_ARM_OPTIMIZED_FP=OFF to drop them.
option(COMPILER_RT_ARM_OPTIMIZED_FP
426+
"On 32-bit Arm, use optimized assembly implementations of FP arithmetic" ON)
427+
428+
# When enabled, prepend the assembly multiply/divide routines and their C
# helper files (fnan2, fnorm2, funder) to the existing Arm/Thumb2 source list.
if(COMPILER_RT_ARM_OPTIMIZED_FP)
429+
set(arm_or_thumb2_base_SOURCES
430+
arm/mulsf3.S
431+
arm/divsf3.S
432+
arm/fnan2.c
433+
arm/fnorm2.c
434+
arm/funder.c
435+
${arm_or_thumb2_base_SOURCES}
436+
)
437+
endif()
438+
# Pass -mimplicit-it=always to the assembler for the two .S files. Note this
# is set regardless of the option above. NOTE(review): presumably needed so
# the same conditional-instruction source assembles as both Arm and Thumb2 --
# confirm against the assembler documentation.
set_source_files_properties(arm/mulsf3.S arm/divsf3.S
439+
PROPERTIES COMPILE_OPTIONS "-Wa,-mimplicit-it=always")
440+
425441
set(arm_sync_SOURCES
426442
arm/sync_fetch_and_add_4.S
427443
arm/sync_fetch_and_add_8.S
@@ -455,6 +471,13 @@ set(thumb1_base_SOURCES
455471
${GENERIC_SOURCES}
456472
)
457473

474+
# When the optimized FP option is enabled, prepend the Thumb1-specific
# multiplication routine to the existing Thumb1 source list.
if(COMPILER_RT_ARM_OPTIMIZED_FP)
475+
set(thumb1_base_SOURCES
476+
arm/thumb1/mulsf3.S
477+
${thumb1_base_SOURCES}
478+
)
479+
endif()
480+
458481
set(arm_EABI_RT_SOURCES
459482
arm/aeabi_cdcmp.S
460483
arm/aeabi_cdcmpeq_check_nan.c

compiler-rt/lib/builtins/arm/divsf3.S

Lines changed: 608 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
//===-- fnan2.c - Handle single-precision NaN inputs to binary operation --===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This helper function is available for use by single-precision float
10+
// arithmetic implementations to handle propagating NaNs from the input
11+
// operands to the output, in a way that matches Arm hardware FP.
12+
//
13+
// On input, a and b are floating-point numbers in IEEE 754 encoding, and at
14+
// least one of them must be a NaN. The return value is the correct output NaN.
15+
//
16+
//===----------------------------------------------------------------------===//
17+
18+
#include <stdint.h>
19+
20+
// Choose the NaN to return from a two-operand single-precision operation.
//
// a, b: the operands as raw IEEE 754 single-precision bit patterns. At least
//       one of them must be a NaN.
//
// Returns the output NaN, chosen in this priority order:
//   1. a, quietened, if a is a signalling NaN;
//   2. b, quietened, if b is a signalling NaN;
//   3. a unchanged, if a is a quiet NaN;
//   4. b unchanged otherwise (b must then be a quiet NaN).
uint32_t __compiler_rt_fnan2(uint32_t a, uint32_t b) {
  // Make shifted-left copies of a and b to discard the sign bit. Then add 1 at
  // the bit position where the quiet vs signalling bit ended up. This squashes
  // all the signalling NaNs to the top of the range of 32-bit values, from
  // 0xff800001 to 0xffffffff inclusive; meanwhile, all the quiet NaN values
  // wrap round to the bottom, from 0 to 0x007fffff inclusive. So we can detect
  // a signalling NaN by asking if it's greater than 0xff800000, and a quiet
  // one by asking if it's less than 0x00800000.
  uint32_t aadj = (a << 1) + 0x00800000;
  uint32_t badj = (b << 1) + 0x00800000;

  if (aadj > 0xff800000)   // a is a signalling NaN?
    return a | 0x00400000; // if so, return it with the quiet bit set
  if (badj > 0xff800000)   // b is a signalling NaN?
    return b | 0x00400000; // if so, return it with the quiet bit set
  if (aadj < 0x00800000)   // a is a quiet NaN?
    return a;              // if so, return it unchanged

  // Otherwise the precondition implies badj < 0x00800000, i.e. b is a quiet
  // NaN, so it is the result.
  return b;
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
//===-- fnorm2.c - Handle single-precision denormal inputs to binary op ---===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This helper function is available for use by single-precision float
10+
// arithmetic implementations, to handle denormal inputs on entry by
11+
// renormalizing the mantissa and modifying the exponent to match.
12+
//
13+
//===----------------------------------------------------------------------===//
14+
15+
#include <stdint.h>
16+
17+
// Structure containing the function's inputs and outputs.
//
// On entry: a, b are two input floating-point numbers, still in IEEE 754
// encoding. expa and expb are the 8-bit exponents of those numbers, extracted
// and shifted down to the low 8 bits of the word, with no other change.
// Neither value should be zero, or have the maximum exponent (indicating an
// infinity or NaN).
//
// On exit: each of a and b contains the mantissa of the input value, with the
// leading 1 bit made explicit, and shifted up to the top of the word. If expa
// was zero (indicating that a was denormal) then it is now represented as a
// normalized number with an out-of-range exponent (zero or negative, stored
// in two's complement in the unsigned field). The same applies to expb and b.
struct fnorm2 {
  uint32_t a, b, expa, expb;
};

// Normalize one operand in place.
//
// *mantissa holds the IEEE 754 encoding on entry and the left-aligned
// mantissa with explicit leading 1 on exit; *exponent holds the raw exponent
// field on entry and is rewritten only if the operand was denormal.
static void fnorm2_normalize_operand(uint32_t *mantissa, uint32_t *exponent) {
  // Shift the mantissa to the right place to follow a leading 1 in the top
  // bit, if there is one. This discards the sign and the upper exponent bits.
  *mantissa <<= 8;

  if (*exponent == 0) {
    // Denormal: decide how much further up to shift the mantissa so that its
    // leading 1 reaches the top bit, and adjust the exponent to match.
    // __builtin_clz has an undefined result for a zero argument, which is why
    // zero inputs are excluded by the contract above.
    uint32_t shift = __builtin_clz(*mantissa);
    *mantissa <<= shift;
    *exponent = 1 - shift;
  } else {
    // Normal: leave the mantissa in its current position, and OR in the
    // explicit leading 1.
    *mantissa |= 0x80000000;
  }
}

// Renormalize both operands of a binary operation in place; see the comment
// on struct fnorm2 for the entry and exit conditions.
void __compiler_rt_fnorm2(struct fnorm2 *values) {
  fnorm2_normalize_operand(&values->a, &values->expa);
  fnorm2_normalize_operand(&values->b, &values->expb);
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
//===-- funder.c - Handle single-precision floating-point underflow -------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This helper function is available for use by single-precision float
10+
// arithmetic implementations to handle underflowed output values, if they were
11+
// computed in the form of a normalized mantissa and an out-of-range exponent.
12+
//
13+
// On input: x should be a complete IEEE 754 floating-point value representing
14+
// the desired output scaled up by 2^192 (the same value that would have been
15+
// passed to an underflow trap handler in IEEE 754:1985).
16+
//
17+
// This isn't enough information to re-round to the correct output denormal
18+
// without also knowing whether x itself has already been rounded, and which
19+
// way. 'errsign' gives this information, by indicating the sign of the value
20+
// (true result - x). That is, if errsign > 0 it means the true value was
21+
// larger (x was rounded down); if errsign < 0 then x was rounded up; if
22+
// errsign == 0 then x represents the _exact_ desired output value.
23+
//
24+
//===----------------------------------------------------------------------===//
25+
26+
#include <stdint.h>
27+
28+
#define SIGNBIT 0x80000000
#define MANTSIZE 23
#define BIAS 0xc0

// Convert an underflowed intermediate result into the correctly rounded
// single-precision output (a denormal, zero, or the smallest normal).
//
// x:       a complete IEEE 754 single-precision bit pattern representing the
//          desired output scaled up by 2^192 (BIAS is that exponent offset).
// errsign: only its sign, read as a two's complement 32-bit value, matters.
//          Negative means x was rounded up (the true value is smaller),
//          positive means x was rounded down, zero means x is exact.
//
// Returns the bit pattern of the result, rounded to nearest with ties to even.
uint32_t __compiler_rt_funder(uint32_t x, uint32_t errsign) {
  uint32_t sign = x & SIGNBIT;
  uint32_t exponent = (x << 1) >> 24;

  // Rule out exponents so small (or large!) that no denormalisation
  // is needed.
  if (exponent > BIAS) {
    // Exponent 0xc1 or above means a normalised number got here by
    // mistake, so we just remove the 0xc0 exponent bias and go
    // straight home.
    return x - (BIAS << MANTSIZE);
  }
  uint32_t bits_lost = BIAS + 1 - exponent;
  if (bits_lost > MANTSIZE + 1) {
    // The implicit leading 1 of the intermediate value's mantissa is
    // below the lowest mantissa bit of a denormal by at least 2 bits.
    // Round down to 0 unconditionally.
    return sign;
  }

  // Make the full mantissa (with leading bit) at the top of the word.
  uint32_t mantissa = 0x80000000 | (x << 8);

  // Adjust by 1 depending on the sign of the error. The low 8 bits of
  // 'mantissa' are zero at this point, so the adjustment cannot carry or
  // borrow out of the word; it only acts as a sticky indication for the
  // re-rounding below. The sign is tested explicitly so that
  // errsign == 0x80000000 counts as negative; deriving the two adjustments
  // from (errsign >> 31) and (-errsign >> 31) would make them cancel for
  // that one value.
  if (errsign & SIGNBIT)
    mantissa -= 1;
  else if (errsign != 0)
    mantissa += 1;

  // Shift down to the output position, keeping the bits shifted off.
  uint32_t outmant, shifted_off;
  if (bits_lost == MANTSIZE + 1) {
    // Special case for the exponent where we have to shift the whole
    // of 'mantissa' off the bottom of the word (a shift by 32 would be
    // undefined).
    outmant = 0;
    shifted_off = mantissa;
  } else {
    outmant = mantissa >> (8 + bits_lost);
    shifted_off = mantissa << (32 - (8 + bits_lost));
  }

  // Re-round: the top bit of shifted_off is the half-ulp bit.
  if (shifted_off >> 31) {
    outmant++;
    if (!(shifted_off << 1))
      outmant &= ~1; // halfway case: round to even
  }

  return sign | outmant;
}

0 commit comments

Comments
 (0)