Skip to content

Commit f7e6521

Browse files
authored
[compiler-rt][ARM] Optimized mulsf3 and divsf3 (#161546)
This commit adds optimized assembly versions of single-precision float multiplication and division. Both functions are implemented in a style that can be assembled as either Arm or Thumb2; for multiplication, a separate implementation is provided for Thumb1. Extensive new tests are also added for multiplication and division. These implementations can be removed from the build by setting the CMake variable COMPILER_RT_ARM_OPTIMIZED_FP=OFF. Outlying parts of the functionality which are not on the fast path, such as NaN handling and underflow, are handled in helper functions written in C. These can be shared between the Arm/Thumb2 and Thumb1 implementations, and also reused by other optimized assembly functions we hope to add in future.
1 parent a04c6b5 commit f7e6521

File tree

11 files changed

+2461
-95
lines changed

11 files changed

+2461
-95
lines changed
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Helper function to find out whether the assembler supports a particular
2+
# command-line flag. You'd like to use the standard check_compiler_flag(), but
3+
# that only supports a fixed list of languages, and ASM isn't one of them. So
4+
# we do it ourselves, by trying to assemble an empty source file.
5+
6+
function(check_assembler_flag outvar flag)
  # Arguments:
  #   outvar - name of the cache variable that receives the result: 1 if the
  #            flag was accepted, or the empty string if it was not.
  #   flag   - the command-line flag to test, e.g. -mimplicit-it=always.
  #
  # If ${outvar} is already defined (for example because a previous configure
  # run cached it), the check is skipped and the existing value is kept.
  if(NOT DEFINED "${outvar}")
    if(NOT CMAKE_REQUIRED_QUIET)
      message(CHECK_START "Checking for assembler flag ${flag}")
    endif()

    # Stop try_compile from attempting to link the result of the assembly, so
    # that we don't depend on having a working linker, and also don't have to
    # figure out what special symbol like _start needs to be defined in the
    # test input.
    #
    # This change is made within the dynamic scope of this function, so
    # CMAKE_TRY_COMPILE_TARGET_TYPE will be restored to its previous value on
    # return.
    set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY)

    # Try to assemble an empty file with a .s name, using the provided flag.
    try_compile(success
      SOURCE_FROM_CONTENT "CheckAssemblerFlag.s" ""
      COMPILE_DEFINITIONS ${flag}
      NO_CACHE)

    # Always record the outcome. CMAKE_REQUIRED_QUIET is only meant to
    # suppress the progress messages; if the result were stored only in the
    # non-quiet case, a quiet caller would never see the flag as supported and
    # the check would be repeated on every configure run.
    if(success)
      set(${outvar} 1 CACHE INTERNAL "Test assembler flag ${flag}")
    else()
      set(${outvar} "" CACHE INTERNAL "Test assembler flag ${flag}")
    endif()

    if(NOT CMAKE_REQUIRED_QUIET)
      if(success)
        message(CHECK_PASS "Accepted")
      else()
        message(CHECK_FAIL "Not accepted")
      endif()
    endif()
  endif()
endfunction()

compiler-rt/lib/builtins/CMakeLists.txt

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ endif()
6060
include(builtin-config-ix)
6161
include(CMakeDependentOption)
6262
include(CMakePushCheckState)
63+
include(CheckAssemblerFlag)
6364

6465
option(COMPILER_RT_BUILTINS_HIDE_SYMBOLS
6566
"Do not export any symbols from the static library." ON)
@@ -423,6 +424,40 @@ set(arm_or_thumb2_base_SOURCES
423424
${GENERIC_SOURCES}
424425
)
425426

427+
# User-facing switch for the optimized 32-bit Arm floating-point routines.
# Turning it OFF leaves arm_or_thumb2_base_SOURCES untouched by the block
# below.
option(COMPILER_RT_ARM_OPTIMIZED_FP
  "On 32-bit Arm, use optimized assembly implementations of FP arithmetic. Likely to increase code size, but be faster." ON)

# Only probe the assembler when a 32-bit Arm architecture is in the list. The
# pattern matches a list element that starts with "arm" and is not followed by
# "6", i.e. "arm", "armhf", "armv7", ... but not the 64-bit names "arm64",
# "arm64e" or "arm64_32", which a plain "arm" pattern would also match.
if(COMPILER_RT_ARM_OPTIMIZED_FP AND
   BUILTIN_SUPPORTED_ARCH MATCHES "(^|;)arm([^6]|$)")
  # The assembly sources are built with -mimplicit-it=always (presumably so
  # the shared Arm/Thumb2 code does not need explicit IT instructions --
  # confirm against the .S files). Find out which spelling of the flag the
  # toolchain accepts: directly, or forwarded to the assembler via -Wa,.
  check_assembler_flag(COMPILER_RT_HAS_MIMPLICIT_IT -mimplicit-it=always)
  if(COMPILER_RT_HAS_MIMPLICIT_IT)
    set(implicit_it_flag -mimplicit-it=always)
  else()
    check_assembler_flag(
      COMPILER_RT_HAS_WA_MIMPLICIT_IT -Wa,-mimplicit-it=always)
    if(COMPILER_RT_HAS_WA_MIMPLICIT_IT)
      set(implicit_it_flag -Wa,-mimplicit-it=always)
    else()
      message(WARNING "Don't know how to set the -mimplicit-it=always flag in this assembler; not including Arm optimized implementations")
      set(implicit_it_flag "")
    endif()
  endif()

  # An empty flag means neither spelling was accepted, in which case the
  # optimized sources are left out of the build.
  if(implicit_it_flag)
    set(assembly_files
      arm/mulsf3.S
      arm/divsf3.S)
    set_source_files_properties(${assembly_files}
      PROPERTIES COMPILE_OPTIONS ${implicit_it_flag})
    # Put the optimized routines and their C helpers ahead of the existing
    # Arm/Thumb2 source list.
    set(arm_or_thumb2_base_SOURCES
      ${assembly_files}
      arm/fnan2.c
      arm/fnorm2.c
      arm/funder.c
      ${arm_or_thumb2_base_SOURCES}
    )
  endif()
endif()
460+
426461
set(arm_sync_SOURCES
427462
arm/sync_fetch_and_add_4.S
428463
arm/sync_fetch_and_add_8.S
@@ -456,6 +491,16 @@ set(thumb1_base_SOURCES
456491
${GENERIC_SOURCES}
457492
)
458493

494+
# With the optimized FP routines enabled, place the Thumb1 multiply and the
# shared C helper files at the front of the Thumb1 source list.
if(COMPILER_RT_ARM_OPTIMIZED_FP)
  list(PREPEND thumb1_base_SOURCES
    arm/thumb1/mulsf3.S
    arm/fnan2.c
    arm/fnorm2.c
    arm/funder.c
  )
endif()
503+
459504
set(arm_EABI_RT_SOURCES
460505
arm/aeabi_cdcmp.S
461506
arm/aeabi_cdcmpeq_check_nan.c

compiler-rt/lib/builtins/arm/divsf3.S

Lines changed: 608 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
//===-- fnan2.c - Handle single-precision NaN inputs to binary operation --===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This helper function is available for use by single-precision float
10+
// arithmetic implementations to handle propagating NaNs from the input
11+
// operands to the output, in a way that matches Arm hardware FP.
12+
//
13+
// On input, a and b are floating-point numbers in IEEE 754 encoding, and at
14+
// least one of them must be a NaN. The return value is the correct output NaN.
15+
//
16+
// A signalling NaN in the input (with bit 22 clear) takes priority over any
17+
// quiet NaN, and is adjusted on return by setting bit 22 to make it quiet. If
18+
// both inputs are the same type of NaN then the first input takes priority:
19+
// the input a is used instead of b.
20+
//
21+
//===----------------------------------------------------------------------===//
22+
23+
#include <stdint.h>
24+
25+
uint32_t __compiler_rt_fnan2(uint32_t a, uint32_t b) {
  // Bit 22 of the IEEE 754 encoding distinguishes quiet NaNs (set) from
  // signalling NaNs (clear). After shifting left by one to drop the sign bit,
  // that bit lives at position 23.
  const uint32_t quiet_bit = 0x00400000;
  const uint32_t shifted_quiet_bit = 0x00800000;
  const uint32_t snan_floor = 0xff800000;

  // Build a comparison key for each input: discard the sign, then add 1 at
  // the shifted quiet-bit position. Signalling NaNs end up in the range
  // 0xff800001..0xffffffff (strictly above snan_floor), while quiet NaNs
  // overflow and wrap round to 0..0x007fffff (strictly below
  // shifted_quiet_bit).
  const uint32_t a_key = (a << 1) + shifted_quiet_bit;
  const uint32_t b_key = (b << 1) + shifted_quiet_bit;

  // Signalling NaNs win over quiet ones, with a taking priority over b. The
  // result is quieted by setting the quiet bit.
  if (a_key > snan_floor)
    return a | quiet_bit;
  if (b_key > snan_floor)
    return b | quiet_bit;

  // No signalling NaN: return a if it is a quiet NaN, and otherwise b, which
  // is then expected to be the quiet NaN.
  return (a_key < shifted_quiet_bit) ? a : b;
}
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
//===-- fnorm2.c - Handle single-precision denormal inputs to binary op ---===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This helper function is available for use by single-precision float
10+
// arithmetic implementations, to handle denormal inputs on entry by
11+
// renormalizing the mantissa and modifying the exponent to match.
12+
//
13+
//===----------------------------------------------------------------------===//
14+
15+
#include <stdint.h>
16+
17+
// Structure containing the function's inputs and outputs.
18+
//
19+
// On entry: a, b are two input floating-point numbers, still in IEEE 754
20+
// encoding. expa and expb are the 8-bit exponents of those numbers, extracted
21+
// and shifted down to the low 8 bits of the word, with no other change.
22+
// Neither value should be zero, or have the maximum exponent (indicating an
23+
// infinity or NaN).
24+
//
25+
// On exit: each of a and b contains the mantissa of the input value, with the
26+
// leading 1 bit made explicit, and shifted up to the top of the word. If expa
27+
// was zero (indicating that a was denormal) then it is now represented as a
28+
// normalized number with an out-of-range exponent (zero or negative). The same
29+
// applies to expb and b.
30+
struct fnorm2 {
  uint32_t a, b, expa, expb;
};

// Renormalize a single operand in place. *value holds the IEEE 754 encoding
// on entry and the left-aligned mantissa (leading 1 in bit 31) on exit;
// *exponent holds the extracted exponent field and is rewritten only when
// the operand is denormal.
static void fnorm2_one_operand(uint32_t *value, uint32_t *exponent) {
  // Move the mantissa up so that it directly follows where the leading 1
  // belongs in the top bit.
  uint32_t mantissa = *value << 8;

  if (*exponent != 0) {
    // Normalized input: the mantissa is already in position, so just make
    // the implicit leading 1 explicit.
    mantissa |= 0x80000000;
  } else {
    // Denormal input: shift further up until the highest set mantissa bit
    // reaches the top of the word, and compensate in the exponent, which
    // becomes zero or negative (stored as a wrapped unsigned value).
    uint32_t extra = __builtin_clz(mantissa);
    mantissa <<= extra;
    *exponent = 1 - extra;
  }

  *value = mantissa;
}

void __compiler_rt_fnorm2(struct fnorm2 *values) {
  // The two operands are processed independently with identical logic.
  fnorm2_one_operand(&values->a, &values->expa);
  fnorm2_one_operand(&values->b, &values->expb);
}
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
//===-- funder.c - Handle single-precision floating-point underflow -------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
//
9+
// This helper function is available for use by single-precision float
10+
// arithmetic implementations to handle underflowed output values, if they were
11+
// computed in the form of a normalized mantissa and an out-of-range exponent.
12+
//
13+
// On input: x should be a complete IEEE 754 floating-point value representing
14+
// the desired output scaled up by 2^192 (the same value that would have been
15+
// passed to an underflow trap handler in IEEE 754:1985).
16+
//
17+
// This isn't enough information to re-round to the correct output denormal
18+
// without also knowing whether x itself has already been rounded, and which
19+
// way. 'errsign' gives this information, by indicating the sign of the value
20+
// (true result - x). That is, if errsign > 0 it means the true value was
21+
// larger (x was rounded down); if errsign < 0 then x was rounded up; if
22+
// errsign == 0 then x represents the _exact_ desired output value.
23+
//
24+
//===----------------------------------------------------------------------===//
25+
26+
#include <stdint.h>
27+
28+
#define SIGNBIT 0x80000000 // sign bit of an IEEE 754 single-precision value
#define MANTSIZE 23        // number of stored mantissa bits
#define BIAS 0xc0          // extra exponent bias (192) applied to the input

// Convert an underflowed intermediate result into the correctly rounded
// output value.
//
// x:       the intermediate result in IEEE 754 encoding, with its exponent
//          increased by BIAS.
// errsign: interpreted as a signed value. Negative means x was rounded up
//          (the true result is smaller); positive means x was rounded down;
//          zero means x is exact. Only the sign matters, so every negative
//          value, including 0x80000000, is treated the same way.
//
// Returns the IEEE 754 encoding of the re-rounded result (round to nearest,
// ties to even), carrying the sign of x.
uint32_t __compiler_rt_funder(uint32_t x, uint32_t errsign) {
  uint32_t sign = x & SIGNBIT;
  uint32_t exponent = (x << 1) >> 24;

  // Rule out exponents so small (or large!) that no denormalisation
  // is needed.
  if (exponent > BIAS) {
    // Exponent 0xc1 or above means a normalised number got here by
    // mistake, so we just remove the 0xc0 exponent bias and go
    // straight home.
    return x - (BIAS << MANTSIZE);
  }
  uint32_t bits_lost = BIAS + 1 - exponent;
  if (bits_lost > MANTSIZE + 1) {
    // The implicit leading 1 of the intermediate value's mantissa is
    // below the lowest mantissa bit of a denormal by at least 2 bits.
    // Round down to 0 unconditionally.
    return sign;
  }

  // Make the full mantissa (with leading bit) at the top of the word.
  uint32_t mantissa = 0x80000000 | (x << 8);

  // Adjust by 1 in the direction of the true result. The low 8 bits of
  // 'mantissa' are zero at this point, so this only perturbs bits below the
  // real mantissa; it turns an apparent exact-halfway case into one that
  // rounds the right way. The negative case is tested first and exclusively:
  // negating 0x80000000 yields 0x80000000 again, so testing the sign of
  // -errsign independently would cancel the adjustment for that input.
  if (errsign >> 31)
    mantissa -= 1;
  else if (errsign != 0)
    mantissa += 1;

  // Shift down to the output position, keeping the bits shifted off.
  uint32_t outmant, shifted_off;
  if (bits_lost == MANTSIZE + 1) {
    // Special case for the exponent where we have to shift the whole
    // of 'mantissa' off the bottom of the word (a shift by 32 would be
    // undefined).
    outmant = 0;
    shifted_off = mantissa;
  } else {
    outmant = mantissa >> (8 + bits_lost);
    shifted_off = mantissa << (32 - (8 + bits_lost));
  }

  // Re-round: the top bit of shifted_off is the half-ulp bit.
  if (shifted_off >> 31) {
    outmant++;
    if (!(shifted_off << 1))
      outmant &= ~1; // halfway case: round to even
  }

  return sign | outmant;
}

0 commit comments

Comments
 (0)