Skip to content

Commit 93f0395

Browse files
committed
Optimize the default FP formatting
1 parent 35dcc58 commit 93f0395

File tree

2 files changed

+138
-82
lines changed

2 files changed

+138
-82
lines changed

include/fmt/format-inl.h

Lines changed: 2 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -173,16 +173,6 @@ inline auto operator==(basic_fp<F> x, basic_fp<F> y) -> bool {
173173
return x.f == y.f && x.e == y.e;
174174
}
175175

176-
// Compilers should be able to optimize this into the ror instruction.
177-
FMT_CONSTEXPR inline auto rotr(uint32_t n, uint32_t r) noexcept -> uint32_t {
178-
r &= 31;
179-
return (n >> r) | (n << (32 - r));
180-
}
181-
// Rotates the 64-bit value n right by r bits; r is reduced modulo 64.
FMT_CONSTEXPR inline auto rotr(uint64_t n, uint32_t r) noexcept -> uint64_t {
  r &= 63;
  // The left-shift count is masked as well: with r == 0 it would otherwise be
  // 64, and shifting a 64-bit value by its full width is undefined behavior.
  return (n >> r) | (n << ((64 - r) & 63));
}
185-
186176
// Implementation of Dragonbox algorithm: https://github.com/jk-jeon/dragonbox.
187177
namespace dragonbox {
188178
// Computes upper 64 bits of multiplication of a 32-bit unsigned integer and a
@@ -1149,65 +1139,6 @@ auto is_left_endpoint_integer_shorter_interval(int exponent) noexcept -> bool {
11491139
exponent <= case_shorter_interval_left_endpoint_upper_threshold;
11501140
}
11511141

1152-
// Remove trailing zeros from n and return the number of zeros removed (float)
1153-
FMT_INLINE int remove_trailing_zeros(uint32_t& n, int s = 0) noexcept {
1154-
FMT_ASSERT(n != 0, "");
1155-
// Modular inverse of 5 (mod 2^32): (mod_inv_5 * 5) mod 2^32 = 1.
1156-
constexpr uint32_t mod_inv_5 = 0xcccccccd;
1157-
constexpr uint32_t mod_inv_25 = 0xc28f5c29; // = mod_inv_5 * mod_inv_5
1158-
1159-
while (true) {
1160-
auto q = rotr(n * mod_inv_25, 2);
1161-
if (q > max_value<uint32_t>() / 100) break;
1162-
n = q;
1163-
s += 2;
1164-
}
1165-
auto q = rotr(n * mod_inv_5, 1);
1166-
if (q <= max_value<uint32_t>() / 10) {
1167-
n = q;
1168-
s |= 1;
1169-
}
1170-
return s;
1171-
}
1172-
1173-
// Removes trailing zeros and returns the number of zeros removed (double)
1174-
FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept {
1175-
FMT_ASSERT(n != 0, "");
1176-
1177-
// This magic number is ceil(2^90 / 10^8).
1178-
constexpr uint64_t magic_number = 12379400392853802749ull;
1179-
auto nm = umul128(n, magic_number);
1180-
1181-
// Is n is divisible by 10^8?
1182-
if ((nm.high() & ((1ull << (90 - 64)) - 1)) == 0 && nm.low() < magic_number) {
1183-
// If yes, work with the quotient...
1184-
auto n32 = static_cast<uint32_t>(nm.high() >> (90 - 64));
1185-
// ... and use the 32 bit variant of the function
1186-
int s = remove_trailing_zeros(n32, 8);
1187-
n = n32;
1188-
return s;
1189-
}
1190-
1191-
// If n is not divisible by 10^8, work with n itself.
1192-
constexpr uint64_t mod_inv_5 = 0xcccccccccccccccd;
1193-
constexpr uint64_t mod_inv_25 = 0x8f5c28f5c28f5c29; // mod_inv_5 * mod_inv_5
1194-
1195-
int s = 0;
1196-
while (true) {
1197-
auto q = rotr(n * mod_inv_25, 2);
1198-
if (q > max_value<uint64_t>() / 100) break;
1199-
n = q;
1200-
s += 2;
1201-
}
1202-
auto q = rotr(n * mod_inv_5, 1);
1203-
if (q <= max_value<uint64_t>() / 10) {
1204-
n = q;
1205-
s |= 1;
1206-
}
1207-
1208-
return s;
1209-
}
1210-
12111142
// The main algorithm for shorter interval case
12121143
template <typename T>
12131144
FMT_INLINE decimal_fp<T> shorter_interval_case(int exponent) noexcept {
@@ -1234,7 +1165,7 @@ FMT_INLINE decimal_fp<T> shorter_interval_case(int exponent) noexcept {
12341165
// If succeed, remove trailing zeros if necessary and return
12351166
if (ret_value.significand * 10 >= xi) {
12361167
ret_value.exponent = minus_k + 1;
1237-
ret_value.exponent += remove_trailing_zeros(ret_value.significand);
1168+
// Trailing zeros are removed later.
12381169
return ret_value;
12391170
}
12401171

@@ -1340,8 +1271,7 @@ template <typename T> auto to_decimal(T x) noexcept -> decimal_fp<T> {
13401271
}
13411272
ret_value.exponent = minus_k + float_info<T>::kappa + 1;
13421273

1343-
// We may need to remove trailing zeros.
1344-
ret_value.exponent += remove_trailing_zeros(ret_value.significand);
1274+
// Trailing zeros are removed later.
13451275
return ret_value;
13461276

13471277
// Step 3: Find the significand with the smaller divisor.

include/fmt/format.h

Lines changed: 136 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -519,6 +519,11 @@ template <typename T, typename OutputIt>
519519
constexpr auto to_pointer(OutputIt, size_t) -> T* {
520520
return nullptr;
521521
}
522+
// Raw-pointer overload: returns the current position of ptr and advances ptr
// by n elements, so the caller can write n elements through the result.
// No bounds checking is performed here.
template <typename T> FMT_CONSTEXPR auto to_pointer(T*& ptr, size_t n) -> T* {
  T* const start = ptr;
  ptr = start + n;
  return start;
}
522527
template <typename T>
523528
FMT_CONSTEXPR20 auto to_pointer(basic_appender<T> it, size_t n) -> T* {
524529
buffer<T>& buf = get_container(it);
@@ -1169,8 +1174,9 @@ FMT_CONSTEXPR20 FMT_INLINE void write2digits(Char* out, size_t value) {
11691174
*out = static_cast<Char>('0' + value % 10);
11701175
}
11711176

1172-
// Formats a decimal unsigned integer value writing to out pointing to a buffer
1173-
// of specified size. The caller must ensure that the buffer is large enough.
1177+
// Formats a decimal unsigned integer value and writes to out pointing to a
1178+
// buffer of specified size. The caller must ensure that the buffer is large
1179+
// enough.
11741180
template <typename Char, typename UInt>
11751181
FMT_CONSTEXPR20 auto do_format_decimal(Char* out, UInt value, int size)
11761182
-> Char* {
@@ -1455,6 +1461,75 @@ template <typename T> struct decimal_fp {
14551461
template <typename T> FMT_API auto to_decimal(T x) noexcept -> decimal_fp<T>;
14561462
} // namespace dragonbox
14571463

1464+
// Compilers should be able to optimize this into the ror instruction.
1465+
FMT_CONSTEXPR inline auto rotr(uint32_t n, uint32_t r) noexcept -> uint32_t {
1466+
r &= 31;
1467+
return (n >> r) | (n << (32 - r));
1468+
}
1469+
// Rotates the 64-bit value n right by r bits; r is reduced modulo 64.
FMT_CONSTEXPR inline auto rotr(uint64_t n, uint32_t r) noexcept -> uint64_t {
  r &= 63;
  // The left-shift count is masked as well: with r == 0 it would otherwise be
  // 64, and shifting a 64-bit value by its full width is undefined behavior.
  return (n >> r) | (n << ((64 - r) & 63));
}
1473+
1474+
// Remove trailing zeros from n and return the number of zeros removed (float)
1475+
FMT_INLINE int remove_trailing_zeros(uint32_t& n, int s = 0) noexcept {
1476+
FMT_ASSERT(n != 0, "");
1477+
// Modular inverse of 5 (mod 2^32): (mod_inv_5 * 5) mod 2^32 = 1.
1478+
constexpr uint32_t mod_inv_5 = 0xcccccccd;
1479+
constexpr uint32_t mod_inv_25 = 0xc28f5c29; // = mod_inv_5 * mod_inv_5
1480+
1481+
while (true) {
1482+
auto q = rotr(n * mod_inv_25, 2);
1483+
if (q > max_value<uint32_t>() / 100) break;
1484+
n = q;
1485+
s += 2;
1486+
}
1487+
auto q = rotr(n * mod_inv_5, 1);
1488+
if (q <= max_value<uint32_t>() / 10) {
1489+
n = q;
1490+
s |= 1;
1491+
}
1492+
return s;
1493+
}
1494+
1495+
// Removes trailing zeros and returns the number of zeros removed (double)
1496+
FMT_INLINE int remove_trailing_zeros(uint64_t& n) noexcept {
1497+
FMT_ASSERT(n != 0, "");
1498+
1499+
// This magic number is ceil(2^90 / 10^8).
1500+
constexpr uint64_t magic_number = 12379400392853802749ull;
1501+
auto nm = umul128(n, magic_number);
1502+
1503+
// Is n is divisible by 10^8?
1504+
if ((nm.high() & ((1ull << (90 - 64)) - 1)) == 0 && nm.low() < magic_number) {
1505+
// If yes, work with the quotient...
1506+
auto n32 = static_cast<uint32_t>(nm.high() >> (90 - 64));
1507+
// ... and use the 32 bit variant of the function
1508+
int s = remove_trailing_zeros(n32, 8);
1509+
n = n32;
1510+
return s;
1511+
}
1512+
1513+
// If n is not divisible by 10^8, work with n itself.
1514+
constexpr uint64_t mod_inv_5 = 0xcccccccccccccccd;
1515+
constexpr uint64_t mod_inv_25 = 0x8f5c28f5c28f5c29; // mod_inv_5 * mod_inv_5
1516+
1517+
int s = 0;
1518+
while (true) {
1519+
auto q = rotr(n * mod_inv_25, 2);
1520+
if (q > max_value<uint64_t>() / 100) break;
1521+
n = q;
1522+
s += 2;
1523+
}
1524+
auto q = rotr(n * mod_inv_5, 1);
1525+
if (q <= max_value<uint64_t>() / 10) {
1526+
n = q;
1527+
s |= 1;
1528+
}
1529+
1530+
return s;
1531+
}
1532+
14581533
// Returns true iff Float has the implicit bit which is not stored.
14591534
template <typename Float> constexpr auto has_implicit_bit() -> bool {
14601535
// An 80-bit FP number has a 64-bit significand and no implicit bit.
@@ -1486,7 +1561,7 @@ template <typename Float> constexpr auto exponent_bias() -> int {
14861561
// Returns the number of characters reserved for an exponent suffix: the 'e',
// a sign, and 2, 3 or 4 digits depending on the magnitude of exp.
FMT_CONSTEXPR inline auto compute_exp_size(int exp) -> int {
  auto prefix_size = 2;  // sign + 'e'
  auto abs_exp = exp >= 0 ? exp : -exp;
  // The digit count depends on the magnitude only. Testing the signed value
  // (exp < 100) would size every negative exponent as two digits, which is
  // too small for exponents of -100 and below.
  if (abs_exp < 100) return prefix_size + 2;
  return prefix_size + (abs_exp >= 1000 ? 4 : 3);
}
14921567

@@ -3413,6 +3488,8 @@ FMT_CONSTEXPR20 auto write(OutputIt out, T value, format_specs specs,
34133488
} else if (is_fast_float<T>::value && !is_constant_evaluated()) {
34143489
// Use Dragonbox for the shortest format.
34153490
auto dec = dragonbox::to_decimal(static_cast<fast_float_t<T>>(value));
3491+
if (dec.significand != 0)
3492+
dec.exponent += remove_trailing_zeros(dec.significand);
34163493
return write_float<Char>(out, dec, specs, s, exp_upper, loc);
34173494
}
34183495
}
@@ -3455,9 +3532,29 @@ FMT_CONSTEXPR20 auto write(OutputIt out, T value) -> OutputIt {
34553532
return write_nonfinite<Char>(out, std::isnan(value), {}, s);
34563533

34573534
auto dec = dragonbox::to_decimal(static_cast<fast_float_t<T>>(value));
3458-
int significand_size = count_digits(dec.significand);
3459-
int exp = dec.exponent + significand_size - 1;
3460-
if (use_fixed(exp, detail::exp_upper<T>())) {
3535+
auto significand = dec.significand;
3536+
auto exponent = dec.exponent;
3537+
3538+
uint32_t block1, block2 = 0;
3539+
int num_block2_digits = 0;
3540+
constexpr unsigned ten_pow_8 = 100000000u;
3541+
if (significand >= ten_pow_8) {
3542+
block1 = static_cast<unsigned>(significand / ten_pow_8);
3543+
block2 = static_cast<unsigned>(significand) - block1 * ten_pow_8;
3544+
if (block2 != 0) num_block2_digits = 8 - remove_trailing_zeros(block2);
3545+
exponent += 8;
3546+
} else {
3547+
block1 = static_cast<unsigned>(significand);
3548+
}
3549+
if (block2 == 0 && block1 != 0) exponent += remove_trailing_zeros(block1);
3550+
3551+
int num_block1_digits = count_digits(block1);
3552+
exponent += num_block1_digits - 1;
3553+
int significand_size = num_block1_digits + num_block2_digits;
3554+
3555+
if (use_fixed(exponent, detail::exp_upper<T>())) {
3556+
if (dec.significand != 0)
3557+
dec.exponent += remove_trailing_zeros(dec.significand);
34613558
return write_fixed<Char, fallback_digit_grouping<Char>>(
34623559
out, dec, significand_size, Char('.'), {}, s);
34633560
}
@@ -3466,14 +3563,43 @@ FMT_CONSTEXPR20 auto write(OutputIt out, T value) -> OutputIt {
34663563
auto has_decimal_point = significand_size != 1;
34673564
size_t size =
34683565
to_unsigned((s != sign::none ? 1 : 0) + significand_size +
3469-
(has_decimal_point ? 1 : 0) + compute_exp_size(exp));
3566+
(has_decimal_point ? 1 : 0) + compute_exp_size(exponent));
3567+
3568+
if (auto ptr = to_pointer<Char>(out, size)) {
3569+
if (s != sign::none) *ptr++ = Char('-');
3570+
if (has_decimal_point) {
3571+
auto begin = ptr;
3572+
ptr = format_decimal<Char>(ptr, block1, num_block1_digits + 1);
3573+
*begin = begin[1];
3574+
begin[1] = '.';
3575+
if (num_block2_digits != 0) {
3576+
int n = num_block2_digits;
3577+
while (n > 2) {
3578+
n -= 2;
3579+
write2digits(ptr + n, block2 % 100);
3580+
block2 /= 100;
3581+
}
3582+
if (n > 1) {
3583+
n -= 2;
3584+
write2digits(ptr + n, block2);
3585+
} else {
3586+
ptr[--n] = static_cast<Char>('0' + block2);
3587+
}
3588+
ptr += num_block2_digits;
3589+
}
3590+
} else {
3591+
*ptr++ = static_cast<Char>('0' + block1);
3592+
}
3593+
*ptr++ = Char('e');
3594+
ptr = write_exponent<Char>(exponent, ptr);
3595+
return out;
3596+
}
34703597
auto it = reserve(out, size);
34713598
if (s != sign::none) *it++ = Char('-');
3472-
// Insert a decimal point after the first digit and add an exponent.
3473-
it = write_significand(it, dec.significand, significand_size, 1,
3599+
it = write_significand(it, significand, significand_size, 1,
34743600
has_decimal_point ? Char('.') : Char());
34753601
*it++ = Char('e');
3476-
it = write_exponent<Char>(exp, it);
3602+
it = write_exponent<Char>(exponent, it);
34773603
return base_iterator(out, it);
34783604
}
34793605

0 commit comments

Comments
 (0)