@@ -137,7 +137,7 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
137137 int index = 0 ;
138138 if (sign)
139139 result[index++] = ' -' ;
140-
140+ // We use fast arithmetic to compute the number of digits.
141141 const uint32_t olength = is_double ? fast_digit_count64 (mantissa)
142142 : fast_digit_count32 (mantissa);
143143 // Print the decimal digits.
@@ -146,22 +146,25 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
146146 // result[index + olength - i] = (char) ('0' + c);
147147 // }
148148 // result[index] = '0' + mantissa % 10;
149-
149+ // ////////////////
150+ // Performance:
151+ // On 64-bit systems, 32-bit arithmetic is no faster than 64-bit,
152+ // and sometimes slower.
153+ // ///////////////
150154 uint32_t i = 0 ;
151- // We take care of the least significant eight digits first.
155+ // We take care of the least significant eight digits first.
152156 if (mantissa >= 100'000'000 ) {
153- // Expensive 64-bit division.
154157 const uint64_t q = mantissa / 100'000'000 ;
155- uint32_t temp = mantissa % 100'000'000 ;
158+ uint64_t temp = mantissa % 100'000'000 ;
156159 mantissa = q;
157160
158- const uint32_t c = temp % 10000 ;
161+ const uint64_t c = temp % 10000 ;
159162 temp /= 10000 ;
160- const uint32_t d = temp % 10000 ;
161- const uint32_t c0 = (c % 100 ) << 1 ;
162- const uint32_t c1 = (c / 100 ) << 1 ;
163- const uint32_t d0 = (d % 100 ) << 1 ;
164- const uint32_t d1 = (d / 100 ) << 1 ;
163+ const uint64_t d = temp % 10000 ;
164+ const uint64_t c0 = (c % 100 ) << 1 ;
165+ const uint64_t c1 = (c / 100 ) << 1 ;
166+ const uint64_t d0 = (d % 100 ) << 1 ;
167+ const uint64_t d1 = (d / 100 ) << 1 ;
165168 memcpy (result + index + olength - 1 , hundreds_digit_table + c0, 2 );
166169 memcpy (result + index + olength - 3 , hundreds_digit_table + c1, 2 );
167170 memcpy (result + index + olength - 5 , hundreds_digit_table + d0, 2 );
@@ -171,25 +174,24 @@ int to_chars(T mantissa, int32_t exponent, bool sign, char* const result) {
171174
172175
173176 uint64_t output = mantissa;
177+ // Next, we proceed in blocks of 4 digits.
174178 while (output >= 10000 ) {
175- #ifdef __clang__ // https://bugs.llvm.org/show_bug.cgi?id=38217
176- const uint32_t c = output - 10000 * (output / 10000 );
177- #else
178- const uint32_t c = output % 10000 ;
179- #endif
179+ const uint64_t c = output % 10000 ;
180180 output /= 10000 ;
181- const uint32_t c0 = (c % 100 ) << 1 ;
182- const uint32_t c1 = (c / 100 ) << 1 ;
181+ const uint64_t c0 = (c % 100 ) << 1 ;
182+ const uint64_t c1 = (c / 100 ) << 1 ;
183183 memcpy (result + index + olength - i - 1 , hundreds_digit_table + c0, 2 );
184184 memcpy (result + index + olength - i - 3 , hundreds_digit_table + c1, 2 );
185185 i += 4 ;
186186 }
187+ // If 3 or 4 digits remain, we can take care of two of them.
187188 if (output >= 100 ) {
188- const uint32_t c = (output % 100 ) << 1 ;
189+ const uint64_t c = (output % 100 ) << 1 ;
189190 output /= 100 ;
190191 memcpy (result + index + olength - i - 1 , hundreds_digit_table + c, 2 );
191192 i += 2 ;
192193 }
194+ // Last one or two digits.
193195 if (output >= 10 ) {
194196 const uint64_t c = output << 1 ;
195197 // We can't use memcpy here: the decimal dot goes between these two digits.
0 commit comments