Skip to content

Commit a59333c

Browse files
radiospielhsbt
authored andcommitted
[ruby/json] Faster float formatting
This commit provides an alternative implementation for a float → decimal conversion. It integrates a C implementation of Florian Loitsch's Grisu-algorithm [[pdf]](http://florian.loitsch.com/publications/dtoa-pldi2010.pdf), extracted from https://github.com/night-shift/fpconv. The relevant files are added in this PR; they are, as is all of https://github.com/night-shift/fpconv, available under an MIT License. As a result, I see a speedup of 900% on Apple Silicon M1 for a set of float benchmarks. Floats don't have a single correct string representation: a float like `1000.0` can be represented as "1000", "1e3", "1000.0" (and more). The Grisu algorithm converts floating point numbers to an optimal decimal string representation without loss of precision. As a result, a float that is exactly an integer (like `Float(10)`) will be converted by that algorithm into `"10"`. While technically correct – the JSON format treats floats and integers identically –, this differs from the current behaviour of the `"json"` gem. To address this, the integration checks for that case, and explicitly adds a ".0" suffix in those cases. This is sufficient to meet all existing tests; there is, however, a chance that the current implementation and this implementation occasionally encode floats differently. 
``` == Encoding floats (4179311 bytes) ruby 3.4.1 (2024-12-25 revision ruby/json@48d4efcb85) +YJIT +PRISM [arm64-darwin24] Warming up -------------------------------------- json (local) 4.000 i/100ms Calculating ------------------------------------- json (local) 46.046 (± 2.2%) i/s (21.72 ms/i) - 232.000 in 5.039611s Normalize to 2090234 byte == Encoding floats (4179242 bytes) ruby 3.4.1 (2024-12-25 revision ruby/json@48d4efcb85) +YJIT +PRISM [arm64-darwin24] Warming up -------------------------------------- json (2.10.2) 1.000 i/100ms Calculating ------------------------------------- json (2.10.2) 4.614 (± 0.0%) i/s (216.74 ms/i) - 24.000 in 5.201871s ``` These benchmarks are run via a script ([link](https://gist.github.com/radiospiel/04019402726a28b31616df3d0c17bd1c)) which is based on the gem's `benchmark/encoder.rb` file. There are probably better ways to run benchmarks :) My version allows to combine multiple test cases into a single one. The `dumps` benchmark, which covers the JSON files in `benchmark/data/*.json` – with the exception of `canada.json` – , reported a minor speedup within statistical uncertainty. ruby/json@7d77415108
1 parent ef7c7f9 commit a59333c

File tree

6 files changed

+516
-3
lines changed

6 files changed

+516
-3
lines changed
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
The contents of this directory are extracted from https://github.com/night-shift/fpconv
2+
3+
It is licensed under the provisions of the Boost Software License - Version 1.0 - August 17th, 2003. See the ./license file for details.

ext/json/ext/vendor/fpconv/license

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
Boost Software License - Version 1.0 - August 17th, 2003
2+
3+
Permission is hereby granted, free of charge, to any person or organization
4+
obtaining a copy of the software and accompanying documentation covered by
5+
this license (the "Software") to use, reproduce, display, distribute,
6+
execute, and transmit the Software, and to prepare derivative works of the
7+
Software, and to permit third-parties to whom the Software is furnished to
8+
do so, all subject to the following:
9+
10+
The copyright notices in the Software and this entire statement, including
11+
the above license grant, this restriction and the following disclaimer,
12+
must be included in all copies of the Software, in whole or in part, and
13+
all derivative works of the Software, unless such copies or derivative
14+
works are solely in the form of machine-executable object code generated by
15+
a source language processor.
16+
17+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19+
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
20+
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
21+
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
22+
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
23+
DEALINGS IN THE SOFTWARE.
Lines changed: 339 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,339 @@
1+
#include <stdbool.h>
2+
#include <string.h>
3+
4+
#include "fpconv.h"
5+
#include "powers.h"
6+
7+
/* 64-bit masks used to take a double's bit pattern apart. */
#define fracmask  0x000FFFFFFFFFFFFFU   /* bits 0..51: stored fraction   */
#define expmask   0x7FF0000000000000U   /* bits 52..62: biased exponent  */
#define hiddenbit 0x0010000000000000U   /* bit 52: implicit leading one  */
#define signmask  0x8000000000000000U   /* bit 63: sign                  */

/* Exponent bias plus the 52 fraction bits, so the fraction can be treated
 * as an integer significand. */
#define expbias   (1023 + 52)

/* Small arithmetic helpers. Both evaluate their arguments more than once,
 * so only pass expressions without side effects. */
#define absv(x)    ((x) < 0 ? -(x) : (x))
#define minv(x, y) ((x) < (y) ? (x) : (y))
15+
16+
/* Powers of ten in descending order: tens[i] == 10^(19 - i).
 * generate_digits() starts at index 10 (10^9) and at index 18 (10). */
static uint64_t tens[] = {
    10000000000000000000U, /* [0]  10^19 */
    1000000000000000000U,  /* [1]  10^18 */
    100000000000000000U,   /* [2]  10^17 */
    10000000000000000U,    /* [3]  10^16 */
    1000000000000000U,     /* [4]  10^15 */
    100000000000000U,      /* [5]  10^14 */
    10000000000000U,       /* [6]  10^13 */
    1000000000000U,        /* [7]  10^12 */
    100000000000U,         /* [8]  10^11 */
    10000000000U,          /* [9]  10^10 */
    1000000000U,           /* [10] 10^9  */
    100000000U,            /* [11] 10^8  */
    10000000U,             /* [12] 10^7  */
    1000000U,              /* [13] 10^6  */
    100000U,               /* [14] 10^5  */
    10000U,                /* [15] 10^4  */
    1000U,                 /* [16] 10^3  */
    100U,                  /* [17] 10^2  */
    10U,                   /* [18] 10^1  */
    1U                     /* [19] 10^0  */
};
25+
26+
/* Return the raw 64-bit object representation of `d`, unchanged. */
static inline uint64_t get_dbits(double d)
{
    uint64_t raw;

    /* Byte-wise copy: reinterprets the bits without converting the value. */
    memcpy(&raw, &d, sizeof raw);
    return raw;
}
35+
36+
static Fp build_fp(double d)
37+
{
38+
uint64_t bits = get_dbits(d);
39+
40+
Fp fp;
41+
fp.frac = bits & fracmask;
42+
fp.exp = (bits & expmask) >> 52;
43+
44+
if(fp.exp) {
45+
fp.frac += hiddenbit;
46+
fp.exp -= expbias;
47+
48+
} else {
49+
fp.exp = -expbias + 1;
50+
}
51+
52+
return fp;
53+
}
54+
55+
/* Shift `fp->frac` left until its top bit (bit 63) is set, lowering
 * `fp->exp` by one for every bit shifted.
 *
 * The loop only stops once bit 52 is set, so `fp->frac` must be non-zero. */
static void normalize(Fp* fp)
{
    /* Distance from the hidden-bit position (52) to bit 63. */
    const int to_top = 64 - 52 - 1;

    /* First bring the leading one up to the hidden-bit position. */
    while (!(fp->frac & hiddenbit)) {
        fp->frac <<= 1;
        fp->exp--;
    }

    /* Then move it the remaining fixed distance to bit 63. */
    fp->frac <<= to_top;
    fp->exp -= to_top;
}
66+
67+
/* Compute the two boundaries around `fp`, halfway to its neighbours, and
 * store them in `lower` and `upper` with the same exponent.
 *
 * `upper` ends up with its top bit (bit 63) set. `fp` itself is only read. */
static void get_normalized_boundaries(Fp* fp, Fp* lower, Fp* upper)
{
    /* Upper boundary: fp plus half a unit, written with one extra bit. */
    upper->frac = (fp->frac << 1) + 1;
    upper->exp = fp->exp - 1;

    /* Bring the leading one up to bit 53 (one above the hidden bit) ... */
    while (!(upper->frac & (hiddenbit << 1))) {
        upper->frac <<= 1;
        upper->exp--;
    }

    /* ... and from there to bit 63. */
    const int upper_shift = 64 - 52 - 2;
    upper->frac <<= upper_shift;
    upper->exp -= upper_shift;

    /* Lower boundary: when the significand is exactly the hidden bit, the
     * lower neighbour is half as far away, so one more bit is needed. */
    const int lower_shift = (fp->frac == hiddenbit) ? 2 : 1;
    lower->frac = (fp->frac << lower_shift) - 1;
    lower->exp = fp->exp - lower_shift;

    /* Rescale the lower boundary to the upper boundary's exponent. */
    lower->frac <<= lower->exp - upper->exp;
    lower->exp = upper->exp;
}
92+
93+
/* Multiply two Fp values.
 *
 * The result's significand is the upper 64 bits of the 128-bit product of
 * the two significands, rounded on bit 31 of the low half's upper word.
 * The result's exponent is the sum of the exponents plus 64. */
static Fp multiply(Fp* a, Fp* b)
{
    const uint64_t low32 = 0x00000000FFFFFFFF;

    /* Split both significands into 32-bit halves. */
    const uint64_t a_hi = a->frac >> 32;
    const uint64_t a_lo = a->frac & low32;
    const uint64_t b_hi = b->frac >> 32;
    const uint64_t b_lo = b->frac & low32;

    /* The four partial products. */
    const uint64_t hi_lo = a_hi * b_lo;
    const uint64_t lo_hi = a_lo * b_hi;
    const uint64_t lo_lo = a_lo * b_lo;
    const uint64_t hi_hi = a_hi * b_hi;

    /* Middle column: its upper 32 bits carry into the result. */
    uint64_t middle = (hi_lo & low32) + (lo_hi & low32) + (lo_lo >> 32);
    /* round up */
    middle += 1U << 31;

    Fp product = {
        hi_hi + (hi_lo >> 32) + (lo_hi >> 32) + (middle >> 32),
        a->exp + b->exp + 64
    };

    return product;
}
113+
114+
/* Adjust the last generated digit downwards while that moves the result
 * closer to the target.
 *
 * digits/ndigits  digit buffer and count; only digits[ndigits - 1] changes
 * delta           width of the acceptable interval
 * rem             current remainder, grows by `kappa` per step
 * kappa           weight of one unit of the last digit
 * frac            target distance the remainder should approach
 */
static void round_digit(char* digits, int ndigits, uint64_t delta, uint64_t rem, uint64_t kappa, uint64_t frac)
{
    for (;;) {
        /* Already at or past the target. */
        if (rem >= frac) {
            break;
        }
        /* Another step would leave the acceptable interval. */
        if (delta - rem < kappa) {
            break;
        }
        /* Step only if it stays below the target or lands closer to it. */
        if (!(rem + kappa < frac || frac - rem > rem + kappa - frac)) {
            break;
        }

        digits[ndigits - 1]--;
        rem += kappa;
    }
}
123+
124+
/* Produce decimal digits for the scaled value.
 *
 * fp      scaled value
 * upper   scaled upper boundary; digits are taken from this value
 * lower   scaled lower boundary
 * digits  output buffer for ASCII digits (not NUL-terminated)
 * K       decimal exponent, increased by the remaining power of ten
 *
 * Returns the number of digits written. Leading zeros are not written. */
static int generate_digits(Fp* fp, Fp* upper, Fp* lower, char* digits, int* K)
{
    const uint64_t dist_to_value = upper->frac - fp->frac;
    uint64_t interval = upper->frac - lower->frac;

    /* The value 1 in the fixed-point scale of `upper`. */
    Fp one;
    one.frac = 1ULL << -upper->exp;
    one.exp = upper->exp;

    uint64_t integral = upper->frac >> -one.exp;
    uint64_t fractional = upper->frac & (one.frac - 1);

    int count = 0;
    int kappa = 10;

    /* Integral part: at most ten digits, dividing by 10^9 down to 10^0
     * (tens[10] .. tens[19]). */
    for (int pow_idx = 10; kappa > 0; pow_idx++) {
        const uint64_t divisor = tens[pow_idx];
        const unsigned digit = (unsigned) (integral / divisor);

        if (digit || count) {
            digits[count++] = digit + '0';
        }

        integral -= digit * divisor;
        kappa--;

        /* Stop once what is left fits inside the acceptable interval. */
        const uint64_t rest = (integral << -one.exp) + fractional;
        if (rest <= interval) {
            *K += kappa;
            round_digit(digits, count, interval, rest, divisor << -one.exp, dist_to_value);

            return count;
        }
    }

    /* Fractional part: scale by ten per step. `unit_idx` walks the table
     * upwards in value, starting from 10 (tens[18]). */
    for (int unit_idx = 18; ; unit_idx--) {
        fractional *= 10;
        interval *= 10;
        kappa--;

        const unsigned digit = (unsigned) (fractional >> -one.exp);
        if (digit || count) {
            digits[count++] = digit + '0';
        }

        fractional &= one.frac - 1;
        if (fractional < interval) {
            *K += kappa;
            round_digit(digits, count, interval, fractional, one.frac, dist_to_value * tens[unit_idx]);

            return count;
        }
    }
}
184+
185+
static int grisu2(double d, char* digits, int* K)
186+
{
187+
Fp w = build_fp(d);
188+
189+
Fp lower, upper;
190+
get_normalized_boundaries(&w, &lower, &upper);
191+
192+
normalize(&w);
193+
194+
int k;
195+
Fp cp = find_cachedpow10(upper.exp, &k);
196+
197+
w = multiply(&w, &cp);
198+
upper = multiply(&upper, &cp);
199+
lower = multiply(&lower, &cp);
200+
201+
lower.frac++;
202+
upper.frac--;
203+
204+
*K = -k;
205+
206+
return generate_digits(&w, &upper, &lower, digits, K);
207+
}
208+
209+
/* Format `ndigits` decimal digits with decimal exponent `K` into `dest`.
 *
 * One of three layouts is chosen:
 *   - plain integer            digits followed by K zeros
 *   - plain decimal            "0.000ddd" or "ddd.ddd"
 *   - scientific notation      "d.ddde+X", exponent of one to three digits
 *
 * `neg` only tightens the length limits; the sign itself is not written
 * here. Nothing is NUL-terminated. Returns the number of characters
 * written to `dest`. */
static int emit_digits(char* digits, int ndigits, char* dest, int K, bool neg)
{
    int exp = absv(K + ndigits - 1);

    /* One trailing zero fewer is allowed for negative values. */
    const int max_trailing_zeros = neg ? 6 : 7;

    /* write plain integer */
    if (K >= 0 && exp < ndigits + max_trailing_zeros) {
        memcpy(dest, digits, ndigits);
        memset(dest + ndigits, '0', K);

        return ndigits + K;
    }

    /* write decimal w/o scientific notation */
    if (K < 0 && (K > -7 || exp < 4)) {
        const int int_digits = ndigits - absv(K);

        if (int_digits > 0) {
            /* fp > 1.0: put the point after the first int_digits digits */
            memcpy(dest, digits, int_digits);
            dest[int_digits] = '.';
            memcpy(dest + int_digits + 1, digits + int_digits, ndigits - int_digits);

            return ndigits + 1;
        }

        /* fp < 1.0 -> write leading zero, then the padding zeros */
        const int pad = -int_digits;
        dest[0] = '0';
        dest[1] = '.';
        memset(dest + 2, '0', pad);
        memcpy(dest + pad + 2, digits, ndigits);

        return ndigits + 2 + pad;
    }

    /* write decimal w/ scientific notation */
    ndigits = minv(ndigits, 18 - neg);

    int pos = 0;
    dest[pos++] = digits[0];

    if (ndigits > 1) {
        dest[pos++] = '.';
        memcpy(dest + pos, digits + 1, ndigits - 1);
        pos += ndigits - 1;
    }

    dest[pos++] = 'e';
    /* The sign uses the (possibly clipped) digit count from above. */
    dest[pos++] = (K + ndigits - 1 < 0) ? '-' : '+';

    /* Exponent digits, without leading zeros. */
    const int hundreds = exp / 100;
    const int tens_place = (exp / 10) % 10;
    const int ones_place = exp % 10;

    if (hundreds) {
        dest[pos++] = hundreds + '0';
    }
    if (hundreds || tens_place) {
        dest[pos++] = tens_place + '0';
    }
    dest[pos++] = ones_place + '0';

    return pos;
}
288+
289+
static int filter_special(double fp, char* dest)
290+
{
291+
if(fp == 0.0) {
292+
dest[0] = '0';
293+
return 1;
294+
}
295+
296+
uint64_t bits = get_dbits(fp);
297+
298+
bool nan = (bits & expmask) == expmask;
299+
300+
if(!nan) {
301+
return 0;
302+
}
303+
304+
if(bits & fracmask) {
305+
dest[0] = 'n'; dest[1] = 'a'; dest[2] = 'n';
306+
307+
} else {
308+
dest[0] = 'i'; dest[1] = 'n'; dest[2] = 'f';
309+
}
310+
311+
return 3;
312+
}
313+
314+
int fpconv_dtoa(double d, char dest[24])
315+
{
316+
char digits[18];
317+
318+
int str_len = 0;
319+
bool neg = false;
320+
321+
if(get_dbits(d) & signmask) {
322+
dest[0] = '-';
323+
str_len++;
324+
neg = true;
325+
}
326+
327+
int spec = filter_special(d, dest + str_len);
328+
329+
if(spec) {
330+
return str_len + spec;
331+
}
332+
333+
int K = 0;
334+
int ndigits = grisu2(d, digits, &K);
335+
336+
str_len += emit_digits(digits, ndigits, dest + str_len, K, neg);
337+
338+
return str_len;
339+
}

0 commit comments

Comments
 (0)