Skip to content

Commit 7fe3045

Browse files
authored
Merge pull request #1 from wado-lang/gfx/wasm_size_optimize
Introduce features = ["small"] to reduce binary size for Wasm
2 parents f91bdcd + 147766e commit 7fe3045

File tree

9 files changed

+384
-73
lines changed

9 files changed

+384
-73
lines changed

.github/workflows/ci.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ jobs:
1717
- uses: actions/checkout@v6
1818
- uses: dtolnay/rust-toolchain@stable
1919
- run: cargo test --workspace
20+
- run: cargo test --workspace --features small
2021

2122
integrity:
2223
name: Integrity
@@ -28,6 +29,9 @@ jobs:
2829
components: clippy, rustfmt
2930
targets: wasm32-unknown-unknown
3031
- run: cargo check --workspace
32+
- run: cargo check --workspace --features small
3133
- run: cargo check --target wasm32-unknown-unknown
34+
- run: cargo check --target wasm32-unknown-unknown --features small
3235
- run: cargo clippy --workspace -- -D warnings
36+
- run: cargo clippy --workspace --features small -- -D warnings
3337
- run: cargo fmt --all --check

AGENTS.md

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@ The crate is `no_std` and `wasm32-unknown-unknown` compatible.
99
## Development
1010

1111
```sh
12-
cargo test
12+
cargo test && cargo test --features small
1313
cargo fmt
14-
cargo clippy
14+
cargo clippy && cargo clippy --features small
1515
cargo bench -p bench
16+
cargo bench -p bench --features small
1617
```

Cargo.toml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ repository = "https://github.com/wado-lang/fpfmt"
99
autotests = false
1010
exclude = ["tools/", "tests/"]
1111

12+
[features]
13+
small = []
14+
1215
[workspace]
1316
members = [".", "tools/pow10gen", "tools/bench", "tools/wasm-size"]
1417

@@ -27,5 +30,10 @@ workspace = true
2730
name = "comprehensive"
2831
path = "tests/comprehensive.rs"
2932

33+
[profile.release]
34+
lto = true
35+
opt-level = "z"
36+
strip = true
37+
3038
[dev-dependencies]
3139
chacha8rand = "0.1"

README.md

Lines changed: 37 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,28 +21,58 @@ Regenerate the power-of-10 table:
2121
cargo run -p pow10gen
2222
```
2323

24+
## Features
25+
26+
### `small` — compact tables for Wasm
27+
28+
The core algorithm needs `10^p` as a 128-bit normalized mantissa for each
29+
exponent `p` in −348..=347. By default, all 696 entries are stored in a flat
30+
lookup table (696 × 16 = 11 KB).
31+
32+
The `small` feature decomposes the lookup using `10^p = 10^(27q) × 10^r`
33+
where `p = 27q + r` and `0 ≤ r < 27`. This replaces the single table with:
34+
35+
- `POW10_COARSE`: 26 entries of `(u64, u64)` for `10^(27q)` — 416 bytes
36+
- `POW10_FINE`: 27 entries of `u64` for `10^r` — 216 bytes
37+
38+
Fine entries need only one `u64` (not two) because the normalized mantissa of
39+
`10^r` for `r ≤ 26` fits exactly in 64 bits, so its low 64 bits are always zero.
40+
41+
At runtime, `prescale` multiplies the two factors back together with u128
42+
arithmetic instead of doing a direct table lookup. This reduces Wasm binary
43+
size from 14 KB to **4 KB** with a modest formatting slowdown (~1.6x),
44+
while parsing is unaffected. Still **2.4x faster** than ryu for formatting.
45+
46+
```toml
47+
fpfmt = { version = "0.2", features = ["small"] }
48+
```
49+
2450
## Benchmarks
2551

2652
Formatting and parsing 8 representative f64 values (`1.0`, `0.1`, `3.14`, `PI`, `E`, `1e23`, `5e-324`, `1.7976931348623157e308`).
2753

2854
Measured on Apple M3 Pro, macOS 15.7.3 (aarch64):
2955

30-
| Task | fpfmt | ryu | stdlib |
31-
|------|------:|----:|-------:|
32-
| **format** (f64 → string) | 63 ns | 164 ns | 535 ns |
33-
| **parse** (string → f64) | 738 ns || 702 ns |
56+
| Task | fpfmt | fpfmt `small` | ryu | stdlib |
57+
| ------------------------- | -----: | ------------: | -----: | -----: |
58+
| **format** (f64 → string) | 63 ns | 99 ns | 239 ns | 529 ns |
59+
| **parse** (string → f64) | 690 ns | 700 ns | | 675 ns |
3460

3561
```sh
3662
cargo bench -p bench
63+
cargo bench -p bench --features small
3764
```
3865

3966
## Wasm size
4067

41-
32,970 bytes for `short` + `parse` as a cdylib (`wasm32-unknown-unknown`, `-Oz`).
68+
| Configuration | Size |
69+
| ------------- | --------------: |
70+
| default | 14,379 bytes |
71+
| `small` | **4,175 bytes** |
4272

4373
```sh
44-
RUSTFLAGS="-C opt-level=s" cargo build --target wasm32-unknown-unknown --release -p wasm-size
45-
RUSTFLAGS="-C opt-level=z" cargo build --target wasm32-unknown-unknown --release -p wasm-size
74+
cargo build --target wasm32-unknown-unknown --release -p wasm-size
75+
cargo build --target wasm32-unknown-unknown --release -p wasm-size --features small
4676
wc -c target/wasm32-unknown-unknown/release/wasm_size.wasm
4777
```
4878

src/lib.rs

Lines changed: 64 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@
99

1010
mod pow10tab;
1111

12+
#[cfg(feature = "small")]
13+
use pow10tab::{K, POW10_COARSE, POW10_FINE, Q_MIN};
14+
#[cfg(not(feature = "small"))]
1215
use pow10tab::{POW10_MIN, POW10_TAB};
1316

1417
/// `PmHiLo` represents `hi<<64 - lo`.
@@ -29,14 +32,6 @@ struct Scaler {
2932
#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
3033
struct Unrounded(u64);
3134

32-
impl core::fmt::Display for Unrounded {
33-
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
34-
let u = self.0;
35-
let plus = if u & 1 != 0 { "+" } else { "" };
36-
write!(f, "\u{27e8}{}.{}{plus}\u{27e9}", u >> 2, 5 * ((u >> 1) & 1))
37-
}
38-
}
39-
4035
#[cfg(test)]
4136
#[allow(clippy::float_cmp)]
4237
fn unround(x: f64) -> Unrounded {
@@ -124,7 +119,6 @@ const UINT64_POW10: [u64; 20] = [
124119
/// `unpack64` returns (m, e) such that `f = m * 2**e`.
125120
/// The caller is expected to have handled 0, NaN, and +/-Inf already.
126121
/// To unpack an `f32`, use `unpack64(f as f64)`.
127-
#[inline]
128122
#[allow(clippy::many_single_char_names)]
129123
fn unpack64(f: f64) -> (u64, i32) {
130124
const SHIFT: u32 = 64 - 53; // 11
@@ -160,6 +154,7 @@ fn unmin(x: u64) -> Unrounded {
160154

161155
/// `prescale` returns the scaling constants for (e, p).
162156
/// `lp` must be `log2_pow10(p)`.
157+
#[cfg(not(feature = "small"))]
163158
#[inline]
164159
fn prescale(e: i32, p: i32, lp: i32) -> Scaler {
165160
Scaler {
@@ -168,10 +163,66 @@ fn prescale(e: i32, p: i32, lp: i32) -> Scaler {
168163
}
169164
}
170165

166+
/// `mul_pow10` rebuilds the scaling mantissa for `10**p` from the two compact
/// tables instead of reading it from a flat table.
///
/// It splits `p` into `q = p.div_euclid(K)` and `r = p.rem_euclid(K)`, multiplies
/// the coarse entry for `q` by the fine entry for `r`, keeps the top 128 bits of
/// the product (rounded up when inexact, then normalized so bit 127 is set), and
/// returns the result as a `PmHiLo` (`hi<<64 - lo`).
///
/// Indexing panics if `q - Q_MIN` is outside `POW10_COARSE`.
/// NOTE(review): presumably callers only pass `p` within the table range — confirm.
#[cfg(feature = "small")]
167+
#[allow(clippy::many_single_char_names)]
168+
fn mul_pow10(p: i32) -> PmHiLo {
169+
// Euclidean division/remainder: `r` is never negative, even for negative `p`,
// so it can be used directly as an index.
let q = p.div_euclid(K);
170+
let r = p.rem_euclid(K) as usize;
171+
172+
let c = POW10_COARSE[(q - Q_MIN) as usize];
173+
let f = POW10_FINE[r];
174+
175+
// Convert coarse from PmHiLo to raw u128 (hi<<64 - lo).
176+
let c_raw = (u128::from(c.hi) << 64).wrapping_sub(u128::from(c.lo));
177+
// Split the raw 128-bit value into 64-bit halves; `as u64` keeps the low 64 bits.
let c_hi = (c_raw >> 64) as u64;
178+
let c_lo = c_raw as u64;
179+
180+
// Product = c_raw * f * 2^64 (256-bit).
181+
// Split into a1*2^64 + a0 = c_raw * f, then shift left 64.
182+
let a1 = u128::from(c_hi) * u128::from(f);
183+
let a0 = u128::from(c_lo) * u128::from(f);
184+
185+
// Top 128 = a1 + (a0 >> 64); remainder = (a0 as u64).
186+
let mut top = a1 + (a0 >> 64);
187+
let has_remainder = a0 as u64 != 0;
188+
189+
// Round up if not exact (matching generator convention).
190+
if has_remainder {
191+
top += 1;
192+
}
193+
194+
// Normalize: ensure bit 127 is set.
195+
// `top >> 127` is 0 or 1, so `norm` is 1 or 0: shift left by one bit only
// when bit 127 is clear.
// NOTE(review): the round-up above is applied before this shift, so it
// becomes +2 in the final value when `norm == 1` — confirm this is intended.
let norm = 1 - (top >> 127) as u32;
196+
top <<= norm;
197+
198+
let hi = (top >> 64) as u64;
199+
let lo = top as u64;
200+
201+
// Convert to PmHiLo (hi<<64 - lo).
202+
// For lo != 0: hi<<64 + lo == (hi + 1)<<64 - (2^64 - lo), and
// `wrapping_neg` yields 2^64 - lo.
// NOTE(review): `hi + 1` would overflow if `hi == u64::MAX` — presumably
// unreachable for the table's range of `p`; confirm.
if lo != 0 {
203+
PmHiLo {
204+
hi: hi + 1,
205+
lo: lo.wrapping_neg(),
206+
}
207+
} else {
208+
PmHiLo { hi, lo: 0 }
209+
}
210+
}
211+
212+
/// `prescale` returns the scaling constants for (e, p).
213+
/// `lp` must be `log2_pow10(p)`.
///
/// This is the `small`-feature variant: `pm` is computed by `mul_pow10(p)`
/// and `s` is `-(e + lp + 3)`.
214+
#[cfg(feature = "small")]
215+
#[inline]
216+
fn prescale(e: i32, p: i32, lp: i32) -> Scaler {
217+
Scaler {
218+
pm: mul_pow10(p),
219+
s: -(e + lp + 3),
220+
}
221+
}
222+
171223
/// `uscale` returns `unround(x * 2**e * 10**p)`.
172224
/// The caller should pass `c = prescale(e, p, log2_pow10(p))`
173225
/// and should have left-justified x so its high bit is set.
174-
#[inline]
175226
fn uscale(x: u64, c: Scaler) -> Unrounded {
176227
let r = u128::from(x) * u128::from(c.pm.hi);
177228
let mut hi = (r >> 64) as u64;
@@ -193,10 +244,9 @@ fn uscale(x: u64, c: Scaler) -> Unrounded {
193244
///
194245
/// Panics if `n > 18`, but only when debug assertions are enabled (the check is a `debug_assert!`).
195246
#[must_use]
196-
#[inline]
197247
#[allow(clippy::many_single_char_names)]
198248
pub fn fixed_width(f: f64, n: i32) -> (u64, i32) {
199-
assert!(n <= 18, "too many digits");
249+
debug_assert!(n <= 18, "too many digits");
200250
let (m, e) = unpack64(f);
201251
let p = n - 1 - log10_pow2(e + 63);
202252
let u = uscale(m, prescale(e, p, log2_pow10(p)));
@@ -216,10 +266,9 @@ pub fn fixed_width(f: f64, n: i32) -> (u64, i32) {
216266
///
217267
/// Panics if `d > 10_000_000_000_000_000_000` (more than 19 digits), but only when debug assertions are enabled (the check is a `debug_assert!`).
218268
#[must_use]
219-
#[inline]
220269
#[allow(clippy::many_single_char_names)]
221270
pub fn parse(d: u64, p: i32) -> f64 {
222-
assert!(d <= 10_000_000_000_000_000_000, "too many digits");
271+
debug_assert!(d <= 10_000_000_000_000_000_000, "too many digits");
223272
let b = 64 - d.leading_zeros() as i32; // bits.Len64(d)
224273
let lp = log2_pow10(p);
225274
let mut e = (1074i32).min(53 - b - lp);
@@ -240,7 +289,6 @@ pub fn parse(d: u64, p: i32) -> f64 {
240289
/// Parses a decimal string and returns the nearest f64.
241290
/// Returns `None` if the input is malformed.
242291
#[must_use]
243-
#[inline]
244292
pub fn parse_text(s: &[u8]) -> Option<f64> {
245293
fn is_digit(c: u8) -> bool {
246294
c.wrapping_sub(b'0') <= 9
@@ -308,7 +356,6 @@ pub fn parse_text(s: &[u8]) -> Option<f64> {
308356
/// using as few digits as possible that will still round trip
309357
/// back to the original f64.
310358
#[must_use]
311-
#[inline]
312359
#[allow(clippy::many_single_char_names)]
313360
pub fn short(f: f64) -> (u64, i32) {
314361
const MIN_EXP: i32 = -1085;
@@ -357,7 +404,6 @@ fn skewed(e: i32) -> i32 {
357404
/// Removes trailing zeros from `x * 10**p`.
358405
/// If x ends in k zeros, returns `(x/10**k, p+k)`.
359406
/// Assumes that x ends in at most 16 zeros.
360-
#[inline]
361407
#[allow(clippy::unreadable_literal)]
362408
fn trim_zeros(x: u64, p: i32) -> (u64, i32) {
363409
const INV5P8: u64 = 0xc767074b22e90e21; // inverse of 5**8
@@ -418,7 +464,6 @@ const I2A: &[u8] = b"\
418464
/// Formats the decimal representation of u into a.
419465
/// The caller is responsible for ensuring that a is big enough to hold u.
420466
/// If a is too big, leading zeros will be filled in as needed.
421-
#[inline]
422467
fn format_base10(a: &mut [u8], mut u: u64) {
423468
let mut nd = a.len();
424469
while nd >= 8 {
@@ -469,7 +514,6 @@ fn format_base10(a: &mut [u8], mut u: u64) {
469514
/// The caller must pass nd set to the number of digits in d.
470515
/// Returns the number of bytes written to s.
471516
#[must_use]
472-
#[inline]
473517
pub fn fmt_float(s: &mut [u8], d: u64, p: i32, nd: i32) -> usize {
474518
let nd = nd as usize;
475519
// Put digits into s, leaving room for decimal point.
@@ -519,6 +563,7 @@ mod tests {
519563

520564
/// `TestPow10`: verify power-of-10 table entries.
521565
/// Port of Go's `TestPow10`.
566+
#[cfg(not(feature = "small"))]
522567
#[test]
523568
fn test_pow10() {
524569
let cases: [(i32, PmHiLo, i32); 4] = [

0 commit comments

Comments
 (0)