2828//
2929// VERSION HISTORY
3030//
31- // 0.01 (2025-10-30 ) Initial release
31+ // 1.00 (2025-10-31 ) Initial release
3232//
3333//
3434// DOCUMENTATION
@@ -281,15 +281,22 @@ struct fast_uniform_real_distribution {
281281 impl::do_rng_checks (rng);
282282
283283 // (1 << d) - 1 is more readable, but might overflow
284- ITLIB_RAND_DIST_CONSTEXPR uint64_t max_int = ~uint64_t (0 ) >> (64 - std::numeric_limits<F>::digits);
284+ constexpr uint64_t max_int = ~uint64_t (0 ) >> (64 - std::numeric_limits<F>::digits);
285285
286286 using r_t = typename R::result_type;
287287 ITLIB_RAND_DIST_CONSTEXPR uint64_t rng_range = R::max () - R::min ();
288288 if ITLIB_RAND_DIST_CONSTEXPR (rng_range >= max_int) {
289289 // rng_range is enough to saturate our float precision
290290 // we slice off the needed bits (and hope that rng is uniform over all bits)
291291 const auto random_value = r_t (rng () - R::min ()) & r_t (max_int);
292- return std::ldexp (F (random_value), -std::numeric_limits<F>::digits);
292+
293+ // ideally we would use this here:
294+ // return std::ldexp(F(random_value), -std::numeric_limits<F>::digits);
295+ // ... but in reality no compiler would would optimize it well enough
296+ // every single one does a `call ldexp[f]`!
297+ // at best they would do value * exp2[f](-digits) which is literally the same as below
298+ // well, not literally, as exp2 only gets an overload with C++23, but still...
299+ return F (random_value) / F (max_int + 1 );
293300 }
294301 else {
295302 // "stretching" to rng_range
@@ -300,12 +307,19 @@ struct fast_uniform_real_distribution {
300307
301308 template <typename R>
302309 constexpr static F draw (F min, F max, R& rng) {
303- return std::fma (max - min, draw_01 (rng), min);
310+ // see comment below about fma
311+ // return std::fma(max - min, draw_01(rng), min);
312+ return min + (max - min) * draw_01 (rng);
304313 }
305314
306315 template <typename R>
307316 constexpr F operator ()(R& rng) const {
308- return std::fma (m_scale, draw_01 (rng), m_min);
317+ // in the interest of precision we should use fma here:
318+ // return std::fma(m_scale, draw_01(rng), m_min);
319+ // unfortunately it's slower than simple multiply+add on most hardware
320+ // std::uniform_real_distribution uses multiply+add as well
321+ // we want to be fast, so we do it as well
322+ return m_min + m_scale * draw_01 (rng);
309323 }
310324
311325private:
0 commit comments