more slide work

lemire · lemire · commit b1ddc5e21b16 · 2025-09-18T07:26:50.000-06:00
diff --git a/cppcon2025/cppcon_2025_slides.md b/cppcon2025/cppcon_2025_slides.md
@@ -620,6 +620,7 @@ if (!needs_escape)
 
 # Optimization #3: Fast Integer serialization
 
+(`std::to_chars`)
 
 ```cpp
 while(number >= 10) {
@@ -637,7 +638,7 @@ Writing from the end
 
 ```cpp
 while(number >= 100) {
-    memcpy(write_pointer - 1, &internal::decimal_table[(pv % 100)*2], 2);
+    memcpy(write_pointer - 1, &decimal_table[(pv % 100)*2], 2);
     write_pointer -= 2;
     pv /= 100;
 }
@@ -655,6 +656,11 @@ if(number >= 10) {
 - Useful to compute quickly the number of digits
 
 ```cpp
+template <typename number_type>
+int int_log2(number_type x) { 
+  return 63 - leading_zeroes(uint64_t(x) | 1); 
+}
+
 int fast_digit_count_64(uint64_t x) {
   static uint64_t table[] = {9,
                              99,
@@ -670,17 +676,41 @@ int fast_digit_count_64(uint64_t x) {
 }
 ```
 
-
 ---
 
-# Does fast integer processing matter?
-
-Replace fast digit count by naive approach
+# Could use SIMD if we wanted to
 
+**Don't try to understand:**:
 ```cpp
-std::to_string(value).length();  // Allocates string just to count!
+__m128i to_string_avx512ifma(uint64_t n) {
+  uint64_t n_15_08  = n / 100000000;
+  uint64_t n_07_00  = n % 100000000;
+  __m512i bcstq_h   = _mm512_set1_epi64(n_15_08);
+  __m512i bcstq_l   = _mm512_set1_epi64(n_07_00);
+  __m512i zmmzero   = _mm512_castsi128_si512(_mm_cvtsi64_si128(0x1A1A400));
+  __m512i zmmTen    = _mm512_set1_epi64(10);
+  __m512i asciiZero = _mm512_set1_epi64('0');
+  __m512i ifma_const	= _mm512_setr_epi64(0x00000000002af31dc, ...);
+  __m512i permb_const	= _mm512_castsi128_si512(_mm_set_epi8(0x78, ...));
+  __m512i lowbits_h	= _mm512_madd52lo_epu64(zmmzero, bcstq_h, ifma_const);
+  __m512i lowbits_l	= _mm512_madd52lo_epu64(zmmzero, bcstq_l, ifma_const);
+  __m512i highbits_h	= _mm512_madd52hi_epu64(asciiZero, zmmTen, lowbits_h);
+  __m512i highbits_l	= _mm512_madd52hi_epu64(asciiZero, zmmTen, lowbits_l);
+  __m512i perm          = _mm512_permutex2var_epi8(highbits_h, permb_const, highbits_l);
+  __m128i digits_15_0	= _mm512_castsi512_si128(perm);
+  return digits_15_0;
+}
 ```
 
+---
+
+# Does fast integer processing matter?
+
+* Replace fast digit count by naive approach based on `std::to_string`
+    ```cpp
+    std::to_string(value).length();
+    ```
+* Only 34% worse in one dataset.
 
 ---