@@ -2724,7 +2724,9 @@ struct arena_t {
     /// The total bytes "freed" so far
     std::size_t total_reclaimed = 0;
     /// The total number of unique allocations before a reset
-    std::size_t unique_allocations = 0;
+    std::size_t unique_allocs = 0;
+    /// The maximum number of bytes allocated at once
+    std::size_t max_alloc_size = 0;
 };

 /**
@@ -2735,7 +2737,8 @@ inline std::byte *allocate_from_arena(arena_t &arena, std::size_t size) noexcept {
     if (arena.total_allocated + size > arena_t::capacity_k) return nullptr; // Not enough space
     std::byte *ptr = arena.buffer + arena.total_allocated;
     arena.total_allocated += size;
-    arena.unique_allocations++;
+    arena.unique_allocs++;
+    arena.max_alloc_size = std::max(arena.max_alloc_size, size);
     return ptr;
 }

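To make the new bookkeeping concrete, here is a minimal usage sketch (an editorial addition, not part of the commit; it assumes `arena_t` is default-constructible and pairs the function above with `deallocate_from_arena` from the next hunk):

    arena_t arena;
    std::byte *a = allocate_from_arena(arena, 64);
    std::byte *b = allocate_from_arena(arena, 128);
    // total_allocated == 192, unique_allocs == 2, max_alloc_size == 128
    deallocate_from_arena(arena, a, 64);
    deallocate_from_arena(arena, b, 128);
    // Fully reclaimed: all four counters, including max_alloc_size, reset to zero.
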
@@ -2751,7 +2754,7 @@ inline void deallocate_from_arena(arena_t &arena, std::byte *ptr, std::size_t size) noexcept {
     arena.total_reclaimed += size;
     // Reset completely if fully reclaimed
     if (arena.total_allocated == arena.total_reclaimed)
-        arena.total_allocated = 0, arena.total_reclaimed = 0, arena.unique_allocations = 0;
+        arena.total_allocated = 0, arena.total_reclaimed = 0, arena.unique_allocs = 0, arena.max_alloc_size = 0;
 }

 /**
@@ -2877,7 +2880,7 @@ yyjson_alc yyjson_wrap_arena_prepend(arena_t &arena) noexcept {
 }

 /**
- * There is also an even cooler way to allocate memory! @b Pointer-tag! 🏷️
+ * There is also an even cooler way to allocate memory! @b Pointer-tagging! 🏷️
  * 64-bit address space is a lie! Most systems only use 48 bits for addresses,
  * some even less. So, we can use the remaining bits to store metadata about
  * the allocated block, like its size, or the arena it came from.
@@ -2889,23 +2892,64 @@ yyjson_alc yyjson_wrap_arena_prepend(arena_t &arena) noexcept {
  * Address sizes: 46 bits physical, 48 bits virtual
  * Byte Order: Little Endian
  *
- * 48-bit virtual addressing allows mapping up to @b 256-TiB of virtual space.
+ * 48-bit virtual addressing allows mapping up to @b 256-TiB of virtual space,
+ * leaving 16 bits for metadata. On Armv8-A there is a Top Byte Ignore @b (TBI)
+ * mode that frees 8 bits for such metadata, but that may not be enough for our
+ * current use-case.
+ *
+ * There is a catch! The mechanics differ across OSes and CPU vendors.
+ * On Intel-based Linux systems, for example, the feature is called "Linear
+ * Address Masking", or @b LAM for short. It can be configured in two modes:
+ *
+ * - LAM_U57: 57-bit linear addresses, 6 tag bits (62:57)
+ * - LAM_U48: 48-bit linear addresses, 15 tag bits (62:48)
+ *
+ * Bit 63 is excluded in both modes, so the address stays canonical.
+ * The Linux kernel itself has to be compiled with LAM support, and the feature
+ * must also be enabled for the current running process.
+ *
+ * @see "Support for Intel's Linear Address Masking" on Linux Weekly News:
+ *      https://lwn.net/Articles/902094/
  */

-constexpr std::uintptr_t pointer_tag_mask_k = 0xFFFF000000000000ull;
-
-inline void *pointer_tag(void *ptr, std::uint16_t size) noexcept {
-    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(ptr);
-    std::uintptr_t tagged = (addr & ~pointer_tag_mask_k) | (static_cast<std::uintptr_t>(size) << 48);
-    if (addr & (1ull << 47)) tagged |= pointer_tag_mask_k;
-    return reinterpret_cast<void *>(tagged);
+#if defined(__x86_64__) && defined(__linux__)
+#include <asm/prctl.h>   // `ARCH_ENABLE_TAGGED_ADDR`
+#include <sys/syscall.h> // `SYS_arch_prctl`
+#include <unistd.h>      // `syscall`
+static bool enable_pointer_tagging(unsigned long bits = 1) noexcept {
+    // The argument is the required number of tag bits.
+    // It is rounded up to the nearest LAM mode that can provide it.
+    // For now only LAM_U57 is supported, with 6 tag bits.
+    return syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, bits) == 0;
 }
+#else
+static bool enable_pointer_tagging(unsigned long = 0) noexcept { return false; }
+#endif

+template <int start_bit_ = 48, int end_bit_ = 62>
+inline void *pointer_tag(void *ptr, std::uint16_t tag) noexcept {
+    static_assert(start_bit_ <= end_bit_);
+    // Number of bits available for the tag:
+    constexpr int bits_count = end_bit_ - start_bit_ + 1;
+    static_assert(bits_count <= 16, "We only store up to 16 bits in that range (std::uint16_t).");
+    // Convert pointer to a 64-bit integer:
+    std::uint64_t val = reinterpret_cast<std::uint64_t>(ptr);
+    // Create a mask that clears the bits in [start_bit_ .. end_bit_]:
+    std::uint64_t const clear_mask = ~(((1ULL << bits_count) - 1ULL) << start_bit_);
+    val &= clear_mask;
+    // Insert our tag into those bits:
+    std::uint64_t const tag_val = (static_cast<std::uint64_t>(tag) & ((1ULL << bits_count) - 1ULL)) << start_bit_;
+    val |= tag_val;
+    return reinterpret_cast<void *>(val);
+}
+
+template <int start_bit_ = 48, int end_bit_ = 62>
 inline std::pair<void *, std::uint16_t> pointer_untag(void *ptr) noexcept {
-    std::uintptr_t tagged = reinterpret_cast<std::uintptr_t>(ptr);
-    std::uint16_t size = static_cast<std::uint16_t>(tagged >> 48);
-    std::uintptr_t addr = tagged & ~pointer_tag_mask_k;
-    return {reinterpret_cast<void *>(addr), size};
+    static_assert(start_bit_ <= end_bit_);
+    constexpr int bits_count = end_bit_ - start_bit_ + 1;
+    std::uint64_t val = reinterpret_cast<std::uint64_t>(ptr);
+    std::uint64_t extracted_tag = (val >> start_bit_) & ((1ULL << bits_count) - 1ULL);
+    std::uint64_t const clear_mask = ~(((1ULL << bits_count) - 1ULL) << start_bit_);
+    val &= clear_mask;
+    return {reinterpret_cast<void *>(val), static_cast<std::uint16_t>(extracted_tag)};
 }

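A quick round-trip sketch of the two helpers above (an editorial addition, not part of the commit; the 0x1234 payload is arbitrary):

    void *raw = std::malloc(64);
    void *tagged = pointer_tag(raw, 0x1234); // payload lands in bits [48..62]
    auto [untagged, tag] = pointer_untag(tagged);
    assert(untagged == raw && tag == 0x1234);
    std::free(untagged);

Dereferencing `tagged` itself is only safe once the CPU is told to ignore those bits (LAM on x86, TBI on Arm); otherwise untag first, as the allocator callbacks below do.
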
 yyjson_alc yyjson_wrap_arena_tag(arena_t &arena) noexcept {
@@ -2925,9 +2969,11 @@ yyjson_alc yyjson_wrap_arena_tag(arena_t &arena) noexcept {

     alc.realloc = +[](void *ctx, void *ptr, size_t old_size_native, size_t size_native) noexcept -> void * {
         alc_size_t size = static_cast<alc_size_t>(size_native);
-        auto [real_ptr, _] = pointer_untag(ptr);
-        std::byte *new_ptr = reallocate_from_arena(*static_cast<arena_t *>(ctx), static_cast<std::byte *>(real_ptr),
-                                                   old_size_native, size_native);
+        auto [real_ptr, old_size_from_ptr] = pointer_untag(ptr);
+        assert(old_size_native == old_size_from_ptr);
+        std::byte *new_ptr = reallocate_from_arena(                           //
+            *static_cast<arena_t *>(ctx), static_cast<std::byte *>(real_ptr), //
+            old_size_from_ptr, size_native);
         if (!new_ptr) return nullptr;
         return pointer_tag(new_ptr, size);
     };
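One editorial caveat: with the default [48..62] range the tag holds only 15 bits, so any allocation of 32 KiB or more would be silently truncated by `pointer_tag` and trip the `assert` above. A hypothetical guard at the tagging site would make that failure mode explicit:

    // Hypothetical guard (not in the commit), for the default 15-bit tag range:
    assert(size_native < (1ULL << 15) && "size does not fit in the pointer tag");
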
@@ -2939,7 +2985,7 @@ yyjson_alc yyjson_wrap_arena_tag(arena_t &arena) noexcept {
     return alc;
 }

-yyjson_alc yyjson_wrapp_malloc(arena_t &) noexcept {
+yyjson_alc yyjson_wrap_malloc(arena_t &) noexcept {
     yyjson_alc alc;
     alc.ctx = NULL;
     alc.malloc = +[](void *, size_t size) noexcept -> void * { return malloc(size); };
@@ -2950,7 +2996,10 @@ yyjson_alc yyjson_wrapp_malloc(arena_t &) noexcept {

 typedef yyjson_alc (*yyjson_alc_wrapper)(arena_t &);

-static void json_yyjson(bm::State &state, yyjson_alc_wrapper alc_wrapper = yyjson_wrapp_malloc) {
+static void json_yyjson(bm::State &state, yyjson_alc_wrapper alc_wrapper = yyjson_wrap_malloc) {
+
+    if (alc_wrapper == &yyjson_wrap_arena_tag)
+        if (!enable_pointer_tagging()) state.SkipWithError("Pointer tagging not supported");

     // Wrap our custom arena into a `yyjson_alc` structure; alternatively, we could use:
     //
@@ -2962,8 +3011,9 @@ static void json_yyjson(bm::State &state, yyjson_alc_wrapper alc_wrapper = yyjson_wrapp_malloc) {

     // Repeat the checks many times
     std::size_t bytes_processed = 0;
-    std::size_t peak_memory_usage = 0;
-    std::size_t peak_memory_calls = 0;
+    std::size_t peak_usage = 0;
+    std::size_t count_calls = 0;
+    std::size_t max_alloc = 0;
     std::size_t iteration = 0;
     for (auto _ : state) {

@@ -2978,18 +3028,22 @@ static void json_yyjson(bm::State &state, yyjson_alc_wrapper alc_wrapper = yyjson_wrapp_malloc) {
             (char *)packet_json.data(), packet_json.size(), //
             YYJSON_READ_NOFLAG, &alc, &error);
         if (!error.code) bm::DoNotOptimize(contains_xss_in_yyjson(yyjson_doc_get_root(doc)));
-        peak_memory_usage = std::max(peak_memory_usage, arena.total_allocated);
-        peak_memory_calls = std::max(peak_memory_calls, arena.unique_allocations);
+        peak_usage = std::max(peak_usage, arena.total_allocated);
+        count_calls = std::max(count_calls, arena.unique_allocs);
+        max_alloc = std::max(max_alloc, arena.max_alloc_size);
         yyjson_doc_free(doc);
     }
     state.SetBytesProcessed(bytes_processed);
-    state.counters["peak_memory_usage"] = bm::Counter(peak_memory_usage, bm::Counter::kAvgThreads);
-    state.counters["mean_allocation_size"] =
-        bm::Counter(peak_memory_usage * 1.0 / peak_memory_calls, bm::Counter::kAvgThreads);
+
+    if (peak_usage) {
+        state.counters["peak_usage"] = bm::Counter(peak_usage, bm::Counter::kAvgThreads);
+        state.counters["mean_alloc"] = bm::Counter(peak_usage * 1.0 / count_calls, bm::Counter::kAvgThreads);
+        state.counters["max_alloc"] = bm::Counter(max_alloc, bm::Counter::kAvgThreads);
+    }
 }

-BENCHMARK_CAPTURE(json_yyjson, malloc, yyjson_wrapp_malloc)->MinTime(10)->Name("json_yyjson<malloc>");
-BENCHMARK_CAPTURE(json_yyjson, malloc, yyjson_wrapp_malloc)
+BENCHMARK_CAPTURE(json_yyjson, malloc, yyjson_wrap_malloc)->MinTime(10)->Name("json_yyjson<malloc>");
+BENCHMARK_CAPTURE(json_yyjson, malloc, yyjson_wrap_malloc)
     ->MinTime(10)
     ->Name("json_yyjson<malloc>")
     ->Threads(physical_cores());
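The diff only shows the `malloc` registrations; the arena-backed variants are presumably registered the same way. A sketch of what the tagged-arena capture might look like (the test-case name and display name are assumptions, not taken from the commit):

    BENCHMARK_CAPTURE(json_yyjson, arena_tag, yyjson_wrap_arena_tag)
        ->MinTime(10)
        ->Name("json_yyjson<arena, tagged>")
        ->Threads(physical_cores());
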
@@ -3127,8 +3181,9 @@ enum class exception_handling_t { throw_k, noexcept_k };
 template <typename json_type_, exception_handling_t exception_handling_>
 static void json_nlohmann(bm::State &state) {
     std::size_t bytes_processed = 0;
-    std::size_t peak_memory_usage = 0;
-    std::size_t peak_memory_calls = 0;
+    std::size_t peak_usage = 0;
+    std::size_t count_calls = 0;
+    std::size_t max_alloc = 0;
     std::size_t iteration = 0;
     for (auto _ : state) {

@@ -3156,14 +3211,18 @@ static void json_nlohmann(bm::State &state) {
             if (!json.is_discarded()) bm::DoNotOptimize(contains_xss_nlohmann(json));
         }
         if constexpr (!std::is_same_v<json_type_, default_json>) {
-            peak_memory_usage = std::max(peak_memory_usage, thread_local_arena.total_allocated);
-            peak_memory_calls = std::max(peak_memory_calls, thread_local_arena.unique_allocations);
+            peak_usage = std::max(peak_usage, thread_local_arena.total_allocated);
+            count_calls = std::max(count_calls, thread_local_arena.unique_allocs);
+            max_alloc = std::max(max_alloc, thread_local_arena.max_alloc_size);
         }
     }
     state.SetBytesProcessed(bytes_processed);
-    state.counters["peak_memory_usage"] = bm::Counter(peak_memory_usage, bm::Counter::kAvgThreads);
-    state.counters["mean_allocation_size"] =
-        bm::Counter(peak_memory_usage * 1.0 / peak_memory_calls, bm::Counter::kAvgThreads);
+
+    if (peak_usage) {
+        state.counters["peak_usage"] = bm::Counter(peak_usage, bm::Counter::kAvgThreads);
+        state.counters["mean_alloc"] = bm::Counter(peak_usage * 1.0 / count_calls, bm::Counter::kAvgThreads);
+        state.counters["max_alloc"] = bm::Counter(max_alloc, bm::Counter::kAvgThreads);
+    }
 }

 BENCHMARK(json_nlohmann<default_json, exception_handling_t::throw_k>)