Commit 72dfc31
Add: Pointer tagging draft
1 parent 83ba346

2 files changed, +115 -18 lines
README.md: 1 addition & 0 deletions

@@ -26,6 +26,7 @@ Some of the highlights include:
 - __How to handle JSON avoiding memory allocations?__ Is it easier with C or C++ libraries?
 - __How to properly use associative containers__ with custom keys and transparent comparators?
 - __How to beat a hand-written parser__ with `consteval` RegEx engines?
+- __Is the pointer size really 64 bits__ and how to exploit [pointer-tagging](https://en.wikipedia.org/wiki/Tagged_pointer)?
 
 To read, jump to the [`less_slow.cpp` source file](https://github.com/ashvardanian/less_slow.cpp/blob/main/less_slow.cpp) and read the code snippets and comments.
 Follow the instructions below to run the code in your environment and compare it to the comments as you read through the source.

less_slow.cpp: 114 additions & 18 deletions
@@ -2840,16 +2840,7 @@ bool contains_xss_in_yyjson(yyjson_val *node) noexcept {
  *
  *  @see YYJSON allocators: https://ibireme.github.io/yyjson/doc/doxygen/html/structyyjson__alc.html
  */
-template <bool use_arena>
-static void json_yyjson(bm::State &state) {
-
-    // Wrap our custom arena into a `yyjson_alc` structure, alternatively we could use:
-    //
-    //      char yyjson_buffer[4096];
-    //      yyjson_alc_pool_init(&alc, yyjson_buffer, sizeof(yyjson_buffer));
-    //
-    using arena_t = limited_arena_t;
-    arena_t arena;
+yyjson_alc yyjson_wrap_arena_prepending(limited_arena_t &arena) noexcept {
     yyjson_alc alc;
     alc.ctx = &arena;
 
@@ -2868,8 +2859,8 @@ static void json_yyjson(bm::State &state) {
         alc_size_t old_size = static_cast<alc_size_t>(old_size_native);
         alc_size_t size = static_cast<alc_size_t>(size_native);
         std::byte *start = static_cast<std::byte *>(ptr) - sizeof(alc_size_t);
-        std::byte *new_start = reallocate_from_arena( //
-            *static_cast<arena_t *>(ctx), start,      //
+        std::byte *new_start = reallocate_from_arena(    //
+            *static_cast<limited_arena_t *>(ctx), start, //
             old_size + sizeof(alc_size_t), size + sizeof(alc_size_t));
         if (!new_start) return nullptr;
         // Don't forget to increment the size if the pointer was reallocated
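The `malloc` half of this "prepending" wrapper is unchanged context, so it doesn't appear in the hunks, yet the `realloc`/`free` arithmetic above only makes sense once you see the size header it maintains. Below is a minimal self-contained sketch of that layout; `limited_arena_t`, `allocate_from_arena`, and `malloc_prepending` are hypothetical stand-ins for the helpers defined elsewhere in `less_slow.cpp`:

```cpp
#include <cstddef>
#include <cstdint>
#include <cstring>

// Hypothetical stand-ins for the arena helpers defined elsewhere in less_slow.cpp
struct limited_arena_t {
    std::byte buffer[4096];
    std::size_t used = 0;
};

std::byte *allocate_from_arena(limited_arena_t &arena, std::size_t size) noexcept {
    if (arena.used + size > sizeof(arena.buffer)) return nullptr; // out of space
    std::byte *ptr = arena.buffer + arena.used;
    arena.used += size;
    return ptr;
}

// Mirrors the 16-bit size type used by the wrapper above
using alc_size_t = std::uint16_t;

// The "prepending" `malloc`: reserve a few extra bytes, stash the block size
// there, and hand out a pointer shifted past the header; this is why the
// `realloc`/`free` lambdas step back by `sizeof(alc_size_t)` first.
void *malloc_prepending(limited_arena_t &arena, std::size_t size_native) noexcept {
    alc_size_t size = static_cast<alc_size_t>(size_native);
    std::byte *start = allocate_from_arena(arena, size + sizeof(alc_size_t));
    if (!start) return nullptr;
    std::memcpy(start, &size, sizeof(alc_size_t));
    return start + sizeof(alc_size_t);
}

int main() {
    limited_arena_t arena;
    void *ptr = malloc_prepending(arena, 64);
    if (!ptr) return 1;
    // Recover the size exactly the way the `free` lambda above does
    alc_size_t stored;
    std::memcpy(&stored, static_cast<std::byte *>(ptr) - sizeof(alc_size_t), sizeof(alc_size_t));
    return stored == 64 ? 0 : 1;
}
```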
@@ -2880,9 +2871,95 @@ static void json_yyjson(bm::State &state) {
         std::byte *start = static_cast<std::byte *>(ptr) - sizeof(alc_size_t);
         alc_size_t size;
         std::memcpy(&size, start, sizeof(alc_size_t));
-        deallocate_from_arena(*static_cast<arena_t *>(ctx), start, size + sizeof(alc_size_t));
+        deallocate_from_arena(*static_cast<limited_arena_t *>(ctx), start, size + sizeof(alc_size_t));
+    };
+    return alc;
+}
+
+/**
+ *  There is also an even cooler way to allocate memory! @b Pointer-tagging! 🏷️
+ *  The 64-bit address space is a lie! Most systems use only 48 bits for addresses,
+ *  some even fewer. So we can use the remaining bits to store metadata about
+ *  the allocated block, like its size, or the arena it came from.
+ *
+ *  On x86, for example, calling @b `lscpu` will show:
+ *
+ *      Architecture:      x86_64
+ *      CPU op-mode(s):    32-bit, 64-bit
+ *      Address sizes:     46 bits physical, 48 bits virtual
+ *      Byte Order:        Little Endian
+ *
+ *  48-bit virtual addressing allows mapping up to @b 256-TiB of virtual space.
+ */
+constexpr std::uintptr_t pointer_tag_mask_k = 0xFFFF000000000000ull;
+
+inline void *pointer_tag(void *ptr, std::uint16_t size) noexcept {
+    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(ptr);
+    std::uintptr_t tagged = (addr & ~pointer_tag_mask_k) | (static_cast<std::uintptr_t>(size) << 48);
+    return reinterpret_cast<void *>(tagged);
+}
+
+inline std::pair<void *, std::uint16_t> pointer_untag(void *ptr) noexcept {
+    std::uintptr_t tagged = reinterpret_cast<std::uintptr_t>(ptr);
+    std::uint16_t size = static_cast<std::uint16_t>(tagged >> 48);
+    std::uintptr_t addr = tagged & ~pointer_tag_mask_k;
+    if (addr & (1ull << 47)) addr |= pointer_tag_mask_k; // Sign-extend bit 47 to keep upper-half addresses canonical
+    return {reinterpret_cast<void *>(addr), size};
+}
+
+yyjson_alc yyjson_wrap_arena_tagging(limited_arena_t &arena) noexcept {
+    yyjson_alc alc;
+    alc.ctx = &arena;
+
+    //? There is a neat trick that allows us to use a lambda as a
+    //? C-style function pointer by using the unary `+` operator.
+    //? Assuming our buffer is only 4 KB, a 16-bit unsigned integer is enough...
+    using alc_size_t = std::uint16_t;
+    alc.malloc = +[](void *ctx, size_t size_native) noexcept -> void * {
+        alc_size_t size = static_cast<alc_size_t>(size_native);
+        std::byte *result = allocate_from_arena(*static_cast<limited_arena_t *>(ctx), size);
+        if (!result) return nullptr;
+        return pointer_tag(result, size);
     };
 
+    alc.realloc = +[](void *ctx, void *ptr, size_t old_size_native, size_t size_native) noexcept -> void * {
+        alc_size_t size = static_cast<alc_size_t>(size_native);
+        auto [real_ptr, _] = pointer_untag(ptr);
+        std::byte *new_ptr = reallocate_from_arena(*static_cast<limited_arena_t *>(ctx),
+                                                   static_cast<std::byte *>(real_ptr), old_size_native, size_native);
+        if (!new_ptr) return nullptr;
+        return pointer_tag(new_ptr, size);
+    };
+
+    alc.free = +[](void *ctx, void *ptr) noexcept -> void {
+        auto [real_ptr, size] = pointer_untag(ptr);
+        deallocate_from_arena(*static_cast<limited_arena_t *>(ctx), static_cast<std::byte *>(real_ptr), size);
+    };
+    return alc;
+}
+
+yyjson_alc yyjson_wrap_malloc(limited_arena_t &) noexcept {
+    yyjson_alc alc;
+    alc.ctx = NULL;
+    alc.malloc = +[](void *, size_t size) noexcept -> void * { return malloc(size); };
+    alc.realloc = +[](void *, void *ptr, size_t, size_t size) noexcept -> void * { return realloc(ptr, size); };
+    alc.free = +[](void *, void *ptr) noexcept -> void { free(ptr); };
+    return alc;
+}
+
+typedef yyjson_alc (*yyjson_alc_wrapper)(limited_arena_t &);
+
+static void json_yyjson(bm::State &state, yyjson_alc_wrapper alc_wrapper = yyjson_wrap_malloc) {
+
+    // Wrap our custom arena into a `yyjson_alc` structure, alternatively we could use:
+    //
+    //      char yyjson_buffer[4096];
+    //      yyjson_alc_pool_init(&alc, yyjson_buffer, sizeof(yyjson_buffer));
+    //
+    using arena_t = limited_arena_t;
+    arena_t arena;
+
     // Repeat the checks many times
     std::size_t bytes_processed = 0;
     std::size_t peak_memory_usage = 0;
@@ -2896,9 +2973,10 @@ static void json_yyjson(bm::State &state) {
         yyjson_read_err error;
         std::memset(&error, 0, sizeof(error));
 
+        yyjson_alc alc = alc_wrapper(arena);
         yyjson_doc *doc = yyjson_read_opts(                 //
             (char *)packet_json.data(), packet_json.size(), //
-            YYJSON_READ_NOFLAG, use_arena ? &alc : NULL, &error);
+            YYJSON_READ_NOFLAG, &alc, &error);
         if (!error.code) bm::DoNotOptimize(contains_xss_in_yyjson(yyjson_doc_get_root(doc)));
         peak_memory_usage = std::max(peak_memory_usage, arena.total_allocated);
         peak_memory_calls = std::max(peak_memory_calls, arena.unique_allocations);
@@ -2910,10 +2988,28 @@ static void json_yyjson(bm::State &state) {
         bm::Counter(peak_memory_usage * 1.0 / peak_memory_calls, bm::Counter::kAvgThreads);
 }
 
-BENCHMARK(json_yyjson<false>)->MinTime(10)->Name("json_yyjson<malloc>");
-BENCHMARK(json_yyjson<true>)->MinTime(10)->Name("json_yyjson<limited_arena>");
-BENCHMARK(json_yyjson<false>)->MinTime(10)->Name("json_yyjson<malloc>")->Threads(physical_cores());
-BENCHMARK(json_yyjson<true>)->MinTime(10)->Name("json_yyjson<limited_arena>")->Threads(physical_cores());
+BENCHMARK_CAPTURE(json_yyjson, malloc, yyjson_wrap_malloc) //
+    ->MinTime(10)
+    ->Name("json_yyjson<malloc>");
+BENCHMARK_CAPTURE(json_yyjson, prepending, yyjson_wrap_arena_prepending)
+    ->MinTime(10)
+    ->Name("json_yyjson<limited_arena, prepending>");
+BENCHMARK_CAPTURE(json_yyjson, tagging, yyjson_wrap_arena_tagging)
+    ->MinTime(10)
+    ->Name("json_yyjson<limited_arena, tagging>");
+
+BENCHMARK_CAPTURE(json_yyjson, malloc, yyjson_wrap_malloc)
+    ->MinTime(10)
+    ->Name("json_yyjson<malloc>")
+    ->Threads(physical_cores());
+BENCHMARK_CAPTURE(json_yyjson, prepending, yyjson_wrap_arena_prepending)
+    ->MinTime(10)
+    ->Name("json_yyjson<limited_arena, prepending>")
+    ->Threads(physical_cores());
+BENCHMARK_CAPTURE(json_yyjson, tagging, yyjson_wrap_arena_tagging)
+    ->MinTime(10)
+    ->Name("json_yyjson<limited_arena, tagging>")
+    ->Threads(physical_cores());
 
 /**
  *  The `nlohmann::json` library is designed to be simple and easy to use, but it's
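A quick way to convince yourself that the tagging scheme round-trips: a minimal sketch, assuming a user-space process on x86_64 with 48-bit virtual addresses (so heap pointers keep bit 47 clear). `pointer_tag` and `pointer_untag` mirror the diff above; `main` is purely illustrative.

```cpp
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <utility>

constexpr std::uintptr_t pointer_tag_mask_k = 0xFFFF000000000000ull;

inline void *pointer_tag(void *ptr, std::uint16_t size) noexcept {
    std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(ptr);
    return reinterpret_cast<void *>((addr & ~pointer_tag_mask_k) | (static_cast<std::uintptr_t>(size) << 48));
}

inline std::pair<void *, std::uint16_t> pointer_untag(void *ptr) noexcept {
    std::uintptr_t tagged = reinterpret_cast<std::uintptr_t>(ptr);
    std::uint16_t size = static_cast<std::uint16_t>(tagged >> 48);
    std::uintptr_t addr = tagged & ~pointer_tag_mask_k;
    if (addr & (1ull << 47)) addr |= pointer_tag_mask_k; // restore canonical upper-half addresses
    return {reinterpret_cast<void *>(addr), size};
}

int main() {
    void *raw = std::malloc(64);
    void *tagged = pointer_tag(raw, 64);           // the size rides in the top 16 bits
    auto [untagged, size] = pointer_untag(tagged); // and comes back out losslessly
    assert(untagged == raw && size == 64);
    std::free(untagged); // always free the untagged pointer, never the tagged one
}
```

The tagged value itself must never be dereferenced: a load through a non-canonical address faults on x86_64, so every access has to go through `pointer_untag` first.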

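The `//?` comments in the diff lean on the unary `+` trick: a capture-less lambda converts implicitly to a plain C function pointer, and `+` forces that conversion, which is what lets lambdas fill the `malloc`/`realloc`/`free` slots of `yyjson_alc`. A self-contained illustration, unrelated to yyjson:

```cpp
#include <cstdio>

// A C-style callback slot, in the spirit of the members of `yyjson_alc`
typedef int (*binary_op_t)(int, int);

int main() {
    // Without `+`, `op` would deduce as the unique, unnamed lambda type;
    // unary `+` forces the conversion to an ordinary function pointer.
    auto op = +[](int a, int b) noexcept -> int { return a + b; };
    binary_op_t slot = op;           // fine: the lambda captures nothing
    std::printf("%d\n", slot(2, 3)); // prints 5
}
```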