Skip to content

Commit ead0cd2

Browse files
committed
Add: Intel's Linear Address Masking
1 parent 5ac64d1 commit ead0cd2

File tree

1 file changed

+96
-37
lines changed

1 file changed

+96
-37
lines changed

less_slow.cpp

Lines changed: 96 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -2724,7 +2724,9 @@ struct arena_t {
27242724
/// The total bytes "freed" so far
27252725
std::size_t total_reclaimed = 0;
27262726
/// The total number of unique allocations before a reset
2727-
std::size_t unique_allocations = 0;
2727+
std::size_t unique_allocs = 0;
2728+
/// The size in bytes of the largest single allocation since the last reset
2729+
std::size_t max_alloc_size = 0;
27282730
};
27292731

27302732
/**
@@ -2735,7 +2737,8 @@ inline std::byte *allocate_from_arena(arena_t &arena, std::size_t size) noexcept
27352737
if (arena.total_allocated + size > arena_t::capacity_k) return nullptr; // Not enough space
27362738
std::byte *ptr = arena.buffer + arena.total_allocated;
27372739
arena.total_allocated += size;
2738-
arena.unique_allocations++;
2740+
arena.unique_allocs++;
2741+
arena.max_alloc_size = std::max(arena.max_alloc_size, size);
27392742
return ptr;
27402743
}
27412744

@@ -2751,7 +2754,7 @@ inline void deallocate_from_arena(arena_t &arena, std::byte *ptr, std::size_t si
27512754
arena.total_reclaimed += size;
27522755
// Reset completely if fully reclaimed
27532756
if (arena.total_allocated == arena.total_reclaimed)
2754-
arena.total_allocated = 0, arena.total_reclaimed = 0, arena.unique_allocations = 0;
2757+
arena.total_allocated = 0, arena.total_reclaimed = 0, arena.unique_allocs = 0, arena.max_alloc_size = 0;
27552758
}
27562759

27572760
/**
@@ -2877,7 +2880,7 @@ yyjson_alc yyjson_wrap_arena_prepend(arena_t &arena) noexcept {
28772880
}
28782881

28792882
/**
2880-
* There is also an even cooler way to allocate memory! @b Pointer-tag! 🏷️
2883+
* There is also an even cooler way to allocate memory! @b Pointer-tagging! 🏷️
28812884
* 64-bit address space is a lie! Most systems only use 48 bits for addresses,
28822885
* some even less. So, we can use the remaining bits to store metadata about
28832886
* the allocated block, like its size, or the arena it came from.
@@ -2889,23 +2892,64 @@ yyjson_alc yyjson_wrap_arena_prepend(arena_t &arena) noexcept {
28892892
* Address sizes: 46 bits physical, 48 bits virtual
28902893
* Byte Order: Little Endian
28912894
*
2892-
* 48-bit virtual addressing allows mapping up to @b 256-TiB of virtual space.
2895+
* 48-bit virtual addressing allows mapping up to @b 256-TiB of virtual space,
2896+
* leaving 16 bits for metadata. On Armv8-A there is a Top Byte Ignore @b (TBI)
2897+
* mode that frees 8 bits for such metadata, but that may not be enough for our
2898+
* current use-case.
2899+
*
2900+
* There is a catch! The mechanism differs between operating systems and CPU vendors.
2901+
* On Intel-based Linux systems, for example, the feature is called "Linear Address
2902+
* Masking" or @b LAM for short. It can be configured in 2 modes:
2903+
*
2904+
* - LAM_U57: 57-bit linear addresses, 6 bits for metadata (bits 57-62)
2905+
* - LAM_U48: 48-bit linear addresses, 15 bits for metadata (bits 48-62)
2906+
*
2907+
* The Linux kernel itself has to be compiled with LAM support, and the feature must
2908+
* also be enabled for the current running process.
2909+
*
2910+
* @see "Support for Intel's Linear Address Masking" on Linux Weekly News:
2911+
* https://lwn.net/Articles/902094/
28932912
*/
28942913

2895-
constexpr std::uintptr_t pointer_tag_mask_k = 0xFFFF000000000000ull;
2896-
2897-
inline void *pointer_tag(void *ptr, std::uint16_t size) noexcept {
2898-
std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(ptr);
2899-
std::uintptr_t tagged = (addr & ~pointer_tag_mask_k) | (static_cast<std::uintptr_t>(size) << 48);
2900-
if (addr & (1ull << 47)) tagged |= pointer_tag_mask_k;
2901-
return reinterpret_cast<void *>(tagged);
2914+
#if defined(__x86_64__) && defined(__linux__)
2915+
#include <asm/prctl.h> // `ARCH_ENABLE_TAGGED_ADDR`
2916+
#include <sys/syscall.h> // `SYS_arch_prctl`
2917+
static bool enable_pointer_tagging(unsigned long bits = 1) noexcept {
2918+
// The argument is the required number of tag bits.
2919+
// It is rounded up to the nearest LAM mode that can provide it.
2920+
// For now only LAM_U57 is supported, with 6 tag bits.
2921+
return syscall(SYS_arch_prctl, ARCH_ENABLE_TAGGED_ADDR, bits) == 0;
29022922
}
2923+
#else
2924+
static bool enable_pointer_tagging(unsigned long = 0) noexcept { return false; }
2925+
#endif
29032926

2927+
template <int start_bit_ = 48, int end_bit_ = 62>
2928+
inline void *pointer_tag(void *ptr, std::uint16_t tag) noexcept {
2929+
static_assert(start_bit_ <= end_bit_);
2930+
// Number of bits available for the tag:
2931+
constexpr int bits_count = end_bit_ - start_bit_ + 1;
2932+
static_assert(bits_count <= 16, "We only store up to 16 bits in that range (std::uint16_t).");
2933+
// Convert pointer to a 64-bit integer:
2934+
std::uint64_t val = reinterpret_cast<std::uint64_t>(ptr);
2935+
// Create a mask that clears the bits in [start_bit_ .. end_bit_].
2936+
std::uint64_t const clear_mask = ~(((1ULL << bits_count) - 1ULL) << start_bit_);
2937+
val &= clear_mask;
2938+
// Insert our tag into those bits:
2939+
std::uint64_t const tag_val = (static_cast<std::uint64_t>(tag) & ((1ULL << bits_count) - 1ULL)) << start_bit_;
2940+
val |= tag_val;
2941+
return reinterpret_cast<void *>(val);
2942+
}
2943+
2944+
template <int start_bit_ = 48, int end_bit_ = 62>
29042945
inline std::pair<void *, std::uint16_t> pointer_untag(void *ptr) noexcept {
2905-
std::uintptr_t tagged = reinterpret_cast<std::uintptr_t>(ptr);
2906-
std::uint16_t size = static_cast<std::uint16_t>(tagged >> 48);
2907-
std::uintptr_t addr = tagged & ~pointer_tag_mask_k;
2908-
return {reinterpret_cast<void *>(addr), size};
2946+
static_assert(start_bit_ <= end_bit_);
2947+
constexpr int bits_count = end_bit_ - start_bit_ + 1;
2948+
std::uint64_t val = reinterpret_cast<std::uint64_t>(ptr);
2949+
std::uint64_t extracted_tag = (val >> start_bit_) & ((1ULL << bits_count) - 1ULL);
2950+
std::uint64_t const clear_mask = ~(((1ULL << bits_count) - 1ULL) << start_bit_);
2951+
val &= clear_mask;
2952+
return {reinterpret_cast<void *>(val), static_cast<std::uint16_t>(extracted_tag)};
29092953
}
29102954

29112955
yyjson_alc yyjson_wrap_arena_tag(arena_t &arena) noexcept {
@@ -2925,9 +2969,11 @@ yyjson_alc yyjson_wrap_arena_tag(arena_t &arena) noexcept {
29252969

29262970
alc.realloc = +[](void *ctx, void *ptr, size_t old_size_native, size_t size_native) noexcept -> void * {
29272971
alc_size_t size = static_cast<alc_size_t>(size_native);
2928-
auto [real_ptr, _] = pointer_untag(ptr);
2929-
std::byte *new_ptr = reallocate_from_arena(*static_cast<arena_t *>(ctx), static_cast<std::byte *>(real_ptr),
2930-
old_size_native, size_native);
2972+
auto [real_ptr, old_size_from_ptr] = pointer_untag(ptr);
2973+
assert(old_size_native == old_size_from_ptr);
2974+
std::byte *new_ptr = reallocate_from_arena( //
2975+
*static_cast<arena_t *>(ctx), static_cast<std::byte *>(real_ptr), //
2976+
old_size_from_ptr, size_native);
29312977
if (!new_ptr) return nullptr;
29322978
return pointer_tag(new_ptr, size);
29332979
};
@@ -2939,7 +2985,7 @@ yyjson_alc yyjson_wrap_arena_tag(arena_t &arena) noexcept {
29392985
return alc;
29402986
}
29412987

2942-
yyjson_alc yyjson_wrapp_malloc(arena_t &) noexcept {
2988+
yyjson_alc yyjson_wrap_malloc(arena_t &) noexcept {
29432989
yyjson_alc alc;
29442990
alc.ctx = NULL;
29452991
alc.malloc = +[](void *, size_t size) noexcept -> void * { return malloc(size); };
@@ -2950,7 +2996,10 @@ yyjson_alc yyjson_wrapp_malloc(arena_t &) noexcept {
29502996

29512997
typedef yyjson_alc (*yyjson_alc_wrapper)(arena_t &);
29522998

2953-
static void json_yyjson(bm::State &state, yyjson_alc_wrapper alc_wrapper = yyjson_wrapp_malloc) {
2999+
static void json_yyjson(bm::State &state, yyjson_alc_wrapper alc_wrapper = yyjson_wrap_malloc) {
3000+
3001+
if (alc_wrapper == &yyjson_wrap_arena_tag)
3002+
if (!enable_pointer_tagging()) state.SkipWithError("Pointer tagging not supported");
29543003

29553004
// Wrap our custom arena into a `yyjson_alc` structure, alternatively we could use:
29563005
//
@@ -2962,8 +3011,9 @@ static void json_yyjson(bm::State &state, yyjson_alc_wrapper alc_wrapper = yyjso
29623011

29633012
// Repeat the checks many times
29643013
std::size_t bytes_processed = 0;
2965-
std::size_t peak_memory_usage = 0;
2966-
std::size_t peak_memory_calls = 0;
3014+
std::size_t peak_usage = 0;
3015+
std::size_t count_calls = 0;
3016+
std::size_t max_alloc = 0;
29673017
std::size_t iteration = 0;
29683018
for (auto _ : state) {
29693019

@@ -2978,18 +3028,22 @@ static void json_yyjson(bm::State &state, yyjson_alc_wrapper alc_wrapper = yyjso
29783028
(char *)packet_json.data(), packet_json.size(), //
29793029
YYJSON_READ_NOFLAG, &alc, &error);
29803030
if (!error.code) bm::DoNotOptimize(contains_xss_in_yyjson(yyjson_doc_get_root(doc)));
2981-
peak_memory_usage = std::max(peak_memory_usage, arena.total_allocated);
2982-
peak_memory_calls = std::max(peak_memory_calls, arena.unique_allocations);
3031+
peak_usage = std::max(peak_usage, arena.total_allocated);
3032+
count_calls = std::max(count_calls, arena.unique_allocs);
3033+
max_alloc = std::max(max_alloc, arena.max_alloc_size);
29833034
yyjson_doc_free(doc);
29843035
}
29853036
state.SetBytesProcessed(bytes_processed);
2986-
state.counters["peak_memory_usage"] = bm::Counter(peak_memory_usage, bm::Counter::kAvgThreads);
2987-
state.counters["mean_allocation_size"] =
2988-
bm::Counter(peak_memory_usage * 1.0 / peak_memory_calls, bm::Counter::kAvgThreads);
3037+
3038+
if (peak_usage) {
3039+
state.counters["peak_usage"] = bm::Counter(peak_usage, bm::Counter::kAvgThreads);
3040+
state.counters["mean_alloc"] = bm::Counter(peak_usage * 1.0 / count_calls, bm::Counter::kAvgThreads);
3041+
state.counters["max_alloc"] = bm::Counter(max_alloc, bm::Counter::kAvgThreads);
3042+
}
29893043
}
29903044

2991-
BENCHMARK_CAPTURE(json_yyjson, malloc, yyjson_wrapp_malloc)->MinTime(10)->Name("json_yyjson<malloc>");
2992-
BENCHMARK_CAPTURE(json_yyjson, malloc, yyjson_wrapp_malloc)
3045+
BENCHMARK_CAPTURE(json_yyjson, malloc, yyjson_wrap_malloc)->MinTime(10)->Name("json_yyjson<malloc>");
3046+
BENCHMARK_CAPTURE(json_yyjson, malloc, yyjson_wrap_malloc)
29933047
->MinTime(10)
29943048
->Name("json_yyjson<malloc>")
29953049
->Threads(physical_cores());
@@ -3127,8 +3181,9 @@ enum class exception_handling_t { throw_k, noexcept_k };
31273181
template <typename json_type_, exception_handling_t exception_handling_>
31283182
static void json_nlohmann(bm::State &state) {
31293183
std::size_t bytes_processed = 0;
3130-
std::size_t peak_memory_usage = 0;
3131-
std::size_t peak_memory_calls = 0;
3184+
std::size_t peak_usage = 0;
3185+
std::size_t count_calls = 0;
3186+
std::size_t max_alloc = 0;
31323187
std::size_t iteration = 0;
31333188
for (auto _ : state) {
31343189

@@ -3156,14 +3211,18 @@ static void json_nlohmann(bm::State &state) {
31563211
if (!json.is_discarded()) bm::DoNotOptimize(contains_xss_nlohmann(json));
31573212
}
31583213
if constexpr (!std::is_same_v<json_type_, default_json>) {
3159-
peak_memory_usage = std::max(peak_memory_usage, thread_local_arena.total_allocated);
3160-
peak_memory_calls = std::max(peak_memory_calls, thread_local_arena.unique_allocations);
3214+
peak_usage = std::max(peak_usage, thread_local_arena.total_allocated);
3215+
count_calls = std::max(count_calls, thread_local_arena.unique_allocs);
3216+
max_alloc = std::max(max_alloc, thread_local_arena.max_alloc_size);
31613217
}
31623218
}
31633219
state.SetBytesProcessed(bytes_processed);
3164-
state.counters["peak_memory_usage"] = bm::Counter(peak_memory_usage, bm::Counter::kAvgThreads);
3165-
state.counters["mean_allocation_size"] =
3166-
bm::Counter(peak_memory_usage * 1.0 / peak_memory_calls, bm::Counter::kAvgThreads);
3220+
3221+
if (peak_usage) {
3222+
state.counters["peak_usage"] = bm::Counter(peak_usage, bm::Counter::kAvgThreads);
3223+
state.counters["mean_alloc"] = bm::Counter(peak_usage * 1.0 / count_calls, bm::Counter::kAvgThreads);
3224+
state.counters["max_alloc"] = bm::Counter(max_alloc, bm::Counter::kAvgThreads);
3225+
}
31673226
}
31683227

31693228
BENCHMARK(json_nlohmann<default_json, exception_handling_t::throw_k>)

0 commit comments

Comments
 (0)