
Commit 105b50d

melver authored and gregkh committed
kfence: limit currently covered allocations when pool nearly full
[ Upstream commit 08f6b10 ]

One of KFENCE's main design principles is that with increasing uptime, allocation coverage increases sufficiently to detect previously undetected bugs. We have observed that frequent long-lived allocations of the same source (e.g. pagecache) tend to permanently fill up the KFENCE pool with increasing system uptime, thus breaking the above requirement. The workaround thus far had been increasing the sample interval and/or increasing the KFENCE pool size, but neither is a reliable solution.

To ensure diverse coverage of allocations, limit currently covered allocations of the same source once pool utilization reaches 75% (configurable via `kfence.skip_covered_thresh`) or above. The effect is retaining reasonable allocation coverage when the pool is close to full. A side effect is that this also limits frequent long-lived allocations of the same source filling up the pool permanently.

Uniqueness of an allocation for coverage purposes is based on its (partial) allocation stack trace (the source). A Counting Bloom filter is used to check if an allocation is covered; if the allocation is currently covered, the allocation is skipped by KFENCE.

Testing was done using: (a) a synthetic workload that performs frequent long-lived allocations (default config values; sample_interval=1; num_objects=63), and (b) normal desktop workloads on an otherwise idle machine where the problem was first reported after a few days of uptime (default config values). In both test cases the sampled allocation rate no longer drops to zero at any point. In the case of (b) we observe (after 2 days uptime) 15% unique allocations in the pool, 77% pool utilization, with 20% "skipped allocations (covered)".

[[email protected]: simplify and just use hash_32(), use more random stack_hash_seed]
Link: https://lkml.kernel.org/r/[email protected]
[[email protected]: fix 32 bit]
Link: https://lkml.kernel.org/r/[email protected]

Signed-off-by: Marco Elver <[email protected]>
Reviewed-by: Dmitry Vyukov <[email protected]>
Acked-by: Alexander Potapenko <[email protected]>
Cc: Aleksandr Nogikh <[email protected]>
Cc: Jann Horn <[email protected]>
Cc: Taras Madan <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
Signed-off-by: Linus Torvalds <[email protected]>
Signed-off-by: Sasha Levin <[email protected]>
1 parent 44b44b6 commit 105b50d
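
Editor's note: to make the mechanism concrete before the diff, here is a minimal userspace sketch of a counting Bloom filter along the lines described above. Names, sizes, and the hash functions are illustrative assumptions, not the kernel code; the real implementation is the alloc_covered_* code in the diff below. Each stack-trace hash probes HNUM counters, which are incremented on allocation and decremented on free; a source counts as "covered" while every probed counter is non-zero.

/*
 * Illustrative userspace sketch of a counting Bloom filter; names and
 * sizes are made up, the kernel's version is the alloc_covered_* code
 * in the diff below.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define HNUM    2               /* hash probes per stack-trace hash */
#define ORDER   9               /* 2^ORDER counters in the filter */
#define SIZE    (1u << ORDER)
#define MASK    (SIZE - 1)

static int covered[SIZE];

/* Stand-in for the kernel's hash_32(): derive the next probe from the last hash. */
static uint32_t next_hash(uint32_t h)
{
        return (h * 0x61C88647u) >> (32 - ORDER);
}

/* Add (val = 1) or remove (val = -1) one allocation of this source. */
static void covered_add(uint32_t hash, int val)
{
        for (int i = 0; i < HNUM; i++) {
                covered[hash & MASK] += val;
                hash = next_hash(hash);
        }
}

/* True while every probed counter is non-zero, i.e. the source looks covered. */
static bool covered_contains(uint32_t hash)
{
        for (int i = 0; i < HNUM; i++) {
                if (!covered[hash & MASK])
                        return false;
                hash = next_hash(hash);
        }
        return true;
}

int main(void)
{
        uint32_t stack_hash = 0xdeadbeef;       /* stands in for jhash() of the stack trace */

        covered_add(stack_hash, 1);             /* allocation sampled */
        printf("covered: %d\n", covered_contains(stack_hash));  /* 1 -> skip this source */
        covered_add(stack_hash, -1);            /* object freed */
        printf("covered: %d\n", covered_contains(stack_hash));  /* 0 -> may sample again */
        return 0;
}

Because counters can collide, a source can appear covered even when it is not currently in the pool; the comment added to mm/kfence/core.c below quantifies that false-positive rate.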

File tree

2 files changed: +109 -2 lines changed

mm/kfence/core.c

Lines changed: 107 additions & 2 deletions
@@ -10,12 +10,15 @@
 #include <linux/atomic.h>
 #include <linux/bug.h>
 #include <linux/debugfs.h>
+#include <linux/hash.h>
 #include <linux/irq_work.h>
+#include <linux/jhash.h>
 #include <linux/kcsan-checks.h>
 #include <linux/kfence.h>
 #include <linux/kmemleak.h>
 #include <linux/list.h>
 #include <linux/lockdep.h>
+#include <linux/log2.h>
 #include <linux/memblock.h>
 #include <linux/moduleparam.h>
 #include <linux/random.h>
@@ -82,6 +85,10 @@ static const struct kernel_param_ops sample_interval_param_ops = {
 };
 module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_interval, 0600);
 
+/* Pool usage% threshold when currently covered allocations are skipped. */
+static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
+module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
+
 /* The pool of pages used for guard pages and objects. */
 char *__kfence_pool __ro_after_init;
 EXPORT_SYMBOL(__kfence_pool); /* Export for test modules. */
@@ -106,6 +113,32 @@ DEFINE_STATIC_KEY_FALSE(kfence_allocation_key);
 /* Gates the allocation, ensuring only one succeeds in a given period. */
 atomic_t kfence_allocation_gate = ATOMIC_INIT(1);
 
+/*
+ * A Counting Bloom filter of allocation coverage: limits currently covered
+ * allocations of the same source filling up the pool.
+ *
+ * Assuming a range of 15%-85% unique allocations in the pool at any point in
+ * time, the below parameters provide a probability of 0.02-0.33 for false
+ * positive hits respectively:
+ *
+ *      P(alloc_traces) = (1 - e^(-HNUM * (alloc_traces / SIZE)))^HNUM
+ */
+#define ALLOC_COVERED_HNUM      2
+#define ALLOC_COVERED_ORDER     (const_ilog2(CONFIG_KFENCE_NUM_OBJECTS) + 2)
+#define ALLOC_COVERED_SIZE      (1 << ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_HNEXT(h)  hash_32(h, ALLOC_COVERED_ORDER)
+#define ALLOC_COVERED_MASK      (ALLOC_COVERED_SIZE - 1)
+static atomic_t alloc_covered[ALLOC_COVERED_SIZE];
+
+/* Stack depth used to determine uniqueness of an allocation. */
+#define UNIQUE_ALLOC_STACK_DEPTH ((size_t)8)
+
+/*
+ * Randomness for stack hashes, making the same collisions across reboots and
+ * different machines less likely.
+ */
+static u32 stack_hash_seed __ro_after_init;
+
 /* Statistics counters for debugfs. */
 enum kfence_counter_id {
         KFENCE_COUNTER_ALLOCATED,
@@ -115,6 +148,7 @@ enum kfence_counter_id {
         KFENCE_COUNTER_BUGS,
         KFENCE_COUNTER_SKIP_INCOMPAT,
         KFENCE_COUNTER_SKIP_CAPACITY,
+        KFENCE_COUNTER_SKIP_COVERED,
         KFENCE_COUNTER_COUNT,
 };
 static atomic_long_t counters[KFENCE_COUNTER_COUNT];
@@ -126,11 +160,57 @@ static const char *const counter_names[] = {
         [KFENCE_COUNTER_BUGS]           = "total bugs",
         [KFENCE_COUNTER_SKIP_INCOMPAT]  = "skipped allocations (incompatible)",
         [KFENCE_COUNTER_SKIP_CAPACITY]  = "skipped allocations (capacity)",
+        [KFENCE_COUNTER_SKIP_COVERED]   = "skipped allocations (covered)",
 };
 static_assert(ARRAY_SIZE(counter_names) == KFENCE_COUNTER_COUNT);
 
 /* === Internals ============================================================ */
 
+static inline bool should_skip_covered(void)
+{
+        unsigned long thresh = (CONFIG_KFENCE_NUM_OBJECTS * kfence_skip_covered_thresh) / 100;
+
+        return atomic_long_read(&counters[KFENCE_COUNTER_ALLOCATED]) > thresh;
+}
+
+static u32 get_alloc_stack_hash(unsigned long *stack_entries, size_t num_entries)
+{
+        num_entries = min(num_entries, UNIQUE_ALLOC_STACK_DEPTH);
+        num_entries = filter_irq_stacks(stack_entries, num_entries);
+        return jhash(stack_entries, num_entries * sizeof(stack_entries[0]), stack_hash_seed);
+}
+
+/*
+ * Adds (or subtracts) count @val for allocation stack trace hash
+ * @alloc_stack_hash from Counting Bloom filter.
+ */
+static void alloc_covered_add(u32 alloc_stack_hash, int val)
+{
+        int i;
+
+        for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+                atomic_add(val, &alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]);
+                alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+        }
+}
+
+/*
+ * Returns true if the allocation stack trace hash @alloc_stack_hash is
+ * currently contained (non-zero count) in Counting Bloom filter.
+ */
+static bool alloc_covered_contains(u32 alloc_stack_hash)
+{
+        int i;
+
+        for (i = 0; i < ALLOC_COVERED_HNUM; i++) {
+                if (!atomic_read(&alloc_covered[alloc_stack_hash & ALLOC_COVERED_MASK]))
+                        return false;
+                alloc_stack_hash = ALLOC_COVERED_HNEXT(alloc_stack_hash);
+        }
+
+        return true;
+}
+
 static bool kfence_protect(unsigned long addr)
 {
         return !KFENCE_WARN_ON(!kfence_protect_page(ALIGN_DOWN(addr, PAGE_SIZE), true));
@@ -270,7 +350,8 @@ static __always_inline void for_each_canary(const struct kfence_metadata *meta,
 }
 
 static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t gfp,
-                                  unsigned long *stack_entries, size_t num_stack_entries)
+                                  unsigned long *stack_entries, size_t num_stack_entries,
+                                  u32 alloc_stack_hash)
 {
         struct kfence_metadata *meta = NULL;
         unsigned long flags;
@@ -333,6 +414,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
         /* Pairs with READ_ONCE() in kfence_shutdown_cache(). */
         WRITE_ONCE(meta->cache, cache);
         meta->size = size;
+        meta->alloc_stack_hash = alloc_stack_hash;
+
         for_each_canary(meta, set_canary_byte);
 
         /* Set required struct page fields. */
@@ -345,6 +428,8 @@ static void *kfence_guarded_alloc(struct kmem_cache *cache, size_t size, gfp_t g
 
         raw_spin_unlock_irqrestore(&meta->lock, flags);
 
+        alloc_covered_add(alloc_stack_hash, 1);
+
         /* Memory initialization. */
 
         /*
@@ -413,6 +498,8 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
 
         raw_spin_unlock_irqrestore(&meta->lock, flags);
 
+        alloc_covered_add(meta->alloc_stack_hash, -1);
+
         /* Protect to detect use-after-frees. */
         kfence_protect((unsigned long)addr);
 
@@ -679,6 +766,7 @@ void __init kfence_init(void)
         if (!kfence_sample_interval)
                 return;
 
+        stack_hash_seed = (u32)random_get_entropy();
         if (!kfence_init_pool()) {
                 pr_err("%s failed\n", __func__);
                 return;
@@ -756,6 +844,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 {
         unsigned long stack_entries[KFENCE_STACK_DEPTH];
         size_t num_stack_entries;
+        u32 alloc_stack_hash;
 
         /*
          * Perform size check before switching kfence_allocation_gate, so that
@@ -798,7 +887,23 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
 
         num_stack_entries = stack_trace_save(stack_entries, KFENCE_STACK_DEPTH, 0);
 
-        return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries);
+        /*
+         * Do expensive check for coverage of allocation in slow-path after
+         * allocation_gate has already become non-zero, even though it might
+         * mean not making any allocation within a given sample interval.
+         *
+         * This ensures reasonable allocation coverage when the pool is almost
+         * full, including avoiding long-lived allocations of the same source
+         * filling up the pool (e.g. pagecache allocations).
+         */
+        alloc_stack_hash = get_alloc_stack_hash(stack_entries, num_stack_entries);
+        if (should_skip_covered() && alloc_covered_contains(alloc_stack_hash)) {
+                atomic_long_inc(&counters[KFENCE_COUNTER_SKIP_COVERED]);
+                return NULL;
+        }
+
+        return kfence_guarded_alloc(s, size, flags, stack_entries, num_stack_entries,
                                     alloc_stack_hash);
 }
 
 size_t kfence_ksize(const void *addr)
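
Editor's note: the 0.02-0.33 false-positive range quoted in the comment added to mm/kfence/core.c above can be sanity-checked numerically. The sketch below is a hypothetical userspace check, assuming the default CONFIG_KFENCE_NUM_OBJECTS=255, so ALLOC_COVERED_ORDER = const_ilog2(255) + 2 = 9 and ALLOC_COVERED_SIZE = 512:

/*
 * Userspace check of the false-positive estimate in the comment above,
 * assuming the default CONFIG_KFENCE_NUM_OBJECTS = 255 (so SIZE = 512).
 * Build with: cc check.c -lm
 */
#include <math.h>
#include <stdio.h>

int main(void)
{
        const double hnum = 2.0, size = 512.0, num_objects = 255.0;
        const double fractions[] = { 0.15, 0.85 };      /* unique allocations in the pool */

        for (int i = 0; i < 2; i++) {
                double alloc_traces = fractions[i] * num_objects;
                double p = pow(1.0 - exp(-hnum * alloc_traces / size), hnum);

                printf("%.0f%% unique -> P ~= %.2f\n", fractions[i] * 100, p);
        }
        return 0;       /* prints P ~= 0.02 and P ~= 0.33 */
}

If the 75% default proves too aggressive or too lax for a given workload, the commit message notes that the threshold is configurable via kfence.skip_covered_thresh (exposed as a writable module parameter by the module_param_named() in the diff above).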

mm/kfence/kfence.h

Lines changed: 2 additions & 0 deletions
@@ -87,6 +87,8 @@ struct kfence_metadata {
         /* Allocation and free stack information. */
         struct kfence_track alloc_track;
         struct kfence_track free_track;
+        /* For updating alloc_covered on frees. */
+        u32 alloc_stack_hash;
 };
 
 extern struct kfence_metadata kfence_metadata[CONFIG_KFENCE_NUM_OBJECTS];
