Commit a715e94

Merge branch 'slab/for-6.12/rcu_barriers' into slab/for-next
Merge most of the SLUB feature work for 6.12:

- Barrier for pending kfree_rcu() in kmem_cache_destroy() and associated
  refactoring of the destroy path (Vlastimil Babka)
- CONFIG_SLUB_RCU_DEBUG to allow KASAN to catch UAF bugs in
  SLAB_TYPESAFE_BY_RCU caches (Jann Horn)
- kmem_cache_charge() for delayed kmemcg charging (Shakeel Butt)
2 parents: e02147c + 9028cde

13 files changed: +528, -128 lines

include/linux/kasan.h

Lines changed: 58 additions & 5 deletions
@@ -175,13 +175,59 @@ static __always_inline void * __must_check kasan_init_slab_obj(
 	return (void *)object;
 }
 
-bool __kasan_slab_free(struct kmem_cache *s, void *object,
-			unsigned long ip, bool init);
+bool __kasan_slab_pre_free(struct kmem_cache *s, void *object,
+				unsigned long ip);
+/**
+ * kasan_slab_pre_free - Check whether freeing a slab object is safe.
+ * @object: Object to be freed.
+ *
+ * This function checks whether freeing the given object is safe. It may
+ * check for double-free and invalid-free bugs and report them.
+ *
+ * This function is intended only for use by the slab allocator.
+ *
+ * @Return true if freeing the object is unsafe; false otherwise.
+ */
+static __always_inline bool kasan_slab_pre_free(struct kmem_cache *s,
+						void *object)
+{
+	if (kasan_enabled())
+		return __kasan_slab_pre_free(s, object, _RET_IP_);
+	return false;
+}
+
+bool __kasan_slab_free(struct kmem_cache *s, void *object, bool init,
+		       bool still_accessible);
+/**
+ * kasan_slab_free - Poison, initialize, and quarantine a slab object.
+ * @object: Object to be freed.
+ * @init: Whether to initialize the object.
+ * @still_accessible: Whether the object contents are still accessible.
+ *
+ * This function informs that a slab object has been freed and is not
+ * supposed to be accessed anymore, except when @still_accessible is set
+ * (indicating that the object is in a SLAB_TYPESAFE_BY_RCU cache and an RCU
+ * grace period might not have passed yet).
+ *
+ * For KASAN modes that have integrated memory initialization
+ * (kasan_has_integrated_init() == true), this function also initializes
+ * the object's memory. For other modes, the @init argument is ignored.
+ *
+ * This function might also take ownership of the object to quarantine it.
+ * When this happens, KASAN will defer freeing the object to a later
+ * stage and handle it internally until then. The return value indicates
+ * whether KASAN took ownership of the object.
+ *
+ * This function is intended only for use by the slab allocator.
+ *
+ * @Return true if KASAN took ownership of the object; false otherwise.
+ */
 static __always_inline bool kasan_slab_free(struct kmem_cache *s,
-						void *object, bool init)
+						void *object, bool init,
+						bool still_accessible)
 {
 	if (kasan_enabled())
-		return __kasan_slab_free(s, object, _RET_IP_, init);
+		return __kasan_slab_free(s, object, init, still_accessible);
 	return false;
 }
 
@@ -371,7 +417,14 @@ static inline void *kasan_init_slab_obj(struct kmem_cache *cache,
 {
 	return (void *)object;
 }
-static inline bool kasan_slab_free(struct kmem_cache *s, void *object, bool init)
+
+static inline bool kasan_slab_pre_free(struct kmem_cache *s, void *object)
+{
+	return false;
+}
+
+static inline bool kasan_slab_free(struct kmem_cache *s, void *object,
+				   bool init, bool still_accessible)
 {
 	return false;
 }

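The kernel-doc above spells out the contract for the two hooks; as a quick illustration, here is a minimal, hypothetical sketch of how an allocator-side free path could pair them. This is not code from this commit: my_slab_free_hook() and its return convention are assumptions made for the example.

#include <linux/kasan.h>
#include <linux/slab.h>

/*
 * Hypothetical helper: returns true when the object may be placed on the
 * freelist, false when the free must be skipped (bug detected or object
 * handed over to the KASAN quarantine).
 */
static bool my_slab_free_hook(struct kmem_cache *s, void *object,
			      bool init, bool still_accessible)
{
	/* Double-free or invalid-free detected: do not free again. */
	if (kasan_slab_pre_free(s, object))
		return false;

	/*
	 * Poison (and possibly initialize) the object. A true return means
	 * KASAN took ownership for quarantining, so the allocator must not
	 * touch the object until KASAN releases it.
	 */
	if (kasan_slab_free(s, object, init, still_accessible))
		return false;

	return true;
}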
include/linux/rcutiny.h

Lines changed: 5 additions & 0 deletions
@@ -111,6 +111,11 @@ static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr)
 	kvfree(ptr);
 }
 
+static inline void kvfree_rcu_barrier(void)
+{
+	rcu_barrier();
+}
+
 #ifdef CONFIG_KASAN_GENERIC
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
 #else

include/linux/rcutree.h

Lines changed: 1 addition & 0 deletions
@@ -35,6 +35,7 @@ static inline void rcu_virt_note_context_switch(void)
 
 void synchronize_rcu_expedited(void);
 void kvfree_call_rcu(struct rcu_head *head, void *ptr);
+void kvfree_rcu_barrier(void);
 
 void rcu_barrier(void);
 void rcu_momentary_dyntick_idle(void);

include/linux/slab.h

Lines changed: 29 additions & 0 deletions
@@ -547,6 +547,35 @@ void *kmem_cache_alloc_lru_noprof(struct kmem_cache *s, struct list_lru *lru,
 			   gfp_t gfpflags) __assume_slab_alignment __malloc;
 #define kmem_cache_alloc_lru(...)	alloc_hooks(kmem_cache_alloc_lru_noprof(__VA_ARGS__))
 
+/**
+ * kmem_cache_charge - memcg charge an already allocated slab memory
+ * @objp: address of the slab object to memcg charge
+ * @gfpflags: describe the allocation context
+ *
+ * kmem_cache_charge allows charging a slab object to the current memcg,
+ * primarily in cases where charging at allocation time might not be possible
+ * because the target memcg is not known (i.e. softirq context)
+ *
+ * The objp should be pointer returned by the slab allocator functions like
+ * kmalloc (with __GFP_ACCOUNT in flags) or kmem_cache_alloc. The memcg charge
+ * behavior can be controlled through gfpflags parameter, which affects how the
+ * necessary internal metadata can be allocated. Including __GFP_NOFAIL denotes
+ * that overcharging is requested instead of failure, but is not applied for the
+ * internal metadata allocation.
+ *
+ * There are several cases where it will return true even if the charging was
+ * not done:
+ * More specifically:
+ *
+ * 1. For !CONFIG_MEMCG or cgroup_disable=memory systems.
+ * 2. Already charged slab objects.
+ * 3. For slab objects from KMALLOC_NORMAL caches - allocated by kmalloc()
+ *    without __GFP_ACCOUNT
+ * 4. Allocating internal metadata has failed
+ *
+ * Return: true if charge was successful otherwise false.
+ */
+bool kmem_cache_charge(void *objp, gfp_t gfpflags);
 void kmem_cache_free(struct kmem_cache *s, void *objp);
 
 kmem_buckets *kmem_buckets_create(const char *name, slab_flags_t flags,

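To make the delayed-charging flow in the kernel-doc above concrete, here is a minimal usage sketch. It is not part of this commit; demo_cache, demo_obj and the softirq/task split are assumptions chosen for illustration.

#include <linux/slab.h>
#include <linux/gfp.h>
#include <linux/errno.h>

struct demo_obj {
	int payload;
};

static struct kmem_cache *demo_cache;	/* assumed: created elsewhere */

/*
 * Allocation happens where the target memcg is unknown (e.g. softirq),
 * so no accounting is requested at this point.
 */
static struct demo_obj *demo_alloc_in_softirq(void)
{
	return kmem_cache_alloc(demo_cache, GFP_ATOMIC);
}

/*
 * Once the object is adopted by a task that should pay for it, charge it
 * to the current memcg. Passing __GFP_NOFAIL would request overcharging
 * instead of failure (it is not applied to the internal metadata).
 */
static int demo_adopt_in_task(struct demo_obj *obj)
{
	return kmem_cache_charge(obj, GFP_KERNEL) ? 0 : -ENOMEM;
}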
kernel/rcu/tree.c

Lines changed: 101 additions & 8 deletions
@@ -3584,18 +3584,15 @@ kvfree_rcu_drain_ready(struct kfree_rcu_cpu *krcp)
 }
 
 /*
- * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ * Return: %true if a work is queued, %false otherwise.
  */
-static void kfree_rcu_monitor(struct work_struct *work)
+static bool
+kvfree_rcu_queue_batch(struct kfree_rcu_cpu *krcp)
 {
-	struct kfree_rcu_cpu *krcp = container_of(work,
-			struct kfree_rcu_cpu, monitor_work.work);
 	unsigned long flags;
+	bool queued = false;
 	int i, j;
 
-	// Drain ready for reclaim.
-	kvfree_rcu_drain_ready(krcp);
-
 	raw_spin_lock_irqsave(&krcp->lock, flags);
 
 	// Attempt to start a new batch.
@@ -3634,11 +3631,27 @@ static void kfree_rcu_monitor(struct work_struct *work)
 			// be that the work is in the pending state when
 			// channels have been detached following by each
 			// other.
-			queue_rcu_work(system_wq, &krwp->rcu_work);
+			queued = queue_rcu_work(system_wq, &krwp->rcu_work);
 		}
 	}
 
 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
+	return queued;
+}
+
+/*
+ * This function is invoked after the KFREE_DRAIN_JIFFIES timeout.
+ */
+static void kfree_rcu_monitor(struct work_struct *work)
+{
+	struct kfree_rcu_cpu *krcp = container_of(work,
+		struct kfree_rcu_cpu, monitor_work.work);
+
+	// Drain ready for reclaim.
+	kvfree_rcu_drain_ready(krcp);
+
+	// Queue a batch for a rest.
+	kvfree_rcu_queue_batch(krcp);
 
 	// If there is nothing to detach, it means that our job is
 	// successfully done here. In case of having at least one
@@ -3859,6 +3872,86 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr)
 }
 EXPORT_SYMBOL_GPL(kvfree_call_rcu);
 
+/**
+ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete.
+ *
+ * Note that a single argument of kvfree_rcu() call has a slow path that
+ * triggers synchronize_rcu() following by freeing a pointer. It is done
+ * before the return from the function. Therefore for any single-argument
+ * call that will result in a kfree() to a cache that is to be destroyed
+ * during module exit, it is developer's responsibility to ensure that all
+ * such calls have returned before the call to kmem_cache_destroy().
+ */
+void kvfree_rcu_barrier(void)
+{
+	struct kfree_rcu_cpu_work *krwp;
+	struct kfree_rcu_cpu *krcp;
+	bool queued;
+	int i, cpu;
+
+	/*
+	 * Firstly we detach objects and queue them over an RCU-batch
+	 * for all CPUs. Finally queued works are flushed for each CPU.
+	 *
+	 * Please note. If there are outstanding batches for a particular
+	 * CPU, those have to be finished first following by queuing a new.
+	 */
+	for_each_possible_cpu(cpu) {
+		krcp = per_cpu_ptr(&krc, cpu);
+
+		/*
+		 * Check if this CPU has any objects which have been queued for a
+		 * new GP completion. If not(means nothing to detach), we are done
+		 * with it. If any batch is pending/running for this "krcp", below
+		 * per-cpu flush_rcu_work() waits its completion(see last step).
+		 */
+		if (!need_offload_krc(krcp))
+			continue;
+
+		while (1) {
+			/*
+			 * If we are not able to queue a new RCU work it means:
+			 * - batches for this CPU are still in flight which should
+			 *   be flushed first and then repeat;
+			 * - no objects to detach, because of concurrency.
+			 */
+			queued = kvfree_rcu_queue_batch(krcp);

+
+			/*
+			 * Bail out, if there is no need to offload this "krcp"
+			 * anymore. As noted earlier it can run concurrently.
+			 */
+			if (queued || !need_offload_krc(krcp))
+				break;
+
+			/* There are ongoing batches. */
+			for (i = 0; i < KFREE_N_BATCHES; i++) {
+				krwp = &(krcp->krw_arr[i]);
+				flush_rcu_work(&krwp->rcu_work);
+			}
+		}
+	}
+
+	/*
+	 * Now we guarantee that all objects are flushed.
+	 */
+	for_each_possible_cpu(cpu) {
+		krcp = per_cpu_ptr(&krc, cpu);
+
+		/*
+		 * A monitor work can drain ready to reclaim objects
+		 * directly. Wait its completion if running or pending.
+		 */
+		cancel_delayed_work_sync(&krcp->monitor_work);
+
+		for (i = 0; i < KFREE_N_BATCHES; i++) {
+			krwp = &(krcp->krw_arr[i]);
+			flush_rcu_work(&krwp->rcu_work);
+		}
+	}
+}
+EXPORT_SYMBOL_GPL(kvfree_rcu_barrier);
+
 static unsigned long
 kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {

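On the caller side, the new barrier is meant for teardown paths such as module exit. Below is a minimal sketch, not taken from this commit (demo_node and demo_node_cache are hypothetical names); it mirrors the pattern exercised by the new KUnit test in the next file.

#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/module.h>

struct demo_node {
	struct rcu_head rcu;
	int data;
};

static struct kmem_cache *demo_node_cache;	/* assumed: created in module init */

static void demo_drop_node(struct demo_node *n)
{
	/* Two-argument kfree_rcu(): the free is deferred past a grace period. */
	kfree_rcu(n, rcu);
}

static void __exit demo_exit(void)
{
	/*
	 * Wait for all queued kfree_rcu()/kvfree_rcu() requests to drain
	 * before the backing cache disappears. (With this series,
	 * kmem_cache_destroy() also arranges such a barrier when needed.)
	 */
	kvfree_rcu_barrier();
	kmem_cache_destroy(demo_node_cache);
}
module_exit(demo_exit);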
lib/slub_kunit.c

Lines changed: 31 additions & 0 deletions
@@ -5,6 +5,7 @@
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/rcupdate.h>
 #include "../mm/slab.h"
 
 static struct kunit_resource resource;
@@ -157,6 +158,34 @@ static void test_kmalloc_redzone_access(struct kunit *test)
 	kmem_cache_destroy(s);
 }
 
+struct test_kfree_rcu_struct {
+	struct rcu_head rcu;
+};
+
+static void test_kfree_rcu(struct kunit *test)
+{
+	struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu",
+				sizeof(struct test_kfree_rcu_struct),
+				SLAB_NO_MERGE);
+	struct test_kfree_rcu_struct *p = kmem_cache_alloc(s, GFP_KERNEL);
+
+	kfree_rcu(p, rcu);
+	kmem_cache_destroy(s);
+
+	KUNIT_EXPECT_EQ(test, 0, slab_errors);
+}
+
+static void test_leak_destroy(struct kunit *test)
+{
+	struct kmem_cache *s = test_kmem_cache_create("TestSlub_kfree_rcu",
+							64, SLAB_NO_MERGE);
+	kmem_cache_alloc(s, GFP_KERNEL);
+
+	kmem_cache_destroy(s);
+
+	KUNIT_EXPECT_EQ(test, 1, slab_errors);
+}
+
 static int test_init(struct kunit *test)
 {
 	slab_errors = 0;
@@ -177,6 +206,8 @@ static struct kunit_case test_cases[] = {
 
 	KUNIT_CASE(test_clobber_redzone_free),
 	KUNIT_CASE(test_kmalloc_redzone_access),
+	KUNIT_CASE(test_kfree_rcu),
+	KUNIT_CASE(test_leak_destroy),
 	{}
 };
 
mm/Kconfig.debug

Lines changed: 32 additions & 0 deletions
@@ -70,6 +70,38 @@ config SLUB_DEBUG_ON
 	  off in a kernel built with CONFIG_SLUB_DEBUG_ON by specifying
 	  "slab_debug=-".
 
+config SLUB_RCU_DEBUG
+	bool "Enable UAF detection in TYPESAFE_BY_RCU caches (for KASAN)"
+	depends on SLUB_DEBUG
+	# SLUB_RCU_DEBUG should build fine without KASAN, but is currently useless
+	# without KASAN, so mark it as a dependency of KASAN for now.
+	depends on KASAN
+	default KASAN_GENERIC || KASAN_SW_TAGS
+	help
+	  Make SLAB_TYPESAFE_BY_RCU caches behave approximately as if the cache
+	  was not marked as SLAB_TYPESAFE_BY_RCU and every caller used
+	  kfree_rcu() instead.
+
+	  This is intended for use in combination with KASAN, to enable KASAN to
+	  detect use-after-free accesses in such caches.
+	  (KFENCE is able to do that independent of this flag.)
+
+	  This might degrade performance.
+	  Unfortunately this also prevents a very specific bug pattern from
+	  triggering (insufficient checks against an object being recycled
+	  within the RCU grace period); so this option can be turned off even on
+	  KASAN builds, in case you want to test for such a bug.
+
+	  If you're using this for testing bugs / fuzzing and care about
+	  catching all the bugs WAY more than performance, you might want to
+	  also turn on CONFIG_RCU_STRICT_GRACE_PERIOD.
+
+	  WARNING:
+	  This is designed as a debugging feature, not a security feature.
+	  Objects are sometimes recycled without RCU delay under memory pressure.
+
+	  If unsure, say N.
+
 config PAGE_OWNER
 	bool "Track page owner"
 	depends on DEBUG_KERNEL && STACKTRACE_SUPPORT

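For readers wondering what kind of bug the option targets, here is a hedged sketch of the pattern (hypothetical code, not from the kernel tree): with SLAB_TYPESAFE_BY_RCU only the slab page is RCU-delayed, so a reader that dereferences a stale pointer outside an RCU read-side critical section silently reads recycled memory; with CONFIG_SLUB_RCU_DEBUG=y the object itself is freed and poisoned only after a grace period, which lets KASAN report the access as a use-after-free.

#include <linux/slab.h>
#include <linux/rcupdate.h>
#include <linux/printk.h>

struct demo_entry {
	int value;
};

/* Assumed: created elsewhere with SLAB_TYPESAFE_BY_RCU. */
static struct kmem_cache *demo_cache;

static void demo_updater(struct demo_entry *e)
{
	/* Only the slab page is RCU-freed; the object slot can be reused. */
	kmem_cache_free(demo_cache, e);
}

static void demo_buggy_reader(struct demo_entry *stale)
{
	/*
	 * BUG (illustrative): no rcu_read_lock() and no revalidation, so a
	 * grace period may already have elapsed. With CONFIG_SLUB_RCU_DEBUG=y
	 * and KASAN, this access is reported as a use-after-free.
	 */
	pr_info("stale value: %d\n", stale->value);
}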