
Commit 5f3c8d6

urezki authored and paulmckrcu committed
rcu/tree: Maintain separate array for vmalloc ptrs
To do so, we use an array of kvfree_rcu_bulk_data structures. It consists of two elements:
- index number 0 corresponds to slab pointers.
- index number 1 corresponds to vmalloc pointers.

Keeping vmalloc pointers separated from slab pointers makes it possible to invoke the right freeing API for the right kind of pointer. It also prepares us for future headless support for vmalloc and SLAB objects. Such objects cannot be queued on a linked list and are instead placed directly into an array.

Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
Signed-off-by: Joel Fernandes (Google) <[email protected]>
Reviewed-by: Joel Fernandes (Google) <[email protected]>
Co-developed-by: Joel Fernandes (Google) <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
1 parent 53c72b5 commit 5f3c8d6
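
Not part of the patch: a minimal userspace sketch of the two-channel bookkeeping described above. Channel 0 collects slab (kmalloc) pointers and channel 1 collects vmalloc pointers, mirroring bkvhead[FREE_N_CHANNELS] in the patch. is_vmalloc_addr() is stubbed with a hint flag, and the block size is tiny on purpose; the kernel sizes each block to exactly one page.

/*
 * Illustrative only: channel selection by pointer kind, as introduced
 * by this commit, reduced to plain C so it can be compiled and run.
 */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define FREE_N_CHANNELS 2
#define BULK_MAX_ENTR   4	/* tiny on purpose; the kernel fills a page */

struct bulk_data {
	unsigned long nr_records;
	struct bulk_data *next;
	void *records[BULK_MAX_ENTR];
};

static struct bulk_data *bkvhead[FREE_N_CHANNELS];

/* Stand-in for the kernel's is_vmalloc_addr(). */
static bool fake_is_vmalloc_addr(const void *ptr, bool vmalloc_hint)
{
	(void)ptr;
	return vmalloc_hint;
}

/* Mirrors the channel selection in kvfree_call_rcu_add_ptr_to_bulk(). */
static void add_ptr_to_bulk(void *ptr, bool vmalloc_hint)
{
	int idx = !!fake_is_vmalloc_addr(ptr, vmalloc_hint);
	struct bulk_data *b = bkvhead[idx];

	if (!b || b->nr_records == BULK_MAX_ENTR) {
		b = calloc(1, sizeof(*b));	/* kernel: cached node or __get_free_page() */
		if (!b)
			exit(1);
		b->next = bkvhead[idx];
		bkvhead[idx] = b;
	}
	b->records[b->nr_records++] = ptr;
}

int main(void)
{
	int x, y;

	add_ptr_to_bulk(&x, false);	/* would later be drained via kfree_bulk() */
	add_ptr_to_bulk(&y, true);	/* would later be drained via vfree(), one by one */

	printf("slab channel: %lu record(s), vmalloc channel: %lu record(s)\n",
	       bkvhead[0]->nr_records, bkvhead[1]->nr_records);
	return 0;
}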


kernel/rcu/tree.c

Lines changed: 100 additions & 73 deletions
@@ -57,6 +57,8 @@
 #include <linux/slab.h>
 #include <linux/sched/isolation.h>
 #include <linux/sched/clock.h>
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
 #include "../time/tick-internal.h"
 
 #include "tree.h"
@@ -2966,46 +2968,47 @@ EXPORT_SYMBOL_GPL(call_rcu);
 /* Maximum number of jiffies to wait before draining a batch. */
 #define KFREE_DRAIN_JIFFIES (HZ / 50)
 #define KFREE_N_BATCHES 2
+#define FREE_N_CHANNELS 2
 
 /**
- * struct kfree_rcu_bulk_data - single block to store kfree_rcu() pointers
+ * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
  * @nr_records: Number of active pointers in the array
- * @records: Array of the kfree_rcu() pointers
  * @next: Next bulk object in the block chain
+ * @records: Array of the kvfree_rcu() pointers
  */
-struct kfree_rcu_bulk_data {
+struct kvfree_rcu_bulk_data {
 	unsigned long nr_records;
-	struct kfree_rcu_bulk_data *next;
+	struct kvfree_rcu_bulk_data *next;
 	void *records[];
 };
 
 /*
  * This macro defines how many entries the "records" array
  * will contain. It is based on the fact that the size of
- * kfree_rcu_bulk_data structure becomes exactly one page.
+ * kvfree_rcu_bulk_data structure becomes exactly one page.
  */
-#define KFREE_BULK_MAX_ENTR \
-	((PAGE_SIZE - sizeof(struct kfree_rcu_bulk_data)) / sizeof(void *))
+#define KVFREE_BULK_MAX_ENTR \
+	((PAGE_SIZE - sizeof(struct kvfree_rcu_bulk_data)) / sizeof(void *))
 
 /**
  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
  * @head_free: List of kfree_rcu() objects waiting for a grace period
- * @bhead_free: Bulk-List of kfree_rcu() objects waiting for a grace period
+ * @bkvhead_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
  * @krcp: Pointer to @kfree_rcu_cpu structure
  */
 
 struct kfree_rcu_cpu_work {
 	struct rcu_work rcu_work;
 	struct rcu_head *head_free;
-	struct kfree_rcu_bulk_data *bhead_free;
+	struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
 	struct kfree_rcu_cpu *krcp;
 };
 
 /**
  * struct kfree_rcu_cpu - batch up kfree_rcu() requests for RCU grace period
  * @head: List of kfree_rcu() objects not yet waiting for a grace period
- * @bhead: Bulk-List of kfree_rcu() objects not yet waiting for a grace period
+ * @bkvhead: Bulk-List of kvfree_rcu() objects not yet waiting for a grace period
  * @krw_arr: Array of batches of kfree_rcu() objects waiting for a grace period
  * @lock: Synchronize access to this structure
  * @monitor_work: Promote @head to @head_free after KFREE_DRAIN_JIFFIES
@@ -3020,7 +3023,7 @@ struct kfree_rcu_cpu_work {
  */
 struct kfree_rcu_cpu {
 	struct rcu_head *head;
-	struct kfree_rcu_bulk_data *bhead;
+	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
 	struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
 	raw_spinlock_t lock;
 	struct delayed_work monitor_work;
@@ -3044,7 +3047,7 @@ static DEFINE_PER_CPU(struct kfree_rcu_cpu, krc) = {
 };
 
 static __always_inline void
-debug_rcu_bhead_unqueue(struct kfree_rcu_bulk_data *bhead)
+debug_rcu_bhead_unqueue(struct kvfree_rcu_bulk_data *bhead)
 {
 #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD
 	int i;
@@ -3073,20 +3076,20 @@ krc_this_cpu_unlock(struct kfree_rcu_cpu *krcp, unsigned long flags)
 	local_irq_restore(flags);
 }
 
-static inline struct kfree_rcu_bulk_data *
+static inline struct kvfree_rcu_bulk_data *
 get_cached_bnode(struct kfree_rcu_cpu *krcp)
 {
 	if (!krcp->nr_bkv_objs)
 		return NULL;
 
 	krcp->nr_bkv_objs--;
-	return (struct kfree_rcu_bulk_data *)
+	return (struct kvfree_rcu_bulk_data *)
 		llist_del_first(&krcp->bkvcache);
 }
 
 static inline bool
 put_cached_bnode(struct kfree_rcu_cpu *krcp,
-	struct kfree_rcu_bulk_data *bnode)
+	struct kvfree_rcu_bulk_data *bnode)
 {
 	// Check the limit.
 	if (krcp->nr_bkv_objs >= rcu_min_cached_objs)
@@ -3105,43 +3108,63 @@ put_cached_bnode(struct kfree_rcu_cpu *krcp,
 static void kfree_rcu_work(struct work_struct *work)
 {
 	unsigned long flags;
+	struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS], *bnext;
 	struct rcu_head *head, *next;
-	struct kfree_rcu_bulk_data *bhead, *bnext;
 	struct kfree_rcu_cpu *krcp;
 	struct kfree_rcu_cpu_work *krwp;
+	int i, j;
 
 	krwp = container_of(to_rcu_work(work),
 			    struct kfree_rcu_cpu_work, rcu_work);
 	krcp = krwp->krcp;
+
 	raw_spin_lock_irqsave(&krcp->lock, flags);
+	// Channels 1 and 2.
+	for (i = 0; i < FREE_N_CHANNELS; i++) {
+		bkvhead[i] = krwp->bkvhead_free[i];
+		krwp->bkvhead_free[i] = NULL;
+	}
+
+	// Channel 3.
 	head = krwp->head_free;
 	krwp->head_free = NULL;
-	bhead = krwp->bhead_free;
-	krwp->bhead_free = NULL;
 	raw_spin_unlock_irqrestore(&krcp->lock, flags);
 
-	/* "bhead" is now private, so traverse locklessly. */
-	for (; bhead; bhead = bnext) {
-		bnext = bhead->next;
-
-		debug_rcu_bhead_unqueue(bhead);
-
-		rcu_lock_acquire(&rcu_callback_map);
-		trace_rcu_invoke_kfree_bulk_callback(rcu_state.name,
-			bhead->nr_records, bhead->records);
-
-		kfree_bulk(bhead->nr_records, bhead->records);
-		rcu_lock_release(&rcu_callback_map);
+	// Handle two first channels.
+	for (i = 0; i < FREE_N_CHANNELS; i++) {
+		for (; bkvhead[i]; bkvhead[i] = bnext) {
+			bnext = bkvhead[i]->next;
+			debug_rcu_bhead_unqueue(bkvhead[i]);
+
+			rcu_lock_acquire(&rcu_callback_map);
+			if (i == 0) { // kmalloc() / kfree().
+				trace_rcu_invoke_kfree_bulk_callback(
+					rcu_state.name, bkvhead[i]->nr_records,
+					bkvhead[i]->records);
+
+				kfree_bulk(bkvhead[i]->nr_records,
+					bkvhead[i]->records);
+			} else { // vmalloc() / vfree().
+				for (j = 0; j < bkvhead[i]->nr_records; j++) {
+					trace_rcu_invoke_kfree_callback(
+						rcu_state.name,
+						bkvhead[i]->records[j], 0);
+
+					vfree(bkvhead[i]->records[j]);
+				}
+			}
+			rcu_lock_release(&rcu_callback_map);
 
-		krcp = krc_this_cpu_lock(&flags);
-		if (put_cached_bnode(krcp, bhead))
-			bhead = NULL;
-		krc_this_cpu_unlock(krcp, flags);
+			krcp = krc_this_cpu_lock(&flags);
+			if (put_cached_bnode(krcp, bkvhead[i]))
+				bkvhead[i] = NULL;
+			krc_this_cpu_unlock(krcp, flags);
 
-		if (bhead)
-			free_page((unsigned long) bhead);
+			if (bkvhead[i])
+				free_page((unsigned long) bkvhead[i]);
 
-		cond_resched_tasks_rcu_qs();
+			cond_resched_tasks_rcu_qs();
+		}
 	}
 
 	/*
@@ -3159,7 +3182,7 @@ static void kfree_rcu_work(struct work_struct *work)
 		trace_rcu_invoke_kfree_callback(rcu_state.name, head, offset);
 
 		if (!WARN_ON_ONCE(!__is_kfree_rcu_offset(offset)))
-			kfree(ptr);
+			kvfree(ptr);
 
 		rcu_lock_release(&rcu_callback_map);
 		cond_resched_tasks_rcu_qs();
@@ -3176,29 +3199,33 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 {
 	struct kfree_rcu_cpu_work *krwp;
 	bool repeat = false;
-	int i;
+	int i, j;
 
 	lockdep_assert_held(&krcp->lock);
 
 	for (i = 0; i < KFREE_N_BATCHES; i++) {
 		krwp = &(krcp->krw_arr[i]);
 
 		/*
-		 * Try to detach bhead or head and attach it over any
+		 * Try to detach bkvhead or head and attach it over any
 		 * available corresponding free channel. It can be that
 		 * a previous RCU batch is in progress, it means that
 		 * immediately to queue another one is not possible so
 		 * return false to tell caller to retry.
 		 */
-		if ((krcp->bhead && !krwp->bhead_free) ||
+		if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
+			(krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
 			(krcp->head && !krwp->head_free)) {
-			/* Channel 1. */
-			if (!krwp->bhead_free) {
-				krwp->bhead_free = krcp->bhead;
-				krcp->bhead = NULL;
+			// Channel 1 corresponds to SLAB ptrs.
+			// Channel 2 corresponds to vmalloc ptrs.
+			for (j = 0; j < FREE_N_CHANNELS; j++) {
+				if (!krwp->bkvhead_free[j]) {
+					krwp->bkvhead_free[j] = krcp->bkvhead[j];
+					krcp->bkvhead[j] = NULL;
+				}
 			}
 
-			/* Channel 2. */
+			// Channel 3 corresponds to emergency path.
 			if (!krwp->head_free) {
 				krwp->head_free = krcp->head;
 				krcp->head = NULL;
@@ -3207,16 +3234,17 @@ static inline bool queue_kfree_rcu_work(struct kfree_rcu_cpu *krcp)
 			WRITE_ONCE(krcp->count, 0);
 
 			/*
-			 * One work is per one batch, so there are two "free channels",
-			 * "bhead_free" and "head_free" the batch can handle. It can be
-			 * that the work is in the pending state when two channels have
-			 * been detached following each other, one by one.
+			 * One work is per one batch, so there are three
+			 * "free channels", the batch can handle. It can
+			 * be that the work is in the pending state when
+			 * channels have been detached following by each
+			 * other.
 			 */
 			queue_rcu_work(system_wq, &krwp->rcu_work);
 		}
 
-		/* Repeat if any "free" corresponding channel is still busy. */
-		if (krcp->bhead || krcp->head)
+		// Repeat if any "free" corresponding channel is still busy.
+		if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
 			repeat = true;
 	}
 
@@ -3258,23 +3286,22 @@ static void kfree_rcu_monitor(struct work_struct *work)
 }
 
 static inline bool
-kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
-	struct rcu_head *head, rcu_callback_t func)
+kvfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp, void *ptr)
 {
-	struct kfree_rcu_bulk_data *bnode;
+	struct kvfree_rcu_bulk_data *bnode;
+	int idx;
 
 	if (unlikely(!krcp->initialized))
 		return false;
 
 	lockdep_assert_held(&krcp->lock);
+	idx = !!is_vmalloc_addr(ptr);
 
 	/* Check if a new block is required. */
-	if (!krcp->bhead ||
-			krcp->bhead->nr_records == KFREE_BULK_MAX_ENTR) {
+	if (!krcp->bkvhead[idx] ||
+			krcp->bkvhead[idx]->nr_records == KVFREE_BULK_MAX_ENTR) {
 		bnode = get_cached_bnode(krcp);
 		if (!bnode) {
-			WARN_ON_ONCE(sizeof(struct kfree_rcu_bulk_data) > PAGE_SIZE);
-
 			/*
 			 * To keep this path working on raw non-preemptible
 			 * sections, prevent the optional entry into the
@@ -3287,7 +3314,7 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 			if (IS_ENABLED(CONFIG_PREEMPT_RT))
 				return false;
 
-			bnode = (struct kfree_rcu_bulk_data *)
+			bnode = (struct kvfree_rcu_bulk_data *)
 				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		}
 
@@ -3297,30 +3324,30 @@ kfree_call_rcu_add_ptr_to_bulk(struct kfree_rcu_cpu *krcp,
 
 		/* Initialize the new block. */
 		bnode->nr_records = 0;
-		bnode->next = krcp->bhead;
+		bnode->next = krcp->bkvhead[idx];
 
 		/* Attach it to the head. */
-		krcp->bhead = bnode;
+		krcp->bkvhead[idx] = bnode;
 	}
 
 	/* Finally insert. */
-	krcp->bhead->records[krcp->bhead->nr_records++] =
-		(void *) head - (unsigned long) func;
+	krcp->bkvhead[idx]->records
+		[krcp->bkvhead[idx]->nr_records++] = ptr;
 
 	return true;
 }
 
 /*
- * Queue a request for lazy invocation of kfree_bulk()/kfree() after a grace
- * period. Please note there are two paths are maintained, one is the main one
- * that uses kfree_bulk() interface and second one is emergency one, that is
- * used only when the main path can not be maintained temporary, due to memory
- * pressure.
+ * Queue a request for lazy invocation of appropriate free routine after a
+ * grace period. Please note there are three paths are maintained, two are the
+ * main ones that use array of pointers interface and third one is emergency
+ * one, that is used only when the main path can not be maintained temporary,
+ * due to memory pressure.
  *
  * Each kfree_call_rcu() request is added to a batch. The batch will be drained
  * every KFREE_DRAIN_JIFFIES number of jiffies. All the objects in the batch will
  * be free'd in workqueue context. This allows us to: batch requests together to
- * reduce the number of grace periods during heavy kfree_rcu() load.
+ * reduce the number of grace periods during heavy kfree_rcu()/kvfree_rcu() load.
  */
 void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 {
@@ -3343,7 +3370,7 @@ void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
 	 * Under high memory pressure GFP_NOWAIT can fail,
 	 * in that case the emergency path is maintained.
 	 */
-	if (unlikely(!kfree_call_rcu_add_ptr_to_bulk(krcp, head, func))) {
+	if (unlikely(!kvfree_call_rcu_add_ptr_to_bulk(krcp, ptr))) {
 		head->func = func;
 		head->next = krcp->head;
 		krcp->head = head;
@@ -4324,15 +4351,15 @@ static void __init kfree_rcu_batch_init(void)
 
 	for_each_possible_cpu(cpu) {
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);
-		struct kfree_rcu_bulk_data *bnode;
+		struct kvfree_rcu_bulk_data *bnode;
 
 		for (i = 0; i < KFREE_N_BATCHES; i++) {
 			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;
 		}
 
 		for (i = 0; i < rcu_min_cached_objs; i++) {
-			bnode = (struct kfree_rcu_bulk_data *)
+			bnode = (struct kvfree_rcu_bulk_data *)
 				__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 
 			if (bnode)
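
For orientation only (not part of the commit), a caller-side sketch against the kernel API this patch feeds: the struct and function below are hypothetical, but kfree_rcu() and kfree_call_rcu() are the real entry points. A slab pointer queued this way is filed into channel 0 and later batch-freed with kfree_bulk(); a vmalloc-backed pointer reaching kvfree_call_rcu_add_ptr_to_bulk() goes to channel 1 and is released with vfree().

#include <linux/slab.h>
#include <linux/rcupdate.h>

/* Hypothetical object; only the embedded rcu_head is required. */
struct foo {
	int data;
	struct rcu_head rh;
};

static void release_foo(struct foo *fp)
{
	/*
	 * kfree_rcu() ends up in kfree_call_rcu() with the rcu_head offset
	 * encoded as the callback.  After this patch the pointer is stored
	 * in the per-CPU bkvhead[0] block (slab channel) and freed in bulk
	 * once a grace period has elapsed.
	 */
	kfree_rcu(fp, rh);
}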
