
Commit cc37d52

urezki authored and paulmckrcu committed
rcu/kvfree: Use a polled API to speedup a reclaim process
Currently all objects placed into a batch wait for a full grace period
to elapse after that batch is ready to send to RCU. However, this can
unnecessarily delay freeing of the first objects that were added to
the batch. After all, several RCU grace periods might have elapsed
since those objects were added, and if so, there is no point in
further deferring their freeing.

This commit therefore adds per-page grace-period snapshots which are
obtained from get_state_synchronize_rcu(). When the batch is ready to
be passed to call_rcu(), each page's snapshot is checked by passing it
to poll_state_synchronize_rcu(). If a given page's RCU grace period
has already elapsed, its objects are freed immediately by
kvfree_rcu_bulk(). Otherwise, these objects are freed after a call to
synchronize_rcu(). This approach requires that the pages be traversed
in reverse order, that is, the oldest ones first.

Test example:

    kvm.sh --memory 10G --torture rcuscale --allcpus --duration 1 \
        --kconfig CONFIG_NR_CPUS=64 \
        --kconfig CONFIG_RCU_NOCB_CPU=y \
        --kconfig CONFIG_RCU_NOCB_CPU_DEFAULT_ALL=y \
        --kconfig CONFIG_RCU_LAZY=n \
        --bootargs "rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 \
            rcuscale.holdoff=20 rcuscale.kfree_loops=10000 \
            torture.disable_onoff_at_boot" --trust-make

Before this commit:

    Total time taken by all kfree'ers: 8535693700 ns, loops: 10000, batches: 1188, memory footprint: 2248MB
    Total time taken by all kfree'ers: 8466933582 ns, loops: 10000, batches: 1157, memory footprint: 2820MB
    Total time taken by all kfree'ers: 5375602446 ns, loops: 10000, batches: 1130, memory footprint: 6502MB
    Total time taken by all kfree'ers: 7523283832 ns, loops: 10000, batches: 1006, memory footprint: 3343MB
    Total time taken by all kfree'ers: 6459171956 ns, loops: 10000, batches: 1150, memory footprint: 6549MB

After this commit:

    Total time taken by all kfree'ers: 8560060176 ns, loops: 10000, batches: 1787, memory footprint: 61MB
    Total time taken by all kfree'ers: 8573885501 ns, loops: 10000, batches: 1777, memory footprint: 93MB
    Total time taken by all kfree'ers: 8320000202 ns, loops: 10000, batches: 1727, memory footprint: 66MB
    Total time taken by all kfree'ers: 8552718794 ns, loops: 10000, batches: 1790, memory footprint: 75MB
    Total time taken by all kfree'ers: 8601368792 ns, loops: 10000, batches: 1724, memory footprint: 62MB

The reduction in memory footprint is well in excess of an order of
magnitude.

Signed-off-by: Uladzislau Rezki (Sony) <[email protected]>
Signed-off-by: Paul E. McKenney <[email protected]>
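For reference, the polled grace-period API named above follows the
pattern sketched below. This is illustrative only and not part of the
commit; "ptr" stands for any object whose readers are protected by RCU:

    // Take a cheap, non-blocking snapshot of the current RCU state.
    unsigned long cookie = get_state_synchronize_rcu();

    // ... time passes; zero or more grace periods may complete ...

    if (poll_state_synchronize_rcu(cookie)) {
            // A full grace period has elapsed since the snapshot was
            // taken, so the object can be freed immediately.
            kvfree(ptr);
    } else {
            // cond_synchronize_rcu() blocks only if the grace period
            // implied by the cookie has not yet completed.
            cond_synchronize_rcu(cookie);
            kvfree(ptr);
    }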
1 parent 8fc5494 commit cc37d52


kernel/rcu/tree.c

Lines changed: 39 additions & 8 deletions
@@ -2877,11 +2877,13 @@ EXPORT_SYMBOL_GPL(call_rcu);
 /**
  * struct kvfree_rcu_bulk_data - single block to store kvfree_rcu() pointers
  * @list: List node. All blocks are linked between each other
+ * @gp_snap: Snapshot of RCU state for objects placed to this bulk
  * @nr_records: Number of active pointers in the array
  * @records: Array of the kvfree_rcu() pointers
  */
 struct kvfree_rcu_bulk_data {
 	struct list_head list;
+	unsigned long gp_snap;
 	unsigned long nr_records;
 	void *records[];
 };
@@ -2898,13 +2900,15 @@ struct kvfree_rcu_bulk_data {
  * struct kfree_rcu_cpu_work - single batch of kfree_rcu() requests
  * @rcu_work: Let queue_rcu_work() invoke workqueue handler after grace period
  * @head_free: List of kfree_rcu() objects waiting for a grace period
+ * @head_free_gp_snap: Snapshot of RCU state for objects placed to "@head_free"
  * @bulk_head_free: Bulk-List of kvfree_rcu() objects waiting for a grace period
  * @krcp: Pointer to @kfree_rcu_cpu structure
  */

 struct kfree_rcu_cpu_work {
-	struct rcu_work rcu_work;
+	struct work_struct rcu_work;
 	struct rcu_head *head_free;
+	unsigned long head_free_gp_snap;
 	struct list_head bulk_head_free[FREE_N_CHANNELS];
 	struct kfree_rcu_cpu *krcp;
 };
@@ -3100,10 +3104,11 @@ static void kfree_rcu_work(struct work_struct *work)
 	struct rcu_head *head;
 	struct kfree_rcu_cpu *krcp;
 	struct kfree_rcu_cpu_work *krwp;
+	unsigned long head_free_gp_snap;
 	int i;

-	krwp = container_of(to_rcu_work(work),
-			    struct kfree_rcu_cpu_work, rcu_work);
+	krwp = container_of(work,
+			    struct kfree_rcu_cpu_work, rcu_work);
 	krcp = krwp->krcp;

 	raw_spin_lock_irqsave(&krcp->lock, flags);
@@ -3114,12 +3119,29 @@ static void kfree_rcu_work(struct work_struct *work)
 	// Channel 3.
 	head = krwp->head_free;
 	krwp->head_free = NULL;
+	head_free_gp_snap = krwp->head_free_gp_snap;
 	raw_spin_unlock_irqrestore(&krcp->lock, flags);

 	// Handle the first two channels.
-	for (i = 0; i < FREE_N_CHANNELS; i++)
+	for (i = 0; i < FREE_N_CHANNELS; i++) {
+		// Start from the tail page, so a GP is likely passed for it.
+		list_for_each_entry_safe_reverse(bnode, n, &bulk_head[i], list) {
+			// Not yet ready? Bail out since we need one more GP.
+			if (!poll_state_synchronize_rcu(bnode->gp_snap))
+				break;
+
+			list_del_init(&bnode->list);
+			kvfree_rcu_bulk(krcp, bnode, i);
+		}
+
+		// Please note a request for one more extra GP can
+		// occur only once for all objects in this batch.
+		if (!list_empty(&bulk_head[i]))
+			synchronize_rcu();
+
 		list_for_each_entry_safe(bnode, n, &bulk_head[i], list)
 			kvfree_rcu_bulk(krcp, bnode, i);
+	}

 	/*
 	 * This is used when the "bulk" path can not be used for the
@@ -3128,7 +3150,10 @@ static void kfree_rcu_work(struct work_struct *work)
 	 * queued on a linked list through their rcu_head structures.
 	 * This list is named "Channel 3".
 	 */
-	kvfree_rcu_list(head);
+	if (head) {
+		cond_synchronize_rcu(head_free_gp_snap);
+		kvfree_rcu_list(head);
+	}
 }

 static bool
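Stripped of diff markers, the drain logic in the two hunks above
reduces to the sketch below; the function name and parameter list are
assumptions made for illustration, condensed from the kernel code. The
key invariant: pages are head-inserted, so the list tail holds the
oldest page, and once one page's cookie has not yet expired, no newer
page's cookie can have expired either. Note also that bnode->gp_snap
is refreshed on every insertion (see the add_ptr_to_bulk_krc_lock()
hunk below), so an expired cookie covers every object in its page:

    // Condensed, illustrative version of the drain path above.
    static void drain_oldest_first(struct kfree_rcu_cpu *krcp,
                                   struct list_head *pages, int idx,
                                   struct rcu_head *head,
                                   unsigned long head_snap)
    {
            struct kvfree_rcu_bulk_data *bnode, *n;

            // Walk oldest-first; stop at the first page whose grace
            // period has not elapsed, since newer pages cannot be
            // ready either.
            list_for_each_entry_safe_reverse(bnode, n, pages, list) {
                    if (!poll_state_synchronize_rcu(bnode->gp_snap))
                            break;
                    list_del_init(&bnode->list);
                    kvfree_rcu_bulk(krcp, bnode, idx);
            }

            // At most one extra grace period covers every page that
            // is still queued.
            if (!list_empty(pages))
                    synchronize_rcu();
            list_for_each_entry_safe(bnode, n, pages, list)
                    kvfree_rcu_bulk(krcp, bnode, idx);

            // Channel 3: wait only if the cookie taken at detach
            // time has not yet expired.
            if (head) {
                    cond_synchronize_rcu(head_snap);
                    kvfree_rcu_list(head);
            }
    }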
@@ -3195,6 +3220,11 @@ static void kfree_rcu_monitor(struct work_struct *work)
 		if (!krwp->head_free) {
 			krwp->head_free = krcp->head;
 			WRITE_ONCE(krcp->head, NULL);
+
+			// Take a snapshot for this krwp. Please note no more
+			// any objects can be added to attached head_free channel
+			// therefore fixate a GP for it here.
+			krwp->head_free_gp_snap = get_state_synchronize_rcu();
 		}

 		WRITE_ONCE(krcp->count, 0);
@@ -3204,7 +3234,7 @@ static void kfree_rcu_monitor(struct work_struct *work)
 			// be that the work is in the pending state when
 			// channels have been detached following by each
 			// other.
-			queue_rcu_work(system_wq, &krwp->rcu_work);
+			queue_work(system_wq, &krwp->rcu_work);
 		}
 	}
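The queue_rcu_work() to queue_work() switch above is the enabling
change: queue_rcu_work() always interposed a fresh full grace period
between queueing and the handler, whereas queue_work() runs the
handler immediately, leaving it to consult the recorded cookies and
wait only when a grace period is actually still outstanding. In
contrast form (comments are illustrative, not from the commit):

    // Before: handler ran only after a full new grace period.
    queue_rcu_work(system_wq, &krwp->rcu_work);

    // After: run the handler right away; kfree_rcu_work() decides via
    // poll_state_synchronize_rcu()/cond_synchronize_rcu() whether any
    // further waiting is needed for each page or channel.
    queue_work(system_wq, &krwp->rcu_work);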

@@ -3332,8 +3362,9 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp,
 		list_add(&bnode->list, &(*krcp)->bulk_head[idx]);
 	}

-	/* Finally insert. */
+	// Finally insert and update the GP for this page.
 	bnode->records[bnode->nr_records++] = ptr;
+	bnode->gp_snap = get_state_synchronize_rcu();
 	return true;
 }

@@ -4783,7 +4814,7 @@ static void __init kfree_rcu_batch_init(void)
 		struct kfree_rcu_cpu *krcp = per_cpu_ptr(&krc, cpu);

 		for (i = 0; i < KFREE_N_BATCHES; i++) {
-			INIT_RCU_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
+			INIT_WORK(&krcp->krw_arr[i].rcu_work, kfree_rcu_work);
 			krcp->krw_arr[i].krcp = krcp;

 			for (j = 0; j < FREE_N_CHANNELS; j++)
