
Commit 28e9802

ryncsnakpm00 authored and committed
mm/list_lru: simplify reparenting and initial allocation
Currently, there is a lot of code for detecting reparent races using
kmemcg_id as the synchronization flag, and an intermediate table is
required to record and compare the kmemcg_id. We can simplify this by
just checking the cgroup css status and skipping the cgroup if it is
being offlined. On the reparenting side, ensure that no allocation is
ongoing and that no further allocation will occur, by using the XArray
lock as a barrier.

Combined with an O(n^2) top-down walk for the allocation, we get rid of
the intermediate table allocation completely. Despite being O(n^2), it
should actually be faster, because it is not practical to have a very
deep cgroup hierarchy, and in most cases the parent cgroup will have
been allocated already.

This also avoids changing kmemcg_id before reparenting, giving cgroups
a stable index for list_lru_memcg. After this change it is possible
that a dying cgroup will see a NULL value in the XArray corresponding
to its kmemcg_id, because the kmemcg_id points to an empty slot. In
that case, just fall back to its parent.

As a result the code is simpler, and the following test also showed a
very slight performance gain (12 test runs):

prepare() {
	mkdir /tmp/test-fs
	modprobe brd rd_nr=1 rd_size=16777216
	mkfs.xfs -f /dev/ram0
	mount -t xfs /dev/ram0 /tmp/test-fs
	for i in $(seq 10000); do
		seq 8000 > "/tmp/test-fs/$i"
	done
	mkdir -p /sys/fs/cgroup/system.slice/bench/test/1
	echo +memory > /sys/fs/cgroup/system.slice/bench/cgroup.subtree_control
	echo +memory > /sys/fs/cgroup/system.slice/bench/test/cgroup.subtree_control
	echo +memory > /sys/fs/cgroup/system.slice/bench/test/1/cgroup.subtree_control
	echo 768M > /sys/fs/cgroup/system.slice/bench/memory.max
}

do_test() {
	read_worker() {
		mkdir -p "/sys/fs/cgroup/system.slice/bench/test/1/$1"
		echo $BASHPID > "/sys/fs/cgroup/system.slice/bench/test/1/$1/cgroup.procs"
		read -r __TMP < "/tmp/test-fs/$1";
	}
	read_in_all() {
		for i in $(seq 10000); do read_worker "$i" & done; wait
	}
	echo 3 > /proc/sys/vm/drop_caches
	time read_in_all
	for i in $(seq 1 10000); do
		rmdir "/sys/fs/cgroup/system.slice/bench/test/1/$i" &>/dev/null
	done
}

Before:
real 0m3.498s   user 0m11.037s  sys 0m35.872s
real 1m33.860s  user 0m11.593s  sys 3m1.169s
real 1m31.883s  user 0m11.265s  sys 2m59.198s
real 1m32.394s  user 0m11.294s  sys 3m1.616s
real 1m31.017s  user 0m11.379s  sys 3m1.349s
real 1m31.931s  user 0m11.295s  sys 2m59.863s
real 1m32.758s  user 0m11.254s  sys 2m59.538s
real 1m35.198s  user 0m11.145s  sys 3m1.123s
real 1m30.531s  user 0m11.393s  sys 2m58.089s
real 1m31.142s  user 0m11.333s  sys 3m0.549s

After:
real 0m3.489s   user 0m10.943s  sys 0m36.036s
real 1m10.893s  user 0m11.495s  sys 2m38.545s
real 1m29.129s  user 0m11.382s  sys 3m1.601s
real 1m29.944s  user 0m11.494s  sys 3m1.575s
real 1m31.208s  user 0m11.451s  sys 2m59.693s
real 1m25.944s  user 0m11.327s  sys 2m56.394s
real 1m28.599s  user 0m11.312s  sys 3m0.162s
real 1m26.746s  user 0m11.538s  sys 2m55.462s
real 1m30.668s  user 0m11.475s  sys 3m2.075s
real 1m29.258s  user 0m11.292s  sys 3m0.780s

Which is slightly faster in real time.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Kairui Song <[email protected]>
Cc: Chengming Zhou <[email protected]>
Cc: Johannes Weiner <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Michal Hocko <[email protected]>
Cc: Muchun Song <[email protected]>
Cc: Qi Zheng <[email protected]>
Cc: Roman Gushchin <[email protected]>
Cc: Shakeel Butt <[email protected]>
Cc: Waiman Long <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent 8d42abb commit 28e9802
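
For illustration only, here is a minimal userspace sketch of the two ideas the commit message describes: the lookup falling back to the parent of a dying cgroup whose slot has already been cleared, and the top-down allocation walk that populates the farthest unpopulated ancestor first. This is not kernel code; the toy_memcg, toy_lookup and toy_alloc names are invented for the example, and the XArray and locking details are deliberately omitted.

#include <stdio.h>

struct toy_memcg {
	struct toy_memcg *parent;
	int has_slot;	/* models a populated list_lru_memcg slot in the XArray */
	int dying;	/* models css_is_dying() */
};

/* Lookup: walk toward the root until a populated slot is found.
 * Only a dying cgroup should ever hit the fallback path. */
static struct toy_memcg *toy_lookup(struct toy_memcg *memcg)
{
	while (!memcg->has_slot)
		memcg = memcg->parent;
	return memcg;
}

/* Allocation: repeatedly find the farthest ancestor without a slot and
 * populate it, until memcg itself is populated. This is O(depth^2) in
 * the worst case, but parents are usually populated already. */
static void toy_alloc(struct toy_memcg *memcg)
{
	struct toy_memcg *pos, *parent;

	do {
		pos = memcg;
		parent = pos->parent;
		while (parent && !parent->has_slot) {
			pos = parent;
			parent = pos->parent;
		}
		if (pos->dying)
			break;	/* skip cgroups that are being offlined */
		pos->has_slot = 1;
	} while (pos != memcg);
}

int main(void)
{
	struct toy_memcg root = { .parent = NULL, .has_slot = 1 };
	struct toy_memcg mid  = { .parent = &root };
	struct toy_memcg leaf = { .parent = &mid };

	toy_alloc(&leaf);	/* populates mid first, then leaf */
	printf("mid: %d leaf: %d\n", mid.has_slot, leaf.has_slot);

	/* Model reparenting of a dying leaf: its slot is cleared. */
	leaf.dying = 1;
	leaf.has_slot = 0;
	printf("dying leaf falls back to mid: %d\n", toy_lookup(&leaf) == &mid);
	return 0;
}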

File tree

2 files changed: +77, -108 lines changed


mm/list_lru.c

Lines changed: 74 additions & 104 deletions
@@ -59,6 +59,20 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
 	}
 	return &lru->node[nid].lru;
 }
+
+static inline struct list_lru_one *
+list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg)
+{
+	struct list_lru_one *l;
+again:
+	l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
+	if (likely(l))
+		return l;
+
+	memcg = parent_mem_cgroup(memcg);
+	VM_WARN_ON(!css_is_dying(&memcg->css));
+	goto again;
+}
 #else
 static void list_lru_register(struct list_lru *lru)
 {
@@ -83,6 +97,12 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx)
 {
 	return &lru->node[nid].lru;
 }
+
+static inline struct list_lru_one *
+list_lru_from_memcg(struct list_lru *lru, int nid, int idx)
+{
+	return &lru->node[nid].lru;
+}
 #endif /* CONFIG_MEMCG */
 
 /* The caller must ensure the memcg lifetime. */
@@ -94,7 +114,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid,
 
 	spin_lock(&nlru->lock);
 	if (list_empty(item)) {
-		l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
+		l = list_lru_from_memcg(lru, nid, memcg);
 		list_add_tail(item, &l->list);
 		/* Set shrinker bit if the first element was added */
 		if (!l->nr_items++)
@@ -133,7 +153,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid,
 
 	spin_lock(&nlru->lock);
 	if (!list_empty(item)) {
-		l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
+		l = list_lru_from_memcg(lru, nid, memcg);
 		list_del_init(item);
 		l->nr_items--;
 		nlru->nr_items--;
@@ -355,20 +375,6 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp)
 	return mlru;
 }
 
-static void memcg_list_lru_free(struct list_lru *lru, int src_idx)
-{
-	struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx);
-
-	/*
-	 * The __list_lru_walk_one() can walk the list of this node.
-	 * We need kvfree_rcu() here. And the walking of the list
-	 * is under lru->node[nid]->lock, which can serve as a RCU
-	 * read-side critical section.
-	 */
-	if (mlru)
-		kvfree_rcu(mlru, rcu);
-}
-
 static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
 {
 	if (memcg_aware)
@@ -393,22 +399,18 @@ static void memcg_destroy_list_lru(struct list_lru *lru)
 }
 
 static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
-					 int src_idx, struct mem_cgroup *dst_memcg)
+					 struct list_lru_one *src,
+					 struct mem_cgroup *dst_memcg)
 {
 	struct list_lru_node *nlru = &lru->node[nid];
-	int dst_idx = dst_memcg->kmemcg_id;
-	struct list_lru_one *src, *dst;
+	struct list_lru_one *dst;
 
 	/*
 	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
 	 * we have to use IRQ-safe primitives here to avoid deadlock.
 	 */
 	spin_lock_irq(&nlru->lock);
-
-	src = list_lru_from_memcg_idx(lru, nid, src_idx);
-	if (!src)
-		goto out;
-	dst = list_lru_from_memcg_idx(lru, nid, dst_idx);
+	dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg));
 
 	list_splice_init(&src->list, &dst->list);
 
@@ -417,46 +419,43 @@ static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid,
 		set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
 		src->nr_items = 0;
 	}
-out:
 	spin_unlock_irq(&nlru->lock);
 }
 
 void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent)
 {
-	struct cgroup_subsys_state *css;
 	struct list_lru *lru;
-	int src_idx = memcg->kmemcg_id, i;
-
-	/*
-	 * Change kmemcg_id of this cgroup and all its descendants to the
-	 * parent's id, and then move all entries from this cgroup's list_lrus
-	 * to ones of the parent.
-	 */
-	rcu_read_lock();
-	css_for_each_descendant_pre(css, &memcg->css) {
-		struct mem_cgroup *child;
-
-		child = mem_cgroup_from_css(css);
-		WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id);
-	}
-	rcu_read_unlock();
+	int i;
 
-	/*
-	 * With kmemcg_id set to parent, holding the lock of each list_lru_node
-	 * below can prevent list_lru_{add,del,isolate} from touching the lru,
-	 * safe to reparent.
-	 */
 	mutex_lock(&list_lrus_mutex);
 	list_for_each_entry(lru, &memcg_list_lrus, list) {
+		struct list_lru_memcg *mlru;
+		XA_STATE(xas, &lru->xa, memcg->kmemcg_id);
+
+		/*
+		 * Lock the Xarray to ensure no on going list_lru_memcg
+		 * allocation and further allocation will see css_is_dying().
+		 */
+		xas_lock_irq(&xas);
+		mlru = xas_store(&xas, NULL);
+		xas_unlock_irq(&xas);
+		if (!mlru)
+			continue;
+
+		/*
+		 * With Xarray value set to NULL, holding the lru lock below
+		 * prevents list_lru_{add,del,isolate} from touching the lru,
+		 * safe to reparent.
+		 */
 		for_each_node(i)
-			memcg_reparent_list_lru_node(lru, i, src_idx, parent);
+			memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent);
 
 		/*
 		 * Here all list_lrus corresponding to the cgroup are guaranteed
 		 * to remain empty, we can safely free this lru, any further
 		 * memcg_list_lru_alloc() call will simply bail out.
 		 */
-		memcg_list_lru_free(lru, src_idx);
+		kvfree_rcu(mlru, rcu);
 	}
 	mutex_unlock(&list_lrus_mutex);
 }
@@ -472,77 +471,48 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg,
 int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru,
 			 gfp_t gfp)
 {
-	int i;
 	unsigned long flags;
-	struct list_lru_memcg_table {
-		struct list_lru_memcg *mlru;
-		struct mem_cgroup *memcg;
-	} *table;
+	struct list_lru_memcg *mlru;
+	struct mem_cgroup *pos, *parent;
 	XA_STATE(xas, &lru->xa, 0);
 
 	if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru))
 		return 0;
 
 	gfp &= GFP_RECLAIM_MASK;
-	table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp);
-	if (!table)
-		return -ENOMEM;
-
 	/*
 	 * Because the list_lru can be reparented to the parent cgroup's
 	 * list_lru, we should make sure that this cgroup and all its
 	 * ancestors have allocated list_lru_memcg.
 	 */
-	for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) {
-		if (memcg_list_lru_allocated(memcg, lru))
-			break;
-
-		table[i].memcg = memcg;
-		table[i].mlru = memcg_init_list_lru_one(gfp);
-		if (!table[i].mlru) {
-			while (i--)
-				kfree(table[i].mlru);
-			kfree(table);
-			return -ENOMEM;
+	do {
+		/*
+		 * Keep finding the farest parent that wasn't populated
+		 * until found memcg itself.
+		 */
+		pos = memcg;
+		parent = parent_mem_cgroup(pos);
+		while (!memcg_list_lru_allocated(parent, lru)) {
+			pos = parent;
+			parent = parent_mem_cgroup(pos);
 		}
-	}
-
-	xas_lock_irqsave(&xas, flags);
-	while (i--) {
-		int index = READ_ONCE(table[i].memcg->kmemcg_id);
-		struct list_lru_memcg *mlru = table[i].mlru;
 
-		xas_set(&xas, index);
-retry:
-		if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) {
-			kfree(mlru);
-		} else {
-			xas_store(&xas, mlru);
-			if (xas_error(&xas) == -ENOMEM) {
-				xas_unlock_irqrestore(&xas, flags);
-				if (xas_nomem(&xas, gfp))
-					xas_set_err(&xas, 0);
-				xas_lock_irqsave(&xas, flags);
-				/*
-				 * The xas lock has been released, this memcg
-				 * can be reparented before us. So reload
-				 * memcg id. More details see the comments
-				 * in memcg_reparent_list_lrus().
-				 */
-				index = READ_ONCE(table[i].memcg->kmemcg_id);
-				if (index < 0)
-					xas_set_err(&xas, 0);
-				else if (!xas_error(&xas) && index != xas.xa_index)
-					xas_set(&xas, index);
-				goto retry;
+		mlru = memcg_init_list_lru_one(gfp);
+		if (!mlru)
+			return -ENOMEM;
+		xas_set(&xas, pos->kmemcg_id);
+		do {
+			xas_lock_irqsave(&xas, flags);
+			if (!xas_load(&xas) && !css_is_dying(&pos->css)) {
+				xas_store(&xas, mlru);
+				if (!xas_error(&xas))
+					mlru = NULL;
 			}
-		}
-	}
-	/* xas_nomem() is used to free memory instead of memory allocation. */
-	if (xas.xa_alloc)
-		xas_nomem(&xas, gfp);
-	xas_unlock_irqrestore(&xas, flags);
-	kfree(table);
+			xas_unlock_irqrestore(&xas, flags);
+		} while (xas_nomem(&xas, gfp));
+		if (mlru)
+			kfree(mlru);
+	} while (pos != memcg && !css_is_dying(&pos->css));
 
 	return xas_error(&xas);
 }

mm/zswap.c

Lines changed: 3 additions & 4 deletions
@@ -709,12 +709,11 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
 
 	/*
 	 * Note that it is safe to use rcu_read_lock() here, even in the face of
-	 * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection
-	 * used in list_lru lookup, only two scenarios are possible:
+	 * concurrent memcg offlining:
 	 *
-	 * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The
+	 * 1. list_lru_add() is called before list_lru_memcg is erased. The
 	 *    new entry will be reparented to memcg's parent's list_lru.
-	 * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The
+	 * 2. list_lru_add() is called after list_lru_memcg is erased. The
 	 *    new entry will be added directly to memcg's parent's list_lru.
 	 *
 	 * Similar reasoning holds for list_lru_del().
