Commit 62334fa
Author: Darrick J. Wong
xfs: use per-mount cpumask to track nonempty percpu inodegc lists
Directly track which CPUs have contributed to the inodegc percpu lists
instead of trusting the cpu online mask.  This eliminates a theoretical
problem where the inodegc flush functions might fail to flush a CPU's
inodes if that CPU happened to be dying at exactly the same time.  Most
likely nobody's noticed this because the CPU dead hook moves the percpu
inodegc list to another CPU and schedules that worker immediately.  But
it's quite possible that this is a subtle race leading to UAF if the
inodegc flush were part of an unmount.

Further benefits: This reduces the overhead of the inodegc flush code
slightly by allowing us to ignore CPUs that have empty lists.  Better
yet, it reduces our dependence on the cpu online masks, which have been
the cause of confusion and drama lately.

Fixes: ab23a77 ("xfs: per-cpu deferred inode inactivation queues")
Signed-off-by: Darrick J. Wong <[email protected]>
Reviewed-by: Dave Chinner <[email protected]>
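The design is a publish/subscribe handshake on a per-mount bitmask: a CPU
that queues an inode first adds it to its lockless per-cpu list, then sets
its bit in m_inodegc_cpumask with release-style ordering, and the flush
paths walk only the set bits.  Below is a minimal userspace sketch of that
pattern, assuming nothing beyond C11: queue_on_cpu(), flush_all(), and
nonempty_mask are illustrative names, and the atomics stand in for the
kernel's llist, cpumask, and barrier primitives.

	#include <stdatomic.h>
	#include <stdio.h>

	#define NR_CPUS	8

	struct node {
		struct node	*next;
		int		payload;
	};

	/* Stand-in for the per-cpu llist heads. */
	static _Atomic(struct node *) lists[NR_CPUS];
	/* Stand-in for mp->m_inodegc_cpumask: one bit per busy CPU. */
	static atomic_ulong nonempty_mask;

	/*
	 * Producer side, modeled on xfs_inodegc_queue(): publish the
	 * entry first, then set the mask bit with release ordering.
	 */
	static void queue_on_cpu(int cpu, struct node *n)
	{
		struct node *old = atomic_load_explicit(&lists[cpu],
						memory_order_relaxed);
		do {
			n->next = old;
		} while (!atomic_compare_exchange_weak_explicit(&lists[cpu],
				&old, n, memory_order_release,
				memory_order_relaxed));

		/*
		 * Anyone who sees this bit set must also see the node
		 * added above; this mirrors smp_mb__before_atomic() +
		 * cpumask_test_and_set_cpu() in the patch.
		 */
		atomic_fetch_or_explicit(&nonempty_mask, 1UL << cpu,
					 memory_order_release);
	}

	/*
	 * Flush side, modeled on xfs_inodegc_queue_all(): visit only
	 * CPUs whose bit is set, skipping empty ones entirely.
	 */
	static void flush_all(void)
	{
		unsigned long mask = atomic_load_explicit(&nonempty_mask,
						memory_order_acquire);

		for (int cpu = 0; cpu < NR_CPUS; cpu++) {
			if (!(mask & (1UL << cpu)))
				continue;
			atomic_fetch_and_explicit(&nonempty_mask,
					~(1UL << cpu), memory_order_acq_rel);
			struct node *n = atomic_exchange_explicit(&lists[cpu],
					NULL, memory_order_acquire);
			for (; n; n = n->next)
				printf("cpu %d: inode %d\n", cpu, n->payload);
		}
	}

	int main(void)
	{
		static struct node a = { .payload = 1 }, b = { .payload = 2 };

		queue_on_cpu(3, &a);
		queue_on_cpu(3, &b);
		flush_all();	/* drains cpu 3; the other seven are skipped */
		return 0;
	}

Note that the real patch tests the bit with cpumask_test_cpu() before
calling cpumask_test_and_set_cpu(), which avoids dirtying the mask's
cacheline on the common path where the bit is already set.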
Parent: ecd49f7

4 files changed, 33 insertions(+), 56 deletions(-)

fs/xfs/xfs_icache.c: 28 additions, 50 deletions

@@ -443,7 +443,7 @@ xfs_inodegc_queue_all(
 	int			cpu;
 	bool			ret = false;
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
 		if (!llist_empty(&gc->list)) {
 			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
@@ -463,7 +463,7 @@ xfs_inodegc_wait_all(
 	int			error = 0;
 
 	flush_workqueue(mp->m_inodegc_wq);
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		struct xfs_inodegc	*gc;
 
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
@@ -1845,9 +1845,17 @@ xfs_inodegc_worker(
 			struct xfs_inodegc, work);
 	struct llist_node	*node = llist_del_all(&gc->list);
 	struct xfs_inode	*ip, *n;
+	struct xfs_mount	*mp = gc->mp;
 	unsigned int		nofs_flag;
 
-	ASSERT(gc->cpu == smp_processor_id());
+	/*
+	 * Clear the cpu mask bit and ensure that we have seen the latest
+	 * update of the gc structure associated with this CPU. This matches
+	 * with the release semantics used when setting the cpumask bit in
+	 * xfs_inodegc_queue.
+	 */
+	cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask);
+	smp_mb__after_atomic();
 
 	WRITE_ONCE(gc->items, 0);
 
@@ -1862,7 +1870,7 @@ xfs_inodegc_worker(
 	nofs_flag = memalloc_nofs_save();
 
 	ip = llist_entry(node, struct xfs_inode, i_gclist);
-	trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
+	trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
 
 	WRITE_ONCE(gc->shrinker_hits, 0);
 	llist_for_each_entry_safe(ip, n, node, i_gclist) {
@@ -2057,25 +2065,36 @@ xfs_inodegc_queue(
 	struct xfs_inodegc	*gc;
 	int			items;
 	unsigned int		shrinker_hits;
+	unsigned int		cpu_nr;
 	unsigned long		queue_delay = 1;
 
 	trace_xfs_inode_set_need_inactive(ip);
 	spin_lock(&ip->i_flags_lock);
 	ip->i_flags |= XFS_NEED_INACTIVE;
 	spin_unlock(&ip->i_flags_lock);
 
-	gc = get_cpu_ptr(mp->m_inodegc);
+	cpu_nr = get_cpu();
+	gc = this_cpu_ptr(mp->m_inodegc);
 	llist_add(&ip->i_gclist, &gc->list);
 	items = READ_ONCE(gc->items);
 	WRITE_ONCE(gc->items, items + 1);
 	shrinker_hits = READ_ONCE(gc->shrinker_hits);
 
+	/*
+	 * Ensure the list add is always seen by anyone who finds the cpumask
+	 * bit set. This effectively gives the cpumask bit set operation
+	 * release ordering semantics.
+	 */
+	smp_mb__before_atomic();
+	if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
+		cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);
+
 	/*
 	 * We queue the work while holding the current CPU so that the work
 	 * is scheduled to run on this CPU.
 	 */
 	if (!xfs_is_inodegc_enabled(mp)) {
-		put_cpu_ptr(gc);
+		put_cpu();
 		return;
 	}
 
@@ -2085,55 +2104,14 @@ xfs_inodegc_queue(
 	trace_xfs_inodegc_queue(mp, __return_address);
 	mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
 			queue_delay);
-	put_cpu_ptr(gc);
+	put_cpu();
 
 	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
 		trace_xfs_inodegc_throttle(mp, __return_address);
 		flush_delayed_work(&gc->work);
 	}
 }
 
-/*
- * Fold the dead CPU inodegc queue into the current CPUs queue.
- */
-void
-xfs_inodegc_cpu_dead(
-	struct xfs_mount	*mp,
-	unsigned int		dead_cpu)
-{
-	struct xfs_inodegc	*dead_gc, *gc;
-	struct llist_node	*first, *last;
-	unsigned int		count = 0;
-
-	dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
-	cancel_delayed_work_sync(&dead_gc->work);
-
-	if (llist_empty(&dead_gc->list))
-		return;
-
-	first = dead_gc->list.first;
-	last = first;
-	while (last->next) {
-		last = last->next;
-		count++;
-	}
-	dead_gc->list.first = NULL;
-	dead_gc->items = 0;
-
-	/* Add pending work to current CPU */
-	gc = get_cpu_ptr(mp->m_inodegc);
-	llist_add_batch(first, last, &gc->list);
-	count += READ_ONCE(gc->items);
-	WRITE_ONCE(gc->items, count);
-
-	if (xfs_is_inodegc_enabled(mp)) {
-		trace_xfs_inodegc_queue(mp, __return_address);
-		mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
-				0);
-	}
-	put_cpu_ptr(gc);
-}
-
 /*
  * We set the inode flag atomically with the radix tree tag. Once we get tag
  * lookups on the radix tree, this inode flag can go away.
@@ -2195,7 +2173,7 @@ xfs_inodegc_shrinker_count(
 	if (!xfs_is_inodegc_enabled(mp))
 		return 0;
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
 		if (!llist_empty(&gc->list))
 			return XFS_INODEGC_SHRINKER_COUNT;
@@ -2220,7 +2198,7 @@ xfs_inodegc_shrinker_scan(
 
 	trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
 		if (!llist_empty(&gc->list)) {
 			unsigned int h = READ_ONCE(gc->shrinker_hits);
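
A note on the two new barriers, which form a pair: in xfs_inodegc_queue(),
smp_mb__before_atomic() orders the llist_add() before the cpumask bit set,
so setting the bit behaves like a release; in xfs_inodegc_worker(),
cpumask_clear_cpu() followed by smp_mb__after_atomic() orders the bit clear
against the worker's subsequent reads.  The net guarantee is that a flusher
observing a CPU's bit also observes the inode queued before it.  Here is a
condensed userspace model of that guarantee; producer/flusher names are
illustrative, and C11 release/acquire stands in for the kernel's full
barriers, which are stronger but give the same publish guarantee here.

	#include <pthread.h>
	#include <stdatomic.h>
	#include <stdio.h>

	static int list;		/* plain data, like the llist nodes */
	static atomic_int bit;		/* stand-in for the cpumask bit */

	static void *producer(void *arg)
	{
		(void)arg;
		list = 42;	/* llist_add() */
		/* smp_mb__before_atomic() + bit set: release semantics */
		atomic_store_explicit(&bit, 1, memory_order_release);
		return NULL;
	}

	static void *flusher(void *arg)
	{
		(void)arg;
		/* Observing the bit is the acquire side of the pair. */
		if (atomic_load_explicit(&bit, memory_order_acquire))
			printf("saw bit set, list = %d\n", list); /* always 42 */
		else
			printf("bit clear, nothing to flush\n");
		return NULL;
	}

	int main(void)
	{
		pthread_t p, f;

		pthread_create(&p, NULL, producer, NULL);
		pthread_create(&f, NULL, flusher, NULL);
		pthread_join(p, NULL);
		pthread_join(f, NULL);
		return 0;
	}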

fs/xfs/xfs_icache.h: 0 additions, 1 deletion

@@ -79,7 +79,6 @@ void xfs_inodegc_push(struct xfs_mount *mp);
 int xfs_inodegc_flush(struct xfs_mount *mp);
 void xfs_inodegc_stop(struct xfs_mount *mp);
 void xfs_inodegc_start(struct xfs_mount *mp);
-void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
 int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
 
 #endif

fs/xfs/xfs_mount.h: 4 additions, 2 deletions

@@ -60,16 +60,15 @@ struct xfs_error_cfg {
  * Per-cpu deferred inode inactivation GC lists.
  */
 struct xfs_inodegc {
+	struct xfs_mount	*mp;
 	struct llist_head	list;
 	struct delayed_work	work;
 	int			error;
 
 	/* approximate count of inodes in the list */
 	unsigned int		items;
 	unsigned int		shrinker_hits;
-#if defined(DEBUG) || defined(XFS_WARN)
 	unsigned int		cpu;
-#endif
 };
 
 /*
@@ -249,6 +248,9 @@ typedef struct xfs_mount {
 	unsigned int		*m_errortag;
 	struct xfs_kobj		m_errortag_kobj;
 #endif
+
+	/* cpus that have inodes queued for inactivation */
+	struct cpumask		m_inodegc_cpumask;
 } xfs_mount_t;
 
 #define M_IGEO(mp)		(&(mp)->m_ino_geo)

fs/xfs/xfs_super.c: 1 addition, 3 deletions

@@ -1135,9 +1135,8 @@ xfs_inodegc_init_percpu(
 
 	for_each_possible_cpu(cpu) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
-#if defined(DEBUG) || defined(XFS_WARN)
 		gc->cpu = cpu;
-#endif
+		gc->mp = mp;
 		init_llist_head(&gc->list);
 		gc->items = 0;
 		gc->error = 0;
@@ -2336,7 +2335,6 @@ xfs_cpu_dead(
 	spin_lock(&xfs_mount_list_lock);
 	list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) {
 		spin_unlock(&xfs_mount_list_lock);
-		xfs_inodegc_cpu_dead(mp, cpu);
 		spin_lock(&xfs_mount_list_lock);
 	}
 	spin_unlock(&xfs_mount_list_lock);
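
With the mask in place, the xfs_inodegc_cpu_dead() hook is no longer
needed: a dying CPU's bit stays set in m_inodegc_cpumask until a worker
drains the list, so flushes keep visiting that CPU's list even after it
leaves the online mask, and the workqueue layer remains responsible for
running work that was already queued there.  A small sketch of that
property; the mask values are made up for illustration.

	#include <stdio.h>

	#define NR_CPUS	8

	int main(void)
	{
		/* cpu 4 queued inodes and then went offline */
		unsigned long online_mask  = 0xffUL & ~(1UL << 4);
		unsigned long inodegc_mask = 1UL << 4;

		for (int cpu = 0; cpu < NR_CPUS; cpu++) {
			if (!(inodegc_mask & (1UL << cpu)))
				continue;
			/*
			 * Visited regardless of online state: the per-mount
			 * mask, not cpu_online_mask, decides what to flush.
			 */
			printf("draining cpu %d (online=%d)\n", cpu,
			       !!(online_mask & (1UL << cpu)));
		}
		return 0;
	}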
