
Commit 0a229c9

Authored and committed by Chandan Babu R
Merge tag 'fix-percpu-lists-6.6_2023-09-12' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.6-fixesA

xfs: fix cpu hotplug mess

Ritesh and Eric separately reported crashes in XFS's hook function for
CPU hot remove if the remove event races with a filesystem being
mounted. I also noticed via generic/650 that once in a while the log
will shut down over an apparent overrun of a transaction reservation;
this turned out to be due to CIL percpu list aggregation failing to
pick up the percpu list items from a dying CPU.

Either way, the solution here is to eliminate the need for a CPU dying
hook by using a private cpumask to track which CPUs have added to
their percpu lists directly, and iterating with that mask. This fixes
the log problems and (I think) solves a theoretical UAF bug in the
inodegc code too.

Signed-off-by: Darrick J. Wong <[email protected]>
Signed-off-by: Chandan Babu R <[email protected]>

* tag 'fix-percpu-lists-6.6_2023-09-12' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: remove CPU hotplug infrastructure
  xfs: remove the all-mounts list
  xfs: use per-mount cpumask to track nonempty percpu inodegc lists
  xfs: fix per-cpu CIL structure aggregation racing with dying cpus

2 parents: da6f841 + ef7d959; commit 0a229c9
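The common pattern behind both fixes: each CPU adds its work to a percpu list, then sets its bit in a shared cpumask (the smp_mb__before_atomic() in the diffs below gives that bit-set effective release ordering), and aggregation walks only the CPUs named in the mask rather than every online CPU, so items queued on a CPU that later goes offline are still found. A rough userspace analogue of that publish-and-aggregate pattern is sketched below using C11 atomics; the names (queue_work, aggregate, slot_mask) are hypothetical, and the kernel code uses llist, cpumask and the smp_mb__*_atomic() barriers rather than these primitives.

/*
 * Sketch of the cpumask-tracking pattern: writers publish work into a
 * per-slot structure and advertise the slot in a shared bitmask; the
 * aggregator snapshots the mask and visits only advertised slots.
 * Hypothetical demo code, not the kernel implementation.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NR_SLOTS 8

struct slot {
	int items;			/* stand-in for a percpu llist */
};

static struct slot slots[NR_SLOTS];
static atomic_ulong slot_mask;		/* bit n set => slots[n] may hold work */

static void queue_work(int slot_nr)
{
	slots[slot_nr].items++;		/* add work to "this CPU's" slot */

	/*
	 * Release ordering: anyone who sees the mask bit also sees the
	 * item added above (the kernel uses smp_mb__before_atomic()
	 * followed by cpumask_test_and_set_cpu()).
	 */
	atomic_fetch_or_explicit(&slot_mask, 1UL << slot_nr,
				 memory_order_release);
}

static void aggregate(void)
{
	/* Acquire pairs with the release above; then walk only set bits. */
	unsigned long mask = atomic_exchange_explicit(&slot_mask, 0,
						      memory_order_acquire);
	int total = 0;

	for (int n = 0; n < NR_SLOTS; n++) {
		if (mask & (1UL << n)) {
			total += slots[n].items;
			slots[n].items = 0;
		}
	}
	printf("aggregated %d items\n", total);
}

int main(void)
{
	queue_work(2);
	queue_work(5);
	aggregate();
	return 0;
}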

File tree: 7 files changed (+56, -183 lines)

fs/xfs/xfs_icache.c

Lines changed: 28 additions & 50 deletions
@@ -443,7 +443,7 @@ xfs_inodegc_queue_all(
 	int			cpu;
 	bool			ret = false;
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
 		if (!llist_empty(&gc->list)) {
 			mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
@@ -463,7 +463,7 @@ xfs_inodegc_wait_all(
 	int			error = 0;
 
 	flush_workqueue(mp->m_inodegc_wq);
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		struct xfs_inodegc	*gc;
 
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
@@ -1845,9 +1845,17 @@ xfs_inodegc_worker(
 						struct xfs_inodegc, work);
 	struct llist_node	*node = llist_del_all(&gc->list);
 	struct xfs_inode	*ip, *n;
+	struct xfs_mount	*mp = gc->mp;
 	unsigned int		nofs_flag;
 
-	ASSERT(gc->cpu == smp_processor_id());
+	/*
+	 * Clear the cpu mask bit and ensure that we have seen the latest
+	 * update of the gc structure associated with this CPU. This matches
+	 * with the release semantics used when setting the cpumask bit in
+	 * xfs_inodegc_queue.
+	 */
+	cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask);
+	smp_mb__after_atomic();
 
 	WRITE_ONCE(gc->items, 0);
 
@@ -1862,7 +1870,7 @@ xfs_inodegc_worker(
 	nofs_flag = memalloc_nofs_save();
 
 	ip = llist_entry(node, struct xfs_inode, i_gclist);
-	trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
+	trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
 
 	WRITE_ONCE(gc->shrinker_hits, 0);
 	llist_for_each_entry_safe(ip, n, node, i_gclist) {
@@ -2057,25 +2065,36 @@ xfs_inodegc_queue(
 	struct xfs_inodegc	*gc;
 	int			items;
 	unsigned int		shrinker_hits;
+	unsigned int		cpu_nr;
 	unsigned long		queue_delay = 1;
 
 	trace_xfs_inode_set_need_inactive(ip);
 	spin_lock(&ip->i_flags_lock);
 	ip->i_flags |= XFS_NEED_INACTIVE;
 	spin_unlock(&ip->i_flags_lock);
 
-	gc = get_cpu_ptr(mp->m_inodegc);
+	cpu_nr = get_cpu();
+	gc = this_cpu_ptr(mp->m_inodegc);
 	llist_add(&ip->i_gclist, &gc->list);
 	items = READ_ONCE(gc->items);
 	WRITE_ONCE(gc->items, items + 1);
 	shrinker_hits = READ_ONCE(gc->shrinker_hits);
 
+	/*
+	 * Ensure the list add is always seen by anyone who finds the cpumask
+	 * bit set. This effectively gives the cpumask bit set operation
+	 * release ordering semantics.
+	 */
+	smp_mb__before_atomic();
+	if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
+		cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);
+
 	/*
 	 * We queue the work while holding the current CPU so that the work
 	 * is scheduled to run on this CPU.
 	 */
 	if (!xfs_is_inodegc_enabled(mp)) {
-		put_cpu_ptr(gc);
+		put_cpu();
 		return;
 	}
 
@@ -2085,55 +2104,14 @@ xfs_inodegc_queue(
 	trace_xfs_inodegc_queue(mp, __return_address);
 	mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
 			queue_delay);
-	put_cpu_ptr(gc);
+	put_cpu();
 
 	if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
 		trace_xfs_inodegc_throttle(mp, __return_address);
 		flush_delayed_work(&gc->work);
 	}
 }
 
-/*
- * Fold the dead CPU inodegc queue into the current CPUs queue.
- */
-void
-xfs_inodegc_cpu_dead(
-	struct xfs_mount	*mp,
-	unsigned int		dead_cpu)
-{
-	struct xfs_inodegc	*dead_gc, *gc;
-	struct llist_node	*first, *last;
-	unsigned int		count = 0;
-
-	dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
-	cancel_delayed_work_sync(&dead_gc->work);
-
-	if (llist_empty(&dead_gc->list))
-		return;
-
-	first = dead_gc->list.first;
-	last = first;
-	while (last->next) {
-		last = last->next;
-		count++;
-	}
-	dead_gc->list.first = NULL;
-	dead_gc->items = 0;
-
-	/* Add pending work to current CPU */
-	gc = get_cpu_ptr(mp->m_inodegc);
-	llist_add_batch(first, last, &gc->list);
-	count += READ_ONCE(gc->items);
-	WRITE_ONCE(gc->items, count);
-
-	if (xfs_is_inodegc_enabled(mp)) {
-		trace_xfs_inodegc_queue(mp, __return_address);
-		mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
-				0);
-	}
-	put_cpu_ptr(gc);
-}
-
 /*
  * We set the inode flag atomically with the radix tree tag. Once we get tag
  * lookups on the radix tree, this inode flag can go away.
@@ -2195,7 +2173,7 @@ xfs_inodegc_shrinker_count(
 	if (!xfs_is_inodegc_enabled(mp))
 		return 0;
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
 		if (!llist_empty(&gc->list))
 			return XFS_INODEGC_SHRINKER_COUNT;
@@ -2220,7 +2198,7 @@ xfs_inodegc_shrinker_scan(
 
 	trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
 		gc = per_cpu_ptr(mp->m_inodegc, cpu);
 		if (!llist_empty(&gc->list)) {
 			unsigned int	h = READ_ONCE(gc->shrinker_hits);

fs/xfs/xfs_icache.h

Lines changed: 0 additions & 1 deletion
@@ -79,7 +79,6 @@ void xfs_inodegc_push(struct xfs_mount *mp);
 int xfs_inodegc_flush(struct xfs_mount *mp);
 void xfs_inodegc_stop(struct xfs_mount *mp);
 void xfs_inodegc_start(struct xfs_mount *mp);
-void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
 int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
 
 #endif

fs/xfs/xfs_log_cil.c

Lines changed: 16 additions & 36 deletions
@@ -124,7 +124,7 @@ xlog_cil_push_pcp_aggregate(
 	struct xlog_cil_pcp	*cilpcp;
 	int			cpu;
 
-	for_each_online_cpu(cpu) {
+	for_each_cpu(cpu, &ctx->cil_pcpmask) {
 		cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
 
 		ctx->ticket->t_curr_res += cilpcp->space_reserved;
@@ -165,7 +165,13 @@ xlog_cil_insert_pcp_aggregate(
 	if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
 		return;
 
-	for_each_online_cpu(cpu) {
+	/*
+	 * We can race with other cpus setting cil_pcpmask. However, we've
+	 * atomically cleared PCP_SPACE which forces other threads to add to
+	 * the global space used count. cil_pcpmask is a superset of cilpcp
+	 * structures that could have a nonzero space_used.
+	 */
+	for_each_cpu(cpu, &ctx->cil_pcpmask) {
 		int	old, prev;
 
 		cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
@@ -554,6 +560,7 @@ xlog_cil_insert_items(
 	int			iovhdr_res = 0, split_res = 0, ctx_res = 0;
 	int			space_used;
 	int			order;
+	unsigned int		cpu_nr;
 	struct xlog_cil_pcp	*cilpcp;
 
 	ASSERT(tp);
@@ -577,7 +584,12 @@ xlog_cil_insert_items(
 	 * can't be scheduled away between split sample/update operations that
 	 * are done without outside locking to serialise them.
 	 */
-	cilpcp = get_cpu_ptr(cil->xc_pcp);
+	cpu_nr = get_cpu();
+	cilpcp = this_cpu_ptr(cil->xc_pcp);
+
+	/* Tell the future push that there was work added by this CPU. */
+	if (!cpumask_test_cpu(cpu_nr, &ctx->cil_pcpmask))
+		cpumask_test_and_set_cpu(cpu_nr, &ctx->cil_pcpmask);
 
 	/*
 	 * We need to take the CIL checkpoint unit reservation on the first
@@ -663,7 +675,7 @@ xlog_cil_insert_items(
 			continue;
 		list_add_tail(&lip->li_cil, &cilpcp->log_items);
 	}
-	put_cpu_ptr(cilpcp);
+	put_cpu();
 
 	/*
 	 * If we've overrun the reservation, dump the tx details before we move
@@ -1790,38 +1802,6 @@ xlog_cil_force_seq(
 	return 0;
 }
 
-/*
- * Move dead percpu state to the relevant CIL context structures.
- *
- * We have to lock the CIL context here to ensure that nothing is modifying
- * the percpu state, either addition or removal. Both of these are done under
- * the CIL context lock, so grabbing that exclusively here will ensure we can
- * safely drain the cilpcp for the CPU that is dying.
- */
-void
-xlog_cil_pcp_dead(
-	struct xlog		*log,
-	unsigned int		cpu)
-{
-	struct xfs_cil		*cil = log->l_cilp;
-	struct xlog_cil_pcp	*cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
-	struct xfs_cil_ctx	*ctx;
-
-	down_write(&cil->xc_ctx_lock);
-	ctx = cil->xc_ctx;
-	if (ctx->ticket)
-		ctx->ticket->t_curr_res += cilpcp->space_reserved;
-	cilpcp->space_reserved = 0;
-
-	if (!list_empty(&cilpcp->log_items))
-		list_splice_init(&cilpcp->log_items, &ctx->log_items);
-	if (!list_empty(&cilpcp->busy_extents))
-		list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
-	atomic_add(cilpcp->space_used, &ctx->space_used);
-	cilpcp->space_used = 0;
-	up_write(&cil->xc_ctx_lock);
-}
-
 /*
  * Perform initial CIL structure initialisation.
  */
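The comment added to xlog_cil_insert_pcp_aggregate() above relies on a simple property: cil_pcpmask may name more CPUs than actually hold data, and iterating that superset is harmless because an untouched percpu structure contributes zero. A minimal standalone check of that property (hypothetical names, not kernel code):

#include <assert.h>

#define NR 4

int main(void)
{
	/* Per-"CPU" space counters; slots 0 and 2 never contributed. */
	int space_used[NR] = { 0, 10, 0, 5 };
	unsigned int exact_mask = (1u << 1) | (1u << 3);	/* CPUs that added */
	unsigned int superset_mask = exact_mask | (1u << 0);	/* one stale bit */
	int exact_sum = 0, superset_sum = 0;

	for (int n = 0; n < NR; n++) {
		if (exact_mask & (1u << n))
			exact_sum += space_used[n];
		if (superset_mask & (1u << n))
			superset_sum += space_used[n];
	}

	/* Stale bits select only zero-valued slots, so the totals match. */
	assert(exact_sum == superset_sum);
	return 0;
}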

fs/xfs/xfs_log_priv.h

Lines changed: 6 additions & 8 deletions
@@ -231,6 +231,12 @@ struct xfs_cil_ctx {
 	struct work_struct	discard_endio_work;
 	struct work_struct	push_work;
 	atomic_t		order_id;
+
+	/*
+	 * CPUs that could have added items to the percpu CIL data. Access is
+	 * coordinated with xc_ctx_lock.
+	 */
+	struct cpumask		cil_pcpmask;
 };
 
 /*
@@ -278,9 +284,6 @@ struct xfs_cil {
 	wait_queue_head_t	xc_push_wait;	/* background push throttle */
 
 	void __percpu		*xc_pcp;	/* percpu CIL structures */
-#ifdef CONFIG_HOTPLUG_CPU
-	struct list_head	xc_pcp_list;
-#endif
 } ____cacheline_aligned_in_smp;
 
 /* xc_flags bit values */
@@ -705,9 +708,4 @@ xlog_kvmalloc(
 	return p;
 }
 
-/*
- * CIL CPU dead notifier
- */
-void	xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
-
 #endif	/* __XFS_LOG_PRIV_H__ */

fs/xfs/xfs_mount.h

Lines changed: 4 additions & 3 deletions
@@ -60,16 +60,15 @@ struct xfs_error_cfg {
  * Per-cpu deferred inode inactivation GC lists.
  */
 struct xfs_inodegc {
+	struct xfs_mount	*mp;
 	struct llist_head	list;
 	struct delayed_work	work;
 	int			error;
 
 	/* approximate count of inodes in the list */
 	unsigned int		items;
 	unsigned int		shrinker_hits;
-#if defined(DEBUG) || defined(XFS_WARN)
 	unsigned int		cpu;
-#endif
 };
 
 /*
@@ -98,7 +97,6 @@ typedef struct xfs_mount {
 	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
 	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
 	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
-	struct list_head	m_mount_list;	/* global mount list */
 	void __percpu		*m_inodegc;	/* percpu inodegc structures */
 
 	/*
@@ -249,6 +247,9 @@ typedef struct xfs_mount {
 	unsigned int		*m_errortag;
 	struct xfs_kobj		m_errortag_kobj;
 #endif
+
+	/* cpus that have inodes queued for inactivation */
+	struct cpumask		m_inodegc_cpumask;
 } xfs_mount_t;
 
 #define M_IGEO(mp)	(&(mp)->m_ino_geo)
