Skip to content

Commit 4ba3fcd

Browse files
zhangyi089, tytso
authored and committed
jbd2,ext4: add a shrinker to release checkpointed buffers
The current metadata buffer release logic in bdev_try_to_free_page() has a lot of use-after-free issues when the filesystem is unmounted concurrently. It is difficult to fix directly, because ext4 is the only user of the s_op->bdev_try_to_free_page callback and we would have to add more special refcounts or locks, used only by ext4, into the common VFS layer, which is unacceptable. A better solution is to remove the bdev_try_to_free_page callback, but the real problem is that we cannot easily release the journal_head on a checkpointed buffer, so try_to_free_buffers() cannot release buffers and pages under memory pressure, which makes an out-of-memory condition more likely. So we cannot remove the callback before we find another way to release the journal_head. This patch introduces a shrinker to free the journal_head on checkpointed transactions. After the journal_head has been freed, try_to_free_buffers() can free the buffer properly.
Signed-off-by: Zhang Yi <[email protected]>
Suggested-by: Jan Kara <[email protected]>
Reviewed-by: Jan Kara <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Theodore Ts'o <[email protected]>
1 parent 214eb5a commit 4ba3fcd

File tree

5 files changed

+369
-0
lines changed

5 files changed

+369
-0
lines changed

fs/ext4/super.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1174,6 +1174,7 @@ static void ext4_put_super(struct super_block *sb)
11741174
ext4_unregister_sysfs(sb);
11751175

11761176
if (sbi->s_journal) {
1177+
jbd2_journal_unregister_shrinker(sbi->s_journal);
11771178
aborted = is_journal_aborted(sbi->s_journal);
11781179
err = jbd2_journal_destroy(sbi->s_journal);
11791180
sbi->s_journal = NULL;
@@ -5186,6 +5187,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
51865187
sbi->s_ea_block_cache = NULL;
51875188

51885189
if (sbi->s_journal) {
5190+
jbd2_journal_unregister_shrinker(sbi->s_journal);
51895191
jbd2_journal_destroy(sbi->s_journal);
51905192
sbi->s_journal = NULL;
51915193
}
@@ -5511,6 +5513,12 @@ static int ext4_load_journal(struct super_block *sb,
55115513
ext4_commit_super(sb);
55125514
}
55135515

5516+
err = jbd2_journal_register_shrinker(journal);
5517+
if (err) {
5518+
EXT4_SB(sb)->s_journal = NULL;
5519+
goto err_out;
5520+
}
5521+
55145522
return 0;
55155523

55165524
err_out:

fs/jbd2/checkpoint.c

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,18 @@ static inline void __buffer_relink_io(struct journal_head *jh)
7979
transaction->t_checkpoint_io_list = jh;
8080
}
8181

82+
/*
83+
* Check a checkpoint buffer could be release or not.
84+
*
85+
* Requires j_list_lock
86+
*/
87+
static inline bool __cp_buffer_busy(struct journal_head *jh)
88+
{
89+
struct buffer_head *bh = jh2bh(jh);
90+
91+
return (jh->b_transaction || buffer_locked(bh) || buffer_dirty(bh));
92+
}
93+
8294
/*
8395
* Try to release a checkpointed buffer from its transaction.
8496
* Returns 1 if we released it and 2 if we also released the
@@ -458,6 +470,137 @@ static int journal_clean_one_cp_list(struct journal_head *jh, bool destroy)
458470
return 0;
459471
}
460472

473+
/*
474+
* journal_shrink_one_cp_list
475+
*
476+
* Find 'nr_to_scan' written-back checkpoint buffers in the given list
477+
* and try to release them. If the whole transaction is released, set
478+
* the 'released' parameter. Return the number of released checkpointed
479+
* buffers.
480+
*
481+
* Called with j_list_lock held.
482+
*/
483+
static unsigned long journal_shrink_one_cp_list(struct journal_head *jh,
484+
unsigned long *nr_to_scan,
485+
bool *released)
486+
{
487+
struct journal_head *last_jh;
488+
struct journal_head *next_jh = jh;
489+
unsigned long nr_freed = 0;
490+
int ret;
491+
492+
if (!jh || *nr_to_scan == 0)
493+
return 0;
494+
495+
last_jh = jh->b_cpprev;
496+
do {
497+
jh = next_jh;
498+
next_jh = jh->b_cpnext;
499+
500+
(*nr_to_scan)--;
501+
if (__cp_buffer_busy(jh))
502+
continue;
503+
504+
nr_freed++;
505+
ret = __jbd2_journal_remove_checkpoint(jh);
506+
if (ret) {
507+
*released = true;
508+
break;
509+
}
510+
511+
if (need_resched())
512+
break;
513+
} while (jh != last_jh && *nr_to_scan);
514+
515+
return nr_freed;
516+
}
517+
518+
/*
 * jbd2_journal_shrink_checkpoint_list
 *
 * Scan up to '*nr_to_scan' checkpoint buffers across the journal's
 * checkpoint transactions and try to release them. '*nr_to_scan' is
 * decremented as buffers are scanned, so on return it holds the unused
 * part of the budget. Returns the number of checkpointed buffers released.
 *
 * This function takes and drops j_list_lock itself, so it must be called
 * WITHOUT j_list_lock held.
 */
526+
unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal,
527+
unsigned long *nr_to_scan)
528+
{
529+
transaction_t *transaction, *last_transaction, *next_transaction;
530+
bool released;
531+
/* The tids below are only collected for the tracepoint at 'out'. */
tid_t first_tid = 0, last_tid = 0, next_tid = 0;
532+
tid_t tid = 0;
533+
unsigned long nr_freed = 0;
534+
/* Initial budget; turned into "buffers actually scanned" at 'out'. */
unsigned long nr_scanned = *nr_to_scan;
535+
536+
/* Each pass re-takes the lock and re-reads the list head. */
again:
537+
spin_lock(&journal->j_list_lock);
538+
if (!journal->j_checkpoint_transactions) {
539+
spin_unlock(&journal->j_list_lock);
540+
goto out;
541+
}
542+
543+
/*
 * Pick the transaction to start from: resume at the position saved
 * by a previous scan in j_shrink_transaction, or start at the head
 * of the checkpoint list when nothing is saved. If someone else
 * checkpoints and drops a transaction from the checkpoint list, the
 * saved j_shrink_transaction is cleared, so we start over
 * unconditionally.
 */
549+
if (journal->j_shrink_transaction)
550+
transaction = journal->j_shrink_transaction;
551+
else
552+
transaction = journal->j_checkpoint_transactions;
553+
554+
/* Remember only the very first transaction visited, across all passes. */
if (!first_tid)
555+
first_tid = transaction->t_tid;
556+
/* The checkpoint list is circular: head->t_cpprev is its last entry. */
last_transaction = journal->j_checkpoint_transactions->t_cpprev;
557+
next_transaction = transaction;
558+
last_tid = last_transaction->t_tid;
559+
do {
560+
transaction = next_transaction;
561+
/* Read the successor and tid now, before the scan below can release 'transaction'. */
next_transaction = transaction->t_cpnext;
562+
tid = transaction->t_tid;
563+
released = false;
564+
565+
nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_list,
566+
nr_to_scan, &released);
567+
if (*nr_to_scan == 0)
568+
break;
569+
if (need_resched() || spin_needbreak(&journal->j_list_lock))
570+
break;
571+
/*
 * The helper reported that the whole transaction was released, so
 * its second list is not touched; move on to the loop condition.
 */
if (released)
572+
continue;
573+
574+
nr_freed += journal_shrink_one_cp_list(transaction->t_checkpoint_io_list,
575+
nr_to_scan, &released);
576+
if (*nr_to_scan == 0)
577+
break;
578+
if (need_resched() || spin_needbreak(&journal->j_list_lock))
579+
break;
580+
} while (transaction != last_transaction);
581+
582+
/*
 * If we stopped before reaching the last transaction, save where to
 * resume; otherwise a full pass is complete and the position is reset.
 */
if (transaction != last_transaction) {
583+
journal->j_shrink_transaction = next_transaction;
584+
next_tid = next_transaction->t_tid;
585+
} else {
586+
journal->j_shrink_transaction = NULL;
587+
next_tid = 0;
588+
}
589+
590+
spin_unlock(&journal->j_list_lock);
591+
cond_resched();
592+
593+
/* Budget left and a resume point recorded: run another pass. */
if (*nr_to_scan && next_tid)
594+
goto again;
595+
out:
596+
/* Initial budget minus what is left = number of buffers scanned. */
nr_scanned -= *nr_to_scan;
597+
trace_jbd2_shrink_checkpoint_list(journal, first_tid, tid, last_tid,
598+
nr_freed, nr_scanned, next_tid);
599+
600+
return nr_freed;
601+
}
603+
461604
/*
462605
* journal_clean_checkpoint_list
463606
*
@@ -580,6 +723,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
580723

581724
__buffer_unlink(jh);
582725
jh->b_cp_transaction = NULL;
726+
percpu_counter_dec(&journal->j_jh_shrink_count);
583727
jbd2_journal_put_journal_head(jh);
584728

585729
/* Is this transaction empty? */
@@ -642,6 +786,7 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
642786
jh->b_cpnext->b_cpprev = jh;
643787
}
644788
transaction->t_checkpoint_list = jh;
789+
percpu_counter_inc(&transaction->t_journal->j_jh_shrink_count);
645790
}
646791

647792
/*
@@ -657,6 +802,8 @@ void __jbd2_journal_insert_checkpoint(struct journal_head *jh,
657802
void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transaction)
658803
{
659804
assert_spin_locked(&journal->j_list_lock);
805+
806+
journal->j_shrink_transaction = NULL;
660807
if (transaction->t_cpnext) {
661808
transaction->t_cpnext->t_cpprev = transaction->t_cpprev;
662809
transaction->t_cpprev->t_cpnext = transaction->t_cpnext;

fs/jbd2/journal.c

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2050,6 +2050,91 @@ int jbd2_journal_load(journal_t *journal)
20502050
return -EIO;
20512051
}
20522052

2053+
/**
2054+
* jbd2_journal_shrink_scan()
2055+
*
2056+
* Scan the checkpointed buffer on the checkpoint list and release the
2057+
* journal_head.
2058+
*/
2059+
static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink,
2060+
struct shrink_control *sc)
2061+
{
2062+
journal_t *journal = container_of(shrink, journal_t, j_shrinker);
2063+
unsigned long nr_to_scan = sc->nr_to_scan;
2064+
unsigned long nr_shrunk;
2065+
unsigned long count;
2066+
2067+
count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
2068+
trace_jbd2_shrink_scan_enter(journal, sc->nr_to_scan, count);
2069+
2070+
nr_shrunk = jbd2_journal_shrink_checkpoint_list(journal, &nr_to_scan);
2071+
2072+
count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
2073+
trace_jbd2_shrink_scan_exit(journal, nr_to_scan, nr_shrunk, count);
2074+
2075+
return nr_shrunk;
2076+
}
2077+
2078+
/**
2079+
* jbd2_journal_shrink_count()
2080+
*
2081+
* Count the number of checkpoint buffers on the checkpoint list.
2082+
*/
2083+
static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink,
2084+
struct shrink_control *sc)
2085+
{
2086+
journal_t *journal = container_of(shrink, journal_t, j_shrinker);
2087+
unsigned long count;
2088+
2089+
count = percpu_counter_read_positive(&journal->j_jh_shrink_count);
2090+
trace_jbd2_shrink_count(journal, sc->nr_to_scan, count);
2091+
2092+
return count;
2093+
}
2094+
2095+
/**
2096+
* jbd2_journal_register_shrinker()
2097+
* @journal: Journal to act on.
2098+
*
2099+
* Init a percpu counter to record the checkpointed buffers on the checkpoint
2100+
* list and register a shrinker to release their journal_head.
2101+
*/
2102+
int jbd2_journal_register_shrinker(journal_t *journal)
2103+
{
2104+
int err;
2105+
2106+
journal->j_shrink_transaction = NULL;
2107+
2108+
err = percpu_counter_init(&journal->j_jh_shrink_count, 0, GFP_KERNEL);
2109+
if (err)
2110+
return err;
2111+
2112+
journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan;
2113+
journal->j_shrinker.count_objects = jbd2_journal_shrink_count;
2114+
journal->j_shrinker.seeks = DEFAULT_SEEKS;
2115+
journal->j_shrinker.batch = journal->j_max_transaction_buffers;
2116+
2117+
err = register_shrinker(&journal->j_shrinker);
2118+
if (err) {
2119+
percpu_counter_destroy(&journal->j_jh_shrink_count);
2120+
return err;
2121+
}
2122+
2123+
return 0;
2124+
}
2125+
2126+
/**
2127+
* jbd2_journal_unregister_shrinker()
2128+
* @journal: Journal to act on.
2129+
*
2130+
* Unregister the checkpointed buffer shrinker and destroy the percpu counter.
2131+
*/
2132+
void jbd2_journal_unregister_shrinker(journal_t *journal)
2133+
{
2134+
percpu_counter_destroy(&journal->j_jh_shrink_count);
2135+
unregister_shrinker(&journal->j_shrinker);
2136+
}
2137+
20532138
/**
20542139
* jbd2_journal_destroy() - Release a journal_t structure.
20552140
* @journal: Journal to act on.
@@ -2122,6 +2207,8 @@ int jbd2_journal_destroy(journal_t *journal)
21222207
brelse(journal->j_sb_buffer);
21232208
}
21242209

2210+
jbd2_journal_unregister_shrinker(journal);
2211+
21252212
if (journal->j_proc_entry)
21262213
jbd2_stats_proc_exit(journal);
21272214
iput(journal->j_inode);

include/linux/jbd2.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -909,6 +909,29 @@ struct journal_s
909909
*/
910910
struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH];
911911

912+
/**
913+
* @j_shrinker:
914+
*
915+
* Journal head shrinker, reclaim buffer's journal head which
916+
* has been written back.
917+
*/
918+
struct shrinker j_shrinker;
919+
920+
/**
921+
* @j_jh_shrink_count:
922+
*
923+
* Number of journal buffers on the checkpoint list. [j_list_lock]
924+
*/
925+
struct percpu_counter j_jh_shrink_count;
926+
927+
/**
928+
* @j_shrink_transaction:
929+
*
930+
* Record next transaction will shrink on the checkpoint list.
931+
* [j_list_lock]
932+
*/
933+
transaction_t *j_shrink_transaction;
934+
912935
/**
913936
* @j_head:
914937
*
@@ -1422,6 +1445,7 @@ extern void jbd2_journal_commit_transaction(journal_t *);
14221445

14231446
/* Checkpoint list management */
14241447
void __jbd2_journal_clean_checkpoint_list(journal_t *journal, bool destroy);
1448+
unsigned long jbd2_journal_shrink_checkpoint_list(journal_t *journal, unsigned long *nr_to_scan);
14251449
int __jbd2_journal_remove_checkpoint(struct journal_head *);
14261450
void jbd2_journal_destroy_checkpoint(journal_t *journal);
14271451
void __jbd2_journal_insert_checkpoint(struct journal_head *, transaction_t *);
@@ -1532,6 +1556,8 @@ extern int jbd2_journal_set_features
15321556
(journal_t *, unsigned long, unsigned long, unsigned long);
15331557
extern void jbd2_journal_clear_features
15341558
(journal_t *, unsigned long, unsigned long, unsigned long);
1559+
extern int jbd2_journal_register_shrinker(journal_t *journal);
1560+
extern void jbd2_journal_unregister_shrinker(journal_t *journal);
15351561
extern int jbd2_journal_load (journal_t *journal);
15361562
extern int jbd2_journal_destroy (journal_t *);
15371563
extern int jbd2_journal_recover (journal_t *journal);

0 commit comments

Comments
 (0)