25
25
* Copyright (c) 2012, 2018 by Delphix. All rights reserved.
26
26
* Copyright (c) 2015 by Chunwei Chen. All rights reserved.
27
27
* Copyright 2017 Nexenta Systems, Inc.
28
+ * Copyright (c) 2025, Klara, Inc.
28
29
*/
29
30
30
31
/* Portions Copyright 2007 Jeremy Teo */
@@ -3884,17 +3885,49 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
3884
3885
3885
3886
err = sa_bulk_update (zp -> z_sa_hdl , bulk , cnt , tx );
3886
3887
3887
- boolean_t commit = B_FALSE ;
3888
- if (wbc -> sync_mode != WB_SYNC_NONE ) {
3889
- /*
3890
- * Note that this is rarely called under writepages(), because
3891
- * writepages() normally handles the entire commit for
3892
- * performance reasons.
3893
- */
3894
- commit = B_TRUE ;
3895
- }
3888
+ /*
3889
+ * A note about for_sync vs wbc->sync_mode.
3890
+ *
3891
+ * for_sync indicates that this is a syncing writeback, that is, kernel
3892
+ * caller expects the data to be durably stored before being notified.
3893
+ * Often, but not always, the call was triggered by a userspace syncing
3894
+ * op (eg fsync(), msync(MS_SYNC)). For our purposes, for_sync==TRUE
3895
+ * means that that page should remain "locked" (in the writeback state)
3896
+ * until it is definitely on disk (ie zil_commit() or spa_sync()).
3897
+ * Otherwise, we can unlock and return as soon as it is on the
3898
+ * in-memory ZIL.
3899
+ *
3900
+ * wbc->sync_mode has similar meaning. wbc is passed from the kernel to
3901
+ * zpl_writepages()/zpl_writepage(); wbc->sync_mode==WB_SYNC_NONE
3902
+ * indicates this a regular async writeback (eg a cache eviction) and
3903
+ * so does not need a durability guarantee, while WB_SYNC_ALL indicates
3904
+ * a syncing op that must be waited on (by convention, we test for
3905
+ * !WB_SYNC_NONE rather than WB_SYNC_ALL, to prefer durability over
3906
+ * performance should there ever be a new mode that we have not yet
3907
+ * added support for).
3908
+ *
3909
+ * So, why a separate for_sync field? This is because zpl_writepages()
3910
+ * calls zfs_putpage() multiple times for a single "logical" operation.
3911
+ * It wants all the individual pages to be for_sync==TRUE ie only
3912
+ * unlocked once durably stored, but it only wants one call to
3913
+ * zil_commit() at the very end, once all the pages are synced. So,
3914
+ * it repurposes sync_mode slightly to indicate who issue and wait for
3915
+ * the IO: for NONE, the caller to zfs_putpage() will do it, while for
3916
+ * ALL, zfs_putpage should do it.
3917
+ *
3918
+ * Summary:
3919
+ * for_sync: 0=unlock immediately; 1 unlock once on disk
3920
+ * sync_mode: NONE=caller will commit; ALL=we will commit
3921
+ */
3922
+ boolean_t need_commit = (wbc -> sync_mode != WB_SYNC_NONE );
3896
3923
3897
- zfs_log_write (zfsvfs -> z_log , tx , TX_WRITE , zp , pgoff , pglen , commit ,
3924
+ /*
3925
+ * We use for_sync as the "commit" arg to zfs_log_write() (arg 7)
3926
+ * because it is a policy flag that indicates "someone will call
3927
+ * zil_commit() soon". for_sync=TRUE means exactly that; the only
3928
+ * question is whether it will be us, or zpl_writepages().
3929
+ */
3930
+ zfs_log_write (zfsvfs -> z_log , tx , TX_WRITE , zp , pgoff , pglen , for_sync ,
3898
3931
B_FALSE , for_sync ? zfs_putpage_commit_cb : NULL , pp );
3899
3932
3900
3933
if (!for_sync ) {
@@ -3906,7 +3939,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc,
3906
3939
3907
3940
zfs_rangelock_exit (lr );
3908
3941
3909
- if (commit )
3942
+ if (need_commit )
3910
3943
zil_commit (zfsvfs -> z_log , zp -> z_id );
3911
3944
3912
3945
dataset_kstats_update_write_kstats (& zfsvfs -> z_kstat , pglen );
0 commit comments