Skip to content

Commit b0dff46

Browse files
Dave Chinner authored and djwong committed
xfs: separate read-only variables in struct xfs_mount
Seeing massive cpu usage from xfs_agino_range() on one machine; instruction level profiles look similar to another machine running the same workload, only one machine is consuming 10x as much CPU as the other and going much slower. The only real difference between the two machines is core count per socket. Both are running identical 16p/16GB virtual machine configurations.

Machine A:

  25.83%  [k] xfs_agino_range
  12.68%  [k] __xfs_dir3_data_check
   6.95%  [k] xfs_verify_ino
   6.78%  [k] xfs_dir2_data_entry_tag_p
   3.56%  [k] xfs_buf_find
   2.31%  [k] xfs_verify_dir_ino
   2.02%  [k] xfs_dabuf_map.constprop.0
   1.65%  [k] xfs_ag_block_count

And takes around 13 minutes to remove 50 million inodes.

Machine B:

  13.90%  [k] __pv_queued_spin_lock_slowpath
   3.76%  [k] do_raw_spin_lock
   2.83%  [k] xfs_dir3_leaf_check_int
   2.75%  [k] xfs_agino_range
   2.51%  [k] __raw_callee_save___pv_queued_spin_unlock
   2.18%  [k] __xfs_dir3_data_check
   2.02%  [k] xfs_log_commit_cil

And takes around 5m30s to remove 50 million inodes.

Suspect is cacheline contention on m_sectbb_log which is used in one of the macros in xfs_agino_range. This is a read-only variable but shares a cacheline with m_active_trans which is a global atomic that gets bounced all around the machine.

The workload is trying to run hundreds of thousands of transactions per second and hence cacheline contention will be occurring on this atomic counter. Hence xfs_agino_range() is likely just an innocent bystander as the cache coherency protocol fights over the cacheline between CPU cores and sockets.
On machine A, this rearrangement of the struct xfs_mount results in the profile changing to:

   9.77%  [kernel]  [k] xfs_agino_range
   6.27%  [kernel]  [k] __xfs_dir3_data_check
   5.31%  [kernel]  [k] __pv_queued_spin_lock_slowpath
   4.54%  [kernel]  [k] xfs_buf_find
   3.79%  [kernel]  [k] do_raw_spin_lock
   3.39%  [kernel]  [k] xfs_verify_ino
   2.73%  [kernel]  [k] __raw_callee_save___pv_queued_spin_unlock

Vastly less CPU usage in xfs_agino_range(), but still 3x the amount of machine B and still runs substantially slower than it should.

Current rm -rf of 50 million files:

              vanilla    patched
  machine A   13m20s     6m42s
  machine B   5m30s      5m02s

It's an improvement, hence indicating that separation and further optimisation of read-only global filesystem data is worthwhile, but it clearly isn't the underlying issue causing this specific performance degradation.

Signed-off-by: Dave Chinner <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Reviewed-by: Darrick J. Wong <[email protected]>
Signed-off-by: Darrick J. Wong <[email protected]>
1 parent f18c9a9 commit b0dff46

File tree

1 file changed

+82
-66
lines changed

1 file changed

+82
-66
lines changed

fs/xfs/xfs_mount.h

Lines changed: 82 additions & 66 deletions
Original file line numberDiff line numberDiff line change
@@ -55,71 +55,52 @@ struct xfs_error_cfg {
5555
long retry_timeout; /* in jiffies, -1 = infinite */
5656
};
5757

58+
/*
59+
* The struct xfs_mount layout is optimised to separate read-mostly variables
60+
* from variables that are frequently modified. We put the read-mostly variables
61+
* first, then place all the other variables at the end.
62+
*
63+
* Typically, read-mostly variables are those that are set at mount time and
64+
* never changed again, or only change rarely as a result of things like sysfs
65+
* knobs being tweaked.
66+
*/
5867
typedef struct xfs_mount {
68+
struct xfs_sb m_sb; /* copy of fs superblock */
5969
struct super_block *m_super;
60-
61-
/*
62-
* Bitsets of per-fs metadata that have been checked and/or are sick.
63-
* Callers must hold m_sb_lock to access these two fields.
64-
*/
65-
uint8_t m_fs_checked;
66-
uint8_t m_fs_sick;
67-
/*
68-
* Bitsets of rt metadata that have been checked and/or are sick.
69-
* Callers must hold m_sb_lock to access this field.
70-
*/
71-
uint8_t m_rt_checked;
72-
uint8_t m_rt_sick;
73-
7470
struct xfs_ail *m_ail; /* fs active log item list */
75-
76-
struct xfs_sb m_sb; /* copy of fs superblock */
77-
spinlock_t m_sb_lock; /* sb counter lock */
78-
struct percpu_counter m_icount; /* allocated inodes counter */
79-
struct percpu_counter m_ifree; /* free inodes counter */
80-
struct percpu_counter m_fdblocks; /* free block counter */
81-
/*
82-
* Count of data device blocks reserved for delayed allocations,
83-
* including indlen blocks. Does not include allocated CoW staging
84-
* extents or anything related to the rt device.
85-
*/
86-
struct percpu_counter m_delalloc_blks;
87-
8871
struct xfs_buf *m_sb_bp; /* buffer for superblock */
8972
char *m_rtname; /* realtime device name */
9073
char *m_logname; /* external log device name */
91-
int m_bsize; /* fs logical block size */
92-
xfs_agnumber_t m_agfrotor; /* last ag where space found */
93-
xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
94-
spinlock_t m_agirotor_lock;/* .. and lock protecting it */
95-
xfs_agnumber_t m_maxagi; /* highest inode alloc group */
96-
uint m_allocsize_log;/* min write size log bytes */
97-
uint m_allocsize_blocks; /* min write size blocks */
9874
struct xfs_da_geometry *m_dir_geo; /* directory block geometry */
9975
struct xfs_da_geometry *m_attr_geo; /* attribute block geometry */
10076
struct xlog *m_log; /* log specific stuff */
101-
struct xfs_ino_geometry m_ino_geo; /* inode geometry */
102-
int m_logbufs; /* number of log buffers */
103-
int m_logbsize; /* size of each log buffer */
104-
uint m_rsumlevels; /* rt summary levels */
105-
uint m_rsumsize; /* size of rt summary, bytes */
106-
/*
107-
* Optional cache of rt summary level per bitmap block with the
108-
* invariant that m_rsum_cache[bbno] <= the minimum i for which
109-
* rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
110-
* inode lock.
111-
*/
112-
uint8_t *m_rsum_cache;
11377
struct xfs_inode *m_rbmip; /* pointer to bitmap inode */
11478
struct xfs_inode *m_rsumip; /* pointer to summary inode */
11579
struct xfs_inode *m_rootip; /* pointer to root directory */
11680
struct xfs_quotainfo *m_quotainfo; /* disk quota information */
11781
xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
11882
xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
11983
xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
84+
/*
85+
* Optional cache of rt summary level per bitmap block with the
86+
* invariant that m_rsum_cache[bbno] <= the minimum i for which
87+
* rsum[i][bbno] != 0. Reads and writes are serialized by the rsumip
88+
* inode lock.
89+
*/
90+
uint8_t *m_rsum_cache;
91+
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
92+
struct workqueue_struct *m_buf_workqueue;
93+
struct workqueue_struct *m_unwritten_workqueue;
94+
struct workqueue_struct *m_cil_workqueue;
95+
struct workqueue_struct *m_reclaim_workqueue;
96+
struct workqueue_struct *m_eofblocks_workqueue;
97+
struct workqueue_struct *m_sync_workqueue;
98+
99+
int m_bsize; /* fs logical block size */
120100
uint8_t m_blkbit_log; /* blocklog + NBBY */
121101
uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
122102
uint8_t m_agno_log; /* log #ag's */
103+
uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
123104
uint m_blockmask; /* sb_blocksize-1 */
124105
uint m_blockwsize; /* sb_blocksize in words */
125106
uint m_blockwmask; /* blockwsize-1 */
@@ -138,47 +119,83 @@ typedef struct xfs_mount {
138119
xfs_extlen_t m_ag_prealloc_blocks; /* reserved ag blocks */
139120
uint m_alloc_set_aside; /* space we can't use */
140121
uint m_ag_max_usable; /* max space per AG */
141-
struct radix_tree_root m_perag_tree; /* per-ag accounting info */
142-
spinlock_t m_perag_lock; /* lock for m_perag_tree */
143-
struct mutex m_growlock; /* growfs mutex */
122+
int m_dalign; /* stripe unit */
123+
int m_swidth; /* stripe width */
124+
xfs_agnumber_t m_maxagi; /* highest inode alloc group */
125+
uint m_allocsize_log;/* min write size log bytes */
126+
uint m_allocsize_blocks; /* min write size blocks */
127+
int m_logbufs; /* number of log buffers */
128+
int m_logbsize; /* size of each log buffer */
129+
uint m_rsumlevels; /* rt summary levels */
130+
uint m_rsumsize; /* size of rt summary, bytes */
144131
int m_fixedfsid[2]; /* unchanged for life of FS */
145-
uint64_t m_flags; /* global mount flags */
146-
bool m_finobt_nores; /* no per-AG finobt resv. */
147132
uint m_qflags; /* quota status flags */
133+
uint64_t m_flags; /* global mount flags */
134+
int64_t m_low_space[XFS_LOWSP_MAX];
135+
struct xfs_ino_geometry m_ino_geo; /* inode geometry */
148136
struct xfs_trans_resv m_resv; /* precomputed res values */
137+
/* low free space thresholds */
138+
bool m_always_cow;
139+
bool m_fail_unmount;
140+
bool m_finobt_nores; /* no per-AG finobt resv. */
141+
bool m_update_sb; /* sb needs update in mount */
142+
143+
/*
144+
* Bitsets of per-fs metadata that have been checked and/or are sick.
145+
* Callers must hold m_sb_lock to access these two fields.
146+
*/
147+
uint8_t m_fs_checked;
148+
uint8_t m_fs_sick;
149+
/*
150+
* Bitsets of rt metadata that have been checked and/or are sick.
151+
* Callers must hold m_sb_lock to access this field.
152+
*/
153+
uint8_t m_rt_checked;
154+
uint8_t m_rt_sick;
155+
156+
/*
157+
* End of read-mostly variables. Frequently written variables and locks
158+
* should be placed below this comment from now on. The first variable
159+
* here is marked as cacheline aligned so that it is separated from
160+
* the read-mostly variables.
161+
*/
162+
163+
spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */
164+
struct percpu_counter m_icount; /* allocated inodes counter */
165+
struct percpu_counter m_ifree; /* free inodes counter */
166+
struct percpu_counter m_fdblocks; /* free block counter */
167+
/*
168+
* Count of data device blocks reserved for delayed allocations,
169+
* including indlen blocks. Does not include allocated CoW staging
170+
* extents or anything related to the rt device.
171+
*/
172+
struct percpu_counter m_delalloc_blks;
173+
174+
struct radix_tree_root m_perag_tree; /* per-ag accounting info */
175+
spinlock_t m_perag_lock; /* lock for m_perag_tree */
149176
uint64_t m_resblks; /* total reserved blocks */
150177
uint64_t m_resblks_avail;/* available reserved blocks */
151178
uint64_t m_resblks_save; /* reserved blks @ remount,ro */
152-
int m_dalign; /* stripe unit */
153-
int m_swidth; /* stripe width */
154-
uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
155179
atomic_t m_active_trans; /* number trans frozen */
156-
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
157180
struct delayed_work m_reclaim_work; /* background inode reclaim */
158181
struct delayed_work m_eofblocks_work; /* background eof blocks
159182
trimming */
160183
struct delayed_work m_cowblocks_work; /* background cow blocks
161184
trimming */
162-
bool m_update_sb; /* sb needs update in mount */
163-
int64_t m_low_space[XFS_LOWSP_MAX];
164-
/* low free space thresholds */
165185
struct xfs_kobj m_kobj;
166186
struct xfs_kobj m_error_kobj;
167187
struct xfs_kobj m_error_meta_kobj;
168188
struct xfs_error_cfg m_error_cfg[XFS_ERR_CLASS_MAX][XFS_ERR_ERRNO_MAX];
169189
struct xstats m_stats; /* per-fs stats */
190+
xfs_agnumber_t m_agfrotor; /* last ag where space found */
191+
xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
192+
spinlock_t m_agirotor_lock;/* .. and lock protecting it */
170193

171194
/*
172195
* Workqueue item so that we can coalesce multiple inode flush attempts
173196
* into a single flush.
174197
*/
175198
struct work_struct m_flush_inodes_work;
176-
struct workqueue_struct *m_buf_workqueue;
177-
struct workqueue_struct *m_unwritten_workqueue;
178-
struct workqueue_struct *m_cil_workqueue;
179-
struct workqueue_struct *m_reclaim_workqueue;
180-
struct workqueue_struct *m_eofblocks_workqueue;
181-
struct workqueue_struct *m_sync_workqueue;
182199

183200
/*
184201
* Generation of the filesystem layout. This is incremented by each
@@ -190,9 +207,8 @@ typedef struct xfs_mount {
190207
* to various other kinds of pain inflicted on the pNFS server.
191208
*/
192209
uint32_t m_generation;
210+
struct mutex m_growlock; /* growfs mutex */
193211

194-
bool m_always_cow;
195-
bool m_fail_unmount;
196212
#ifdef DEBUG
197213
/*
198214
* Frequency with which errors are injected. Replaces xfs_etest; the

0 commit comments

Comments
 (0)