Skip to content

Commit a44a027

Browse files
committed
Merge tag 'large-extent-counters-v9' of https://github.com/chandanr/linux into xfs-5.19-for-next
xfs: Large extent counters The commit "xfs: fix inode fork extent count overflow" (3f8a4f1) mentions that 10 billion data fork extents should be possible to create. However, the corresponding on-disk field has a signed 32-bit type. Hence this patchset extends the per-inode data fork extent counter to 64 bits (out of which 48 bits are used to store the extent count). Also, XFS has an attribute fork extent counter which is 16 bits wide. A workload that, 1. Creates 1 million 255-byte sized xattrs, 2. Deletes 50% of these xattrs in an alternating manner, 3. Tries to insert 400,000 new 255-byte sized xattrs causes the xattr extent counter to overflow. Dave tells me that there are instances where a single file has more than 100 million hardlinks. With parent pointers being stored in xattrs, we will overflow the signed 16-bit wide attribute extent counter when a large number of hardlinks is created. Hence this patchset extends the on-disk field to 32 bits. The following changes are made to accomplish this: 1. A 64-bit inode field is carved out of existing di_pad and di_flushiter fields to hold the 64-bit data fork extent counter. 2. The existing 32-bit inode data fork extent counter will be used to hold the attribute fork extent counter. 3. A new incompat superblock flag is added to prevent older kernels from mounting the filesystem. Signed-off-by: Chandan Babu R <[email protected]> Signed-off-by: Dave Chinner <[email protected]>
2 parents 463260d + 973ac0e commit a44a027

37 files changed

+607
-278
lines changed

fs/xfs/libxfs/xfs_alloc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2511,7 +2511,7 @@ __xfs_free_extent_later(
25112511

25122512
ASSERT(bno != NULLFSBLOCK);
25132513
ASSERT(len > 0);
2514-
ASSERT(len <= MAXEXTLEN);
2514+
ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
25152515
ASSERT(!isnullstartblock(bno));
25162516
agno = XFS_FSB_TO_AGNO(mp, bno);
25172517
agbno = XFS_FSB_TO_AGBNO(mp, bno);

fs/xfs/libxfs/xfs_attr.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -776,6 +776,9 @@ xfs_attr_set(
776776
if (args->value || xfs_inode_hasattr(dp)) {
777777
error = xfs_iext_count_may_overflow(dp, XFS_ATTR_FORK,
778778
XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
779+
if (error == -EFBIG)
780+
error = xfs_iext_count_upgrade(args->trans, dp,
781+
XFS_IEXT_ATTR_MANIP_CNT(rmt_blks));
779782
if (error)
780783
goto out_trans_cancel;
781784
}

fs/xfs/libxfs/xfs_bmap.c

Lines changed: 45 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -52,19 +52,17 @@ xfs_bmap_compute_maxlevels(
5252
xfs_mount_t *mp, /* file system mount structure */
5353
int whichfork) /* data or attr fork */
5454
{
55+
uint64_t maxblocks; /* max blocks at this level */
56+
xfs_extnum_t maxleafents; /* max leaf entries possible */
5557
int level; /* btree level */
56-
uint maxblocks; /* max blocks at this level */
57-
uint maxleafents; /* max leaf entries possible */
5858
int maxrootrecs; /* max records in root block */
5959
int minleafrecs; /* min records in leaf block */
6060
int minnoderecs; /* min records in node block */
6161
int sz; /* root block size */
6262

6363
/*
64-
* The maximum number of extents in a file, hence the maximum number of
65-
* leaf entries, is controlled by the size of the on-disk extent count,
66-
* either a signed 32-bit number for the data fork, or a signed 16-bit
67-
* number for the attr fork.
64+
* The maximum number of extents in a fork, hence the maximum number of
65+
* leaf entries, is controlled by the size of the on-disk extent count.
6866
*
6967
* Note that we can no longer assume that if we are in ATTR1 that the
7068
* fork offset of all the inodes will be
@@ -74,22 +72,22 @@ xfs_bmap_compute_maxlevels(
7472
* ATTR2 we have to assume the worst case scenario of a minimum size
7573
* available.
7674
*/
77-
if (whichfork == XFS_DATA_FORK) {
78-
maxleafents = MAXEXTNUM;
75+
maxleafents = xfs_iext_max_nextents(xfs_has_large_extent_counts(mp),
76+
whichfork);
77+
if (whichfork == XFS_DATA_FORK)
7978
sz = XFS_BMDR_SPACE_CALC(MINDBTPTRS);
80-
} else {
81-
maxleafents = MAXAEXTNUM;
79+
else
8280
sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
83-
}
81+
8482
maxrootrecs = xfs_bmdr_maxrecs(sz, 0);
8583
minleafrecs = mp->m_bmap_dmnr[0];
8684
minnoderecs = mp->m_bmap_dmnr[1];
87-
maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
85+
maxblocks = howmany_64(maxleafents, minleafrecs);
8886
for (level = 1; maxblocks > 1; level++) {
8987
if (maxblocks <= maxrootrecs)
9088
maxblocks = 1;
9189
else
92-
maxblocks = (maxblocks + minnoderecs - 1) / minnoderecs;
90+
maxblocks = howmany_64(maxblocks, minnoderecs);
9391
}
9492
mp->m_bm_maxlevels[whichfork] = level;
9593
ASSERT(mp->m_bm_maxlevels[whichfork] <= xfs_bmbt_maxlevels_ondisk());
@@ -468,7 +466,7 @@ xfs_bmap_check_leaf_extents(
468466
if (bp_release)
469467
xfs_trans_brelse(NULL, bp);
470468
error_norelse:
471-
xfs_warn(mp, "%s: BAD after btree leaves for %d extents",
469+
xfs_warn(mp, "%s: BAD after btree leaves for %llu extents",
472470
__func__, i);
473471
xfs_err(mp, "%s: CORRUPTED BTREE OR SOMETHING", __func__);
474472
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
@@ -1452,7 +1450,7 @@ xfs_bmap_add_extent_delay_real(
14521450
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
14531451
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
14541452
LEFT.br_state == new->br_state &&
1455-
LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
1453+
LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
14561454
state |= BMAP_LEFT_CONTIG;
14571455

14581456
/*
@@ -1470,13 +1468,13 @@ xfs_bmap_add_extent_delay_real(
14701468
new_endoff == RIGHT.br_startoff &&
14711469
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
14721470
new->br_state == RIGHT.br_state &&
1473-
new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
1471+
new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
14741472
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
14751473
BMAP_RIGHT_FILLING)) !=
14761474
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
14771475
BMAP_RIGHT_FILLING) ||
14781476
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
1479-
<= MAXEXTLEN))
1477+
<= XFS_MAX_BMBT_EXTLEN))
14801478
state |= BMAP_RIGHT_CONTIG;
14811479

14821480
error = 0;
@@ -2000,7 +1998,7 @@ xfs_bmap_add_extent_unwritten_real(
20001998
LEFT.br_startoff + LEFT.br_blockcount == new->br_startoff &&
20011999
LEFT.br_startblock + LEFT.br_blockcount == new->br_startblock &&
20022000
LEFT.br_state == new->br_state &&
2003-
LEFT.br_blockcount + new->br_blockcount <= MAXEXTLEN)
2001+
LEFT.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
20042002
state |= BMAP_LEFT_CONTIG;
20052003

20062004
/*
@@ -2018,13 +2016,13 @@ xfs_bmap_add_extent_unwritten_real(
20182016
new_endoff == RIGHT.br_startoff &&
20192017
new->br_startblock + new->br_blockcount == RIGHT.br_startblock &&
20202018
new->br_state == RIGHT.br_state &&
2021-
new->br_blockcount + RIGHT.br_blockcount <= MAXEXTLEN &&
2019+
new->br_blockcount + RIGHT.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
20222020
((state & (BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
20232021
BMAP_RIGHT_FILLING)) !=
20242022
(BMAP_LEFT_CONTIG | BMAP_LEFT_FILLING |
20252023
BMAP_RIGHT_FILLING) ||
20262024
LEFT.br_blockcount + new->br_blockcount + RIGHT.br_blockcount
2027-
<= MAXEXTLEN))
2025+
<= XFS_MAX_BMBT_EXTLEN))
20282026
state |= BMAP_RIGHT_CONTIG;
20292027

20302028
/*
@@ -2510,15 +2508,15 @@ xfs_bmap_add_extent_hole_delay(
25102508
*/
25112509
if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) &&
25122510
left.br_startoff + left.br_blockcount == new->br_startoff &&
2513-
left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
2511+
left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
25142512
state |= BMAP_LEFT_CONTIG;
25152513

25162514
if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) &&
25172515
new->br_startoff + new->br_blockcount == right.br_startoff &&
2518-
new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
2516+
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
25192517
(!(state & BMAP_LEFT_CONTIG) ||
25202518
(left.br_blockcount + new->br_blockcount +
2521-
right.br_blockcount <= MAXEXTLEN)))
2519+
right.br_blockcount <= XFS_MAX_BMBT_EXTLEN)))
25222520
state |= BMAP_RIGHT_CONTIG;
25232521

25242522
/*
@@ -2661,17 +2659,17 @@ xfs_bmap_add_extent_hole_real(
26612659
left.br_startoff + left.br_blockcount == new->br_startoff &&
26622660
left.br_startblock + left.br_blockcount == new->br_startblock &&
26632661
left.br_state == new->br_state &&
2664-
left.br_blockcount + new->br_blockcount <= MAXEXTLEN)
2662+
left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN)
26652663
state |= BMAP_LEFT_CONTIG;
26662664

26672665
if ((state & BMAP_RIGHT_VALID) && !(state & BMAP_RIGHT_DELAY) &&
26682666
new->br_startoff + new->br_blockcount == right.br_startoff &&
26692667
new->br_startblock + new->br_blockcount == right.br_startblock &&
26702668
new->br_state == right.br_state &&
2671-
new->br_blockcount + right.br_blockcount <= MAXEXTLEN &&
2669+
new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN &&
26722670
(!(state & BMAP_LEFT_CONTIG) ||
26732671
left.br_blockcount + new->br_blockcount +
2674-
right.br_blockcount <= MAXEXTLEN))
2672+
right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))
26752673
state |= BMAP_RIGHT_CONTIG;
26762674

26772675
error = 0;
@@ -2906,15 +2904,15 @@ xfs_bmap_extsize_align(
29062904

29072905
/*
29082906
* For large extent hint sizes, the aligned extent might be larger than
2909-
* MAXEXTLEN. In that case, reduce the size by an extsz so that it pulls
2910-
* the length back under MAXEXTLEN. The outer allocation loops handle
2911-
* short allocation just fine, so it is safe to do this. We only want to
2912-
* do it when we are forced to, though, because it means more allocation
2913-
* operations are required.
2907+
* XFS_MAX_BMBT_EXTLEN. In that case, reduce the size by an extsz so
2908+
* that it pulls the length back under XFS_MAX_BMBT_EXTLEN. The outer
2909+
* allocation loops handle short allocation just fine, so it is safe to
2910+
* do this. We only want to do it when we are forced to, though, because
2911+
* it means more allocation operations are required.
29142912
*/
2915-
while (align_alen > MAXEXTLEN)
2913+
while (align_alen > XFS_MAX_BMBT_EXTLEN)
29162914
align_alen -= extsz;
2917-
ASSERT(align_alen <= MAXEXTLEN);
2915+
ASSERT(align_alen <= XFS_MAX_BMBT_EXTLEN);
29182916

29192917
/*
29202918
* If the previous block overlaps with this proposed allocation
@@ -3004,9 +3002,9 @@ xfs_bmap_extsize_align(
30043002
return -EINVAL;
30053003
} else {
30063004
ASSERT(orig_off >= align_off);
3007-
/* see MAXEXTLEN handling above */
3005+
/* see XFS_MAX_BMBT_EXTLEN handling above */
30083006
ASSERT(orig_end <= align_off + align_alen ||
3009-
align_alen + extsz > MAXEXTLEN);
3007+
align_alen + extsz > XFS_MAX_BMBT_EXTLEN);
30103008
}
30113009

30123010
#ifdef DEBUG
@@ -3971,7 +3969,7 @@ xfs_bmapi_reserve_delalloc(
39713969
* Cap the alloc length. Keep track of prealloc so we know whether to
39723970
* tag the inode before we return.
39733971
*/
3974-
alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN);
3972+
alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
39753973
if (!eof)
39763974
alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
39773975
if (prealloc && alen >= len)
@@ -4104,7 +4102,7 @@ xfs_bmapi_allocate(
41044102
if (!xfs_iext_peek_prev_extent(ifp, &bma->icur, &bma->prev))
41054103
bma->prev.br_startoff = NULLFILEOFF;
41064104
} else {
4107-
bma->length = XFS_FILBLKS_MIN(bma->length, MAXEXTLEN);
4105+
bma->length = XFS_FILBLKS_MIN(bma->length, XFS_MAX_BMBT_EXTLEN);
41084106
if (!bma->eof)
41094107
bma->length = XFS_FILBLKS_MIN(bma->length,
41104108
bma->got.br_startoff - bma->offset);
@@ -4424,8 +4422,8 @@ xfs_bmapi_write(
44244422
* xfs_extlen_t and therefore 32 bits. Hence we have to
44254423
* check for 32-bit overflows and handle them here.
44264424
*/
4427-
if (len > (xfs_filblks_t)MAXEXTLEN)
4428-
bma.length = MAXEXTLEN;
4425+
if (len > (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN)
4426+
bma.length = XFS_MAX_BMBT_EXTLEN;
44294427
else
44304428
bma.length = len;
44314429

@@ -4526,14 +4524,16 @@ xfs_bmapi_convert_delalloc(
45264524
return error;
45274525

45284526
xfs_ilock(ip, XFS_ILOCK_EXCL);
4527+
xfs_trans_ijoin(tp, ip, 0);
45294528

45304529
error = xfs_iext_count_may_overflow(ip, whichfork,
45314530
XFS_IEXT_ADD_NOSPLIT_CNT);
4531+
if (error == -EFBIG)
4532+
error = xfs_iext_count_upgrade(tp, ip,
4533+
XFS_IEXT_ADD_NOSPLIT_CNT);
45324534
if (error)
45334535
goto out_trans_cancel;
45344536

4535-
xfs_trans_ijoin(tp, ip, 0);
4536-
45374537
if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &bma.icur, &bma.got) ||
45384538
bma.got.br_startoff > offset_fsb) {
45394539
/*
@@ -4560,7 +4560,8 @@ xfs_bmapi_convert_delalloc(
45604560
bma.ip = ip;
45614561
bma.wasdel = true;
45624562
bma.offset = bma.got.br_startoff;
4563-
bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount, MAXEXTLEN);
4563+
bma.length = max_t(xfs_filblks_t, bma.got.br_blockcount,
4564+
XFS_MAX_BMBT_EXTLEN);
45644565
bma.minleft = xfs_bmapi_minleft(tp, ip, whichfork);
45654566

45664567
/*
@@ -4641,7 +4642,7 @@ xfs_bmapi_remap(
46414642

46424643
ifp = XFS_IFORK_PTR(ip, whichfork);
46434644
ASSERT(len > 0);
4644-
ASSERT(len <= (xfs_filblks_t)MAXEXTLEN);
4645+
ASSERT(len <= (xfs_filblks_t)XFS_MAX_BMBT_EXTLEN);
46454646
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
46464647
ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK | XFS_BMAPI_PREALLOC |
46474648
XFS_BMAPI_NORMAP)));
@@ -5148,26 +5149,6 @@ xfs_bmap_del_extent_real(
51485149
* Deleting the middle of the extent.
51495150
*/
51505151

5151-
/*
5152-
* For directories, -ENOSPC is returned since a directory entry
5153-
* remove operation must not fail due to low extent count
5154-
* availability. -ENOSPC will be handled by higher layers of XFS
5155-
* by letting the corresponding empty Data/Free blocks to linger
5156-
* until a future remove operation. Dabtree blocks would be
5157-
* swapped with the last block in the leaf space and then the
5158-
* new last block will be unmapped.
5159-
*
5160-
* The above logic also applies to the source directory entry of
5161-
* a rename operation.
5162-
*/
5163-
error = xfs_iext_count_may_overflow(ip, whichfork, 1);
5164-
if (error) {
5165-
ASSERT(S_ISDIR(VFS_I(ip)->i_mode) &&
5166-
whichfork == XFS_DATA_FORK);
5167-
error = -ENOSPC;
5168-
goto done;
5169-
}
5170-
51715152
old = got;
51725153

51735154
got.br_blockcount = del->br_startoff - got.br_startoff;
@@ -5641,7 +5622,7 @@ xfs_bmse_can_merge(
56415622
if ((left->br_startoff + left->br_blockcount != startoff) ||
56425623
(left->br_startblock + left->br_blockcount != got->br_startblock) ||
56435624
(left->br_state != got->br_state) ||
5644-
(left->br_blockcount + got->br_blockcount > MAXEXTLEN))
5625+
(left->br_blockcount + got->br_blockcount > XFS_MAX_BMBT_EXTLEN))
56455626
return false;
56465627

56475628
return true;

fs/xfs/libxfs/xfs_bmap_btree.c

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -597,7 +597,11 @@ xfs_bmbt_maxrecs(
597597
return xfs_bmbt_block_maxrecs(blocklen, leaf);
598598
}
599599

600-
/* Compute the max possible height for block mapping btrees. */
600+
/*
601+
* Calculate the maximum possible height of the btree that the on-disk format
602+
* supports. This is used for sizing structures large enough to support every
603+
* possible configuration of a filesystem that might get mounted.
604+
*/
601605
unsigned int
602606
xfs_bmbt_maxlevels_ondisk(void)
603607
{
@@ -611,7 +615,8 @@ xfs_bmbt_maxlevels_ondisk(void)
611615
minrecs[1] = xfs_bmbt_block_maxrecs(blocklen, false) / 2;
612616

613617
/* One extra level for the inode root. */
614-
return xfs_btree_compute_maxlevels(minrecs, MAXEXTNUM) + 1;
618+
return xfs_btree_compute_maxlevels(minrecs,
619+
XFS_MAX_EXTCNT_DATA_FORK_LARGE) + 1;
615620
}
616621

617622
/*

fs/xfs/libxfs/xfs_da_btree.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ struct xfs_da_geometry {
3030
unsigned int free_hdr_size; /* dir2 free header size */
3131
unsigned int free_max_bests; /* # of bests entries in dir2 free */
3232
xfs_dablk_t freeblk; /* blockno of free data v2 */
33+
xfs_extnum_t max_extents; /* Max. extents in corresponding fork */
3334

3435
xfs_dir2_data_aoff_t data_first_offset;
3536
size_t data_entry_offset;

fs/xfs/libxfs/xfs_da_format.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,7 @@ xfs_dir2_sf_firstentry(struct xfs_dir2_sf_hdr *hdr)
277277
* Directory address space divided into sections,
278278
* spaces separated by 32GB.
279279
*/
280+
#define XFS_DIR2_MAX_SPACES 3
280281
#define XFS_DIR2_SPACE_SIZE (1ULL << (32 + XFS_DIR2_DATA_ALIGN_LOG))
281282
#define XFS_DIR2_DATA_SPACE 0
282283
#define XFS_DIR2_DATA_OFFSET (XFS_DIR2_DATA_SPACE * XFS_DIR2_SPACE_SIZE)

fs/xfs/libxfs/xfs_dir2.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,8 @@ xfs_da_mount(
150150
dageo->freeblk = xfs_dir2_byte_to_da(dageo, XFS_DIR2_FREE_OFFSET);
151151
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
152152
(uint)sizeof(xfs_da_node_entry_t);
153+
dageo->max_extents = (XFS_DIR2_MAX_SPACES * XFS_DIR2_SPACE_SIZE) >>
154+
mp->m_sb.sb_blocklog;
153155
dageo->magicpct = (dageo->blksize * 37) / 100;
154156

155157
/* set up attribute geometry - single fsb only */
@@ -161,6 +163,12 @@ xfs_da_mount(
161163
dageo->node_hdr_size = mp->m_dir_geo->node_hdr_size;
162164
dageo->node_ents = (dageo->blksize - dageo->node_hdr_size) /
163165
(uint)sizeof(xfs_da_node_entry_t);
166+
167+
if (xfs_has_large_extent_counts(mp))
168+
dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_LARGE;
169+
else
170+
dageo->max_extents = XFS_MAX_EXTCNT_ATTR_FORK_SMALL;
171+
164172
dageo->magicpct = (dageo->blksize * 37) / 100;
165173
return 0;
166174
}

0 commit comments

Comments
 (0)