Skip to content

Commit 3ba3ab1

Browse files
Darrick J. Wong authored and Chandan Babu R committed
xfs: enable FITRIM on the realtime device
Implement FITRIM for the realtime device by pretending that it's "space" immediately after the data device. We have to hold the rtbitmap ILOCK while the discard operations are ongoing because there's no busy extent tracking for the rt volume to prevent reallocations.

Cc: Konst Mayer <[email protected]>
Signed-off-by: Darrick J. Wong <[email protected]>
Reviewed-by: Christoph Hellwig <[email protected]>
Signed-off-by: Chandan Babu R <[email protected]>
1 parent a330cae commit 3ba3ab1

File tree

2 files changed

+308
-24
lines changed

2 files changed

+308
-24
lines changed

fs/xfs/xfs_discard.c

Lines changed: 279 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "xfs_log.h"
2121
#include "xfs_ag.h"
2222
#include "xfs_health.h"
23+
#include "xfs_rtbitmap.h"
2324

2425
/*
2526
* Notes on an efficient, low latency fstrim algorithm
@@ -322,7 +323,7 @@ xfs_trim_should_stop(void)
322323
* we found in the last batch as the key to start the next.
323324
*/
324325
static int
325-
xfs_trim_extents(
326+
xfs_trim_perag_extents(
326327
struct xfs_perag *pag,
327328
xfs_agblock_t start,
328329
xfs_agblock_t end,
@@ -383,6 +384,259 @@ xfs_trim_extents(
383384

384385
}
385386

/*
 * Trim free space on the data device for the basic-block (daddr) range
 * [start, end].
 *
 * end is clamped to the last basic block of the data device, both ends are
 * converted to AG number / AG block pairs, and xfs_trim_perag_extents() is
 * called for every AG in the resulting range, with minlen and blocks_trimmed
 * passed straight through.
 *
 * A failure in one AG does not end the walk; the error from the most recent
 * failing AG is returned, or 0 if none failed.  The walk ends early once
 * xfs_trim_should_stop() returns true.
 */
387+
static int
388+
xfs_trim_datadev_extents(
389+
struct xfs_mount *mp,
390+
xfs_daddr_t start,
391+
xfs_daddr_t end,
392+
xfs_extlen_t minlen,
393+
uint64_t *blocks_trimmed)
394+
{
395+
xfs_agnumber_t start_agno, end_agno;
396+
xfs_agblock_t start_agbno, end_agbno;
397+
xfs_daddr_t ddev_end;
398+
struct xfs_perag *pag;
399+
int last_error = 0, error;
400+
/*
 * The requested range may extend past the data device (xfs_ioc_trim()
 * validates it against sb_dblocks + sb_rblocks), so clamp the end to the
 * last basic block of the data device.
 */
401+
ddev_end = min_t(xfs_daddr_t, end,
402+
XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);
403+
/* Split both ends of the range into (AG number, block within AG). */
404+
start_agno = xfs_daddr_to_agno(mp, start);
405+
start_agbno = xfs_daddr_to_agbno(mp, start);
406+
end_agno = xfs_daddr_to_agno(mp, ddev_end);
407+
end_agbno = xfs_daddr_to_agbno(mp, ddev_end);
408+
409+
for_each_perag_range(mp, start_agno, end_agno, pag) {
/* By default trim up to the size of this AG. */
410+
xfs_agblock_t agend = pag->block_count;
411+
/*
 * In the AG that holds the end of the range, stop at the requested
 * end block instead.
 * NOTE(review): this only identifies the final AG if
 * for_each_perag_range() advances start_agno as it iterates -
 * confirm against the macro definition.
 */
412+
if (start_agno == end_agno)
413+
agend = end_agbno;
414+
error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen,
415+
blocks_trimmed);
/* Remember the failure but keep going with the remaining AGs. */
416+
if (error)
417+
last_error = error;
418+
/* Leaving the iteration early: release pag ourselves before breaking. */
419+
if (xfs_trim_should_stop()) {
420+
xfs_perag_rele(pag);
421+
break;
422+
}
/* Only the first AG starts mid-way; later AGs are trimmed from block 0. */
423+
start_agbno = 0;
424+
}
425+
426+
return last_error;
427+
}
428+
429+
#ifdef CONFIG_XFS_RT
/*
 * State for one FITRIM pass over the realtime device.  It is set up by
 * xfs_trim_rtdev_extents() and handed to xfs_trim_gather_rtextent() as the
 * private pointer of the rtbitmap range query.
 */
430+
struct xfs_trim_rtdev {
431+
/* list of struct xfs_rtx_busy entries gathered for discard */
432+
struct list_head extent_list;
433+
434+
/* pointer to count of blocks trimmed (caller-owned, added to as extents are gathered) */
435+
uint64_t *blocks_trimmed;
436+
437+
/* minimum length that caller allows us to trim; shorter extents are skipped */
438+
xfs_rtblock_t minlen_fsb;
439+
440+
/* restart point for the rtbitmap walk: rt extent number where the next batch begins */
441+
xfs_rtxnum_t restart_rtx;
442+
443+
/* stopping point for the current rtbitmap walk: a record starting beyond this cancels the walk */
444+
xfs_rtxnum_t stop_rtx;
445+
};
446+
/*
 * One realtime extent queued for discard.  Entries are allocated by
 * xfs_trim_gather_rtextent(), linked on xfs_trim_rtdev.extent_list and freed
 * by xfs_discard_free_rtdev_extents().
 */
447+
struct xfs_rtx_busy {
/* linkage on xfs_trim_rtdev.extent_list */
448+
struct list_head list;
/* start of the extent; filled from xfs_rtx_to_rtb() and later passed through XFS_FSB_TO_BB() */
449+
xfs_rtblock_t bno;
/* length of the extent, in the same units as bno */
450+
xfs_rtblock_t length;
451+
};
452+
/*
 * Unlink and kfree() every struct xfs_rtx_busy on tr->extent_list, leaving
 * the list empty.  Nothing else in tr is touched.
 */
453+
static void
454+
xfs_discard_free_rtdev_extents(
455+
struct xfs_trim_rtdev *tr)
456+
{
457+
struct xfs_rtx_busy *busyp, *n;
458+
/* The _safe iterator is needed because each entry is deleted as we go. */
459+
list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
460+
list_del_init(&busyp->list);
461+
kfree(busyp);
462+
}
463+
}
464+
465+
/*
466+
* Walk the discard list and issue discards on all the rt extents in the
467+
* list. We plug and chain the bios so that a single submit_bio_wait() call
468+
* waits for every discard that was queued.
*
* The list entries are freed as soon as queueing has finished, whether or
* not it succeeded, so tr->extent_list is always empty on return.
*
* Returns 0 on success, including when the wait reports -EOPNOTSUPP, or a
* negative errno otherwise.
469+
*/
470+
static int
471+
xfs_discard_rtdev_extents(
472+
struct xfs_mount *mp,
473+
struct xfs_trim_rtdev *tr)
474+
{
475+
struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
476+
struct xfs_rtx_busy *busyp;
477+
struct bio *bio = NULL;
478+
struct blk_plug plug;
/*
 * start/length are only used in the failure message below: start is the
 * first extent's block number and length is the sum of all lengths queued
 * so far, so together they need not describe one contiguous range.
 */
479+
xfs_rtblock_t start = NULLRTBLOCK, length = 0;
480+
int error = 0;
481+
482+
blk_start_plug(&plug);
483+
list_for_each_entry(busyp, &tr->extent_list, list) {
484+
if (start == NULLRTBLOCK)
485+
start = busyp->bno;
486+
length += busyp->length;
487+
488+
trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);
489+
/* Queue the discard; each call chains onto the bio passed in via &bio. */
490+
error = __blkdev_issue_discard(bdev,
491+
XFS_FSB_TO_BB(mp, busyp->bno),
492+
XFS_FSB_TO_BB(mp, busyp->length),
493+
GFP_NOFS, &bio);
494+
if (error)
495+
break;
496+
}
/* The gathered extents are no longer needed once queueing is done. */
497+
xfs_discard_free_rtdev_extents(tr);
498+
/*
 * NOTE(review): if the loop above broke out on an error after at least one
 * bio was built, that error is overwritten by the submit_bio_wait() result
 * below.
 */
499+
if (bio) {
500+
error = submit_bio_wait(bio);
/* -EOPNOTSUPP from the wait is not reported as a failure. */
501+
if (error == -EOPNOTSUPP)
502+
error = 0;
503+
if (error)
504+
xfs_info(mp,
505+
"discard failed for rtextent [0x%llx,%llu], error %d",
506+
(unsigned long long)start,
507+
(unsigned long long)length,
508+
error);
509+
bio_put(bio);
510+
}
511+
blk_finish_plug(&plug);
512+
513+
return error;
514+
}
515+
/*
 * Per-record callback passed to xfs_rtalloc_query_range() by
 * xfs_trim_rtdev_extents(); priv is the struct xfs_trim_rtdev for the pass.
 *
 * Each record that is at least tr->minlen_fsb long is appended to
 * tr->extent_list for a later discard, and its length is added to
 * *tr->blocks_trimmed.  Note that the count is updated here, before any
 * discard has been issued.
 *
 * Returns 0 to continue the walk, -ECANCELED once a record starts beyond
 * tr->stop_rtx (after saving that record as the restart point), or -ENOMEM
 * if the list entry cannot be allocated.  tp is not used.
 */
516+
static int
517+
xfs_trim_gather_rtextent(
518+
struct xfs_mount *mp,
519+
struct xfs_trans *tp,
520+
const struct xfs_rtalloc_rec *rec,
521+
void *priv)
522+
{
523+
struct xfs_trim_rtdev *tr = priv;
524+
struct xfs_rtx_busy *busyp;
525+
xfs_rtblock_t rbno, rlen;
526+
527+
if (rec->ar_startext > tr->stop_rtx) {
528+
/*
529+
* If we've scanned a large number of rtbitmap blocks, update
530+
* the cursor to point at this extent so we restart the next
531+
* batch from this extent.
532+
*/
533+
tr->restart_rtx = rec->ar_startext;
534+
return -ECANCELED;
535+
}
536+
/* Convert the record from rt extent units to rt block units. */
537+
rbno = xfs_rtx_to_rtb(mp, rec->ar_startext);
538+
rlen = xfs_rtx_to_rtb(mp, rec->ar_extcount);
539+
540+
/* Ignore too small. */
541+
if (rlen < tr->minlen_fsb) {
542+
trace_xfs_discard_rttoosmall(mp, rbno, rlen);
543+
return 0;
544+
}
545+
546+
busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
547+
if (!busyp)
548+
return -ENOMEM;
549+
550+
busyp->bno = rbno;
551+
busyp->length = rlen;
552+
INIT_LIST_HEAD(&busyp->list);
553+
list_add_tail(&busyp->list, &tr->extent_list);
554+
*tr->blocks_trimmed += rlen;
555+
/* If the walk ends without being cancelled, the next batch starts just past this extent. */
556+
tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
557+
return 0;
558+
}
559+
/*
 * Trim free space on the realtime device.
 *
 * start and end are basic-block addresses in the FITRIM address space, in
 * which the realtime device sits immediately after the data device.  They are
 * shifted down by the size of the data device, clamped to the realtime
 * device, and converted to rt extent numbers.  minlen is converted with
 * XFS_BB_TO_FSB() and stored as the minimum extent length to trim.
 *
 * The free extents are then walked in batches.  Each batch gathers extents
 * via xfs_trim_gather_rtextent() and discards them with
 * xfs_discard_rtdev_extents(), all under the shared rtbitmap lock.
 *
 * Returns 0 if the range ends at or before the start of the realtime device
 * or if the walk finishes without error; otherwise returns the first error
 * from transaction allocation, the range query or the discard.
 */
560+
static int
561+
xfs_trim_rtdev_extents(
562+
struct xfs_mount *mp,
563+
xfs_daddr_t start,
564+
xfs_daddr_t end,
565+
xfs_daddr_t minlen,
566+
uint64_t *blocks_trimmed)
567+
{
568+
struct xfs_rtalloc_rec low = { };
569+
struct xfs_rtalloc_rec high = { };
570+
struct xfs_trim_rtdev tr = {
571+
.blocks_trimmed = blocks_trimmed,
572+
.minlen_fsb = XFS_BB_TO_FSB(mp, minlen),
573+
};
574+
struct xfs_trans *tp;
575+
xfs_daddr_t rtdev_daddr;
576+
int error;
577+
578+
INIT_LIST_HEAD(&tr.extent_list);
579+
580+
/* Shift the start and end downwards to match the rt device. */
581+
rtdev_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
/* A start that lies within the data device maps to the beginning of the rt device. */
582+
if (start > rtdev_daddr)
583+
start -= rtdev_daddr;
584+
else
585+
start = 0;
586+
/* The whole range lies within the data device: nothing to do here. */
587+
if (end <= rtdev_daddr)
588+
return 0;
589+
end -= rtdev_daddr;
590+
591+
error = xfs_trans_alloc_empty(mp, &tp);
592+
if (error)
593+
return error;
594+
/* Clamp the end to the last basic block of the rt device. */
595+
end = min_t(xfs_daddr_t, end,
596+
XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks) - 1);
597+
598+
/* Convert the rt blocks to rt extents */
/*
 * NOTE(review): judging by the helper names, low is rounded up and high is
 * rounded down so that only whole rt extents inside the range are
 * considered - confirm against the helpers.
 */
599+
low.ar_startext = xfs_rtb_to_rtxup(mp, XFS_BB_TO_FSB(mp, start));
600+
high.ar_startext = xfs_rtb_to_rtx(mp, XFS_BB_TO_FSBT(mp, end));
601+
602+
/*
603+
* Walk the free ranges between low and high. The query_range function
604+
* trims the extents returned.
605+
*/
606+
do {
/*
 * Bound this batch: sb_blocksize * NBBY is the number of bits in one
 * filesystem block, presumably the rt extents tracked by a single
 * rtbitmap block - TODO confirm.
 */
607+
tr.stop_rtx = low.ar_startext + (mp->m_sb.sb_blocksize * NBBY);
/*
 * The lock taken here stays held until this batch's discards have
 * completed.  Per the commit message, the rt volume has no busy extent
 * tracking, so holding the rtbitmap lock is what prevents the extents
 * from being reallocated while they are discarded.
 */
608+
xfs_rtbitmap_lock_shared(mp, XFS_RBMLOCK_BITMAP);
609+
error = xfs_rtalloc_query_range(mp, tp, &low, &high,
610+
xfs_trim_gather_rtextent, &tr);
611+
/* -ECANCELED only means the callback hit stop_rtx; it is not a failure. */
612+
if (error == -ECANCELED)
613+
error = 0;
614+
if (error) {
615+
xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
/* Drop whatever was gathered before the query failed. */
616+
xfs_discard_free_rtdev_extents(&tr);
617+
break;
618+
}
619+
/*
 * Nothing was gathered in this batch, so stop.
 * NOTE(review): this also ends the pass when the batch was cancelled
 * at stop_rtx but every extent in it was shorter than minlen_fsb,
 * leaving the rest of the range unscanned - confirm this is intended.
 */
620+
if (list_empty(&tr.extent_list)) {
621+
xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
622+
break;
623+
}
624+
625+
error = xfs_discard_rtdev_extents(mp, &tr);
626+
xfs_rtbitmap_unlock_shared(mp, XFS_RBMLOCK_BITMAP);
627+
if (error)
628+
break;
629+
/* Resume the next batch where the callback left its cursor. */
630+
low.ar_startext = tr.restart_rtx;
631+
} while (!xfs_trim_should_stop() && low.ar_startext <= high.ar_startext);
632+
633+
xfs_trans_cancel(tp);
634+
return error;
635+
}
636+
#else
637+
# define xfs_trim_rtdev_extents(m,s,e,n,b) (-EOPNOTSUPP)
638+
#endif /* CONFIG_XFS_RT */
639+
386640
/*
387641
* trim a range of the filesystem.
388642
*
@@ -391,28 +645,37 @@ xfs_trim_extents(
391645
* addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
392646
* is a linear address range. Hence we need to use DADDR based conversions and
393647
* comparisons for determining the correct offset and regions to trim.
648+
*
649+
* The realtime device is mapped into the FITRIM "address space" immediately
650+
* after the data device.
394651
*/
395652
int
396653
xfs_ioc_trim(
397654
struct xfs_mount *mp,
398655
struct fstrim_range __user *urange)
399656
{
400-
struct xfs_perag *pag;
401657
unsigned int granularity =
402658
bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
659+
struct block_device *rt_bdev = NULL;
403660
struct fstrim_range range;
404661
xfs_daddr_t start, end;
405662
xfs_extlen_t minlen;
406-
xfs_agnumber_t start_agno, end_agno;
407-
xfs_agblock_t start_agbno, end_agbno;
663+
xfs_rfsblock_t max_blocks;
408664
uint64_t blocks_trimmed = 0;
409665
int error, last_error = 0;
410666

411667
if (!capable(CAP_SYS_ADMIN))
412668
return -EPERM;
413-
if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev))
669+
if (mp->m_rtdev_targp &&
670+
bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
671+
rt_bdev = mp->m_rtdev_targp->bt_bdev;
672+
if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
414673
return -EOPNOTSUPP;
415674

675+
if (rt_bdev)
676+
granularity = max(granularity,
677+
bdev_discard_granularity(rt_bdev));
678+
416679
/*
417680
* We haven't recovered the log, so we cannot use our bnobt-guided
418681
* storage zapping commands.
@@ -433,35 +696,27 @@ xfs_ioc_trim(
433696
* used by the fstrim application. In the end it really doesn't
434697
* matter as trimming blocks is an advisory interface.
435698
*/
436-
if (range.start >= XFS_FSB_TO_B(mp, mp->m_sb.sb_dblocks) ||
699+
max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
700+
if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
437701
range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
438702
range.len < mp->m_sb.sb_blocksize)
439703
return -EINVAL;
440704

441705
start = BTOBB(range.start);
442-
end = min_t(xfs_daddr_t, start + BTOBBT(range.len),
443-
XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) - 1;
706+
end = start + BTOBBT(range.len) - 1;
444707

445-
start_agno = xfs_daddr_to_agno(mp, start);
446-
start_agbno = xfs_daddr_to_agbno(mp, start);
447-
end_agno = xfs_daddr_to_agno(mp, end);
448-
end_agbno = xfs_daddr_to_agbno(mp, end);
449-
450-
for_each_perag_range(mp, start_agno, end_agno, pag) {
451-
xfs_agblock_t agend = pag->block_count;
452-
453-
if (start_agno == end_agno)
454-
agend = end_agbno;
455-
error = xfs_trim_extents(pag, start_agbno, agend, minlen,
708+
if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
709+
error = xfs_trim_datadev_extents(mp, start, end, minlen,
456710
&blocks_trimmed);
457711
if (error)
458712
last_error = error;
713+
}
459714

460-
if (xfs_trim_should_stop()) {
461-
xfs_perag_rele(pag);
462-
break;
463-
}
464-
start_agbno = 0;
715+
if (rt_bdev && !xfs_trim_should_stop()) {
716+
error = xfs_trim_rtdev_extents(mp, start, end, minlen,
717+
&blocks_trimmed);
718+
if (error)
719+
last_error = error;
465720
}
466721

467722
if (last_error)

0 commit comments

Comments
 (0)