Skip to content

Commit 50c5225

Browse files
isilence authored and axboe committed
block: implement async io_uring discard cmd
io_uring allows implementing custom file-specific asynchronous operations via the fops->uring_cmd callback, a.k.a. IORING_OP_URING_CMD requests or just io_uring commands. Use it to add support for async discards.

Normally, it first tries to queue up bios in a non-blocking context, and if that fails, we retry from a blocking context by returning -EAGAIN to the core io_uring. We always get the result from bios asynchronously by setting a custom bi_end_io callback, at which point we drag the request into the task context to either reissue or complete it and post a completion to the user.

Unlike ioctl(BLKDISCARD), which has stronger guarantees against races, we only make a best-effort attempt to invalidate the page cache; it can race with any writes and reads and leave the page cache stale. It's the same kind of race we allow for direct writes.

Also, apart from cases where discarding is not allowed at all, e.g. discards are not supported or the file/device is read-only, the user should assume that the sector range on disk is not valid anymore, even when an error was returned to the user.

Suggested-by: Conrad Meyer <[email protected]>
Signed-off-by: Pavel Begunkov <[email protected]>
Link: https://lore.kernel.org/r/2b5210443e4fa0257934f73dfafcc18a77cd0e09.1726072086.git.asml.silence@gmail.com
Signed-off-by: Jens Axboe <[email protected]>
1 parent 7a07210 commit 50c5225

File tree

4 files changed

+129
-0
lines changed

4 files changed

+129
-0
lines changed

block/blk.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,7 @@ blk_mode_t file_to_blk_mode(struct file *file);
609609
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
610610
loff_t lstart, loff_t lend);
611611
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
612+
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
612613
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
613614

614615
extern const struct address_space_operations def_blk_aops;

block/fops.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <linux/fs.h>
1818
#include <linux/iomap.h>
1919
#include <linux/module.h>
20+
#include <linux/io_uring/cmd.h>
2021
#include "blk.h"
2122

2223
static inline struct inode *bdev_file_inode(struct file *file)
@@ -873,6 +874,7 @@ const struct file_operations def_blk_fops = {
873874
.splice_read = filemap_splice_read,
874875
.splice_write = iter_file_splice_write,
875876
.fallocate = blkdev_fallocate,
877+
.uring_cmd = blkdev_uring_cmd,
876878
.fop_flags = FOP_BUFFER_RASYNC,
877879
};
878880

block/ioctl.c

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
#include <linux/blktrace_api.h>
1212
#include <linux/pr.h>
1313
#include <linux/uaccess.h>
14+
#include <linux/pagemap.h>
15+
#include <linux/io_uring/cmd.h>
16+
#include <uapi/linux/blkdev.h>
1417
#include "blk.h"
1518

1619
static int blkpg_do_ioctl(struct block_device *bdev,
@@ -748,3 +751,112 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
748751
return ret;
749752
}
750753
#endif
754+
755+
struct blk_iou_cmd {
756+
int res;
757+
bool nowait;
758+
};
759+
760+
static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
761+
{
762+
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
763+
764+
if (bic->res == -EAGAIN && bic->nowait)
765+
io_uring_cmd_issue_blocking(cmd);
766+
else
767+
io_uring_cmd_done(cmd, bic->res, 0, issue_flags);
768+
}
769+
770+
static void bio_cmd_bio_end_io(struct bio *bio)
771+
{
772+
struct io_uring_cmd *cmd = bio->bi_private;
773+
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
774+
775+
if (unlikely(bio->bi_status) && !bic->res)
776+
bic->res = blk_status_to_errno(bio->bi_status);
777+
778+
io_uring_cmd_do_in_task_lazy(cmd, blk_cmd_complete);
779+
bio_put(bio);
780+
}
781+
782+
/*
 * Issue an asynchronous discard for the byte range [start, start + len)
 * of @bdev on behalf of the io_uring command @cmd.
 *
 * Returns:
 *   -EOPNOTSUPP  bdev_max_discard_sectors() reports 0 for the device
 *   -EBADF       the file was not opened with BLK_OPEN_WRITE
 *   -EPERM       the block device is read-only
 *   -EAGAIN      no bio was allocated, or a non-blocking issue would have
 *                needed more than one bio
 *   -EIOCBQUEUED the last bio was submitted; bio_cmd_bio_end_io() will
 *                drive the completion
 *   or the error returned by blk_validate_byte_range() /
 *   filemap_invalidate_pages().
 */
static int blkdev_cmd_discard(struct io_uring_cmd *cmd,
			      struct block_device *bdev,
			      uint64_t start, uint64_t len, bool nowait)
{
	struct blk_iou_cmd *state = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
	gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL;
	sector_t sector = start >> SECTOR_SHIFT;
	sector_t nr_sects = len >> SECTOR_SHIFT;
	struct bio *chain = NULL;
	struct bio *bio;
	int ret;

	if (!bdev_max_discard_sectors(bdev))
		return -EOPNOTSUPP;
	if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE))
		return -EBADF;
	if (bdev_read_only(bdev))
		return -EPERM;

	ret = blk_validate_byte_range(bdev, start, len);
	if (ret)
		return ret;

	ret = filemap_invalidate_pages(bdev->bd_mapping, start,
				       start + len - 1, nowait);
	if (ret)
		return ret;

	for (;;) {
		bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, gfp);
		if (!bio)
			break;

		/*
		 * A non-blocking submission must fit in a single bio: later
		 * bios could fail without us getting a direct indication of
		 * it. Bail out so the caller can retry from a blocking
		 * context.
		 */
		if (nowait && unlikely(nr_sects)) {
			bio_put(bio);
			return -EAGAIN;
		}
		if (nowait)
			bio->bi_opf |= REQ_NOWAIT;

		chain = bio_chain_and_submit(chain, bio);
	}

	if (unlikely(!chain))
		return -EAGAIN;

	/* Sectors are left over: record -EAGAIN as the command result. */
	if (unlikely(nr_sects))
		state->res = -EAGAIN;

	chain->bi_end_io = bio_cmd_bio_end_io;
	chain->bi_private = cmd;
	submit_bio(chain);
	return -EIOCBQUEUED;
}
838+
839+
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
840+
{
841+
struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host);
842+
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
843+
const struct io_uring_sqe *sqe = cmd->sqe;
844+
u32 cmd_op = cmd->cmd_op;
845+
uint64_t start, len;
846+
847+
if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len ||
848+
sqe->rw_flags || sqe->file_index))
849+
return -EINVAL;
850+
851+
bic->res = 0;
852+
bic->nowait = issue_flags & IO_URING_F_NONBLOCK;
853+
854+
start = READ_ONCE(sqe->addr);
855+
len = READ_ONCE(sqe->addr3);
856+
857+
switch (cmd_op) {
858+
case BLOCK_URING_CMD_DISCARD:
859+
return blkdev_cmd_discard(cmd, bdev, start, len, bic->nowait);
860+
}
861+
return -EINVAL;
862+
}

include/uapi/linux/blkdev.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
2+
#ifndef _UAPI_LINUX_BLKDEV_H
3+
#define _UAPI_LINUX_BLKDEV_H
4+
5+
#include <linux/ioctl.h>
6+
#include <linux/types.h>
7+
8+
/*
9+
 * io_uring block file commands, see IORING_OP_URING_CMD.
10+
 * These commands use a number space separate from ioctl(); they reuse
 * the block layer's type code 0x12.
11+
 */
12+
/*
 * Discard a byte range of the block device: the handler reads the start
 * offset from sqe->addr and the length from sqe->addr3.
 */
#define BLOCK_URING_CMD_DISCARD _IO(0x12, 0)
13+
14+
#endif

0 commit comments

Comments
 (0)