Skip to content

Commit adfc3de

Browse files
committed
Merge tag 'for-6.12/io_uring-discard-20240913' of git://git.kernel.dk/linux
Pull io_uring async discard support from Jens Axboe:

 "Sitting on top of both the 6.12 block and io_uring core branches,
  here's support for async discard through io_uring.

  This allows applications to issue async discards, rather than rely on
  the blocking sync ioctl discards we already have. The sync support is
  difficult to use outside of idle/cleanup periods.

  On a real (but slow) device, testing shows the following results when
  compared to sync discard:

    qd64 sync discard:  21K IOPS, lat avg 3 msec (max 21 msec)
    qd64 async discard: 76K IOPS, lat avg 845 usec (max 2.2 msec)

    qd64 sync discard:  14K IOPS, lat avg 5 msec (max 25 msec)
    qd64 async discard: 56K IOPS, lat avg 1153 usec (max 3.6 msec)

  and synthetic null_blk testing with the same queue depth and block
  size settings as above shows:

    Type    Trim size    IOPS    Lat avg (usec)    Lat Max (usec)
    ==============================================================
    sync    4k            144K              444             20314
    async   4k           1353K               47               595
    sync    1M             56K             1136             21031
    async   1M             94K              680               760"

* tag 'for-6.12/io_uring-discard-20240913' of git://git.kernel.dk/linux:
  block: implement async io_uring discard cmd
  block: introduce blk_validate_byte_range()
  filemap: introduce filemap_invalidate_pages
  io_uring/cmd: give inline space in request to cmds
  io_uring/cmd: expose iowq to cmds
2 parents 26bb0d3 + 50c5225 commit adfc3de

File tree

10 files changed

+209
-24
lines changed

10 files changed

+209
-24
lines changed

block/blk.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -609,6 +609,7 @@ blk_mode_t file_to_blk_mode(struct file *file);
609609
int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
610610
loff_t lstart, loff_t lend);
611611
long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
612+
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags);
612613
long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
613614

614615
extern const struct address_space_operations def_blk_aops;

block/fops.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include <linux/fs.h>
1818
#include <linux/iomap.h>
1919
#include <linux/module.h>
20+
#include <linux/io_uring/cmd.h>
2021
#include "blk.h"
2122

2223
static inline struct inode *bdev_file_inode(struct file *file)
@@ -865,6 +866,7 @@ const struct file_operations def_blk_fops = {
865866
.splice_read = filemap_splice_read,
866867
.splice_write = iter_file_splice_write,
867868
.fallocate = blkdev_fallocate,
869+
.uring_cmd = blkdev_uring_cmd,
868870
.fop_flags = FOP_BUFFER_RASYNC,
869871
};
870872

block/ioctl.c

Lines changed: 144 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
#include <linux/blktrace_api.h>
1212
#include <linux/pr.h>
1313
#include <linux/uaccess.h>
14+
#include <linux/pagemap.h>
15+
#include <linux/io_uring/cmd.h>
16+
#include <uapi/linux/blkdev.h>
1417
#include "blk.h"
1518

1619
static int blkpg_do_ioctl(struct block_device *bdev,
@@ -92,41 +95,54 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
9295
}
9396
#endif
9497

98+
/*
99+
* Check that [start, start + len) is a valid range from the block device's
100+
* perspective, including verifying that it can be correctly translated into
101+
* logical block addresses.
102+
*/
103+
static int blk_validate_byte_range(struct block_device *bdev,
104+
uint64_t start, uint64_t len)
105+
{
106+
unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
107+
uint64_t end;
108+
109+
if ((start | len) & bs_mask)
110+
return -EINVAL;
111+
if (!len)
112+
return -EINVAL;
113+
if (check_add_overflow(start, len, &end) || end > bdev_nr_bytes(bdev))
114+
return -EINVAL;
115+
116+
return 0;
117+
}
118+
95119
static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
96120
unsigned long arg)
97121
{
98-
unsigned int bs_mask = bdev_logical_block_size(bdev) - 1;
99-
uint64_t range[2], start, len, end;
122+
uint64_t range[2], start, len;
100123
struct bio *prev = NULL, *bio;
101124
sector_t sector, nr_sects;
102125
struct blk_plug plug;
103126
int err;
104127

105-
if (!(mode & BLK_OPEN_WRITE))
106-
return -EBADF;
107-
108-
if (!bdev_max_discard_sectors(bdev))
109-
return -EOPNOTSUPP;
110-
if (bdev_read_only(bdev))
111-
return -EPERM;
112-
113128
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
114129
return -EFAULT;
115-
116130
start = range[0];
117131
len = range[1];
118132

119-
if (!len)
120-
return -EINVAL;
121-
if ((start | len) & bs_mask)
122-
return -EINVAL;
133+
if (!bdev_max_discard_sectors(bdev))
134+
return -EOPNOTSUPP;
123135

124-
if (check_add_overflow(start, len, &end) ||
125-
end > bdev_nr_bytes(bdev))
126-
return -EINVAL;
136+
if (!(mode & BLK_OPEN_WRITE))
137+
return -EBADF;
138+
if (bdev_read_only(bdev))
139+
return -EPERM;
140+
err = blk_validate_byte_range(bdev, start, len);
141+
if (err)
142+
return err;
127143

128144
filemap_invalidate_lock(bdev->bd_mapping);
129-
err = truncate_bdev_range(bdev, mode, start, end - 1);
145+
err = truncate_bdev_range(bdev, mode, start, start + len - 1);
130146
if (err)
131147
goto fail;
132148

@@ -735,3 +751,112 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
735751
return ret;
736752
}
737753
#endif
754+
755+
struct blk_iou_cmd {
756+
int res;
757+
bool nowait;
758+
};
759+
760+
static void blk_cmd_complete(struct io_uring_cmd *cmd, unsigned int issue_flags)
761+
{
762+
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
763+
764+
if (bic->res == -EAGAIN && bic->nowait)
765+
io_uring_cmd_issue_blocking(cmd);
766+
else
767+
io_uring_cmd_done(cmd, bic->res, 0, issue_flags);
768+
}
769+
770+
static void bio_cmd_bio_end_io(struct bio *bio)
771+
{
772+
struct io_uring_cmd *cmd = bio->bi_private;
773+
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
774+
775+
if (unlikely(bio->bi_status) && !bic->res)
776+
bic->res = blk_status_to_errno(bio->bi_status);
777+
778+
io_uring_cmd_do_in_task_lazy(cmd, blk_cmd_complete);
779+
bio_put(bio);
780+
}
781+
782+
static int blkdev_cmd_discard(struct io_uring_cmd *cmd,
783+
struct block_device *bdev,
784+
uint64_t start, uint64_t len, bool nowait)
785+
{
786+
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
787+
gfp_t gfp = nowait ? GFP_NOWAIT : GFP_KERNEL;
788+
sector_t sector = start >> SECTOR_SHIFT;
789+
sector_t nr_sects = len >> SECTOR_SHIFT;
790+
struct bio *prev = NULL, *bio;
791+
int err;
792+
793+
if (!bdev_max_discard_sectors(bdev))
794+
return -EOPNOTSUPP;
795+
if (!(file_to_blk_mode(cmd->file) & BLK_OPEN_WRITE))
796+
return -EBADF;
797+
if (bdev_read_only(bdev))
798+
return -EPERM;
799+
err = blk_validate_byte_range(bdev, start, len);
800+
if (err)
801+
return err;
802+
803+
err = filemap_invalidate_pages(bdev->bd_mapping, start,
804+
start + len - 1, nowait);
805+
if (err)
806+
return err;
807+
808+
while (true) {
809+
bio = blk_alloc_discard_bio(bdev, &sector, &nr_sects, gfp);
810+
if (!bio)
811+
break;
812+
if (nowait) {
813+
/*
814+
* Don't allow multi-bio non-blocking submissions as
815+
* subsequent bios may fail but we won't get a direct
816+
* indication of that. Normally, the caller should
817+
* retry from a blocking context.
818+
*/
819+
if (unlikely(nr_sects)) {
820+
bio_put(bio);
821+
return -EAGAIN;
822+
}
823+
bio->bi_opf |= REQ_NOWAIT;
824+
}
825+
826+
prev = bio_chain_and_submit(prev, bio);
827+
}
828+
if (unlikely(!prev))
829+
return -EAGAIN;
830+
if (unlikely(nr_sects))
831+
bic->res = -EAGAIN;
832+
833+
prev->bi_private = cmd;
834+
prev->bi_end_io = bio_cmd_bio_end_io;
835+
submit_bio(prev);
836+
return -EIOCBQUEUED;
837+
}
838+
839+
int blkdev_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
840+
{
841+
struct block_device *bdev = I_BDEV(cmd->file->f_mapping->host);
842+
struct blk_iou_cmd *bic = io_uring_cmd_to_pdu(cmd, struct blk_iou_cmd);
843+
const struct io_uring_sqe *sqe = cmd->sqe;
844+
u32 cmd_op = cmd->cmd_op;
845+
uint64_t start, len;
846+
847+
if (unlikely(sqe->ioprio || sqe->__pad1 || sqe->len ||
848+
sqe->rw_flags || sqe->file_index))
849+
return -EINVAL;
850+
851+
bic->res = 0;
852+
bic->nowait = issue_flags & IO_URING_F_NONBLOCK;
853+
854+
start = READ_ONCE(sqe->addr);
855+
len = READ_ONCE(sqe->addr3);
856+
857+
switch (cmd_op) {
858+
case BLOCK_URING_CMD_DISCARD:
859+
return blkdev_cmd_discard(cmd, bdev, start, len, bic->nowait);
860+
}
861+
return -EINVAL;
862+
}

include/linux/io_uring/cmd.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,15 @@ static inline const void *io_uring_sqe_cmd(const struct io_uring_sqe *sqe)
2323
return sqe->cmd;
2424
}
2525

26+
static inline void io_uring_cmd_private_sz_check(size_t cmd_sz)
27+
{
28+
BUILD_BUG_ON(cmd_sz > sizeof_field(struct io_uring_cmd, pdu));
29+
}
30+
#define io_uring_cmd_to_pdu(cmd, pdu_type) ( \
31+
io_uring_cmd_private_sz_check(sizeof(pdu_type)), \
32+
((pdu_type *)&(cmd)->pdu) \
33+
)
34+
2635
#if defined(CONFIG_IO_URING)
2736
int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
2837
struct iov_iter *iter, void *ioucmd);
@@ -48,6 +57,9 @@ void __io_uring_cmd_do_in_task(struct io_uring_cmd *ioucmd,
4857
void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
4958
unsigned int issue_flags);
5059

60+
/* Execute the request from a blocking context */
61+
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd);
62+
5163
#else
5264
static inline int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
5365
struct iov_iter *iter, void *ioucmd)
@@ -67,6 +79,9 @@ static inline void io_uring_cmd_mark_cancelable(struct io_uring_cmd *cmd,
6779
unsigned int issue_flags)
6880
{
6981
}
82+
static inline void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
83+
{
84+
}
7085
#endif
7186

7287
/*

include/linux/pagemap.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
3232
pgoff_t start, pgoff_t end);
3333
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count);
3434
void kiocb_invalidate_post_direct_write(struct kiocb *iocb, size_t count);
35+
int filemap_invalidate_pages(struct address_space *mapping,
36+
loff_t pos, loff_t end, bool nowait);
3537

3638
int write_inode_now(struct inode *, int sync);
3739
int filemap_fdatawrite(struct address_space *);

include/uapi/linux/blkdev.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
2+
#ifndef _UAPI_LINUX_BLKDEV_H
3+
#define _UAPI_LINUX_BLKDEV_H
4+
5+
#include <linux/ioctl.h>
6+
#include <linux/types.h>
7+
8+
/*
9+
* io_uring block file commands, see IORING_OP_URING_CMD.
10+
* This is a different number space from ioctl(), so reuse the block ioctl code 0x12.
11+
*/
12+
#define BLOCK_URING_CMD_DISCARD _IO(0x12, 0)
13+
14+
#endif

io_uring/io_uring.c

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,17 @@ static void io_queue_iowq(struct io_kiocb *req)
533533
io_queue_linked_timeout(link);
534534
}
535535

536+
static void io_req_queue_iowq_tw(struct io_kiocb *req, struct io_tw_state *ts)
537+
{
538+
io_queue_iowq(req);
539+
}
540+
541+
void io_req_queue_iowq(struct io_kiocb *req)
542+
{
543+
req->io_task_work.func = io_req_queue_iowq_tw;
544+
io_req_task_work_add(req);
545+
}
546+
536547
static __cold void io_queue_deferred(struct io_ring_ctx *ctx)
537548
{
538549
while (!list_empty(&ctx->defer_list)) {

io_uring/io_uring.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ int io_uring_alloc_task_context(struct task_struct *task,
9494

9595
int io_ring_add_registered_file(struct io_uring_task *tctx, struct file *file,
9696
int start, int end);
97+
void io_req_queue_iowq(struct io_kiocb *req);
9798

9899
int io_poll_issue(struct io_kiocb *req, struct io_tw_state *ts);
99100
int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr);

io_uring/uring_cmd.c

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,13 @@ int io_uring_cmd_import_fixed(u64 ubuf, unsigned long len, int rw,
277277
}
278278
EXPORT_SYMBOL_GPL(io_uring_cmd_import_fixed);
279279

280+
void io_uring_cmd_issue_blocking(struct io_uring_cmd *ioucmd)
281+
{
282+
struct io_kiocb *req = cmd_to_io_kiocb(ioucmd);
283+
284+
io_req_queue_iowq(req);
285+
}
286+
280287
static inline int io_uring_cmd_getsockopt(struct socket *sock,
281288
struct io_uring_cmd *cmd,
282289
unsigned int issue_flags)

mm/filemap.c

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2712,14 +2712,12 @@ int kiocb_write_and_wait(struct kiocb *iocb, size_t count)
27122712
}
27132713
EXPORT_SYMBOL_GPL(kiocb_write_and_wait);
27142714

2715-
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
2715+
int filemap_invalidate_pages(struct address_space *mapping,
2716+
loff_t pos, loff_t end, bool nowait)
27162717
{
2717-
struct address_space *mapping = iocb->ki_filp->f_mapping;
2718-
loff_t pos = iocb->ki_pos;
2719-
loff_t end = pos + count - 1;
27202718
int ret;
27212719

2722-
if (iocb->ki_flags & IOCB_NOWAIT) {
2720+
if (nowait) {
27232721
/* we could block if there are any pages in the range */
27242722
if (filemap_range_has_page(mapping, pos, end))
27252723
return -EAGAIN;
@@ -2738,6 +2736,15 @@ int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
27382736
return invalidate_inode_pages2_range(mapping, pos >> PAGE_SHIFT,
27392737
end >> PAGE_SHIFT);
27402738
}
2739+
2740+
int kiocb_invalidate_pages(struct kiocb *iocb, size_t count)
2741+
{
2742+
struct address_space *mapping = iocb->ki_filp->f_mapping;
2743+
2744+
return filemap_invalidate_pages(mapping, iocb->ki_pos,
2745+
iocb->ki_pos + count - 1,
2746+
iocb->ki_flags & IOCB_NOWAIT);
2747+
}
27412748
EXPORT_SYMBOL_GPL(kiocb_invalidate_pages);
27422749

27432750
/**

0 commit comments

Comments
 (0)