Skip to content

Commit 5719e28

Browse files
thejhaxboe
authored andcommitted
io_uring/rsrc: Simplify buffer cloning by locking both rings
The locking in the buffer cloning code is somewhat complex because it goes back and forth between locking the source ring and the destination ring. Make it easier to reason about by locking both rings at the same time. To avoid ABBA deadlocks, lock the rings in ascending kernel address order, just like in lock_two_nondirectories(). Signed-off-by: Jann Horn <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Jens Axboe <[email protected]>
1 parent 95ec54a commit 5719e28

File tree

1 file changed

+40
-33
lines changed

1 file changed

+40
-33
lines changed

io_uring/rsrc.c

Lines changed: 40 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -921,13 +921,26 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
921921
return 0;
922922
}
923923

924+
/* Lock two rings at once. The rings must be different! */
925+
static void lock_two_rings(struct io_ring_ctx *ctx1, struct io_ring_ctx *ctx2)
926+
{
927+
if (ctx1 > ctx2)
928+
swap(ctx1, ctx2);
929+
mutex_lock(&ctx1->uring_lock);
930+
mutex_lock_nested(&ctx2->uring_lock, SINGLE_DEPTH_NESTING);
931+
}
932+
933+
/* Both rings are locked by the caller. */
924934
static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx,
925935
struct io_uring_clone_buffers *arg)
926936
{
927937
struct io_rsrc_data data;
928938
int i, ret, off, nr;
929939
unsigned int nbufs;
930940

941+
lockdep_assert_held(&ctx->uring_lock);
942+
lockdep_assert_held(&src_ctx->uring_lock);
943+
931944
/*
932945
* Accounting state is shared between the two rings; that only works if
933946
* both rings are accounted towards the same counters.
@@ -942,7 +955,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
942955
if (ctx->buf_table.nr && !(arg->flags & IORING_REGISTER_DST_REPLACE))
943956
return -EBUSY;
944957

945-
nbufs = READ_ONCE(src_ctx->buf_table.nr);
958+
nbufs = src_ctx->buf_table.nr;
946959
if (!arg->nr)
947960
arg->nr = nbufs;
948961
else if (arg->nr > nbufs)
@@ -966,27 +979,20 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
966979
}
967980
}
968981

969-
/*
970-
* Drop our own lock here. We'll setup the data we need and reference
971-
* the source buffers, then re-grab, check, and assign at the end.
972-
*/
973-
mutex_unlock(&ctx->uring_lock);
974-
975-
mutex_lock(&src_ctx->uring_lock);
976982
ret = -ENXIO;
977983
nbufs = src_ctx->buf_table.nr;
978984
if (!nbufs)
979-
goto out_unlock;
985+
goto out_free;
980986
ret = -EINVAL;
981987
if (!arg->nr)
982988
arg->nr = nbufs;
983989
else if (arg->nr > nbufs)
984-
goto out_unlock;
990+
goto out_free;
985991
ret = -EOVERFLOW;
986992
if (check_add_overflow(arg->nr, arg->src_off, &off))
987-
goto out_unlock;
993+
goto out_free;
988994
if (off > nbufs)
989-
goto out_unlock;
995+
goto out_free;
990996

991997
off = arg->dst_off;
992998
i = arg->src_off;
@@ -1001,7 +1007,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
10011007
dst_node = io_rsrc_node_alloc(ctx, IORING_RSRC_BUFFER);
10021008
if (!dst_node) {
10031009
ret = -ENOMEM;
1004-
goto out_unlock;
1010+
goto out_free;
10051011
}
10061012

10071013
refcount_inc(&src_node->buf->refs);
@@ -1011,10 +1017,6 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
10111017
i++;
10121018
}
10131019

1014-
/* Have a ref on the bufs now, drop src lock and re-grab our own lock */
1015-
mutex_unlock(&src_ctx->uring_lock);
1016-
mutex_lock(&ctx->uring_lock);
1017-
10181020
/*
10191021
* If asked for replace, put the old table. data->nodes[] holds both
10201022
* old and new nodes at this point.
@@ -1023,24 +1025,17 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
10231025
io_rsrc_data_free(ctx, &ctx->buf_table);
10241026

10251027
/*
1026-
* ctx->buf_table should be empty now - either the contents are being
1027-
* replaced and we just freed the table, or someone raced setting up
1028-
* a buffer table while the clone was happening. If not empty, fall
1029-
* through to failure handling.
1028+
* ctx->buf_table must be empty now - either the contents are being
1029+
* replaced and we just freed the table, or the contents are being
1030+
* copied to a ring that does not have buffers yet (checked at function
1031+
* entry).
10301032
*/
1031-
if (!ctx->buf_table.nr) {
1032-
ctx->buf_table = data;
1033-
return 0;
1034-
}
1033+
WARN_ON_ONCE(ctx->buf_table.nr);
1034+
ctx->buf_table = data;
1035+
return 0;
10351036

1036-
mutex_unlock(&ctx->uring_lock);
1037-
mutex_lock(&src_ctx->uring_lock);
1038-
/* someone raced setting up buffers, dump ours */
1039-
ret = -EBUSY;
1040-
out_unlock:
1037+
out_free:
10411038
io_rsrc_data_free(ctx, &data);
1042-
mutex_unlock(&src_ctx->uring_lock);
1043-
mutex_lock(&ctx->uring_lock);
10441039
return ret;
10451040
}
10461041

@@ -1054,6 +1049,7 @@ static int io_clone_buffers(struct io_ring_ctx *ctx, struct io_ring_ctx *src_ctx
10541049
int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
10551050
{
10561051
struct io_uring_clone_buffers buf;
1052+
struct io_ring_ctx *src_ctx;
10571053
bool registered_src;
10581054
struct file *file;
10591055
int ret;
@@ -1071,7 +1067,18 @@ int io_register_clone_buffers(struct io_ring_ctx *ctx, void __user *arg)
10711067
file = io_uring_register_get_file(buf.src_fd, registered_src);
10721068
if (IS_ERR(file))
10731069
return PTR_ERR(file);
1074-
ret = io_clone_buffers(ctx, file->private_data, &buf);
1070+
1071+
src_ctx = file->private_data;
1072+
if (src_ctx != ctx) {
1073+
mutex_unlock(&ctx->uring_lock);
1074+
lock_two_rings(ctx, src_ctx);
1075+
}
1076+
1077+
ret = io_clone_buffers(ctx, src_ctx, &buf);
1078+
1079+
if (src_ctx != ctx)
1080+
mutex_unlock(&src_ctx->uring_lock);
1081+
10751082
if (!registered_src)
10761083
fput(file);
10771084
return ret;

0 commit comments

Comments
 (0)