On 29/03/2023 17:11, Tudor Cretu wrote:
Introduce compat versions of the structs exposed in the uAPI headers that might contain pointers as a member. Also, implement functions that convert the compat versions to the native versions of the struct.
A subsequent patch is going to change the io_uring structs to enable them to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
 include/linux/io_uring_compat.h | 129 ++++++++++++++++++
 include/linux/io_uring_types.h  |  11 +-
 io_uring/cancel.c               |  28 +++-
 io_uring/epoll.c                |   2 +-
 io_uring/fdinfo.c               |  81 ++++++-----
 io_uring/io_uring.c             | 229 ++++++++++++++++++++++----------
 io_uring/io_uring.h             | 108 ++++++++++++---
 io_uring/kbuf.c                 |  98 ++++++++++++--
 io_uring/kbuf.h                 |   6 +-
 io_uring/net.c                  |   5 +-
 io_uring/rsrc.c                 | 110 +++++++++++++--
 io_uring/tctx.c                 |  56 +++++++-
 io_uring/uring_cmd.h            |   4 +
 13 files changed, 716 insertions(+), 151 deletions(-)
 create mode 100644 include/linux/io_uring_compat.h
diff --git a/include/linux/io_uring_compat.h b/include/linux/io_uring_compat.h
new file mode 100644
index 0000000000000..3e91babe2e2ba
--- /dev/null
+++ b/include/linux/io_uring_compat.h
@@ -0,0 +1,129 @@
I have no idea why include/linux/io_uring.h has no license header, but in any case we should include one. The most common in recently added files in include/linux seems to be:
/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef IO_URING_COMPAT_H
+#define IO_URING_COMPAT_H
[...]
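For reference, with that line added the top of the new header would look something like this (just a sketch; the guard macro is taken from the quoted patch, and the closing #endif comment is only the usual convention):

/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef IO_URING_COMPAT_H
#define IO_URING_COMPAT_H
[...]
#endif /* IO_URING_COMPAT_H */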
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c
index bc8c9d764bc13..fd02317627ae7 100644
--- a/io_uring/fdinfo.c
+++ b/io_uring/fdinfo.c
@@ -48,6 +48,38 @@ static __cold int io_uring_show_cred(struct seq_file *m, unsigned int id,
 	return 0;
 }
 
+#define print_sqe(m, sqe, sq_idx, sq_shift)				\
do { \
Nit: could reduce the indentation level by one (starting from one tab is enough).
seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " \
"addr:0x%llx, rw_flags:0x%x, buf_index:%d " \
"user_data:%llu", \
sq_idx, io_uring_get_opcode((sqe)->opcode), (sqe)->fd, \
Missing parentheses around sq_idx, also around cq_idx below.
(sqe)->flags, (unsigned long long) (sqe)->off, \
(unsigned long long) (sqe)->addr, (sqe)->rw_flags, \
(sqe)->buf_index, (sqe)->user_data); \
if (sq_shift) { \
u64 *sqeb = (void *) ((sqe) + 1); \
int size = sizeof(*(sqe)) / sizeof(u64); \
int j; \
\
for (j = 0; j < size; j++) { \
seq_printf(m, ", e%d:0x%llx", j, \
(unsigned long long) *sqeb); \
Nit: that line should be indented further.
sqeb++; \
} \
} \
} while (0)
+#define print_cqe(m, cqe, cq_idx, cq_shift) \
do { \
seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", \
cq_idx, (cqe)->user_data, (cqe)->res, \
(cqe)->flags); \
if (cq_shift) \
seq_printf(m, ", extra1:%llu, extra2:%llu\n", \
(cqe)->big_cqe[0], (cqe)->big_cqe[1]); \
} while (0)
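On the parentheses nit above: in print_sqe()/print_cqe() the index is only ever passed straight through as a seq_printf() argument, so this is more about hygiene than an actual bug here. A self-contained toy example (made-up names, not io_uring code) of how an unparenthesised macro parameter goes wrong once the caller passes an expression:

#include <stdio.h>

#define DOUBLE_BAD(idx)  (2 * idx)      /* parameter not parenthesised */
#define DOUBLE_GOOD(idx) (2 * (idx))    /* parameter parenthesised */

int main(void)
{
        printf("%d\n", DOUBLE_BAD(1 + 2));      /* expands to (2 * 1 + 2), prints 4 */
        printf("%d\n", DOUBLE_GOOD(1 + 2));     /* expands to (2 * (1 + 2)), prints 6 */
        return 0;
}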
 static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
                                           struct seq_file *m)
 {
@@ -88,45 +120,32 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 	sq_entries = min(sq_tail - sq_head, ctx->sq_entries);
 	for (i = 0; i < sq_entries; i++) {
 		unsigned int entry = i + sq_head;
-		struct io_uring_sqe *sqe;
-		unsigned int sq_idx;
+		unsigned int sq_idx, sq_off;
 
 		sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]);
 		if (sq_idx > sq_mask)
 			continue;
-		sqe = &ctx->sq_sqes[sq_idx << sq_shift];
-		seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, "
-			      "addr:0x%llx, rw_flags:0x%x, buf_index:%d "
-			      "user_data:%llu",
-			   sq_idx, io_uring_get_opcode(sqe->opcode), sqe->fd,
-			   sqe->flags, (unsigned long long) sqe->off,
-			   (unsigned long long) sqe->addr, sqe->rw_flags,
-			   sqe->buf_index, sqe->user_data);
-		if (sq_shift) {
-			u64 *sqeb = (void *) (sqe + 1);
-			int size = sizeof(struct io_uring_sqe) / sizeof(u64);
-			int j;
-
-			for (j = 0; j < size; j++) {
-				seq_printf(m, ", e%d:0x%llx", j,
-						(unsigned long long) *sqeb);
-				sqeb++;
-			}
-		}
+		sq_off = sq_idx << sq_shift;
+		if (io_in_compat64(ctx))
+			print_sqe(m, &ctx->sq_sqes_compat[sq_off], sq_idx, sq_shift);
+		else
+			print_sqe(m, &ctx->sq_sqes[sq_off], sq_idx, sq_shift);
 		seq_printf(m, "\n");
 	}
 	seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head);
 	cq_entries = min(cq_tail - cq_head, ctx->cq_entries);
 	for (i = 0; i < cq_entries; i++) {
 		unsigned int entry = i + cq_head;
-		struct io_uring_cqe *cqe = &ctx->cqes[(entry & cq_mask) << cq_shift];
-
-		seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x",
-			   entry & cq_mask, cqe->user_data, cqe->res,
-			   cqe->flags);
-		if (cq_shift)
-			seq_printf(m, ", extra1:%llu, extra2:%llu\n",
-				   cqe->big_cqe[0], cqe->big_cqe[1]);
+		unsigned int cq_idx = entry & cq_mask;
+		unsigned int cq_off = cq_idx << cq_shift;
+
+		if (io_in_compat64(ctx))
+			print_cqe(m, &ctx->cqes_compat[cq_off], cq_idx, cq_shift);
+		else
+			print_cqe(m, &ctx->cqes[cq_off], cq_idx, cq_shift);
 		seq_printf(m, "\n");
 	}
@@ -192,12 +211,14 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx,
 		seq_printf(m, "  user_data=%llu, res=%d, flags=%x\n",
 			   cqe->user_data, cqe->res, cqe->flags);
Looks like another leftover.
 	}
 	spin_unlock(&ctx->completion_lock);
 }
+#undef print_sqe
+#undef print_cqe
I think it's OK to leave the macros defined in the rest of the file - they are defined outside of any function and they could in principle be used by another function.
 __cold void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
 {
 	struct io_ring_ctx *ctx = f->private_data;

diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index fb6d07e1e7358..a355f2a2e7ac3 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -152,6 +152,37 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx);
 
 static struct kmem_cache *req_cachep;
 
+static int get_compat64_io_uring_getevents_arg(struct io_uring_getevents_arg *arg,
+					       const void __user *user_arg)
+{
+	struct compat_io_uring_getevents_arg compat_arg;
+
+	if (copy_from_user(&compat_arg, user_arg, sizeof(compat_arg)))
+		return -EFAULT;
+
+	arg->sigmask = compat_arg.sigmask;
+	arg->sigmask_sz = compat_arg.sigmask_sz;
+	arg->pad = compat_arg.pad;
+	arg->ts = compat_arg.ts;
+	return 0;
+}
+
+static int copy_io_uring_getevents_arg_from_user(struct io_ring_ctx *ctx,
+						 struct io_uring_getevents_arg *arg,
+						 const void __user *argp,
+						 size_t size)
+{
+	if (io_in_compat64(ctx)) {
+		if (size != sizeof(struct compat_io_uring_getevents_arg))
+			return -EINVAL;
+		return get_compat64_io_uring_getevents_arg(arg, argp);
+	}
+	if (size != sizeof(*arg))
+		return -EINVAL;
+	if (copy_from_user(arg, argp, sizeof(*arg)))
+		return -EFAULT;
+	return 0;
+}
 
 struct sock *io_uring_get_socket(struct file *file)
 {
 #if defined(CONFIG_UNIX)
@@ -604,14 +635,10 @@ void io_cq_unlock_post(struct io_ring_ctx *ctx)
 static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 {
 	bool all_flushed;
-	size_t cqe_size = sizeof(struct io_uring_cqe);
 
 	if (!force && __io_cqring_events(ctx) == ctx->cq_entries)
 		return false;
 
-	if (ctx->flags & IORING_SETUP_CQE32)
-		cqe_size <<= 1;
-
 	io_cq_lock(ctx);
 	while (!list_empty(&ctx->cq_overflow_list)) {
 		struct io_uring_cqe *cqe = io_get_cqe_overflow(ctx, true);
@@ -621,9 +648,18 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force)
 			break;
 		ocqe = list_first_entry(&ctx->cq_overflow_list,
 					struct io_overflow_cqe, list);
-		if (cqe)
-			memcpy(cqe, &ocqe->cqe, cqe_size);
-		else
+		if (cqe) {
+			u64 extra1 = 0;
+			u64 extra2 = 0;
+
+			if (ctx->flags & IORING_SETUP_CQE32) {
+				extra1 = ocqe->cqe.big_cqe[0];
+				extra2 = ocqe->cqe.big_cqe[1];
+			}
+
+			__io_fill_cqe(ctx, cqe, ocqe->cqe.user_data, ocqe->cqe.res,
+				      ocqe->cqe.flags, extra1, extra2);
+		} else
 			io_account_cq_overflow(ctx);
Nit: if the "if" branch uses braces, so should the "else" branch too (see the coding style). This is not always respected in practice, but I think it is more readable (and less error-prone).
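For the hunk above that would be, e.g.:

		} else {
			io_account_cq_overflow(ctx);
		}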
 		list_del(&ocqe->list);
@@ -735,6 +771,15 @@ bool io_req_cqe_overflow(struct io_kiocb *req)
 					req->cqe.res, req->cqe.flags,
 					req->extra1, req->extra2);
 }
 
+/*
+ * Retrieves the pointer of the ith CQE
Nit: "a pointer to..."
+ */
+struct io_uring_cqe *__io_get_ith_cqe(struct io_ring_ctx *ctx, unsigned int i)
Very minor: would make a bit more sense to add it *after* __io_get_cqe() (in the header too), since the latter is the more usual interface.
+{
+	return io_in_compat64(ctx) ?
+		(struct io_uring_cqe *)&ctx->cqes_compat[i] :
+		&ctx->cqes[i];
+}
 
 /*
  * writes to the cq entry need to come after reading head; the
@@ -774,7 +819,7 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
 	ctx->cqe_cached++;
 	if (ctx->flags & IORING_SETUP_CQE32)
 		ctx->cqe_cached++;
-	return &ctx->cqes[off];
+	return __io_get_ith_cqe(ctx, off);
 }
 
 bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags,
@@ -793,14 +838,7 @@ bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags
 	if (likely(cqe)) {
 		trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
-		WRITE_ONCE(cqe->user_data, user_data);
-		WRITE_ONCE(cqe->res, res);
-		WRITE_ONCE(cqe->flags, cflags);
-		if (ctx->flags & IORING_SETUP_CQE32) {
-			WRITE_ONCE(cqe->big_cqe[0], 0);
-			WRITE_ONCE(cqe->big_cqe[1], 0);
-		}
+		__io_fill_cqe(ctx, cqe, user_data, res, cflags, 0, 0);
 		return true;
 	}
@@ -2240,7 +2278,9 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx)
 		/* double index for 128-byte SQEs, twice as long */
 		if (ctx->flags & IORING_SETUP_SQE128)
 			head <<= 1;
-		return &ctx->sq_sqes[head];
+		return io_in_compat64(ctx) ?
+			(struct io_uring_sqe *)&ctx->sq_sqes_compat[head] :
+			&ctx->sq_sqes[head];
 	}
 
 	/* drop invalid entries */
@@ -2267,6 +2307,7 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 	do {
 		const struct io_uring_sqe *sqe;
 		struct io_kiocb *req;
+		struct io_uring_sqe native_sqe[2];
 
 		if (unlikely(!io_alloc_req_refill(ctx)))
 			break;
@@ -2276,6 +2317,11 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr)
 			io_req_add_to_cache(req, ctx);
 			break;
 		}
+		if (io_in_compat64(ctx)) {
+			convert_compat64_io_uring_sqe(ctx, native_sqe,
+						      (struct compat_io_uring_sqe *)sqe);
+			sqe = native_sqe;
+		}
 
 		/*
 		 * Continue submitting even for sqe failure if the
@@ -2480,6 +2526,9 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
 {
 	struct io_rings *rings;
 	size_t off, cq_array_size, sq_array_size;
+	size_t cqe_size = io_in_compat64(ctx) ?
+			  sizeof(struct compat_io_uring_cqe) :
+			  sizeof(struct io_uring_cqe);
 
 	off = sizeof(*rings);
@@ -2492,7 +2541,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries
 	if (cq_offset)
 		*cq_offset = off;
-	cq_array_size = array_size(sizeof(struct io_uring_cqe), cq_entries);
+	cq_array_size = array_size(cqe_size, cq_entries);
 	if (cq_array_size == SIZE_MAX)
 		return SIZE_MAX;
@@ -3120,20 +3169,19 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file,
 #endif /* !CONFIG_MMU */
 
-static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz)
+static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned int flags,
+			       const void __user *argp, size_t argsz)
 {
 	if (flags & IORING_ENTER_EXT_ARG) {
 		struct io_uring_getevents_arg arg;
-		if (argsz != sizeof(arg))
-			return -EINVAL;
-		if (copy_from_user(&arg, argp, sizeof(arg)))
-			return -EFAULT;
+		return copy_io_uring_getevents_arg_from_user(ctx, &arg, argp, argsz);
 	}
 	return 0;
 }
 
-static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz,
+static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned int flags,
+		const void __user *argp, size_t *argsz,
Nit: should be aligned on the opening parenthesis.
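i.e. something like:

static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned int flags,
			  const void __user *argp, size_t *argsz,

with the continuation line starting right under the first argument.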
Kevin