On 23/02/2023 18:53, Tudor Cretu wrote:
Introduce compat versions of the structs exposed in the uAPI headers that may contain pointers as members. Also, implement functions that convert the compat versions to the native versions of the structs.
A subsequent patch is going to change the io_uring structs to enable them to support new architectures. On such architectures, the current struct layout still needs to be supported for compat tasks.
Signed-off-by: Tudor Cretu <tudor.cretu@arm.com>
 include/linux/io_uring_types.h | 140 ++++++++++++++++++-
 io_uring/cancel.c              |  38 ++++-
 io_uring/epoll.c               |   2 +-
 io_uring/fdinfo.c              |  55 +++++++-
 io_uring/io_uring.c            | 248 +++++++++++++++++++++++----------
 io_uring/io_uring.h            | 122 +++++++++++++---
 io_uring/kbuf.c                | 108 ++++++++++++--
 io_uring/kbuf.h                |   6 +-
 io_uring/net.c                 |   5 +-
 io_uring/rsrc.c                | 125 +++++++++++++++--
 io_uring/tctx.c                |  57 ++++++--
 io_uring/uring_cmd.h           |   7 +
 12 files changed, 782 insertions(+), 131 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index 440179029a8f0..737bc1aa67306 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -7,6 +7,130 @@
#include <linux/llist.h>
#include <uapi/linux/io_uring.h>
+struct compat_io_uring_sqe {
- __u8 opcode;
- __u8 flags;
- __u16 ioprio;
- __s32 fd;
- union {
__u64 off;
__u64 addr2;
struct {
__u32 cmd_op;
__u32 __pad1;
};
- };
- union {
__u64 addr;
__u64 splice_off_in;
- };
- __u32 len;
- union {
__kernel_rwf_t rw_flags;
__u32 fsync_flags;
__u16 poll_events;
__u32 poll32_events;
__u32 sync_range_flags;
__u32 msg_flags;
__u32 timeout_flags;
__u32 accept_flags;
__u32 cancel_flags;
__u32 open_flags;
__u32 statx_flags;
__u32 fadvise_advice;
__u32 splice_flags;
__u32 rename_flags;
__u32 unlink_flags;
__u32 hardlink_flags;
__u32 xattr_flags;
__u32 msg_ring_flags;
__u32 uring_cmd_flags;
I see that a few new members have appeared since 5.18. This makes me wonder: is there really much point in keeping those union members in sync? This is going to prove rather painful when rebasing; in fact, they'll probably end up getting out of sync. Maybe we could simply represent the union as the member that we actually use in convert_compat_io_uring_sqe()? A comment next to that member would be enough to make it clear it's actually a union in the native struct.
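That is, something along these lines, keeping only the members that convert_compat_io_uring_sqe() actually reads (purely illustrative):

struct compat_io_uring_sqe {
	__u8	opcode;
	__u8	flags;
	__u16	ioprio;
	__s32	fd;
	__u64	addr2;		/* union w/ off, cmd_op in the native struct */
	__u64	addr;		/* union w/ splice_off_in */
	__u32	len;
	__u32	rw_flags;	/* union w/ all the other 32-bit flags */
	__u64	user_data;
	__u16	buf_index;	/* union w/ buf_group */
	__u16	personality;
	__s32	splice_fd_in;	/* union w/ file_index, addr_len */
	__u64	addr3;		/* union w/ cmd[0] */
	__u64	__pad2[1];
};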
- };
- __u64 user_data;
- union {
__u16 buf_index;
__u16 buf_group;
- } __packed;
- __u16 personality;
- union {
__s32 splice_fd_in;
__u32 file_index;
struct {
__u16 addr_len;
__u16 __pad3[1];
};
- };
- union {
struct {
__u64 addr3;
__u64 __pad2[1];
};
__u8 cmd[0];
- };
+};
+struct compat_io_uring_cqe {
- __u64 user_data;
- __s32 res;
- __u32 flags;
- __u64 big_cqe[];
Nit: might as well align it like the rest, even though it's not in the native struct. I think it would make sense to use the same alignment approach for the other structs too; the native struct declarations are not very consistent.
+};
+struct compat_io_uring_files_update {
- __u32 offset;
- __u32 resv;
- __aligned_u64 fds;
+};
+struct compat_io_uring_rsrc_register {
- __u32 nr;
- __u32 flags;
- __u64 resv2;
- __aligned_u64 data;
- __aligned_u64 tags;
+};
+struct compat_io_uring_rsrc_update {
- __u32 offset;
- __u32 resv;
- __aligned_u64 data;
+};
+struct compat_io_uring_rsrc_update2 {
- __u32 offset;
- __u32 resv;
- __aligned_u64 data;
- __aligned_u64 tags;
- __u32 nr;
- __u32 resv2;
+};
+struct compat_io_uring_buf {
- __u64 addr;
- __u32 len;
- __u16 bid;
- __u16 resv;
+};
+struct compat_io_uring_buf_ring {
- union {
struct {
__u64 resv1;
__u32 resv2;
__u16 resv3;
__u16 tail;
};
struct compat_io_uring_buf bufs[0];
- };
+};
+struct compat_io_uring_getevents_arg {
- __u64 sigmask;
- __u32 sigmask_sz;
- __u32 pad;
- __u64 ts;
+};
struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -216,7 +340,11 @@ struct io_ring_ctx { * array. */ u32 *sq_array;
struct io_uring_sqe *sq_sqes;
Nit: not sure adding a new line helps; it feels clearer to keep all the sq_* stuff in one block.
union {
struct compat_io_uring_sqe *sq_sqes_compat;
struct io_uring_sqe *sq_sqes;
};
unsigned cached_sq_head;
unsigned sq_entries;
@@ -271,7 +399,10 @@ struct io_ring_ctx { * produced, so the application is allowed to modify pending * entries. */
struct io_uring_cqe *cqes;
union {
struct compat_io_uring_cqe *cqes_compat;
struct io_uring_cqe *cqes;
};
/* * We cache a range of free CQEs we can use, once exhausted it @@ -581,7 +712,10 @@ struct io_kiocb { struct io_overflow_cqe { struct list_head list;
- struct io_uring_cqe cqe;
- union {
struct compat_io_uring_cqe compat_cqe;
struct io_uring_cqe cqe;
- };
}; #endif diff --git a/io_uring/cancel.c b/io_uring/cancel.c index 2291a53cdabd1..20cb8634e44af 100644 --- a/io_uring/cancel.c +++ b/io_uring/cancel.c @@ -27,6 +27,42 @@ struct io_cancel { #define CANCEL_FLAGS (IORING_ASYNC_CANCEL_ALL | IORING_ASYNC_CANCEL_FD | \ IORING_ASYNC_CANCEL_ANY | IORING_ASYNC_CANCEL_FD_FIXED) +struct compat_io_uring_sync_cancel_reg {
Shouldn't that struct be with its friends in io_uring_types.h? Same question regarding struct compat_io_uring_buf_reg.
- __u64 addr;
- __s32 fd;
- __u32 flags;
- struct __kernel_timespec timeout;
- __u64 pad[4];
+};
+#ifdef CONFIG_COMPAT64
The #ifdef'ing makes me think: should we make it more systematic? Right now it looks like only the compat copy helpers and calls to them are #ifdef'd, but arguably it doesn't make sense to #ifdef that and not things like the size selection (e.g. in rings_size()), as we are effectively assuming that the compat and native structs are identical outside compat64. Maybe a new helper that returns IS_ENABLED(CONFIG_COMPAT64) && ctx->compat would help? In that case the copy helpers couldn't be #ifdef'd any longer, but it may be better this way.
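For instance (helper name made up):

static inline bool io_in_compat64(struct io_ring_ctx *ctx)
{
	return IS_ENABLED(CONFIG_COMPAT64) && ctx->compat;
}

rings_size(), __io_get_cqe() and friends could then all test io_in_compat64(ctx), and the compiler would still throw away the compat paths when CONFIG_COMPAT64 is not selected.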
+static int get_compat_io_uring_sync_cancel_reg(struct io_uring_sync_cancel_reg *sc,
const void __user *user_sc)
+{
- struct compat_io_uring_sync_cancel_reg compat_sc;
- if (unlikely(copy_from_user(&compat_sc, user_sc, sizeof(compat_sc))))
return -EFAULT;
- sc->addr = compat_sc.addr;
- sc->fd = compat_sc.fd;
- sc->flags = compat_sc.flags;
- sc->timeout = compat_sc.timeout;
- memcpy(sc->pad, compat_sc.pad, sizeof(sc->pad));
- return 0;
+} +#endif
+static int copy_io_uring_sync_cancel_reg_from_user(struct io_ring_ctx *ctx,
struct io_uring_sync_cancel_reg *sc,
const void __user *arg)
+{ +#ifdef CONFIG_COMPAT64
- if (ctx->compat)
return get_compat_io_uring_sync_cancel_reg(sc, arg);
+#endif
- return copy_from_user(sc, arg, sizeof(*sc));
+}
static bool io_cancel_cb(struct io_wq_work *work, void *data) { struct io_kiocb *req = container_of(work, struct io_kiocb, work); @@ -243,7 +279,7 @@ int io_sync_cancel(struct io_ring_ctx *ctx, void __user *arg) DEFINE_WAIT(wait); int ret;
- if (copy_from_user(&sc, arg, sizeof(sc)))
- if (copy_io_uring_sync_cancel_reg_from_user(ctx, &sc, arg))
	return -EFAULT;
if (sc.flags & ~CANCEL_FLAGS)
	return -EINVAL;
diff --git a/io_uring/epoll.c b/io_uring/epoll.c index 9aa74d2c80bc4..d5580ff465c3e 100644 --- a/io_uring/epoll.c +++ b/io_uring/epoll.c @@ -40,7 +40,7 @@ int io_epoll_ctl_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) struct epoll_event __user *ev; ev = u64_to_user_ptr(READ_ONCE(sqe->addr));
if (copy_from_user(&epoll->event, ev, sizeof(*ev)))
if (copy_epoll_event_from_user(&epoll->event, ev, req->ctx->compat))
	return -EFAULT;
}
diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index bc8c9d764bc13..c5bd669081c98 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -89,12 +89,25 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, for (i = 0; i < sq_entries; i++) { unsigned int entry = i + sq_head; struct io_uring_sqe *sqe;
unsigned int sq_idx;
unsigned int sq_idx, sq_off;
+#ifdef CONFIG_COMPAT64
struct io_uring_sqe *native_sqe = NULL;
+#endif sq_idx = READ_ONCE(ctx->sq_array[entry & sq_mask]); if (sq_idx > sq_mask) continue;
sqe = &ctx->sq_sqes[sq_idx << sq_shift];
sq_off = sq_idx << sq_shift;
sqe = ctx->compat ? (void *)&ctx->sq_sqes_compat[sq_off] : &ctx->sq_sqes[sq_off];
+#ifdef CONFIG_COMPAT64
if (ctx->compat) {
native_sqe = kmalloc(sizeof(struct io_uring_sqe) << sq_shift, GFP_KERNEL);
It's pretty suboptimal to do a kmalloc() / kfree() in a loop. An alternative would be to have a local struct io_uring_sqe native_sqe[2]; to account for the SQE128 case.
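i.e. roughly:

	struct io_uring_sqe native_sqe[2];	/* two entries to cover SQE128 */
	...
	if (ctx->compat) {
		convert_compat_io_uring_sqe(ctx, native_sqe,
					    (struct compat_io_uring_sqe *)sqe);
		sqe = native_sqe;
	}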
That said, for the printing code in this file, I wonder if we shouldn't skip the conversion altogether, since the structs are not passed to any other function. If we move each of the three printing blocks where conversion is needed into a macro, we can easily handle both the native and compat struct with the same code.
convert_compat_io_uring_sqe(ctx, native_sqe,
(struct compat_io_uring_sqe *)sqe);
sqe = native_sqe;
}
+#endif
- seq_printf(m, "%5u: opcode:%s, fd:%d, flags:%x, off:%llu, " "addr:0x%llx, rw_flags:0x%x, buf_index:%d " "user_data:%llu",
@@ -104,7 +117,8 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, sqe->buf_index, sqe->user_data); if (sq_shift) { u64 *sqeb = (void *) (sqe + 1);
int size = sizeof(struct io_uring_sqe) / sizeof(u64);
int size = (ctx->compat ? sizeof(struct compat_io_uring_sqe)
: sizeof(struct io_uring_sqe)) / sizeof(u64);
int j;
for (j = 0; j < size; j++) { @@ -114,12 +128,29 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, } } seq_printf(m, "\n"); +#ifdef CONFIG_COMPAT64
kfree(native_sqe);
+#endif } seq_printf(m, "CQEs:\t%u\n", cq_tail - cq_head); cq_entries = min(cq_tail - cq_head, ctx->cq_entries); for (i = 0; i < cq_entries; i++) { unsigned int entry = i + cq_head;
struct io_uring_cqe *cqe = &ctx->cqes[(entry & cq_mask) << cq_shift];
struct io_uring_cqe *cqe;
unsigned int cq_off = (entry & cq_mask) << cq_shift;
+#ifdef CONFIG_COMPAT64
struct io_uring_cqe *native_cqe = NULL;
+#endif
cqe = ctx->compat ? (void *)&ctx->cqes_compat[cq_off] : &ctx->cqes[cq_off];
+#ifdef CONFIG_COMPAT64
if (ctx->compat) {
native_cqe = kmalloc(sizeof(struct io_uring_cqe) << cq_shift, GFP_KERNEL);
convert_compat_io_uring_cqe(ctx, native_cqe,
(struct compat_io_uring_cqe *)cqe);
cqe = native_cqe;
}
+#endif seq_printf(m, "%5u: user_data:%llu, res:%d, flag:%x", entry & cq_mask, cqe->user_data, cqe->res, @@ -128,6 +159,9 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, seq_printf(m, ", extra1:%llu, extra2:%llu\n", cqe->big_cqe[0], cqe->big_cqe[1]); seq_printf(m, "\n"); +#ifdef CONFIG_COMPAT64
kfree(native_cqe);
+#endif } /* @@ -189,10 +223,23 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, spin_lock(&ctx->completion_lock); list_for_each_entry(ocqe, &ctx->cq_overflow_list, list) { struct io_uring_cqe *cqe = &ocqe->cqe; +#ifdef CONFIG_COMPAT64
struct io_uring_cqe *native_cqe = NULL;
if (ctx->compat) {
native_cqe = kmalloc(sizeof(struct io_uring_cqe) << cq_shift, GFP_KERNEL);
convert_compat_io_uring_cqe(ctx, native_cqe,
(struct compat_io_uring_cqe *)cqe);
cqe = native_cqe;
}
+#endif seq_printf(m, " user_data=%llu, res=%d, flags=%x\n", cqe->user_data, cqe->res, cqe->flags); +#ifdef CONFIG_COMPAT64
kfree(native_cqe);
+#endif } spin_unlock(&ctx->completion_lock); diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 707229ae04dc8..03506776af043 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -152,6 +152,43 @@ static void __io_submit_flush_completions(struct io_ring_ctx *ctx); static struct kmem_cache *req_cachep; +#ifdef CONFIG_COMPAT64 +static int get_compat_io_uring_getevents_arg(struct io_uring_getevents_arg *arg,
const void __user *user_arg)
+{
- struct compat_io_uring_getevents_arg compat_arg;
- if (unlikely(copy_from_user(&compat_arg, user_arg, sizeof(compat_arg))))
Not very important but curious: we only seem to be using unlikely(copy_from_user(...)) in the compat helpers; any reason for that?
return -EFAULT;
- arg->sigmask = (__kernel_uintptr_t)compat_ptr(compat_arg.sigmask);
Looks like a leftover conversion that belongs to patch 8 instead.
- arg->sigmask_sz = compat_arg.sigmask_sz;
- arg->pad = compat_arg.pad;
- arg->ts = (__kernel_uintptr_t)compat_ptr(compat_arg.ts);
- return 0;
+} +#endif /* CONFIG_COMPAT64 */
+static int copy_io_uring_getevents_arg_from_user(struct io_ring_ctx *ctx,
struct io_uring_getevents_arg *arg,
const void __user *argp,
size_t size)
+{ +#ifdef CONFIG_COMPAT64
- if (ctx->compat) {
if (size != sizeof(struct compat_io_uring_getevents_arg))
return -EINVAL;
if (get_compat_io_uring_getevents_arg(arg, argp))
return -EFAULT;
return 0;
Or simply return get_compat_io_uring_getevents_arg(arg, argp);
I think this pattern reappears in quite a few copy_*_from_user() helpers.
- }
+#endif
- if (size != sizeof(*arg))
return -EINVAL;
- if (copy_from_user(arg, argp, sizeof(*arg)))
return -EFAULT;
- return 0;
+}
struct sock *io_uring_get_socket(struct file *file) { #if defined(CONFIG_UNIX) @@ -604,7 +641,9 @@ void io_cq_unlock_post(struct io_ring_ctx *ctx) static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) { bool all_flushed;
- size_t cqe_size = sizeof(struct io_uring_cqe);
- size_t cqe_size = ctx->compat ?
sizeof(struct compat_io_uring_cqe) :
sizeof(struct io_uring_cqe);
I'm not sure this is enough to get the overflow machinery to work in compat64. struct io_overflow_cqe gets filled in io_cqring_event_overflow() but AFAICT that code is not compat-aware, so it always initialises the native cqe and not the compat one. I suppose we could make use of __io_fill_cqe_any() in io_cqring_event_overflow()?
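Concretely, io_cqring_event_overflow() could fill the (now union) ocqe->cqe through the helper instead of assigning the native members directly, something like:

	__io_fill_cqe_any(ctx, &ocqe->cqe, user_data, res, cflags,
			  extra1, extra2);

(the WRITE_ONCE()s in the helper would be superfluous there, but harmless).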
This makes me wonder: are there tests that trigger this overflow mechanism? Issues there might easily go unnoticed otherwise.
if (!force && __io_cqring_events(ctx) == ctx->cq_entries) return false; @@ -741,10 +780,11 @@ bool io_req_cqe_overflow(struct io_kiocb *req)
- control dependency is enough as we're using WRITE_ONCE to
- fill the cq entry
*/ -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) +void *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow)
I certainly understand the rationale behind changing the return value to void *, as it is now either struct io_uring_cqe * or struct compat_io_uring_cqe *. That said, I'm not convinced this change makes a lot of sense as it stands, because all users of *io_get_cqe* still interpret the return value as struct io_uring_cqe * (which works, since void * is implicitly convertible to any pointer type, not too conveniently here...).
My feeling is that we should really choose one way or the other. Either make such pointers void * everywhere, or maybe better, a pointer to a union, to stop them from being implicitly convertible. Or just keep struct io_uring_cqe * everywhere, which is quite common when dealing with compat structs. I don't really have a preference as both approaches have merits, up to you :)
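For the union option, an explicit "either" type would stop the implicit conversions, e.g. (hypothetical):

union io_uring_cqe_any {
	struct io_uring_cqe		native;
	struct compat_io_uring_cqe	compat;
};

union io_uring_cqe_any *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow);

Callers would then have to spell out which layout they mean via cqe->native or cqe->compat.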
{ unsigned int off = ctx->cached_cq_tail & (ctx->cq_entries - 1); unsigned int free, queued, len;
- void *cqe;
/* * Posting into the CQ when there are pending overflowed CQEs may break @@ -767,14 +807,15 @@ struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow) len <<= 1; }
- ctx->cqe_cached = &ctx->cqes[off];
- cqe = ctx->compat ? (void *)&ctx->cqes_compat[off] : (void *)&ctx->cqes[off];
- ctx->cqe_cached = cqe;
Don't we have a problem with the whole cqe_cached / cqe_sentinel logic? It assumes that it is manipulating pointers to struct io_uring_cqe, but in fact these are pointers to struct compat_io_uring_cqe in compat64, aren't they? This appeared in 5.19 [1] so you may not have noticed it when rebasing on 6.1.
[1] https://lore.kernel.org/all/487eeef00f3146537b3d9c1a9cef2fc0b9a86f81.1649771...
ctx->cqe_sentinel = ctx->cqe_cached + len; ctx->cached_cq_tail++; ctx->cqe_cached++; if (ctx->flags & IORING_SETUP_CQE32) ctx->cqe_cached++;
- return &ctx->cqes[off];
- return cqe;
} bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags, @@ -793,14 +834,7 @@ bool io_fill_cqe_aux(struct io_ring_ctx *ctx, u64 user_data, s32 res, u32 cflags if (likely(cqe)) { trace_io_uring_complete(ctx, NULL, user_data, res, cflags, 0, 0);
WRITE_ONCE(cqe->user_data, user_data);
WRITE_ONCE(cqe->res, res);
WRITE_ONCE(cqe->flags, cflags);
if (ctx->flags & IORING_SETUP_CQE32) {
WRITE_ONCE(cqe->big_cqe[0], 0);
WRITE_ONCE(cqe->big_cqe[1], 0);
}
__io_fill_cqe_any(ctx, cqe, user_data, res, cflags, 0, 0);
return true;
}
@@ -2222,7 +2256,7 @@ static void io_commit_sqring(struct io_ring_ctx *ctx)
- used, it's important that those reads are done through READ_ONCE() to
- prevent a re-load down the line.
*/ -static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) +static const void *io_get_sqe(struct io_ring_ctx *ctx) { unsigned head, mask = ctx->sq_entries - 1; unsigned sq_idx = ctx->cached_sq_head++ & mask; @@ -2240,7 +2274,8 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) /* double index for 128-byte SQEs, twice as long */ if (ctx->flags & IORING_SETUP_SQE128) head <<= 1;
return &ctx->sq_sqes[head];
return ctx->compat ? (void *)&ctx->sq_sqes_compat[head]
		   : (void *)&ctx->sq_sqes[head];
}
/* drop invalid entries */ @@ -2265,8 +2300,11 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) io_submit_state_start(&ctx->submit_state, left); do {
const struct io_uring_sqe *sqe;
const void *sqe;
struct io_kiocb *req;
+#ifdef CONFIG_COMPAT64
struct io_uring_sqe native_sqe;
It would seem that this will result in a buffer overflow in the IORING_OP_URING_CMD + IORING_SETUP_SQE128 case, since no space is allocated for the extra data. Maybe the best approach is to allocate an array of two SQEs to cover this situation.
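To spell out the overflow: with SQE128, convert_compat_io_uring_sqe() copies compat_uring_cmd_pdu_size(true) = 2 * 64 - 48 = 80 bytes into sqe->cmd, while a lone struct io_uring_sqe only has 64 - 48 = 16 bytes left at that offset. So for instance:

	struct io_uring_sqe native_sqe[2];	/* 128 bytes, room for SQE128 */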
+#endif if (unlikely(!io_alloc_req_refill(ctx))) break; @@ -2276,6 +2314,12 @@ int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) io_req_add_to_cache(req, ctx); break; } +#ifdef CONFIG_COMPAT64
if (ctx->compat) {
convert_compat_io_uring_sqe(ctx, &native_sqe, sqe);
sqe = &native_sqe;
}
+#endif /* * Continue submitting even for sqe failure if the @@ -2480,6 +2524,9 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries { struct io_rings *rings; size_t off, cq_array_size, sq_array_size;
- size_t cqe_size = ctx->compat ?
sizeof(struct compat_io_uring_cqe) :
sizeof(struct io_uring_cqe);
off = sizeof(*rings); @@ -2492,7 +2539,7 @@ static unsigned long rings_size(struct io_ring_ctx *ctx, unsigned int sq_entries if (cq_offset) *cq_offset = off;
- cq_array_size = array_size(sizeof(struct io_uring_cqe), cq_entries);
- cq_array_size = array_size(cqe_size, cq_entries); if (cq_array_size == SIZE_MAX) return SIZE_MAX;
@@ -3120,20 +3167,22 @@ static unsigned long io_uring_nommu_get_unmapped_area(struct file *file, #endif /* !CONFIG_MMU */ -static int io_validate_ext_arg(unsigned flags, const void __user *argp, size_t argsz) +static int io_validate_ext_arg(struct io_ring_ctx *ctx, unsigned int flags,
- const void __user *argp, size_t argsz)
It looks like overflow lines in prototypes are generally aligned on the opening parenthesis in this file, not just indented by a tab.
{ if (flags & IORING_ENTER_EXT_ARG) { struct io_uring_getevents_arg arg;
int ret;
if (argsz != sizeof(arg))
return -EINVAL;
if (copy_from_user(&arg, argp, sizeof(arg)))
return -EFAULT;
ret = copy_io_uring_getevents_arg_from_user(ctx, &arg, argp, argsz);
if (ret)
return ret;
Or simply return copy_io_uring_getevents_arg_from_user(...);
} return 0; } -static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz, +static int io_get_ext_arg(struct io_ring_ctx *ctx, unsigned int flags,
- const void __user *argp, size_t *argsz,
#ifdef CONFIG_CHERI_PURECAP_UABI struct __kernel_timespec * __capability *ts, const sigset_t * __capability *sig) @@ -3143,6 +3192,7 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz #endif { struct io_uring_getevents_arg arg;
- int ret;
/* * If EXT_ARG isn't set, then we have no timespec and the argp pointer @@ -3158,10 +3208,9 @@ static int io_get_ext_arg(unsigned flags, const void __user *argp, size_t *argsz * EXT_ARG is set - ensure we agree on the size of it and copy in our * timespec and sigset_t pointers if good. */
- if (*argsz != sizeof(arg))
return -EINVAL;
- if (copy_from_user(&arg, argp, sizeof(arg)))
return -EFAULT;
- ret = copy_io_uring_getevents_arg_from_user(ctx, &arg, argp, *argsz);
- if (ret)
	return ret;
if (arg.pad)
	return -EINVAL;
*sig = u64_to_user_ptr(arg.sigmask);
@@ -3268,7 +3317,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, */ mutex_lock(&ctx->uring_lock); iopoll_locked:
ret2 = io_validate_ext_arg(flags, argp, argsz);
ret2 = io_validate_ext_arg(ctx, flags, argp, argsz); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries);
@@ -3279,7 +3328,7 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, const sigset_t __user *sig; struct __kernel_timespec __user *ts;
ret2 = io_get_ext_arg(flags, argp, &argsz, &ts, &sig);
ret2 = io_get_ext_arg(ctx, flags, argp, &argsz, &ts, &sig); if (likely(!ret2)) { min_complete = min(min_complete, ctx->cq_entries);
@@ -3329,6 +3378,9 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, { struct io_rings *rings; size_t size, cqes_offset, sq_array_offset;
- size_t sqe_size = ctx->compat ?
sizeof(struct compat_io_uring_sqe) :
sizeof(struct io_uring_sqe);
/* make sure these are sane, as we already accounted them */ ctx->sq_entries = p->sq_entries; @@ -3351,9 +3403,9 @@ static __cold int io_allocate_scq_urings(struct io_ring_ctx *ctx, rings->cq_ring_entries = p->cq_entries; if (p->flags & IORING_SETUP_SQE128)
size = array_size(2 * sizeof(struct io_uring_sqe), p->sq_entries);
	size = array_size(2 * sqe_size, p->sq_entries);
else
size = array_size(sizeof(struct io_uring_sqe), p->sq_entries);
	size = array_size(sqe_size, p->sq_entries);
if (size == SIZE_MAX) {
	io_mem_free(ctx->rings);
	ctx->rings = NULL;
@@ -4107,48 +4159,45 @@ static int __init io_uring_init(void) #define BUILD_BUG_SQE_ELEM_SIZE(eoffset, esize, ename) \ __BUILD_BUG_VERIFY_OFFSET_SIZE(struct io_uring_sqe, eoffset, esize, ename) BUILD_BUG_ON(sizeof(struct io_uring_sqe) != 64);
- BUILD_BUG_SQE_ELEM(0, __u8, opcode);
- BUILD_BUG_SQE_ELEM(1, __u8, flags);
- BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
- BUILD_BUG_SQE_ELEM(4, __s32, fd);
- BUILD_BUG_SQE_ELEM(8, __u64, off);
- BUILD_BUG_SQE_ELEM(8, __u64, addr2);
- BUILD_BUG_SQE_ELEM(8, __u32, cmd_op);
- BUILD_BUG_SQE_ELEM(0, __u8, opcode);
- BUILD_BUG_SQE_ELEM(1, __u8, flags);
- BUILD_BUG_SQE_ELEM(2, __u16, ioprio);
- BUILD_BUG_SQE_ELEM(4, __s32, fd);
- BUILD_BUG_SQE_ELEM(8, __u64, off);
- BUILD_BUG_SQE_ELEM(8, __u64, addr2);
- BUILD_BUG_SQE_ELEM(8, __u32, cmd_op); BUILD_BUG_SQE_ELEM(12, __u32, __pad1);
- BUILD_BUG_SQE_ELEM(16, __u64, addr);
- BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
- BUILD_BUG_SQE_ELEM(24, __u32, len);
- BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
- BUILD_BUG_SQE_ELEM(28, /* compat */ int, rw_flags);
- BUILD_BUG_SQE_ELEM(28, /* compat */ __u32, rw_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
- BUILD_BUG_SQE_ELEM(28, /* compat */ __u16, poll_events);
- BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
- BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
- BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, rename_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, unlink_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, hardlink_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, xattr_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, msg_ring_flags);
- BUILD_BUG_SQE_ELEM(32, __u64, user_data);
- BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
- BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
- BUILD_BUG_SQE_ELEM(42, __u16, personality);
- BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
- BUILD_BUG_SQE_ELEM(44, __u32, file_index);
- BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
- BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
- BUILD_BUG_SQE_ELEM(48, __u64, addr3);
- BUILD_BUG_SQE_ELEM(16, __u64, addr);
- BUILD_BUG_SQE_ELEM(16, __u64, splice_off_in);
- BUILD_BUG_SQE_ELEM(24, __u32, len);
- BUILD_BUG_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, fsync_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, poll32_events);
- BUILD_BUG_SQE_ELEM(28, __u32, sync_range_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, msg_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, timeout_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, accept_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, cancel_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, open_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, statx_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, fadvise_advice);
- BUILD_BUG_SQE_ELEM(28, __u32, splice_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, rename_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, unlink_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, hardlink_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, xattr_flags);
- BUILD_BUG_SQE_ELEM(28, __u32, msg_ring_flags);
- BUILD_BUG_SQE_ELEM(32, __u64, user_data);
- BUILD_BUG_SQE_ELEM(40, __u16, buf_index);
- BUILD_BUG_SQE_ELEM(40, __u16, buf_group);
- BUILD_BUG_SQE_ELEM(42, __u16, personality);
- BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
- BUILD_BUG_SQE_ELEM(44, __u32, file_index);
- BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
- BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
- BUILD_BUG_SQE_ELEM(48, __u64, addr3); BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
- BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
- BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
BUILD_BUG_ON(sizeof(struct io_uring_files_update) != sizeof(struct io_uring_rsrc_update)); @@ -4160,6 +4209,65 @@ static int __init io_uring_init(void) BUILD_BUG_ON(offsetof(struct io_uring_buf, resv) != offsetof(struct io_uring_buf_ring, tail)); +#ifdef CONFIG_COMPAT64 +#define BUILD_BUG_COMPAT_SQE_ELEM(eoffset, etype, ename) \
- __BUILD_BUG_VERIFY_OFFSET_SIZE(struct compat_io_uring_sqe, eoffset, sizeof(etype), ename)
+#define BUILD_BUG_COMPAT_SQE_ELEM_SIZE(eoffset, esize, ename) \
- __BUILD_BUG_VERIFY_OFFSET_SIZE(struct compat_io_uring_sqe, eoffset, esize, ename)
- BUILD_BUG_ON(sizeof(struct compat_io_uring_sqe) != 64);
- BUILD_BUG_COMPAT_SQE_ELEM(0, __u8, opcode);
- BUILD_BUG_COMPAT_SQE_ELEM(1, __u8, flags);
- BUILD_BUG_COMPAT_SQE_ELEM(2, __u16, ioprio);
- BUILD_BUG_COMPAT_SQE_ELEM(4, __s32, fd);
- BUILD_BUG_COMPAT_SQE_ELEM(8, __u64, off);
- BUILD_BUG_COMPAT_SQE_ELEM(8, __u64, addr2);
- BUILD_BUG_COMPAT_SQE_ELEM(8, __u32, cmd_op);
- BUILD_BUG_COMPAT_SQE_ELEM(12, __u32, __pad1);
- BUILD_BUG_COMPAT_SQE_ELEM(16, __u64, addr);
- BUILD_BUG_COMPAT_SQE_ELEM(16, __u64, splice_off_in);
- BUILD_BUG_COMPAT_SQE_ELEM(24, __u32, len);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __kernel_rwf_t, rw_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, /* compat */ int, rw_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, /* compat */ __u32, rw_flags);
I'm a bit confused by these lines, as they are removed from the native checks above but preserved here? I don't think "compat" has much to do with compat64 here; in the case of poll_events below, it seems to be about a backwards-compatible ABI change (see [1]).
I also expected the changes in whitespace above to be reflected here.
[1] https://lore.kernel.org/all/1592387636-80105-2-git-send-email-jiufei.xue@lin...
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, fsync_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, /* compat */ __u16, poll_events);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, poll32_events);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, sync_range_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, msg_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, timeout_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, accept_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, cancel_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, open_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, statx_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, fadvise_advice);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, splice_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, rename_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, unlink_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, hardlink_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, xattr_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(28, __u32, msg_ring_flags);
- BUILD_BUG_COMPAT_SQE_ELEM(32, __u64, user_data);
- BUILD_BUG_COMPAT_SQE_ELEM(40, __u16, buf_index);
- BUILD_BUG_COMPAT_SQE_ELEM(40, __u16, buf_group);
- BUILD_BUG_COMPAT_SQE_ELEM(42, __u16, personality);
- BUILD_BUG_COMPAT_SQE_ELEM(44, __s32, splice_fd_in);
- BUILD_BUG_COMPAT_SQE_ELEM(44, __u32, file_index);
- BUILD_BUG_COMPAT_SQE_ELEM(44, __u16, addr_len);
- BUILD_BUG_COMPAT_SQE_ELEM(46, __u16, __pad3[0]);
- BUILD_BUG_COMPAT_SQE_ELEM(48, __u64, addr3);
- BUILD_BUG_COMPAT_SQE_ELEM_SIZE(48, 0, cmd);
- BUILD_BUG_COMPAT_SQE_ELEM(56, __u64, __pad2);
- BUILD_BUG_ON(sizeof(struct compat_io_uring_files_update) !=
sizeof(struct compat_io_uring_rsrc_update));
- BUILD_BUG_ON(sizeof(struct compat_io_uring_rsrc_update) >
sizeof(struct compat_io_uring_rsrc_update2));
- BUILD_BUG_ON(offsetof(struct compat_io_uring_buf_ring, bufs) != 0);
- BUILD_BUG_ON(offsetof(struct compat_io_uring_buf, resv) !=
offsetof(struct compat_io_uring_buf_ring, tail));
+#endif /* CONFIG_COMPAT64 */
- /* should fit into one byte */ BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8)); BUILD_BUG_ON(SQE_COMMON_FLAGS >= (1 << 8));
diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index 50bc3af449534..fb2711770bfb0 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -5,6 +5,7 @@ #include <linux/lockdep.h> #include <linux/io_uring_types.h> #include "io-wq.h" +#include "uring_cmd.h" #include "slist.h" #include "filetable.h" @@ -24,7 +25,7 @@ enum { IOU_STOP_MULTISHOT = -ECANCELED, }; -struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); +void *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); bool io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); @@ -93,8 +94,67 @@ static inline void io_cq_lock(struct io_ring_ctx *ctx) void io_cq_unlock_post(struct io_ring_ctx *ctx); -static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx,
bool overflow)
+#ifdef CONFIG_COMPAT64 +static inline void convert_compat_io_uring_cqe(struct io_ring_ctx *ctx,
struct io_uring_cqe *cqe,
const struct compat_io_uring_cqe *compat_cqe)
+{
- cqe->user_data = READ_ONCE(compat_cqe->user_data);
- cqe->res = READ_ONCE(compat_cqe->res);
- cqe->flags = READ_ONCE(compat_cqe->flags);
- if (ctx->flags & IORING_SETUP_CQE32) {
cqe->big_cqe[0] = READ_ONCE(compat_cqe->big_cqe[0]);
cqe->big_cqe[1] = READ_ONCE(compat_cqe->big_cqe[1]);
- }
+}
+static inline void convert_compat_io_uring_sqe(struct io_ring_ctx *ctx,
struct io_uring_sqe *sqe,
const struct compat_io_uring_sqe *compat_sqe)
+{ +/*
- The struct io_uring_sqe contains anonymous unions and there is no field
- keeping track of which union member is active. Because in all cases,
- the unions are between integral types and the types are compatible, use the
- largest member of each union to perform the copy. Use this compile-time check
- to ensure that the union's members are not truncated during the conversion.
- */
+#define BUILD_BUG_COMPAT_SQE_UNION_ELEM(elem1, elem2) \
- BUILD_BUG_ON(sizeof_field(struct compat_io_uring_sqe, elem1) != \
(offsetof(struct compat_io_uring_sqe, elem2) - \
offsetof(struct compat_io_uring_sqe, elem1)))
- sqe->opcode = READ_ONCE(compat_sqe->opcode);
- sqe->flags = READ_ONCE(compat_sqe->flags);
- sqe->ioprio = READ_ONCE(compat_sqe->ioprio);
- sqe->fd = READ_ONCE(compat_sqe->fd);
- BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr2, addr);
- sqe->addr2 = READ_ONCE(compat_sqe->addr2);
- BUILD_BUG_COMPAT_SQE_UNION_ELEM(addr, len);
- sqe->addr = READ_ONCE(compat_sqe->addr);
- sqe->len = READ_ONCE(compat_sqe->len);
- BUILD_BUG_COMPAT_SQE_UNION_ELEM(rw_flags, user_data);
- sqe->rw_flags = READ_ONCE(compat_sqe->rw_flags);
- sqe->user_data = READ_ONCE(compat_sqe->user_data);
- BUILD_BUG_COMPAT_SQE_UNION_ELEM(buf_index, personality);
- sqe->buf_index = READ_ONCE(compat_sqe->buf_index);
- sqe->personality = READ_ONCE(compat_sqe->personality);
- BUILD_BUG_COMPAT_SQE_UNION_ELEM(splice_fd_in, addr3);
- sqe->splice_fd_in = READ_ONCE(compat_sqe->splice_fd_in);
- if (sqe->opcode == IORING_OP_URING_CMD) {
size_t compat_cmd_size = compat_uring_cmd_pdu_size(ctx->flags &
IORING_SETUP_SQE128);
memcpy(sqe->cmd, compat_sqe->cmd, compat_cmd_size);
- } else
sqe->addr3 = READ_ONCE(compat_sqe->addr3);
It would be better to copy __pad2 too, as some net code does check that it is zero.
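i.e. something like:

	} else {
		sqe->addr3 = READ_ONCE(compat_sqe->addr3);
		sqe->__pad2[0] = READ_ONCE(compat_sqe->__pad2[0]);
	}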
+#undef BUILD_BUG_COMPAT_SQE_UNION_ELEM +} +#endif /* CONFIG_COMPAT64 */
+static inline void *io_get_cqe_overflow(struct io_ring_ctx *ctx,
bool overflow)
{ if (likely(ctx->cqe_cached < ctx->cqe_sentinel)) { struct io_uring_cqe *cqe = ctx->cqe_cached; @@ -109,15 +169,46 @@ static inline struct io_uring_cqe *io_get_cqe_overflow(struct io_ring_ctx *ctx, return __io_get_cqe(ctx, overflow); } -static inline struct io_uring_cqe *io_get_cqe(struct io_ring_ctx *ctx) +static inline void *io_get_cqe(struct io_ring_ctx *ctx) { return io_get_cqe_overflow(ctx, false); } +static inline void __io_fill_cqe_any(struct io_ring_ctx *ctx, struct io_uring_cqe *cqe,
Nit: not sure the "any" suffix really helps here, __io_fill_cqe() is probably enough.
u64 user_data, s32 res, u32 cflags,
u64 extra1, u64 extra2)
+{ +#ifdef CONFIG_COMPAT64
- if (ctx->compat) {
struct compat_io_uring_cqe *compat_cqe = (struct compat_io_uring_cqe *)cqe;
WRITE_ONCE(compat_cqe->user_data, user_data);
WRITE_ONCE(compat_cqe->res, res);
WRITE_ONCE(compat_cqe->flags, cflags);
if (ctx->flags & IORING_SETUP_CQE32) {
WRITE_ONCE(compat_cqe->big_cqe[0], extra1);
WRITE_ONCE(compat_cqe->big_cqe[1], extra2);
}
return;
- }
+#endif
- WRITE_ONCE(cqe->user_data, user_data);
- WRITE_ONCE(cqe->res, res);
- WRITE_ONCE(cqe->flags, cflags);
- if (ctx->flags & IORING_SETUP_CQE32) {
WRITE_ONCE(cqe->big_cqe[0], extra1);
WRITE_ONCE(cqe->big_cqe[1], extra2);
- }
+}
static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, struct io_kiocb *req) { struct io_uring_cqe *cqe;
- u64 extra1 = 0;
- u64 extra2 = 0;
/* * If we can't get a cq entry, userspace overflowed the @@ -128,24 +219,17 @@ static inline bool __io_fill_cqe_req(struct io_ring_ctx *ctx, if (unlikely(!cqe)) return io_req_cqe_overflow(req);
- if (ctx->flags & IORING_SETUP_CQE32 && req->flags & REQ_F_CQE32_INIT) {
extra1 = req->extra1;
extra2 = req->extra2;
- }
- trace_io_uring_complete(req->ctx, req, req->cqe.user_data, req->cqe.res, req->cqe.flags,
(req->flags & REQ_F_CQE32_INIT) ? req->extra1 : 0,
(req->flags & REQ_F_CQE32_INIT) ? req->extra2 : 0);
extra1, extra2);
- memcpy(cqe, &req->cqe, sizeof(*cqe));
- if (ctx->flags & IORING_SETUP_CQE32) {
u64 extra1 = 0, extra2 = 0;
if (req->flags & REQ_F_CQE32_INIT) {
extra1 = req->extra1;
extra2 = req->extra2;
}
WRITE_ONCE(cqe->big_cqe[0], extra1);
WRITE_ONCE(cqe->big_cqe[1], extra2);
- }
- __io_fill_cqe_any(ctx, cqe, req->cqe.user_data, req->cqe.res,
		  req->cqe.flags, extra1, extra2);
return true;
} diff --git a/io_uring/kbuf.c b/io_uring/kbuf.c index e2c46889d5fab..9f840ba93c888 100644 --- a/io_uring/kbuf.c +++ b/io_uring/kbuf.c @@ -16,6 +16,9 @@ #include "kbuf.h" #define IO_BUFFER_LIST_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct io_uring_buf)) +#ifdef CONFIG_COMPAT64 +#define IO_BUFFER_LIST_COMPAT_BUF_PER_PAGE (PAGE_SIZE / sizeof(struct compat_io_uring_buf)) +#endif #define BGID_ARRAY 64 @@ -28,6 +31,42 @@ struct io_provide_buf { __u16 bid; }; +#ifdef CONFIG_COMPAT64 +struct compat_io_uring_buf_reg {
- __u64 ring_addr;
- __u32 ring_entries;
- __u16 bgid;
- __u16 pad;
- __u64 resv[3];
+};
+static int get_compat_io_uring_buf_reg(struct io_uring_buf_reg *reg,
const void __user *user_reg)
+{
- struct compat_io_uring_buf_reg compat_reg;
- if (unlikely(copy_from_user(&compat_reg, user_reg, sizeof(compat_reg))))
return -EFAULT;
- reg->ring_addr = compat_reg.ring_addr;
- reg->ring_entries = compat_reg.ring_entries;
- reg->bgid = compat_reg.bgid;
- reg->pad = compat_reg.pad;
- memcpy(reg->resv, compat_reg.resv, sizeof(reg->resv));
- return 0;
+} +#endif
+static int copy_io_uring_buf_reg_from_user(struct io_ring_ctx *ctx,
struct io_uring_buf_reg *reg,
const void __user *arg)
+{ +#ifdef CONFIG_COMPAT64
- if (ctx->compat)
return get_compat_io_uring_buf_reg(reg, arg);
+#endif
- return copy_from_user(reg, arg, sizeof(*reg));
+}
static inline struct io_buffer_list *io_buffer_get_list(struct io_ring_ctx *ctx, unsigned int bgid) { @@ -125,6 +164,41 @@ static void __user *io_provided_buffer_select(struct io_kiocb *req, size_t *len, return NULL; } +#ifdef CONFIG_COMPAT64 +static void __user *io_ring_buffer_select_compat(struct io_kiocb *req, size_t *len,
struct io_buffer_list *bl,
unsigned int issue_flags)
+{
- struct compat_io_uring_buf_ring *br = bl->buf_ring_compat;
- struct compat_io_uring_buf *buf;
- __u16 head = bl->head;
- if (unlikely(smp_load_acquire(&br->tail) == head))
return NULL;
- head &= bl->mask;
- if (head < IO_BUFFER_LIST_COMPAT_BUF_PER_PAGE) {
buf = &br->bufs[head];
- } else {
int off = head & (IO_BUFFER_LIST_COMPAT_BUF_PER_PAGE - 1);
int index = head / IO_BUFFER_LIST_COMPAT_BUF_PER_PAGE;
buf = page_address(bl->buf_pages[index]);
buf += off;
- }
- if (*len == 0 || *len > buf->len)
*len = buf->len;
- req->flags |= REQ_F_BUFFER_RING;
- req->buf_list = bl;
- req->buf_index = buf->bid;
- if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
req->buf_list = NULL;
bl->head++;
- }
I agree duplicating that function is the only reasonable option here. It looks like this block could be moved to the common function though, reducing the duplication a bit.
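Something along these lines, assuming a hypothetical io_ring_buffer_peek() that each variant implements to extract addr/len/bid from its own ring layout:

static void __user *io_ring_buffer_select_any(struct io_kiocb *req, size_t *len,
					      struct io_buffer_list *bl,
					      unsigned int issue_flags)
{
	__u64 addr;
	__u32 buf_len;
	__u16 bid;

	if (!io_ring_buffer_peek(req, bl, &addr, &buf_len, &bid))
		return NULL;

	/* shared tail, moved out of both variants */
	if (*len == 0 || *len > buf_len)
		*len = buf_len;
	req->flags |= REQ_F_BUFFER_RING;
	req->buf_list = bl;
	req->buf_index = bid;

	if (issue_flags & IO_URING_F_UNLOCKED || !file_can_poll(req->file)) {
		req->buf_list = NULL;
		bl->head++;
	}

	return req->ctx->compat ? compat_ptr(addr) : u64_to_user_ptr(addr);
}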
- return compat_ptr(buf->addr);
+} +#endif /* CONFIG_COMPAT64 */
static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, struct io_buffer_list *bl, unsigned int issue_flags) @@ -168,6 +242,17 @@ static void __user *io_ring_buffer_select(struct io_kiocb *req, size_t *len, return u64_to_user_ptr(buf->addr); } +static void __user *io_ring_buffer_select_any(struct io_kiocb *req, size_t *len,
struct io_buffer_list *bl,
unsigned int issue_flags)
+{ +#ifdef CONFIG_COMPAT64
- if (req->ctx->compat)
return io_ring_buffer_select_compat(req, len, bl, issue_flags);
+#endif
- return io_ring_buffer_select(req, len, bl, issue_flags);
+}
void __user *io_buffer_select(struct io_kiocb *req, size_t *len, unsigned int issue_flags) { @@ -180,7 +265,7 @@ void __user *io_buffer_select(struct io_kiocb *req, size_t *len, bl = io_buffer_get_list(ctx, req->buf_index); if (likely(bl)) { if (bl->buf_nr_pages)
ret = io_ring_buffer_select(req, len, bl, issue_flags);
	ret = io_ring_buffer_select_any(req, len, bl, issue_flags);
else
	ret = io_provided_buffer_select(req, len, bl);
}
@@ -215,9 +300,12 @@ static int __io_remove_buffers(struct io_ring_ctx *ctx, return 0; if (bl->buf_nr_pages) {
__u16 tail = ctx->compat ?
bl->buf_ring_compat->tail :
	bl->buf_ring->tail;
int j;
i = bl->buf_ring->tail - bl->head;
	i = tail - bl->head;
for (j = 0; j < bl->buf_nr_pages; j++)
	unpin_user_page(bl->buf_pages[j]);
kvfree(bl->buf_pages);
@@ -469,13 +557,13 @@ int io_provide_buffers(struct io_kiocb *req, unsigned int issue_flags) int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) {
- struct io_uring_buf_ring *br; struct io_uring_buf_reg reg; struct io_buffer_list *bl, *free_bl = NULL; struct page **pages;
- size_t pages_size; int nr_pages;
- if (copy_from_user(&reg, arg, sizeof(reg)))
- if (copy_io_uring_buf_reg_from_user(ctx, &reg, arg))
	return -EFAULT;
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2]) @@ -508,19 +596,19 @@ int io_register_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) return -ENOMEM; }
- pages = io_pin_pages(reg.ring_addr,
struct_size(br, bufs, reg.ring_entries),
AFAICT the new code is not quite equivalent to this line, even in the native case, because struct_size() is equivalent to sizeof(struct) + n * sizeof(flex_array_member). struct io_uring_buf_ring is a bit of a strange union, in the end its size is the size of struct io_uring_buf. IOW I think this line actually yields (n + 1) * sizeof(struct io_uring_buf).
I suspect this is not what was actually intended, and that your change fixes that off-by-one, though I may well be wrong as the way the union is used is not exactly trivial. If that's the case then it should be a separate patch, and it would be a good candidate for upstreaming.
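To make the arithmetic explicit, with n == reg.ring_entries:

	struct_size(br, bufs, n) == sizeof(*br) + n * sizeof(br->bufs[0])
	                         == (n + 1) * sizeof(struct io_uring_buf)

since sizeof(struct io_uring_buf_ring) == sizeof(struct io_uring_buf) for this union layout, whereas only n entries are ever dereferenced.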
&nr_pages);
- pages_size = ctx->compat ?
size_mul(sizeof(struct compat_io_uring_buf), reg.ring_entries) :
size_mul(sizeof(struct io_uring_buf), reg.ring_entries);
- pages = io_pin_pages(reg.ring_addr, pages_size, &nr_pages);
if (IS_ERR(pages)) {
	kfree(free_bl);
	return PTR_ERR(pages);
}
- br = page_address(pages[0]); bl->buf_pages = pages; bl->buf_nr_pages = nr_pages; bl->nr_entries = reg.ring_entries;
- bl->buf_ring = br;
- bl->buf_ring = page_address(pages[0]);
bl->mask = reg.ring_entries - 1;
io_buffer_add_list(ctx, bl, reg.bgid);
return 0;
@@ -531,7 +619,7 @@ int io_unregister_pbuf_ring(struct io_ring_ctx *ctx, void __user *arg) struct io_uring_buf_reg reg; struct io_buffer_list *bl;
- if (copy_from_user(&reg, arg, sizeof(reg)))
- if (copy_io_uring_buf_reg_from_user(ctx, &reg, arg))
	return -EFAULT;
if (reg.pad || reg.resv[0] || reg.resv[1] || reg.resv[2])
	return -EINVAL;
diff --git a/io_uring/kbuf.h b/io_uring/kbuf.h index c23e15d7d3caf..1aa5bbbc5d628 100644 --- a/io_uring/kbuf.h +++ b/io_uring/kbuf.h @@ -2,6 +2,7 @@ #ifndef IOU_KBUF_H #define IOU_KBUF_H +#include <linux/io_uring_types.h> #include <uapi/linux/io_uring.h> struct io_buffer_list { @@ -13,7 +14,10 @@ struct io_buffer_list { struct list_head buf_list; struct { struct page **buf_pages;
struct io_uring_buf_ring *buf_ring;
union {
struct io_uring_buf_ring *buf_ring;
struct compat_io_uring_buf_ring *buf_ring_compat;
}; }; __u16 bgid;};
diff --git a/io_uring/net.c b/io_uring/net.c index c586278858e7e..4c133bc6f9d1d 100644 --- a/io_uring/net.c +++ b/io_uring/net.c @@ -4,6 +4,7 @@ #include <linux/file.h> #include <linux/slab.h> #include <linux/net.h> +#include <linux/uio.h> #include <linux/compat.h> #include <net/compat.h> #include <linux/io_uring.h> @@ -435,7 +436,9 @@ static int __io_recvmsg_copy_hdr(struct io_kiocb *req, } else if (msg.msg_iovlen > 1) { return -EINVAL; } else {
if (copy_from_user(iomsg->fast_iov, msg.msg_iov, sizeof(*msg.msg_iov)))
void *iov = iovec_from_user(msg.msg_iov, 1, 1, iomsg->fast_iov,
req->ctx->compat);
if (IS_ERR(iov))
This looks correct, but do you have any idea why the compat code looks so different? It does not set fast_iov at all in that case, which makes me wonder if it's really equivalent...
	return -EFAULT;
sr->len = iomsg->fast_iov[0].iov_len;
iomsg->free_iov = NULL;
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c index 41e192de9e8a7..eafdb039f9e51 100644 --- a/io_uring/rsrc.c +++ b/io_uring/rsrc.c @@ -23,6 +23,106 @@ struct io_rsrc_update { u32 offset; }; +#ifdef CONFIG_COMPAT64 +static int get_compat_io_uring_rsrc_update(struct io_uring_rsrc_update2 *up2,
const void __user *user_up)
+{
- struct compat_io_uring_rsrc_update compat_up;
- if (unlikely(copy_from_user(&compat_up, user_up, sizeof(compat_up))))
return -EFAULT;
- up2->offset = compat_up.offset;
- up2->resv = compat_up.resv;
- up2->data = compat_up.data;
- return 0;
+}
+static int get_compat_io_uring_rsrc_update2(struct io_uring_rsrc_update2 *up2,
const void __user *user_up2)
+{
- struct compat_io_uring_rsrc_update2 compat_up2;
- if (unlikely(copy_from_user(&compat_up2, user_up2, sizeof(compat_up2))))
return -EFAULT;
- up2->offset = compat_up2.offset;
- up2->resv = compat_up2.resv;
- up2->data = compat_up2.data;
- up2->tags = compat_up2.tags;
- up2->nr = compat_up2.nr;
- up2->resv2 = compat_up2.resv2;
- return 0;
+}
+static int get_compat_io_uring_rsrc_register(struct io_uring_rsrc_register *rr,
const void __user *user_rr)
+{
- struct compat_io_uring_rsrc_register compat_rr;
- if (unlikely(copy_from_user(&compat_rr, user_rr, sizeof(compat_rr))))
return -EFAULT;
- rr->nr = compat_rr.nr;
- rr->flags = compat_rr.flags;
- rr->resv2 = compat_rr.resv2;
- rr->data = compat_rr.data;
- rr->tags = compat_rr.tags;
- return 0;
+} +#endif /* CONFIG_COMPAT64 */
+static int copy_io_uring_rsrc_update_from_user(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up2,
const void __user *arg)
+{ +#ifdef CONFIG_COMPAT64
- if (ctx->compat)
return get_compat_io_uring_rsrc_update(up2, arg);
+#endif
- return copy_from_user(up2, arg, sizeof(struct io_uring_rsrc_update));
+}
+static int copy_io_uring_rsrc_update2_from_user(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update2 *up2,
const void __user *arg,
size_t size)
+{ +#ifdef CONFIG_COMPAT64
- if (ctx->compat) {
if (size != sizeof(struct compat_io_uring_rsrc_update2))
return -EINVAL;
if (get_compat_io_uring_rsrc_update2(up2, arg))
return -EFAULT;
return 0;
- }
+#endif
- if (size != sizeof(*up2))
return -EINVAL;
- if (copy_from_user(up2, arg, sizeof(*up2)))
return -EFAULT;
- return 0;
+}
+static int copy_io_uring_rsrc_register_from_user(struct io_ring_ctx *ctx,
struct io_uring_rsrc_register *rr,
const void __user *arg,
size_t size)
+{ +#ifdef CONFIG_COMPAT64
- if (ctx->compat) {
if (size != sizeof(struct compat_io_uring_rsrc_register))
return -EINVAL;
if (get_compat_io_uring_rsrc_register(rr, arg))
return -EFAULT;
return 0;
- }
+#endif
- /* keep it extendible */
Nit: I don't think it's worth keeping that comment, especially considering the existing typo :)
- if (size != sizeof(*rr))
return -EINVAL;
- if (copy_from_user(rr, arg, size))
return -EFAULT;
- return 0;
+}
static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov, struct io_mapped_ubuf **pimu, struct page **last_hpage); @@ -597,12 +697,14 @@ int io_register_files_update(struct io_ring_ctx *ctx, void __user *arg, unsigned nr_args) { struct io_uring_rsrc_update2 up;
- int ret;
if (!nr_args) return -EINVAL; memset(&up, 0, sizeof(up));
- if (copy_from_user(&up, arg, sizeof(struct io_uring_rsrc_update)))
return -EFAULT;
- ret = copy_io_uring_rsrc_update_from_user(ctx, &up, arg);
- if (ret)
	return ret;
if (up.resv || up.resv2)
	return -EINVAL;
return __io_register_rsrc_update(ctx, IORING_RSRC_FILE, &up, nr_args);
@@ -612,11 +714,11 @@ int io_register_rsrc_update(struct io_ring_ctx *ctx, void __user *arg, unsigned size, unsigned type) { struct io_uring_rsrc_update2 up;
- int ret;
- if (size != sizeof(up))
return -EINVAL;
- if (copy_from_user(&up, arg, sizeof(up)))
return -EFAULT;
- ret = copy_io_uring_rsrc_update2_from_user(ctx, &up, arg, size);
- if (ret)
	return ret;
if (!up.nr || up.resv || up.resv2)
	return -EINVAL;
return __io_register_rsrc_update(ctx, type, &up, up.nr);
@@ -626,14 +728,11 @@ __cold int io_register_rsrc(struct io_ring_ctx *ctx, void __user *arg, unsigned int size, unsigned int type) { struct io_uring_rsrc_register rr;
- int ret;
- /* keep it extendible */
- if (size != sizeof(rr))
return -EINVAL;
- memset(&rr, 0, sizeof(rr));
- if (copy_from_user(&rr, arg, size))
return -EFAULT;
- ret = copy_io_uring_rsrc_register_from_user(ctx, &rr, arg, size);
- if (ret)
	return ret;
if (!rr.nr || rr.resv2)
	return -EINVAL;
if (rr.flags & ~IORING_RSRC_REGISTER_SPARSE)
diff --git a/io_uring/tctx.c b/io_uring/tctx.c index 96f77450cf4e2..6ab9916ed3844 100644 --- a/io_uring/tctx.c +++ b/io_uring/tctx.c @@ -12,6 +12,32 @@ #include "io_uring.h" #include "tctx.h" +#ifdef CONFIG_COMPAT64 +static int get_compat_io_uring_rsrc_update(struct io_uring_rsrc_update *up,
const void __user *user_up)
+{
- struct compat_io_uring_rsrc_update compat_up;
- if (unlikely(copy_from_user(&compat_up, user_up, sizeof(compat_up))))
return -EFAULT;
- up->offset = compat_up.offset;
- up->resv = compat_up.resv;
- up->data = compat_up.data;
- return 0;
+} +#endif /* CONFIG_COMPAT64 */
+static int copy_io_uring_rsrc_update_from_user(struct io_ring_ctx *ctx,
struct io_uring_rsrc_update *up,
const void __user *arg)
+{ +#ifdef CONFIG_COMPAT64
- if (ctx->compat)
return get_compat_io_uring_rsrc_update(up, arg);
+#endif
- return copy_from_user(up, arg, sizeof(struct io_uring_rsrc_update));
+}
static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx, struct task_struct *task) { @@ -244,8 +270,6 @@ static int io_ring_add_registered_fd(struct io_uring_task *tctx, int fd, int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, unsigned nr_args) {
- struct io_uring_rsrc_update __user *arg = __arg;
- struct io_uring_rsrc_update reg; struct io_uring_task *tctx; int ret, i;
@@ -260,9 +284,17 @@ int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, tctx = current->io_uring; for (i = 0; i < nr_args; i++) {
void __user *arg;
__u32 __user *arg_offset;
int start, end;
struct io_uring_rsrc_update reg;
if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
if (ctx->compat)
arg = &((struct compat_io_uring_rsrc_update __user *)__arg)[i];
else
arg = &((struct io_uring_rsrc_update __user *)__arg)[i];
Might be worth adding a helper for that magic, since it somewhat hurts the eyes and it's also used in io_ringfd_unregister().
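Something like (name made up):

static void __user *io_ringfd_reg_entry(struct io_ring_ctx *ctx,
					void __user *__arg, unsigned int i)
{
	if (ctx->compat)
		return &((struct compat_io_uring_rsrc_update __user *)__arg)[i];
	return &((struct io_uring_rsrc_update __user *)__arg)[i];
}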
if (copy_io_uring_rsrc_update_from_user(ctx, &reg, arg)) {
	ret = -EFAULT;
	break;
}
@@ -288,8 +320,10 @@ int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, if (ret < 0) break;
reg.offset = ret;
Don't we want to keep this line?
Kevin
if (put_user(reg.offset, &arg[i].offset)) {
arg_offset = ctx->compat ?
&((struct compat_io_uring_rsrc_update __user *)arg)->offset :
&((struct io_uring_rsrc_update __user *)arg)->offset;
if (put_user(reg.offset, arg_offset)) {
	fput(tctx->registered_rings[reg.offset]);
	tctx->registered_rings[reg.offset] = NULL;
	ret = -EFAULT;
@@ -303,9 +337,7 @@ int io_ringfd_register(struct io_ring_ctx *ctx, void __user *__arg, int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, unsigned nr_args) {
- struct io_uring_rsrc_update __user *arg = __arg;
struct io_uring_task *tctx = current->io_uring;
- struct io_uring_rsrc_update reg;
int ret = 0, i;
if (!nr_args || nr_args > IO_RINGFD_REG_MAX) @@ -314,10 +346,19 @@ int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg, return 0; for (i = 0; i < nr_args; i++) {
if (copy_from_user(&reg, &arg[i], sizeof(reg))) {
void __user *arg;
struct io_uring_rsrc_update reg;
if (ctx->compat)
arg = &((struct compat_io_uring_rsrc_update __user *)__arg)[i];
else
arg = &((struct io_uring_rsrc_update __user *)__arg)[i];
if (copy_io_uring_rsrc_update_from_user(ctx, &reg, arg)) {
	ret = -EFAULT;
	break;
}
- if (reg.resv || reg.data || reg.offset >= IO_RINGFD_REG_MAX) {
	ret = -EINVAL;
	break;
diff --git a/io_uring/uring_cmd.h b/io_uring/uring_cmd.h index 7c6697d13cb2e..d67bb30ad543b 100644 --- a/io_uring/uring_cmd.h +++ b/io_uring/uring_cmd.h @@ -11,3 +11,10 @@ int io_uring_cmd_prep_async(struct io_kiocb *req); #define uring_cmd_pdu_size(is_sqe128) \ ((1 + !!(is_sqe128)) * sizeof(struct io_uring_sqe) - \ offsetof(struct io_uring_sqe, cmd))
+#ifdef CONFIG_COMPAT64
+#define compat_uring_cmd_pdu_size(is_sqe128) \
- ((1 + !!(is_sqe128)) * sizeof(struct compat_io_uring_sqe) - \
offsetof(struct compat_io_uring_sqe, cmd))
+#endif