The clone3 syscall provides a superset of the functionality compared to its clone forerunner, encapsulating all the various arguments within a single, dedicated clone_args structure.
As it stands now, members of said structure that do represent user pointers, are specified as being of __u64 type (alignment aside). This is problematic for some architectures, where that type cannot actually hold said pointers.
In order to add support for the latter, the relevant clone_args struct members get redefined with the appropriate __kernel_uintptr_t type.
This though, brings its own drawbacks. The structure itself gains revised signature (with implicit paddings) which results in noticeable growth in its size. Furthermore, for affected architectures, as a consequence, clone3 is no longer compat mode agnostic, and as such, requires repurposing the main syscall handler at a cost of marginal overhead though in favour of avoiding code duplication.
Signed-off-by: Beata Michalska beata.michalska@arm.com --- include/uapi/linux/sched.h | 30 +++++--- kernel/fork.c | 139 +++++++++++++++++++++++++++++-------- 2 files changed, 131 insertions(+), 38 deletions(-)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h index 3bac0a8ceab2..e3ca56b530d2 100644 --- a/include/uapi/linux/sched.h +++ b/include/uapi/linux/sched.h @@ -87,26 +87,36 @@ * * The structure is versioned by size and thus extensible. * New struct members must go at the end of the struct and - * must be properly 64bit aligned. + * must be properly aligned on at least 64bit boundary. + * Mind implicit padding when: + * ((sizeof(__kernel_uintptr_t) > sizeof(__u64)) + * */ + struct clone_args { __aligned_u64 flags; - __aligned_u64 pidfd; - __aligned_u64 child_tid; - __aligned_u64 parent_tid; + __kernel_aligned_uintptr_t pidfd; + __kernel_aligned_uintptr_t child_tid; + __kernel_aligned_uintptr_t parent_tid; __aligned_u64 exit_signal; - __aligned_u64 stack; + __kernel_aligned_uintptr_t stack; __aligned_u64 stack_size; - __aligned_u64 tls; - __aligned_u64 set_tid; + __kernel_aligned_uintptr_t tls; + /* VER0 boundary */ + __kernel_aligned_uintptr_t set_tid; __aligned_u64 set_tid_size; + /* VER1 boundary */ __aligned_u64 cgroup; + /* VER2 boundary */ }; #endif
-#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ -#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ -#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ +#define CLONE_ARGS_SIZE_VER0 /* sizeof first published struct */ \ + ((sizeof(__kernel_uintptr_t) > sizeof(__u64)) ? 128 : 64) +#define CLONE_ARGS_SIZE_VER1 /* sizeof second published struct */ \ + ((sizeof(__kernel_uintptr_t) > sizeof(__u64)) ? 152 : 80) +#define CLONE_ARGS_SIZE_VER2 /* sizeof third published struct */ \ + ((sizeof(__kernel_uintptr_t) > sizeof(__u64)) ? 160 : 88)
/* * Scheduling policies diff --git a/kernel/fork.c b/kernel/fork.c index 181771571599..94777ac4d455 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2759,67 +2759,150 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, user_uintptr_t, newsp,
#ifdef __ARCH_WANT_SYS_CLONE3
+#define __clone_args_size_ver(ver, pfx) \ + pfx##CLONE_ARGS_SIZE_VER##ver + +#ifdef CONFIG_COMPAT64 +struct compat_clone_args { + __aligned_u64 flags; + __aligned_u64 pidfd; + __aligned_u64 child_tid; + __aligned_u64 parent_tid; + __aligned_u64 exit_signal; + __aligned_u64 stack; + __aligned_u64 stack_size; + __aligned_u64 tls; + __aligned_u64 set_tid; + __aligned_u64 set_tid_size; + __aligned_u64 cgroup; +}; + +#define COMPAT_CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */ +#define COMPAT_CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */ +#define COMPAT_CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */ + +#define clone_args_size_ver(args, ver) \ + ((args)->compat_mode ? __clone_args_size_ver(ver, COMPAT_) \ + : __clone_args_size_ver(ver, )) + +#define clone_args_get(args, member) \ + ((args)->compat_mode ? (args)->__compat_args.member \ + : (args)->__args.member) + +#define clone_args_get_user_ptr(args, member) \ + ((args)->compat_mode ? \ + compat_ptr(clone_args_get(args, member)) : \ + (void __user *)(user_uintptr_t)(clone_args_get(args, member))) + +#else /* CONFIG_COMPAT64 */ +#define clone_args_size_ver(args, ver) __clone_args_size_ver(ver, ) +#define clone_args_get(args, member) ((args)->member) +#define clone_args_get_user_ptr(args, member) \ + (void __user *)(user_uintptr_t)(clone_args_get(args, member)) +#endif /* CONFIG_COMPAT64 */ + +static inline void clone_args_validate_static(void) +{ +#define CLONE_ARGS_BUILD_BUG_ON(type, pfx) \ +do { \ + BUILD_BUG_ON(offsetofend(type, tls) != \ + __clone_args_size_ver(0, pfx)); \ + BUILD_BUG_ON(offsetofend(type, set_tid_size) != \ + __clone_args_size_ver(1, pfx)); \ + BUILD_BUG_ON(offsetofend(type, cgroup) != \ + __clone_args_size_ver(2, pfx)); \ + BUILD_BUG_ON(sizeof(type) != __clone_args_size_ver(2, pfx)); \ +} while (0) + + CLONE_ARGS_BUILD_BUG_ON(struct clone_args, ); +#ifdef CONFIG_COMPAT64 + CLONE_ARGS_BUILD_BUG_ON(struct compat_clone_args, COMPAT_); + BUILD_BUG_ON(sizeof(struct clone_args) < sizeof(struct compat_clone_args)); +#endif +} + noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs, struct clone_args __user *uargs, size_t usize) { int err; +#ifdef CONFIG_COMPAT64 + struct { + union { + struct clone_args __args; + struct compat_clone_args __compat_args; + }; + bool compat_mode; + } args = { + .compat_mode = in_compat_syscall(), + }; +#else struct clone_args args; +#endif pid_t *kset_tid = kargs->set_tid;
- BUILD_BUG_ON(offsetofend(struct clone_args, tls) != - CLONE_ARGS_SIZE_VER0); - BUILD_BUG_ON(offsetofend(struct clone_args, set_tid_size) != - CLONE_ARGS_SIZE_VER1); - BUILD_BUG_ON(offsetofend(struct clone_args, cgroup) != - CLONE_ARGS_SIZE_VER2); - BUILD_BUG_ON(sizeof(struct clone_args) != CLONE_ARGS_SIZE_VER2); + clone_args_validate_static();
if (unlikely(usize > PAGE_SIZE)) return -E2BIG; - if (unlikely(usize < CLONE_ARGS_SIZE_VER0)) + if (unlikely(usize < clone_args_size_ver(&args, 0))) return -EINVAL;
- err = copy_struct_from_user(&args, sizeof(args), uargs, usize); +#ifdef CONFIG_COMPAT64 + if (args.compat_mode) + err = copy_struct_from_user(&args.__compat_args, + sizeof(args.__compat_args), + uargs, usize); + else + err = copy_struct_from_user_with_ptr(&args.__args, + sizeof(args.__args), + uargs, usize); +#else + err = copy_struct_from_user_with_ptr(&args, sizeof(args), uargs, usize); +#endif + if (err) return err;
- if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL)) + if (unlikely(clone_args_get(&args, set_tid_size) > MAX_PID_NS_LEVEL)) return -EINVAL;
- if (unlikely(!args.set_tid && args.set_tid_size > 0)) + if (unlikely(!clone_args_get(&args, set_tid) && + clone_args_get(&args, set_tid_size) > 0)) return -EINVAL;
- if (unlikely(args.set_tid && args.set_tid_size == 0)) + if (unlikely(clone_args_get(&args, set_tid) && + clone_args_get(&args, set_tid_size) == 0)) return -EINVAL;
/* * Verify that higher 32bits of exit_signal are unset and that * it is a valid signal */ - if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) || - !valid_signal(args.exit_signal))) + if (unlikely((clone_args_get(&args, exit_signal) & ~((u64)CSIGNAL)) || + !valid_signal(clone_args_get(&args, exit_signal)))) return -EINVAL;
- if ((args.flags & CLONE_INTO_CGROUP) && - (args.cgroup > INT_MAX || usize < CLONE_ARGS_SIZE_VER2)) + if ((clone_args_get(&args, flags) & CLONE_INTO_CGROUP) && + (clone_args_get(&args, cgroup) > INT_MAX || + usize < clone_args_size_ver(&args, 2))) return -EINVAL;
*kargs = (struct kernel_clone_args){ - .flags = args.flags, - .pidfd = u64_to_user_ptr(args.pidfd), - .child_tid = u64_to_user_ptr(args.child_tid), - .parent_tid = u64_to_user_ptr(args.parent_tid), - .exit_signal = args.exit_signal, - .stack = args.stack, - .stack_size = args.stack_size, - .tls = args.tls, - .set_tid_size = args.set_tid_size, - .cgroup = args.cgroup, + .flags = clone_args_get(&args, flags), + .pidfd = clone_args_get_user_ptr(&args, pidfd), + .child_tid = clone_args_get_user_ptr(&args, child_tid), + .parent_tid = clone_args_get_user_ptr(&args, parent_tid), + .exit_signal = clone_args_get(&args, exit_signal), + .stack = clone_args_get(&args, stack), + .stack_size = clone_args_get(&args, stack_size), + .tls = clone_args_get(&args, tls), + .set_tid_size = clone_args_get(&args, set_tid_size), + .cgroup = clone_args_get(&args, cgroup), };
- if (args.set_tid && - copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid), + if (clone_args_get(&args, set_tid) && + copy_from_user(kset_tid, clone_args_get_user_ptr(&args, set_tid), (kargs->set_tid_size * sizeof(pid_t)))) return -EFAULT;