diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index a43212036257..64a6c952091e 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -237,6 +237,19 @@ COMPAT_SYSCALL_DEFINE5(x86_clone, unsigned long, clone_flags,
 		       unsigned long, newsp, int __user *, parent_tidptr,
 		       unsigned long, tls_val, int __user *, child_tidptr)
 {
-	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr,
-			tls_val);
+	struct kernel_clone_args args = {
+		.flags		= (clone_flags & ~CSIGNAL),
+		.pidfd		= parent_tidptr,
+		.child_tid	= child_tidptr,
+		.parent_tid	= parent_tidptr,
+		.exit_signal	= (clone_flags & CSIGNAL),
+		.stack		= newsp,
+		.tls		= tls_val,
+	};
+
+	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+	if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+		return -EINVAL;
+
+	return _do_fork(&args);
 }
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index f1227f2c38a4..109a0df5af39 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -8,11 +8,26 @@
  */
 
 #include <linux/sched.h>
+#include <linux/uaccess.h>
 
 struct task_struct;
 struct rusage;
 union thread_union;
 
+/* All the bits taken by the old clone syscall. */
+#define CLONE_LEGACY_FLAGS 0xffffffffULL
+
+struct kernel_clone_args {
+	u64 flags;
+	int __user *pidfd;
+	int __user *child_tid;
+	int __user *parent_tid;
+	int exit_signal;
+	unsigned long stack;
+	unsigned long stack_size;
+	unsigned long tls;
+};
+
 /*
  * This serializes "schedule()" and also protects
  * the run-queue from deletions/modifications (but
@@ -73,7 +88,7 @@ extern void do_group_exit(int);
 extern void exit_files(struct task_struct *);
 extern void exit_itimers(struct signal_struct *);
 
-extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
+extern long _do_fork(struct kernel_clone_args *kargs);
 extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
 struct task_struct *fork_idle(int);
 struct mm_struct *copy_init_mm(void);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index e2870fe1be5b..60a81f374ca3 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -70,6 +70,7 @@ struct sigaltstack;
 struct rseq;
 union bpf_attr;
 struct io_uring_params;
+struct clone_args;
 
 #include <linux/types.h>
 #include <linux/aio_abi.h>
@@ -852,6 +853,9 @@ asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
 	       int __user *, unsigned long);
 #endif
 #endif
+
+asmlinkage long sys_clone3(struct clone_args __user *uargs, size_t size);
+
 asmlinkage long sys_execve(const char __user *filename,
 		const char __user *const __user *argv,
 		const char __user *const __user *envp);
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index ed4ee170bee2..f5331dbdcaa2 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -2,6 +2,8 @@
 #ifndef _UAPI_LINUX_SCHED_H
 #define _UAPI_LINUX_SCHED_H
 
+#include <linux/types.h>
+
 /*
  * cloning flags:
  */
@@ -31,6 +33,20 @@
 #define CLONE_NEWNET		0x40000000	/* New network namespace */
 #define CLONE_IO		0x80000000	/* Clone io context */
 
+/*
+ * Arguments for the clone3 syscall
+ */
+struct clone_args {
+	__aligned_u64 flags;
+	__aligned_u64 pidfd;
+	__aligned_u64 child_tid;
+	__aligned_u64 parent_tid;
+	__aligned_u64 exit_signal;
+	__aligned_u64 stack;
+	__aligned_u64 stack_size;
+	__aligned_u64 tls;
+};
+
 /*
  * Scheduling policies
  */
diff --git a/kernel/fork.c b/kernel/fork.c
index b4cba953040a..08ff131f26b4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1760,19 +1760,15 @@ static __always_inline void delayed_free_task(struct task_struct *tsk)
  * flags). The actual kick-off is left to the caller.
  */
 static __latent_entropy struct task_struct *copy_process(
-					unsigned long clone_flags,
-					unsigned long stack_start,
-					unsigned long stack_size,
-					int __user *parent_tidptr,
-					int __user *child_tidptr,
 					struct pid *pid,
 					int trace,
-					unsigned long tls,
-					int node)
+					int node,
+					struct kernel_clone_args *args)
 {
 	int pidfd = -1, retval;
 	struct task_struct *p;
 	struct multiprocess_signals delayed;
+	u64 clone_flags = args->flags;
 
 	/*
 	 * Don't allow sharing the root directory with processes in a different
@@ -1821,27 +1817,12 @@ static __latent_entropy struct task_struct *copy_process(
 	}
 
 	if (clone_flags & CLONE_PIDFD) {
-		int reserved;
-
 		/*
-		 * - CLONE_PARENT_SETTID is useless for pidfds and also
-		 *   parent_tidptr is used to return pidfds.
 		 * - CLONE_DETACHED is blocked so that we can potentially
 		 *   reuse it later for CLONE_PIDFD.
 		 * - CLONE_THREAD is blocked until someone really needs it.
 		 */
-		if (clone_flags &
-		    (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
-			return ERR_PTR(-EINVAL);
-
-		/*
-		 * Verify that parent_tidptr is sane so we can potentially
-		 * reuse it later.
-		 */
-		if (get_user(reserved, parent_tidptr))
-			return ERR_PTR(-EFAULT);
-
-		if (reserved != 0)
+		if (clone_flags & (CLONE_DETACHED | CLONE_THREAD))
 			return ERR_PTR(-EINVAL);
 	}
 
@@ -1874,11 +1855,11 @@ static __latent_entropy struct task_struct *copy_process(
 	 * p->set_child_tid which is (ab)used as a kthread's data pointer for
 	 * kernel threads (PF_KTHREAD).
 	 */
-	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
+	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? args->child_tid : NULL;
 	/*
 	 * Clear TID on mm_release()?
 	 */
-	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;
+	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? args->child_tid : NULL;
 
 	ftrace_graph_init_task(p);
 
@@ -2037,7 +2018,8 @@ static __latent_entropy struct task_struct *copy_process(
 	retval = copy_io(clone_flags, p);
 	if (retval)
 		goto bad_fork_cleanup_namespaces;
-	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
+	retval = copy_thread_tls(clone_flags, args->stack, args->stack_size, p,
+				 args->tls);
 	if (retval)
 		goto bad_fork_cleanup_io;
 
@@ -2062,7 +2044,7 @@ static __latent_entropy struct task_struct *copy_process(
 			goto bad_fork_free_pid;
 
 		pidfd = retval;
-		retval = put_user(pidfd, parent_tidptr);
+		retval = put_user(pidfd, args->pidfd);
 		if (retval)
 			goto bad_fork_put_pidfd;
 	}
@@ -2105,7 +2087,7 @@ static __latent_entropy struct task_struct *copy_process(
 		if (clone_flags & CLONE_PARENT)
 			p->exit_signal = current->group_leader->exit_signal;
 		else
-			p->exit_signal = (clone_flags & CSIGNAL);
+			p->exit_signal = args->exit_signal;
 		p->group_leader = p;
 		p->tgid = p->pid;
 	}
@@ -2313,8 +2295,11 @@ static inline void init_idle_pids(struct task_struct *idle)
 struct task_struct *fork_idle(int cpu)
 {
 	struct task_struct *task;
-	task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
-			    cpu_to_node(cpu));
+	struct kernel_clone_args args = {
+		.flags = CLONE_VM,
+	};
+
+	task = copy_process(&init_struct_pid, 0, cpu_to_node(cpu), &args);
 	if (!IS_ERR(task)) {
 		init_idle_pids(task);
 		init_idle(task, cpu);
@@ -2334,13 +2319,9 @@ struct mm_struct *copy_init_mm(void)
  * It copies the process, and if successful kick-starts
  * it and waits for it to finish using the VM if required.
  */
-long _do_fork(unsigned long clone_flags,
-	      unsigned long stack_start,
-	      unsigned long stack_size,
-	      int __user *parent_tidptr,
-	      int __user *child_tidptr,
-	      unsigned long tls)
+long _do_fork(struct kernel_clone_args *args)
 {
+	u64 clone_flags = args->flags;
 	struct completion vfork;
 	struct pid *pid;
 	struct task_struct *p;
@@ -2356,7 +2337,7 @@ long _do_fork(unsigned long clone_flags,
 	if (!(clone_flags & CLONE_UNTRACED)) {
 		if (clone_flags & CLONE_VFORK)
 			trace = PTRACE_EVENT_VFORK;
-		else if ((clone_flags & CSIGNAL) != SIGCHLD)
+		else if (args->exit_signal != SIGCHLD)
 			trace = PTRACE_EVENT_CLONE;
 		else
 			trace = PTRACE_EVENT_FORK;
@@ -2365,8 +2346,7 @@ long _do_fork(unsigned long clone_flags,
 			trace = 0;
 	}
 
-	p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
-			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
+	p = copy_process(NULL, trace, NUMA_NO_NODE, args);
 	add_latent_entropy();
 
 	if (IS_ERR(p))
@@ -2382,7 +2362,7 @@ long _do_fork(unsigned long clone_flags,
 	nr = pid_vnr(pid);
 
 	if (clone_flags & CLONE_PARENT_SETTID)
-		put_user(nr, parent_tidptr);
+		put_user(nr, args->parent_tid);
 
 	if (clone_flags & CLONE_VFORK) {
 		p->vfork_done = &vfork;
@@ -2414,8 +2394,21 @@ long do_fork(unsigned long clone_flags,
 	      int __user *parent_tidptr,
 	      int __user *child_tidptr)
 {
-	return _do_fork(clone_flags, stack_start, stack_size,
-			parent_tidptr, child_tidptr, 0);
+	struct kernel_clone_args args = {
+		.flags		= (clone_flags & ~CSIGNAL),
+		.pidfd		= parent_tidptr,
+		.child_tid	= child_tidptr,
+		.parent_tid	= parent_tidptr,
+		.exit_signal	= (clone_flags & CSIGNAL),
+		.stack		= stack_start,
+		.stack_size	= stack_size,
+	};
+
+	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+	if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+		return -EINVAL;
+
+	return _do_fork(&args);
 }
 #endif
 
@@ -2424,15 +2417,25 @@ long do_fork(unsigned long clone_flags,
  */
 pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
 {
-	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
-		(unsigned long)arg, NULL, NULL, 0);
+	struct kernel_clone_args args = {
+		.flags		= ((flags | CLONE_VM | CLONE_UNTRACED) & ~CSIGNAL),
+		.exit_signal	= (flags & CSIGNAL),
+		.stack		= (unsigned long)fn,
+		.stack_size	= (unsigned long)arg,
+	};
+
+	return _do_fork(&args);
 }
 
 #ifdef __ARCH_WANT_SYS_FORK
 SYSCALL_DEFINE0(fork)
 {
 #ifdef CONFIG_MMU
-	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
+	struct kernel_clone_args args = {
+		.exit_signal = SIGCHLD,
+	};
+
+	return _do_fork(&args);
 #else
 	/* can not support in nommu mode */
 	return -EINVAL;
@@ -2443,8 +2446,12 @@ SYSCALL_DEFINE0(fork)
 #ifdef __ARCH_WANT_SYS_VFORK
 SYSCALL_DEFINE0(vfork)
 {
-	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
-			0, NULL, NULL, 0);
+	struct kernel_clone_args args = {
+		.flags		= CLONE_VFORK | CLONE_VM,
+		.exit_signal	= SIGCHLD,
+	};
+
+	return _do_fork(&args);
 }
 #endif
 
@@ -2472,7 +2479,119 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp,
 		 unsigned long, tls)
 #endif
 {
-	return _do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr, tls);
+	struct kernel_clone_args args = {
+		.flags		= (clone_flags & ~CSIGNAL),
+		.pidfd		= parent_tidptr,
+		.child_tid	= child_tidptr,
+		.parent_tid	= parent_tidptr,
+		.exit_signal	= (clone_flags & CSIGNAL),
+		.stack		= newsp,
+		.tls		= tls,
+	};
+
+	/* clone(CLONE_PIDFD) uses parent_tidptr to return a pidfd */
+	if ((clone_flags & CLONE_PIDFD) && (clone_flags & CLONE_PARENT_SETTID))
+		return -EINVAL;
+
+	return _do_fork(&args);
+}
+
+noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
+					      struct clone_args __user *uargs,
+					      size_t size)
+{
+	struct clone_args args;
+
+	if (unlikely(size > PAGE_SIZE))
+		return -E2BIG;
+
+	if (unlikely(size < sizeof(struct clone_args)))
+		return -EINVAL;
+
+	if (unlikely(!access_ok(uargs, size)))
+		return -EFAULT;
+
+	if (size > sizeof(struct clone_args)) {
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
+
+		addr = (void __user *)uargs + sizeof(struct clone_args);
+		end = (void __user *)uargs + size;
+
+		for (; addr < end; addr++) {
+			if (get_user(val, addr))
+				return -EFAULT;
+			if (val)
+				return -E2BIG;
+		}
+
+		size = sizeof(struct clone_args);
+	}
+
+	if (copy_from_user(&args, uargs, size))
+		return -EFAULT;
+
+	/*
+	 * exit_signal arrives as a u64 but is stored in an int below.
+	 * Reject values with bits set outside CSIGNAL or that are not a
+	 * valid signal number instead of silently truncating them.
+	 */
+	if (unlikely((args.exit_signal & ~((u64)CSIGNAL)) ||
+		     !valid_signal(args.exit_signal)))
+		return -EINVAL;
+
+	*kargs = (struct kernel_clone_args){
+		.flags		= args.flags,
+		.pidfd		= u64_to_user_ptr(args.pidfd),
+		.child_tid	= u64_to_user_ptr(args.child_tid),
+		.parent_tid	= u64_to_user_ptr(args.parent_tid),
+		.exit_signal	= args.exit_signal,
+		.stack		= args.stack,
+		.stack_size	= args.stack_size,
+		.tls		= args.tls,
+	};
+
+	return 0;
+}
+
+static bool clone3_args_valid(const struct kernel_clone_args *kargs)
+{
+	/*
+	 * All lower bits of the flag word are taken.
+	 * Verify that no other unknown flags are passed along.
+	 */
+	if (kargs->flags & ~CLONE_LEGACY_FLAGS)
+		return false;
+
+	/*
+	 * - make the CLONE_DETACHED bit reusable for clone3
+	 * - make the CSIGNAL bits reusable for clone3
+	 */
+	if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
+		return false;
+
+	if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
+	    kargs->exit_signal)
+		return false;
+
+	return true;
+}
+
+SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
+{
+	int err;
+
+	struct kernel_clone_args kargs;
+
+	err = copy_clone_args_from_user(&kargs, uargs, size);
+	if (err)
+		return err;
+
+	if (!clone3_args_valid(&kargs))
+		return -EINVAL;
+
+	return _do_fork(&kargs);
 }
 #endif
 