50804fe373
Unsharing of the pid namespace, unlike unsharing of other namespaces, does not take effect immediately. Instead it affects the children created with fork and clone. The first of these children becomes the init process of the new pid namespace, the rest become oddball children of pid 0. From the point of view of the new pid namespace the process that created it is pid 0, as its pid does not map. A couple of different semantics were considered but this one was settled on because it is easy to implement and it is usable from pam modules, which are the core reason for the existence of unshare. I took a survey of the callers of pam modules and the following appears to be a representative sample of their logic. { setup stuff include pam child = fork(); if (!child) { setuid() exec /bin/bash } waitpid(child); pam and other cleanup } As you can see there is a fork to create the unprivileged user space process. Which means that the unprivileged user space process will appear as pid 1 in the new pid namespace. Further, most login processes do not cope with extraneous children, which means shifting the duty of reaping extraneous child processes to the creator of those extraneous children makes the system more comprehensible. The practical reason for this set of pid namespace semantics is that it is simple to implement and to verify that they work correctly. Whereas an implementation that requires changing the struct pid on a process comes with a lot more races and pain. Not the least of which is that glibc caches getpid(). These semantics are implemented by having two notions of the pid namespace of a process. There is task_active_pid_ns, which is the pid namespace the process was created with and the pid namespace that all pids are presented to that process in. The task_active_pid_ns is stored in the struct pid of the task. Then there is the pid namespace that will be used for children; that pid namespace is stored in task->nsproxy->pid_ns. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
278 lines
6 KiB
C
278 lines
6 KiB
C
/*
|
|
* Copyright (C) 2006 IBM Corporation
|
|
*
|
|
* Author: Serge Hallyn <serue@us.ibm.com>
|
|
*
|
|
* This program is free software; you can redistribute it and/or
|
|
* modify it under the terms of the GNU General Public License as
|
|
* published by the Free Software Foundation, version 2 of the
|
|
* License.
|
|
*
|
|
* Jun 2006 - namespaces support
|
|
* OpenVZ, SWsoft Inc.
|
|
* Pavel Emelianov <xemul@openvz.org>
|
|
*/
|
|
|
|
#include <linux/slab.h>
|
|
#include <linux/export.h>
|
|
#include <linux/nsproxy.h>
|
|
#include <linux/init_task.h>
|
|
#include <linux/mnt_namespace.h>
|
|
#include <linux/utsname.h>
|
|
#include <linux/pid_namespace.h>
|
|
#include <net/net_namespace.h>
|
|
#include <linux/ipc_namespace.h>
|
|
#include <linux/proc_fs.h>
|
|
#include <linux/file.h>
|
|
#include <linux/syscalls.h>
|
|
|
|
/*
 * Slab cache backing dynamically allocated nsproxy objects; it is created
 * in nsproxy_cache_init() and used by create_nsproxy() and the free paths.
 */
static struct kmem_cache *nsproxy_cachep;
|
|
|
|
struct nsproxy init_nsproxy = {
|
|
.count = ATOMIC_INIT(1),
|
|
.uts_ns = &init_uts_ns,
|
|
#if defined(CONFIG_POSIX_MQUEUE) || defined(CONFIG_SYSVIPC)
|
|
.ipc_ns = &init_ipc_ns,
|
|
#endif
|
|
.mnt_ns = NULL,
|
|
.pid_ns = &init_pid_ns,
|
|
#ifdef CONFIG_NET
|
|
.net_ns = &init_net,
|
|
#endif
|
|
};
|
|
|
|
static inline struct nsproxy *create_nsproxy(void)
|
|
{
|
|
struct nsproxy *nsproxy;
|
|
|
|
nsproxy = kmem_cache_alloc(nsproxy_cachep, GFP_KERNEL);
|
|
if (nsproxy)
|
|
atomic_set(&nsproxy->count, 1);
|
|
return nsproxy;
|
|
}
|
|
|
|
/*
|
|
* Create new nsproxy and all of its the associated namespaces.
|
|
* Return the newly created nsproxy. Do not attach this to the task,
|
|
* leave it to the caller to do proper locking and attach it to task.
|
|
*/
|
|
static struct nsproxy *create_new_namespaces(unsigned long flags,
|
|
struct task_struct *tsk, struct fs_struct *new_fs)
|
|
{
|
|
struct nsproxy *new_nsp;
|
|
int err;
|
|
|
|
new_nsp = create_nsproxy();
|
|
if (!new_nsp)
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
new_nsp->mnt_ns = copy_mnt_ns(flags, tsk->nsproxy->mnt_ns, new_fs);
|
|
if (IS_ERR(new_nsp->mnt_ns)) {
|
|
err = PTR_ERR(new_nsp->mnt_ns);
|
|
goto out_ns;
|
|
}
|
|
|
|
new_nsp->uts_ns = copy_utsname(flags, tsk);
|
|
if (IS_ERR(new_nsp->uts_ns)) {
|
|
err = PTR_ERR(new_nsp->uts_ns);
|
|
goto out_uts;
|
|
}
|
|
|
|
new_nsp->ipc_ns = copy_ipcs(flags, tsk);
|
|
if (IS_ERR(new_nsp->ipc_ns)) {
|
|
err = PTR_ERR(new_nsp->ipc_ns);
|
|
goto out_ipc;
|
|
}
|
|
|
|
new_nsp->pid_ns = copy_pid_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->pid_ns);
|
|
if (IS_ERR(new_nsp->pid_ns)) {
|
|
err = PTR_ERR(new_nsp->pid_ns);
|
|
goto out_pid;
|
|
}
|
|
|
|
new_nsp->net_ns = copy_net_ns(flags, task_cred_xxx(tsk, user_ns), tsk->nsproxy->net_ns);
|
|
if (IS_ERR(new_nsp->net_ns)) {
|
|
err = PTR_ERR(new_nsp->net_ns);
|
|
goto out_net;
|
|
}
|
|
|
|
return new_nsp;
|
|
|
|
out_net:
|
|
if (new_nsp->pid_ns)
|
|
put_pid_ns(new_nsp->pid_ns);
|
|
out_pid:
|
|
if (new_nsp->ipc_ns)
|
|
put_ipc_ns(new_nsp->ipc_ns);
|
|
out_ipc:
|
|
if (new_nsp->uts_ns)
|
|
put_uts_ns(new_nsp->uts_ns);
|
|
out_uts:
|
|
if (new_nsp->mnt_ns)
|
|
put_mnt_ns(new_nsp->mnt_ns);
|
|
out_ns:
|
|
kmem_cache_free(nsproxy_cachep, new_nsp);
|
|
return ERR_PTR(err);
|
|
}
|
|
|
|
/*
|
|
* called from clone. This now handles copy for nsproxy and all
|
|
* namespaces therein.
|
|
*/
|
|
int copy_namespaces(unsigned long flags, struct task_struct *tsk)
|
|
{
|
|
struct nsproxy *old_ns = tsk->nsproxy;
|
|
struct nsproxy *new_ns;
|
|
int err = 0;
|
|
|
|
if (!old_ns)
|
|
return 0;
|
|
|
|
get_nsproxy(old_ns);
|
|
|
|
if (!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
|
|
CLONE_NEWPID | CLONE_NEWNET)))
|
|
return 0;
|
|
|
|
if (!capable(CAP_SYS_ADMIN)) {
|
|
err = -EPERM;
|
|
goto out;
|
|
}
|
|
|
|
/*
|
|
* CLONE_NEWIPC must detach from the undolist: after switching
|
|
* to a new ipc namespace, the semaphore arrays from the old
|
|
* namespace are unreachable. In clone parlance, CLONE_SYSVSEM
|
|
* means share undolist with parent, so we must forbid using
|
|
* it along with CLONE_NEWIPC.
|
|
*/
|
|
if ((flags & CLONE_NEWIPC) && (flags & CLONE_SYSVSEM)) {
|
|
err = -EINVAL;
|
|
goto out;
|
|
}
|
|
|
|
new_ns = create_new_namespaces(flags, tsk, tsk->fs);
|
|
if (IS_ERR(new_ns)) {
|
|
err = PTR_ERR(new_ns);
|
|
goto out;
|
|
}
|
|
|
|
tsk->nsproxy = new_ns;
|
|
|
|
out:
|
|
put_nsproxy(old_ns);
|
|
return err;
|
|
}
|
|
|
|
void free_nsproxy(struct nsproxy *ns)
|
|
{
|
|
if (ns->mnt_ns)
|
|
put_mnt_ns(ns->mnt_ns);
|
|
if (ns->uts_ns)
|
|
put_uts_ns(ns->uts_ns);
|
|
if (ns->ipc_ns)
|
|
put_ipc_ns(ns->ipc_ns);
|
|
if (ns->pid_ns)
|
|
put_pid_ns(ns->pid_ns);
|
|
put_net(ns->net_ns);
|
|
kmem_cache_free(nsproxy_cachep, ns);
|
|
}
|
|
|
|
/*
|
|
* Called from unshare. Unshare all the namespaces part of nsproxy.
|
|
* On success, returns the new nsproxy.
|
|
*/
|
|
int unshare_nsproxy_namespaces(unsigned long unshare_flags,
|
|
struct nsproxy **new_nsp, struct fs_struct *new_fs)
|
|
{
|
|
int err = 0;
|
|
|
|
if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC |
|
|
CLONE_NEWNET | CLONE_NEWPID)))
|
|
return 0;
|
|
|
|
if (!capable(CAP_SYS_ADMIN))
|
|
return -EPERM;
|
|
|
|
*new_nsp = create_new_namespaces(unshare_flags, current,
|
|
new_fs ? new_fs : current->fs);
|
|
if (IS_ERR(*new_nsp)) {
|
|
err = PTR_ERR(*new_nsp);
|
|
goto out;
|
|
}
|
|
|
|
out:
|
|
return err;
|
|
}
|
|
|
|
void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
|
|
{
|
|
struct nsproxy *ns;
|
|
|
|
might_sleep();
|
|
|
|
ns = p->nsproxy;
|
|
|
|
rcu_assign_pointer(p->nsproxy, new);
|
|
|
|
if (ns && atomic_dec_and_test(&ns->count)) {
|
|
/*
|
|
* wait for others to get what they want from this nsproxy.
|
|
*
|
|
* cannot release this nsproxy via the call_rcu() since
|
|
* put_mnt_ns() will want to sleep
|
|
*/
|
|
synchronize_rcu();
|
|
free_nsproxy(ns);
|
|
}
|
|
}
|
|
|
|
void exit_task_namespaces(struct task_struct *p)
|
|
{
|
|
switch_task_namespaces(p, NULL);
|
|
}
|
|
|
|
/*
 * setns(fd, nstype): install the namespace referred to by @fd into the
 * calling task.
 *
 * @fd:     descriptor resolved through proc_ns_fget()
 * @nstype: when non-zero, must equal the type of the namespace operations
 *          attached to the file, otherwise -EINVAL is returned
 *
 * A private copy of the caller's nsproxy is created (flags == 0), the
 * namespace is installed into that copy via ops->install(), and only on
 * success is the copy made the task's nsproxy.  If installation fails the
 * copy is freed.  The file reference is dropped on every path after the
 * lookup succeeded.
 *
 * Returns 0 on success, -EPERM without CAP_SYS_ADMIN, or a negative errno
 * from the lookup, the nsproxy creation or the install callback.
 */
SYSCALL_DEFINE2(setns, int, fd, int, nstype)
{
	struct task_struct *me = current;
	const struct proc_ns_operations *ops;
	struct proc_inode *pi;
	struct nsproxy *nsp;
	struct file *nsfile;
	int ret;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	nsfile = proc_ns_fget(fd);
	if (IS_ERR(nsfile))
		return PTR_ERR(nsfile);

	pi = PROC_I(nsfile->f_dentry->d_inode);
	ops = pi->ns_ops;
	if (nstype && ops->type != nstype) {
		ret = -EINVAL;
		goto put_file;
	}

	nsp = create_new_namespaces(0, me, me->fs);
	if (IS_ERR(nsp)) {
		ret = PTR_ERR(nsp);
		goto put_file;
	}

	ret = ops->install(nsp, pi->ns);
	if (ret)
		free_nsproxy(nsp);
	else
		switch_task_namespaces(me, nsp);

put_file:
	fput(nsfile);
	return ret;
}
|
|
|
|
/*
 * Boot-time setup: create the slab cache used for struct nsproxy and
 * store it in nsproxy_cachep.  The cache is created with SLAB_PANIC.
 * Always returns 0.
 */
int __init nsproxy_cache_init(void)
{
	nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);

	return 0;
}
|