From f7428af9c11cd47e6252a3fbf24db411e513c241 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Wed, 19 Jun 2019 09:20:10 -0700 Subject: [PATCH] Add MountNamespace to task. This allows tasks to have distinct mount namespace, instead of all sharing the kernel's root mount namespace. Currently, the only way for a task to get a different mount namespace than the kernel's root is by explicitly setting a different MountNamespace in CreateProcessArgs, and nothing does this (yet). In a follow-up CL, we will set CreateProcessArgs.MountNamespace when creating a new container inside runsc. Note that "MountNamespace" is a poor term for this thing. It's more like a distinct VFS tree. When we get around to adding real mount namespaces, this will need a better naem. PiperOrigin-RevId: 254009310 --- pkg/sentry/fs/mounts.go | 11 ++++++- pkg/sentry/kernel/kernel.go | 49 ++++++++++++++++++++++++------- pkg/sentry/kernel/task.go | 4 +-- pkg/sentry/kernel/task_clone.go | 3 +- pkg/sentry/kernel/thread_group.go | 13 +++++++- 5 files changed, 65 insertions(+), 15 deletions(-) diff --git a/pkg/sentry/fs/mounts.go b/pkg/sentry/fs/mounts.go index a5d6f8b9a..281364dfc 100644 --- a/pkg/sentry/fs/mounts.go +++ b/pkg/sentry/fs/mounts.go @@ -124,7 +124,16 @@ func (m *Mount) IsUndo() bool { return false } -// MountNamespace defines a collection of mounts. +// MountNamespace defines a VFS root. It contains collection of Mounts that are +// mounted inside the Dirent tree rooted at the Root Dirent. It provides +// methods for traversing the Dirent, and for mounting/unmounting in the tree. +// +// Note that this does not correspond to a "mount namespace" in the Linux. It +// is more like a unique VFS instance. +// +// It's possible for different processes to have different MountNamespaces. In +// this case, the file systems exposed to the processes are completely +// distinct. // // +stateify savable type MountNamespace struct { diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index 07ae592c4..9fe9eb914 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -381,9 +381,23 @@ func (k *Kernel) SaveTo(w io.Writer) error { // flushMountSourceRefs flushes the MountSources for all mounted filesystems // and open FDs. func (k *Kernel) flushMountSourceRefs() error { - // Flush all mount sources for currently mounted filesystems. + // Flush all mount sources for currently mounted filesystems in the + // root mount namespace. k.mounts.FlushMountSourceRefs() + // Some tasks may have other mount namespaces; flush those as well. + flushed := make(map[*fs.MountNamespace]struct{}) + k.tasks.mu.RLock() + k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) { + if _, ok := flushed[tg.mounts]; ok { + // Already flushed. + return + } + tg.mounts.FlushMountSourceRefs() + flushed[tg.mounts] = struct{}{} + }) + k.tasks.mu.RUnlock() + // There may be some open FDs whose filesystems have been unmounted. We // must flush those as well. return k.tasks.forEachFDPaused(func(desc descriptor) error { @@ -611,12 +625,18 @@ type CreateProcessArgs struct { // AbstractSocketNamespace is the initial Abstract Socket namespace. AbstractSocketNamespace *AbstractSocketNamespace + // MountNamespace optionally contains the mount namespace for this + // process. If nil, the kernel's mount namespace is used. + // + // Anyone setting MountNamespace must donate a reference (i.e. + // increment it). + MountNamespace *fs.MountNamespace + // Root optionally contains the dirent that serves as the root for the // process. If nil, the mount namespace's root is used as the process' // root. // - // Anyone setting Root must donate a reference (i.e. increment it) to - // keep it alive until it is decremented by CreateProcess. + // Anyone setting Root must donate a reference (i.e. increment it). Root *fs.Dirent // ContainerID is the container that the process belongs to. @@ -715,20 +735,29 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, return nil, 0, fmt.Errorf("no kernel MountNamespace") } - tg := k.newThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) + // Grab the mount namespace. + mounts := args.MountNamespace + if mounts == nil { + // If no MountNamespace was configured, then use the kernel's + // root mount namespace, with an extra reference that will be + // donated to the task. + mounts = k.mounts + mounts.IncRef() + } + + tg := k.newThreadGroup(mounts, k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock) ctx := args.NewContext(k) // Grab the root directory. root := args.Root if root == nil { - root = fs.RootFromContext(ctx) - // Is the root STILL nil? - if root == nil { - return nil, 0, fmt.Errorf("CreateProcessArgs.Root was not provided, and failed to get root from context") - } + // If no Root was configured, then get it from the + // MountNamespace. + root = mounts.Root() } + // The call to newFSContext below will take a reference on root, so we + // don't need to hold this one. defer root.DecRef() - args.Root = nil // Grab the working directory. remainingTraversals := uint(args.MaxSymlinkTraversals) diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index e883696f9..7ed589a02 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -665,7 +665,7 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock { // Preconditions: The caller must be running on the task goroutine, or t.mu // must be locked. func (t *Task) IsChrooted() bool { - realRoot := t.k.mounts.Root() + realRoot := t.tg.mounts.Root() defer realRoot.DecRef() root := t.fsc.RootDirectory() if root != nil { @@ -710,7 +710,7 @@ func (t *Task) WithMuLocked(f func(*Task)) { // MountNamespace returns t's MountNamespace. MountNamespace does not take an // additional reference on the returned MountNamespace. func (t *Task) MountNamespace() *fs.MountNamespace { - return t.k.mounts + return t.tg.mounts } // AbstractSockets returns t's AbstractSocketNamespace. diff --git a/pkg/sentry/kernel/task_clone.go b/pkg/sentry/kernel/task_clone.go index cafc9296f..0e621f0d1 100644 --- a/pkg/sentry/kernel/task_clone.go +++ b/pkg/sentry/kernel/task_clone.go @@ -238,11 +238,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) { } tg := t.tg if opts.NewThreadGroup { + tg.mounts.IncRef() sh := t.tg.signalHandlers if opts.NewSignalHandlers { sh = sh.Fork() } - tg = t.k.newThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) + tg = t.k.newThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock) } cfg := &TaskConfig{ diff --git a/pkg/sentry/kernel/thread_group.go b/pkg/sentry/kernel/thread_group.go index 95346290d..3562ef179 100644 --- a/pkg/sentry/kernel/thread_group.go +++ b/pkg/sentry/kernel/thread_group.go @@ -19,6 +19,7 @@ import ( "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/fs" ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/limits" "gvisor.dev/gvisor/pkg/sentry/usage" @@ -236,13 +237,21 @@ type ThreadGroup struct { // rscr is the thread group's RSEQ critical region. rscr atomic.Value `state:".(*RSEQCriticalRegion)"` + + // mounts is the thread group's mount namespace. This does not really + // correspond to a "mount namespace" in Linux, but is more like a + // complete VFS that need not be shared between processes. See the + // comment in mounts.go for more information. + // + // mounts is immutable. + mounts *fs.MountNamespace } // newThreadGroup returns a new, empty thread group in PID namespace ns. The // thread group leader will send its parent terminationSignal when it exits. // The new thread group isn't visible to the system until a task has been // created inside of it by a successful call to TaskSet.NewTask. -func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { +func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup { tg := &ThreadGroup{ threadGroupNode: threadGroupNode{ pidns: ns, @@ -251,6 +260,7 @@ func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminatio terminationSignal: terminationSignal, ioUsage: &usage.IO{}, limits: limits, + mounts: mounts, } tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg}) tg.timers = make(map[linux.TimerID]*IntervalTimer) @@ -298,6 +308,7 @@ func (tg *ThreadGroup) release() { for _, it := range its { it.DestroyTimer() } + tg.mounts.DecRef() } // forEachChildThreadGroupLocked indicates over all child ThreadGroups.