Add MountNamespace to task.

This allows tasks to have distinct mount namespaces, instead of all sharing the
kernel's root mount namespace.

Currently, the only way for a task to get a different mount namespace than the
kernel's root is by explicitly setting a different MountNamespace in
CreateProcessArgs, and nothing does this (yet).

In a follow-up CL, we will set CreateProcessArgs.MountNamespace when creating a
new container inside runsc.

Note that "MountNamespace" is a poor term for this thing. It's more like a
distinct VFS tree. When we get around to adding real mount namespaces, this
will need a better name.

PiperOrigin-RevId: 254009310
This commit is contained in:
Nicolas Lacasse 2019-06-19 09:20:10 -07:00 committed by gVisor bot
parent 0d1dc50b70
commit f7428af9c1
5 changed files with 65 additions and 15 deletions

View File

@ -124,7 +124,16 @@ func (m *Mount) IsUndo() bool {
return false
}
// MountNamespace defines a collection of mounts.
// MountNamespace defines a VFS root. It contains a collection of Mounts that
// are mounted inside the Dirent tree rooted at the Root Dirent. It provides
// methods for traversing the Dirent, and for mounting/unmounting in the tree.
//
// Note that this does not correspond to a "mount namespace" in Linux. It is
// more like a unique VFS instance.
//
// It's possible for different processes to have different MountNamespaces. In
// this case, the file systems exposed to the processes are completely
// distinct.
//
// +stateify savable
type MountNamespace struct {

View File

@ -381,9 +381,23 @@ func (k *Kernel) SaveTo(w io.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
func (k *Kernel) flushMountSourceRefs() error {
// Flush all mount sources for currently mounted filesystems.
// Flush all mount sources for currently mounted filesystems in the
// root mount namespace.
k.mounts.FlushMountSourceRefs()
// Some tasks may have other mount namespaces; flush those as well.
flushed := make(map[*fs.MountNamespace]struct{})
k.tasks.mu.RLock()
k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) {
if _, ok := flushed[tg.mounts]; ok {
// Already flushed.
return
}
tg.mounts.FlushMountSourceRefs()
flushed[tg.mounts] = struct{}{}
})
k.tasks.mu.RUnlock()
// There may be some open FDs whose filesystems have been unmounted. We
// must flush those as well.
return k.tasks.forEachFDPaused(func(desc descriptor) error {
@ -611,12 +625,18 @@ type CreateProcessArgs struct {
// AbstractSocketNamespace is the initial Abstract Socket namespace.
AbstractSocketNamespace *AbstractSocketNamespace
// MountNamespace optionally contains the mount namespace for this
// process. If nil, the kernel's mount namespace is used.
//
// Anyone setting MountNamespace must donate a reference (i.e.
// increment it).
MountNamespace *fs.MountNamespace
// Root optionally contains the dirent that serves as the root for the
// process. If nil, the mount namespace's root is used as the process'
// root.
//
// Anyone setting Root must donate a reference (i.e. increment it) to
// keep it alive until it is decremented by CreateProcess.
// Anyone setting Root must donate a reference (i.e. increment it).
Root *fs.Dirent
// ContainerID is the container that the process belongs to.
@ -715,20 +735,29 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
return nil, 0, fmt.Errorf("no kernel MountNamespace")
}
tg := k.newThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
// Grab the mount namespace.
mounts := args.MountNamespace
if mounts == nil {
// If no MountNamespace was configured, then use the kernel's
// root mount namespace, with an extra reference that will be
// donated to the task.
mounts = k.mounts
mounts.IncRef()
}
tg := k.newThreadGroup(mounts, k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
ctx := args.NewContext(k)
// Grab the root directory.
root := args.Root
if root == nil {
root = fs.RootFromContext(ctx)
// Is the root STILL nil?
if root == nil {
return nil, 0, fmt.Errorf("CreateProcessArgs.Root was not provided, and failed to get root from context")
}
// If no Root was configured, then get it from the
// MountNamespace.
root = mounts.Root()
}
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
defer root.DecRef()
args.Root = nil
// Grab the working directory.
remainingTraversals := uint(args.MaxSymlinkTraversals)

View File

@ -665,7 +665,7 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
// Preconditions: The caller must be running on the task goroutine, or t.mu
// must be locked.
func (t *Task) IsChrooted() bool {
realRoot := t.k.mounts.Root()
realRoot := t.tg.mounts.Root()
defer realRoot.DecRef()
root := t.fsc.RootDirectory()
if root != nil {
@ -710,7 +710,7 @@ func (t *Task) WithMuLocked(f func(*Task)) {
// MountNamespace returns t's MountNamespace. MountNamespace does not take an
// additional reference on the returned MountNamespace.
func (t *Task) MountNamespace() *fs.MountNamespace {
return t.k.mounts
return t.tg.mounts
}
// AbstractSockets returns t's AbstractSocketNamespace.

View File

@ -238,11 +238,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
}
tg := t.tg
if opts.NewThreadGroup {
tg.mounts.IncRef()
sh := t.tg.signalHandlers
if opts.NewSignalHandlers {
sh = sh.Fork()
}
tg = t.k.newThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
tg = t.k.newThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
}
cfg := &TaskConfig{

View File

@ -19,6 +19,7 @@ import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/fs"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
@ -236,13 +237,21 @@ type ThreadGroup struct {
// rscr is the thread group's RSEQ critical region.
rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
// mounts is the thread group's mount namespace. This does not really
// correspond to a "mount namespace" in Linux, but is more like a
// complete VFS that need not be shared between processes. See the
// comment in mounts.go for more information.
//
// mounts is immutable.
mounts *fs.MountNamespace
}
// newThreadGroup returns a new, empty thread group in PID namespace ns. The
// thread group leader will send its parent terminationSignal when it exits.
// The new thread group isn't visible to the system until a task has been
// created inside of it by a successful call to TaskSet.NewTask.
func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
tg := &ThreadGroup{
threadGroupNode: threadGroupNode{
pidns: ns,
@ -251,6 +260,7 @@ func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminatio
terminationSignal: terminationSignal,
ioUsage: &usage.IO{},
limits: limits,
mounts: mounts,
}
tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg})
tg.timers = make(map[linux.TimerID]*IntervalTimer)
@ -298,6 +308,7 @@ func (tg *ThreadGroup) release() {
for _, it := range its {
it.DestroyTimer()
}
tg.mounts.DecRef()
}
// forEachChildThreadGroupLocked iterates over all child ThreadGroups.