Add MountNamespace to task.
This allows tasks to have distinct mount namespaces, instead of all sharing the kernel's root mount namespace. Currently, the only way for a task to get a different mount namespace than the kernel's root is by explicitly setting a different MountNamespace in CreateProcessArgs, and nothing does this (yet). In a follow-up CL, we will set CreateProcessArgs.MountNamespace when creating a new container inside runsc. Note that "MountNamespace" is a poor term for this thing. It's more like a distinct VFS tree. When we get around to adding real mount namespaces, this will need a better name. PiperOrigin-RevId: 254009310
This commit is contained in:
parent
0d1dc50b70
commit
f7428af9c1
|
@ -124,7 +124,16 @@ func (m *Mount) IsUndo() bool {
|
|||
return false
|
||||
}
|
||||
|
||||
// MountNamespace defines a collection of mounts.
|
||||
// MountNamespace defines a VFS root. It contains a collection of Mounts that are
|
||||
// mounted inside the Dirent tree rooted at the Root Dirent. It provides
|
||||
// methods for traversing the Dirent, and for mounting/unmounting in the tree.
|
||||
//
|
||||
// Note that this does not correspond to a "mount namespace" in Linux. It
|
||||
// is more like a unique VFS instance.
|
||||
//
|
||||
// It's possible for different processes to have different MountNamespaces. In
|
||||
// this case, the file systems exposed to the processes are completely
|
||||
// distinct.
|
||||
//
|
||||
// +stateify savable
|
||||
type MountNamespace struct {
|
||||
|
|
|
@ -381,9 +381,23 @@ func (k *Kernel) SaveTo(w io.Writer) error {
|
|||
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
|
||||
// and open FDs.
|
||||
func (k *Kernel) flushMountSourceRefs() error {
|
||||
// Flush all mount sources for currently mounted filesystems.
|
||||
// Flush all mount sources for currently mounted filesystems in the
|
||||
// root mount namespace.
|
||||
k.mounts.FlushMountSourceRefs()
|
||||
|
||||
// Some tasks may have other mount namespaces; flush those as well.
|
||||
flushed := make(map[*fs.MountNamespace]struct{})
|
||||
k.tasks.mu.RLock()
|
||||
k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) {
|
||||
if _, ok := flushed[tg.mounts]; ok {
|
||||
// Already flushed.
|
||||
return
|
||||
}
|
||||
tg.mounts.FlushMountSourceRefs()
|
||||
flushed[tg.mounts] = struct{}{}
|
||||
})
|
||||
k.tasks.mu.RUnlock()
|
||||
|
||||
// There may be some open FDs whose filesystems have been unmounted. We
|
||||
// must flush those as well.
|
||||
return k.tasks.forEachFDPaused(func(desc descriptor) error {
|
||||
|
@ -611,12 +625,18 @@ type CreateProcessArgs struct {
|
|||
// AbstractSocketNamespace is the initial Abstract Socket namespace.
|
||||
AbstractSocketNamespace *AbstractSocketNamespace
|
||||
|
||||
// MountNamespace optionally contains the mount namespace for this
|
||||
// process. If nil, the kernel's mount namespace is used.
|
||||
//
|
||||
// Anyone setting MountNamespace must donate a reference (i.e.
|
||||
// increment it).
|
||||
MountNamespace *fs.MountNamespace
|
||||
|
||||
// Root optionally contains the dirent that serves as the root for the
|
||||
// process. If nil, the mount namespace's root is used as the process'
|
||||
// root.
|
||||
//
|
||||
// Anyone setting Root must donate a reference (i.e. increment it) to
|
||||
// keep it alive until it is decremented by CreateProcess.
|
||||
// Anyone setting Root must donate a reference (i.e. increment it).
|
||||
Root *fs.Dirent
|
||||
|
||||
// ContainerID is the container that the process belongs to.
|
||||
|
@ -715,20 +735,29 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
|
|||
return nil, 0, fmt.Errorf("no kernel MountNamespace")
|
||||
}
|
||||
|
||||
tg := k.newThreadGroup(k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
|
||||
// Grab the mount namespace.
|
||||
mounts := args.MountNamespace
|
||||
if mounts == nil {
|
||||
// If no MountNamespace was configured, then use the kernel's
|
||||
// root mount namespace, with an extra reference that will be
|
||||
// donated to the task.
|
||||
mounts = k.mounts
|
||||
mounts.IncRef()
|
||||
}
|
||||
|
||||
tg := k.newThreadGroup(mounts, k.tasks.Root, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
|
||||
ctx := args.NewContext(k)
|
||||
|
||||
// Grab the root directory.
|
||||
root := args.Root
|
||||
if root == nil {
|
||||
root = fs.RootFromContext(ctx)
|
||||
// Is the root STILL nil?
|
||||
if root == nil {
|
||||
return nil, 0, fmt.Errorf("CreateProcessArgs.Root was not provided, and failed to get root from context")
|
||||
}
|
||||
// If no Root was configured, then get it from the
|
||||
// MountNamespace.
|
||||
root = mounts.Root()
|
||||
}
|
||||
// The call to newFSContext below will take a reference on root, so we
|
||||
// don't need to hold this one.
|
||||
defer root.DecRef()
|
||||
args.Root = nil
|
||||
|
||||
// Grab the working directory.
|
||||
remainingTraversals := uint(args.MaxSymlinkTraversals)
|
||||
|
|
|
@ -665,7 +665,7 @@ func (t *Task) SyscallRestartBlock() SyscallRestartBlock {
|
|||
// Preconditions: The caller must be running on the task goroutine, or t.mu
|
||||
// must be locked.
|
||||
func (t *Task) IsChrooted() bool {
|
||||
realRoot := t.k.mounts.Root()
|
||||
realRoot := t.tg.mounts.Root()
|
||||
defer realRoot.DecRef()
|
||||
root := t.fsc.RootDirectory()
|
||||
if root != nil {
|
||||
|
@ -710,7 +710,7 @@ func (t *Task) WithMuLocked(f func(*Task)) {
|
|||
// MountNamespace returns t's MountNamespace. MountNamespace does not take an
|
||||
// additional reference on the returned MountNamespace.
|
||||
func (t *Task) MountNamespace() *fs.MountNamespace {
|
||||
return t.k.mounts
|
||||
return t.tg.mounts
|
||||
}
|
||||
|
||||
// AbstractSockets returns t's AbstractSocketNamespace.
|
||||
|
|
|
@ -238,11 +238,12 @@ func (t *Task) Clone(opts *CloneOptions) (ThreadID, *SyscallControl, error) {
|
|||
}
|
||||
tg := t.tg
|
||||
if opts.NewThreadGroup {
|
||||
tg.mounts.IncRef()
|
||||
sh := t.tg.signalHandlers
|
||||
if opts.NewSignalHandlers {
|
||||
sh = sh.Fork()
|
||||
}
|
||||
tg = t.k.newThreadGroup(pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
|
||||
tg = t.k.newThreadGroup(tg.mounts, pidns, sh, opts.TerminationSignal, tg.limits.GetCopy(), t.k.monotonicClock)
|
||||
}
|
||||
|
||||
cfg := &TaskConfig{
|
||||
|
|
|
@ -19,6 +19,7 @@ import (
|
|||
"sync/atomic"
|
||||
|
||||
"gvisor.dev/gvisor/pkg/abi/linux"
|
||||
"gvisor.dev/gvisor/pkg/sentry/fs"
|
||||
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
|
||||
"gvisor.dev/gvisor/pkg/sentry/limits"
|
||||
"gvisor.dev/gvisor/pkg/sentry/usage"
|
||||
|
@ -236,13 +237,21 @@ type ThreadGroup struct {
|
|||
|
||||
// rscr is the thread group's RSEQ critical region.
|
||||
rscr atomic.Value `state:".(*RSEQCriticalRegion)"`
|
||||
|
||||
// mounts is the thread group's mount namespace. This does not really
|
||||
// correspond to a "mount namespace" in Linux, but is more like a
|
||||
// complete VFS that need not be shared between processes. See the
|
||||
// comment in mounts.go for more information.
|
||||
//
|
||||
// mounts is immutable.
|
||||
mounts *fs.MountNamespace
|
||||
}
|
||||
|
||||
// newThreadGroup returns a new, empty thread group in PID namespace ns. The
|
||||
// thread group leader will send its parent terminationSignal when it exits.
|
||||
// The new thread group isn't visible to the system until a task has been
|
||||
// created inside of it by a successful call to TaskSet.NewTask.
|
||||
func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
|
||||
func (k *Kernel) newThreadGroup(mounts *fs.MountNamespace, ns *PIDNamespace, sh *SignalHandlers, terminationSignal linux.Signal, limits *limits.LimitSet, monotonicClock *timekeeperClock) *ThreadGroup {
|
||||
tg := &ThreadGroup{
|
||||
threadGroupNode: threadGroupNode{
|
||||
pidns: ns,
|
||||
|
@ -251,6 +260,7 @@ func (k *Kernel) newThreadGroup(ns *PIDNamespace, sh *SignalHandlers, terminatio
|
|||
terminationSignal: terminationSignal,
|
||||
ioUsage: &usage.IO{},
|
||||
limits: limits,
|
||||
mounts: mounts,
|
||||
}
|
||||
tg.itimerRealTimer = ktime.NewTimer(k.monotonicClock, &itimerRealListener{tg: tg})
|
||||
tg.timers = make(map[linux.TimerID]*IntervalTimer)
|
||||
|
@ -298,6 +308,7 @@ func (tg *ThreadGroup) release() {
|
|||
for _, it := range its {
|
||||
it.DestroyTimer()
|
||||
}
|
||||
tg.mounts.DecRef()
|
||||
}
|
||||
|
||||
// forEachChildThreadGroupLocked iterates over all child ThreadGroups.
|
||||
|
|
Loading…
Reference in New Issue