Remove kernel.mounts.

We can get the mount namespace from the CreateProcessArgs in all cases where we
need it. This also gets rid of the kernel.Destroy method, since the only thing it
was doing was DecRefing the mounts.

Removing the need to call kernel.SetRootMountNamespace also allowed for some
more simplifications in the container fs setup code.

PiperOrigin-RevId: 261357060
This commit is contained in:
Nicolas Lacasse 2019-08-02 11:21:50 -07:00 committed by gVisor bot
parent 6a1ac34077
commit aaaefdf9ca
7 changed files with 78 additions and 145 deletions

View File

@ -56,15 +56,10 @@ type ExecArgs struct {
// MountNamespace is the mount namespace to execute the new process in.
// A reference on MountNamespace must be held for the lifetime of the
// ExecArgs. If MountNamespace is nil, it will default to the kernel's
// root MountNamespace.
// ExecArgs. If MountNamespace is nil, it will default to the init
// process's MountNamespace.
MountNamespace *fs.MountNamespace
// Root defines the root directory for the new process. A reference on
// Root must be held for the lifetime of the ExecArgs. If Root is nil,
// it will default to the VFS root.
Root *fs.Dirent
// WorkingDirectory defines the working directory for the new process.
WorkingDirectory string `json:"wd"`
@ -155,7 +150,6 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
Envv: args.Envv,
WorkingDirectory: args.WorkingDirectory,
MountNamespace: args.MountNamespace,
Root: args.Root,
Credentials: creds,
FDTable: fdTable,
Umask: 0022,
@ -167,11 +161,6 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
ContainerID: args.ContainerID,
PIDNamespace: args.PIDNamespace,
}
if initArgs.Root != nil {
// initArgs must hold a reference on Root, which will be
// donated to the new process in CreateProcess.
initArgs.Root.IncRef()
}
if initArgs.MountNamespace != nil {
// initArgs must hold a reference on MountNamespace, which will
// be donated to the new process in CreateProcess.
@ -184,7 +173,7 @@ func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadI
paths := fs.GetPath(initArgs.Envv)
mns := initArgs.MountNamespace
if mns == nil {
mns = proc.Kernel.RootMountNamespace()
mns = proc.Kernel.GlobalInit().Leader().MountNamespace()
}
f, err := mns.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
if err != nil {

View File

@ -112,3 +112,27 @@ func DirentCacheLimiterFromContext(ctx context.Context) *DirentCacheLimiter {
}
return nil
}
// rootContext wraps a context.Context and overrides the value returned for
// CtxRoot with a fixed root Dirent. All other keys are delegated to the
// embedded Context.
type rootContext struct {
// Context is the wrapped parent context.
context.Context
// root is the Dirent handed out for CtxRoot lookups.
root *Dirent
}
// WithRoot wraps ctx in a new context whose CtxRoot value is root. Every
// other key is still answered by ctx. WithRoot itself does not take a
// reference on root.
func WithRoot(ctx context.Context, root *Dirent) context.Context {
	wrapped := rootContext{Context: ctx, root: root}
	return &wrapped
}
// Value implements Context.Value.
//
// For CtxRoot it takes a reference on the stored root Dirent and returns it;
// presumably the caller is expected to drop that reference when done. If no
// root was configured, it returns an untyped nil, matching the other
// CtxRoot providers, instead of dereferencing a nil Dirent. Any other key is
// forwarded to the wrapped Context.
func (rc rootContext) Value(key interface{}) interface{} {
	switch key {
	case CtxRoot:
		if rc.root == nil {
			// Return a bare nil rather than a typed nil *Dirent so that
			// callers comparing the result against nil see "no root".
			return nil
		}
		rc.root.IncRef()
		return rc.root
	default:
		return rc.Context.Value(key)
	}
}

View File

@ -219,6 +219,13 @@ func (mns *MountNamespace) flushMountSourceRefsLocked() {
}
}
if mns.root == nil {
// No root? This MountNamespace must have already been destroyed.
// This can happen when a Save is triggered while a process is
// exiting. There is nothing to flush.
return
}
// Flush root's MountSource references.
mns.root.Inode.MountSource.FlushDirentRefs()
}
@ -249,6 +256,10 @@ func (mns *MountNamespace) destroy() {
// Drop reference on the root.
mns.root.DecRef()
// Ensure that root cannot be accessed via this MountNamespace any
// more.
mns.root = nil
// Wait for asynchronous work (queued by dropping Dirent references
// above) to complete before destroying this MountNamespace.
AsyncBarrier()

View File

@ -112,11 +112,6 @@ type Kernel struct {
rootIPCNamespace *IPCNamespace
rootAbstractSocketNamespace *AbstractSocketNamespace
// mounts holds the state of the virtual filesystem. mounts is initially
// nil, and must be set by calling Kernel.SetRootMountNamespace before
// Kernel.CreateProcess can succeed.
mounts *fs.MountNamespace
// futexes is the "root" futex.Manager, from which all others are forked.
// This is necessary to ensure that shared futexes are coherent across all
// tasks, including those created by CreateProcess.
@ -392,11 +387,7 @@ func (k *Kernel) SaveTo(w io.Writer) error {
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
func (k *Kernel) flushMountSourceRefs() error {
// Flush all mount sources for currently mounted filesystems in the
// root mount namespace.
k.mounts.FlushMountSourceRefs()
// Some tasks may have other mount namespaces; flush those as well.
// Flush all mount sources for currently mounted filesystems in each task.
flushed := make(map[*fs.MountNamespace]struct{})
k.tasks.mu.RLock()
k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) {
@ -573,16 +564,6 @@ func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack) error {
return nil
}
// Destroy releases resources owned by k: it drops k's reference on the root
// mount namespace, if one is set, and clears the field so that a repeated
// call is a no-op.
//
// Preconditions: There must be no task goroutines running in k.
func (k *Kernel) Destroy() {
	if k.mounts == nil {
		// Nothing was ever set, or Destroy already ran.
		return
	}
	k.mounts.DecRef()
	k.mounts = nil
}
// UniqueID returns a unique identifier.
func (k *Kernel) UniqueID() uint64 {
id := atomic.AddUint64(&k.uniqueID, 1)
@ -646,19 +627,12 @@ type CreateProcessArgs struct {
AbstractSocketNamespace *AbstractSocketNamespace
// MountNamespace optionally contains the mount namespace for this
// process. If nil, the kernel's mount namespace is used.
// process. If nil, the init process's mount namespace is used.
//
// Anyone setting MountNamespace must donate a reference (i.e.
// increment it).
MountNamespace *fs.MountNamespace
// Root optionally contains the dirent that serves as the root for the
// process. If nil, the mount namespace's root is used as the process'
// root.
//
// Anyone setting Root must donate a reference (i.e. increment it).
Root *fs.Dirent
// ContainerID is the container that the process belongs to.
ContainerID string
}
@ -696,16 +670,10 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
case auth.CtxCredentials:
return ctx.args.Credentials
case fs.CtxRoot:
if ctx.args.Root != nil {
// Take a reference on the root dirent that will be
// given to the caller.
ctx.args.Root.IncRef()
return ctx.args.Root
}
if ctx.k.mounts != nil {
// MountNamespace.Root() will take a reference on the
// root dirent for us.
return ctx.k.mounts.Root()
if ctx.args.MountNamespace != nil {
// MountNamespace.Root() will take a reference on the root
// dirent for us.
return ctx.args.MountNamespace.Root()
}
return nil
case fs.CtxDirentCacheLimiter:
@ -749,30 +717,18 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
defer k.extMu.Unlock()
log.Infof("EXEC: %v", args.Argv)
if k.mounts == nil {
return nil, 0, fmt.Errorf("no kernel MountNamespace")
}
// Grab the mount namespace.
mounts := args.MountNamespace
if mounts == nil {
// If no MountNamespace was configured, then use the kernel's
// root mount namespace, with an extra reference that will be
// donated to the task.
mounts = k.mounts
mounts = k.GlobalInit().Leader().MountNamespace()
mounts.IncRef()
}
tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
ctx := args.NewContext(k)
// Grab the root directory.
root := args.Root
if root == nil {
// If no Root was configured, then get it from the
// MountNamespace.
root = mounts.Root()
}
// Get the root directory from the MountNamespace.
root := mounts.Root()
// The call to newFSContext below will take a reference on root, so we
// don't need to hold this one.
defer root.DecRef()
@ -782,7 +738,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
wd := root // Default.
if args.WorkingDirectory != "" {
var err error
wd, err = k.mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
if err != nil {
return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
}
@ -811,8 +767,7 @@ func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID,
// Create a fresh task context.
remainingTraversals = uint(args.MaxSymlinkTraversals)
tc, se := k.LoadTaskImage(ctx, k.mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet)
tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet)
if se != nil {
return nil, 0, errors.New(se.String())
}
@ -1056,20 +1011,6 @@ func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
return k.rootAbstractSocketNamespace
}
// RootMountNamespace returns the kernel's root MountNamespace, read while
// holding extMu. It does not take an additional reference on the returned
// namespace, and the result may be nil if none has been set.
func (k *Kernel) RootMountNamespace() *fs.MountNamespace {
	k.extMu.Lock()
	mns := k.mounts
	k.extMu.Unlock()
	return mns
}
// SetRootMountNamespace installs mounts as the kernel's root MountNamespace,
// writing the field while holding extMu. Any previously stored namespace is
// simply overwritten; no reference counts are adjusted here.
func (k *Kernel) SetRootMountNamespace(mounts *fs.MountNamespace) {
	k.extMu.Lock()
	k.mounts = mounts
	k.extMu.Unlock()
}
// NetworkStack returns the network stack. NetworkStack may return nil if no
// network stack is available.
func (k *Kernel) NetworkStack() inet.Stack {
@ -1260,7 +1201,10 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
// The supervisor context is global root.
return auth.NewRootCredentials(ctx.k.rootUserNamespace)
case fs.CtxRoot:
return ctx.k.mounts.Root()
if ctx.k.globalInit != nil {
return ctx.k.globalInit.mounts.Root()
}
return nil
case fs.CtxDirentCacheLimiter:
return ctx.k.DirentCacheLimiter
case ktime.CtxRealtimeClock:

View File

@ -328,10 +328,8 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("at most two files may be passed to Restore")
}
networkStack := cm.l.k.NetworkStack()
// Destroy the old kernel and create a new kernel.
// Pause the kernel while we build a new one.
cm.l.k.Pause()
cm.l.k.Destroy()
p, err := createPlatform(cm.l.conf, deviceFile)
if err != nil {
@ -345,6 +343,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error {
return fmt.Errorf("creating memory file: %v", err)
}
k.SetMemoryFile(mf)
networkStack := cm.l.k.NetworkStack()
cm.l.k = k
// Set up the restore environment.

View File

@ -34,12 +34,10 @@ import (
_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/runsc/specutils"
)
@ -506,44 +504,16 @@ func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel
}
}
// setupFS is used to set up the file system for containers and amend
// the procArgs accordingly. This is the main entry point for the rest of the
// functions in this file. procArgs are passed by reference and the FDMap field
// is modified. It dups stdioFDs.
func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error {
// Use root user to configure mounts. The current user might not have
// permission to do so.
rootProcArgs := kernel.CreateProcessArgs{
WorkingDirectory: "/",
Credentials: auth.NewRootCredentials(creds.UserNamespace),
Umask: 0022,
MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
PIDNamespace: procArgs.PIDNamespace,
}
rootCtx := rootProcArgs.NewContext(c.k)
// If this is the root container, we also need to setup the root mount
// namespace.
rootMNS := c.k.RootMountNamespace()
if rootMNS == nil {
// Setup the root container.
if err := c.setupRootContainer(ctx, rootCtx, conf, func(rootMNS *fs.MountNamespace) {
// The callback to setupRootContainer inherits a
// reference on the rootMNS, so we don't need to take
// an additional reference here.
procArgs.MountNamespace = rootMNS
procArgs.Root = rootMNS.Root()
c.k.SetRootMountNamespace(rootMNS)
}); err != nil {
return err
}
return c.checkDispenser()
}
// setupChildContainer is used to set up the file system for non-root containers
// and amend the procArgs accordingly. This is the main entry point for the
// rest of the functions in this file. procArgs are passed by reference and the
// FDMap field is modified. It dups stdioFDs.
func (c *containerMounter) setupChildContainer(conf *Config, procArgs *kernel.CreateProcessArgs) error {
// Setup a child container.
log.Infof("Creating new process in child container.")
// Create a new root inode and mount namespace for the container.
rootCtx := c.k.SupervisorContext()
rootInode, err := c.createRootMount(rootCtx, conf)
if err != nil {
return fmt.Errorf("creating filesystem for container: %v", err)
@ -552,14 +522,12 @@ func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *
if err != nil {
return fmt.Errorf("creating new mount namespace for container: %v", err)
}
// Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
// This will also donate a reference to procArgs, as required.
procArgs.MountNamespace = mns
procArgs.Root = mns.Root()
root := mns.Root()
defer root.DecRef()
// Mount all submounts.
if err := c.mountSubmounts(rootCtx, conf, mns, procArgs.Root); err != nil {
if err := c.mountSubmounts(rootCtx, conf, mns, root); err != nil {
return err
}
return c.checkDispenser()
@ -599,7 +567,10 @@ func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx c
root := mns.Root()
defer root.DecRef()
return c.mountSubmounts(rootCtx, conf, mns, root)
if err := c.mountSubmounts(rootCtx, conf, mns, root); err != nil {
return fmt.Errorf("mounting submounts: %v", err)
}
return c.checkDispenser()
}
// mountSharedMaster mounts the master of a volume that is shared among

View File

@ -527,12 +527,15 @@ func (l *Loader) run() error {
// cid for root container can be empty. Only subcontainers need it to set
// the mount location.
mntr := newContainerMounter(l.spec, "", l.goferFDs, l.k, l.mountHints)
if err := mntr.setupFS(ctx, l.conf, &l.rootProcArgs, l.rootProcArgs.Credentials); err != nil {
// Setup the root container.
if err := mntr.setupRootContainer(ctx, ctx, l.conf, func(mns *fs.MountNamespace) {
l.rootProcArgs.MountNamespace = mns
}); err != nil {
return err
}
rootCtx := l.rootProcArgs.NewContext(l.k)
if err := setExecutablePath(rootCtx, &l.rootProcArgs); err != nil {
if err := setExecutablePath(ctx, &l.rootProcArgs); err != nil {
return err
}
@ -546,7 +549,7 @@ func (l *Loader) run() error {
}
}
if !hasHomeEnvv {
homeDir, err := getExecUserHome(rootCtx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
homeDir, err := getExecUserHome(ctx, l.rootProcArgs.MountNamespace, uint32(l.rootProcArgs.Credentials.RealKUID))
if err != nil {
return fmt.Errorf("error reading exec user: %v", err)
}
@ -685,7 +688,7 @@ func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, file
}
mntr := newContainerMounter(spec, cid, goferFDs, l.k, l.mountHints)
if err := mntr.setupFS(ctx, conf, &procArgs, creds); err != nil {
if err := mntr.setupChildContainer(conf, &procArgs); err != nil {
return fmt.Errorf("configuring container FS: %v", err)
}
@ -756,22 +759,14 @@ func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
return 0, fmt.Errorf("no such container: %q", args.ContainerID)
}
// Get the container Root Dirent and MountNamespace from the Task.
// Get the container MountNamespace from the Task.
tg.Leader().WithMuLocked(func(t *kernel.Task) {
// FSContext.RootDirectory() will take an extra ref for us.
args.Root = t.FSContext().RootDirectory()
// task.MountNamespace() does not take a ref, so we must do so
// ourselves.
args.MountNamespace = t.MountNamespace()
args.MountNamespace.IncRef()
})
defer func() {
if args.Root != nil {
args.Root.DecRef()
}
args.MountNamespace.DecRef()
}()
defer args.MountNamespace.DecRef()
// Start the process.
proc := control.Proc{Kernel: l.k}