2020-04-17 17:38:04 +00:00
|
|
|
// Copyright 2018 The gVisor Authors.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
package boot
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
2020-04-25 01:22:21 +00:00
|
|
|
"sort"
|
2020-04-17 17:38:04 +00:00
|
|
|
"strings"
|
|
|
|
|
|
|
|
specs "github.com/opencontainers/runtime-spec/specs-go"
|
|
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
2020-05-29 23:33:50 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/context"
|
2020-04-17 17:38:04 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/fspath"
|
2020-05-29 23:33:50 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/log"
|
2020-04-17 17:38:04 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/devices/memdev"
|
2020-06-24 01:47:22 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/devices/ttydev"
|
2020-06-24 23:21:53 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/devices/tundev"
|
2020-05-29 23:33:50 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fs/user"
|
2020-05-13 17:30:00 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
|
2020-06-17 20:23:27 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fsimpl/fuse"
|
2020-05-13 17:30:00 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
|
Add //pkg/sentry/fsimpl/overlay.
Major differences from existing overlay filesystems:
- Linux allows lower layers in an overlay to require revalidation, but not the
upper layer. VFS1 allows the upper layer in an overlay to require
revalidation, but not the lower layer. VFS2 does not allow any layers to
require revalidation. (Now that vfs.MkdirOptions.ForSyntheticMountpoint
exists, no uses of overlay in VFS1 are believed to require upper layer
revalidation; in particular, the requirement that the upper layer support the
creation of "trusted." extended attributes for whiteouts effectively required
the upper filesystem to be tmpfs in most cases.)
- Like VFS1, but unlike Linux, VFS2 overlay does not attempt to make mutations
of the upper layer atomic using a working directory and features like
RENAME_WHITEOUT. (This may change in the future, since not having a working
directory makes error recovery for some operations, e.g. rmdir, particularly
painful.)
- Like Linux, but unlike VFS1, VFS2 represents whiteouts using character
devices with rdev == 0; the equivalent of the whiteout attribute on
directories is xattr trusted.overlay.opaque = "y"; and there is no equivalent
to the whiteout attribute on non-directories since non-directories are never
merged with lower layers.
- Device and inode numbers work as follows:
- In Linux, modulo the xino feature and a special case for when all layers
are the same filesystem:
- Directories use the overlay filesystem's device number and an
ephemeral inode number assigned by the overlay.
- Non-directories that have been copied up use the device and inode
number assigned by the upper filesystem.
- Non-directories that have not been copied up use a per-(overlay,
layer)-pair device number and the inode number assigned by the lower
filesystem.
- In VFS1, device and inode numbers always come from the lower layer unless
"whited out"; this has the adverse effect of requiring interaction with
the lower filesystem even for non-directory files that exist on the upper
layer.
- In VFS2, device and inode numbers are assigned as in Linux, except that
xino and the samefs special case are not supported.
- Like Linux, but unlike VFS1, VFS2 does not attempt to maintain memory mapping
coherence across copy-up. (This may have to change in the future, as users
may be dependent on this property.)
- Like Linux, but unlike VFS1, VFS2 uses the overlayfs mounter's credentials
when interacting with the overlay's layers, rather than the caller's.
- Like Linux, but unlike VFS1, VFS2 permits multiple lower layers in an
overlay.
- Like Linux, but unlike VFS1, VFS2's overlay filesystem is
application-mountable.
Updates #1199
PiperOrigin-RevId: 316019067
2020-06-12 01:33:35 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fsimpl/overlay"
|
2020-05-13 17:30:00 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
|
2020-08-06 18:01:13 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/inet"
|
2020-04-17 17:38:04 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/vfs"
|
2020-05-29 23:33:50 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/syserror"
|
2020-08-20 01:35:35 +00:00
|
|
|
"gvisor.dev/gvisor/runsc/config"
|
2020-04-17 17:38:04 +00:00
|
|
|
)
|
|
|
|
|
2020-07-09 00:10:35 +00:00
|
|
|
func registerFilesystems(k *kernel.Kernel) error {
|
|
|
|
ctx := k.SupervisorContext()
|
|
|
|
creds := auth.NewRootCredentials(k.RootUserNamespace())
|
|
|
|
vfsObj := k.VFS()
|
|
|
|
|
2020-05-13 17:30:00 +00:00
|
|
|
vfsObj.MustRegisterFilesystemType(devpts.Name, &devpts.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
|
2020-04-17 17:38:04 +00:00
|
|
|
AllowUserList: true,
|
2020-05-04 17:58:01 +00:00
|
|
|
// TODO(b/29356795): Users may mount this once the terminals are in a
|
|
|
|
// usable state.
|
|
|
|
AllowUserMount: false,
|
2020-04-17 17:38:04 +00:00
|
|
|
})
|
2020-05-13 17:30:00 +00:00
|
|
|
vfsObj.MustRegisterFilesystemType(devtmpfs.Name, &devtmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
|
2020-04-17 17:38:04 +00:00
|
|
|
AllowUserMount: true,
|
|
|
|
AllowUserList: true,
|
|
|
|
})
|
2020-05-13 17:30:00 +00:00
|
|
|
vfsObj.MustRegisterFilesystemType(gofer.Name, &gofer.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
|
2020-05-04 17:58:01 +00:00
|
|
|
AllowUserList: true,
|
2020-04-17 17:38:04 +00:00
|
|
|
})
|
Add //pkg/sentry/fsimpl/overlay.
Major differences from existing overlay filesystems:
- Linux allows lower layers in an overlay to require revalidation, but not the
upper layer. VFS1 allows the upper layer in an overlay to require
revalidation, but not the lower layer. VFS2 does not allow any layers to
require revalidation. (Now that vfs.MkdirOptions.ForSyntheticMountpoint
exists, no uses of overlay in VFS1 are believed to require upper layer
revalidation; in particular, the requirement that the upper layer support the
creation of "trusted." extended attributes for whiteouts effectively required
the upper filesystem to be tmpfs in most cases.)
- Like VFS1, but unlike Linux, VFS2 overlay does not attempt to make mutations
of the upper layer atomic using a working directory and features like
RENAME_WHITEOUT. (This may change in the future, since not having a working
directory makes error recovery for some operations, e.g. rmdir, particularly
painful.)
- Like Linux, but unlike VFS1, VFS2 represents whiteouts using character
devices with rdev == 0; the equivalent of the whiteout attribute on
directories is xattr trusted.overlay.opaque = "y"; and there is no equivalent
to the whiteout attribute on non-directories since non-directories are never
merged with lower layers.
- Device and inode numbers work as follows:
- In Linux, modulo the xino feature and a special case for when all layers
are the same filesystem:
- Directories use the overlay filesystem's device number and an
ephemeral inode number assigned by the overlay.
- Non-directories that have been copied up use the device and inode
number assigned by the upper filesystem.
- Non-directories that have not been copied up use a per-(overlay,
layer)-pair device number and the inode number assigned by the lower
filesystem.
- In VFS1, device and inode numbers always come from the lower layer unless
"whited out"; this has the adverse effect of requiring interaction with
the lower filesystem even for non-directory files that exist on the upper
layer.
- In VFS2, device and inode numbers are assigned as in Linux, except that
xino and the samefs special case are not supported.
- Like Linux, but unlike VFS1, VFS2 does not attempt to maintain memory mapping
coherence across copy-up. (This may have to change in the future, as users
may be dependent on this property.)
- Like Linux, but unlike VFS1, VFS2 uses the overlayfs mounter's credentials
when interacting with the overlay's layers, rather than the caller's.
- Like Linux, but unlike VFS1, VFS2 permits multiple lower layers in an
overlay.
- Like Linux, but unlike VFS1, VFS2's overlay filesystem is
application-mountable.
Updates #1199
PiperOrigin-RevId: 316019067
2020-06-12 01:33:35 +00:00
|
|
|
vfsObj.MustRegisterFilesystemType(overlay.Name, &overlay.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
|
|
|
|
AllowUserMount: true,
|
|
|
|
AllowUserList: true,
|
|
|
|
})
|
2020-05-13 17:30:00 +00:00
|
|
|
vfsObj.MustRegisterFilesystemType(proc.Name, &proc.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
|
2020-04-17 17:38:04 +00:00
|
|
|
AllowUserMount: true,
|
|
|
|
AllowUserList: true,
|
|
|
|
})
|
2020-05-13 17:30:00 +00:00
|
|
|
vfsObj.MustRegisterFilesystemType(sys.Name, &sys.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
|
2020-04-17 17:38:04 +00:00
|
|
|
AllowUserMount: true,
|
|
|
|
AllowUserList: true,
|
|
|
|
})
|
2020-05-13 17:30:00 +00:00
|
|
|
vfsObj.MustRegisterFilesystemType(tmpfs.Name, &tmpfs.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
|
2020-04-17 17:38:04 +00:00
|
|
|
AllowUserMount: true,
|
|
|
|
AllowUserList: true,
|
|
|
|
})
|
2020-06-09 16:35:39 +00:00
|
|
|
vfsObj.MustRegisterFilesystemType(fuse.Name, &fuse.FilesystemType{}, &vfs.RegisterFilesystemTypeOptions{
|
|
|
|
AllowUserMount: true,
|
|
|
|
AllowUserList: true,
|
|
|
|
})
|
2020-04-17 17:38:04 +00:00
|
|
|
|
|
|
|
// Setup files in devtmpfs.
|
|
|
|
if err := memdev.Register(vfsObj); err != nil {
|
|
|
|
return fmt.Errorf("registering memdev: %w", err)
|
|
|
|
}
|
2020-06-24 01:47:22 +00:00
|
|
|
if err := ttydev.Register(vfsObj); err != nil {
|
|
|
|
return fmt.Errorf("registering ttydev: %w", err)
|
2020-06-23 18:25:38 +00:00
|
|
|
}
|
2020-08-06 18:01:13 +00:00
|
|
|
tunSupported := tundev.IsNetTunSupported(inet.StackFromContext(ctx))
|
|
|
|
if tunSupported {
|
|
|
|
if err := tundev.Register(vfsObj); err != nil {
|
|
|
|
return fmt.Errorf("registering tundev: %v", err)
|
|
|
|
}
|
|
|
|
}
|
2020-06-17 20:23:27 +00:00
|
|
|
|
2020-07-08 01:48:25 +00:00
|
|
|
if kernel.FUSEEnabled {
|
|
|
|
if err := fuse.Register(vfsObj); err != nil {
|
|
|
|
return fmt.Errorf("registering fusedev: %w", err)
|
|
|
|
}
|
2020-06-24 01:47:22 +00:00
|
|
|
}
|
2020-07-08 01:48:25 +00:00
|
|
|
|
2020-05-13 17:30:00 +00:00
|
|
|
a, err := devtmpfs.NewAccessor(ctx, vfsObj, creds, devtmpfs.Name)
|
2020-04-17 17:38:04 +00:00
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("creating devtmpfs accessor: %w", err)
|
|
|
|
}
|
2020-08-03 20:33:47 +00:00
|
|
|
defer a.Release(ctx)
|
2020-04-17 17:38:04 +00:00
|
|
|
|
|
|
|
if err := a.UserspaceInit(ctx); err != nil {
|
|
|
|
return fmt.Errorf("initializing userspace: %w", err)
|
|
|
|
}
|
|
|
|
if err := memdev.CreateDevtmpfsFiles(ctx, a); err != nil {
|
2020-06-24 23:21:53 +00:00
|
|
|
return fmt.Errorf("creating memdev devtmpfs files: %w", err)
|
2020-04-17 17:38:04 +00:00
|
|
|
}
|
2020-06-24 01:47:22 +00:00
|
|
|
if err := ttydev.CreateDevtmpfsFiles(ctx, a); err != nil {
|
2020-06-24 23:21:53 +00:00
|
|
|
return fmt.Errorf("creating ttydev devtmpfs files: %w", err)
|
|
|
|
}
|
2020-08-06 18:01:13 +00:00
|
|
|
if tunSupported {
|
|
|
|
if err := tundev.CreateDevtmpfsFiles(ctx, a); err != nil {
|
|
|
|
return fmt.Errorf("creating tundev devtmpfs files: %v", err)
|
|
|
|
}
|
2020-06-24 01:47:22 +00:00
|
|
|
}
|
2020-07-08 01:48:25 +00:00
|
|
|
|
|
|
|
if kernel.FUSEEnabled {
|
|
|
|
if err := fuse.CreateDevtmpfsFile(ctx, a); err != nil {
|
|
|
|
return fmt.Errorf("creating fusedev devtmpfs files: %w", err)
|
|
|
|
}
|
2020-06-17 20:23:27 +00:00
|
|
|
}
|
2020-06-09 16:35:39 +00:00
|
|
|
|
2020-04-17 17:38:04 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2020-08-20 01:35:35 +00:00
|
|
|
func setupContainerVFS2(ctx context.Context, conf *config.Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
|
2020-04-17 17:38:04 +00:00
|
|
|
mns, err := mntr.setupVFS2(ctx, conf, procArgs)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("failed to setupFS: %w", err)
|
|
|
|
}
|
|
|
|
procArgs.MountNamespaceVFS2 = mns
|
|
|
|
|
2020-05-29 23:33:50 +00:00
|
|
|
// Resolve the executable path from working dir and environment.
|
2020-06-09 06:06:50 +00:00
|
|
|
resolved, err := user.ResolveExecutablePath(ctx, procArgs)
|
2020-05-29 23:33:50 +00:00
|
|
|
if err != nil {
|
2020-06-09 06:06:50 +00:00
|
|
|
return err
|
2020-04-17 17:38:04 +00:00
|
|
|
}
|
2020-06-09 06:06:50 +00:00
|
|
|
procArgs.Filename = resolved
|
2020-05-29 23:33:50 +00:00
|
|
|
return nil
|
2020-04-17 17:38:04 +00:00
|
|
|
}
|
|
|
|
|
2020-08-20 01:35:35 +00:00
|
|
|
// setupVFS2 creates the container's root mount namespace and mounts all
// submounts into it, returning the namespace on success.
func (c *containerMounter) setupVFS2(ctx context.Context, conf *config.Config, procArgs *kernel.CreateProcessArgs) (*vfs.MountNamespace, error) {
	log.Infof("Configuring container's file system with VFS2")

	// Create context with root credentials to mount the filesystem (the current
	// user may not be privileged enough).
	rootCreds := auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
	// Copy procArgs so the caller's args are not mutated by the root-privileged
	// overrides below.
	rootProcArgs := *procArgs
	rootProcArgs.WorkingDirectory = "/"
	rootProcArgs.Credentials = rootCreds
	rootProcArgs.Umask = 0022
	rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
	rootCtx := procArgs.NewContext(c.k)

	mns, err := c.createMountNamespaceVFS2(rootCtx, conf, rootCreds)
	if err != nil {
		return nil, fmt.Errorf("creating mount namespace: %w", err)
	}
	rootProcArgs.MountNamespaceVFS2 = mns

	// Mount submounts.
	if err := c.mountSubmountsVFS2(rootCtx, conf, mns, rootCreds); err != nil {
		return nil, fmt.Errorf("mounting submounts vfs2: %w", err)
	}
	return mns, nil
}
|
|
|
|
|
2020-08-20 01:35:35 +00:00
|
|
|
// createMountNamespaceVFS2 creates the container's mount namespace with the
// root mounted over 9P, using the next gofer connection FD from the dispenser.
func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
	// The root's gofer FD is the first one dispensed; see prepareMountsVFS2
	// for the ordering assumption.
	fd := c.fds.remove()
	opts := p9MountData(fd, conf.FileAccess, true /* vfs2 */)

	if conf.OverlayfsStaleRead {
		// We can't check for overlayfs here because sandbox is chroot'ed and gofer
		// can only send mount options for specs.Mounts (specs.Root is missing
		// Options field). So assume root is always on top of overlayfs.
		opts = append(opts, "overlayfs_stale_read")
	}

	log.Infof("Mounting root over 9P, ioFD: %d", fd)
	mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "" /* source */, gofer.Name, &vfs.GetFilesystemOptions{
		Data: strings.Join(opts, ","),
	})
	if err != nil {
		return nil, fmt.Errorf("setting up mount namespace: %w", err)
	}
	return mns, nil
}
|
|
|
|
|
2020-08-20 01:35:35 +00:00
|
|
|
func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials) error {
|
2020-05-14 01:16:45 +00:00
|
|
|
mounts, err := c.prepareMountsVFS2()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2020-04-25 01:22:21 +00:00
|
|
|
|
2020-05-14 01:16:45 +00:00
|
|
|
for i := range mounts {
|
|
|
|
submount := &mounts[i]
|
2020-04-17 17:38:04 +00:00
|
|
|
log.Debugf("Mounting %q to %q, type: %s, options: %s", submount.Source, submount.Destination, submount.Type, submount.Options)
|
2020-08-21 21:28:27 +00:00
|
|
|
var (
|
|
|
|
mnt *vfs.Mount
|
|
|
|
err error
|
|
|
|
)
|
|
|
|
|
2020-07-09 00:10:35 +00:00
|
|
|
if hint := c.hints.findMount(submount.Mount); hint != nil && hint.isSupported() {
|
2020-08-21 21:28:27 +00:00
|
|
|
mnt, err = c.mountSharedSubmountVFS2(ctx, conf, mns, creds, submount.Mount, hint)
|
|
|
|
if err != nil {
|
2020-07-09 00:10:35 +00:00
|
|
|
return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, submount.Destination, err)
|
|
|
|
}
|
|
|
|
} else {
|
2020-08-21 21:28:27 +00:00
|
|
|
mnt, err = c.mountSubmountVFS2(ctx, conf, mns, creds, submount)
|
|
|
|
if err != nil {
|
2020-07-09 00:10:35 +00:00
|
|
|
return fmt.Errorf("mount submount %q: %w", submount.Destination, err)
|
|
|
|
}
|
2020-04-17 17:38:04 +00:00
|
|
|
}
|
2020-08-21 21:28:27 +00:00
|
|
|
|
|
|
|
if mnt != nil && mnt.ReadOnly() {
|
|
|
|
// Switch to ReadWrite while we setup submounts.
|
|
|
|
if err := c.k.VFS().SetMountReadOnly(mnt, false); err != nil {
|
|
|
|
return fmt.Errorf("failed to set mount at %q readwrite: %v", submount.Destination, err)
|
|
|
|
}
|
|
|
|
defer func() {
|
|
|
|
if err := c.k.VFS().SetMountReadOnly(mnt, true); err != nil {
|
|
|
|
panic(fmt.Sprintf("failed to restore mount at %q back to readonly: %v", submount.Destination, err))
|
|
|
|
}
|
|
|
|
}()
|
|
|
|
}
|
2020-04-17 17:38:04 +00:00
|
|
|
}
|
|
|
|
|
2020-06-02 04:30:28 +00:00
|
|
|
if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil {
|
|
|
|
return fmt.Errorf(`mount submount "\tmp": %w`, err)
|
|
|
|
}
|
2020-05-14 01:16:45 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// mountAndFD pairs a spec mount with the host FD dispensed for it. fd is -1
// for mounts that do not use a host FD; see prepareMountsVFS2.
type mountAndFD struct {
	specs.Mount
	// fd is the gofer connection FD for bind mounts, or -1 otherwise.
	fd int
}
|
|
|
|
|
2020-05-14 01:16:45 +00:00
|
|
|
func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
|
|
|
|
// Associate bind mounts with their FDs before sorting since there is an
|
|
|
|
// undocumented assumption that FDs are dispensed in the order in which
|
|
|
|
// they are required by mounts.
|
|
|
|
var mounts []mountAndFD
|
|
|
|
for _, m := range c.mounts {
|
|
|
|
fd := -1
|
|
|
|
// Only bind mounts use host FDs; see
|
|
|
|
// containerMounter.getMountNameAndOptionsVFS2.
|
2020-05-28 19:24:37 +00:00
|
|
|
if m.Type == bind {
|
2020-05-14 01:16:45 +00:00
|
|
|
fd = c.fds.remove()
|
|
|
|
}
|
|
|
|
mounts = append(mounts, mountAndFD{
|
|
|
|
Mount: m,
|
|
|
|
fd: fd,
|
|
|
|
})
|
|
|
|
}
|
|
|
|
if err := c.checkDispenser(); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2020-04-25 01:22:21 +00:00
|
|
|
// Sort the mounts so that we don't place children before parents.
|
2020-05-14 01:16:45 +00:00
|
|
|
sort.Slice(mounts, func(i, j int) bool {
|
|
|
|
return len(mounts[i].Destination) < len(mounts[j].Destination)
|
|
|
|
})
|
|
|
|
|
|
|
|
return mounts, nil
|
2020-04-25 01:22:21 +00:00
|
|
|
}
|
|
|
|
|
2020-08-21 21:28:27 +00:00
|
|
|
// mountSubmountVFS2 mounts a single submount at its destination inside mns.
// It returns the new mount, or (nil, nil) if the submount's filesystem type
// is not supported and was skipped.
func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) (*vfs.Mount, error) {
	root := mns.Root()
	defer root.DecRef(ctx)
	target := &vfs.PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse(submount.Destination),
	}
	fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
	if err != nil {
		return nil, fmt.Errorf("mountOptions failed: %w", err)
	}
	if len(fsName) == 0 {
		// Filesystem is not supported (e.g. cgroup), just skip it.
		return nil, nil
	}

	// Create the mount point (and any missing ancestors) before mounting.
	if err := c.k.VFS().MakeSyntheticMountpoint(ctx, submount.Destination, root, creds); err != nil {
		return nil, err
	}
	mnt, err := c.k.VFS().MountAt(ctx, creds, "" /* source */, target, fsName, opts)
	if err != nil {
		return nil, fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
	}
	log.Infof("Mounted %q to %q type: %s, internal-options: %q", submount.Source, submount.Destination, submount.Type, opts.GetFilesystemOptions.Data)
	return mnt, nil
}
|
|
|
|
|
2020-05-14 01:16:45 +00:00
|
|
|
// getMountNameAndOptionsVFS2 maps a spec mount to the filesystem name and
// mount options used to mount it. It returns an empty fsName when the mount
// type is not supported and should be skipped.
func (c *containerMounter) getMountNameAndOptionsVFS2(conf *config.Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
	fsName := m.Type
	var data []string

	// Find filesystem name and FS specific data field.
	switch m.Type {
	case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
		// Nothing to do.

	case nonefs:
		// "none" mounts are treated as sysfs.
		fsName = sys.Name

	case tmpfs.Name:
		var err error
		data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
		if err != nil {
			return "", nil, err
		}

	case bind:
		fsName = gofer.Name
		if m.fd == 0 {
			// Check that an FD was provided to fail fast. Technically FD=0 is valid,
			// but unlikely to be correct in this context.
			return "", nil, fmt.Errorf("9P mount requires a connection FD")
		}
		data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)

	default:
		log.Warningf("ignoring unknown filesystem type %q", m.Type)
		return "", nil, nil
	}

	opts := &vfs.MountOptions{
		GetFilesystemOptions: vfs.GetFilesystemOptions{
			Data: strings.Join(data, ","),
		},
		InternalMount: true,
	}

	// Translate the generic mount flags that the sentry understands; others
	// are logged and dropped.
	for _, o := range m.Options {
		switch o {
		case "rw":
			opts.ReadOnly = false
		case "ro":
			opts.ReadOnly = true
		case "noatime":
			opts.Flags.NoATime = true
		case "noexec":
			opts.Flags.NoExec = true
		default:
			log.Warningf("ignoring unknown mount option %q", o)
		}
	}

	if conf.Overlay {
		// All writes go to upper, be paranoid and make lower readonly.
		opts.ReadOnly = true
	}
	return fsName, opts, nil
}
|
|
|
|
|
2020-06-02 04:30:28 +00:00
|
|
|
// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
// the host /tmp, but this is a nice optimization, and fixes some apps that call
// mknod in /tmp. It's unsafe to mount tmpfs if:
//   1. /tmp is mounted explicitly: we should not override user's wish
//   2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
//
// Note that when there are submounts inside of '/tmp', directories for the
// mount points must be present, making '/tmp' not empty anymore.
func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *config.Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
	for _, m := range c.mounts {
		// m.Destination has been cleaned, so it's safe to use equality here.
		if m.Destination == "/tmp" {
			log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
			return nil
		}
	}

	root := mns.Root()
	defer root.DecRef(ctx)
	pop := vfs.PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse("/tmp"),
	}
	// TODO(gvisor.dev/issue/2782): Use O_PATH when available.
	fd, err := c.k.VFS().OpenAt(ctx, creds, &pop, &vfs.OpenOptions{Flags: linux.O_RDONLY | linux.O_DIRECTORY})
	switch err {
	case nil:
		// "/tmp" exists; mount tmpfs over it only if it is empty.
		defer fd.DecRef(ctx)

		// Any entry other than "." and ".." means the directory is not empty.
		err := fd.IterDirents(ctx, vfs.IterDirentsCallbackFunc(func(dirent vfs.Dirent) error {
			if dirent.Name != "." && dirent.Name != ".." {
				return syserror.ENOTEMPTY
			}
			return nil
		}))
		switch err {
		case nil:
			log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
		case syserror.ENOTEMPTY:
			// If more than "." and ".." is found, skip internal tmpfs to prevent
			// hiding existing files.
			log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
			return nil
		default:
			return err
		}
		fallthrough

	case syserror.ENOENT:
		// No '/tmp' found (or fallthrough from above). It's safe to mount internal
		// tmpfs.
		tmpMount := specs.Mount{
			Type:        tmpfs.Name,
			Destination: "/tmp",
			// Sticky bit is added to prevent accidental deletion of files from
			// another user. This is normally done for /tmp.
			Options: []string{"mode=01777"},
		}
		_, err := c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
		return err

	case syserror.ENOTDIR:
		// Not a dir?! Let it be.
		return nil

	default:
		return fmt.Errorf(`opening "/tmp" inside container: %w`, err)
	}
}
|
2020-07-09 00:10:35 +00:00
|
|
|
|
|
|
|
// processHintsVFS2 processes annotations that container hints about how volumes
|
|
|
|
// should be mounted (e.g. a volume shared between containers). It must be
|
|
|
|
// called for the root container only.
|
2020-08-20 01:35:35 +00:00
|
|
|
func (c *containerMounter) processHintsVFS2(conf *config.Config, creds *auth.Credentials) error {
|
2020-07-09 00:10:35 +00:00
|
|
|
ctx := c.k.SupervisorContext()
|
|
|
|
for _, hint := range c.hints.mounts {
|
|
|
|
// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
|
|
|
|
// common gofer to mount all shared volumes.
|
|
|
|
if hint.mount.Type != tmpfs.Name {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
|
|
|
|
mnt, err := c.mountSharedMasterVFS2(ctx, conf, hint, creds)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
|
|
|
|
}
|
|
|
|
hint.vfsMount = mnt
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// mountSharedMasterVFS2 mounts the master of a volume that is shared among
|
|
|
|
// containers in a pod.
|
2020-08-20 01:35:35 +00:00
|
|
|
func (c *containerMounter) mountSharedMasterVFS2(ctx context.Context, conf *config.Config, hint *mountHint, creds *auth.Credentials) (*vfs.Mount, error) {
|
2020-07-09 00:10:35 +00:00
|
|
|
// Map mount type to filesystem name, and parse out the options that we are
|
|
|
|
// capable of dealing with.
|
|
|
|
mntFD := &mountAndFD{Mount: hint.mount}
|
|
|
|
fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, mntFD)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if len(fsName) == 0 {
|
|
|
|
return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
|
|
|
|
}
|
|
|
|
return c.k.VFS().MountDisconnected(ctx, creds, "", fsName, opts)
|
|
|
|
}
|
|
|
|
|
|
|
|
// mountSharedSubmountVFS2 bind mounts to a previously mounted volume that is
// shared among containers in the same pod, attaching a new mount of source's
// filesystem at mount.Destination inside mns.
func (c *containerMounter) mountSharedSubmountVFS2(ctx context.Context, conf *config.Config, mns *vfs.MountNamespace, creds *auth.Credentials, mount specs.Mount, source *mountHint) (*vfs.Mount, error) {
	if err := source.checkCompatible(mount); err != nil {
		return nil, err
	}

	// Only the parsed options are needed; the new mount reuses the master's
	// filesystem, so the fsName is ignored.
	_, opts, err := c.getMountNameAndOptionsVFS2(conf, &mountAndFD{Mount: mount})
	if err != nil {
		return nil, err
	}
	newMnt, err := c.k.VFS().NewDisconnectedMount(source.vfsMount.Filesystem(), source.vfsMount.Root(), opts)
	if err != nil {
		return nil, err
	}
	// ConnectMountAt takes its own reference on success; drop ours either way.
	defer newMnt.DecRef(ctx)

	root := mns.Root()
	defer root.DecRef(ctx)
	// Create the mount point (and any missing ancestors) before connecting.
	if err := c.k.VFS().MakeSyntheticMountpoint(ctx, mount.Destination, root, creds); err != nil {
		return nil, err
	}

	target := &vfs.PathOperation{
		Root:  root,
		Start: root,
		Path:  fspath.Parse(mount.Destination),
	}
	if err := c.k.VFS().ConnectMountAt(ctx, creds, newMnt, target); err != nil {
		return nil, err
	}
	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
	return newMnt, nil
}
|