2019-04-29 21:25:05 +00:00
|
|
|
// Copyright 2018 The gVisor Authors.
|
2018-04-27 17:37:02 +00:00
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
package boot
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"path/filepath"
|
2019-06-11 21:52:06 +00:00
|
|
|
"sort"
|
runsc: Change cache policy for root fs and volume mounts.
Previously, gofer filesystems were configured with the default "fscache"
policy, which caches filesystem metadata and contents aggressively. While this
setting is best for performance, it means that changes from inside the sandbox
may not be immediately propagated outside the sandbox, and vice-versa.
This CL changes volumes and the root fs configuration to use a new
"remote-revalidate" cache policy which tries to retain as much caching as
possible while still making fs changes visible across the sandbox boundary.
This cache policy is enabled by default for the root filesystem. The default
value for the "--file-access" flag is still "proxy", but the behavior is
changed to use the new cache policy.
A new value for the "--file-access" flag is added, called "proxy-exclusive",
which turns on the previous aggressive caching behavior. As the name implies,
this flag should be used when the sandbox has "exclusive" access to the
filesystem.
All volume mounts are configured to use the new cache policy, since it is
safest and most likely to be correct. There is not currently a way to change
this behavior, but it's possible to add such a mechanism in the future. The
configurability is a smaller issue for volumes, since most of the expensive
application fs operations (walking + stating files) will likely be served by the
root fs.
PiperOrigin-RevId: 208735037
Change-Id: Ife048fab1948205f6665df8563434dbc6ca8cfc9
2018-08-14 23:24:46 +00:00
|
|
|
"strconv"
|
2018-04-27 17:37:02 +00:00
|
|
|
"strings"
|
2019-04-17 19:56:23 +00:00
|
|
|
"syscall"
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
// Include filesystem types that OCI spec might mount.
|
2019-06-13 23:49:09 +00:00
|
|
|
_ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
|
|
|
|
_ "gvisor.dev/gvisor/pkg/sentry/fs/host"
|
|
|
|
_ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
|
|
|
|
_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
|
|
|
|
_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
|
|
|
|
_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"
|
2020-07-09 00:10:35 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/vfs"
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
specs "github.com/opencontainers/runtime-spec/specs-go"
|
2019-08-27 17:46:06 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
2020-01-27 23:17:58 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/context"
|
2019-06-13 23:49:09 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/log"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fs"
|
2019-08-27 17:46:06 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
|
2020-05-29 23:33:50 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fs/user"
|
2020-05-13 17:30:00 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fsimpl/devpts"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fsimpl/devtmpfs"
|
|
|
|
gofervfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/gofer"
|
|
|
|
procvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/proc"
|
|
|
|
sysvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/sys"
|
|
|
|
tmpfsvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/tmpfs"
|
2019-06-13 23:49:09 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel"
|
2019-08-27 17:46:06 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
|
2019-06-13 23:49:09 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/syserror"
|
|
|
|
"gvisor.dev/gvisor/runsc/specutils"
|
2018-04-27 17:37:02 +00:00
|
|
|
)
|
|
|
|
|
2018-06-29 21:46:45 +00:00
|
|
|
const (
	// rootDevice is the device name used for the root mount.
	rootDevice = "9pfs-/"

	// MountPrefix is the prefix shared by all mount hint annotations.
	MountPrefix = "dev.gvisor.spec.mount."

	// Mount types accepted in the spec that are mapped onto a different
	// internal filesystem.
	bind   = "bind"
	nonefs = "none"
)
|
|
|
|
|
2020-04-22 06:04:18 +00:00
|
|
|
// tmpfsAllowedData lists the keys of the extra tmpfs mount options that must
// be passed through to the filesystem.
var tmpfsAllowedData = []string{
	"mode",
	"uid",
	"gid",
}
|
2019-10-08 20:34:46 +00:00
|
|
|
|
2019-06-04 01:19:52 +00:00
|
|
|
func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
|
|
|
|
// Upper layer uses the same flags as lower, but it must be read-write.
|
|
|
|
upperFlags := lowerFlags
|
|
|
|
upperFlags.ReadOnly = false
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2019-06-04 01:19:52 +00:00
|
|
|
tmpFS := mustFindFilesystem("tmpfs")
|
|
|
|
if !fs.IsDir(lower.StableAttr) {
|
|
|
|
// Create overlay on top of mount file, e.g. /etc/hostname.
|
2019-06-14 01:39:43 +00:00
|
|
|
msrc := fs.NewCachingMountSource(ctx, tmpFS, upperFlags)
|
2019-06-04 01:19:52 +00:00
|
|
|
return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags)
|
2019-04-17 19:56:23 +00:00
|
|
|
}
|
2018-10-01 17:29:45 +00:00
|
|
|
|
2019-06-04 01:19:52 +00:00
|
|
|
// Create overlay on top of mount dir.
|
|
|
|
upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil)
|
2018-04-27 17:37:02 +00:00
|
|
|
if err != nil {
|
2019-06-04 01:19:52 +00:00
|
|
|
return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
|
2018-05-24 21:27:05 +00:00
|
|
|
}
|
2019-07-25 23:47:32 +00:00
|
|
|
|
|
|
|
// Replicate permissions and owner from lower to upper mount point.
|
|
|
|
attr, err := lower.UnstableAttr(ctx)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("reading attributes from lower mount point: %v", err)
|
|
|
|
}
|
|
|
|
if !upper.InodeOperations.SetPermissions(ctx, upper, attr.Perms) {
|
|
|
|
return nil, fmt.Errorf("error setting permission to upper mount point")
|
|
|
|
}
|
|
|
|
if err := upper.InodeOperations.SetOwner(ctx, upper, attr.Owner); err != nil {
|
|
|
|
return nil, fmt.Errorf("setting owner to upper mount point: %v", err)
|
|
|
|
}
|
|
|
|
|
2019-06-04 01:19:52 +00:00
|
|
|
return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
|
2018-05-24 21:27:05 +00:00
|
|
|
}
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2018-06-29 21:46:45 +00:00
|
|
|
// compileMounts returns the supported mounts from the mount spec, adding any
|
2018-07-03 17:35:27 +00:00
|
|
|
// mandatory mounts that are required by the OCI specification.
|
2018-06-29 21:46:45 +00:00
|
|
|
func compileMounts(spec *specs.Spec) []specs.Mount {
|
2019-01-16 20:47:21 +00:00
|
|
|
// Keep track of whether proc and sys were mounted.
|
|
|
|
var procMounted, sysMounted bool
|
2018-06-29 21:46:45 +00:00
|
|
|
var mounts []specs.Mount
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2018-06-15 20:57:29 +00:00
|
|
|
// Always mount /dev.
|
2018-06-29 21:46:45 +00:00
|
|
|
mounts = append(mounts, specs.Mount{
|
2020-05-13 17:30:00 +00:00
|
|
|
Type: devtmpfs.Name,
|
2018-06-15 20:57:29 +00:00
|
|
|
Destination: "/dev",
|
2018-06-29 21:46:45 +00:00
|
|
|
})
|
2018-06-15 20:57:29 +00:00
|
|
|
|
2018-06-29 21:46:45 +00:00
|
|
|
mounts = append(mounts, specs.Mount{
|
2020-05-13 17:30:00 +00:00
|
|
|
Type: devpts.Name,
|
2018-06-15 20:57:29 +00:00
|
|
|
Destination: "/dev/pts",
|
2018-06-29 21:46:45 +00:00
|
|
|
})
|
2018-06-15 20:57:29 +00:00
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Mount all submounts from the spec.
|
|
|
|
for _, m := range spec.Mounts {
|
2018-06-15 20:57:29 +00:00
|
|
|
if !specutils.IsSupportedDevMount(m) {
|
2018-04-27 17:37:02 +00:00
|
|
|
log.Warningf("ignoring dev mount at %q", m.Destination)
|
|
|
|
continue
|
|
|
|
}
|
2018-06-29 21:46:45 +00:00
|
|
|
mounts = append(mounts, m)
|
2018-06-15 20:57:29 +00:00
|
|
|
switch filepath.Clean(m.Destination) {
|
2018-04-27 17:37:02 +00:00
|
|
|
case "/proc":
|
|
|
|
procMounted = true
|
|
|
|
case "/sys":
|
|
|
|
sysMounted = true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mount proc and sys even if the user did not ask for it, as the spec
|
|
|
|
// says we SHOULD.
|
2018-07-03 17:35:27 +00:00
|
|
|
var mandatoryMounts []specs.Mount
|
2018-04-27 17:37:02 +00:00
|
|
|
if !procMounted {
|
2018-07-03 17:35:27 +00:00
|
|
|
mandatoryMounts = append(mandatoryMounts, specs.Mount{
|
2020-05-13 17:30:00 +00:00
|
|
|
Type: procvfs2.Name,
|
2018-04-27 17:37:02 +00:00
|
|
|
Destination: "/proc",
|
2018-06-29 21:46:45 +00:00
|
|
|
})
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
if !sysMounted {
|
2018-07-03 17:35:27 +00:00
|
|
|
mandatoryMounts = append(mandatoryMounts, specs.Mount{
|
2020-05-13 17:30:00 +00:00
|
|
|
Type: sysvfs2.Name,
|
2018-04-27 17:37:02 +00:00
|
|
|
Destination: "/sys",
|
2018-06-29 21:46:45 +00:00
|
|
|
})
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
2018-07-03 17:35:27 +00:00
|
|
|
// The mandatory mounts should be ordered right after the root, in case
|
|
|
|
// there are submounts of these mandatory mounts already in the spec.
|
|
|
|
mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)
|
|
|
|
|
2018-06-29 21:46:45 +00:00
|
|
|
return mounts
|
|
|
|
}
|
|
|
|
|
2020-06-02 04:30:28 +00:00
|
|
|
// p9MountData creates a slice of p9 mount data.
|
|
|
|
func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
|
2019-06-04 01:19:52 +00:00
|
|
|
opts := []string{
|
|
|
|
"trans=fd",
|
|
|
|
"rfdno=" + strconv.Itoa(fd),
|
|
|
|
"wfdno=" + strconv.Itoa(fd),
|
2020-05-13 17:30:00 +00:00
|
|
|
}
|
|
|
|
if !vfs2 {
|
|
|
|
// privateunixsocket is always enabled in VFS2. VFS1 requires explicit
|
|
|
|
// enablement.
|
|
|
|
opts = append(opts, "privateunixsocket=true")
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
|
|
|
if fa == FileAccessShared {
|
|
|
|
opts = append(opts, "cache=remote_revalidating")
|
|
|
|
}
|
|
|
|
return opts
|
|
|
|
}
|
|
|
|
|
|
|
|
// parseAndFilterOptions parses a MountOptions slice and filters by the allowed
|
|
|
|
// keys.
|
|
|
|
func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
|
|
|
|
var out []string
|
|
|
|
for _, o := range opts {
|
2019-10-08 20:34:46 +00:00
|
|
|
ok, err := parseMountOption(o, allowedKeys...)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if ok {
|
|
|
|
out = append(out, o)
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
return out, nil
|
|
|
|
}
|
|
|
|
|
2019-10-08 20:34:46 +00:00
|
|
|
func parseMountOption(opt string, allowedKeys ...string) (bool, error) {
|
|
|
|
kv := strings.SplitN(opt, "=", 3)
|
|
|
|
if len(kv) > 2 {
|
|
|
|
return false, fmt.Errorf("invalid option %q", opt)
|
|
|
|
}
|
|
|
|
return specutils.ContainsStr(allowedKeys, kv[0]), nil
|
|
|
|
}
|
|
|
|
|
2019-06-04 01:19:52 +00:00
|
|
|
// mountDevice returns a device string based on the fs type and target
|
|
|
|
// of the mount.
|
|
|
|
func mountDevice(m specs.Mount) string {
|
|
|
|
if m.Type == bind {
|
|
|
|
// Make a device string that includes the target, which is consistent across
|
|
|
|
// S/R and uniquely identifies the connection.
|
|
|
|
return "9pfs-" + m.Destination
|
|
|
|
}
|
|
|
|
// All other fs types use device "none".
|
|
|
|
return "none"
|
|
|
|
}
|
|
|
|
|
|
|
|
func mountFlags(opts []string) fs.MountSourceFlags {
|
|
|
|
mf := fs.MountSourceFlags{}
|
2019-10-08 20:34:46 +00:00
|
|
|
// Note: changes to supported options must be reflected in
|
|
|
|
// isSupportedMountFlag() as well.
|
2019-06-04 01:19:52 +00:00
|
|
|
for _, o := range opts {
|
|
|
|
switch o {
|
|
|
|
case "rw":
|
|
|
|
mf.ReadOnly = false
|
|
|
|
case "ro":
|
|
|
|
mf.ReadOnly = true
|
|
|
|
case "noatime":
|
|
|
|
mf.NoAtime = true
|
|
|
|
case "noexec":
|
|
|
|
mf.NoExec = true
|
|
|
|
default:
|
|
|
|
log.Warningf("ignoring unknown mount option %q", o)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return mf
|
|
|
|
}
|
|
|
|
|
2019-10-08 20:34:46 +00:00
|
|
|
func isSupportedMountFlag(fstype, opt string) bool {
|
|
|
|
switch opt {
|
|
|
|
case "rw", "ro", "noatime", "noexec":
|
|
|
|
return true
|
|
|
|
}
|
2020-05-13 17:30:00 +00:00
|
|
|
if fstype == tmpfsvfs2.Name {
|
2020-06-02 04:30:28 +00:00
|
|
|
ok, err := parseMountOption(opt, tmpfsAllowedData...)
|
2019-10-08 20:34:46 +00:00
|
|
|
return ok && err == nil
|
|
|
|
}
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
|
2019-06-04 01:19:52 +00:00
|
|
|
func mustFindFilesystem(name string) fs.Filesystem {
|
|
|
|
fs, ok := fs.FindFilesystem(name)
|
|
|
|
if !ok {
|
|
|
|
panic(fmt.Sprintf("could not find filesystem %q", name))
|
|
|
|
}
|
|
|
|
return fs
|
|
|
|
}
|
|
|
|
|
|
|
|
// addSubmountOverlay overlays the inode over a ramfs tree containing the given
|
|
|
|
// paths.
|
|
|
|
func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
|
2019-06-27 21:22:40 +00:00
|
|
|
// Construct a ramfs tree of mount points. The contents never
|
|
|
|
// change, so this can be fully caching. There's no real
|
|
|
|
// filesystem backing this tree, so we set the filesystem to
|
|
|
|
// nil.
|
|
|
|
msrc := fs.NewCachingMountSource(ctx, nil, fs.MountSourceFlags{})
|
2019-06-04 01:19:52 +00:00
|
|
|
mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("creating mount tree: %v", err)
|
|
|
|
}
|
|
|
|
overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("adding mount overlay: %v", err)
|
|
|
|
}
|
|
|
|
return overlayInode, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// subtargets takes a set of Mounts and returns only the targets that are
|
|
|
|
// children of the given root. The returned paths are relative to the root.
|
|
|
|
func subtargets(root string, mnts []specs.Mount) []string {
|
|
|
|
var targets []string
|
|
|
|
for _, mnt := range mnts {
|
|
|
|
if relPath, isSubpath := fs.IsSubpath(mnt.Destination, root); isSubpath {
|
|
|
|
targets = append(targets, relPath)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return targets
|
|
|
|
}
|
|
|
|
|
2019-08-27 17:46:06 +00:00
|
|
|
func setupContainerFS(ctx context.Context, conf *Config, mntr *containerMounter, procArgs *kernel.CreateProcessArgs) error {
|
2020-04-17 17:38:04 +00:00
|
|
|
if conf.VFS2 {
|
|
|
|
return setupContainerVFS2(ctx, conf, mntr, procArgs)
|
|
|
|
}
|
2019-08-27 17:46:06 +00:00
|
|
|
mns, err := mntr.setupFS(conf, procArgs)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Set namespace here so that it can be found in ctx.
|
|
|
|
procArgs.MountNamespace = mns
|
|
|
|
|
2020-05-29 23:33:50 +00:00
|
|
|
// Resolve the executable path from working dir and environment.
|
2020-06-09 06:06:50 +00:00
|
|
|
resolved, err := user.ResolveExecutablePath(ctx, procArgs)
|
2019-06-04 01:19:52 +00:00
|
|
|
if err != nil {
|
2020-06-09 06:06:50 +00:00
|
|
|
return err
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
2020-06-09 06:06:50 +00:00
|
|
|
procArgs.Filename = resolved
|
2019-06-04 01:19:52 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func adjustDirentCache(k *kernel.Kernel) error {
|
|
|
|
var hl syscall.Rlimit
|
|
|
|
if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
|
|
|
|
return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
|
|
|
|
}
|
|
|
|
if int64(hl.Cur) != syscall.RLIM_INFINITY {
|
|
|
|
newSize := hl.Cur / 2
|
|
|
|
if newSize < gofer.DefaultDirentCacheSize {
|
|
|
|
log.Infof("Setting gofer dirent cache size to %d", newSize)
|
|
|
|
gofer.DefaultDirentCacheSize = newSize
|
|
|
|
k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// fdDispenser hands out file descriptors from a list, in order.
type fdDispenser struct {
	// fds holds the descriptors that have not been handed out yet.
	fds []int
}

// remove returns the next FD and drops it from the dispenser. It panics when
// no FDs are left.
func (f *fdDispenser) remove() int {
	if f.empty() {
		panic("fdDispenser out of fds")
	}
	next, rest := f.fds[0], f.fds[1:]
	f.fds = rest
	return next
}

// empty reports whether every FD has been handed out.
func (f *fdDispenser) empty() bool {
	return len(f.fds) == 0
}
|
|
|
|
|
2019-06-11 21:52:06 +00:00
|
|
|
// shareType describes how widely a mount is shared.
type shareType int

const (
	// invalid is the zero value; it marks a share type that was never set.
	invalid shareType = iota

	// container shareType indicates that the mount is used by a single
	// container.
	container

	// pod shareType indicates that the mount is used by more than one
	// container inside the pod.
	pod

	// shared shareType indicates that the mount can also be shared with a
	// process outside the pod, e.g. NFS.
	shared
)

// parseShare converts the annotation value val into a shareType. Only
// "container", "pod" and "shared" are accepted; anything else is an error.
func parseShare(val string) (shareType, error) {
	for _, candidate := range []shareType{container, pod, shared} {
		if candidate.String() == val {
			return candidate, nil
		}
	}
	return 0, fmt.Errorf("invalid share value %q", val)
}

// String returns the name of s, or a message carrying the numeric value when
// s is not one of the declared share types.
func (s shareType) String() string {
	// Indexed by the constant values declared above.
	names := [...]string{"invalid", "container", "pod", "shared"}
	if s < 0 || int(s) >= len(names) {
		return fmt.Sprintf("invalid share value %d", s)
	}
	return names[s]
}
|
|
|
|
|
|
|
|
// mountHint represents extra information about mounts that are provided via
|
|
|
|
// annotations. They can override mount type, and provide sharing information
|
|
|
|
// so that mounts can be correctly shared inside the pod.
|
|
|
|
type mountHint struct {
|
|
|
|
name string
|
|
|
|
share shareType
|
|
|
|
mount specs.Mount
|
|
|
|
|
|
|
|
// root is the inode where the volume is mounted. For mounts with 'pod' share
|
|
|
|
// the volume is mounted once and then bind mounted inside the containers.
|
|
|
|
root *fs.Inode
|
2020-07-09 00:10:35 +00:00
|
|
|
|
|
|
|
// vfsMount is the master mount for the volume. For mounts with 'pod' share
|
|
|
|
// the master volume is bind mounted inside the containers.
|
|
|
|
vfsMount *vfs.Mount
|
2019-06-11 21:52:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (m *mountHint) setField(key, val string) error {
|
|
|
|
switch key {
|
|
|
|
case "source":
|
|
|
|
if len(val) == 0 {
|
|
|
|
return fmt.Errorf("source cannot be empty")
|
|
|
|
}
|
|
|
|
m.mount.Source = val
|
|
|
|
case "type":
|
|
|
|
return m.setType(val)
|
|
|
|
case "share":
|
|
|
|
share, err := parseShare(val)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
m.share = share
|
|
|
|
case "options":
|
|
|
|
return m.setOptions(val)
|
|
|
|
default:
|
|
|
|
return fmt.Errorf("invalid mount annotation: %s=%s", key, val)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (m *mountHint) setType(val string) error {
|
|
|
|
switch val {
|
|
|
|
case "tmpfs", "bind":
|
|
|
|
m.mount.Type = val
|
|
|
|
default:
|
|
|
|
return fmt.Errorf("invalid type %q", val)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (m *mountHint) setOptions(val string) error {
|
|
|
|
opts := strings.Split(val, ",")
|
|
|
|
if err := specutils.ValidateMountOptions(opts); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
// Sort options so it can be compared with container mount options later on.
|
|
|
|
sort.Strings(opts)
|
|
|
|
m.mount.Options = opts
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (m *mountHint) isSupported() bool {
|
2020-05-13 17:30:00 +00:00
|
|
|
return m.mount.Type == tmpfsvfs2.Name && m.share == pod
|
2019-06-11 21:52:06 +00:00
|
|
|
}
|
|
|
|
|
2019-10-08 20:34:46 +00:00
|
|
|
// checkCompatible verifies that shared mount is compatible with master.
|
|
|
|
// For now enforce that all options are the same. Once bind mount is properly
|
|
|
|
// supported, then we should ensure the master is less restrictive than the
|
|
|
|
// container, e.g. master can be 'rw' while container mounts as 'ro'.
|
|
|
|
func (m *mountHint) checkCompatible(mount specs.Mount) error {
|
|
|
|
// Remove options that don't affect to mount's behavior.
|
|
|
|
masterOpts := filterUnsupportedOptions(m.mount)
|
|
|
|
slaveOpts := filterUnsupportedOptions(mount)
|
|
|
|
|
|
|
|
if len(masterOpts) != len(slaveOpts) {
|
|
|
|
return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
|
|
|
|
}
|
|
|
|
|
|
|
|
sort.Strings(masterOpts)
|
|
|
|
sort.Strings(slaveOpts)
|
|
|
|
for i, opt := range masterOpts {
|
|
|
|
if opt != slaveOpts[i] {
|
|
|
|
return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", masterOpts, slaveOpts)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-11-25 19:41:39 +00:00
|
|
|
func (m *mountHint) fileAccessType() FileAccessType {
|
|
|
|
if m.share == container {
|
|
|
|
return FileAccessExclusive
|
|
|
|
}
|
|
|
|
return FileAccessShared
|
|
|
|
}
|
|
|
|
|
2019-10-08 20:34:46 +00:00
|
|
|
func filterUnsupportedOptions(mount specs.Mount) []string {
|
|
|
|
rv := make([]string, 0, len(mount.Options))
|
|
|
|
for _, o := range mount.Options {
|
|
|
|
if isSupportedMountFlag(mount.Type, o) {
|
|
|
|
rv = append(rv, o)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return rv
|
|
|
|
}
|
|
|
|
|
2019-06-11 21:52:06 +00:00
|
|
|
// podMountHints contains a collection of mountHints for the pod.
|
|
|
|
type podMountHints struct {
|
|
|
|
mounts map[string]*mountHint
|
|
|
|
}
|
|
|
|
|
|
|
|
func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
|
|
|
|
mnts := make(map[string]*mountHint)
|
|
|
|
for k, v := range spec.Annotations {
|
2019-12-06 21:50:12 +00:00
|
|
|
// Look for 'dev.gvisor.spec.mount' annotations and parse them.
|
2019-06-11 21:52:06 +00:00
|
|
|
if strings.HasPrefix(k, MountPrefix) {
|
2019-12-06 21:50:12 +00:00
|
|
|
// Remove the prefix and split the rest.
|
|
|
|
parts := strings.Split(k[len(MountPrefix):], ".")
|
|
|
|
if len(parts) != 2 {
|
2019-06-11 21:52:06 +00:00
|
|
|
return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
|
|
|
|
}
|
2019-12-06 21:50:12 +00:00
|
|
|
name := parts[0]
|
|
|
|
if len(name) == 0 {
|
2019-06-11 21:52:06 +00:00
|
|
|
return nil, fmt.Errorf("invalid mount name: %s", name)
|
|
|
|
}
|
|
|
|
mnt := mnts[name]
|
|
|
|
if mnt == nil {
|
|
|
|
mnt = &mountHint{name: name}
|
|
|
|
mnts[name] = mnt
|
|
|
|
}
|
2019-12-06 21:50:12 +00:00
|
|
|
if err := mnt.setField(parts[1], v); err != nil {
|
2019-06-11 21:52:06 +00:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Validate all hints after done parsing.
|
|
|
|
for name, m := range mnts {
|
|
|
|
log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share)
|
|
|
|
if m.share == invalid {
|
|
|
|
return nil, fmt.Errorf("share field for %q has not been set", m.name)
|
|
|
|
}
|
|
|
|
if len(m.mount.Source) == 0 {
|
|
|
|
return nil, fmt.Errorf("source field for %q has not been set", m.name)
|
|
|
|
}
|
|
|
|
if len(m.mount.Type) == 0 {
|
|
|
|
return nil, fmt.Errorf("type field for %q has not been set", m.name)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Check for duplicate mount sources.
|
|
|
|
for name2, m2 := range mnts {
|
|
|
|
if name != name2 && m.mount.Source == m2.mount.Source {
|
|
|
|
return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return &podMountHints{mounts: mnts}, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (p *podMountHints) findMount(mount specs.Mount) *mountHint {
|
|
|
|
for _, m := range p.mounts {
|
|
|
|
if m.mount.Source == mount.Source {
|
|
|
|
return m
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-06-04 01:19:52 +00:00
|
|
|
type containerMounter struct {
|
|
|
|
root *specs.Root
|
|
|
|
|
|
|
|
// mounts is the set of submounts for the container. It's a copy from the spec
|
|
|
|
// that may be freely modified without affecting the original spec.
|
|
|
|
mounts []specs.Mount
|
|
|
|
|
|
|
|
// fds is the list of FDs to be dispensed for mounts that require it.
|
|
|
|
fds fdDispenser
|
|
|
|
|
|
|
|
k *kernel.Kernel
|
2019-06-11 21:52:06 +00:00
|
|
|
|
|
|
|
hints *podMountHints
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
|
|
|
|
2019-08-02 20:46:42 +00:00
|
|
|
func newContainerMounter(spec *specs.Spec, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
|
2019-06-04 01:19:52 +00:00
|
|
|
return &containerMounter{
|
|
|
|
root: spec.Root,
|
|
|
|
mounts: compileMounts(spec),
|
|
|
|
fds: fdDispenser{fds: goferFDs},
|
|
|
|
k: k,
|
2019-06-11 21:52:06 +00:00
|
|
|
hints: hints,
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-08-27 17:46:06 +00:00
|
|
|
// processHints processes annotations that container hints about how volumes
|
|
|
|
// should be mounted (e.g. a volume shared between containers). It must be
|
|
|
|
// called for the root container only.
|
2020-07-09 00:10:35 +00:00
|
|
|
func (c *containerMounter) processHints(conf *Config, creds *auth.Credentials) error {
|
2020-04-17 17:38:04 +00:00
|
|
|
if conf.VFS2 {
|
2020-07-09 00:10:35 +00:00
|
|
|
return c.processHintsVFS2(conf, creds)
|
2020-04-17 17:38:04 +00:00
|
|
|
}
|
2019-08-27 17:46:06 +00:00
|
|
|
ctx := c.k.SupervisorContext()
|
|
|
|
for _, hint := range c.hints.mounts {
|
2019-12-06 21:50:12 +00:00
|
|
|
// TODO(b/142076984): Only support tmpfs for now. Bind mounts require a
|
|
|
|
// common gofer to mount all shared volumes.
|
2020-05-13 17:30:00 +00:00
|
|
|
if hint.mount.Type != tmpfsvfs2.Name {
|
2019-12-06 21:50:12 +00:00
|
|
|
continue
|
|
|
|
}
|
2019-08-27 17:46:06 +00:00
|
|
|
log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
|
|
|
|
inode, err := c.mountSharedMaster(ctx, conf, hint)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
|
|
|
|
}
|
|
|
|
hint.root = inode
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
2019-08-27 17:46:06 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// setupFS is used to set up the file system for all containers. This is the
|
|
|
|
// main entry point method, with most of the other being internal only. It
|
|
|
|
// returns the mount namespace that is created for the container.
|
|
|
|
func (c *containerMounter) setupFS(conf *Config, procArgs *kernel.CreateProcessArgs) (*fs.MountNamespace, error) {
|
|
|
|
log.Infof("Configuring container's file system")
|
|
|
|
|
|
|
|
// Create context with root credentials to mount the filesystem (the current
|
|
|
|
// user may not be privileged enough).
|
|
|
|
rootProcArgs := *procArgs
|
|
|
|
rootProcArgs.WorkingDirectory = "/"
|
|
|
|
rootProcArgs.Credentials = auth.NewRootCredentials(procArgs.Credentials.UserNamespace)
|
|
|
|
rootProcArgs.Umask = 0022
|
|
|
|
rootProcArgs.MaxSymlinkTraversals = linux.MaxSymlinkTraversals
|
|
|
|
rootCtx := rootProcArgs.NewContext(c.k)
|
|
|
|
|
|
|
|
mns, err := c.createMountNamespace(rootCtx, conf)
|
2019-06-04 01:19:52 +00:00
|
|
|
if err != nil {
|
2019-08-27 17:46:06 +00:00
|
|
|
return nil, err
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
|
|
|
|
2019-08-27 17:46:06 +00:00
|
|
|
// Set namespace here so that it can be found in rootCtx.
|
|
|
|
rootProcArgs.MountNamespace = mns
|
|
|
|
|
|
|
|
if err := c.mountSubmounts(rootCtx, conf, mns); err != nil {
|
|
|
|
return nil, err
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
2019-08-27 17:46:06 +00:00
|
|
|
return mns, nil
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
|
|
|
|
2019-08-27 17:46:06 +00:00
|
|
|
func (c *containerMounter) createMountNamespace(ctx context.Context, conf *Config) (*fs.MountNamespace, error) {
|
|
|
|
rootInode, err := c.createRootMount(ctx, conf)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("creating filesystem for container: %v", err)
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
2019-08-27 17:46:06 +00:00
|
|
|
mns, err := fs.NewMountNamespace(ctx, rootInode)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("creating new mount namespace for container: %v", err)
|
|
|
|
}
|
|
|
|
return mns, nil
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
|
|
|
|
2019-08-27 17:46:06 +00:00
|
|
|
func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace) error {
|
|
|
|
root := mns.Root()
|
|
|
|
defer root.DecRef()
|
|
|
|
|
|
|
|
for _, m := range c.mounts {
|
|
|
|
log.Debugf("Mounting %q to %q, type: %s, options: %s", m.Source, m.Destination, m.Type, m.Options)
|
|
|
|
if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
|
|
|
|
if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
|
|
|
|
return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
|
|
|
|
return fmt.Errorf("mount submount %q: %v", m.Destination, err)
|
|
|
|
}
|
2019-06-11 21:52:06 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-08-27 17:46:06 +00:00
|
|
|
if err := c.mountTmp(ctx, conf, mns, root); err != nil {
|
|
|
|
return fmt.Errorf("mount submount %q: %v", "tmp", err)
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
2019-08-27 17:46:06 +00:00
|
|
|
|
|
|
|
if err := c.checkDispenser(); err != nil {
|
|
|
|
return err
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
2019-08-27 17:46:06 +00:00
|
|
|
return nil
|
|
|
|
}
|
2019-06-04 01:19:52 +00:00
|
|
|
|
2019-08-27 17:46:06 +00:00
|
|
|
func (c *containerMounter) checkDispenser() error {
|
|
|
|
if !c.fds.empty() {
|
|
|
|
return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
|
2019-08-02 18:21:50 +00:00
|
|
|
}
|
2019-08-27 17:46:06 +00:00
|
|
|
return nil
|
2019-06-04 01:19:52 +00:00
|
|
|
}
|
|
|
|
|
2019-06-11 21:52:06 +00:00
|
|
|
// mountSharedMaster mounts the master of a volume that is shared among
|
|
|
|
// containers in a pod. It returns the root mount's inode.
|
|
|
|
func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
|
|
|
|
// Map mount type to filesystem name, and parse out the options that we are
|
|
|
|
// capable of dealing with.
|
|
|
|
fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
if len(fsName) == 0 {
|
|
|
|
return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Mount with revalidate because it's shared among containers.
|
|
|
|
opts = append(opts, "cache=revalidate")
|
|
|
|
|
|
|
|
// All filesystem names should have been mapped to something we know.
|
|
|
|
filesystem := mustFindFilesystem(fsName)
|
|
|
|
|
|
|
|
mf := mountFlags(hint.mount.Options)
|
|
|
|
if useOverlay {
|
|
|
|
// All writes go to upper, be paranoid and make lower readonly.
|
|
|
|
mf.ReadOnly = true
|
|
|
|
}
|
|
|
|
|
|
|
|
inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("creating mount %q: %v", hint.name, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
if useOverlay {
|
|
|
|
log.Debugf("Adding overlay on top of shared mount %q", hint.name)
|
|
|
|
inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return inode, nil
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// createRootMount creates the root filesystem.
|
2019-06-04 01:19:52 +00:00
|
|
|
func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
|
2018-04-27 17:37:02 +00:00
|
|
|
// First construct the filesystem from the spec.Root.
|
2019-06-04 01:19:52 +00:00
|
|
|
mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2019-06-04 01:19:52 +00:00
|
|
|
fd := c.fds.remove()
|
2018-09-25 00:21:16 +00:00
|
|
|
log.Infof("Mounting root over 9P, ioFD: %d", fd)
|
2018-10-18 19:41:07 +00:00
|
|
|
p9FS := mustFindFilesystem("9p")
|
2020-06-02 04:30:28 +00:00
|
|
|
opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
|
2019-10-16 21:33:23 +00:00
|
|
|
|
|
|
|
if conf.OverlayfsStaleRead {
|
|
|
|
// We can't check for overlayfs here because sandbox is chroot'ed and gofer
|
|
|
|
// can only send mount options for specs.Mounts (specs.Root is missing
|
|
|
|
// Options field). So assume root is always on top of overlayfs.
|
|
|
|
opts = append(opts, "overlayfs_stale_read")
|
|
|
|
}
|
|
|
|
|
2019-06-11 21:52:06 +00:00
|
|
|
rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
|
2018-09-25 00:21:16 +00:00
|
|
|
if err != nil {
|
2019-01-19 01:35:09 +00:00
|
|
|
return nil, fmt.Errorf("creating root mount point: %v", err)
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// We need to overlay the root on top of a ramfs with stub directories
|
|
|
|
// for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
|
|
|
|
// mounted even if they are not in the spec.
|
2019-06-04 01:19:52 +00:00
|
|
|
submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
|
2018-04-27 17:37:02 +00:00
|
|
|
rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
|
|
|
|
if err != nil {
|
2019-01-19 01:35:09 +00:00
|
|
|
return nil, fmt.Errorf("adding submount overlay: %v", err)
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
2019-06-04 01:19:52 +00:00
|
|
|
if conf.Overlay && !c.root.Readonly {
|
2018-04-27 17:37:02 +00:00
|
|
|
log.Debugf("Adding overlay on top of root mount")
|
|
|
|
// Overlay a tmpfs filesystem on top of the root.
|
|
|
|
rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
|
|
|
|
if err != nil {
|
2019-06-04 01:19:52 +00:00
|
|
|
return nil, err
|
|
|
|
}
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
2019-06-04 01:19:52 +00:00
|
|
|
|
|
|
|
log.Infof("Mounted %q to %q type root", c.root.Path, "/")
|
|
|
|
return rootInode, nil
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
runsc: Change cache policy for root fs and volume mounts.
Previously, gofer filesystems were configured with the default "fscache"
policy, which caches filesystem metadata and contents aggressively. While this
setting is best for performance, it means that changes from inside the sandbox
may not be immediately propagated outside the sandbox, and vice-versa.
This CL changes volumes and the root fs configuration to use a new
"remote-revalidate" cache policy which tries to retain as much caching as
possible while still making fs changes visible across the sandbox boundary.
This cache policy is enabled by default for the root filesystem. The default
value for the "--file-access" flag is still "proxy", but the behavior is
changed to use the new cache policy.
A new value for the "--file-access" flag is added, called "proxy-exclusive",
which turns on the previous aggressive caching behavior. As the name implies,
this flag should be used when the sandbox has "exclusive" access to the
filesystem.
All volume mounts are configured to use the new cache policy, since it is
safest and most likely to be correct. There is not currently a way to change
this behavior, but it's possible to add such a mechanism in the future. The
configurability is a smaller issue for volumes, since most of the expensive
application fs operations (walking + stating files) will likely be served by the
root fs.
PiperOrigin-RevId: 208735037
Change-Id: Ife048fab1948205f6665df8563434dbc6ca8cfc9
2018-08-14 23:24:46 +00:00
|
|
|
// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
|
2018-06-21 17:17:19 +00:00
|
|
|
// used for mounts.
|
2019-06-04 01:19:52 +00:00
|
|
|
func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
|
runsc: Change cache policy for root fs and volume mounts.
Previously, gofer filesystems were configured with the default "fscache"
policy, which caches filesystem metadata and contents aggressively. While this
setting is best for performance, it means that changes from inside the sandbox
may not be immediately propagated outside the sandbox, and vice-versa.
This CL changes volumes and the root fs configuration to use a new
"remote-revalidate" cache policy which tries to retain as much caching as
possible while still making fs changes visible across the sandbox boundary.
This cache policy is enabled by default for the root filesystem. The default
value for the "--file-access" flag is still "proxy", but the behavior is
changed to use the new cache policy.
A new value for the "--file-access" flag is added, called "proxy-exclusive",
which turns on the previous aggressive caching behavior. As the name implies,
this flag should be used when the sandbox has "exclusive" access to the
filesystem.
All volume mounts are configured to use the new cache policy, since it is
safest and most likely to be correct. There is not currently a way to change
this behavior, but it's possible to add such a mechanism in the future. The
configurability is a smaller issue for volumes, since most of the expensive
application fs operations (walking + stating files) will likely served by the
root fs.
PiperOrigin-RevId: 208735037
Change-Id: Ife048fab1948205f6665df8563434dbc6ca8cfc9
2018-08-14 23:24:46 +00:00
|
|
|
var (
|
|
|
|
fsName string
|
|
|
|
opts []string
|
|
|
|
useOverlay bool
|
|
|
|
)
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
switch m.Type {
|
2020-05-13 17:30:00 +00:00
|
|
|
case devpts.Name, devtmpfs.Name, procvfs2.Name, sysvfs2.Name:
|
2018-04-27 17:37:02 +00:00
|
|
|
fsName = m.Type
|
2018-08-15 23:24:07 +00:00
|
|
|
case nonefs:
|
2020-05-13 17:30:00 +00:00
|
|
|
fsName = sysvfs2.Name
|
|
|
|
case tmpfsvfs2.Name:
|
2018-04-27 17:37:02 +00:00
|
|
|
fsName = m.Type
|
2019-10-16 21:33:23 +00:00
|
|
|
|
|
|
|
var err error
|
2020-06-02 04:30:28 +00:00
|
|
|
opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
|
2019-10-16 21:33:23 +00:00
|
|
|
if err != nil {
|
|
|
|
return "", nil, false, err
|
|
|
|
}
|
2020-04-22 06:04:18 +00:00
|
|
|
|
2018-08-15 23:24:07 +00:00
|
|
|
case bind:
|
2020-04-22 06:04:18 +00:00
|
|
|
fd := c.fds.remove()
|
2020-05-13 17:30:00 +00:00
|
|
|
fsName = gofervfs2.Name
|
2020-06-02 04:30:28 +00:00
|
|
|
opts = p9MountData(fd, c.getMountAccessType(m), conf.VFS2)
|
2020-04-22 06:04:18 +00:00
|
|
|
// If configured, add overlay to all writable mounts.
|
|
|
|
useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
|
2020-04-20 07:46:22 +00:00
|
|
|
|
2020-04-22 06:04:18 +00:00
|
|
|
default:
|
2018-04-27 17:37:02 +00:00
|
|
|
log.Warningf("ignoring unknown filesystem type %q", m.Type)
|
2018-06-21 17:17:19 +00:00
|
|
|
}
|
2019-10-16 21:33:23 +00:00
|
|
|
return fsName, opts, useOverlay, nil
|
2018-06-21 17:17:19 +00:00
|
|
|
}
|
|
|
|
|
2019-11-25 19:41:39 +00:00
|
|
|
func (c *containerMounter) getMountAccessType(mount specs.Mount) FileAccessType {
|
|
|
|
if hint := c.hints.findMount(mount); hint != nil {
|
|
|
|
return hint.fileAccessType()
|
|
|
|
}
|
|
|
|
// Non-root bind mounts are always shared if no hints were provided.
|
|
|
|
return FileAccessShared
|
|
|
|
}
|
|
|
|
|
2018-10-18 19:41:07 +00:00
|
|
|
// mountSubmount mounts volumes inside the container's root. Because mounts may
|
|
|
|
// be readonly, a lower ramfs overlay is added to create the mount point dir.
|
|
|
|
// Another overlay is added with tmpfs on top if Config.Overlay is true.
|
|
|
|
// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
|
2019-06-04 01:19:52 +00:00
|
|
|
func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
|
2018-06-21 17:17:19 +00:00
|
|
|
// Map mount type to filesystem name, and parse out the options that we are
|
|
|
|
// capable of dealing with.
|
2019-06-04 01:19:52 +00:00
|
|
|
fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
|
2018-06-21 17:17:19 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if fsName == "" {
|
2019-06-04 01:19:52 +00:00
|
|
|
// Filesystem is not supported (e.g. cgroup), just skip it.
|
2018-04-27 17:37:02 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// All filesystem names should have been mapped to something we know.
|
|
|
|
filesystem := mustFindFilesystem(fsName)
|
|
|
|
|
|
|
|
mf := mountFlags(m.Options)
|
|
|
|
if useOverlay {
|
|
|
|
// All writes go to upper, be paranoid and make lower readonly.
|
|
|
|
mf.ReadOnly = true
|
|
|
|
}
|
|
|
|
|
2019-03-14 02:23:02 +00:00
|
|
|
inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
|
2018-04-27 17:37:02 +00:00
|
|
|
if err != nil {
|
2020-04-08 01:49:52 +00:00
|
|
|
err := fmt.Errorf("creating mount with source %q: %v", m.Source, err)
|
|
|
|
// Check to see if this is a common error due to a Linux bug.
|
|
|
|
// This error is generated here in order to cause it to be
|
|
|
|
// printed to the user using Docker via 'runsc create' etc. rather
|
|
|
|
// than simply printed to the logs for the 'runsc boot' command.
|
|
|
|
//
|
|
|
|
// We check the error message string rather than type because the
|
|
|
|
// actual error types (syscall.EIO, syscall.EPIPE) are lost by file system
|
|
|
|
// implementation (e.g. p9).
|
|
|
|
// TODO(gvisor.dev/issue/1765): Remove message when bug is resolved.
|
|
|
|
if strings.Contains(err.Error(), syscall.EIO.Error()) || strings.Contains(err.Error(), syscall.EPIPE.Error()) {
|
|
|
|
return fmt.Errorf("%v: %s", err, specutils.FaqErrorMsg("memlock", "you may be encountering a Linux kernel bug"))
|
|
|
|
}
|
|
|
|
return err
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
2019-06-04 01:19:52 +00:00
|
|
|
// If there are submounts, we need to overlay the mount on top of a ramfs
|
|
|
|
// with stub directories for submount paths.
|
|
|
|
submounts := subtargets(m.Destination, c.mounts)
|
2018-06-15 20:57:29 +00:00
|
|
|
if len(submounts) > 0 {
|
|
|
|
log.Infof("Adding submount overlay over %q", m.Destination)
|
|
|
|
inode, err = addSubmountOverlay(ctx, inode, submounts)
|
|
|
|
if err != nil {
|
2019-01-19 01:35:09 +00:00
|
|
|
return fmt.Errorf("adding submount overlay: %v", err)
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if useOverlay {
|
|
|
|
log.Debugf("Adding overlay on top of mount %q", m.Destination)
|
2018-06-04 19:30:47 +00:00
|
|
|
inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
|
|
|
|
if err != nil {
|
2018-04-27 17:37:02 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-12-04 22:31:08 +00:00
|
|
|
maxTraversals := uint(0)
|
|
|
|
dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
|
2018-04-27 17:37:02 +00:00
|
|
|
if err != nil {
|
2019-01-19 01:35:09 +00:00
|
|
|
return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
defer dirent.DecRef()
|
|
|
|
if err := mns.Mount(ctx, dirent, inode); err != nil {
|
2019-01-19 01:35:09 +00:00
|
|
|
return fmt.Errorf("mount %q error: %v", m.Destination, err)
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
2019-12-06 21:50:12 +00:00
|
|
|
log.Infof("Mounted %q to %q type: %s, internal-options: %q", m.Source, m.Destination, m.Type, opts)
|
2018-04-27 17:37:02 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-06-11 21:52:06 +00:00
|
|
|
// mountSharedSubmount binds mount to a previously mounted volume that is shared
|
|
|
|
// among containers in the same pod.
|
|
|
|
func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error {
|
2019-10-08 20:34:46 +00:00
|
|
|
if err := source.checkCompatible(mount); err != nil {
|
|
|
|
return err
|
2019-06-11 21:52:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
maxTraversals := uint(0)
|
|
|
|
target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals)
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
|
|
|
|
}
|
|
|
|
defer target.DecRef()
|
|
|
|
|
2019-07-12 20:11:53 +00:00
|
|
|
// Take a ref on the inode that is about to be (re)-mounted.
|
|
|
|
source.root.IncRef()
|
2019-06-11 21:52:06 +00:00
|
|
|
if err := mns.Mount(ctx, target, source.root); err != nil {
|
2019-07-12 20:11:53 +00:00
|
|
|
source.root.DecRef()
|
2019-06-11 21:52:06 +00:00
|
|
|
return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
|
|
|
|
}
|
|
|
|
|
|
|
|
log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2018-06-21 17:17:19 +00:00
|
|
|
// addRestoreMount adds a mount to the MountSources map used for restoring a
|
|
|
|
// checkpointed container.
|
2019-06-04 01:19:52 +00:00
|
|
|
func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
|
|
|
|
fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
|
2018-06-21 17:17:19 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2018-06-29 21:46:45 +00:00
|
|
|
if fsName == "" {
|
2019-06-04 01:19:52 +00:00
|
|
|
// Filesystem is not supported (e.g. cgroup), just skip it.
|
2018-06-29 21:46:45 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
newMount := fs.MountArgs{
|
2019-03-14 02:23:02 +00:00
|
|
|
Dev: mountDevice(m),
|
|
|
|
Flags: mountFlags(m.Options),
|
|
|
|
DataString: strings.Join(opts, ","),
|
2018-06-29 21:46:45 +00:00
|
|
|
}
|
2019-05-04 04:40:48 +00:00
|
|
|
if useOverlay {
|
|
|
|
newMount.Flags.ReadOnly = true
|
|
|
|
}
|
2018-06-29 21:46:45 +00:00
|
|
|
renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
|
|
|
|
log.Infof("Added mount at %q: %+v", fsName, newMount)
|
2018-06-21 17:17:19 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-06-11 21:52:06 +00:00
|
|
|
// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
|
|
|
|
// the mounts to the environment.
|
2019-06-04 01:19:52 +00:00
|
|
|
func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
|
2018-06-21 17:17:19 +00:00
|
|
|
renv := &fs.RestoreEnvironment{
|
|
|
|
MountSources: make(map[string][]fs.MountArgs),
|
|
|
|
}
|
|
|
|
|
|
|
|
// Add root mount.
|
2019-06-04 01:19:52 +00:00
|
|
|
fd := c.fds.remove()
|
2020-06-02 04:30:28 +00:00
|
|
|
opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
|
runsc: Change cache policy for root fs and volume mounts.
Previously, gofer filesystems were configured with the default "fscache"
policy, which caches filesystem metadata and contents aggressively. While this
setting is best for performance, it means that changes from inside the sandbox
may not be immediately propagated outside the sandbox, and vice-versa.
This CL changes volumes and the root fs configuration to use a new
"remote-revalidate" cache policy which tries to retain as much caching as
possible while still making fs changes visible across the sandbox boundary.
This cache policy is enabled by default for the root filesystem. The default
value for the "--file-access" flag is still "proxy", but the behavior is
changed to use the new cache policy.
A new value for the "--file-access" flag is added, called "proxy-exclusive",
which turns on the previous aggressive caching behavior. As the name implies,
this flag should be used when the sandbox has "exclusive" access to the
filesystem.
All volume mounts are configured to use the new cache policy, since it is
safest and most likely to be correct. There is not currently a way to change
this behavior, but it's possible to add such a mechanism in the future. The
configurability is a smaller issue for volumes, since most of the expensive
application fs operations (walking + stating files) will likely served by the
root fs.
PiperOrigin-RevId: 208735037
Change-Id: Ife048fab1948205f6665df8563434dbc6ca8cfc9
2018-08-14 23:24:46 +00:00
|
|
|
|
2018-06-21 17:17:19 +00:00
|
|
|
mf := fs.MountSourceFlags{}
|
2019-06-04 01:19:52 +00:00
|
|
|
if c.root.Readonly || conf.Overlay {
|
2018-06-21 17:17:19 +00:00
|
|
|
mf.ReadOnly = true
|
|
|
|
}
|
2018-06-29 21:46:45 +00:00
|
|
|
|
|
|
|
rootMount := fs.MountArgs{
|
2019-03-14 02:23:02 +00:00
|
|
|
Dev: rootDevice,
|
|
|
|
Flags: mf,
|
|
|
|
DataString: strings.Join(opts, ","),
|
2018-06-29 21:46:45 +00:00
|
|
|
}
|
2020-05-13 17:30:00 +00:00
|
|
|
renv.MountSources[gofervfs2.Name] = append(renv.MountSources[gofervfs2.Name], rootMount)
|
2018-06-21 17:17:19 +00:00
|
|
|
|
2018-08-27 18:09:06 +00:00
|
|
|
// Add submounts.
|
2019-01-16 20:47:21 +00:00
|
|
|
var tmpMounted bool
|
2019-06-04 01:19:52 +00:00
|
|
|
for _, m := range c.mounts {
|
|
|
|
if err := c.addRestoreMount(conf, renv, m); err != nil {
|
2018-06-21 17:17:19 +00:00
|
|
|
return nil, err
|
|
|
|
}
|
2019-01-16 20:47:21 +00:00
|
|
|
if filepath.Clean(m.Destination) == "/tmp" {
|
|
|
|
tmpMounted = true
|
|
|
|
}
|
2018-06-21 17:17:19 +00:00
|
|
|
}
|
2019-01-16 20:47:21 +00:00
|
|
|
|
2019-04-29 21:03:04 +00:00
|
|
|
// TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
|
2019-01-16 20:47:21 +00:00
|
|
|
if !tmpMounted {
|
|
|
|
tmpMount := specs.Mount{
|
2020-05-13 17:30:00 +00:00
|
|
|
Type: tmpfsvfs2.Name,
|
2019-01-16 20:47:21 +00:00
|
|
|
Destination: "/tmp",
|
|
|
|
}
|
2019-06-04 01:19:52 +00:00
|
|
|
if err := c.addRestoreMount(conf, renv, tmpMount); err != nil {
|
2019-01-16 20:47:21 +00:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-21 17:17:19 +00:00
|
|
|
return renv, nil
|
|
|
|
}
|
|
|
|
|
2019-01-16 20:47:21 +00:00
|
|
|
// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
|
|
|
|
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
|
|
|
|
// the host /tmp, but this is a nice optimization, and fixes some apps that call
|
|
|
|
// mknod in /tmp. It's unsafe to mount tmpfs if:
|
2019-06-11 21:52:06 +00:00
|
|
|
// 1. /tmp is mounted explicitly: we should not override user's wish
|
2019-01-16 20:47:21 +00:00
|
|
|
// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
|
|
|
|
//
|
|
|
|
// Note that when there are submounts inside of '/tmp', directories for the
|
|
|
|
// mount points must be present, making '/tmp' not empty anymore.
|
2019-06-04 01:19:52 +00:00
|
|
|
func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
|
|
|
|
for _, m := range c.mounts {
|
2019-01-16 20:47:21 +00:00
|
|
|
if filepath.Clean(m.Destination) == "/tmp" {
|
|
|
|
log.Debugf("Explict %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
maxTraversals := uint(0)
|
|
|
|
tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals)
|
|
|
|
switch err {
|
|
|
|
case nil:
|
|
|
|
// Found '/tmp' in filesystem, check if it's empty.
|
|
|
|
defer tmp.DecRef()
|
|
|
|
f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
defer f.DecRef()
|
|
|
|
serializer := &fs.CollectEntriesSerializer{}
|
|
|
|
if err := f.Readdir(ctx, serializer); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
// If more than "." and ".." is found, skip internal tmpfs to prevent hiding
|
|
|
|
// existing files.
|
|
|
|
if len(serializer.Order) > 2 {
|
|
|
|
log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp")
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp")
|
|
|
|
fallthrough
|
|
|
|
|
|
|
|
case syserror.ENOENT:
|
|
|
|
// No '/tmp' found (or fallthrough from above). Safe to mount internal
|
|
|
|
// tmpfs.
|
|
|
|
tmpMount := specs.Mount{
|
2020-05-13 17:30:00 +00:00
|
|
|
Type: tmpfsvfs2.Name,
|
2019-01-16 20:47:21 +00:00
|
|
|
Destination: "/tmp",
|
2019-05-23 13:46:55 +00:00
|
|
|
// Sticky bit is added to prevent accidental deletion of files from
|
|
|
|
// another user. This is normally done for /tmp.
|
2020-06-02 04:30:28 +00:00
|
|
|
Options: []string{"mode=01777"},
|
2019-01-16 20:47:21 +00:00
|
|
|
}
|
2019-06-04 01:19:52 +00:00
|
|
|
return c.mountSubmount(ctx, conf, mns, root, tmpMount)
|
2019-01-16 20:47:21 +00:00
|
|
|
|
|
|
|
default:
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|