// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package boot

import (
	"fmt"
	"path"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"syscall"

	// Include filesystem types that OCI spec might mount.
	_ "gvisor.dev/gvisor/pkg/sentry/fs/dev"
	"gvisor.dev/gvisor/pkg/sentry/fs/gofer"
	_ "gvisor.dev/gvisor/pkg/sentry/fs/host"
	_ "gvisor.dev/gvisor/pkg/sentry/fs/proc"
	"gvisor.dev/gvisor/pkg/sentry/fs/ramfs"
	_ "gvisor.dev/gvisor/pkg/sentry/fs/sys"
	_ "gvisor.dev/gvisor/pkg/sentry/fs/tmpfs"
	_ "gvisor.dev/gvisor/pkg/sentry/fs/tty"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/context"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/runsc/specutils"
)

const (
	// Filesystem name for 9p gofer mounts.
	rootFsName = "9p"

	// Device name for root mount.
	rootDevice = "9pfs-/"

	// MountPrefix is the annotation prefix for mount hints.
	MountPrefix = "gvisor.dev/spec/mount"
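
	// Mount hint annotations use keys of the form
	// "gvisor.dev/spec/mount/<name>/<field>", where <field> is one of
	// "source", "type", "share", or "options" (see newPodMountHints below).
	// For example (hypothetical annotation):
	//   "gvisor.dev/spec/mount/shared-tmp/share": "pod"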

	// Filesystems that runsc supports.
	bind     = "bind"
	devpts   = "devpts"
	devtmpfs = "devtmpfs"
	proc     = "proc"
	sysfs    = "sysfs"
	tmpfs    = "tmpfs"
	nonefs   = "none"
)

func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
	// Upper layer uses the same flags as lower, but it must be read-write.
	upperFlags := lowerFlags
	upperFlags.ReadOnly = false

	tmpFS := mustFindFilesystem("tmpfs")
	if !fs.IsDir(lower.StableAttr) {
		// Create overlay on top of mount file, e.g. /etc/hostname.
		msrc := fs.NewCachingMountSource(ctx, tmpFS, upperFlags)
		return fs.NewOverlayRootFile(ctx, msrc, lower, upperFlags)
	}

	// Create overlay on top of mount dir.
	upper, err := tmpFS.Mount(ctx, name+"-upper", upperFlags, "", nil)
	if err != nil {
		return nil, fmt.Errorf("creating tmpfs overlay: %v", err)
	}

	// Replicate permissions and owner from lower to upper mount point.
	attr, err := lower.UnstableAttr(ctx)
	if err != nil {
		return nil, fmt.Errorf("reading attributes from lower mount point: %v", err)
	}
	if !upper.InodeOperations.SetPermissions(ctx, upper, attr.Perms) {
		return nil, fmt.Errorf("error setting permission to upper mount point")
	}
	if err := upper.InodeOperations.SetOwner(ctx, upper, attr.Owner); err != nil {
		return nil, fmt.Errorf("setting owner to upper mount point: %v", err)
	}

	return fs.NewOverlayRoot(ctx, upper, lower, upperFlags)
}

// compileMounts returns the supported mounts from the mount spec, adding any
// mandatory mounts that are required by the OCI specification.
func compileMounts(spec *specs.Spec) []specs.Mount {
	// Keep track of whether proc and sys were mounted.
	var procMounted, sysMounted bool
	var mounts []specs.Mount

	// Always mount /dev.
	mounts = append(mounts, specs.Mount{
		Type:        devtmpfs,
		Destination: "/dev",
	})

	mounts = append(mounts, specs.Mount{
		Type:        devpts,
		Destination: "/dev/pts",
	})

	// Mount all submounts from the spec.
	for _, m := range spec.Mounts {
		if !specutils.IsSupportedDevMount(m) {
			log.Warningf("ignoring dev mount at %q", m.Destination)
			continue
		}
		mounts = append(mounts, m)
		switch filepath.Clean(m.Destination) {
		case "/proc":
			procMounted = true
		case "/sys":
			sysMounted = true
		}
	}

	// Mount proc and sys even if the user did not ask for it, as the spec
	// says we SHOULD.
	var mandatoryMounts []specs.Mount
	if !procMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        proc,
			Destination: "/proc",
		})
	}
	if !sysMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        sysfs,
			Destination: "/sys",
		})
	}

	// The mandatory mounts should be ordered right after the root, in case
	// there are submounts of these mandatory mounts already in the spec.
	mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)

	return mounts
}

// p9MountOptions creates a slice of options for a p9 mount.
func p9MountOptions(fd int, fa FileAccessType) []string {
	opts := []string{
		"trans=fd",
		"rfdno=" + strconv.Itoa(fd),
		"wfdno=" + strconv.Itoa(fd),
		"privateunixsocket=true",
	}
	if fa == FileAccessShared {
		opts = append(opts, "cache=remote_revalidating")
	}
	return opts
}
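
// For illustration, a call like p9MountOptions(5, FileAccessShared) (FD value
// hypothetical) produces:
//
//   []string{"trans=fd", "rfdno=5", "wfdno=5", "privateunixsocket=true", "cache=remote_revalidating"}
//
// which is joined with "," and passed as the mount data string to the 9p
// filesystem (see createRootMount and getMountNameAndOptions).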

// parseAndFilterOptions parses a MountOptions slice and filters by the allowed
// keys.
func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
	var out []string
	for _, o := range opts {
		kv := strings.Split(o, "=")
		switch len(kv) {
		case 1:
			if specutils.ContainsStr(allowedKeys, o) {
				out = append(out, o)
				continue
			}
			log.Warningf("ignoring unsupported key %q", o)
		case 2:
			if specutils.ContainsStr(allowedKeys, kv[0]) {
				out = append(out, o)
				continue
			}
			log.Warningf("ignoring unsupported key %q", kv[0])
		default:
			return nil, fmt.Errorf("invalid option %q", o)
		}
	}
	return out, nil
}

// mountDevice returns a device string based on the fs type and target
// of the mount.
func mountDevice(m specs.Mount) string {
	if m.Type == bind {
		// Make a device string that includes the target, which is consistent across
		// S/R and uniquely identifies the connection.
		return "9pfs-" + m.Destination
	}
	// All other fs types use device "none".
	return "none"
}
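
// For example, a bind mount with Destination "/data" (path hypothetical) gets
// the device string "9pfs-/data", while a tmpfs or proc mount gets "none".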

func mountFlags(opts []string) fs.MountSourceFlags {
	mf := fs.MountSourceFlags{}
	for _, o := range opts {
		switch o {
		case "rw":
			mf.ReadOnly = false
		case "ro":
			mf.ReadOnly = true
		case "noatime":
			mf.NoAtime = true
		case "noexec":
			mf.NoExec = true
		default:
			log.Warningf("ignoring unknown mount option %q", o)
		}
	}
	return mf
}
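
// For example, options like []string{"ro", "noexec"} (values hypothetical) map
// to fs.MountSourceFlags{ReadOnly: true, NoExec: true}; unknown options are
// logged and ignored.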

func mustFindFilesystem(name string) fs.Filesystem {
	fs, ok := fs.FindFilesystem(name)
	if !ok {
		panic(fmt.Sprintf("could not find filesystem %q", name))
	}
	return fs
}

// addSubmountOverlay overlays the inode over a ramfs tree containing the given
// paths.
func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
	// Construct a ramfs tree of mount points. The contents never
	// change, so this can be fully caching. There's no real
	// filesystem backing this tree, so we set the filesystem to
	// nil.
	msrc := fs.NewCachingMountSource(ctx, nil, fs.MountSourceFlags{})
	mountTree, err := ramfs.MakeDirectoryTree(ctx, msrc, submounts)
	if err != nil {
		return nil, fmt.Errorf("creating mount tree: %v", err)
	}
	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
	if err != nil {
		return nil, fmt.Errorf("adding mount overlay: %v", err)
	}
	return overlayInode, err
}

// subtargets takes a set of Mounts and returns only the targets that are
// children of the given root. The returned paths are relative to the root.
func subtargets(root string, mnts []specs.Mount) []string {
	var targets []string
	for _, mnt := range mnts {
		if relPath, isSubpath := fs.IsSubpath(mnt.Destination, root); isSubpath {
			targets = append(targets, relPath)
		}
	}
	return targets
}
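
// For example, with root "/usr" and mounts destined for "/usr/local/bin" and
// "/var/log" (paths hypothetical), subtargets would return only "local/bin":
// destinations outside the root are dropped and the rest are expressed
// relative to it.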

// setExecutablePath sets the procArgs.Filename by searching the PATH for an
// executable matching the procArgs.Argv[0].
func setExecutablePath(ctx context.Context, procArgs *kernel.CreateProcessArgs) error {
	paths := fs.GetPath(procArgs.Envv)
	exe := procArgs.Argv[0]
	f, err := procArgs.MountNamespace.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, exe, paths)
	if err != nil {
		return fmt.Errorf("searching for executable %q, cwd: %q, $PATH=%q: %v", exe, procArgs.WorkingDirectory, strings.Join(paths, ":"), err)
	}
	procArgs.Filename = f
	return nil
}

func adjustDirentCache(k *kernel.Kernel) error {
	var hl syscall.Rlimit
	if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
		return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
	}
	if int64(hl.Cur) != syscall.RLIM_INFINITY {
		newSize := hl.Cur / 2
		if newSize < gofer.DefaultDirentCacheSize {
			log.Infof("Setting gofer dirent cache size to %d", newSize)
			gofer.DefaultDirentCacheSize = newSize
			k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
		}
	}
	return nil
}

type fdDispenser struct {
	fds []int
}

func (f *fdDispenser) remove() int {
	if f.empty() {
		panic("fdDispenser out of fds")
	}
	rv := f.fds[0]
	f.fds = f.fds[1:]
	return rv
}

func (f *fdDispenser) empty() bool {
	return len(f.fds) == 0
}

type shareType int

const (
	invalid shareType = iota

	// container shareType indicates that the mount is used by a single container.
	container

	// pod shareType indicates that the mount is used by more than one container
	// inside the pod.
	pod

	// shared shareType indicates that the mount can also be shared with a process
	// outside the pod, e.g. NFS.
	shared
)

func parseShare(val string) (shareType, error) {
	switch val {
	case "container":
		return container, nil
	case "pod":
		return pod, nil
	case "shared":
		return shared, nil
	default:
		return 0, fmt.Errorf("invalid share value %q", val)
	}
}

func (s shareType) String() string {
	switch s {
	case invalid:
		return "invalid"
	case container:
		return "container"
	case pod:
		return "pod"
	case shared:
		return "shared"
	default:
		return fmt.Sprintf("invalid share value %d", s)
	}
}

// mountHint represents extra information about mounts that are provided via
// annotations. They can override mount type, and provide sharing information
// so that mounts can be correctly shared inside the pod.
type mountHint struct {
	name  string
	share shareType
	mount specs.Mount

	// root is the inode where the volume is mounted. For mounts with 'pod' share
	// the volume is mounted once and then bind mounted inside the containers.
	root *fs.Inode
}

func (m *mountHint) setField(key, val string) error {
	switch key {
	case "source":
		if len(val) == 0 {
			return fmt.Errorf("source cannot be empty")
		}
		m.mount.Source = val
	case "type":
		return m.setType(val)
	case "share":
		share, err := parseShare(val)
		if err != nil {
			return err
		}
		m.share = share
	case "options":
		return m.setOptions(val)
	default:
		return fmt.Errorf("invalid mount annotation: %s=%s", key, val)
	}
	return nil
}

func (m *mountHint) setType(val string) error {
	switch val {
	case "tmpfs", "bind":
		m.mount.Type = val
	default:
		return fmt.Errorf("invalid type %q", val)
	}
	return nil
}

func (m *mountHint) setOptions(val string) error {
	opts := strings.Split(val, ",")
	if err := specutils.ValidateMountOptions(opts); err != nil {
		return err
	}
	// Sort options so it can be compared with container mount options later on.
	sort.Strings(opts)
	m.mount.Options = opts
	return nil
}

func (m *mountHint) isSupported() bool {
	return m.mount.Type == tmpfs && m.share == pod
}

// podMountHints contains a collection of mountHints for the pod.
type podMountHints struct {
	mounts map[string]*mountHint
}

func newPodMountHints(spec *specs.Spec) (*podMountHints, error) {
	mnts := make(map[string]*mountHint)
	for k, v := range spec.Annotations {
		// Look for 'gvisor.dev/spec/mount' annotations and parse them.
		if strings.HasPrefix(k, MountPrefix) {
			parts := strings.Split(k, "/")
			if len(parts) != 5 {
				return nil, fmt.Errorf("invalid mount annotation: %s=%s", k, v)
			}
			name := parts[3]
			if len(name) == 0 || path.Clean(name) != name {
				return nil, fmt.Errorf("invalid mount name: %s", name)
			}
			mnt := mnts[name]
			if mnt == nil {
				mnt = &mountHint{name: name}
				mnts[name] = mnt
			}
			if err := mnt.setField(parts[4], v); err != nil {
				return nil, err
			}
		}
	}

	// Validate all hints after parsing is done.
	for name, m := range mnts {
		log.Infof("Mount annotation found, name: %s, source: %q, type: %s, share: %v", name, m.mount.Source, m.mount.Type, m.share)
		if m.share == invalid {
			return nil, fmt.Errorf("share field for %q has not been set", m.name)
		}
		if len(m.mount.Source) == 0 {
			return nil, fmt.Errorf("source field for %q has not been set", m.name)
		}
		if len(m.mount.Type) == 0 {
			return nil, fmt.Errorf("type field for %q has not been set", m.name)
		}

		// Check for duplicate mount sources.
		for name2, m2 := range mnts {
			if name != name2 && m.mount.Source == m2.mount.Source {
				return nil, fmt.Errorf("mounts %q and %q have the same mount source %q", m.name, m2.name, m.mount.Source)
			}
		}
	}

	return &podMountHints{mounts: mnts}, nil
}
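
// For illustration, a shared tmpfs volume named "shared-tmp" (name and values
// hypothetical) could be described with the following pod annotations:
//
//   "gvisor.dev/spec/mount/shared-tmp/source":  "/tmp",
//   "gvisor.dev/spec/mount/shared-tmp/type":    "tmpfs",
//   "gvisor.dev/spec/mount/shared-tmp/share":   "pod",
//   "gvisor.dev/spec/mount/shared-tmp/options": "rw",
//
// newPodMountHints groups these keys by name and produces a mountHint with
// share == pod, which makes isSupported() true for tmpfs mounts.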

func (p *podMountHints) findMount(mount specs.Mount) *mountHint {
	for _, m := range p.mounts {
		if m.mount.Source == mount.Source {
			return m
		}
	}
	return nil
}

type containerMounter struct {
	// cid is the container ID. May be set to empty for the root container.
	cid string

	root *specs.Root

	// mounts is the set of submounts for the container. It's a copy from the spec
	// that may be freely modified without affecting the original spec.
	mounts []specs.Mount

	// fds is the list of FDs to be dispensed for mounts that require it.
	fds fdDispenser

	k *kernel.Kernel

	hints *podMountHints
}

func newContainerMounter(spec *specs.Spec, cid string, goferFDs []int, k *kernel.Kernel, hints *podMountHints) *containerMounter {
	return &containerMounter{
		cid:    cid,
		root:   spec.Root,
		mounts: compileMounts(spec),
		fds:    fdDispenser{fds: goferFDs},
		k:      k,
		hints:  hints,
	}
}
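
// A typical (hypothetical) construction for the root container would be
// newContainerMounter(spec, "", goferFDs, k, hints), with one gofer FD for the
// root filesystem plus one per bind mount; checkDispenser later verifies that
// every FD handed in was consumed.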

// setupFS is used to set up the file system for containers and amend
// the procArgs accordingly. This is the main entry point for the rest of the
// functions in this file. procArgs is passed by reference and the FDMap field
// is modified. It dups stdioFDs.
func (c *containerMounter) setupFS(ctx context.Context, conf *Config, procArgs *kernel.CreateProcessArgs, creds *auth.Credentials) error {
	// Use root user to configure mounts. The current user might not have
	// permission to do so.
	rootProcArgs := kernel.CreateProcessArgs{
		WorkingDirectory:     "/",
		Credentials:          auth.NewRootCredentials(creds.UserNamespace),
		Umask:                0022,
		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
		PIDNamespace:         procArgs.PIDNamespace,
	}
	rootCtx := rootProcArgs.NewContext(c.k)

	// If this is the root container, we also need to setup the root mount
	// namespace.
	rootMNS := c.k.RootMountNamespace()
	if rootMNS == nil {
		// Setup the root container.
		if err := c.setupRootContainer(ctx, rootCtx, conf, func(rootMNS *fs.MountNamespace) {
			// The callback to setupRootContainer inherits a
			// reference on the rootMNS, so we don't need to take
			// an additional reference here.
			procArgs.MountNamespace = rootMNS
			procArgs.Root = rootMNS.Root()
			c.k.SetRootMountNamespace(rootMNS)
		}); err != nil {
			return err
		}
		return c.checkDispenser()
	}

	// Setup a child container.
	log.Infof("Creating new process in child container.")

	// Create a new root inode and mount namespace for the container.
	rootInode, err := c.createRootMount(rootCtx, conf)
	if err != nil {
		return fmt.Errorf("creating filesystem for container: %v", err)
	}
	mns, err := fs.NewMountNamespace(rootCtx, rootInode)
	if err != nil {
		return fmt.Errorf("creating new mount namespace for container: %v", err)
	}

	// Set process root here, so 'rootCtx.Value(CtxRoot)' will return it.
	// This will also donate a reference to procArgs, as required.
	procArgs.MountNamespace = mns
	procArgs.Root = mns.Root()

	// Mount all submounts.
	if err := c.mountSubmounts(rootCtx, conf, mns, procArgs.Root); err != nil {
		return err
	}
	return c.checkDispenser()
}

func (c *containerMounter) checkDispenser() error {
	if !c.fds.empty() {
		return fmt.Errorf("not all gofer FDs were consumed, remaining: %v", c.fds)
	}
	return nil
}

// setupRootContainer creates a mount namespace containing the root filesystem
// and all mounts. 'rootCtx' is used to walk directories to find mount points.
// The 'setMountNS' callback is called after the mount namespace is created and
// will get a reference on that namespace. The callback must ensure that the
// rootCtx has the provided mount namespace.
func (c *containerMounter) setupRootContainer(userCtx context.Context, rootCtx context.Context, conf *Config, setMountNS func(*fs.MountNamespace)) error {
	for _, hint := range c.hints.mounts {
		log.Infof("Mounting master of shared mount %q from %q type %q", hint.name, hint.mount.Source, hint.mount.Type)
		inode, err := c.mountSharedMaster(rootCtx, conf, hint)
		if err != nil {
			return fmt.Errorf("mounting shared master %q: %v", hint.name, err)
		}
		hint.root = inode
	}

	rootInode, err := c.createRootMount(rootCtx, conf)
	if err != nil {
		return fmt.Errorf("creating root mount: %v", err)
	}
	mns, err := fs.NewMountNamespace(userCtx, rootInode)
	if err != nil {
		return fmt.Errorf("creating root mount namespace: %v", err)
	}
	setMountNS(mns)

	root := mns.Root()
	defer root.DecRef()
	return c.mountSubmounts(rootCtx, conf, mns, root)
}

// mountSharedMaster mounts the master of a volume that is shared among
// containers in a pod. It returns the root mount's inode.
func (c *containerMounter) mountSharedMaster(ctx context.Context, conf *Config, hint *mountHint) (*fs.Inode, error) {
	// Map mount type to filesystem name, and parse out the options that we are
	// capable of dealing with.
	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, hint.mount)
	if err != nil {
		return nil, err
	}
	if len(fsName) == 0 {
		return nil, fmt.Errorf("mount type not supported %q", hint.mount.Type)
	}

	// Mount with revalidate because it's shared among containers.
	opts = append(opts, "cache=revalidate")

	// All filesystem names should have been mapped to something we know.
	filesystem := mustFindFilesystem(fsName)

	mf := mountFlags(hint.mount.Options)
	if useOverlay {
		// All writes go to upper, be paranoid and make lower readonly.
		mf.ReadOnly = true
	}

	inode, err := filesystem.Mount(ctx, mountDevice(hint.mount), mf, strings.Join(opts, ","), nil)
	if err != nil {
		return nil, fmt.Errorf("creating mount %q: %v", hint.name, err)
	}

	if useOverlay {
		log.Debugf("Adding overlay on top of shared mount %q", hint.name)
		inode, err = addOverlay(ctx, conf, inode, hint.mount.Type, mf)
		if err != nil {
			return nil, err
		}
	}

	return inode, nil
}

// createRootMount creates the root filesystem.
func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*fs.Inode, error) {
	// First construct the filesystem from the spec.Root.
	mf := fs.MountSourceFlags{ReadOnly: c.root.Readonly || conf.Overlay}

	fd := c.fds.remove()
	log.Infof("Mounting root over 9P, ioFD: %d", fd)
	p9FS := mustFindFilesystem("9p")
	opts := p9MountOptions(fd, conf.FileAccess)
	rootInode, err := p9FS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","), nil)
	if err != nil {
		return nil, fmt.Errorf("creating root mount point: %v", err)
	}

	// We need to overlay the root on top of a ramfs with stub directories
	// for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
	// mounted even if they are not in the spec.
	submounts := append(subtargets("/", c.mounts), "/dev", "/sys", "/proc", "/tmp")
	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
	if err != nil {
		return nil, fmt.Errorf("adding submount overlay: %v", err)
	}

	if conf.Overlay && !c.root.Readonly {
		log.Debugf("Adding overlay on top of root mount")
		// Overlay a tmpfs filesystem on top of the root.
		rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
		if err != nil {
			return nil, err
		}
	}

	log.Infof("Mounted %q to %q type root", c.root.Path, "/")
	return rootInode, nil
}

// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
// used for mounts.
func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (string, []string, bool, error) {
	var (
		fsName     string
		opts       []string
		useOverlay bool
		err        error
	)

	switch m.Type {
	case devpts, devtmpfs, proc, sysfs:
		fsName = m.Type
	case nonefs:
		fsName = sysfs
	case tmpfs:
		fsName = m.Type

		// tmpfs has some extra supported options that we must pass through.
		opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")

	case bind:
		fd := c.fds.remove()
		fsName = "9p"
		// Non-root bind mounts are always shared.
		opts = p9MountOptions(fd, FileAccessShared)
		// If configured, add overlay to all writable mounts.
		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly

	default:
		// TODO(nlacasse): Support all the mount types and make this a fatal error.
		// Most applications will "just work" without them, so this is a warning
		// for now.
		log.Warningf("ignoring unknown filesystem type %q", m.Type)
	}
	return fsName, opts, useOverlay, err
}
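
// For illustration, a bind mount backed by gofer FD 7 (FD value hypothetical)
// yields fsName "9p", opts from p9MountOptions(7, FileAccessShared), and
// useOverlay true only when conf.Overlay is set and the mount is writable; an
// unknown type yields an empty fsName, which callers treat as "skip".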

func (c *containerMounter) mountSubmounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
	for _, m := range c.mounts {
		if hint := c.hints.findMount(m); hint != nil && hint.isSupported() {
			if err := c.mountSharedSubmount(ctx, mns, root, m, hint); err != nil {
				return fmt.Errorf("mount shared mount %q to %q: %v", hint.name, m.Destination, err)
			}
		} else {
			if err := c.mountSubmount(ctx, conf, mns, root, m); err != nil {
				return fmt.Errorf("mount submount %q: %v", m.Destination, err)
			}
		}
	}

	if err := c.mountTmp(ctx, conf, mns, root); err != nil {
		return fmt.Errorf("mount submount %q: %v", "tmp", err)
	}
	return nil
}

// mountSubmount mounts volumes inside the container's root. Because mounts may
// be readonly, a lower ramfs overlay is added to create the mount point dir.
// Another overlay is added with tmpfs on top if Config.Overlay is true.
// 'm.Destination' must be an absolute path with '..' and symlinks resolved.
func (c *containerMounter) mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent, m specs.Mount) error {
	// Map mount type to filesystem name, and parse out the options that we are
	// capable of dealing with.
	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
	if err != nil {
		return err
	}
	if fsName == "" {
		// Filesystem is not supported (e.g. cgroup), just skip it.
		return nil
	}

	// All filesystem names should have been mapped to something we know.
	filesystem := mustFindFilesystem(fsName)

	mf := mountFlags(m.Options)
	if useOverlay {
		// All writes go to upper, be paranoid and make lower readonly.
		mf.ReadOnly = true
	}

	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","), nil)
	if err != nil {
		return fmt.Errorf("creating mount with source %q: %v", m.Source, err)
	}

	// If there are submounts, we need to overlay the mount on top of a ramfs
	// with stub directories for submount paths.
	submounts := subtargets(m.Destination, c.mounts)
	if len(submounts) > 0 {
		log.Infof("Adding submount overlay over %q", m.Destination)
		inode, err = addSubmountOverlay(ctx, inode, submounts)
		if err != nil {
			return fmt.Errorf("adding submount overlay: %v", err)
		}
	}

	if useOverlay {
		log.Debugf("Adding overlay on top of mount %q", m.Destination)
		inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
		if err != nil {
			return err
		}
	}

	maxTraversals := uint(0)
	dirent, err := mns.FindInode(ctx, root, root, m.Destination, &maxTraversals)
	if err != nil {
		return fmt.Errorf("can't find mount destination %q: %v", m.Destination, err)
	}
	defer dirent.DecRef()
	if err := mns.Mount(ctx, dirent, inode); err != nil {
		return fmt.Errorf("mount %q error: %v", m.Destination, err)
	}

	log.Infof("Mounted %q to %q type %s", m.Source, m.Destination, m.Type)
	return nil
}

// mountSharedSubmount bind mounts 'mount' to a previously mounted volume that
// is shared among containers in the same pod.
func (c *containerMounter) mountSharedSubmount(ctx context.Context, mns *fs.MountNamespace, root *fs.Dirent, mount specs.Mount, source *mountHint) error {
	// For now enforce that all options are the same. Once bind mount is properly
	// supported, then we should ensure the master is less restrictive than the
	// container, e.g. master can be 'rw' while container mounts as 'ro'.
	if len(mount.Options) != len(source.mount.Options) {
		return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
	}
	sort.Strings(mount.Options)
	for i, opt := range mount.Options {
		if opt != source.mount.Options[i] {
			return fmt.Errorf("mount options in annotations differ from container mount, annotation: %s, mount: %s", source.mount.Options, mount.Options)
		}
	}

	maxTraversals := uint(0)
	target, err := mns.FindInode(ctx, root, root, mount.Destination, &maxTraversals)
	if err != nil {
		return fmt.Errorf("can't find mount destination %q: %v", mount.Destination, err)
	}
	defer target.DecRef()

	// Take a ref on the inode that is about to be (re)-mounted.
	source.root.IncRef()
	if err := mns.Mount(ctx, target, source.root); err != nil {
		source.root.DecRef()
		return fmt.Errorf("bind mount %q error: %v", mount.Destination, err)
	}

	log.Infof("Mounted %q type shared bind to %q", mount.Destination, source.name)
	return nil
}

// addRestoreMount adds a mount to the MountSources map used for restoring a
// checkpointed container.
func (c *containerMounter) addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount) error {
	fsName, opts, useOverlay, err := c.getMountNameAndOptions(conf, m)
	if err != nil {
		return err
	}
	if fsName == "" {
		// Filesystem is not supported (e.g. cgroup), just skip it.
		return nil
	}

	newMount := fs.MountArgs{
		Dev:        mountDevice(m),
		Flags:      mountFlags(m.Options),
		DataString: strings.Join(opts, ","),
	}
	if useOverlay {
		newMount.Flags.ReadOnly = true
	}
	renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
	log.Infof("Added mount at %q: %+v", fsName, newMount)
	return nil
}

// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding
// the mounts to the environment.
func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEnvironment, error) {
	renv := &fs.RestoreEnvironment{
		MountSources: make(map[string][]fs.MountArgs),
	}

	// Add root mount.
	fd := c.fds.remove()
	opts := p9MountOptions(fd, conf.FileAccess)

	mf := fs.MountSourceFlags{}
	if c.root.Readonly || conf.Overlay {
		mf.ReadOnly = true
	}

	rootMount := fs.MountArgs{
		Dev:        rootDevice,
		Flags:      mf,
		DataString: strings.Join(opts, ","),
	}
	renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)

	// Add submounts.
	var tmpMounted bool
	for _, m := range c.mounts {
		if err := c.addRestoreMount(conf, renv, m); err != nil {
			return nil, err
		}
		if filepath.Clean(m.Destination) == "/tmp" {
			tmpMounted = true
		}
	}

	// TODO(b/67958150): handle '/tmp' properly (see mountTmp()).
	if !tmpMounted {
		tmpMount := specs.Mount{
			Type:        tmpfs,
			Destination: "/tmp",
		}
		if err := c.addRestoreMount(conf, renv, tmpMount); err != nil {
			return nil, err
		}
	}

	return renv, nil
}

// mountTmp mounts an internal tmpfs at '/tmp' if it's safe to do so.
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
// the host /tmp, but this is a nice optimization, and fixes some apps that call
// mknod in /tmp. It's unsafe to mount tmpfs if:
//  1. /tmp is mounted explicitly: we should not override the user's wish
//  2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
//
// Note that when there are submounts inside of '/tmp', directories for the
// mount points must be present, making '/tmp' not empty anymore.
func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.MountNamespace, root *fs.Dirent) error {
	for _, m := range c.mounts {
		if filepath.Clean(m.Destination) == "/tmp" {
			log.Debugf("Explicit %q mount found, skipping internal tmpfs, mount: %+v", "/tmp", m)
			return nil
		}
	}

	maxTraversals := uint(0)
	tmp, err := mns.FindInode(ctx, root, root, "tmp", &maxTraversals)
	switch err {
	case nil:
		// Found '/tmp' in filesystem, check if it's empty.
		defer tmp.DecRef()
		f, err := tmp.Inode.GetFile(ctx, tmp, fs.FileFlags{Read: true, Directory: true})
		if err != nil {
			return err
		}
		defer f.DecRef()
		serializer := &fs.CollectEntriesSerializer{}
		if err := f.Readdir(ctx, serializer); err != nil {
			return err
		}
		// If more than "." and ".." is found, skip internal tmpfs to prevent hiding
		// existing files.
		if len(serializer.Order) > 2 {
			log.Infof("Skipping internal tmpfs on top %q, because it's not empty", "/tmp")
			return nil
		}
		log.Infof("Mounting internal tmpfs on top of empty %q", "/tmp")
		fallthrough

	case syserror.ENOENT:
		// No '/tmp' found (or fallthrough from above). Safe to mount internal
		// tmpfs.
		tmpMount := specs.Mount{
			Type:        tmpfs,
			Destination: "/tmp",
			// Sticky bit is added to prevent accidental deletion of files from
			// another user. This is normally done for /tmp.
			Options: []string{"mode=1777"},
		}
		return c.mountSubmount(ctx, conf, mns, root, tmpMount)

	default:
		return err
	}
}