// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package boot

import (
	"fmt"
	"path"
	"path/filepath"
	"strconv"
	"strings"

	// Include filesystem types that OCI spec might mount.
	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev"
	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc"
	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
	_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
	"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
	"gvisor.googlesource.com/gvisor/pkg/sentry/limits"

	specs "github.com/opencontainers/runtime-spec/specs-go"
	"gvisor.googlesource.com/gvisor/pkg/abi/linux"
	"gvisor.googlesource.com/gvisor/pkg/log"
	"gvisor.googlesource.com/gvisor/pkg/sentry/context"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
	"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
	"gvisor.googlesource.com/gvisor/pkg/syserror"
	"gvisor.googlesource.com/gvisor/runsc/specutils"
)

const (
	// Filesystem name for 9p gofer mounts.
	rootFsName = "9p"

	// Device name for root mount.
	rootDevice = "9pfs-/"

	// ChildContainersDir is the directory where child container root
	// filesystems are mounted.
	ChildContainersDir = "/__runsc_containers__"

	// Filesystems that runsc supports.
	bind     = "bind"
	devpts   = "devpts"
	devtmpfs = "devtmpfs"
	proc     = "proc"
	sysfs    = "sysfs"
	tmpfs    = "tmpfs"
	nonefs   = "none"
)

type fdDispenser struct {
	fds []int
}

func (f *fdDispenser) remove() int {
	if f.empty() {
		panic("fdDispenser out of fds")
	}
	rv := f.fds[0]
	f.fds = f.fds[1:]
	return rv
}

func (f *fdDispenser) empty() bool {
	return len(f.fds) == 0
}
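
// Usage sketch (illustrative, with made-up FD values): a dispenser seeded
// with gofer FDs hands them out in FIFO order, one per 9p-backed mount:
//
//	fds := &fdDispenser{fds: []int{3, 4, 5}}
//	fd := fds.remove() // 3; subsequent calls return 4, then 5.
//	fds.empty()        // false until all FDs have been consumed.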

// createMountNamespace creates a mount namespace containing the root filesystem
// and all mounts. 'rootCtx' is used to walk directories to find mount points.
func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int) (*fs.MountNamespace, error) {
	mounts := compileMounts(spec)

	// Create a tmpfs mount where we create and mount a root filesystem for
	// each child container.
	mounts = append(mounts, specs.Mount{
		Type:        tmpfs,
		Destination: ChildContainersDir,
	})

	fds := &fdDispenser{fds: goferFDs}
	rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts)
	if err != nil {
		return nil, fmt.Errorf("failed to create root mount: %v", err)
	}
	mns, err := fs.NewMountNamespace(userCtx, rootInode)
	if err != nil {
		return nil, fmt.Errorf("failed to create root mount namespace: %v", err)
	}

	if err := setMounts(rootCtx, conf, mns, fds, mounts); err != nil {
		return nil, fmt.Errorf("failed to configure mounts: %v", err)
	}
	if !fds.empty() {
		return nil, fmt.Errorf("not all mount points were consumed, remaining: %v", fds)
	}
	return mns, nil
}
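
// Note (illustrative): each 9p-backed mount (the root plus every bind mount
// in the spec) consumes exactly one gofer FD from the dispenser, so FDs left
// over after setMounts indicate a mismatch between the spec and the gofers.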

// compileMounts returns the supported mounts from the mount spec, adding any
// mandatory mounts that are required by the OCI specification.
func compileMounts(spec *specs.Spec) []specs.Mount {
	// Keep track of whether proc, sys, and tmp were mounted.
	var procMounted, sysMounted, tmpMounted bool
	var mounts []specs.Mount

	// Always mount /dev.
	mounts = append(mounts, specs.Mount{
		Type:        devtmpfs,
		Destination: "/dev",
	})
	mounts = append(mounts, specs.Mount{
		Type:        devpts,
		Destination: "/dev/pts",
	})

	// Mount all submounts from the spec.
	for _, m := range spec.Mounts {
		if !specutils.IsSupportedDevMount(m) {
			log.Warningf("ignoring dev mount at %q", m.Destination)
			continue
		}
		mounts = append(mounts, m)
		switch filepath.Clean(m.Destination) {
		case "/proc":
			procMounted = true
		case "/sys":
			sysMounted = true
		case "/tmp":
			tmpMounted = true
		}
	}

	// Mount proc and sys even if the user did not ask for them, as the
	// spec says we SHOULD.
	var mandatoryMounts []specs.Mount
	if !procMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        proc,
			Destination: "/proc",
		})
	}
	if !sysMounted {
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        sysfs,
			Destination: "/sys",
		})
	}

	// Technically we don't have to mount tmpfs at /tmp, as we could just
	// rely on the host /tmp, but this is a nice optimization, and fixes
	// some apps that call mknod in /tmp.
	if !tmpMounted {
		// TODO: If the host /tmp (or a mount at /tmp) has
		// files in it, we should overlay our tmpfs implementation over
		// that. Until then, the /tmp mount will always appear empty at
		// container creation.
		mandatoryMounts = append(mandatoryMounts, specs.Mount{
			Type:        tmpfs,
			Destination: "/tmp",
		})
	}

	// The mandatory mounts should be ordered right after the root, in case
	// there are submounts of these mandatory mounts already in the spec.
	mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)

	return mounts
}
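
// For example (illustrative): a spec whose only mount is {proc -> /proc}
// compiles to /sys (sysfs) and /tmp (tmpfs) first, since the mandatory mounts
// are placed right after the root, followed by /dev (devtmpfs), /dev/pts
// (devpts), and the spec's own /proc mount.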

// setMounts iterates over mounts and mounts them in the specified
// mount namespace.
func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, mounts []specs.Mount) error {
	for _, m := range mounts {
		if err := mountSubmount(ctx, conf, mns, fds, m, mounts, m.Destination); err != nil {
			return err
		}
	}
	return nil
}

// createRootMount creates the root filesystem.
func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) {
	// First construct the filesystem from the spec.Root.
	mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly}

	var (
		rootInode *fs.Inode
		err       error
	)

	fd := fds.remove()
	log.Infof("Mounting root over 9P, ioFD: %d", fd)
	hostFS := mustFindFilesystem("9p")
	opts := p9MountOptions(fd, conf.FileAccess)
	rootInode, err = hostFS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","))
	if err != nil {
		return nil, fmt.Errorf("failed to generate root mount point: %v", err)
	}

	// We need to overlay the root on top of a ramfs with stub directories
	// for submount paths. "/dev", "/sys", "/proc", and "/tmp" are always
	// mounted even if they are not in the spec.
	submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp")
	rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
	if err != nil {
		return nil, fmt.Errorf("error adding submount overlay: %v", err)
	}

	if conf.Overlay && !spec.Root.Readonly {
		log.Debugf("Adding overlay on top of root mount")
		// Overlay a tmpfs filesystem on top of the root.
		rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
		if err != nil {
			return nil, err
		}
	}

	log.Infof("Mounted %q to \"/\" type root", spec.Root.Path)
	return rootInode, nil
}
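
// The resulting root is layered (a sketch, reading bottom-up): a ramfs tree
// of stub submount directories, the 9p-backed root mounted over it, and, when
// conf.Overlay is set and the root is writable, a tmpfs upper layer on top.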

func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
	// Upper layer uses the same flags as lower, but it must be read-write.
	lowerFlags.ReadOnly = false

	tmpFS := mustFindFilesystem("tmpfs")
	if !fs.IsDir(lower.StableAttr) {
		// Create overlay on top of mount file, e.g. /etc/hostname.
		msrc := fs.NewCachingMountSource(tmpFS, lowerFlags)
		return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags)
	}

	// Create overlay on top of mount dir.
	upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "")
	if err != nil {
		return nil, fmt.Errorf("failed to create tmpfs overlay: %v", err)
	}
	return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags)
}

// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
// used for mounts.
func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) {
	var (
		fsName     string
		opts       []string
		useOverlay bool
		err        error
	)

	switch m.Type {
	case devpts, devtmpfs, proc, sysfs:
		fsName = m.Type
	case nonefs:
		fsName = sysfs
	case tmpfs:
		fsName = m.Type

		// tmpfs has some extra supported options that we must pass through.
		opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")

	case bind:
		fd := fds.remove()
		fsName = "9p"
		// Non-root bind mounts are always shared.
		opts = p9MountOptions(fd, FileAccessShared)
		// If configured, add overlay to all writable mounts.
		useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly

	default:
		// TODO: Support all the mount types and make this a
		// fatal error. Most applications will "just work" without
		// them, so this is a warning for now.
		log.Warningf("ignoring unknown filesystem type %q", m.Type)
	}
	return fsName, opts, useOverlay, err
}
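
// For example (illustrative): a mount of Type "tmpfs" with Options
// ["mode=01777", "noatime"] maps to fsName "tmpfs" and opts ["mode=01777"]
// (unsupported keys are filtered out), while a "bind" mount consumes one
// gofer FD and maps to fsName "9p" with shared-cache 9p options.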

func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount, dest string) error {
	// Map mount type to filesystem name, and parse out the options that we are
	// capable of dealing with.
	fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)

	// Return the error or nil that corresponds to the default case in getMountNameAndOptions.
	if err != nil {
		return err
	}
	if fsName == "" {
		return nil
	}

	// All filesystem names should have been mapped to something we know.
	filesystem := mustFindFilesystem(fsName)

	mf := mountFlags(m.Options)
	if useOverlay {
		// All writes go to upper, be paranoid and make lower readonly.
		mf.ReadOnly = true
	}

	inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","))
	if err != nil {
		return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err)
	}

	// If there are submounts, we need to overlay the mount on top of a
	// ramfs with stub directories for submount paths.
	submounts := subtargets(m.Destination, mounts)
	if len(submounts) > 0 {
		log.Infof("Adding submount overlay over %q", m.Destination)
		inode, err = addSubmountOverlay(ctx, inode, submounts)
		if err != nil {
			return fmt.Errorf("error adding submount overlay: %v", err)
		}
	}

	if useOverlay {
		log.Debugf("Adding overlay on top of mount %q", m.Destination)
		inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
		if err != nil {
			return err
		}
	}

	// Create the destination in case it doesn't exist. This is required, in
	// addition to 'addSubmountOverlay', when symlinks redirect directories to
	// another location, e.g. a mount at /var/run/secrets may need to be
	// created at /run/secrets if '/var/run' is a symlink to '/run'.
	if err := mkdirAll(ctx, mns, dest); err != nil {
		return err
	}

	root := mns.Root()
	defer root.DecRef()
	dirent, err := mns.FindInode(ctx, root, nil, dest, linux.MaxSymlinkTraversals)
	if err != nil {
		return fmt.Errorf("failed to find mount destination %q: %v", dest, err)
	}
	defer dirent.DecRef()
	if err := mns.Mount(ctx, dirent, inode); err != nil {
		return fmt.Errorf("failed to mount at destination %q: %v", dest, err)
	}

	log.Infof("Mounted %q to %q type %s", m.Source, dest, m.Type)
	return nil
}
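
// mkdirAll creates, if necessary, every directory along path in the given
// mount namespace, walking component by component from the root. For example
// (illustrative): mkdirAll(ctx, mns, "/var/run/secrets") walks "var", "run",
// and "secrets" in turn, creating any missing component as a 0755 directory.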
func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error {
	log.Infof("mkdirAll called with path %s", path)
	root := mns.Root()
	defer root.DecRef()

	// Starting at the root, walk the path.
	parent := root
	ps := strings.Split(filepath.Clean(path), string(filepath.Separator))
	for _, pathElem := range ps {
		if pathElem == "" {
			// This will be the case for the first and last element, if the
			// path begins or ends with '/'. Note that we always treat the
			// path as absolute, regardless of what the first character
			// contains.
			continue
		}
		d, err := mns.FindInode(ctx, root, parent, pathElem, fs.DefaultTraversalLimit)
		if err == syserror.ENOENT {
			// If we encounter a path that does not exist, then
			// create it.
			if err := parent.CreateDirectory(ctx, root, pathElem, fs.FilePermsFromMode(0755)); err != nil {
				return fmt.Errorf("failed to create directory %q: %v", pathElem, err)
			}
			if d, err = parent.Walk(ctx, root, pathElem); err != nil {
				return fmt.Errorf("walk to %q failed: %v", pathElem, err)
			}
		} else if err != nil {
			return fmt.Errorf("failed to find inode %q: %v", pathElem, err)
		}
		parent = d
	}
	return nil
}

// p9MountOptions creates a slice of options for a p9 mount.
func p9MountOptions(fd int, fa FileAccessType) []string {
	opts := []string{
		"trans=fd",
		"rfdno=" + strconv.Itoa(fd),
		"wfdno=" + strconv.Itoa(fd),
		"privateunixsocket=true",
	}
	if fa == FileAccessShared {
		opts = append(opts, "cache=remote_revalidating")
	}
	return opts
}
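
// For example (illustrative): p9MountOptions(5, FileAccessShared), once the
// caller joins it with ",", yields the mount data string:
//
//	trans=fd,rfdno=5,wfdno=5,privateunixsocket=true,cache=remote_revalidating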

// parseAndFilterOptions parses a MountOptions slice and filters by the allowed
// keys.
func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
	var out []string
	for _, o := range opts {
		kv := strings.Split(o, "=")
		switch len(kv) {
		case 1:
			if specutils.ContainsStr(allowedKeys, o) {
				out = append(out, o)
				continue
			}
			log.Warningf("ignoring unsupported key %q", o)
		case 2:
			if specutils.ContainsStr(allowedKeys, kv[0]) {
				out = append(out, o)
				continue
			}
			log.Warningf("ignoring unsupported key %q", kv[0])
		default:
			return nil, fmt.Errorf("invalid option %q", o)
		}
	}
	return out, nil
}
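
// For example (illustrative):
//
//	parseAndFilterOptions([]string{"mode=01777", "size=1g"}, "mode", "uid", "gid")
//
// returns ["mode=01777"], logging a warning for the unsupported "size" key; a
// malformed option such as "a=b=c" produces an error.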

func destinations(mounts []specs.Mount, extra ...string) []string {
	var ds []string
	for _, m := range mounts {
		ds = append(ds, m.Destination)
	}
	return append(ds, extra...)
}

// mountDevice returns a device string based on the fs type and target
// of the mount.
func mountDevice(m specs.Mount) string {
	if m.Type == bind {
		// Make a device string that includes the target, which is consistent across
		// S/R and uniquely identifies the connection.
		return "9pfs-" + m.Destination
	}
	// All other fs types use device "none".
	return "none"
}
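
// For example (illustrative): a bind mount with Destination "/data" gets the
// device string "9pfs-/data", while a tmpfs or proc mount gets "none".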

// addRestoreMount adds a mount to the MountSources map used for restoring a
// checkpointed container.
func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error {
	fsName, opts, _, err := getMountNameAndOptions(conf, m, fds)

	// Return the error or nil that corresponds to the default case in getMountNameAndOptions.
	if err != nil {
		return err
	}
	// TODO: Fix this when we support all the mount types and
	// make this a fatal error.
	if fsName == "" {
		return nil
	}

	newMount := fs.MountArgs{
		Dev:   mountDevice(m),
		Flags: mountFlags(m.Options),
		Data:  strings.Join(opts, ","),
	}
	renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
	log.Infof("Added mount at %q: %+v", fsName, newMount)
	return nil
}

// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts
// to the environment.
func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) {
	renv := &fs.RestoreEnvironment{
		MountSources: make(map[string][]fs.MountArgs),
	}

	// Add root mount.
	fd := fds.remove()
	opts := p9MountOptions(fd, conf.FileAccess)

	mf := fs.MountSourceFlags{}
	if spec.Root.Readonly {
		mf.ReadOnly = true
	}

	rootMount := fs.MountArgs{
		Dev:   rootDevice,
		Flags: mf,
		Data:  strings.Join(opts, ","),
	}
	renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)

	// Add submounts.
	for _, m := range compileMounts(spec) {
		if err := addRestoreMount(conf, renv, m, fds); err != nil {
			return nil, err
		}
	}
	return renv, nil
}
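
// For example (illustrative): a spec with two tmpfs mounts restores to
// MountSources["tmpfs"] holding two fs.MountArgs entries, while the root
// mount (and any bind mounts) lands under MountSources["9p"].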

func mountFlags(opts []string) fs.MountSourceFlags {
	mf := fs.MountSourceFlags{}
	for _, o := range opts {
		switch o {
		case "rw":
			mf.ReadOnly = false
		case "ro":
			mf.ReadOnly = true
		case "noatime":
			mf.NoAtime = true
		default:
			log.Warningf("ignoring unknown mount option %q", o)
		}
	}
	return mf
}
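
// For example (illustrative): mountFlags([]string{"ro", "noatime", "nosuid"})
// returns flags with ReadOnly and NoAtime set, logging a warning for the
// unrecognized "nosuid" option.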

func mustFindFilesystem(name string) fs.Filesystem {
	fs, ok := fs.FindFilesystem(name)
	if !ok {
		panic(fmt.Sprintf("could not find filesystem %q", name))
	}
	return fs
}

// addSubmountOverlay overlays the inode over a ramfs tree containing the given
// paths.
func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
	// There is no real filesystem backing this ramfs tree, so we pass in
	// "nil" here.
	mountTree, err := ramfs.MakeDirectoryTree(ctx, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), submounts)
	if err != nil {
		return nil, fmt.Errorf("error creating mount tree: %v", err)
	}
	overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
	if err != nil {
		return nil, fmt.Errorf("failed to make mount overlay: %v", err)
	}
	return overlayInode, err
}

// subtargets takes a set of Mounts and returns only the targets that are
// children of the given root. The returned paths are relative to the root.
func subtargets(root string, mnts []specs.Mount) []string {
	r := filepath.Clean(root)
	if len(r) > 0 && r[len(r)-1] != '/' {
		r += "/"
	}
	var targets []string
	for _, mnt := range mnts {
		t := filepath.Clean(mnt.Destination)
		if strings.HasPrefix(t, r) {
			// Make the mnt path relative to the root path. If the
			// result is empty, then mnt IS the root mount, not a
			// submount. We don't want to include those.
			if t := strings.TrimPrefix(t, r); t != "" {
				targets = append(targets, t)
			}
		}
	}
	return targets
}
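
// For example (illustrative): subtargets("/foo", ...) with mounts destined
// for /foo, /foo/bar, and /baz returns only "bar"; /foo itself and the
// unrelated /baz are excluded.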

// setupContainerFS is used to set up the file system and amend the procArgs accordingly.
// procArgs is passed by reference and its FDMap field is modified. It dups stdioFDs.
func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
	ctx := procArgs.NewContext(k)

	// Create the FD map, which will set stdin, stdout, and stderr. If
	// console is true, then ioctl calls will be passed through to the host
	// fd.
	fdm, err := createFDMap(ctx, k, ls, console, stdioFDs)
	if err != nil {
		return fmt.Errorf("error importing fds: %v", err)
	}

	// CreateProcess takes a reference on FDMap if successful. We
	// won't need ours either way.
	procArgs.FDMap = fdm

	// Use root user to configure mounts. The current user might not have
	// permission to do so.
	rootProcArgs := kernel.CreateProcessArgs{
		WorkingDirectory:     "/",
		Credentials:          auth.NewRootCredentials(creds.UserNamespace),
		Umask:                0022,
		MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
	}
	rootCtx := rootProcArgs.NewContext(k)

	// If this is the root container, we also need to set up the root mount
	// namespace.
	mns := k.RootMountNamespace()
	if mns == nil {
		// Create the virtual filesystem.
		mns, err := createMountNamespace(ctx, rootCtx, spec, conf, goferFDs)
		if err != nil {
			return fmt.Errorf("error creating mounts: %v", err)
		}
		k.SetRootMountNamespace(mns)
		return nil
	}

	// Set up a child container.

	// Create the container's root filesystem mount.
	log.Infof("Creating new process in child container.")
	fds := &fdDispenser{fds: append([]int{}, goferFDs...)}
	rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil)
	if err != nil {
		return fmt.Errorf("error creating filesystem for container: %v", err)
	}

	// Make directories for submounts within the container.
	rootDir := mns.Root()
	defer rootDir.DecRef()
	containerRoot := filepath.Join(ChildContainersDir, cid)
	if err := mkdirAll(ctx, mns, containerRoot); err != nil {
		return fmt.Errorf("error creating container root directory %q: %v", containerRoot, err)
	}

	// Mount the container's root filesystem to the newly created
	// mount point.
	containerRootDirent, err := mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals)
	if err != nil {
		return fmt.Errorf("failed to find mount destination %q: %v", containerRoot, err)
	}
	if err := mns.Mount(ctx, containerRootDirent, rootInode); err != nil {
		return fmt.Errorf("failed to mount at destination %q: %v", containerRoot, err)
	}
	containerRootDirent.DecRef()

	// We have to re-walk to the dirent to find the mounted
	// directory. The old dirent is invalid at this point.
	containerRootDirent, err = mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals)
	if err != nil {
		return fmt.Errorf("failed to re-find mount destination %q: %v", containerRoot, err)
	}
	log.Infof("Mounted child's root fs to %q", containerRoot)

	// Mount all submounts.
	mounts := compileMounts(spec)
	for _, m := range mounts {
		dest := filepath.Join(containerRoot, m.Destination)
		if err := mountSubmount(rootCtx, conf, k.RootMountNamespace(), fds, m, mounts, dest); err != nil {
			return fmt.Errorf("error mounting filesystem for container: %v", err)
		}
	}

	// Set the procArgs root directory.
	procArgs.Root = containerRootDirent
	return nil
}

// setExecutablePath sets the procArgs.Filename by searching the PATH for an
// executable matching the procArgs.Argv[0].
func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
	paths := fs.GetPath(procArgs.Envv)
	f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, procArgs.Argv[0], paths)
	if err != nil {
		return err
	}
	procArgs.Filename = f
	return nil
}
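
// For example (illustrative): with Argv[0] "ls", WorkingDirectory "/", and
// PATH=/bin:/usr/bin in Envv, Filename would resolve to something like
// "/bin/ls", assuming that is where the binary lives in the mount namespace.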

// destroyContainerFS cleans up the filesystem by unmounting all mounts for the
// given container and deleting the container root directory.
func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error {
	// First get a reference to the container root directory.
	mns := k.RootMountNamespace()
	mnsRoot := mns.Root()
	defer mnsRoot.DecRef()
	containerRoot := path.Join(ChildContainersDir, cid)
	containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals)
	if err == syserror.ENOENT {
		// Container must have been destroyed already. That's fine.
		return nil
	}
	if err != nil {
		return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err)
	}
	defer containerRootDirent.DecRef()

	// Iterate through all submounts and unmount them. We unmount lazily by
	// setting detach=true, so we can unmount in any order.
	for _, m := range containerRootDirent.Inode.MountSource.Submounts() {
		root := m.Root()
		defer root.DecRef()

		// Do a best-effort unmount by flushing the refs and unmount
		// with "detach only = true".
		log.Debugf("Unmounting container submount %q", root.BaseName())
		m.FlushDirentRefs()
		if err := mns.Unmount(ctx, root, true /* detach only */); err != nil {
			return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err)
		}
	}

	// Unmount the container root itself.
	log.Debugf("Unmounting container root %q", containerRoot)
	containerRootDirent.Inode.MountSource.FlushDirentRefs()
	if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil {
		return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err)
	}

	// Get a reference to the parent directory and remove the root
	// container directory.
	containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals)
	if err != nil {
		return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err)
	}
	defer containersDirDirent.DecRef()
	log.Debugf("Deleting container root %q", containerRoot)
	if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil {
		return fmt.Errorf("error removing directory %q: %v", containerRoot, err)
	}

	// Flushing dirent references triggers many async close operations. We
	// must wait for those to complete before returning, otherwise the
	// caller may kill the gofer before they complete, causing a cascade of
	// failing RPCs.
	log.Infof("Waiting for async filesystem operations to complete")
	fs.AsyncBarrier()

	return nil
}