From 5c4f4ed9eb05cfef036b55883edb8de780288441 Mon Sep 17 00:00:00 2001 From: Jamie Liu Date: Thu, 18 Mar 2021 11:10:36 -0700 Subject: [PATCH] Skip /dev submount hack on VFS2. containerd usually configures both /dev and /dev/shm as tmpfs mounts, e.g.: ``` "mounts": [ ... { "destination": "/dev", "type": "tmpfs", "source": "/run/containerd/io.containerd.runtime.v2.task/moby/10eedbd6a0e7937ddfcab90f2c25bd9a9968b734c4ae361318142165d445e67e/tmpfs", "options": [ "nosuid", "strictatime", "mode=755", "size=65536k" ] }, ... { "destination": "/dev/shm", "type": "tmpfs", "source": "/run/containerd/io.containerd.runtime.v2.task/moby/10eedbd6a0e7937ddfcab90f2c25bd9a9968b734c4ae361318142165d445e67e/shm", "options": [ "nosuid", "noexec", "nodev", "mode=1777", "size=67108864" ] }, ... ``` (This is mostly consistent with how Linux is usually configured, except that /dev is conventionally devtmpfs, not regular tmpfs. runc/libcontainer implements OCI-runtime-spec-undocumented behavior to create /dev/{ptmx,fd,stdin,stdout,stderr} in non-bind /dev mounts. runsc silently switches /dev to devtmpfs. In VFS1, this is necessary to get device files like /dev/null at all, since VFS1 doesn't support real device special files, only what is hardcoded in devfs. VFS2 does support device special files, but using devtmpfs is the easiest way to get pre-created files in /dev.) runsc ignores many /dev submounts in the spec, including /dev/shm. In VFS1, this appears to be to avoid introducing a submount overlay for /dev, and is mostly fine since the typical mode for the /dev/shm mount is ~consistent with the mode of the /dev/shm directory provided by devfs (modulo the sticky bit). In VFS2, this is vestigial (VFS2 does not use submount overlays), and devtmpfs' /dev/shm mode is correct for the mount point but not the mount. So turn off this behavior for VFS2. After this change: ``` $ docker run --rm -it ubuntu:focal ls -lah /dev/shm total 0 drwxrwxrwt 2 root root 40 Mar 18 00:16 . drwxr-xr-x 5 root root 360 Mar 18 00:16 .. $ docker run --runtime=runsc --rm -it ubuntu:focal ls -lah /dev/shm total 0 drwxrwxrwx 1 root root 0 Mar 18 00:16 . dr-xr-xr-x 1 root root 0 Mar 18 00:16 .. $ docker run --runtime=runsc-vfs2 --rm -it ubuntu:focal ls -lah /dev/shm total 0 drwxrwxrwt 2 root root 40 Mar 18 00:16 . drwxr-xr-x 5 root root 320 Mar 18 00:16 .. ``` Fixes #5687 PiperOrigin-RevId: 363699385 --- runsc/boot/controller.go | 2 +- runsc/boot/fs.go | 8 ++++---- runsc/boot/loader.go | 2 +- runsc/boot/loader_test.go | 6 +++--- runsc/cmd/gofer.go | 4 ++-- runsc/specutils/specutils.go | 11 +++++------ 6 files changed, 16 insertions(+), 17 deletions(-) diff --git a/runsc/boot/controller.go b/runsc/boot/controller.go index 1cd5fba5c..1ae76d7d7 100644 --- a/runsc/boot/controller.go +++ b/runsc/boot/controller.go @@ -400,7 +400,7 @@ func (cm *containerManager) Restore(o *RestoreOpts, _ *struct{}) error { // Set up the restore environment. ctx := k.SupervisorContext() - mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints) + mntr := newContainerMounter(cm.l.root.spec, cm.l.root.goferFDs, cm.l.k, cm.l.mountHints, kernel.VFS2Enabled) if kernel.VFS2Enabled { ctx, err = mntr.configureRestore(ctx, cm.l.root.conf) if err != nil { diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 77f632bb9..d1dacee03 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -103,14 +103,14 @@ func addOverlay(ctx context.Context, conf *config.Config, lower *fs.Inode, name // compileMounts returns the supported mounts from the mount spec, adding any // mandatory mounts that are required by the OCI specification. -func compileMounts(spec *specs.Spec) []specs.Mount { +func compileMounts(spec *specs.Spec, vfs2Enabled bool) []specs.Mount { // Keep track of whether proc and sys were mounted. var procMounted, sysMounted, devMounted, devptsMounted bool var mounts []specs.Mount // Mount all submounts from the spec. for _, m := range spec.Mounts { - if !specutils.IsSupportedDevMount(m) { + if !vfs2Enabled && !specutils.IsVFS1SupportedDevMount(m) { log.Warningf("ignoring dev mount at %q", m.Destination) continue } @@ -572,10 +572,10 @@ type containerMounter struct { hints *podMountHints } -func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints) *containerMounter { +func newContainerMounter(spec *specs.Spec, goferFDs []*fd.FD, k *kernel.Kernel, hints *podMountHints, vfs2Enabled bool) *containerMounter { return &containerMounter{ root: spec.Root, - mounts: compileMounts(spec), + mounts: compileMounts(spec, vfs2Enabled), fds: fdDispenser{fds: goferFDs}, k: k, hints: hints, diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 5afce232d..774621970 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -752,7 +752,7 @@ func (l *Loader) createContainerProcess(root bool, cid string, info *containerIn // Setup the child container file system. l.startGoferMonitor(cid, info.goferFDs) - mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints) + mntr := newContainerMounter(info.spec, info.goferFDs, l.k, l.mountHints, kernel.VFS2Enabled) if root { if err := mntr.processHints(info.conf, info.procArgs.Credentials); err != nil { return nil, nil, nil, err diff --git a/runsc/boot/loader_test.go b/runsc/boot/loader_test.go index 3121ca6eb..8b39bc59a 100644 --- a/runsc/boot/loader_test.go +++ b/runsc/boot/loader_test.go @@ -439,7 +439,7 @@ func TestCreateMountNamespace(t *testing.T) { } defer cleanup() - mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{}) + mntr := newContainerMounter(&tc.spec, []*fd.FD{fd.New(sandEnd)}, nil, &podMountHints{}, false /* vfs2Enabled */) mns, err := mntr.createMountNamespace(ctx, conf) if err != nil { t.Fatalf("failed to create mount namespace: %v", err) @@ -479,7 +479,7 @@ func TestCreateMountNamespaceVFS2(t *testing.T) { defer l.Destroy() defer loaderCleanup() - mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints) + mntr := newContainerMounter(l.root.spec, l.root.goferFDs, l.k, l.mountHints, true /* vfs2Enabled */) if err := mntr.processHints(l.root.conf, l.root.procArgs.Credentials); err != nil { t.Fatalf("failed process hints: %v", err) } @@ -702,7 +702,7 @@ func TestRestoreEnvironment(t *testing.T) { for _, ioFD := range tc.ioFDs { ioFDs = append(ioFDs, fd.New(ioFD)) } - mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{}) + mntr := newContainerMounter(tc.spec, ioFDs, nil, &podMountHints{}, false /* vfs2Enabled */) actualRenv, err := mntr.createRestoreEnvironment(conf) if !tc.errorExpected && err != nil { t.Fatalf("could not create restore environment for test:%s", tc.name) diff --git a/runsc/cmd/gofer.go b/runsc/cmd/gofer.go index 639b2219c..d703e4042 100644 --- a/runsc/cmd/gofer.go +++ b/runsc/cmd/gofer.go @@ -346,7 +346,7 @@ func setupRootFS(spec *specs.Spec, conf *config.Config) error { // creates directories as needed. func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error { for _, m := range mounts { - if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + if m.Type != "bind" || !specutils.IsVFS1SupportedDevMount(m) { continue } @@ -386,7 +386,7 @@ func setupMounts(conf *config.Config, mounts []specs.Mount, root string) error { func resolveMounts(conf *config.Config, mounts []specs.Mount, root string) ([]specs.Mount, error) { cleanMounts := make([]specs.Mount, 0, len(mounts)) for _, m := range mounts { - if m.Type != "bind" || !specutils.IsSupportedDevMount(m) { + if m.Type != "bind" || !specutils.IsVFS1SupportedDevMount(m) { cleanMounts = append(cleanMounts, m) continue } diff --git a/runsc/specutils/specutils.go b/runsc/specutils/specutils.go index 5ba38bfe4..45856fd58 100644 --- a/runsc/specutils/specutils.go +++ b/runsc/specutils/specutils.go @@ -334,14 +334,13 @@ func capsFromNames(names []string, skipSet map[linux.Capability]struct{}) (auth. // Is9PMount returns true if the given mount can be mounted as an external gofer. func Is9PMount(m specs.Mount) bool { - return m.Type == "bind" && m.Source != "" && IsSupportedDevMount(m) + return m.Type == "bind" && m.Source != "" && IsVFS1SupportedDevMount(m) } -// IsSupportedDevMount returns true if the mount is a supported /dev mount. -// Only mount that does not conflict with runsc default /dev mount is -// supported. -func IsSupportedDevMount(m specs.Mount) bool { - // These are devices exist inside sentry. See pkg/sentry/fs/dev/dev.go +// IsVFS1SupportedDevMount returns true if m.Destination does not specify a +// path that is hardcoded by VFS1's implementation of /dev. +func IsVFS1SupportedDevMount(m specs.Mount) bool { + // See pkg/sentry/fs/dev/dev.go. var existingDevices = []string{ "/dev/fd", "/dev/stdin", "/dev/stdout", "/dev/stderr", "/dev/null", "/dev/zero", "/dev/full", "/dev/random",