Fix /proc/self/mounts and /proc/self/mountinfo in VFS2.

Some extra fields were added to the Mount type to expose necessary data to the proc filesystem. PiperOrigin-RevId: 304053361
2020-03-31 15:00:26 -07:00 · 2020-03-31 15:00:26 -07:00 · e1c8eaca8f
parent 9de982ea79
commit e1c8eaca8f
2 changed files with 220 additions and 157 deletions
--- a/pkg/sentry/fsimpl/proc/task_files.go
+++ b/pkg/sentry/fsimpl/proc/task_files.go
@ -18,13 +18,10 @@ import (
 	"bytes"
 	"fmt"
 	"io"
-	"sort"
-	"strings"

 	"gvisor.dev/gvisor/pkg/abi/linux"
 	"gvisor.dev/gvisor/pkg/context"
 	"gvisor.dev/gvisor/pkg/safemem"
-	"gvisor.dev/gvisor/pkg/sentry/fs"
 	"gvisor.dev/gvisor/pkg/sentry/fsbridge"
 	"gvisor.dev/gvisor/pkg/sentry/fsimpl/kernfs"
 	"gvisor.dev/gvisor/pkg/sentry/kernel"
@ -634,51 +631,6 @@ func (s *exeSymlink) executable() (file fsbridge.File, err error) {
 	return
 }

-// forEachMountSource runs f for the process root mount and each mount that is
-// a descendant of the root.
-func forEachMount(t *kernel.Task, fn func(string, *fs.Mount)) {
-	var fsctx *kernel.FSContext
-	t.WithMuLocked(func(t *kernel.Task) {
-		fsctx = t.FSContext()
-	})
-	if fsctx == nil {
-		// The task has been destroyed. Nothing to show here.
-		return
-	}
-
-	// All mount points must be relative to the rootDir, and mounts outside
-	// will be excluded.
-	rootDir := fsctx.RootDirectory()
-	if rootDir == nil {
-		// The task has been destroyed. Nothing to show here.
-		return
-	}
-	defer rootDir.DecRef()
-
-	mnt := t.MountNamespace().FindMount(rootDir)
-	if mnt == nil {
-		// Has it just been unmounted?
-		return
-	}
-	ms := t.MountNamespace().AllMountsUnder(mnt)
-	sort.Slice(ms, func(i, j int) bool {
-		return ms[i].ID < ms[j].ID
-	})
-	for _, m := range ms {
-		mroot := m.Root()
-		if mroot == nil {
-			continue // No longer valid.
-		}
-		mountPath, desc := mroot.FullName(rootDir)
-		mroot.DecRef()
-		if !desc {
-			// MountSources that are not descendants of the chroot jail are ignored.
-			continue
-		}
-		fn(mountPath, m)
-	}
-}
-
 // mountInfoData is used to implement /proc/[pid]/mountinfo.
 //
 // +stateify savable
@ -692,94 +644,24 @@ var _ dynamicInode = (*mountInfoData)(nil)

 // Generate implements vfs.DynamicBytesSource.Generate.
 func (i *mountInfoData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	forEachMount(i.task, func(mountPath string, m *fs.Mount) {
-		mroot := m.Root()
-		if mroot == nil {
-			return // No longer valid.
-		}
-		defer mroot.DecRef()
-
-		// Format:
-		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
-		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)
-
-		// (1) MountSource ID.
-		fmt.Fprintf(buf, "%d ", m.ID)
-
-		// (2)  Parent ID (or this ID if there is no parent).
-		pID := m.ID
-		if !m.IsRoot() && !m.IsUndo() {
-			pID = m.ParentID
-		}
-		fmt.Fprintf(buf, "%d ", pID)
-
-		// (3) Major:Minor device ID. We don't have a superblock, so we
-		// just use the root inode device number.
-		sa := mroot.Inode.StableAttr
-		fmt.Fprintf(buf, "%d:%d ", sa.DeviceFileMajor, sa.DeviceFileMinor)
-
-		// (4) Root: the pathname of the directory in the filesystem
-		// which forms the root of this mount.
-		//
-		// NOTE(b/78135857): This will always be "/" until we implement
-		// bind mounts.
-		fmt.Fprintf(buf, "/ ")
-
-		// (5) Mount point (relative to process root).
-		fmt.Fprintf(buf, "%s ", mountPath)
-
-		// (6) Mount options.
-		flags := mroot.Inode.MountSource.Flags
-		opts := "rw"
-		if flags.ReadOnly {
-			opts = "ro"
-		}
-		if flags.NoAtime {
-			opts += ",noatime"
-		}
-		if flags.NoExec {
-			opts += ",noexec"
-		}
-		fmt.Fprintf(buf, "%s ", opts)
-
-		// (7) Optional fields: zero or more fields of the form "tag[:value]".
-		// (8) Separator: the end of the optional fields is marked by a single hyphen.
-		fmt.Fprintf(buf, "- ")
-
-		// (9) Filesystem type.
-		fmt.Fprintf(buf, "%s ", mroot.Inode.MountSource.FilesystemType)
-
-		// (10) Mount source: filesystem-specific information or "none".
-		fmt.Fprintf(buf, "none ")
-
-		// (11) Superblock options, and final newline.
-		fmt.Fprintf(buf, "%s\n", superBlockOpts(mountPath, mroot.Inode.MountSource))
+	var fsctx *kernel.FSContext
+	i.task.WithMuLocked(func(t *kernel.Task) {
+		fsctx = t.FSContext()
 	})
+	if fsctx == nil {
+		// The task has been destroyed. Nothing to show here.
+		return nil
+	}
+	rootDir := fsctx.RootDirectoryVFS2()
+	if !rootDir.Ok() {
+		// Root has been destroyed. Don't try to read mounts.
+		return nil
+	}
+	defer rootDir.DecRef()
+	i.task.Kernel().VFS().GenerateProcMountInfo(ctx, rootDir, buf)
 	return nil
 }

-func superBlockOpts(mountPath string, msrc *fs.MountSource) string {
-	// gVisor doesn't (yet) have a concept of super block options, so we
-	// use the ro/rw bit from the mount flag.
-	opts := "rw"
-	if msrc.Flags.ReadOnly {
-		opts = "ro"
-	}
-
-	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
-	// the cgroup name in the options. For now we just read that from the
-	// path.
-	// TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
-	// should get this value from the cgroup itself, and not rely on the
-	// path.
-	if msrc.FilesystemType == "cgroup" {
-		splitPath := strings.Split(mountPath, "/")
-		cgroupType := splitPath[len(splitPath)-1]
-		opts += "," + cgroupType
-	}
-	return opts
-}
-
 // mountsData is used to implement /proc/[pid]/mounts.
 //
 // +stateify savable
@ -789,33 +671,24 @@ type mountsData struct {
 	task *kernel.Task
 }

-var _ dynamicInode = (*mountInfoData)(nil)
+var _ dynamicInode = (*mountsData)(nil)

 // Generate implements vfs.DynamicBytesSource.Generate.
 func (i *mountsData) Generate(ctx context.Context, buf *bytes.Buffer) error {
-	forEachMount(i.task, func(mountPath string, m *fs.Mount) {
-		// Format:
-		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
-		//
-		// We use the filesystem name as the first field, since there
-		// is no real block device we can point to, and we also should
-		// not expose anything about the remote filesystem.
-		//
-		// Only ro/rw option is supported for now.
-		//
-		// The "needs dump"and fsck flags are always 0, which is allowed.
-		root := m.Root()
-		if root == nil {
-			return // No longer valid.
-		}
-		defer root.DecRef()
-
-		flags := root.Inode.MountSource.Flags
-		opts := "rw"
-		if flags.ReadOnly {
-			opts = "ro"
-		}
-		fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", mountPath, root.Inode.MountSource.FilesystemType, opts, 0, 0)
+	var fsctx *kernel.FSContext
+	i.task.WithMuLocked(func(t *kernel.Task) {
+		fsctx = t.FSContext()
 	})
+	if fsctx == nil {
+		// The task has been destroyed. Nothing to show here.
+		return nil
+	}
+	rootDir := fsctx.RootDirectoryVFS2()
+	if !rootDir.Ok() {
+		// Root has been destroyed. Don't try to read mounts.
+		return nil
+	}
+	defer rootDir.DecRef()
+	i.task.Kernel().VFS().GenerateProcMounts(ctx, rootDir, buf)
 	return nil
 }
--- a/pkg/sentry/vfs/mount.go
+++ b/pkg/sentry/vfs/mount.go
@ -15,7 +15,11 @@
 package vfs

 import (
+	"bytes"
+	"fmt"
 	"math"
+	"sort"
+	"strings"
 	"sync/atomic"

 	"gvisor.dev/gvisor/pkg/abi/linux"
@ -44,7 +48,7 @@ var lastMountID uint64
 //
 // +stateify savable
 type Mount struct {
-	// vfs, fs, and root are immutable. References are held on fs and root.
+	// vfs, fs, root are immutable. References are held on fs and root.
 	//
 	// Invariant: root belongs to fs.
 	vfs  *VirtualFilesystem
@ -639,12 +643,28 @@ func (mnt *Mount) setReadOnlyLocked(ro bool) error {
 	return nil
 }

+func (mnt *Mount) readOnly() bool {
+	return atomic.LoadInt64(&mnt.writers) < 0 // NOTE(review): presumably a negative writers count encodes read-only; confirm against setReadOnlyLocked.
+}
+
 // Filesystem returns the mounted Filesystem. It does not take a reference on
 // the returned Filesystem.
 func (mnt *Mount) Filesystem() *Filesystem {
 	return mnt.fs
 }

+// submountsLocked returns mnt followed by all Mounts that are descendants of
+// it, found by recursing through children; sibling order is unspecified.
+//
+// Precondition: mnt.vfs.mountMu must be held.
+func (mnt *Mount) submountsLocked() []*Mount {
+	mounts := []*Mount{mnt}
+	for m := range mnt.children {
+		mounts = append(mounts, m.submountsLocked()...)
+	}
+	return mounts
+}
+
 // Root returns mntns' root. A reference is taken on the returned
 // VirtualDentry.
 func (mntns *MountNamespace) Root() VirtualDentry {
@ -655,3 +675,173 @@ func (mntns *MountNamespace) Root() VirtualDentry {
 	vd.IncRef()
 	return vd
 }
+
+// GenerateProcMounts emits the contents of /proc/[pid]/mounts for vfs to buf.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMounts(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+	rootMnt := taskRootDir.mount
+	mounts := rootMnt.submountsLocked()
+	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+	for _, mnt := range mounts {
+		// Get the path to this mount relative to task root.
+		mntRootVD := VirtualDentry{
+			mount:  mnt,
+			dentry: mnt.root,
+		}
+		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+		if err != nil {
+			// For some reason we didn't get a path. Log a warning
+			// and treat the mount as unreachable.
+			ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+			path = ""
+		}
+		if path == "" {
+			// Either an error occurred, or path is not reachable
+			// from root; skip this mount but keep listing the rest.
+			continue
+		}

+		opts := "rw"
+		if mnt.readOnly() {
+			opts = "ro"
+		}
+		if mnt.flags.NoExec {
+			opts += ",noexec"
+		}

+		// Format:
+		// <special device or remote filesystem> <mount point> <filesystem type> <mount options> <needs dump> <fsck order>
+		//
+		// The "needs dump" and "fsck order" flags are always 0, which is
+		// allowed. The mount point is escaped as in mountinfo (see manglePath).
+		fmt.Fprintf(buf, "%s %s %s %s %d %d\n", "none", manglePath(path), mnt.fs.FilesystemType().Name(), opts, 0, 0)
+	}
+}
+
+// GenerateProcMountInfo emits the contents of /proc/[pid]/mountinfo for vfs to
+// buf, one line per mount reachable from taskRootDir, in mount ID order.
+//
+// Preconditions: taskRootDir.Ok().
+func (vfs *VirtualFilesystem) GenerateProcMountInfo(ctx context.Context, taskRootDir VirtualDentry, buf *bytes.Buffer) {
+	vfs.mountMu.Lock()
+	defer vfs.mountMu.Unlock()
+	rootMnt := taskRootDir.mount
+	mounts := rootMnt.submountsLocked()
+	sort.Slice(mounts, func(i, j int) bool { return mounts[i].ID < mounts[j].ID })
+	for _, mnt := range mounts {
+		// Get the path to this mount relative to task root.
+		mntRootVD := VirtualDentry{
+			mount:  mnt,
+			dentry: mnt.root,
+		}
+		path, err := vfs.PathnameReachable(ctx, taskRootDir, mntRootVD)
+		if err != nil {
+			// For some reason we didn't get a path. Log a warning
+			// and treat the mount as unreachable.
+			ctx.Warningf("Error getting pathname for mount root %+v: %v", mnt.root, err)
+			path = ""
+		}
+		if path == "" {
+			// Either an error occurred, or path is not reachable
+			// from root; skip this mount but keep listing the rest.
+			continue
+		}
+		// Stat the mount root to get the major/minor device numbers.
+		pop := &PathOperation{
+			Root:  mntRootVD,
+			Start: mntRootVD,
+		}
+		statx, err := vfs.StatAt(ctx, auth.NewAnonymousCredentials(), pop, &StatOptions{})
+		if err != nil {
+			ctx.Warningf("Error statting mount root %+v: %v", mnt.root, err)
+			continue // Ignore only this mount; one failure must not hide the rest.
+		}

+		// Format:
+		// 36 35 98:0 /mnt1 /mnt2 rw,noatime master:1 - ext3 /dev/root rw,errors=continue
+		// (1)(2)(3)   (4)   (5)      (6)      (7)   (8) (9)   (10)         (11)

+		// (1) Mount ID.
+		fmt.Fprintf(buf, "%d ", mnt.ID)

+		// (2) Parent ID (or this ID if there is no parent).
+		pID := mnt.ID
+		if p := mnt.parent(); p != nil {
+			pID = p.ID
+		}
+		fmt.Fprintf(buf, "%d ", pID)

+		// (3) Major:Minor device ID. We don't have a superblock, so we
+		// just use the root inode device number.
+		fmt.Fprintf(buf, "%d:%d ", statx.DevMajor, statx.DevMinor)

+		// (4) Root: the pathname of the directory in the filesystem
+		// which forms the root of this mount.
+		//
+		// NOTE(b/78135857): This will always be "/" until we implement
+		// bind mounts.
+		fmt.Fprintf(buf, "/ ")

+		// (5) Mount point (relative to process root).
+		fmt.Fprintf(buf, "%s ", manglePath(path))

+		// (6) Mount options.
+		opts := "rw"
+		if mnt.readOnly() {
+			opts = "ro"
+		}
+		if mnt.flags.NoExec {
+			opts += ",noexec"
+		}
+		// TODO(gvisor.dev/issue/1193): Add "noatime" if MS_NOATIME is
+		// set.
+		fmt.Fprintf(buf, "%s ", opts)

+		// (7) Optional fields: zero or more fields of the form "tag[:value]".
+		// (8) Separator: the end of the optional fields is marked by a single hyphen.
+		fmt.Fprintf(buf, "- ")

+		// (9) Filesystem type.
+		fmt.Fprintf(buf, "%s ", mnt.fs.FilesystemType().Name())

+		// (10) Mount source: filesystem-specific information or "none".
+		fmt.Fprintf(buf, "none ")

+		// (11) Superblock options, and final newline.
+		fmt.Fprintf(buf, "%s\n", superBlockOpts(path, mnt))
+	}
+}
+
+// manglePath replaces ' ', '\t', '\n', and '\\' with their octal escape
+// sequences (\040, \011, \012, \134). See Linux fs/seq_file.c:mangle_path.
+func manglePath(p string) string {
+	r := strings.NewReplacer(" ", "\\040", "\t", "\\011", "\n", "\\012", "\\", "\\134")
+	return r.Replace(p)
+}
+
+// superBlockOpts returns the super block options string for the mount at
+// the given path.
+func superBlockOpts(mountPath string, mnt *Mount) string {
+	// gVisor doesn't (yet) have a concept of super block options, so we
+	// use the ro/rw bit from the mount flag.
+	opts := "rw"
+	if mnt.readOnly() {
+		opts = "ro"
+	}

+	// NOTE(b/147673608): If the mount is a cgroup, we also need to include
+	// the cgroup name in the options. For now we just read that from the
+	// path.
+	// TODO(gvisor.dev/issues/190): Once gVisor has full cgroup support, we
+	// should get this value from the cgroup itself, and not rely on the
+	// path.
+	if mnt.fs.FilesystemType().Name() == "cgroup" {
+		splitPath := strings.Split(mountPath, "/")
+		cgroupType := splitPath[len(splitPath)-1]
+		opts += "," + cgroupType
+	}
+	return opts
+}