More runsc changes for VFS2
- Add /tmp handling - Apply mount options - Enable more container_test tests - Forward signals to child process when test respaws process to run as root inside namespace. Updates #1487 PiperOrigin-RevId: 314263281
This commit is contained in:
parent
050d8e6e33
commit
ca5912d13c
1
Makefile
1
Makefile
|
@ -217,6 +217,7 @@ dev: ## Installs a set of local runtimes. Requires sudo.
|
|||
@$(MAKE) configure RUNTIME="$(RUNTIME)" ARGS="--net-raw"
|
||||
@$(MAKE) configure RUNTIME="$(RUNTIME)-d" ARGS="--net-raw --debug --strace --log-packets"
|
||||
@$(MAKE) configure RUNTIME="$(RUNTIME)-p" ARGS="--net-raw --profile"
|
||||
@$(MAKE) configure RUNTIME="$(RUNTIME)-vfs2-d" ARGS="--net-raw --debug --strace --log-packets --vfs2"
|
||||
@sudo systemctl restart docker
|
||||
.PHONY: dev
|
||||
|
||||
|
|
|
@ -63,7 +63,7 @@ const (
|
|||
)
|
||||
|
||||
// tmpfs has some extra supported options that we must pass through.
|
||||
var tmpfsAllowedOptions = []string{"mode", "uid", "gid"}
|
||||
var tmpfsAllowedData = []string{"mode", "uid", "gid"}
|
||||
|
||||
func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
|
||||
// Upper layer uses the same flags as lower, but it must be read-write.
|
||||
|
@ -154,8 +154,8 @@ func compileMounts(spec *specs.Spec) []specs.Mount {
|
|||
return mounts
|
||||
}
|
||||
|
||||
// p9MountOptions creates a slice of options for a p9 mount.
|
||||
func p9MountOptions(fd int, fa FileAccessType, vfs2 bool) []string {
|
||||
// p9MountData creates a slice of p9 mount data.
|
||||
func p9MountData(fd int, fa FileAccessType, vfs2 bool) []string {
|
||||
opts := []string{
|
||||
"trans=fd",
|
||||
"rfdno=" + strconv.Itoa(fd),
|
||||
|
@ -235,7 +235,7 @@ func isSupportedMountFlag(fstype, opt string) bool {
|
|||
return true
|
||||
}
|
||||
if fstype == tmpfsvfs2.Name {
|
||||
ok, err := parseMountOption(opt, tmpfsAllowedOptions...)
|
||||
ok, err := parseMountOption(opt, tmpfsAllowedData...)
|
||||
return ok && err == nil
|
||||
}
|
||||
return false
|
||||
|
@ -716,7 +716,7 @@ func (c *containerMounter) createRootMount(ctx context.Context, conf *Config) (*
|
|||
fd := c.fds.remove()
|
||||
log.Infof("Mounting root over 9P, ioFD: %d", fd)
|
||||
p9FS := mustFindFilesystem("9p")
|
||||
opts := p9MountOptions(fd, conf.FileAccess, false /* vfs2 */)
|
||||
opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
|
||||
|
||||
if conf.OverlayfsStaleRead {
|
||||
// We can't check for overlayfs here because sandbox is chroot'ed and gofer
|
||||
|
@ -770,7 +770,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
|
|||
fsName = m.Type
|
||||
|
||||
var err error
|
||||
opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
|
||||
opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
|
||||
if err != nil {
|
||||
return "", nil, false, err
|
||||
}
|
||||
|
@ -778,7 +778,7 @@ func (c *containerMounter) getMountNameAndOptions(conf *Config, m specs.Mount) (
|
|||
case bind:
|
||||
fd := c.fds.remove()
|
||||
fsName = gofervfs2.Name
|
||||
opts = p9MountOptions(fd, c.getMountAccessType(m), conf.VFS2)
|
||||
opts = p9MountData(fd, c.getMountAccessType(m), conf.VFS2)
|
||||
// If configured, add overlay to all writable mounts.
|
||||
useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
|
||||
|
||||
|
@ -931,7 +931,7 @@ func (c *containerMounter) createRestoreEnvironment(conf *Config) (*fs.RestoreEn
|
|||
|
||||
// Add root mount.
|
||||
fd := c.fds.remove()
|
||||
opts := p9MountOptions(fd, conf.FileAccess, false /* vfs2 */)
|
||||
opts := p9MountData(fd, conf.FileAccess, false /* vfs2 */)
|
||||
|
||||
mf := fs.MountSourceFlags{}
|
||||
if c.root.Readonly || conf.Overlay {
|
||||
|
@ -1019,7 +1019,7 @@ func (c *containerMounter) mountTmp(ctx context.Context, conf *Config, mns *fs.M
|
|||
Destination: "/tmp",
|
||||
// Sticky bit is added to prevent accidental deletion of files from
|
||||
// another user. This is normally done for /tmp.
|
||||
Options: []string{"mode=1777"},
|
||||
Options: []string{"mode=01777"},
|
||||
}
|
||||
return c.mountSubmount(ctx, conf, mns, root, tmpMount)
|
||||
|
||||
|
|
|
@ -136,7 +136,7 @@ func (c *containerMounter) setupVFS2(ctx context.Context, conf *Config, procArgs
|
|||
|
||||
func (c *containerMounter) createMountNamespaceVFS2(ctx context.Context, conf *Config, creds *auth.Credentials) (*vfs.MountNamespace, error) {
|
||||
fd := c.fds.remove()
|
||||
opts := strings.Join(p9MountOptions(fd, conf.FileAccess, true /* vfs2 */), ",")
|
||||
opts := strings.Join(p9MountData(fd, conf.FileAccess, true /* vfs2 */), ",")
|
||||
|
||||
log.Infof("Mounting root over 9P, ioFD: %d", fd)
|
||||
mns, err := c.k.VFS().NewMountNamespace(ctx, creds, "", gofer.Name, &vfs.GetFilesystemOptions{Data: opts})
|
||||
|
@ -160,8 +160,9 @@ func (c *containerMounter) mountSubmountsVFS2(ctx context.Context, conf *Config,
|
|||
}
|
||||
}
|
||||
|
||||
// TODO(gvisor.dev/issue/1487): implement mountTmp from fs.go.
|
||||
|
||||
if err := c.mountTmpVFS2(ctx, conf, creds, mns); err != nil {
|
||||
return fmt.Errorf(`mount submount "\tmp": %w`, err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -199,8 +200,6 @@ func (c *containerMounter) prepareMountsVFS2() ([]mountAndFD, error) {
|
|||
return mounts, nil
|
||||
}
|
||||
|
||||
// TODO(gvisor.dev/issue/1487): Implement submount options similar to the VFS1
|
||||
// version.
|
||||
func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config, mns *vfs.MountNamespace, creds *auth.Credentials, submount *mountAndFD) error {
|
||||
root := mns.Root()
|
||||
defer root.DecRef()
|
||||
|
@ -209,12 +208,11 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
|
|||
Start: root,
|
||||
Path: fspath.Parse(submount.Destination),
|
||||
}
|
||||
|
||||
fsName, options, useOverlay, err := c.getMountNameAndOptionsVFS2(conf, submount)
|
||||
fsName, opts, err := c.getMountNameAndOptionsVFS2(conf, submount)
|
||||
if err != nil {
|
||||
return fmt.Errorf("mountOptions failed: %w", err)
|
||||
}
|
||||
if fsName == "" {
|
||||
if len(fsName) == 0 {
|
||||
// Filesystem is not supported (e.g. cgroup), just skip it.
|
||||
return nil
|
||||
}
|
||||
|
@ -222,17 +220,6 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
|
|||
if err := c.makeSyntheticMount(ctx, submount.Destination, root, creds); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
opts := &vfs.MountOptions{
|
||||
GetFilesystemOptions: vfs.GetFilesystemOptions{
|
||||
Data: strings.Join(options, ","),
|
||||
},
|
||||
InternalMount: true,
|
||||
}
|
||||
|
||||
// All writes go to upper, be paranoid and make lower readonly.
|
||||
opts.ReadOnly = useOverlay
|
||||
|
||||
if err := c.k.VFS().MountAt(ctx, creds, "", target, fsName, opts); err != nil {
|
||||
return fmt.Errorf("failed to mount %q (type: %s): %w, opts: %v", submount.Destination, submount.Type, err, opts)
|
||||
}
|
||||
|
@ -242,13 +229,13 @@ func (c *containerMounter) mountSubmountVFS2(ctx context.Context, conf *Config,
|
|||
|
||||
// getMountNameAndOptionsVFS2 retrieves the fsName, opts, and useOverlay values
|
||||
// used for mounts.
|
||||
func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, []string, bool, error) {
|
||||
func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndFD) (string, *vfs.MountOptions, error) {
|
||||
var (
|
||||
fsName string
|
||||
opts []string
|
||||
useOverlay bool
|
||||
fsName string
|
||||
data []string
|
||||
)
|
||||
|
||||
// Find filesystem name and FS specific data field.
|
||||
switch m.Type {
|
||||
case devpts.Name, devtmpfs.Name, proc.Name, sys.Name:
|
||||
fsName = m.Type
|
||||
|
@ -258,21 +245,46 @@ func (c *containerMounter) getMountNameAndOptionsVFS2(conf *Config, m *mountAndF
|
|||
fsName = m.Type
|
||||
|
||||
var err error
|
||||
opts, err = parseAndFilterOptions(m.Options, tmpfsAllowedOptions...)
|
||||
data, err = parseAndFilterOptions(m.Options, tmpfsAllowedData...)
|
||||
if err != nil {
|
||||
return "", nil, false, err
|
||||
return "", nil, err
|
||||
}
|
||||
|
||||
case bind:
|
||||
fsName = gofer.Name
|
||||
opts = p9MountOptions(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
|
||||
// If configured, add overlay to all writable mounts.
|
||||
useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
|
||||
data = p9MountData(m.fd, c.getMountAccessType(m.Mount), true /* vfs2 */)
|
||||
|
||||
default:
|
||||
log.Warningf("ignoring unknown filesystem type %q", m.Type)
|
||||
}
|
||||
return fsName, opts, useOverlay, nil
|
||||
|
||||
opts := &vfs.MountOptions{
|
||||
GetFilesystemOptions: vfs.GetFilesystemOptions{
|
||||
Data: strings.Join(data, ","),
|
||||
},
|
||||
InternalMount: true,
|
||||
}
|
||||
|
||||
for _, o := range m.Options {
|
||||
switch o {
|
||||
case "rw":
|
||||
opts.ReadOnly = false
|
||||
case "ro":
|
||||
opts.ReadOnly = true
|
||||
case "noatime":
|
||||
// TODO(gvisor.dev/issue/1193): Implement MS_NOATIME.
|
||||
case "noexec":
|
||||
opts.Flags.NoExec = true
|
||||
default:
|
||||
log.Warningf("ignoring unknown mount option %q", o)
|
||||
}
|
||||
}
|
||||
|
||||
if conf.Overlay {
|
||||
// All writes go to upper, be paranoid and make lower readonly.
|
||||
opts.ReadOnly = true
|
||||
}
|
||||
return fsName, opts, nil
|
||||
}
|
||||
|
||||
func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath string, root vfs.VirtualDentry, creds *auth.Credentials) error {
|
||||
|
@ -301,3 +313,63 @@ func (c *containerMounter) makeSyntheticMount(ctx context.Context, currentPath s
|
|||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// mountTmpVFS2 mounts an internal tmpfs at '/tmp' if it's safe to do so.
|
||||
// Technically we don't have to mount tmpfs at /tmp, as we could just rely on
|
||||
// the host /tmp, but this is a nice optimization, and fixes some apps that call
|
||||
// mknod in /tmp. It's unsafe to mount tmpfs if:
|
||||
// 1. /tmp is mounted explicitly: we should not override user's wish
|
||||
// 2. /tmp is not empty: mounting tmpfs would hide existing files in /tmp
|
||||
//
|
||||
// Note that when there are submounts inside of '/tmp', directories for the
|
||||
// mount points must be present, making '/tmp' not empty anymore.
|
||||
func (c *containerMounter) mountTmpVFS2(ctx context.Context, conf *Config, creds *auth.Credentials, mns *vfs.MountNamespace) error {
|
||||
for _, m := range c.mounts {
|
||||
// m.Destination has been cleaned, so it's to use equality here.
|
||||
if m.Destination == "/tmp" {
|
||||
log.Debugf(`Explict "/tmp" mount found, skipping internal tmpfs, mount: %+v`, m)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
||||
root := mns.Root()
|
||||
defer root.DecRef()
|
||||
pop := vfs.PathOperation{
|
||||
Root: root,
|
||||
Start: root,
|
||||
Path: fspath.Parse("/tmp"),
|
||||
}
|
||||
// TODO(gvisor.dev/issue/2782): Use O_PATH when available.
|
||||
statx, err := c.k.VFS().StatAt(ctx, creds, &pop, &vfs.StatOptions{})
|
||||
switch err {
|
||||
case nil:
|
||||
// Found '/tmp' in filesystem, check if it's empty.
|
||||
if linux.FileMode(statx.Mode).FileType() != linux.ModeDirectory {
|
||||
// Not a dir?! Leave it be.
|
||||
return nil
|
||||
}
|
||||
if statx.Nlink > 2 {
|
||||
// If more than "." and ".." is found, skip internal tmpfs to prevent
|
||||
// hiding existing files.
|
||||
log.Infof(`Skipping internal tmpfs mount for "/tmp" because it's not empty`)
|
||||
return nil
|
||||
}
|
||||
log.Infof(`Mounting internal tmpfs on top of empty "/tmp"`)
|
||||
fallthrough
|
||||
|
||||
case syserror.ENOENT:
|
||||
// No '/tmp' found (or fallthrough from above). It's safe to mount internal
|
||||
// tmpfs.
|
||||
tmpMount := specs.Mount{
|
||||
Type: tmpfs.Name,
|
||||
Destination: "/tmp",
|
||||
// Sticky bit is added to prevent accidental deletion of files from
|
||||
// another user. This is normally done for /tmp.
|
||||
Options: []string{"mode=01777"},
|
||||
}
|
||||
return c.mountSubmountVFS2(ctx, conf, mns, creds, &mountAndFD{Mount: tmpMount})
|
||||
|
||||
default:
|
||||
return fmt.Errorf(`stating "/tmp" inside container: %w`, err)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -47,7 +47,7 @@ go_test(
|
|||
"//test/cmd/test_app",
|
||||
],
|
||||
library = ":container",
|
||||
shard_count = 5,
|
||||
shard_count = 10,
|
||||
tags = [
|
||||
"requires-kvm",
|
||||
],
|
||||
|
|
|
@ -119,7 +119,7 @@ func receiveConsolePTY(srv *unet.ServerSocket) (*os.File, error) {
|
|||
|
||||
// Test that an pty FD is sent over the console socket if one is provided.
|
||||
func TestConsoleSocket(t *testing.T) {
|
||||
for name, conf := range configs(t, all...) {
|
||||
for name, conf := range configsWithVFS2(t, all...) {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
spec := testutil.NewSpecWithArgs("true")
|
||||
_, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
|
||||
|
|
|
@ -256,8 +256,6 @@ var (
|
|||
func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
|
||||
// Always load the default config.
|
||||
cs := make(map[string]*boot.Config)
|
||||
cs["default"] = testutil.TestConfig(t)
|
||||
|
||||
for _, o := range opts {
|
||||
switch o {
|
||||
case overlay:
|
||||
|
@ -285,9 +283,16 @@ func configs(t *testing.T, opts ...configOption) map[string]*boot.Config {
|
|||
|
||||
func configsWithVFS2(t *testing.T, opts ...configOption) map[string]*boot.Config {
|
||||
vfs1 := configs(t, opts...)
|
||||
vfs2 := configs(t, opts...)
|
||||
|
||||
for key, value := range vfs2 {
|
||||
var optsVFS2 []configOption
|
||||
for _, opt := range opts {
|
||||
// TODO(gvisor.dev/issue/1487): Enable overlay tests.
|
||||
if opt != overlay {
|
||||
optsVFS2 = append(optsVFS2, opt)
|
||||
}
|
||||
}
|
||||
|
||||
for key, value := range configs(t, optsVFS2...) {
|
||||
value.VFS2 = true
|
||||
vfs1[key+"VFS2"] = value
|
||||
}
|
||||
|
@ -603,7 +608,7 @@ func doAppExitStatus(t *testing.T, vfs2 bool) {
|
|||
|
||||
// TestExec verifies that a container can exec a new program.
|
||||
func TestExec(t *testing.T) {
|
||||
for name, conf := range configs(t, overlay) {
|
||||
for name, conf := range configsWithVFS2(t, overlay) {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
const uid = 343
|
||||
spec := testutil.NewSpecWithArgs("sleep", "100")
|
||||
|
@ -695,7 +700,7 @@ func TestExec(t *testing.T) {
|
|||
|
||||
// TestKillPid verifies that we can signal individual exec'd processes.
|
||||
func TestKillPid(t *testing.T) {
|
||||
for name, conf := range configs(t, overlay) {
|
||||
for name, conf := range configsWithVFS2(t, overlay) {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
app, err := testutil.FindFile("test/cmd/test_app/test_app")
|
||||
if err != nil {
|
||||
|
@ -1211,7 +1216,7 @@ func TestCapabilities(t *testing.T) {
|
|||
uid := auth.KUID(os.Getuid() + 1)
|
||||
gid := auth.KGID(os.Getgid() + 1)
|
||||
|
||||
for name, conf := range configs(t, all...) {
|
||||
for name, conf := range configsWithVFS2(t, all...) {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
spec := testutil.NewSpecWithArgs("sleep", "100")
|
||||
rootDir, bundleDir, cleanup, err := testutil.SetupContainer(spec, conf)
|
||||
|
@ -1409,7 +1414,7 @@ func TestReadonlyRoot(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestUIDMap(t *testing.T) {
|
||||
for name, conf := range configs(t, noOverlay...) {
|
||||
for name, conf := range configsWithVFS2(t, noOverlay...) {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
testDir, err := ioutil.TempDir(testutil.TmpDir(), "test-mount")
|
||||
if err != nil {
|
||||
|
@ -1886,7 +1891,7 @@ func doDestroyStartingTest(t *testing.T, vfs2 bool) {
|
|||
}
|
||||
|
||||
func TestCreateWorkingDir(t *testing.T) {
|
||||
for name, conf := range configs(t, overlay) {
|
||||
for name, conf := range configsWithVFS2(t, overlay) {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
tmpDir, err := ioutil.TempDir(testutil.TmpDir(), "cwd-create")
|
||||
if err != nil {
|
||||
|
@ -2009,7 +2014,7 @@ func TestMountPropagation(t *testing.T) {
|
|||
}
|
||||
|
||||
func TestMountSymlink(t *testing.T) {
|
||||
for name, conf := range configs(t, overlay) {
|
||||
for name, conf := range configsWithVFS2(t, overlay) {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
dir, err := ioutil.TempDir(testutil.TmpDir(), "mount-symlink")
|
||||
if err != nil {
|
||||
|
|
|
@ -129,7 +129,7 @@ func createSharedMount(mount specs.Mount, name string, pod ...*specs.Spec) {
|
|||
// TestMultiContainerSanity checks that it is possible to run 2 dead-simple
|
||||
// containers in the same sandbox.
|
||||
func TestMultiContainerSanity(t *testing.T) {
|
||||
for name, conf := range configs(t, all...) {
|
||||
for name, conf := range configsWithVFS2(t, all...) {
|
||||
t.Run(name, func(t *testing.T) {
|
||||
rootDir, cleanup, err := testutil.SetupRootDir()
|
||||
if err != nil {
|
||||
|
|
|
@ -18,6 +18,7 @@ import (
|
|||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"os/signal"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"syscall"
|
||||
|
@ -261,7 +262,18 @@ func MaybeRunAsRoot() error {
|
|||
cmd.Stdin = os.Stdin
|
||||
cmd.Stdout = os.Stdout
|
||||
cmd.Stderr = os.Stderr
|
||||
if err := cmd.Run(); err != nil {
|
||||
if err := cmd.Start(); err != nil {
|
||||
return fmt.Errorf("re-executing self: %w", err)
|
||||
}
|
||||
ch := make(chan os.Signal, 1)
|
||||
signal.Notify(ch)
|
||||
go func() {
|
||||
for {
|
||||
// Forward all signals to child process.
|
||||
cmd.Process.Signal(<-ch)
|
||||
}
|
||||
}()
|
||||
if err := cmd.Wait(); err != nil {
|
||||
if exit, ok := err.(*exec.ExitError); ok {
|
||||
if ws, ok := exit.Sys().(syscall.WaitStatus); ok {
|
||||
os.Exit(ws.ExitStatus())
|
||||
|
@ -269,7 +281,7 @@ func MaybeRunAsRoot() error {
|
|||
log.Warningf("No wait status provided, exiting with -1: %v", err)
|
||||
os.Exit(-1)
|
||||
}
|
||||
return fmt.Errorf("re-executing self: %v", err)
|
||||
return err
|
||||
}
|
||||
// Child completed with success.
|
||||
os.Exit(0)
|
||||
|
|
Loading…
Reference in New Issue