742 lines
24 KiB
Go
742 lines
24 KiB
Go
// Copyright 2018 Google Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package boot
|
|
|
|
import (
|
|
"fmt"
|
|
"path"
|
|
"path/filepath"
|
|
"strconv"
|
|
"strings"
|
|
|
|
// Include filesystem types that OCI spec might mount.
|
|
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev"
|
|
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
|
|
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
|
|
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc"
|
|
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
|
|
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tmpfs"
|
|
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/tty"
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
|
|
|
|
specs "github.com/opencontainers/runtime-spec/specs-go"
|
|
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
|
|
"gvisor.googlesource.com/gvisor/pkg/log"
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
|
|
"gvisor.googlesource.com/gvisor/pkg/syserror"
|
|
"gvisor.googlesource.com/gvisor/runsc/specutils"
|
|
)
|
|
|
|
const (
|
|
// Filesystem name for 9p gofer mounts.
|
|
rootFsName = "9p"
|
|
|
|
// Device name for root mount.
|
|
rootDevice = "9pfs-/"
|
|
|
|
// ChildContainersDir is the directory where child container root
|
|
// filesystems are mounted.
|
|
ChildContainersDir = "/__runsc_containers__"
|
|
|
|
// Filesystems that runsc supports.
|
|
bind = "bind"
|
|
devpts = "devpts"
|
|
devtmpfs = "devtmpfs"
|
|
proc = "proc"
|
|
sysfs = "sysfs"
|
|
tmpfs = "tmpfs"
|
|
nonefs = "none"
|
|
)
|
|
|
|
type fdDispenser struct {
|
|
fds []int
|
|
}
|
|
|
|
func (f *fdDispenser) remove() int {
|
|
if f.empty() {
|
|
panic("fdDispenser out of fds")
|
|
}
|
|
rv := f.fds[0]
|
|
f.fds = f.fds[1:]
|
|
return rv
|
|
}
|
|
|
|
func (f *fdDispenser) empty() bool {
|
|
return len(f.fds) == 0
|
|
}
|
|
|
|
// createMountNamespace creates a mount namespace containing the root filesystem
|
|
// and all mounts. 'rootCtx' is used to walk directories to find mount points.
|
|
func createMountNamespace(userCtx context.Context, rootCtx context.Context, spec *specs.Spec, conf *Config, goferFDs []int) (*fs.MountNamespace, error) {
|
|
mounts := compileMounts(spec)
|
|
|
|
// Create a tmpfs mount where we create and mount a root filesystem for
|
|
// each child container.
|
|
mounts = append(mounts, specs.Mount{
|
|
Type: tmpfs,
|
|
Destination: ChildContainersDir,
|
|
})
|
|
|
|
fds := &fdDispenser{fds: goferFDs}
|
|
rootInode, err := createRootMount(rootCtx, spec, conf, fds, mounts)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create root mount: %v", err)
|
|
}
|
|
mns, err := fs.NewMountNamespace(userCtx, rootInode)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create root mount namespace: %v", err)
|
|
}
|
|
|
|
if err := setMounts(rootCtx, conf, mns, fds, mounts); err != nil {
|
|
return nil, fmt.Errorf("failed to configure mounts: %v", err)
|
|
}
|
|
if !fds.empty() {
|
|
return nil, fmt.Errorf("not all mount points were consumed, remaining: %v", fds)
|
|
}
|
|
return mns, nil
|
|
}
|
|
|
|
// compileMounts returns the supported mounts from the mount spec, adding any
|
|
// mandatory mounts that are required by the OCI specification.
|
|
func compileMounts(spec *specs.Spec) []specs.Mount {
|
|
// Keep track of whether proc, sys, and tmp were mounted.
|
|
var procMounted, sysMounted, tmpMounted bool
|
|
var mounts []specs.Mount
|
|
|
|
// Always mount /dev.
|
|
mounts = append(mounts, specs.Mount{
|
|
Type: devtmpfs,
|
|
Destination: "/dev",
|
|
})
|
|
|
|
mounts = append(mounts, specs.Mount{
|
|
Type: devpts,
|
|
Destination: "/dev/pts",
|
|
})
|
|
|
|
// Mount all submounts from the spec.
|
|
for _, m := range spec.Mounts {
|
|
if !specutils.IsSupportedDevMount(m) {
|
|
log.Warningf("ignoring dev mount at %q", m.Destination)
|
|
continue
|
|
}
|
|
mounts = append(mounts, m)
|
|
switch filepath.Clean(m.Destination) {
|
|
case "/proc":
|
|
procMounted = true
|
|
case "/sys":
|
|
sysMounted = true
|
|
case "/tmp":
|
|
tmpMounted = true
|
|
}
|
|
}
|
|
|
|
// Mount proc and sys even if the user did not ask for it, as the spec
|
|
// says we SHOULD.
|
|
var mandatoryMounts []specs.Mount
|
|
if !procMounted {
|
|
mandatoryMounts = append(mandatoryMounts, specs.Mount{
|
|
Type: proc,
|
|
Destination: "/proc",
|
|
})
|
|
}
|
|
if !sysMounted {
|
|
mandatoryMounts = append(mandatoryMounts, specs.Mount{
|
|
Type: sysfs,
|
|
Destination: "/sys",
|
|
})
|
|
}
|
|
|
|
// Technically we don't have to mount tmpfs at /tmp, as we could just
|
|
// rely on the host /tmp, but this is a nice optimization, and fixes
|
|
// some apps that call mknod in /tmp.
|
|
if !tmpMounted {
|
|
// TODO: If the host /tmp (or a mount at /tmp) has
|
|
// files in it, we should overlay our tmpfs implementation over
|
|
// that. Until then, the /tmp mount will always appear empty at
|
|
// container creation.
|
|
mandatoryMounts = append(mandatoryMounts, specs.Mount{
|
|
Type: tmpfs,
|
|
Destination: "/tmp",
|
|
})
|
|
}
|
|
|
|
// The mandatory mounts should be ordered right after the root, in case
|
|
// there are submounts of these mandatory mounts already in the spec.
|
|
mounts = append(mounts[:0], append(mandatoryMounts, mounts[0:]...)...)
|
|
|
|
return mounts
|
|
}
|
|
|
|
// setMounts iterates over mounts and mounts them in the specified
|
|
// mount namespace.
|
|
func setMounts(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, mounts []specs.Mount) error {
|
|
for _, m := range mounts {
|
|
if err := mountSubmount(ctx, conf, mns, fds, m, mounts, m.Destination); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// createRootMount creates the root filesystem.
|
|
func createRootMount(ctx context.Context, spec *specs.Spec, conf *Config, fds *fdDispenser, mounts []specs.Mount) (*fs.Inode, error) {
|
|
// First construct the filesystem from the spec.Root.
|
|
mf := fs.MountSourceFlags{ReadOnly: spec.Root.Readonly}
|
|
|
|
var (
|
|
rootInode *fs.Inode
|
|
err error
|
|
)
|
|
|
|
fd := fds.remove()
|
|
log.Infof("Mounting root over 9P, ioFD: %d", fd)
|
|
hostFS := mustFindFilesystem("9p")
|
|
opts := p9MountOptions(fd, conf.FileAccess)
|
|
rootInode, err = hostFS.Mount(ctx, rootDevice, mf, strings.Join(opts, ","))
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to generate root mount point: %v", err)
|
|
}
|
|
|
|
// We need to overlay the root on top of a ramfs with stub directories
|
|
// for submount paths. "/dev" "/sys" "/proc" and "/tmp" are always
|
|
// mounted even if they are not in the spec.
|
|
submounts := append(subtargets("/", mounts), "/dev", "/sys", "/proc", "/tmp")
|
|
rootInode, err = addSubmountOverlay(ctx, rootInode, submounts)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error adding submount overlay: %v", err)
|
|
}
|
|
|
|
if conf.Overlay && !spec.Root.Readonly {
|
|
log.Debugf("Adding overlay on top of root mount")
|
|
// Overlay a tmpfs filesystem on top of the root.
|
|
rootInode, err = addOverlay(ctx, conf, rootInode, "root-overlay-upper", mf)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
log.Infof("Mounted %q to \"/\" type root", spec.Root.Path)
|
|
return rootInode, nil
|
|
}
|
|
|
|
func addOverlay(ctx context.Context, conf *Config, lower *fs.Inode, name string, lowerFlags fs.MountSourceFlags) (*fs.Inode, error) {
|
|
// Upper layer uses the same flags as lower, but it must be read-write.
|
|
lowerFlags.ReadOnly = false
|
|
|
|
tmpFS := mustFindFilesystem("tmpfs")
|
|
if !fs.IsDir(lower.StableAttr) {
|
|
// Create overlay on top of mount file, e.g. /etc/hostname.
|
|
msrc := fs.NewCachingMountSource(tmpFS, lowerFlags)
|
|
return fs.NewOverlayRootFile(ctx, msrc, lower, lowerFlags)
|
|
}
|
|
|
|
// Create overlay on top of mount dir.
|
|
upper, err := tmpFS.Mount(ctx, name+"-upper", lowerFlags, "")
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to create tmpfs overlay: %v", err)
|
|
}
|
|
return fs.NewOverlayRoot(ctx, upper, lower, lowerFlags)
|
|
}
|
|
|
|
// getMountNameAndOptions retrieves the fsName, opts, and useOverlay values
|
|
// used for mounts.
|
|
func getMountNameAndOptions(conf *Config, m specs.Mount, fds *fdDispenser) (string, []string, bool, error) {
|
|
var (
|
|
fsName string
|
|
opts []string
|
|
useOverlay bool
|
|
err error
|
|
)
|
|
|
|
switch m.Type {
|
|
case devpts, devtmpfs, proc, sysfs:
|
|
fsName = m.Type
|
|
case nonefs:
|
|
fsName = sysfs
|
|
case tmpfs:
|
|
fsName = m.Type
|
|
|
|
// tmpfs has some extra supported options that we must pass through.
|
|
opts, err = parseAndFilterOptions(m.Options, "mode", "uid", "gid")
|
|
|
|
case bind:
|
|
fd := fds.remove()
|
|
fsName = "9p"
|
|
// Non-root bind mounts are always shared.
|
|
opts = p9MountOptions(fd, FileAccessShared)
|
|
// If configured, add overlay to all writable mounts.
|
|
useOverlay = conf.Overlay && !mountFlags(m.Options).ReadOnly
|
|
|
|
default:
|
|
// TODO: Support all the mount types and make this a
|
|
// fatal error. Most applications will "just work" without
|
|
// them, so this is a warning for now.
|
|
// we do not support.
|
|
log.Warningf("ignoring unknown filesystem type %q", m.Type)
|
|
}
|
|
return fsName, opts, useOverlay, err
|
|
}
|
|
|
|
func mountSubmount(ctx context.Context, conf *Config, mns *fs.MountNamespace, fds *fdDispenser, m specs.Mount, mounts []specs.Mount, dest string) error {
|
|
// Map mount type to filesystem name, and parse out the options that we are
|
|
// capable of dealing with.
|
|
fsName, opts, useOverlay, err := getMountNameAndOptions(conf, m, fds)
|
|
|
|
// Return the error or nil that corresponds to the default case in getMountNameAndOptions.
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if fsName == "" {
|
|
return nil
|
|
}
|
|
|
|
// All filesystem names should have been mapped to something we know.
|
|
filesystem := mustFindFilesystem(fsName)
|
|
|
|
mf := mountFlags(m.Options)
|
|
if useOverlay {
|
|
// All writes go to upper, be paranoid and make lower readonly.
|
|
mf.ReadOnly = true
|
|
}
|
|
|
|
inode, err := filesystem.Mount(ctx, mountDevice(m), mf, strings.Join(opts, ","))
|
|
if err != nil {
|
|
return fmt.Errorf("failed to create mount with source %q: %v", m.Source, err)
|
|
}
|
|
|
|
// If there are submounts, we need to overlay the mount on top of a
|
|
// ramfs with stub directories for submount paths.
|
|
submounts := subtargets(m.Destination, mounts)
|
|
if len(submounts) > 0 {
|
|
log.Infof("Adding submount overlay over %q", m.Destination)
|
|
inode, err = addSubmountOverlay(ctx, inode, submounts)
|
|
if err != nil {
|
|
return fmt.Errorf("error adding submount overlay: %v", err)
|
|
}
|
|
}
|
|
|
|
if useOverlay {
|
|
log.Debugf("Adding overlay on top of mount %q", m.Destination)
|
|
inode, err = addOverlay(ctx, conf, inode, m.Type, mf)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Create destination in case it doesn't exist. This is required, in addition
|
|
// to 'addSubmountOverlay', in case there are symlinks to create directories
|
|
// in the right location, e.g.
|
|
// mount: /var/run/secrets, may be created in '/run/secrets' if
|
|
// '/var/run' => '/var'.
|
|
if err := mkdirAll(ctx, mns, dest); err != nil {
|
|
return err
|
|
}
|
|
|
|
root := mns.Root()
|
|
defer root.DecRef()
|
|
dirent, err := mns.FindInode(ctx, root, nil, dest, linux.MaxSymlinkTraversals)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to find mount destination %q: %v", dest, err)
|
|
}
|
|
defer dirent.DecRef()
|
|
if err := mns.Mount(ctx, dirent, inode); err != nil {
|
|
return fmt.Errorf("failed to mount at destination %q: %v", dest, err)
|
|
}
|
|
|
|
log.Infof("Mounted %q to %q type %s", m.Source, dest, m.Type)
|
|
return nil
|
|
}
|
|
|
|
func mkdirAll(ctx context.Context, mns *fs.MountNamespace, path string) error {
|
|
log.Infof("mkdirAll called with path %s", path)
|
|
root := mns.Root()
|
|
defer root.DecRef()
|
|
|
|
// Starting at the root, walk the path.
|
|
parent := root
|
|
ps := strings.Split(filepath.Clean(path), string(filepath.Separator))
|
|
for _, pathElem := range ps {
|
|
if pathElem == "" {
|
|
// This will be case for the first and last element, if the path
|
|
// begins or ends with '/'. Note that we always treat the path as
|
|
// absolute, regardless of what the first character contains.
|
|
continue
|
|
}
|
|
d, err := mns.FindInode(ctx, root, parent, pathElem, fs.DefaultTraversalLimit)
|
|
if err == syserror.ENOENT {
|
|
// If we encounter a path that does not exist, then
|
|
// create it.
|
|
if err := parent.CreateDirectory(ctx, root, pathElem, fs.FilePermsFromMode(0755)); err != nil {
|
|
return fmt.Errorf("failed to create directory %q: %v", pathElem, err)
|
|
}
|
|
if d, err = parent.Walk(ctx, root, pathElem); err != nil {
|
|
return fmt.Errorf("walk to %q failed: %v", pathElem, err)
|
|
}
|
|
} else if err != nil {
|
|
return fmt.Errorf("failed to find inode %q: %v", pathElem, err)
|
|
}
|
|
parent = d
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// p9MountOptions creates a slice of options for a p9 mount.
|
|
func p9MountOptions(fd int, fa FileAccessType) []string {
|
|
opts := []string{
|
|
"trans=fd",
|
|
"rfdno=" + strconv.Itoa(fd),
|
|
"wfdno=" + strconv.Itoa(fd),
|
|
"privateunixsocket=true",
|
|
}
|
|
if fa == FileAccessShared {
|
|
opts = append(opts, "cache=remote_revalidating")
|
|
}
|
|
return opts
|
|
}
|
|
|
|
// parseAndFilterOptions parses a MountOptions slice and filters by the allowed
|
|
// keys.
|
|
func parseAndFilterOptions(opts []string, allowedKeys ...string) ([]string, error) {
|
|
var out []string
|
|
for _, o := range opts {
|
|
kv := strings.Split(o, "=")
|
|
switch len(kv) {
|
|
case 1:
|
|
if specutils.ContainsStr(allowedKeys, o) {
|
|
out = append(out, o)
|
|
continue
|
|
}
|
|
log.Warningf("ignoring unsupported key %q", kv)
|
|
case 2:
|
|
if specutils.ContainsStr(allowedKeys, kv[0]) {
|
|
out = append(out, o)
|
|
continue
|
|
}
|
|
log.Warningf("ignoring unsupported key %q", kv[0])
|
|
default:
|
|
return nil, fmt.Errorf("invalid option %q", o)
|
|
}
|
|
}
|
|
return out, nil
|
|
}
|
|
|
|
func destinations(mounts []specs.Mount, extra ...string) []string {
|
|
var ds []string
|
|
for _, m := range mounts {
|
|
ds = append(ds, m.Destination)
|
|
}
|
|
return append(ds, extra...)
|
|
}
|
|
|
|
// mountDevice returns a device string based on the fs type and target
|
|
// of the mount.
|
|
func mountDevice(m specs.Mount) string {
|
|
if m.Type == bind {
|
|
// Make a device string that includes the target, which is consistent across
|
|
// S/R and uniquely identifies the connection.
|
|
return "9pfs-" + m.Destination
|
|
}
|
|
// All other fs types use device "none".
|
|
return "none"
|
|
}
|
|
|
|
// addRestoreMount adds a mount to the MountSources map used for restoring a
|
|
// checkpointed container.
|
|
func addRestoreMount(conf *Config, renv *fs.RestoreEnvironment, m specs.Mount, fds *fdDispenser) error {
|
|
fsName, opts, _, err := getMountNameAndOptions(conf, m, fds)
|
|
|
|
// Return the error or nil that corresponds to the default case in getMountNameAndOptions.
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// TODO: Fix this when we support all the mount types and
|
|
// make this a fatal error.
|
|
if fsName == "" {
|
|
return nil
|
|
}
|
|
|
|
newMount := fs.MountArgs{
|
|
Dev: mountDevice(m),
|
|
Flags: mountFlags(m.Options),
|
|
Data: strings.Join(opts, ","),
|
|
}
|
|
renv.MountSources[fsName] = append(renv.MountSources[fsName], newMount)
|
|
log.Infof("Added mount at %q: %+v", fsName, newMount)
|
|
return nil
|
|
}
|
|
|
|
// createRestoreEnvironment builds a fs.RestoreEnvironment called renv by adding the mounts
|
|
// to the environment.
|
|
func createRestoreEnvironment(spec *specs.Spec, conf *Config, fds *fdDispenser) (*fs.RestoreEnvironment, error) {
|
|
renv := &fs.RestoreEnvironment{
|
|
MountSources: make(map[string][]fs.MountArgs),
|
|
}
|
|
|
|
// Add root mount.
|
|
fd := fds.remove()
|
|
opts := p9MountOptions(fd, conf.FileAccess)
|
|
|
|
mf := fs.MountSourceFlags{}
|
|
if spec.Root.Readonly {
|
|
mf.ReadOnly = true
|
|
}
|
|
|
|
rootMount := fs.MountArgs{
|
|
Dev: rootDevice,
|
|
Flags: mf,
|
|
Data: strings.Join(opts, ","),
|
|
}
|
|
renv.MountSources[rootFsName] = append(renv.MountSources[rootFsName], rootMount)
|
|
|
|
// Add submounts.
|
|
for _, m := range compileMounts(spec) {
|
|
if err := addRestoreMount(conf, renv, m, fds); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
return renv, nil
|
|
}
|
|
|
|
func mountFlags(opts []string) fs.MountSourceFlags {
|
|
mf := fs.MountSourceFlags{}
|
|
for _, o := range opts {
|
|
switch o {
|
|
case "rw":
|
|
mf.ReadOnly = false
|
|
case "ro":
|
|
mf.ReadOnly = true
|
|
case "noatime":
|
|
mf.NoAtime = true
|
|
default:
|
|
log.Warningf("ignoring unknown mount option %q", o)
|
|
}
|
|
}
|
|
return mf
|
|
}
|
|
|
|
func mustFindFilesystem(name string) fs.Filesystem {
|
|
fs, ok := fs.FindFilesystem(name)
|
|
if !ok {
|
|
panic(fmt.Sprintf("could not find filesystem %q", name))
|
|
}
|
|
return fs
|
|
}
|
|
|
|
// addSubmountOverlay overlays the inode over a ramfs tree containing the given
|
|
// paths.
|
|
func addSubmountOverlay(ctx context.Context, inode *fs.Inode, submounts []string) (*fs.Inode, error) {
|
|
// There is no real filesystem backing this ramfs tree, so we pass in
|
|
// "nil" here.
|
|
mountTree, err := ramfs.MakeDirectoryTree(ctx, fs.NewNonCachingMountSource(nil, fs.MountSourceFlags{}), submounts)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("error creating mount tree: %v", err)
|
|
}
|
|
overlayInode, err := fs.NewOverlayRoot(ctx, inode, mountTree, fs.MountSourceFlags{})
|
|
if err != nil {
|
|
return nil, fmt.Errorf("failed to make mount overlay: %v", err)
|
|
}
|
|
return overlayInode, err
|
|
}
|
|
|
|
// subtargets takes a set of Mounts and returns only the targets that are
|
|
// children of the given root. The returned paths are relative to the root.
|
|
func subtargets(root string, mnts []specs.Mount) []string {
|
|
r := filepath.Clean(root)
|
|
if len(r) > 0 && r[len(r)-1] != '/' {
|
|
r += "/"
|
|
}
|
|
var targets []string
|
|
for _, mnt := range mnts {
|
|
t := filepath.Clean(mnt.Destination)
|
|
if strings.HasPrefix(t, r) {
|
|
// Make the mnt path relative to the root path. If the
|
|
// result is empty, then mnt IS the root mount, not a
|
|
// submount. We don't want to include those.
|
|
if t := strings.TrimPrefix(t, r); t != "" {
|
|
targets = append(targets, t)
|
|
}
|
|
}
|
|
}
|
|
return targets
|
|
}
|
|
|
|
// setupContainerFS is used to set up the file system and amend the procArgs accordingly.
|
|
// procArgs are passed by reference and the FDMap field is modified. It dups stdioFDs.
|
|
func setupContainerFS(procArgs *kernel.CreateProcessArgs, spec *specs.Spec, conf *Config, stdioFDs, goferFDs []int, console bool, creds *auth.Credentials, ls *limits.LimitSet, k *kernel.Kernel, cid string) error {
|
|
ctx := procArgs.NewContext(k)
|
|
|
|
// Create the FD map, which will set stdin, stdout, and stderr. If
|
|
// console is true, then ioctl calls will be passed through to the host
|
|
// fd.
|
|
fdm, err := createFDMap(ctx, k, ls, console, stdioFDs)
|
|
if err != nil {
|
|
return fmt.Errorf("error importing fds: %v", err)
|
|
}
|
|
|
|
// CreateProcess takes a reference on FDMap if successful. We
|
|
// won't need ours either way.
|
|
procArgs.FDMap = fdm
|
|
|
|
// Use root user to configure mounts. The current user might not have
|
|
// permission to do so.
|
|
rootProcArgs := kernel.CreateProcessArgs{
|
|
WorkingDirectory: "/",
|
|
Credentials: auth.NewRootCredentials(creds.UserNamespace),
|
|
Umask: 0022,
|
|
MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
|
|
}
|
|
rootCtx := rootProcArgs.NewContext(k)
|
|
|
|
// If this is the root container, we also need to setup the root mount
|
|
// namespace.
|
|
mns := k.RootMountNamespace()
|
|
if mns == nil {
|
|
// Create the virtual filesystem.
|
|
mns, err := createMountNamespace(ctx, rootCtx, spec, conf, goferFDs)
|
|
if err != nil {
|
|
return fmt.Errorf("error creating mounts: %v", err)
|
|
}
|
|
k.SetRootMountNamespace(mns)
|
|
return nil
|
|
}
|
|
|
|
// Setup a child container.
|
|
|
|
// Create the container's root filesystem mount.
|
|
log.Infof("Creating new process in child container.")
|
|
fds := &fdDispenser{fds: append([]int{}, goferFDs...)}
|
|
rootInode, err := createRootMount(rootCtx, spec, conf, fds, nil)
|
|
if err != nil {
|
|
return fmt.Errorf("error creating filesystem for container: %v", err)
|
|
}
|
|
|
|
// Make directories for submounts within the container.
|
|
rootDir := mns.Root()
|
|
defer rootDir.DecRef()
|
|
containerRoot := filepath.Join(ChildContainersDir, cid)
|
|
mkdirAll(ctx, mns, containerRoot)
|
|
|
|
// Mount the container's root filesystem to the newly created
|
|
// mount point.
|
|
containerRootDirent, err := mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to find mount destination: %q: %v", containerRoot, err)
|
|
}
|
|
if err := mns.Mount(ctx, containerRootDirent, rootInode); err != nil {
|
|
return fmt.Errorf("failed to mount at destination %q: %v", containerRoot, err)
|
|
}
|
|
containerRootDirent.DecRef()
|
|
|
|
// We have to re-walk to the dirent to find the mounted
|
|
// directory. The old dirent is invalid at this point.
|
|
containerRootDirent, err = mns.FindInode(ctx, rootDir, nil, containerRoot, linux.MaxSymlinkTraversals)
|
|
if err != nil {
|
|
return fmt.Errorf("failed to find mount destination2: %q: %v", containerRoot, err)
|
|
}
|
|
log.Infof("Mounted child's root fs to %q", containerRoot)
|
|
|
|
// Mount all submounts.
|
|
mounts := compileMounts(spec)
|
|
for _, m := range mounts {
|
|
dest := filepath.Join(containerRoot, m.Destination)
|
|
if err := mountSubmount(rootCtx, conf, k.RootMountNamespace(), fds, m, mounts, dest); err != nil {
|
|
return fmt.Errorf("error mounting filesystem for container: %v", err)
|
|
}
|
|
}
|
|
|
|
// Set the procArgs root directory.
|
|
procArgs.Root = containerRootDirent
|
|
return nil
|
|
}
|
|
|
|
// setExecutablePath sets the procArgs.Filename by searching the PATH for an
|
|
// executable matching the procArgs.Argv[0].
|
|
func setExecutablePath(ctx context.Context, mns *fs.MountNamespace, procArgs *kernel.CreateProcessArgs) error {
|
|
paths := fs.GetPath(procArgs.Envv)
|
|
f, err := mns.ResolveExecutablePath(ctx, procArgs.WorkingDirectory, procArgs.Argv[0], paths)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
procArgs.Filename = f
|
|
return nil
|
|
}
|
|
|
|
// destroyContainerFS cleans up the filesystem by unmounting all mounts for the
|
|
// given container and deleting the container root directory.
|
|
func destroyContainerFS(ctx context.Context, cid string, k *kernel.Kernel) error {
|
|
// First get a reference to the container root directory.
|
|
mns := k.RootMountNamespace()
|
|
mnsRoot := mns.Root()
|
|
defer mnsRoot.DecRef()
|
|
containerRoot := path.Join(ChildContainersDir, cid)
|
|
containerRootDirent, err := mns.FindInode(ctx, mnsRoot, nil, containerRoot, linux.MaxSymlinkTraversals)
|
|
if err == syserror.ENOENT {
|
|
// Container must have been destroyed already. That's fine.
|
|
return nil
|
|
}
|
|
if err != nil {
|
|
return fmt.Errorf("error finding container root directory %q: %v", containerRoot, err)
|
|
}
|
|
defer containerRootDirent.DecRef()
|
|
|
|
// Iterate through all submounts and unmount them. We unmount lazily by
|
|
// setting detach=true, so we can unmount in any order.
|
|
for _, m := range containerRootDirent.Inode.MountSource.Submounts() {
|
|
root := m.Root()
|
|
defer root.DecRef()
|
|
|
|
// Do a best-effort unmount by flushing the refs and unmount
|
|
// with "detach only = true".
|
|
log.Debugf("Unmounting container submount %q", root.BaseName())
|
|
m.FlushDirentRefs()
|
|
if err := mns.Unmount(ctx, root, true /* detach only */); err != nil {
|
|
return fmt.Errorf("error unmounting container submount %q: %v", root.BaseName(), err)
|
|
}
|
|
}
|
|
|
|
// Unmount the container root itself.
|
|
log.Debugf("Unmounting container root %q", containerRoot)
|
|
containerRootDirent.Inode.MountSource.FlushDirentRefs()
|
|
if err := mns.Unmount(ctx, containerRootDirent, true /* detach only */); err != nil {
|
|
return fmt.Errorf("error unmounting container root mount %q: %v", containerRootDirent.BaseName(), err)
|
|
}
|
|
|
|
// Get a reference to the parent directory and remove the root
|
|
// container directory.
|
|
containersDirDirent, err := mns.FindInode(ctx, mnsRoot, nil, ChildContainersDir, linux.MaxSymlinkTraversals)
|
|
if err != nil {
|
|
return fmt.Errorf("error finding containers directory %q: %v", ChildContainersDir, err)
|
|
}
|
|
defer containersDirDirent.DecRef()
|
|
log.Debugf("Deleting container root %q", containerRoot)
|
|
if err := containersDirDirent.RemoveDirectory(ctx, mnsRoot, cid); err != nil {
|
|
return fmt.Errorf("error removing directory %q: %v", containerRoot, err)
|
|
}
|
|
|
|
// Flushing dirent references triggers many async close operations. We
|
|
// must wait for those to complete before returning, otherwise the
|
|
// caller may kill the gofer before they complete, causing a cascade of
|
|
// failing RPCs.
|
|
log.Infof("Waiting for async filesystem operations to complete")
|
|
fs.AsyncBarrier()
|
|
|
|
return nil
|
|
}
|