gvisor/runsc/container/fs.go

278 lines
9.1 KiB
Go

// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package container
import (
"bufio"
"fmt"
"os"
"path/filepath"
"strings"
"syscall"
specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/runsc/boot"
"gvisor.googlesource.com/gvisor/runsc/specutils"
)
// mapping describes how a single textual mount option relates to the mount
// flag bits.
type mapping struct {
	// set is true when the option turns the bits in val on, and false when
	// the option is the negated form of a flag.
	set bool
	// val holds the syscall.MS_* bits the option refers to.
	val uint32
}

// optionsMap translates mount option names into the MS_* flag bits they refer
// to. Entries are grouped by flag so that every option sits next to its
// negated counterpart.
var optionsMap = map[string]mapping{
	// Refers to no flag bits at all.
	"defaults": {set: true, val: 0},

	// Options that have no negated counterpart in this table.
	"bind":     {set: true, val: syscall.MS_BIND},
	"rbind":    {set: true, val: syscall.MS_BIND | syscall.MS_REC},
	"private":  {set: true, val: syscall.MS_PRIVATE},
	"rprivate": {set: true, val: syscall.MS_PRIVATE | syscall.MS_REC},
	"remount":  {set: true, val: syscall.MS_REMOUNT},
	"dirsync":  {set: true, val: syscall.MS_DIRSYNC},

	// MS_RDONLY
	"ro": {set: true, val: syscall.MS_RDONLY},
	"rw": {set: false, val: syscall.MS_RDONLY},

	// MS_NOSUID
	"nosuid": {set: true, val: syscall.MS_NOSUID},
	"suid":   {set: false, val: syscall.MS_NOSUID},

	// MS_NODEV
	"nodev": {set: true, val: syscall.MS_NODEV},
	"dev":   {set: false, val: syscall.MS_NODEV},

	// MS_NOEXEC
	"noexec": {set: true, val: syscall.MS_NOEXEC},
	"exec":   {set: false, val: syscall.MS_NOEXEC},

	// MS_SYNCHRONOUS
	"sync":  {set: true, val: syscall.MS_SYNCHRONOUS},
	"async": {set: false, val: syscall.MS_SYNCHRONOUS},

	// MS_MANDLOCK
	"mand":   {set: true, val: syscall.MS_MANDLOCK},
	"nomand": {set: false, val: syscall.MS_MANDLOCK},

	// MS_NOATIME
	"noatime": {set: true, val: syscall.MS_NOATIME},
	"atime":   {set: false, val: syscall.MS_NOATIME},

	// MS_NODIRATIME
	"nodiratime": {set: true, val: syscall.MS_NODIRATIME},
	"diratime":   {set: false, val: syscall.MS_NODIRATIME},

	// MS_RELATIME
	"relatime":   {set: true, val: syscall.MS_RELATIME},
	"norelatime": {set: false, val: syscall.MS_RELATIME},

	// MS_STRICTATIME
	"strictatime":   {set: true, val: syscall.MS_STRICTATIME},
	"nostrictatime": {set: false, val: syscall.MS_STRICTATIME},

	// MS_POSIXACL
	"acl":   {set: true, val: syscall.MS_POSIXACL},
	"noacl": {set: false, val: syscall.MS_POSIXACL},

	// MS_I_VERSION
	"iversion":   {set: true, val: syscall.MS_I_VERSION},
	"noiversion": {set: false, val: syscall.MS_I_VERSION},

	// MS_SILENT
	"silent": {set: true, val: syscall.MS_SILENT},
	"loud":   {set: false, val: syscall.MS_SILENT},
}
// setupFS prepares the container directory structure under 'spec.Root.Path'.
// This allows the gofer serving the container to be chrooted under this
// directory, adding an extra layer of security in case the gofer gets
// compromised.
//
// Supported bind mounts from 'spec.Mounts' are bind mounted onto their
// symlink-resolved destination under the root and then turned into recursive
// slave mounts; every other mount is passed through untouched. If the spec
// asks for a read-only root and the root is a mount point that is still
// writable, the root is remounted read-only.
//
// Returns a list of mounts equivalent to 'spec.Mounts' in which the
// destination of every mount performed here is cleaned, has symlinks
// resolved, and is expressed relative to the container root.
func setupFS(spec *specs.Spec, conf *boot.Config, bundleDir string) ([]specs.Mount, error) {
	root := spec.Root.Path
	mounts := make([]specs.Mount, 0, len(spec.Mounts))
	for _, m := range spec.Mounts {
		if !(m.Type == "bind" && specutils.IsSupportedDevMount(m)) {
			// Not mounted here; hand it back unchanged.
			mounts = append(mounts, m)
			continue
		}

		// 'm.Destination' may traverse symlinks that live inside the
		// container.
		dst, err := resolveSymlinks(root, m.Destination)
		if err != nil {
			return nil, fmt.Errorf("failed to resolve symlinks: %v", err)
		}

		bindFlags := optionsToFlags(m.Options) | syscall.MS_BIND
		log.Infof("Mounting src: %q, dst: %q, flags: %#x", m.Source, dst, bindFlags)
		if err := specutils.Mount(m.Source, dst, m.Type, bindFlags); err != nil {
			return nil, fmt.Errorf("failed to mount %v: %v", m, err)
		}

		// Turn the new mount into a recursive slave so that, for recursive
		// bind mounts, umount does not propagate to the source.
		slaveFlags := uint32(syscall.MS_SLAVE | syscall.MS_REC)
		if err := syscall.Mount("", dst, "", uintptr(slaveFlags), ""); err != nil {
			return nil, fmt.Errorf("failed to rslave mount dst: %q, flags: %#x, err: %v", dst, slaveFlags, err)
		}

		relDst, err := filepath.Rel(root, dst)
		if err != nil {
			panic(fmt.Sprintf("%q could not be made relative to %q: %v", dst, root, err))
		}
		resolved := m
		resolved.Destination = filepath.Join("/", relDst)
		mounts = append(mounts, resolved)
	}

	if !spec.Root.Readonly {
		return mounts, nil
	}

	// A read-only root was requested; find out whether a remount is needed
	// and possible.
	isMountPoint, readonly, err := mountInfo(root)
	if err != nil {
		return nil, err
	}
	switch {
	case readonly:
		// The mount holding the root is already read-only.
	case !isMountPoint:
		// The root is neither a mount point nor read-only. Nothing can be
		// done other than logging a warning. The gofer will prevent files
		// from being opened in write mode.
		log.Warningf("Mount where root is located is not read-only and cannot be changed: %q", root)
	default:
		// The root is a writable mount point: change its mount options to
		// make it read-only for extra safety.
		log.Infof("Remounting root as readonly: %q", root)
		flags := uintptr(syscall.MS_BIND | syscall.MS_REMOUNT | syscall.MS_RDONLY | syscall.MS_REC)
		if err := syscall.Mount(root, root, "bind", flags, ""); err != nil {
			return nil, fmt.Errorf("failed to remount root as read-only with source: %q, target: %q, flags: %#x, err: %v", root, root, flags, err)
		}
	}
	return mounts, nil
}
// mountInfo returns whether the path is a mount point and whether the mount
// that path belongs to is read-only.
//
// The mount that 'path' belongs to is the longest mount point listed in
// /proc/mounts that is either equal to the symlink-resolved path or one of
// its parent directories.
//
// NOTE(review): /proc/mounts lists absolute mount points, so a relative
// 'path' matches none of them and is reported as (false, false, nil) —
// callers are presumably expected to pass an absolute path; confirm.
func mountInfo(path string) (bool, bool, error) {
	// Mounts are listed by their real paths.
	realPath, err := filepath.EvalSymlinks(path)
	if err != nil {
		return false, false, err
	}
	f, err := os.Open("/proc/mounts")
	if err != nil {
		return false, false, err
	}
	// The file is only read, so a Close error carries no useful information.
	defer f.Close()

	// Whitespace and backslashes inside a mount point are written as octal
	// escapes so that the fields stay space-separated. Undo that before
	// comparing against a real path.
	unescape := strings.NewReplacer(`\040`, " ", `\011`, "\t", `\012`, "\n", `\134`, `\`)

	scanner := bufio.NewScanner(f)
	var mountPoint string
	var readonly bool
	for scanner.Scan() {
		line := scanner.Text()
		parts := strings.Split(line, " ")
		if len(parts) < 4 {
			return false, false, fmt.Errorf("invalid /proc/mounts line format %q", line)
		}
		mp := unescape.Replace(parts[1])
		opts := strings.Split(parts[3], ",")

		// The mount point must be the path itself or one of its parent
		// directories. A plain substring or string-prefix test is not enough:
		// "/bar" and "/foo/b" must not be taken as containing "/foo/bar".
		// Trimming the separator first makes the root mount "/" match every
		// absolute path.
		contains := mp == realPath || strings.HasPrefix(realPath, strings.TrimSuffix(mp, "/")+"/")

		// Find the closest submount to the path.
		if contains && len(mp) > len(mountPoint) {
			mountPoint = mp
			readonly = specutils.ContainsStr(opts, "ro")
		}
	}
	if err := scanner.Err(); err != nil {
		return false, false, err
	}
	return mountPoint == realPath, readonly, nil
}
// destroyFS lazily unmounts the supported bind mounts listed in 'spec.Mounts'
// from under `spec.Root.Path`, bringing the container rootfs back to its
// original state. Destinations that do not exist or are not mount points are
// skipped silently.
func destroyFS(spec *specs.Spec) error {
	const flags = syscall.MNT_DETACH
	for _, m := range spec.Mounts {
		if !(m.Type == "bind" && specutils.IsSupportedDevMount(m)) {
			continue
		}

		// 'm.Destination' may traverse symlinks that live inside the
		// container.
		dst, err := resolveSymlinks(spec.Root.Path, m.Destination)
		if err != nil {
			return err
		}

		log.Infof("Unmounting dst: %q, flags: %#x", dst, flags)
		switch err := syscall.Unmount(dst, flags); {
		case err == nil:
			// Unmounted.
		case os.IsNotExist(err), err == syscall.EINVAL:
			// Not an error: dst is missing or is not a mount point.
			// Based on http://man7.org/linux/man-pages/man2/umount.2.html
			// for kernel version 2.6+ and the MNT_DETACH flag, EINVAL means
			// that dst is not a mount point.
		default:
			return err
		}
	}
	return nil
}
// resolveSymlinks walks 'rel' treating 'root' as the root directory. Any
// symlink found along the way is evaluated relative to 'root', so the result
// is the same as if the path had been resolved by a process running inside
// the container. At most 255 nested symlinks are followed.
func resolveSymlinks(root, rel string) (string, error) {
	// Upper bound on nested symlink resolution; guards against link loops.
	const maxFollow uint = 255
	return resolveSymlinksImpl(root, root, rel, maxFollow)
}
// resolveSymlinksImpl walks 'rel' one path component at a time starting from
// 'base' and returns the resulting path, never leaving 'root'.
//
//   - Components that would climb above 'root' reset the walk to 'root'.
//   - Components that do not exist are appended as-is, since there is no
//     symlink to evaluate.
//   - A symlink is resolved recursively: absolute targets are interpreted
//     relative to 'root', relative targets relative to the directory holding
//     the link.
//
// 'followCount' is the remaining symlink nesting budget. An error is returned
// when it reaches zero, and for any Lstat/Readlink failure other than "not
// found".
func resolveSymlinksImpl(root, base, rel string, followCount uint) (string, error) {
	if followCount == 0 {
		return "", fmt.Errorf("too many symlinks to follow, path: %q", filepath.Join(base, rel))
	}
	rel = filepath.Clean(rel)
	for _, name := range strings.Split(rel, string(filepath.Separator)) {
		if name == "" {
			continue
		}
		// Note that Join() resolves things like ".." and returns a clean path.
		path := filepath.Join(base, name)
		if !strings.HasPrefix(path, root) {
			// One cannot '..' their way out of root: restart the walk from
			// root. Assigning to 'base' (rather than the loop-local 'path')
			// is what makes the reset take effect, e.g. when 'root' carries a
			// trailing separator and ".." lands exactly on the root.
			base = root
			continue
		}
		fi, err := os.Lstat(path)
		if err != nil {
			if !os.IsNotExist(err) {
				return "", err
			}
			// Not found means there is no symlink to check. Just keep walking dirs.
			base = path
			continue
		}
		if fi.Mode()&os.ModeSymlink != 0 {
			link, err := os.Readlink(path)
			if err != nil {
				return "", err
			}
			if filepath.IsAbs(link) {
				// Absolute targets are absolute inside the container.
				base = root
			}
			base, err = resolveSymlinksImpl(root, base, link, followCount-1)
			if err != nil {
				return "", err
			}
			continue
		}
		base = path
	}
	return base, nil
}
// optionsToFlags converts a list of mount option names into the matching
// MS_* flag bits using optionsMap.
//
// Options are applied in order: an option whose mapping has 'set' true turns
// its bits on, and one with 'set' false turns them off. A later option
// therefore overrides an earlier one for the same flag, and a negated option
// on its own (e.g. just "rw") contributes no bits. Options missing from
// optionsMap are logged and ignored.
func optionsToFlags(opts []string) uint32 {
	var rv uint32
	for _, opt := range opts {
		m, ok := optionsMap[opt]
		if !ok {
			log.Warningf("Ignoring mount option %q", opt)
			continue
		}
		if m.set {
			rv |= m.val
		} else {
			// Clear, don't toggle: XOR would turn the flag on whenever it was
			// not already set.
			rv &^= m.val
		}
	}
	return rv
}