2018-04-27 17:37:02 +00:00
|
|
|
// Copyright 2018 Google Inc.
|
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
2018-06-20 04:42:21 +00:00
|
|
|
// Package boot loads the kernel and runs a container.
|
2018-04-27 17:37:02 +00:00
|
|
|
package boot
|
|
|
|
|
|
|
|
import (
|
2018-06-22 21:30:33 +00:00
|
|
|
"errors"
|
2018-04-27 17:37:02 +00:00
|
|
|
"fmt"
|
|
|
|
"math/rand"
|
2018-08-15 23:24:07 +00:00
|
|
|
"os"
|
2018-06-15 16:17:40 +00:00
|
|
|
"runtime"
|
2018-06-22 21:30:33 +00:00
|
|
|
"sync"
|
2018-04-27 17:37:02 +00:00
|
|
|
"sync/atomic"
|
|
|
|
gtime "time"
|
|
|
|
|
|
|
|
specs "github.com/opencontainers/runtime-spec/specs-go"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/cpuid"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/log"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/inet"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/loader"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/platform"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/kvm"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/ptrace"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/sighandling"
|
|
|
|
slinux "gvisor.googlesource.com/gvisor/pkg/sentry/syscalls/linux"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/time"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/watchdog"
|
2018-05-02 05:11:07 +00:00
|
|
|
"gvisor.googlesource.com/gvisor/pkg/tcpip"
|
2018-04-27 17:37:02 +00:00
|
|
|
"gvisor.googlesource.com/gvisor/pkg/tcpip/link/sniffer"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/tcpip/network/arp"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv4"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/tcpip/network/ipv6"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/tcpip/stack"
|
2018-05-02 05:50:55 +00:00
|
|
|
"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/ping"
|
2018-04-27 17:37:02 +00:00
|
|
|
"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/tcp"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/tcpip/transport/udp"
|
|
|
|
"gvisor.googlesource.com/gvisor/runsc/boot/filter"
|
|
|
|
"gvisor.googlesource.com/gvisor/runsc/specutils"
|
|
|
|
|
|
|
|
// Include supported socket providers.
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/socket/epsocket"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/socket/hostinet"
|
|
|
|
_ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink"
|
|
|
|
_ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/netlink/route"
|
|
|
|
_ "gvisor.googlesource.com/gvisor/pkg/sentry/socket/unix"
|
|
|
|
)
|
|
|
|
|
2018-05-17 18:54:36 +00:00
|
|
|
// Loader keeps state needed to start the kernel and run the container..
|
2018-04-27 17:37:02 +00:00
|
|
|
type Loader struct {
|
|
|
|
// k is the kernel.
|
|
|
|
k *kernel.Kernel
|
|
|
|
|
|
|
|
// ctrl is the control server.
|
|
|
|
ctrl *controller
|
|
|
|
|
|
|
|
conf *Config
|
|
|
|
|
|
|
|
// console is set to true if terminal is enabled.
|
|
|
|
console bool
|
|
|
|
|
|
|
|
watchdog *watchdog.Watchdog
|
|
|
|
|
2018-07-18 23:57:29 +00:00
|
|
|
// ioFDs are the FDs that attach the sandbox to the gofers.
|
|
|
|
ioFDs []int
|
|
|
|
|
|
|
|
// spec is the base configuration for the root container.
|
|
|
|
spec *specs.Spec
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// stopSignalForwarding disables forwarding of signals to the sandboxed
|
2018-05-17 18:54:36 +00:00
|
|
|
// container. It should be called when a sandbox is destroyed.
|
2018-04-27 17:37:02 +00:00
|
|
|
stopSignalForwarding func()
|
|
|
|
|
2018-06-29 21:46:45 +00:00
|
|
|
// restore is set to true if we are restoring a container.
|
|
|
|
restore bool
|
|
|
|
|
2018-06-20 04:42:21 +00:00
|
|
|
// rootProcArgs refers to the root sandbox init task.
|
|
|
|
rootProcArgs kernel.CreateProcessArgs
|
2018-06-22 21:30:33 +00:00
|
|
|
|
2018-06-28 21:55:46 +00:00
|
|
|
// sandboxID is the ID for the whole sandbox.
|
|
|
|
sandboxID string
|
|
|
|
|
2018-06-22 21:30:33 +00:00
|
|
|
// mu guards containerRootTGIDs.
|
|
|
|
mu sync.Mutex
|
|
|
|
|
|
|
|
// containerRootTGIDs maps container IDs to their root processes. It
|
|
|
|
// can be used to determine which process to manipulate when clients
|
|
|
|
// call methods on particular containers.
|
|
|
|
//
|
|
|
|
// containerRootTGIDs is guarded by mu.
|
|
|
|
containerRootTGIDs map[string]kernel.ThreadID
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func init() {
|
|
|
|
// Initialize the random number generator.
|
|
|
|
rand.Seed(gtime.Now().UnixNano())
|
|
|
|
|
|
|
|
// Register the global syscall table.
|
|
|
|
kernel.RegisterSyscallTable(slinux.AMD64)
|
|
|
|
}
|
|
|
|
|
|
|
|
// New initializes a new kernel loader configured by spec.
|
2018-06-29 21:46:45 +00:00
|
|
|
// New also handles setting up a kernel for restoring a container.
|
2018-07-18 23:57:29 +00:00
|
|
|
func New(spec *specs.Spec, conf *Config, controllerFD int, ioFDs []int, console bool) (*Loader, error) {
|
2018-04-27 17:37:02 +00:00
|
|
|
// Create kernel and platform.
|
|
|
|
p, err := createPlatform(conf)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error creating platform: %v", err)
|
|
|
|
}
|
|
|
|
k := &kernel.Kernel{
|
|
|
|
Platform: p,
|
|
|
|
}
|
|
|
|
|
2018-07-18 23:57:29 +00:00
|
|
|
// Create VDSO.
|
|
|
|
//
|
|
|
|
// Pass k as the platform since it is savable, unlike the actual platform.
|
|
|
|
vdso, err := loader.PrepareVDSO(k)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error creating vdso: %v", err)
|
|
|
|
}
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2018-07-18 23:57:29 +00:00
|
|
|
// Create timekeeper.
|
|
|
|
tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error creating timekeeper: %v", err)
|
|
|
|
}
|
|
|
|
tk.SetClocks(time.NewCalibratedClocks())
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2018-07-18 23:57:29 +00:00
|
|
|
// Create capabilities.
|
|
|
|
caps, err := specutils.Capabilities(spec.Process.Capabilities)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error creating capabilities: %v", err)
|
|
|
|
}
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2018-07-18 23:57:29 +00:00
|
|
|
// Convert the spec's additional GIDs to KGIDs.
|
|
|
|
extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
|
|
|
|
for _, GID := range spec.Process.User.AdditionalGids {
|
|
|
|
extraKGIDs = append(extraKGIDs, auth.KGID(GID))
|
|
|
|
}
|
2018-06-29 21:46:45 +00:00
|
|
|
|
2018-07-18 23:57:29 +00:00
|
|
|
// Create credentials.
|
|
|
|
creds := auth.NewUserCredentials(
|
|
|
|
auth.KUID(spec.Process.User.UID),
|
|
|
|
auth.KGID(spec.Process.User.GID),
|
|
|
|
extraKGIDs,
|
|
|
|
caps,
|
|
|
|
auth.NewRootUserNamespace())
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2018-07-18 23:57:29 +00:00
|
|
|
// Create user namespace.
|
|
|
|
// TODO: Not clear what domain name should be here. It is
|
|
|
|
// not configurable from runtime spec.
|
|
|
|
utsns := kernel.NewUTSNamespace(spec.Hostname, "", creds.UserNamespace)
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2018-07-18 23:57:29 +00:00
|
|
|
ipcns := kernel.NewIPCNamespace(creds.UserNamespace)
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
if err := enableStrace(conf); err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to enable strace: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create an empty network stack because the network namespace may be empty at
|
|
|
|
// this point. Netns is configured before Run() is called. Netstack is
|
|
|
|
// configured using a control uRPC message. Host network is configured inside
|
|
|
|
// Run().
|
2018-08-08 17:24:53 +00:00
|
|
|
networkStack, err := newEmptyNetworkStack(conf, k)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to create network: %v", err)
|
|
|
|
}
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2018-07-18 23:57:29 +00:00
|
|
|
// Initiate the Kernel object, which is required by the Context passed
|
|
|
|
// to createVFS in order to mount (among other things) procfs.
|
|
|
|
if err = k.Init(kernel.InitKernelArgs{
|
|
|
|
FeatureSet: cpuid.HostFeatureSet(),
|
|
|
|
Timekeeper: tk,
|
|
|
|
RootUserNamespace: creds.UserNamespace,
|
|
|
|
NetworkStack: networkStack,
|
|
|
|
// TODO: use number of logical processors from cgroups.
|
|
|
|
ApplicationCores: uint(runtime.NumCPU()),
|
|
|
|
Vdso: vdso,
|
|
|
|
RootUTSNamespace: utsns,
|
|
|
|
RootIPCNamespace: ipcns,
|
|
|
|
}); err != nil {
|
|
|
|
return nil, fmt.Errorf("error initializing kernel: %v", err)
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Turn on packet logging if enabled.
|
|
|
|
if conf.LogPackets {
|
|
|
|
log.Infof("Packet logging enabled")
|
|
|
|
atomic.StoreUint32(&sniffer.LogPackets, 1)
|
|
|
|
} else {
|
|
|
|
log.Infof("Packet logging disabled")
|
|
|
|
atomic.StoreUint32(&sniffer.LogPackets, 0)
|
|
|
|
}
|
|
|
|
|
2018-06-06 18:43:01 +00:00
|
|
|
// Create a watchdog.
|
2018-06-28 16:45:52 +00:00
|
|
|
watchdog := watchdog.New(k, watchdog.DefaultTimeout, conf.WatchdogAction)
|
2018-06-06 18:43:01 +00:00
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Create the control server using the provided FD.
|
|
|
|
//
|
|
|
|
// This must be done *after* we have initialized the kernel since the
|
|
|
|
// controller is used to configure the kernel's network stack.
|
|
|
|
//
|
|
|
|
// This should also be *before* we create the process, since a
|
|
|
|
// misconfigured process will cause an error, and we want the control
|
|
|
|
// server up before that so that we don't time out trying to connect to
|
|
|
|
// it.
|
2018-06-06 18:43:01 +00:00
|
|
|
ctrl, err := newController(controllerFD, k, watchdog)
|
2018-04-27 17:37:02 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("error creating control server: %v", err)
|
|
|
|
}
|
|
|
|
|
2018-06-20 04:42:21 +00:00
|
|
|
// We don't care about child signals; some platforms can generate a
|
|
|
|
// tremendous number of useless ones (I'm looking at you, ptrace).
|
|
|
|
if err := sighandling.IgnoreChildStop(); err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to ignore child stop signals: %v", err)
|
|
|
|
}
|
2018-06-21 20:21:25 +00:00
|
|
|
// Ensure that signals received are forwarded to the emulated kernel.
|
|
|
|
stopSignalForwarding := sighandling.PrepareForwarding(k, false)()
|
2018-06-20 04:42:21 +00:00
|
|
|
|
2018-08-15 23:24:07 +00:00
|
|
|
procArgs, err := newProcess(spec, creds, utsns, ipcns, k)
|
2018-07-18 23:57:29 +00:00
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to create root process: %v", err)
|
2018-06-20 04:42:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
l := &Loader{
|
|
|
|
k: k,
|
|
|
|
ctrl: ctrl,
|
|
|
|
conf: conf,
|
|
|
|
console: console,
|
|
|
|
watchdog: watchdog,
|
2018-07-18 23:57:29 +00:00
|
|
|
ioFDs: ioFDs,
|
|
|
|
spec: spec,
|
2018-06-20 04:42:21 +00:00
|
|
|
stopSignalForwarding: stopSignalForwarding,
|
|
|
|
rootProcArgs: procArgs,
|
|
|
|
}
|
|
|
|
ctrl.manager.l = l
|
|
|
|
return l, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// newProcess creates a process that can be run with kernel.CreateProcess.
|
2018-08-15 23:24:07 +00:00
|
|
|
func newProcess(spec *specs.Spec, creds *auth.Credentials, utsns *kernel.UTSNamespace, ipcns *kernel.IPCNamespace, k *kernel.Kernel) (kernel.CreateProcessArgs, error) {
|
2018-06-20 04:42:21 +00:00
|
|
|
// Create initial limits.
|
|
|
|
ls, err := createLimitSet(spec)
|
|
|
|
if err != nil {
|
|
|
|
return kernel.CreateProcessArgs{}, fmt.Errorf("error creating limits: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get the executable path, which is a bit tricky because we have to
|
|
|
|
// inspect the environment PATH which is relative to the root path.
|
|
|
|
exec, err := specutils.GetExecutablePath(spec.Process.Args[0], spec.Root.Path, spec.Process.Env)
|
|
|
|
if err != nil {
|
|
|
|
return kernel.CreateProcessArgs{}, fmt.Errorf("error getting executable path: %v", err)
|
|
|
|
}
|
|
|
|
|
2018-05-24 21:27:05 +00:00
|
|
|
// Create the process arguments.
|
|
|
|
procArgs := kernel.CreateProcessArgs{
|
2018-06-20 04:42:21 +00:00
|
|
|
Filename: exec,
|
|
|
|
Argv: spec.Process.Args,
|
|
|
|
Envv: spec.Process.Env,
|
2018-06-28 16:56:23 +00:00
|
|
|
WorkingDirectory: spec.Process.Cwd, // Defaults to '/' if empty.
|
2018-06-20 04:42:21 +00:00
|
|
|
Credentials: creds,
|
2018-06-26 20:39:07 +00:00
|
|
|
Umask: 0022,
|
2018-05-24 21:27:05 +00:00
|
|
|
Limits: ls,
|
|
|
|
MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
|
|
|
|
UTSNamespace: utsns,
|
|
|
|
IPCNamespace: ipcns,
|
|
|
|
}
|
2018-06-20 04:42:21 +00:00
|
|
|
return procArgs, nil
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Destroy cleans up all resources used by the loader.
|
2018-07-12 20:36:01 +00:00
|
|
|
//
|
|
|
|
// Note that this will block until all open control server connections have
|
|
|
|
// been closed. For that reason, this should NOT be called in a defer, because
|
|
|
|
// a panic in a control server rpc would then hang forever.
|
2018-04-27 17:37:02 +00:00
|
|
|
func (l *Loader) Destroy() {
|
|
|
|
if l.ctrl != nil {
|
|
|
|
l.ctrl.srv.Stop()
|
|
|
|
}
|
|
|
|
l.stopSignalForwarding()
|
|
|
|
l.watchdog.Stop()
|
|
|
|
}
|
|
|
|
|
|
|
|
func createPlatform(conf *Config) (platform.Platform, error) {
|
|
|
|
switch conf.Platform {
|
|
|
|
case PlatformPtrace:
|
|
|
|
log.Infof("Platform: ptrace")
|
|
|
|
return ptrace.New()
|
|
|
|
case PlatformKVM:
|
|
|
|
log.Infof("Platform: kvm")
|
|
|
|
return kvm.New()
|
|
|
|
default:
|
|
|
|
return nil, fmt.Errorf("invalid platform %v", conf.Platform)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-05-17 18:54:36 +00:00
|
|
|
// Run runs the root container..
|
2018-04-27 17:37:02 +00:00
|
|
|
func (l *Loader) Run() error {
|
|
|
|
err := l.run()
|
2018-05-17 18:54:36 +00:00
|
|
|
l.ctrl.manager.startResultChan <- err
|
2018-05-09 21:12:44 +00:00
|
|
|
if err != nil {
|
|
|
|
// Give the controller some time to send the error to the
|
|
|
|
// runtime. If we return too quickly here the process will exit
|
|
|
|
// and the control connection will be closed before the error
|
|
|
|
// is returned.
|
|
|
|
gtime.Sleep(2 * gtime.Second)
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
return nil
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Loader) run() error {
|
|
|
|
if l.conf.Network == NetworkHost {
|
|
|
|
// Delay host network configuration to this point because network namespace
|
|
|
|
// is configured after the loader is created and before Run() is called.
|
|
|
|
log.Debugf("Configuring host network")
|
|
|
|
stack := l.k.NetworkStack().(*hostinet.Stack)
|
|
|
|
if err := stack.Configure(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Finally done with all configuration. Setup filters before user code
|
|
|
|
// is loaded.
|
|
|
|
if l.conf.DisableSeccomp {
|
|
|
|
filter.Report("syscall filter is DISABLED. Running in less secure mode.")
|
|
|
|
} else {
|
|
|
|
whitelistFS := l.conf.FileAccess == FileAccessDirect
|
|
|
|
hostNet := l.conf.Network == NetworkHost
|
|
|
|
if err := filter.Install(l.k.Platform, whitelistFS, l.console, hostNet); err != nil {
|
|
|
|
return fmt.Errorf("Failed to install seccomp filters: %v", err)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-06-29 21:46:45 +00:00
|
|
|
// If we are restoring, we do not want to create a process.
|
2018-07-18 23:57:29 +00:00
|
|
|
// l.restore is set by the container manager when a restore call is made.
|
2018-06-29 21:46:45 +00:00
|
|
|
if !l.restore {
|
2018-07-18 23:57:29 +00:00
|
|
|
err := setFileSystemForProcess(
|
|
|
|
&l.rootProcArgs,
|
|
|
|
l.spec,
|
|
|
|
l.conf,
|
|
|
|
l.ioFDs,
|
|
|
|
l.console,
|
|
|
|
l.rootProcArgs.Credentials,
|
|
|
|
l.rootProcArgs.Limits,
|
2018-08-15 23:24:07 +00:00
|
|
|
l.k,
|
|
|
|
"" /* CID, which isn't needed for the root container */)
|
2018-07-18 23:57:29 +00:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2018-06-29 21:46:45 +00:00
|
|
|
// Create the root container init task.
|
|
|
|
if _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
|
|
|
|
return fmt.Errorf("failed to create init process: %v", err)
|
|
|
|
}
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2018-06-29 21:46:45 +00:00
|
|
|
// CreateProcess takes a reference on FDMap if successful.
|
|
|
|
l.rootProcArgs.FDMap.DecRef()
|
|
|
|
}
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2018-07-18 23:57:29 +00:00
|
|
|
log.Infof("Process should have started...")
|
2018-04-27 17:37:02 +00:00
|
|
|
l.watchdog.Start()
|
|
|
|
return l.k.Start()
|
|
|
|
}
|
|
|
|
|
2018-06-22 21:30:33 +00:00
|
|
|
// startContainer starts a child container. It returns the thread group ID of
|
|
|
|
// the newly created process.
|
2018-08-15 23:24:07 +00:00
|
|
|
func (l *Loader) startContainer(k *kernel.Kernel, spec *specs.Spec, conf *Config, cid string, file *os.File) (kernel.ThreadID, error) {
|
2018-06-20 04:42:21 +00:00
|
|
|
// Create capabilities.
|
|
|
|
caps, err := specutils.Capabilities(spec.Process.Capabilities)
|
|
|
|
if err != nil {
|
2018-06-22 21:30:33 +00:00
|
|
|
return 0, fmt.Errorf("error creating capabilities: %v", err)
|
2018-06-20 04:42:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Convert the spec's additional GIDs to KGIDs.
|
|
|
|
extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
|
|
|
|
for _, GID := range spec.Process.User.AdditionalGids {
|
|
|
|
extraKGIDs = append(extraKGIDs, auth.KGID(GID))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create credentials. We reuse the root user namespace because the
|
|
|
|
// sentry currently supports only 1 mount namespace, which is tied to a
|
|
|
|
// single user namespace. Thus we must run in the same user namespace
|
|
|
|
// to access mounts.
|
|
|
|
// TODO: Create a new mount namespace for the container.
|
|
|
|
creds := auth.NewUserCredentials(
|
|
|
|
auth.KUID(spec.Process.User.UID),
|
|
|
|
auth.KGID(spec.Process.User.GID),
|
|
|
|
extraKGIDs,
|
|
|
|
caps,
|
|
|
|
l.k.RootUserNamespace())
|
|
|
|
|
|
|
|
// TODO New containers should be started in new PID namespaces
|
|
|
|
// when indicated by the spec.
|
|
|
|
|
|
|
|
procArgs, err := newProcess(
|
2018-08-15 23:24:07 +00:00
|
|
|
spec,
|
2018-06-20 04:42:21 +00:00
|
|
|
creds,
|
2018-08-15 23:24:07 +00:00
|
|
|
l.k.RootUTSNamespace(),
|
|
|
|
l.k.RootIPCNamespace(),
|
|
|
|
l.k)
|
2018-06-20 04:42:21 +00:00
|
|
|
if err != nil {
|
2018-06-22 21:30:33 +00:00
|
|
|
return 0, fmt.Errorf("failed to create new process: %v", err)
|
|
|
|
}
|
2018-07-18 23:57:29 +00:00
|
|
|
err = setFileSystemForProcess(
|
|
|
|
&procArgs,
|
2018-08-15 23:24:07 +00:00
|
|
|
spec,
|
|
|
|
conf,
|
|
|
|
[]int{int(file.Fd())}, // ioFDs
|
2018-07-18 23:57:29 +00:00
|
|
|
false,
|
|
|
|
creds,
|
|
|
|
procArgs.Limits,
|
2018-08-15 23:24:07 +00:00
|
|
|
k,
|
|
|
|
cid)
|
2018-07-18 23:57:29 +00:00
|
|
|
if err != nil {
|
|
|
|
return 0, fmt.Errorf("failed to create new process: %v", err)
|
|
|
|
}
|
2018-06-22 21:30:33 +00:00
|
|
|
|
|
|
|
tg, err := l.k.CreateProcess(procArgs)
|
|
|
|
if err != nil {
|
|
|
|
return 0, fmt.Errorf("failed to create process in sentry: %v", err)
|
2018-06-20 04:42:21 +00:00
|
|
|
}
|
|
|
|
|
2018-08-15 23:24:07 +00:00
|
|
|
ts := l.k.TaskSet()
|
2018-06-22 21:30:33 +00:00
|
|
|
tgid := ts.Root.IDOfThreadGroup(tg)
|
|
|
|
if tgid == 0 {
|
|
|
|
return 0, errors.New("failed to get thread group ID of new process")
|
2018-06-20 04:42:21 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// CreateProcess takes a reference on FDMap if successful.
|
|
|
|
procArgs.FDMap.DecRef()
|
|
|
|
|
2018-06-22 21:30:33 +00:00
|
|
|
l.mu.Lock()
|
|
|
|
defer l.mu.Unlock()
|
2018-08-15 23:24:07 +00:00
|
|
|
l.containerRootTGIDs[cid] = tgid
|
2018-06-22 21:30:33 +00:00
|
|
|
|
|
|
|
return tgid, nil
|
|
|
|
}
|
|
|
|
|
2018-06-28 21:55:46 +00:00
|
|
|
// TODO: Per-container namespaces must be supported
|
|
|
|
// for -pid.
|
|
|
|
|
|
|
|
// waitContainer waits for the root process of a container to exit.
|
|
|
|
func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
|
|
|
|
// Don't defer unlock, as doing so would make it impossible for
|
|
|
|
// multiple clients to wait on the same container.
|
2018-06-22 21:30:33 +00:00
|
|
|
l.mu.Lock()
|
2018-06-28 21:55:46 +00:00
|
|
|
tgid, ok := l.containerRootTGIDs[cid]
|
2018-06-22 21:30:33 +00:00
|
|
|
if !ok {
|
2018-07-13 20:45:13 +00:00
|
|
|
defer l.mu.Unlock()
|
2018-06-28 21:55:46 +00:00
|
|
|
return fmt.Errorf("can't find process for container %q in %v", cid, l.containerRootTGIDs)
|
2018-06-22 21:30:33 +00:00
|
|
|
}
|
2018-07-13 20:45:13 +00:00
|
|
|
l.mu.Unlock()
|
|
|
|
|
2018-06-22 21:30:33 +00:00
|
|
|
// If the thread either has already exited or exits during waiting,
|
|
|
|
// consider the container exited.
|
2018-06-28 21:55:46 +00:00
|
|
|
defer func() {
|
|
|
|
l.mu.Lock()
|
|
|
|
defer l.mu.Unlock()
|
|
|
|
// TODO: Containers don't map 1:1 with their root
|
|
|
|
// processes. Container exits should be managed explicitly
|
|
|
|
// rather than via PID.
|
|
|
|
delete(l.containerRootTGIDs, cid)
|
|
|
|
}()
|
|
|
|
return l.wait(tgid, cid, waitStatus)
|
|
|
|
}
|
|
|
|
|
|
|
|
func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
|
|
|
|
// TODO: Containers all currently share a PID namespace.
|
|
|
|
// When per-container PID namespaces are supported, wait should use cid
|
|
|
|
// to find the appropriate PID namespace.
|
|
|
|
if cid != l.sandboxID {
|
|
|
|
return errors.New("non-sandbox PID namespaces are not yet implemented")
|
|
|
|
}
|
|
|
|
return l.wait(tgid, cid, waitStatus)
|
|
|
|
}
|
2018-06-22 21:30:33 +00:00
|
|
|
|
2018-06-28 21:55:46 +00:00
|
|
|
// wait waits for the process with TGID 'tgid' in a container's PID namespace
|
|
|
|
// to exit.
|
|
|
|
func (l *Loader) wait(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
|
|
|
|
tg := l.k.TaskSet().Root.ThreadGroupWithID(kernel.ThreadID(tgid))
|
2018-06-22 21:30:33 +00:00
|
|
|
if tg == nil {
|
|
|
|
return fmt.Errorf("no thread group with ID %d", tgid)
|
|
|
|
}
|
|
|
|
tg.WaitExited()
|
|
|
|
*waitStatus = tg.ExitStatus().Status()
|
2018-06-20 04:42:21 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2018-06-22 21:30:33 +00:00
|
|
|
func (l *Loader) setRootContainerID(cid string) {
|
|
|
|
l.mu.Lock()
|
|
|
|
defer l.mu.Unlock()
|
|
|
|
// The root container has PID 1.
|
|
|
|
l.containerRootTGIDs = map[string]kernel.ThreadID{cid: 1}
|
2018-06-28 21:55:46 +00:00
|
|
|
l.sandboxID = cid
|
2018-06-22 21:30:33 +00:00
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// WaitForStartSignal waits for a start signal from the control server.
|
|
|
|
func (l *Loader) WaitForStartSignal() {
|
2018-05-17 18:54:36 +00:00
|
|
|
<-l.ctrl.manager.startChan
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
2018-07-18 23:57:29 +00:00
|
|
|
// NotifyLoaderCreated sends a signal to the container manager that this
|
|
|
|
// loader has been created.
|
|
|
|
func (l *Loader) NotifyLoaderCreated() {
|
|
|
|
l.ctrl.manager.loaderCreatedChan <- struct{}{}
|
|
|
|
}
|
|
|
|
|
2018-05-17 18:54:36 +00:00
|
|
|
// WaitExit waits for the root container to exit, and returns its exit status.
|
2018-04-27 17:37:02 +00:00
|
|
|
func (l *Loader) WaitExit() kernel.ExitStatus {
|
2018-05-17 18:54:36 +00:00
|
|
|
// Wait for container.
|
2018-04-27 17:37:02 +00:00
|
|
|
l.k.WaitExited()
|
|
|
|
|
|
|
|
return l.k.GlobalInit().ExitStatus()
|
|
|
|
}
|
|
|
|
|
2018-08-08 17:24:53 +00:00
|
|
|
func newEmptyNetworkStack(conf *Config, clock tcpip.Clock) (inet.Stack, error) {
|
2018-04-27 17:37:02 +00:00
|
|
|
switch conf.Network {
|
|
|
|
case NetworkHost:
|
2018-08-08 17:24:53 +00:00
|
|
|
return hostinet.NewStack(), nil
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
case NetworkNone, NetworkSandbox:
|
|
|
|
// NetworkNone sets up loopback using netstack.
|
|
|
|
netProtos := []string{ipv4.ProtocolName, ipv6.ProtocolName, arp.ProtocolName}
|
2018-05-02 05:50:55 +00:00
|
|
|
protoNames := []string{tcp.ProtocolName, udp.ProtocolName, ping.ProtocolName4}
|
2018-08-08 17:24:53 +00:00
|
|
|
s := &epsocket.Stack{stack.New(netProtos, protoNames, stack.Options{Clock: clock})}
|
|
|
|
if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
|
|
|
|
return nil, fmt.Errorf("failed to enable SACK: %v", err)
|
|
|
|
}
|
|
|
|
return s, nil
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
default:
|
|
|
|
panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
|
|
|
|
}
|
|
|
|
}
|