gvisor/runsc/boot/loader.go


// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package boot loads the kernel and runs a container.
package boot
import (
"fmt"
mrand "math/rand"
"os"
"runtime"
"sync/atomic"
"syscall"
gtime "time"
specs "github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/cpuid"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/memutil"
"gvisor.dev/gvisor/pkg/rand"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/control"
"gvisor.dev/gvisor/pkg/sentry/fdimport"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
"gvisor.dev/gvisor/pkg/sentry/fs/user"
hostvfs2 "gvisor.dev/gvisor/pkg/sentry/fsimpl/host"
"gvisor.dev/gvisor/pkg/sentry/inet"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/loader"
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/sighandling"
"gvisor.dev/gvisor/pkg/sentry/syscalls/linux/vfs2"
"gvisor.dev/gvisor/pkg/sentry/time"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sentry/watchdog"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/tcpip"
"gvisor.dev/gvisor/pkg/tcpip/link/loopback"
"gvisor.dev/gvisor/pkg/tcpip/link/sniffer"
"gvisor.dev/gvisor/pkg/tcpip/network/arp"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv4"
"gvisor.dev/gvisor/pkg/tcpip/network/ipv6"
"gvisor.dev/gvisor/pkg/tcpip/stack"
"gvisor.dev/gvisor/pkg/tcpip/transport/icmp"
"gvisor.dev/gvisor/pkg/tcpip/transport/raw"
"gvisor.dev/gvisor/pkg/tcpip/transport/tcp"
"gvisor.dev/gvisor/pkg/tcpip/transport/udp"
"gvisor.dev/gvisor/runsc/boot/filter"
_ "gvisor.dev/gvisor/runsc/boot/platforms" // register all platforms.
"gvisor.dev/gvisor/runsc/boot/pprof"
"gvisor.dev/gvisor/runsc/specutils"
// Include supported socket providers.
"gvisor.dev/gvisor/pkg/sentry/socket/hostinet"
_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink"
_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/route"
_ "gvisor.dev/gvisor/pkg/sentry/socket/netlink/uevent"
"gvisor.dev/gvisor/pkg/sentry/socket/netstack"
_ "gvisor.dev/gvisor/pkg/sentry/socket/unix"
)
// Loader keeps state needed to start the kernel and run the container.
type Loader struct {
// k is the kernel.
k *kernel.Kernel
// ctrl is the control server.
ctrl *controller
conf *Config
// console is set to true if terminal is enabled.
console bool
watchdog *watchdog.Watchdog
// stdioFDs contains stdin, stdout, and stderr.
stdioFDs []int
// goferFDs are the FDs that attach the sandbox to the gofers.
goferFDs []int
// spec is the base configuration for the root container.
spec *specs.Spec
// stopSignalForwarding disables forwarding of signals to the sandboxed
// container. It should be called when a sandbox is destroyed.
stopSignalForwarding func()
// restore is set to true if we are restoring a container.
restore bool
// rootProcArgs refers to the root sandbox init task.
rootProcArgs kernel.CreateProcessArgs
// sandboxID is the ID for the whole sandbox.
sandboxID string
// mu guards processes.
mu sync.Mutex
// processes maps containers' init processes and invocations of exec. Root
// processes are keyed with container ID and pid=0, while exec invocations
// have the corresponding pid set. See the illustrative example following
// execID below.
//
// processes is guarded by mu.
processes map[execID]*execProcess
// mountHints provides extra information about mounts for containers that
// apply to the entire pod.
mountHints *podMountHints
}
// execID uniquely identifies a sentry process that is executed in a container.
type execID struct {
cid string
pid kernel.ThreadID
}
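// For illustration only (container ID and PID values are hypothetical), the
// Loader's processes map would hold entries such as:
//
//     processes[execID{cid: "ctr1"}]         // the container's init process (pid=0)
//     processes[execID{cid: "ctr1", pid: 2}] // a process created via "runsc exec"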
// execProcess contains the thread group and host TTY of a sentry process.
type execProcess struct {
// tg will be nil for containers that haven't started yet.
tg *kernel.ThreadGroup
// tty will be nil if the process is not attached to a terminal.
tty *host.TTYFileOperations
// ttyVFS2 will be nil if the process is not attached to a terminal.
ttyVFS2 *hostvfs2.TTYFileDescription
// pidnsPath is the PID namespace path in the spec.
pidnsPath string
}
func init() {
// Initialize the random number generator.
mrand.Seed(gtime.Now().UnixNano())
}
// Args are the arguments for New().
type Args struct {
// ID is the sandbox ID.
ID string
// Spec is the sandbox specification.
Spec *specs.Spec
// Conf is the system configuration.
Conf *Config
// ControllerFD is the FD to the URPC controller. The Loader takes ownership
// of this FD and may close it at any time.
ControllerFD int
// Device is an optional argument that is passed to the platform. The Loader
// takes ownership of this file and may close it at any time.
Device *os.File
// GoferFDs is an array of FDs used to connect with the Gofer. The Loader
// takes ownership of these FDs and may close them at any time.
GoferFDs []int
// StdioFDs is the stdio for the application. The Loader takes ownership of
// these FDs and may close them at any time.
StdioFDs []int
// Console is set to true if using TTY.
Console bool
// NumCPU is the number of CPUs to create inside the sandbox.
NumCPU int
// TotalMem is the initial amount of total memory to report back to the
// container.
TotalMem uint64
// UserLogFD is the file descriptor to write user logs to.
UserLogFD int
}
// startingStdioFD is the first host FD that stdio is dup'ed to, making sure
// stdioFDs are always the same on initial start and on restore.
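// As an illustrative sketch (the donated FD numbers vary with platform and
// flags), the loop in New below remaps whatever stdio FDs were donated onto
// this fixed range:
//
//     syscall.Dup3(args.StdioFDs[0], 64, syscall.O_CLOEXEC) // stdin
//     syscall.Dup3(args.StdioFDs[1], 65, syscall.O_CLOEXEC) // stdout
//     syscall.Dup3(args.StdioFDs[2], 66, syscall.O_CLOEXEC) // stderr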
const startingStdioFD = 64
// New initializes a new kernel loader configured by spec.
// New also handles setting up a kernel for restoring a container.
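// A minimal caller sketch (all FD values here are hypothetical; in practice
// they are donated by the parent runsc process):
//
//     l, err := New(Args{
//         ID:           "root",
//         Spec:         spec,
//         Conf:         conf,
//         ControllerFD: 4,
//         GoferFDs:     []int{8},
//         StdioFDs:     []int{10, 11, 12},
//     })
//     if err != nil {
//         return err
//     }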
func New(args Args) (*Loader, error) {
// We initialize the rand package now to make sure /dev/urandom is pre-opened
// on kernels that do not support getrandom(2).
if err := rand.Init(); err != nil {
return nil, fmt.Errorf("setting up rand: %v", err)
}
if err := usage.Init(); err != nil {
return nil, fmt.Errorf("setting up memory usage: %v", err)
}
// Is this a VFSv2 kernel?
if args.Conf.VFS2 {
kernel.VFS2Enabled = true
vfs2.Override()
}
// Create kernel and platform.
p, err := createPlatform(args.Conf, args.Device)
if err != nil {
return nil, fmt.Errorf("creating platform: %v", err)
}
k := &kernel.Kernel{
Platform: p,
}
// Create memory file.
mf, err := createMemoryFile()
if err != nil {
return nil, fmt.Errorf("creating memory file: %v", err)
}
k.SetMemoryFile(mf)
// Create VDSO.
//
// Pass k as the platform since it is savable, unlike the actual platform.
vdso, err := loader.PrepareVDSO(k)
if err != nil {
return nil, fmt.Errorf("creating vdso: %v", err)
}
// Create timekeeper.
tk, err := kernel.NewTimekeeper(k, vdso.ParamPage.FileRange())
if err != nil {
return nil, fmt.Errorf("creating timekeeper: %v", err)
}
tk.SetClocks(time.NewCalibratedClocks())
if err := enableStrace(args.Conf); err != nil {
return nil, fmt.Errorf("enabling strace: %v", err)
}
// Create root network namespace/stack.
netns, err := newRootNetworkNamespace(args.Conf, k, k)
if err != nil {
return nil, fmt.Errorf("creating network: %v", err)
}
// Create capabilities.
caps, err := specutils.Capabilities(args.Conf.EnableRaw, args.Spec.Process.Capabilities)
if err != nil {
return nil, fmt.Errorf("converting capabilities: %v", err)
}
// Convert the spec's additional GIDs to KGIDs.
extraKGIDs := make([]auth.KGID, 0, len(args.Spec.Process.User.AdditionalGids))
for _, GID := range args.Spec.Process.User.AdditionalGids {
extraKGIDs = append(extraKGIDs, auth.KGID(GID))
}
// Create credentials.
creds := auth.NewUserCredentials(
auth.KUID(args.Spec.Process.User.UID),
auth.KGID(args.Spec.Process.User.GID),
extraKGIDs,
caps,
auth.NewRootUserNamespace())
if args.NumCPU == 0 {
args.NumCPU = runtime.NumCPU()
}
log.Infof("CPUs: %d", args.NumCPU)
if args.TotalMem > 0 {
// Adjust the total memory returned by the Sentry so that applications that
// use /proc/meminfo can make allocations based on this limit.
usage.MinimumTotalMemoryBytes = args.TotalMem
log.Infof("Setting total memory to %.2f GB", float64(args.TotalMem)/(1<<30))
}
// Initialize the Kernel object, which is required by the Context passed
// to createVFS in order to mount (among other things) procfs.
if err = k.Init(kernel.InitKernelArgs{
FeatureSet: cpuid.HostFeatureSet(),
Timekeeper: tk,
RootUserNamespace: creds.UserNamespace,
RootNetworkNamespace: netns,
ApplicationCores: uint(args.NumCPU),
Vdso: vdso,
RootUTSNamespace: kernel.NewUTSNamespace(args.Spec.Hostname, args.Spec.Hostname, creds.UserNamespace),
RootIPCNamespace: kernel.NewIPCNamespace(creds.UserNamespace),
RootAbstractSocketNamespace: kernel.NewAbstractSocketNamespace(),
PIDNamespace: kernel.NewRootPIDNamespace(creds.UserNamespace),
}); err != nil {
return nil, fmt.Errorf("initializing kernel: %v", err)
}
if err := adjustDirentCache(k); err != nil {
return nil, err
}
// Turn on packet logging if enabled.
if args.Conf.LogPackets {
log.Infof("Packet logging enabled")
atomic.StoreUint32(&sniffer.LogPackets, 1)
} else {
log.Infof("Packet logging disabled")
atomic.StoreUint32(&sniffer.LogPackets, 0)
}
// Create a watchdog.
dogOpts := watchdog.DefaultOpts
dogOpts.TaskTimeoutAction = args.Conf.WatchdogAction
dog := watchdog.New(k, dogOpts)
procArgs, err := newProcess(args.ID, args.Spec, creds, k, k.RootPIDNamespace())
if err != nil {
return nil, fmt.Errorf("creating init process for root container: %v", err)
}
if err := initCompatLogs(args.UserLogFD); err != nil {
return nil, fmt.Errorf("initializing compat logs: %v", err)
}
mountHints, err := newPodMountHints(args.Spec)
if err != nil {
return nil, fmt.Errorf("creating pod mount hints: %v", err)
}
if kernel.VFS2Enabled {
// Set up host mount that will be used for imported fds.
hostFilesystem, err := hostvfs2.NewFilesystem(k.VFS())
if err != nil {
return nil, fmt.Errorf("failed to create hostfs filesystem: %v", err)
}
defer hostFilesystem.DecRef()
hostMount, err := k.VFS().NewDisconnectedMount(hostFilesystem, nil, &vfs.MountOptions{})
if err != nil {
return nil, fmt.Errorf("failed to create hostfs mount: %v", err)
}
k.SetHostMount(hostMount)
}
// Make host FDs stable between invocations. Host FDs must map to the exact
// same number when the sandbox is restored. Otherwise the wrong FD will be
// used.
var stdioFDs []int
newfd := startingStdioFD
for _, fd := range args.StdioFDs {
err := syscall.Dup3(fd, newfd, syscall.O_CLOEXEC)
if err != nil {
return nil, fmt.Errorf("dup3 of stdioFDs failed: %v", err)
}
stdioFDs = append(stdioFDs, newfd)
err = syscall.Close(fd)
if err != nil {
return nil, fmt.Errorf("close original stdioFDs failed: %v", err)
}
newfd++
}
eid := execID{cid: args.ID}
l := &Loader{
k: k,
conf: args.Conf,
console: args.Console,
watchdog: dog,
spec: args.Spec,
goferFDs: args.GoferFDs,
stdioFDs: stdioFDs,
rootProcArgs: procArgs,
sandboxID: args.ID,
processes: map[execID]*execProcess{eid: {}},
mountHints: mountHints,
}
// We don't care about child signals; some platforms can generate a
// tremendous number of useless ones (I'm looking at you, ptrace).
if err := sighandling.IgnoreChildStop(); err != nil {
return nil, fmt.Errorf("ignore child stop signals failed: %v", err)
}
// Create the control server using the provided FD.
//
// This must be done *after* we have initialized the kernel since the
// controller is used to configure the kernel's network stack.
ctrl, err := newController(args.ControllerFD, l)
if err != nil {
return nil, fmt.Errorf("creating control server: %v", err)
}
l.ctrl = ctrl
// Only start serving after Loader is set to controller and controller is set
// to Loader, because they are both used in the urpc methods.
if err := ctrl.srv.StartServing(); err != nil {
return nil, fmt.Errorf("starting control server: %v", err)
}
return l, nil
}
// newProcess creates a process that can be run with kernel.CreateProcess.
func newProcess(id string, spec *specs.Spec, creds *auth.Credentials, k *kernel.Kernel, pidns *kernel.PIDNamespace) (kernel.CreateProcessArgs, error) {
// Create initial limits.
ls, err := createLimitSet(spec)
if err != nil {
return kernel.CreateProcessArgs{}, fmt.Errorf("creating limits: %v", err)
}
wd := spec.Process.Cwd
if wd == "" {
wd = "/"
}
// Create the process arguments.
procArgs := kernel.CreateProcessArgs{
Argv: spec.Process.Args,
Envv: spec.Process.Env,
WorkingDirectory: wd,
Credentials: creds,
Umask: 0022,
Limits: ls,
MaxSymlinkTraversals: linux.MaxSymlinkTraversals,
UTSNamespace: k.RootUTSNamespace(),
IPCNamespace: k.RootIPCNamespace(),
AbstractSocketNamespace: k.RootAbstractSocketNamespace(),
ContainerID: id,
PIDNamespace: pidns,
}
return procArgs, nil
}
// Destroy cleans up all resources used by the loader.
//
// Note that this will block until all open control server connections have
// been closed. For that reason, this should NOT be called in a defer, because
// a panic in a control server rpc would then hang forever.
func (l *Loader) Destroy() {
if l.ctrl != nil {
l.ctrl.srv.Stop()
}
if l.stopSignalForwarding != nil {
l.stopSignalForwarding()
}
l.watchdog.Stop()
}
func createPlatform(conf *Config, deviceFile *os.File) (platform.Platform, error) {
p, err := platform.Lookup(conf.Platform)
if err != nil {
panic(fmt.Sprintf("invalid platform %v: %v", conf.Platform, err))
}
log.Infof("Platform: %s", conf.Platform)
return p.New(deviceFile)
}
func createMemoryFile() (*pgalloc.MemoryFile, error) {
const memfileName = "runsc-memory"
memfd, err := memutil.CreateMemFD(memfileName, 0)
if err != nil {
return nil, fmt.Errorf("error creating memfd: %v", err)
}
memfile := os.NewFile(uintptr(memfd), memfileName)
// We can't enable pgalloc.MemoryFileOpts.UseHostMemcgPressure even if
// there are memory cgroups specified, because at this point we're already
// in a mount namespace in which the relevant cgroupfs is not visible.
mf, err := pgalloc.NewMemoryFile(memfile, pgalloc.MemoryFileOpts{})
if err != nil {
memfile.Close()
return nil, fmt.Errorf("error creating pgalloc.MemoryFile: %v", err)
}
return mf, nil
}
func (l *Loader) installSeccompFilters() error {
if l.conf.DisableSeccomp {
filter.Report("syscall filter is DISABLED. Running in less secure mode.")
} else {
opts := filter.Options{
Platform: l.k.Platform,
HostNetwork: l.conf.Network == NetworkHost,
ProfileEnable: l.conf.ProfileEnable,
ControllerFD: l.ctrl.srv.FD(),
}
if err := filter.Install(opts); err != nil {
return fmt.Errorf("installing seccomp filters: %v", err)
}
}
return nil
}
// Run runs the root container.
func (l *Loader) Run() error {
err := l.run()
l.ctrl.manager.startResultChan <- err
if err != nil {
// Give the controller some time to send the error to the
// runtime. If we return too quickly here the process will exit
// and the control connection will be closed before the error
// is returned.
gtime.Sleep(2 * gtime.Second)
return err
}
return nil
}
func (l *Loader) run() error {
if l.conf.Network == NetworkHost {
// Delay host network configuration to this point because network namespace
// is configured after the loader is created and before Run() is called.
log.Debugf("Configuring host network")
stack := l.k.RootNetworkNamespace().Stack().(*hostinet.Stack)
if err := stack.Configure(); err != nil {
return err
}
}
l.mu.Lock()
defer l.mu.Unlock()
eid := execID{cid: l.sandboxID}
ep, ok := l.processes[eid]
if !ok {
return fmt.Errorf("trying to start deleted container %q", l.sandboxID)
}
// If we are restoring, we do not want to create a process.
// l.restore is set by the container manager when a restore call is made.
var ttyFile *host.TTYFileOperations
var ttyFileVFS2 *hostvfs2.TTYFileDescription
if !l.restore {
if l.conf.ProfileEnable {
pprof.Initialize()
}
// Finally done with all configuration. Setup filters before user code
// is loaded.
if err := l.installSeccompFilters(); err != nil {
return err
}
// Create the FD table, which will set stdin, stdout, and stderr. If console
// is true, then ioctl calls will be passed through to the host FD.
ctx := l.rootProcArgs.NewContext(l.k)
var err error
// CreateProcess takes a reference on the FDTable if successful. We won't
// need ours either way.
l.rootProcArgs.FDTable, ttyFile, ttyFileVFS2, err = createFDTable(ctx, l.console, l.stdioFDs)
if err != nil {
return fmt.Errorf("importing fds: %v", err)
}
// Setup the root container file system.
l.startGoferMonitor(l.sandboxID, l.goferFDs)
mntr := newContainerMounter(l.spec, l.goferFDs, l.k, l.mountHints)
if err := mntr.processHints(l.conf); err != nil {
return err
}
if err := setupContainerFS(ctx, l.conf, mntr, &l.rootProcArgs); err != nil {
return err
}
// Add the HOME environment variable if it is not already set.
var envv []string
if kernel.VFS2Enabled {
envv, err = user.MaybeAddExecUserHomeVFS2(ctx, l.rootProcArgs.MountNamespaceVFS2,
l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
} else {
envv, err = user.MaybeAddExecUserHome(ctx, l.rootProcArgs.MountNamespace,
l.rootProcArgs.Credentials.RealKUID, l.rootProcArgs.Envv)
}
if err != nil {
return err
}
l.rootProcArgs.Envv = envv
// Create the root container init task. It will begin running
// when the kernel is started.
if _, _, err := l.k.CreateProcess(l.rootProcArgs); err != nil {
return fmt.Errorf("creating init process: %v", err)
}
// CreateProcess takes a reference on FDTable if successful.
l.rootProcArgs.FDTable.DecRef()
}
ep.tg = l.k.GlobalInit()
if ns, ok := specutils.GetNS(specs.PIDNamespace, l.spec); ok {
ep.pidnsPath = ns.Path
}
if l.console {
// Set the foreground process group on the TTY to the global init process
// group, since that is what we are about to start running.
switch {
case ttyFileVFS2 != nil:
ep.ttyVFS2 = ttyFileVFS2
ttyFileVFS2.InitForegroundProcessGroup(ep.tg.ProcessGroup())
case ttyFile != nil:
ep.tty = ttyFile
ttyFile.InitForegroundProcessGroup(ep.tg.ProcessGroup())
}
}
// Handle signals by forwarding them to the root container process
// (except for panic signal, which should cause a panic).
l.stopSignalForwarding = sighandling.StartSignalForwarding(func(sig linux.Signal) {
// Panic signal should cause a panic.
if l.conf.PanicSignal != -1 && sig == linux.Signal(l.conf.PanicSignal) {
panic("Signal-induced panic")
}
// Otherwise forward to root container.
deliveryMode := DeliverToProcess
if l.console {
// Since we are running with a console, we should forward the signal to
// the foreground process group so that job control signals like ^C can
// be handled properly.
deliveryMode = DeliverToForegroundProcessGroup
}
log.Infof("Received external signal %d, mode: %v", sig, deliveryMode)
if err := l.signal(l.sandboxID, 0, int32(sig), deliveryMode); err != nil {
log.Warningf("error sending signal %v to container %q: %v", sig, l.sandboxID, err)
}
})
// l.stdioFDs are derived from dup() in boot.New() and have now been dup()ed
// again, either in createFDTable() during initial start or in
// descriptor.initAfterLoad() during restore, so we can release l.stdioFDs
// now. VFS2 takes ownership of the passed FDs, so only close for VFS1.
if !kernel.VFS2Enabled {
for _, fd := range l.stdioFDs {
err := syscall.Close(fd)
if err != nil {
return fmt.Errorf("close dup()ed stdioFDs: %v", err)
}
}
}
log.Infof("Process should have started...")
l.watchdog.Start()
return l.k.Start()
}
// createContainer creates a new container inside the sandbox.
func (l *Loader) createContainer(cid string) error {
l.mu.Lock()
defer l.mu.Unlock()
eid := execID{cid: cid}
if _, ok := l.processes[eid]; ok {
return fmt.Errorf("container %q already exists", cid)
}
l.processes[eid] = &execProcess{}
return nil
}
// startContainer starts a child container. Caller owns 'files' and may close
// them after this method returns.
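// For illustration, with a single gofer connection 'files' is laid out as
// follows (a sketch of the convention used in the body, not a public API):
//
//     files[0], files[1], files[2] // stdin, stdout, stderr
//     files[3:]                    // gofer connection FDs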
func (l *Loader) startContainer(spec *specs.Spec, conf *Config, cid string, files []*os.File) error {
// Create capabilities.
caps, err := specutils.Capabilities(conf.EnableRaw, spec.Process.Capabilities)
if err != nil {
return fmt.Errorf("creating capabilities: %v", err)
}
l.mu.Lock()
defer l.mu.Unlock()
eid := execID{cid: cid}
if _, ok := l.processes[eid]; !ok {
return fmt.Errorf("trying to start a deleted container %q", cid)
}
// Convert the spec's additional GIDs to KGIDs.
extraKGIDs := make([]auth.KGID, 0, len(spec.Process.User.AdditionalGids))
for _, GID := range spec.Process.User.AdditionalGids {
extraKGIDs = append(extraKGIDs, auth.KGID(GID))
}
// Create credentials. We reuse the root user namespace because the
// sentry currently supports only 1 mount namespace, which is tied to a
// single user namespace. Thus we must run in the same user namespace
// to access mounts.
creds := auth.NewUserCredentials(
auth.KUID(spec.Process.User.UID),
auth.KGID(spec.Process.User.GID),
extraKGIDs,
caps,
l.k.RootUserNamespace())
var pidns *kernel.PIDNamespace
if ns, ok := specutils.GetNS(specs.PIDNamespace, spec); ok {
if ns.Path != "" {
for _, p := range l.processes {
if ns.Path == p.pidnsPath {
pidns = p.tg.PIDNamespace()
break
}
}
}
if pidns == nil {
pidns = l.k.RootPIDNamespace().NewChild(l.k.RootUserNamespace())
}
l.processes[eid].pidnsPath = ns.Path
} else {
pidns = l.k.RootPIDNamespace()
}
procArgs, err := newProcess(cid, spec, creds, l.k, pidns)
if err != nil {
return fmt.Errorf("creating new process: %v", err)
}
// setupContainerFS() dups stdioFDs, so we don't need to dup them here.
var stdioFDs []int
for _, f := range files[:3] {
stdioFDs = append(stdioFDs, int(f.Fd()))
}
// Create the FD table, which will set stdin, stdout, and stderr.
ctx := procArgs.NewContext(l.k)
fdTable, _, _, err := createFDTable(ctx, false, stdioFDs)
if err != nil {
return fmt.Errorf("importing fds: %v", err)
}
// CreateProcess takes a reference on fdTable if successful. We won't
// need ours either way.
procArgs.FDTable = fdTable
// Can't take ownership away from os.File. Dup them to get new FDs.
var goferFDs []int
for _, f := range files[3:] {
fd, err := syscall.Dup(int(f.Fd()))
if err != nil {
return fmt.Errorf("failed to dup file: %v", err)
}
goferFDs = append(goferFDs, fd)
}
// Setup the child container file system.
l.startGoferMonitor(cid, goferFDs)
mntr := newContainerMounter(spec, goferFDs, l.k, l.mountHints)
if err := setupContainerFS(ctx, conf, mntr, &procArgs); err != nil {
return err
}
// Add the HOME environment variable if it is not already set.
var envv []string
if kernel.VFS2Enabled {
envv, err = user.MaybeAddExecUserHomeVFS2(ctx, procArgs.MountNamespaceVFS2,
procArgs.Credentials.RealKUID, procArgs.Envv)
} else {
envv, err = user.MaybeAddExecUserHome(ctx, procArgs.MountNamespace,
procArgs.Credentials.RealKUID, procArgs.Envv)
}
if err != nil {
return err
}
procArgs.Envv = envv
// Create and start the new process.
tg, _, err := l.k.CreateProcess(procArgs)
if err != nil {
return fmt.Errorf("creating process: %v", err)
}
l.k.StartProcess(tg)
// CreateProcess takes a reference on FDTable if successful.
procArgs.FDTable.DecRef()
l.processes[eid].tg = tg
return nil
}
// startGoferMonitor runs a goroutine to monitor gofer's health. It polls on
// the gofer FDs looking for disconnects, and destroys the container if a
// disconnect occurs in any of the gofer FDs.
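// A sketch of the mechanism: ppoll(2) blocks on each gofer FD with
// POLLHUP|POLLRDHUP set, so it returns only when a gofer disconnects
// (the FD number is hypothetical):
//
//     fds := []unix.PollFd{{Fd: 8, Events: unix.POLLHUP | unix.POLLRDHUP}}
//     unix.Ppoll(fds, nil, nil) // blocks until the gofer side closes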
func (l *Loader) startGoferMonitor(cid string, goferFDs []int) {
go func() {
log.Debugf("Monitoring gofer health for container %q", cid)
var events []unix.PollFd
for _, fd := range goferFDs {
events = append(events, unix.PollFd{
Fd: int32(fd),
Events: unix.POLLHUP | unix.POLLRDHUP,
})
}
_, _, err := specutils.RetryEintr(func() (uintptr, uintptr, error) {
			// Use ppoll instead of poll because it's already whitelisted in seccomp.
n, err := unix.Ppoll(events, nil, nil)
return uintptr(n), 0, err
})
if err != nil {
panic(fmt.Sprintf("Error monitoring gofer FDs: %v", err))
}
// Check if the gofer has stopped as part of normal container destruction.
// This is done just to avoid sending an annoying error message to the log.
// Note that there is a small race window in between mu.Unlock() and the
// lock being reacquired in destroyContainer(), but it's harmless to call
// destroyContainer() multiple times.
l.mu.Lock()
_, ok := l.processes[execID{cid: cid}]
l.mu.Unlock()
if ok {
log.Infof("Gofer socket disconnected, destroying container %q", cid)
if err := l.destroyContainer(cid); err != nil {
log.Warningf("Error destroying container %q after gofer stopped: %v", cid, err)
}
}
}()
}
// destroyContainer stops a container if it is still running and cleans up its
// filesystem.
func (l *Loader) destroyContainer(cid string) error {
l.mu.Lock()
defer l.mu.Unlock()
tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid})
if err != nil {
// Container doesn't exist.
return err
}
// The container exists, but has it been started?
if tg != nil {
if err := l.signalAllProcesses(cid, int32(linux.SIGKILL)); err != nil {
return fmt.Errorf("sending SIGKILL to all container processes: %v", err)
}
// Wait for all processes that belong to the container to exit (including
// exec'd processes).
for _, t := range l.k.TaskSet().Root.Tasks() {
if t.ContainerID() == cid {
t.ThreadGroup().WaitExited()
}
}
// At this point, all processes inside of the container have exited,
// releasing all references to the container's MountNamespace and
// causing all submounts and overlays to be unmounted.
//
// Since the container's MountNamespace has been released,
// MountNamespace.destroy() will have executed, but that function may
// trigger async close operations. We must wait for those to complete
// before returning, otherwise the caller may kill the gofer before
// they complete, causing a cascade of failing RPCs.
fs.AsyncBarrier()
}
// No more failure from this point on. Remove all container thread groups
// from the map.
for key := range l.processes {
if key.cid == cid {
delete(l.processes, key)
}
}
log.Debugf("Container destroyed %q", cid)
return nil
}
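// executeAsync creates a new process inside an already running container, as
// described by 'args', and returns the thread group ID of the new process.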
func (l *Loader) executeAsync(args *control.ExecArgs) (kernel.ThreadID, error) {
// Hold the lock for the entire operation to ensure that exec'd process is
// added to 'processes' in case it races with destroyContainer().
l.mu.Lock()
defer l.mu.Unlock()
tg, err := l.tryThreadGroupFromIDLocked(execID{cid: args.ContainerID})
if err != nil {
return 0, err
}
if tg == nil {
return 0, fmt.Errorf("container %q not started", args.ContainerID)
}
// Get the container MountNamespace from the Task.
if kernel.VFS2Enabled {
// task.MountNamespace() does not take a ref, so we must do so ourselves.
args.MountNamespaceVFS2 = tg.Leader().MountNamespaceVFS2()
args.MountNamespaceVFS2.IncRef()
} else {
tg.Leader().WithMuLocked(func(t *kernel.Task) {
// task.MountNamespace() does not take a ref, so we must do so ourselves.
args.MountNamespace = t.MountNamespace()
args.MountNamespace.IncRef()
})
}
// Add the HOME environment variable if it is not already set.
if kernel.VFS2Enabled {
defer args.MountNamespaceVFS2.DecRef()
root := args.MountNamespaceVFS2.Root()
defer root.DecRef()
ctx := vfs.WithRoot(l.k.SupervisorContext(), root)
envv, err := user.MaybeAddExecUserHomeVFS2(ctx, args.MountNamespaceVFS2, args.KUID, args.Envv)
if err != nil {
return 0, err
}
args.Envv = envv
} else {
defer args.MountNamespace.DecRef()
root := args.MountNamespace.Root()
defer root.DecRef()
ctx := fs.WithRoot(l.k.SupervisorContext(), root)
envv, err := user.MaybeAddExecUserHome(ctx, args.MountNamespace, args.KUID, args.Envv)
if err != nil {
return 0, err
}
args.Envv = envv
}
// Start the process.
proc := control.Proc{Kernel: l.k}
args.PIDNamespace = tg.PIDNamespace()
newTG, tgid, ttyFile, ttyFileVFS2, err := control.ExecAsync(&proc, args)
if err != nil {
return 0, err
}
eid := execID{cid: args.ContainerID, pid: tgid}
l.processes[eid] = &execProcess{
tg: newTG,
tty: ttyFile,
ttyVFS2: ttyFileVFS2,
}
log.Debugf("updated processes: %v", l.processes)
return tgid, nil
}
// waitContainer waits for the init process of a container to exit.
func (l *Loader) waitContainer(cid string, waitStatus *uint32) error {
// Don't defer unlock, as doing so would make it impossible for
// multiple clients to wait on the same container.
tg, err := l.threadGroupFromID(execID{cid: cid})
if err != nil {
return fmt.Errorf("can't wait for container %q: %v", cid, err)
}
// If the thread either has already exited or exits during waiting,
// consider the container exited.
ws := l.wait(tg)
*waitStatus = ws
return nil
}
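// waitPID waits for the process with thread group ID 'tgid' in container
// 'cid' to exit, and writes its exit status to 'waitStatus'.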
func (l *Loader) waitPID(tgid kernel.ThreadID, cid string, waitStatus *uint32) error {
if tgid <= 0 {
return fmt.Errorf("PID (%d) must be positive", tgid)
}
	// Try to find a process that was started via exec.
eid := execID{cid: cid, pid: tgid}
execTG, err := l.threadGroupFromID(eid)
if err == nil {
ws := l.wait(execTG)
*waitStatus = ws
l.mu.Lock()
delete(l.processes, eid)
log.Debugf("updated processes (removal): %v", l.processes)
l.mu.Unlock()
return nil
}
// The caller may be waiting on a process not started directly via exec.
// In this case, find the process in the container's PID namespace.
initTG, err := l.threadGroupFromID(execID{cid: cid})
if err != nil {
return fmt.Errorf("waiting for PID %d: %v", tgid, err)
}
tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
if tg == nil {
return fmt.Errorf("waiting for PID %d: no such process", tgid)
}
if tg.Leader().ContainerID() != cid {
return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
}
ws := l.wait(tg)
*waitStatus = ws
return nil
}
// wait waits for the given thread group to exit and returns its exit status.
func (l *Loader) wait(tg *kernel.ThreadGroup) uint32 {
tg.WaitExited()
return tg.ExitStatus().Status()
}
// WaitForStartSignal waits for a start signal from the control server.
func (l *Loader) WaitForStartSignal() {
<-l.ctrl.manager.startChan
}
// WaitExit waits for the root container to exit, and returns its exit status.
func (l *Loader) WaitExit() kernel.ExitStatus {
// Wait for container.
l.k.WaitExited()
return l.k.GlobalInit().ExitStatus()
}
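// newRootNetworkNamespace creates the root network namespace, using the
// network stack selected by conf.Network.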
func newRootNetworkNamespace(conf *Config, clock tcpip.Clock, uniqueID stack.UniqueID) (*inet.Namespace, error) {
// Create an empty network stack because the network namespace may be empty at
// this point. Netns is configured before Run() is called. Netstack is
// configured using a control uRPC message. Host network is configured inside
// Run().
switch conf.Network {
case NetworkHost:
// No network namespacing support for hostinet yet, hence creator is nil.
return inet.NewRootNamespace(hostinet.NewStack(), nil), nil
case NetworkNone, NetworkSandbox:
s, err := newEmptySandboxNetworkStack(clock, uniqueID)
if err != nil {
return nil, err
}
creator := &sandboxNetstackCreator{
clock: clock,
uniqueID: uniqueID,
}
return inet.NewRootNamespace(s, creator), nil
default:
panic(fmt.Sprintf("invalid network configuration: %v", conf.Network))
}
}
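// newEmptySandboxNetworkStack creates a netstack instance with IPv4, IPv6,
// ARP, TCP, UDP, and ICMP support, but with no NICs or addresses; those are
// configured later via control messages.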
func newEmptySandboxNetworkStack(clock tcpip.Clock, uniqueID stack.UniqueID) (inet.Stack, error) {
netProtos := []stack.NetworkProtocol{ipv4.NewProtocol(), ipv6.NewProtocol(), arp.NewProtocol()}
transProtos := []stack.TransportProtocol{tcp.NewProtocol(), udp.NewProtocol(), icmp.NewProtocol4()}
s := netstack.Stack{stack.New(stack.Options{
NetworkProtocols: netProtos,
TransportProtocols: transProtos,
Clock: clock,
Stats: netstack.Metrics,
HandleLocal: true,
// Enable raw sockets for users with sufficient
// privileges.
RawFactory: raw.EndpointFactory{},
UniqueID: uniqueID,
})}
// Enable SACK Recovery.
if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcp.SACKEnabled(true)); err != nil {
return nil, fmt.Errorf("failed to enable SACK: %s", err)
}
// Set default TTLs as required by socket/netstack.
s.Stack.SetNetworkProtocolOption(ipv4.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
s.Stack.SetNetworkProtocolOption(ipv6.ProtocolNumber, tcpip.DefaultTTLOption(netstack.DefaultTTL))
// Enable Receive Buffer Auto-Tuning.
if err := s.Stack.SetTransportProtocolOption(tcp.ProtocolNumber, tcpip.ModerateReceiveBufferOption(true)); err != nil {
return nil, fmt.Errorf("SetTransportProtocolOption failed: %s", err)
}
return &s, nil
}
// sandboxNetstackCreator implements kernel.NetworkStackCreator.
//
// +stateify savable
type sandboxNetstackCreator struct {
clock tcpip.Clock
uniqueID stack.UniqueID
}
// CreateStack implements kernel.NetworkStackCreator.CreateStack.
func (f *sandboxNetstackCreator) CreateStack() (inet.Stack, error) {
s, err := newEmptySandboxNetworkStack(f.clock, f.uniqueID)
if err != nil {
return nil, err
}
	// Set up loopback.
n := &Network{Stack: s.(*netstack.Stack).Stack}
nicID := tcpip.NICID(f.uniqueID.UniqueID())
link := DefaultLoopbackLink
linkEP := loopback.New()
if err := n.createNICWithAddrs(nicID, link.Name, linkEP, link.Addresses); err != nil {
return nil, err
}
return s, nil
}
// signal sends a signal to one or more processes in a container. If PID is 0,
// then the container init process is used. Depending on the SignalDeliveryMode
// option, the signal may be sent directly to the indicated process, to all
// processes in the container, or to the foreground process group.
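//
// A minimal usage sketch (hypothetical caller; 'l' is a *Loader and 'cid' an
// existing container ID):
//
//	// Deliver SIGTERM to the container's init process (PID 0 selects init).
//	if err := l.signal(cid, 0 /* pid */, int32(linux.SIGTERM), DeliverToProcess); err != nil {
//		log.Warningf("signaling container %q: %v", cid, err)
//	}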
func (l *Loader) signal(cid string, pid, signo int32, mode SignalDeliveryMode) error {
if pid < 0 {
return fmt.Errorf("PID (%d) must be positive", pid)
}
switch mode {
case DeliverToProcess:
if err := l.signalProcess(cid, kernel.ThreadID(pid), signo); err != nil {
return fmt.Errorf("signaling process in container %q PID %d: %v", cid, pid, err)
}
return nil
case DeliverToForegroundProcessGroup:
		if err := l.signalForegroundProcessGroup(cid, kernel.ThreadID(pid), signo); err != nil {
return fmt.Errorf("signaling foreground process group in container %q PID %d: %v", cid, pid, err)
}
return nil
case DeliverToAllProcesses:
if pid != 0 {
return fmt.Errorf("PID (%d) cannot be set when signaling all processes", pid)
}
// Check that the container has actually started before signaling it.
if _, err := l.threadGroupFromID(execID{cid: cid}); err != nil {
return err
}
if err := l.signalAllProcesses(cid, signo); err != nil {
return fmt.Errorf("signaling all processes in container %q: %v", cid, err)
}
return nil
default:
panic(fmt.Sprintf("unknown signal delivery mode %v", mode))
}
}
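// signalProcess sends signal 'signo' to the process with thread group ID
// 'tgid' in container 'cid'. It first looks for a process started via exec;
// failing that, it looks up the PID in the container's PID namespace.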
func (l *Loader) signalProcess(cid string, tgid kernel.ThreadID, signo int32) error {
execTG, err := l.threadGroupFromID(execID{cid: cid, pid: tgid})
if err == nil {
// Send signal directly to the identified process.
return l.k.SendExternalSignalThreadGroup(execTG, &arch.SignalInfo{Signo: signo})
}
// The caller may be signaling a process not started directly via exec.
// In this case, find the process in the container's PID namespace and
// signal it.
initTG, err := l.threadGroupFromID(execID{cid: cid})
if err != nil {
return fmt.Errorf("no thread group found: %v", err)
}
tg := initTG.PIDNamespace().ThreadGroupWithID(tgid)
if tg == nil {
return fmt.Errorf("no such process with PID %d", tgid)
}
if tg.Leader().ContainerID() != cid {
return fmt.Errorf("process %d is part of a different container: %q", tgid, tg.Leader().ContainerID())
}
return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
}
// signalForegroundProcessGroup looks up the foreground process group from the
// TTY for the given "tgid" inside container "cid", and sends the signal to it.
func (l *Loader) signalForegroundProcessGroup(cid string, tgid kernel.ThreadID, signo int32) error {
l.mu.Lock()
tg, err := l.tryThreadGroupFromIDLocked(execID{cid: cid, pid: tgid})
if err != nil {
l.mu.Unlock()
return fmt.Errorf("no thread group found: %v", err)
}
if tg == nil {
l.mu.Unlock()
return fmt.Errorf("container %q not started", cid)
}
tty, ttyVFS2, err := l.ttyFromIDLocked(execID{cid: cid, pid: tgid})
l.mu.Unlock()
if err != nil {
return fmt.Errorf("no thread group found: %v", err)
}
var pg *kernel.ProcessGroup
switch {
case ttyVFS2 != nil:
pg = ttyVFS2.ForegroundProcessGroup()
case tty != nil:
pg = tty.ForegroundProcessGroup()
default:
return fmt.Errorf("no TTY attached")
}
if pg == nil {
// No foreground process group has been set. Signal the
// original thread group.
log.Warningf("No foreground process group for container %q and PID %d. Sending signal directly to PID %d.", cid, tgid, tgid)
return l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo})
}
// Send the signal to all processes in the process group.
var lastErr error
for _, tg := range l.k.TaskSet().Root.ThreadGroups() {
if tg.ProcessGroup() != pg {
continue
}
if err := l.k.SendExternalSignalThreadGroup(tg, &arch.SignalInfo{Signo: signo}); err != nil {
lastErr = err
}
}
return lastErr
}
// signalAllProcesses sends a signal to all processes that belong to the
// specified container. It's a no-op if the container hasn't started or has
// exited.
func (l *Loader) signalAllProcesses(cid string, signo int32) error {
// Pause the kernel to prevent new processes from being created while
// the signal is delivered. This prevents process leaks when SIGKILL is
// sent to the entire container.
l.k.Pause()
defer l.k.Unpause()
return l.k.SendContainerSignal(cid, &arch.SignalInfo{Signo: signo})
}
// threadGroupFromID is similar to tryThreadGroupFromIDLocked, except that it
// acquires the mutex before calling it and fails if the container hasn't
// started yet.
func (l *Loader) threadGroupFromID(key execID) (*kernel.ThreadGroup, error) {
l.mu.Lock()
defer l.mu.Unlock()
tg, err := l.tryThreadGroupFromIDLocked(key)
if err != nil {
return nil, err
}
if tg == nil {
return nil, fmt.Errorf("container %q not started", key.cid)
}
return tg, nil
}
// tryThreadGroupFromIDLocked returns the thread group for the given execution
// ID. It may return nil if the container has not started yet. Returns an
// error if the execution ID is invalid or if the container cannot be found
// (perhaps it has been deleted). Caller must hold 'mu'.
func (l *Loader) tryThreadGroupFromIDLocked(key execID) (*kernel.ThreadGroup, error) {
ep := l.processes[key]
if ep == nil {
return nil, fmt.Errorf("container %q not found", key.cid)
}
return ep.tg, nil
}
// ttyFromIDLocked returns the TTY files for the given execution ID. It may
// return nil if the container has not started yet. Returns an error if the
// execution ID is invalid or if the container cannot be found (perhaps it has
// been deleted). Caller must hold 'mu'.
func (l *Loader) ttyFromIDLocked(key execID) (*host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
ep := l.processes[key]
if ep == nil {
return nil, nil, fmt.Errorf("container %q not found", key.cid)
}
return ep.tty, ep.ttyVFS2, nil
}
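// createFDTable imports the given stdio FDs into a new FD table. If console
// is true, the stdio FDs are connected to a TTY, and the TTY file (VFS1 or
// VFS2, depending on configuration) is returned along with the table.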
func createFDTable(ctx context.Context, console bool, stdioFDs []int) (*kernel.FDTable, *host.TTYFileOperations, *hostvfs2.TTYFileDescription, error) {
if len(stdioFDs) != 3 {
return nil, nil, nil, fmt.Errorf("stdioFDs should contain exactly 3 FDs (stdin, stdout, and stderr), but %d FDs received", len(stdioFDs))
}
k := kernel.KernelFromContext(ctx)
fdTable := k.NewFDTable()
ttyFile, ttyFileVFS2, err := fdimport.Import(ctx, fdTable, console, stdioFDs)
if err != nil {
fdTable.DecRef()
return nil, nil, nil, err
}
return fdTable, ttyFile, ttyFileVFS2, nil
}