2019-04-29 21:25:05 +00:00
|
|
|
// Copyright 2018 The gVisor Authors.
|
2018-04-27 17:37:02 +00:00
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
// Package kernel provides an emulation of the Linux kernel.
|
|
|
|
//
|
|
|
|
// See README.md for a detailed overview.
|
|
|
|
//
|
|
|
|
// Lock order (outermost locks must be taken first):
|
|
|
|
//
|
|
|
|
// Kernel.extMu
|
2018-08-23 23:31:25 +00:00
|
|
|
// ThreadGroup.timerMu
|
2018-10-17 22:48:55 +00:00
|
|
|
// ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer)
|
2018-08-23 23:31:25 +00:00
|
|
|
// TaskSet.mu
|
|
|
|
// SignalHandlers.mu
|
|
|
|
// Task.mu
|
Disable cpuClockTicker when app is idle
Kernel.cpuClockTicker increments kernel.cpuClock, which tasks use as a clock to
track their CPU usage. This improves latency in the syscall path by avoid
expensive monotonic clock calls on every syscall entry/exit.
However, this timer fires every 10ms. Thus, when all tasks are idle (i.e.,
blocked or stopped), this forces a sentry wakeup every 10ms, when we may
otherwise be able to sleep until the next app-relevant event. These wakeups
cause the sentry to utilize approximately 2% CPU when the application is
otherwise idle.
Updates to clock are not strictly necessary when the app is idle, as there are
no readers of cpuClock. This commit reduces idle CPU by disabling the timer
when tasks are completely idle, and computing its effects at the next wakeup.
Rather than disabling the timer as soon as the app goes idle, we wait until the
next tick, which provides a window for short sleeps to sleep and wakeup without
doing the (relatively) expensive work of disabling and enabling the timer.
PiperOrigin-RevId: 272265822
2019-10-01 19:13:09 +00:00
|
|
|
// runningTasksMu
|
2018-04-27 17:37:02 +00:00
|
|
|
//
|
|
|
|
// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
|
|
|
|
// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
|
|
|
|
// time requires locking all of their signal mutexes first.
|
|
|
|
package kernel
|
|
|
|
|
|
|
|
import (
|
2019-01-08 20:56:59 +00:00
|
|
|
"errors"
|
2018-04-27 17:37:02 +00:00
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"path/filepath"
|
|
|
|
"sync"
|
|
|
|
"sync/atomic"
|
|
|
|
"time"
|
|
|
|
|
2019-06-13 23:49:09 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
|
|
|
"gvisor.dev/gvisor/pkg/cpuid"
|
|
|
|
"gvisor.dev/gvisor/pkg/eventchannel"
|
|
|
|
"gvisor.dev/gvisor/pkg/log"
|
|
|
|
"gvisor.dev/gvisor/pkg/refs"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/arch"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/context"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fs"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/inet"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
|
|
|
|
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/limits"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/loader"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/mm"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/pgalloc"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/platform"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
|
|
|
|
sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/unimpl"
|
|
|
|
uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
|
|
|
|
"gvisor.dev/gvisor/pkg/state"
|
|
|
|
"gvisor.dev/gvisor/pkg/tcpip"
|
2018-04-27 17:37:02 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// Kernel represents an emulated Linux kernel. It must be initialized by calling
|
|
|
|
// Init() or LoadFrom().
|
2018-08-02 17:41:44 +00:00
|
|
|
//
|
|
|
|
// +stateify savable
|
2018-04-27 17:37:02 +00:00
|
|
|
type Kernel struct {
|
|
|
|
// extMu serializes external changes to the Kernel with calls to
|
|
|
|
// Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
|
|
|
|
// remains frozen for the duration of the call; it requires that the Kernel
|
|
|
|
// is paused as a precondition, which ensures that none of the tasks
|
|
|
|
// running within the Kernel can affect its state, but extMu is required to
|
|
|
|
// ensure that concurrent users of the Kernel *outside* the Kernel's
|
|
|
|
// control cannot affect its state by calling e.g.
|
|
|
|
// Kernel.SendExternalSignal.)
|
|
|
|
extMu sync.Mutex `state:"nosave"`
|
|
|
|
|
|
|
|
// started is true if Start has been called. Unless otherwise specified,
|
|
|
|
// all Kernel fields become immutable once started becomes true.
|
|
|
|
started bool `state:"nosave"`
|
|
|
|
|
|
|
|
// All of the following fields are immutable unless otherwise specified.
|
|
|
|
|
2019-03-14 15:11:36 +00:00
|
|
|
// Platform is the platform that is used to execute tasks in the created
|
|
|
|
// Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is
|
|
|
|
// embedded anonymously (the same issue applies).
|
2018-04-27 17:37:02 +00:00
|
|
|
platform.Platform `state:"nosave"`
|
|
|
|
|
2019-03-14 15:11:36 +00:00
|
|
|
// mf provides application memory.
|
|
|
|
mf *pgalloc.MemoryFile `state:"nosave"`
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// See InitKernelArgs for the meaning of these fields.
|
2018-09-07 17:44:50 +00:00
|
|
|
featureSet *cpuid.FeatureSet
|
|
|
|
timekeeper *Timekeeper
|
|
|
|
tasks *TaskSet
|
|
|
|
rootUserNamespace *auth.UserNamespace
|
|
|
|
networkStack inet.Stack `state:"nosave"`
|
|
|
|
applicationCores uint
|
|
|
|
useHostCores bool
|
|
|
|
extraAuxv []arch.AuxEntry
|
|
|
|
vdso *loader.VDSO
|
|
|
|
rootUTSNamespace *UTSNamespace
|
|
|
|
rootIPCNamespace *IPCNamespace
|
|
|
|
rootAbstractSocketNamespace *AbstractSocketNamespace
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2018-10-08 17:19:27 +00:00
|
|
|
// futexes is the "root" futex.Manager, from which all others are forked.
|
|
|
|
// This is necessary to ensure that shared futexes are coherent across all
|
|
|
|
// tasks, including those created by CreateProcess.
|
|
|
|
futexes *futex.Manager
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// globalInit is the thread group whose leader has ID 1 in the root PID
|
|
|
|
// namespace. globalInit is stored separately so that it is accessible even
|
|
|
|
// after all tasks in the thread group have exited, such that ID 1 is no
|
|
|
|
// longer mapped.
|
|
|
|
//
|
|
|
|
// globalInit is mutable until it is assigned by the first successful call
|
|
|
|
// to CreateProcess, and is protected by extMu.
|
|
|
|
globalInit *ThreadGroup
|
|
|
|
|
|
|
|
// realtimeClock is a ktime.Clock based on timekeeper's Realtime.
|
|
|
|
realtimeClock *timekeeperClock
|
|
|
|
|
|
|
|
// monotonicClock is a ktime.Clock based on timekeeper's Monotonic.
|
|
|
|
monotonicClock *timekeeperClock
|
|
|
|
|
|
|
|
// syslog is the kernel log.
|
|
|
|
syslog syslog
|
|
|
|
|
Disable cpuClockTicker when app is idle
Kernel.cpuClockTicker increments kernel.cpuClock, which tasks use as a clock to
track their CPU usage. This improves latency in the syscall path by avoid
expensive monotonic clock calls on every syscall entry/exit.
However, this timer fires every 10ms. Thus, when all tasks are idle (i.e.,
blocked or stopped), this forces a sentry wakeup every 10ms, when we may
otherwise be able to sleep until the next app-relevant event. These wakeups
cause the sentry to utilize approximately 2% CPU when the application is
otherwise idle.
Updates to clock are not strictly necessary when the app is idle, as there are
no readers of cpuClock. This commit reduces idle CPU by disabling the timer
when tasks are completely idle, and computing its effects at the next wakeup.
Rather than disabling the timer as soon as the app goes idle, we wait until the
next tick, which provides a window for short sleeps to sleep and wakeup without
doing the (relatively) expensive work of disabling and enabling the timer.
PiperOrigin-RevId: 272265822
2019-10-01 19:13:09 +00:00
|
|
|
// runningTasksMu synchronizes disable/enable of cpuClockTicker when
|
|
|
|
// the kernel is idle (runningTasks == 0).
|
|
|
|
//
|
|
|
|
// runningTasksMu is used to exclude critical sections when the timer
|
|
|
|
// disables itself and when the first active task enables the timer,
|
|
|
|
// ensuring that tasks always see a valid cpuClock value.
|
|
|
|
runningTasksMu sync.Mutex `state:"nosave"`
|
|
|
|
|
|
|
|
// runningTasks is the total count of tasks currently in
|
|
|
|
// TaskGoroutineRunningSys or TaskGoroutineRunningApp. i.e., they are
|
|
|
|
// not blocked or stopped.
|
|
|
|
//
|
|
|
|
// runningTasks must be accessed atomically. Increments from 0 to 1 are
|
|
|
|
// further protected by runningTasksMu (see incRunningTasks).
|
|
|
|
runningTasks int64
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// cpuClock is incremented every linux.ClockTick. cpuClock is used to
|
|
|
|
// measure task CPU usage, since sampling monotonicClock twice on every
|
|
|
|
// syscall turns out to be unreasonably expensive. This is similar to how
|
|
|
|
// Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING),
|
|
|
|
// although Linux also uses scheduler timing information to improve
|
|
|
|
// resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do
|
|
|
|
// since "preeemptive" scheduling is managed by the Go runtime, which
|
|
|
|
// doesn't provide this information.
|
|
|
|
//
|
|
|
|
// cpuClock is mutable, and is accessed using atomic memory operations.
|
|
|
|
cpuClock uint64
|
|
|
|
|
|
|
|
// cpuClockTicker increments cpuClock.
|
|
|
|
cpuClockTicker *ktime.Timer `state:"nosave"`
|
|
|
|
|
Disable cpuClockTicker when app is idle
Kernel.cpuClockTicker increments kernel.cpuClock, which tasks use as a clock to
track their CPU usage. This improves latency in the syscall path by avoid
expensive monotonic clock calls on every syscall entry/exit.
However, this timer fires every 10ms. Thus, when all tasks are idle (i.e.,
blocked or stopped), this forces a sentry wakeup every 10ms, when we may
otherwise be able to sleep until the next app-relevant event. These wakeups
cause the sentry to utilize approximately 2% CPU when the application is
otherwise idle.
Updates to clock are not strictly necessary when the app is idle, as there are
no readers of cpuClock. This commit reduces idle CPU by disabling the timer
when tasks are completely idle, and computing its effects at the next wakeup.
Rather than disabling the timer as soon as the app goes idle, we wait until the
next tick, which provides a window for short sleeps to sleep and wakeup without
doing the (relatively) expensive work of disabling and enabling the timer.
PiperOrigin-RevId: 272265822
2019-10-01 19:13:09 +00:00
|
|
|
// cpuClockTickerDisabled indicates that cpuClockTicker has been
|
|
|
|
// disabled because no tasks are running.
|
|
|
|
//
|
|
|
|
// cpuClockTickerDisabled is protected by runningTasksMu.
|
|
|
|
cpuClockTickerDisabled bool
|
|
|
|
|
|
|
|
// cpuClockTickerSetting is the ktime.Setting of cpuClockTicker at the
|
|
|
|
// point it was disabled. It is cached here to avoid a lock ordering
|
|
|
|
// violation with cpuClockTicker.mu when runningTaskMu is held.
|
|
|
|
//
|
|
|
|
// cpuClockTickerSetting is only valid when cpuClockTickerDisabled is
|
|
|
|
// true.
|
|
|
|
//
|
|
|
|
// cpuClockTickerSetting is protected by runningTasksMu.
|
|
|
|
cpuClockTickerSetting ktime.Setting
|
|
|
|
|
2019-07-03 02:27:51 +00:00
|
|
|
// fdMapUids is an ever-increasing counter for generating FDTable uids.
|
2018-04-27 17:37:02 +00:00
|
|
|
//
|
|
|
|
// fdMapUids is mutable, and is accessed using atomic memory operations.
|
|
|
|
fdMapUids uint64
|
|
|
|
|
|
|
|
// uniqueID is used to generate unique identifiers.
|
|
|
|
//
|
|
|
|
// uniqueID is mutable, and is accessed using atomic memory operations.
|
|
|
|
uniqueID uint64
|
|
|
|
|
|
|
|
// nextInotifyCookie is a monotonically increasing counter used for
|
|
|
|
// generating unique inotify event cookies.
|
|
|
|
//
|
2019-02-07 22:43:18 +00:00
|
|
|
// nextInotifyCookie is mutable, and is accessed using atomic memory
|
2018-04-27 17:37:02 +00:00
|
|
|
// operations.
|
|
|
|
nextInotifyCookie uint32
|
|
|
|
|
|
|
|
// netlinkPorts manages allocation of netlink socket port IDs.
|
|
|
|
netlinkPorts *port.Manager
|
|
|
|
|
2019-04-23 01:17:25 +00:00
|
|
|
// saveErr is the error causing the sandbox to exit during save, if
|
|
|
|
// any. It is protected by extMu.
|
|
|
|
saveErr error `state:"nosave"`
|
2018-07-10 16:22:37 +00:00
|
|
|
|
|
|
|
// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
|
|
|
|
danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`
|
2019-02-07 22:43:18 +00:00
|
|
|
|
2019-06-10 22:16:42 +00:00
|
|
|
// sockets is the list of all network sockets the system. Protected by
|
2019-02-07 22:43:18 +00:00
|
|
|
// extMu.
|
2019-06-10 22:16:42 +00:00
|
|
|
sockets socketList
|
|
|
|
|
|
|
|
// nextSocketEntry is the next entry number to use in sockets. Protected
|
|
|
|
// by extMu.
|
|
|
|
nextSocketEntry uint64
|
2019-04-01 22:38:08 +00:00
|
|
|
|
|
|
|
// deviceRegistry is used to save/restore device.SimpleDevices.
|
|
|
|
deviceRegistry struct{} `state:".(*device.Registry)"`
|
2019-04-17 19:56:23 +00:00
|
|
|
|
|
|
|
// DirentCacheLimiter controls the number of total dirent entries can be in
|
|
|
|
// caches. Not all caches use it, only the caches that use host resources use
|
|
|
|
// the limiter. It may be nil if disabled.
|
|
|
|
DirentCacheLimiter *fs.DirentCacheLimiter
|
2019-07-30 00:11:27 +00:00
|
|
|
|
2019-07-31 18:59:21 +00:00
|
|
|
// unimplementedSyscallEmitterOnce is used in the initialization of
|
|
|
|
// unimplementedSyscallEmitter.
|
|
|
|
unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`
|
|
|
|
|
2019-07-30 00:11:27 +00:00
|
|
|
// unimplementedSyscallEmitter is used to emit unimplemented syscall
|
|
|
|
// events. This is initialized lazily on the first unimplemented
|
|
|
|
// syscall.
|
|
|
|
unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// InitKernelArgs holds arguments to Init.
|
|
|
|
type InitKernelArgs struct {
|
|
|
|
// FeatureSet is the emulated CPU feature set.
|
|
|
|
FeatureSet *cpuid.FeatureSet
|
|
|
|
|
|
|
|
// Timekeeper manages time for all tasks in the system.
|
|
|
|
Timekeeper *Timekeeper
|
|
|
|
|
|
|
|
// RootUserNamespace is the root user namespace.
|
|
|
|
RootUserNamespace *auth.UserNamespace
|
|
|
|
|
|
|
|
// NetworkStack is the TCP/IP network stack. NetworkStack may be nil.
|
|
|
|
NetworkStack inet.Stack
|
|
|
|
|
|
|
|
// ApplicationCores is the number of logical CPUs visible to sandboxed
|
|
|
|
// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
|
|
|
|
// ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
|
|
|
|
// most significant bit in cpu_possible_mask + 1.
|
|
|
|
ApplicationCores uint
|
|
|
|
|
|
|
|
// If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
|
|
|
|
// instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
|
|
|
|
// no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
|
|
|
|
// will be overridden.
|
|
|
|
UseHostCores bool
|
|
|
|
|
|
|
|
// ExtraAuxv contains additional auxiliary vector entries that are added to
|
|
|
|
// each process by the ELF loader.
|
|
|
|
ExtraAuxv []arch.AuxEntry
|
|
|
|
|
|
|
|
// Vdso holds the VDSO and its parameter page.
|
|
|
|
Vdso *loader.VDSO
|
|
|
|
|
2018-09-07 17:44:50 +00:00
|
|
|
// RootUTSNamespace is the root UTS namespace.
|
2018-04-27 17:37:02 +00:00
|
|
|
RootUTSNamespace *UTSNamespace
|
|
|
|
|
2018-09-07 17:44:50 +00:00
|
|
|
// RootIPCNamespace is the root IPC namespace.
|
2018-04-27 17:37:02 +00:00
|
|
|
RootIPCNamespace *IPCNamespace
|
2018-09-07 17:44:50 +00:00
|
|
|
|
|
|
|
// RootAbstractSocketNamespace is the root Abstract Socket namespace.
|
|
|
|
RootAbstractSocketNamespace *AbstractSocketNamespace
|
2019-07-26 22:00:49 +00:00
|
|
|
|
|
|
|
// PIDNamespace is the root PID namespace.
|
|
|
|
PIDNamespace *PIDNamespace
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Init initialize the Kernel with no tasks.
|
|
|
|
//
|
2019-03-14 15:11:36 +00:00
|
|
|
// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile
|
|
|
|
// before calling Init.
|
2018-04-27 17:37:02 +00:00
|
|
|
func (k *Kernel) Init(args InitKernelArgs) error {
|
|
|
|
if args.FeatureSet == nil {
|
|
|
|
return fmt.Errorf("FeatureSet is nil")
|
|
|
|
}
|
|
|
|
if args.Timekeeper == nil {
|
|
|
|
return fmt.Errorf("Timekeeper is nil")
|
|
|
|
}
|
|
|
|
if args.RootUserNamespace == nil {
|
|
|
|
return fmt.Errorf("RootUserNamespace is nil")
|
|
|
|
}
|
|
|
|
if args.ApplicationCores == 0 {
|
|
|
|
return fmt.Errorf("ApplicationCores is 0")
|
|
|
|
}
|
|
|
|
|
|
|
|
k.featureSet = args.FeatureSet
|
|
|
|
k.timekeeper = args.Timekeeper
|
2019-07-26 22:00:49 +00:00
|
|
|
k.tasks = newTaskSet(args.PIDNamespace)
|
2018-04-27 17:37:02 +00:00
|
|
|
k.rootUserNamespace = args.RootUserNamespace
|
|
|
|
k.rootUTSNamespace = args.RootUTSNamespace
|
|
|
|
k.rootIPCNamespace = args.RootIPCNamespace
|
2018-09-07 17:44:50 +00:00
|
|
|
k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
|
2018-04-27 17:37:02 +00:00
|
|
|
k.networkStack = args.NetworkStack
|
|
|
|
k.applicationCores = args.ApplicationCores
|
|
|
|
if args.UseHostCores {
|
|
|
|
k.useHostCores = true
|
|
|
|
maxCPU, err := hostcpu.MaxPossibleCPU()
|
|
|
|
if err != nil {
|
|
|
|
return fmt.Errorf("Failed to get maximum CPU number: %v", err)
|
|
|
|
}
|
|
|
|
minAppCores := uint(maxCPU) + 1
|
|
|
|
if k.applicationCores < minAppCores {
|
|
|
|
log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
|
|
|
|
k.applicationCores = minAppCores
|
|
|
|
}
|
|
|
|
}
|
|
|
|
k.extraAuxv = args.ExtraAuxv
|
|
|
|
k.vdso = args.Vdso
|
|
|
|
k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime}
|
|
|
|
k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
|
2018-10-08 17:19:27 +00:00
|
|
|
k.futexes = futex.NewManager()
|
2018-04-27 17:37:02 +00:00
|
|
|
k.netlinkPorts = port.New()
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// SaveTo saves the state of k to w.
|
|
|
|
//
|
|
|
|
// Preconditions: The kernel must be paused throughout the call to SaveTo.
|
|
|
|
func (k *Kernel) SaveTo(w io.Writer) error {
|
|
|
|
saveStart := time.Now()
|
|
|
|
ctx := k.SupervisorContext()
|
|
|
|
|
|
|
|
// Do not allow other Kernel methods to affect it while it's being saved.
|
|
|
|
k.extMu.Lock()
|
|
|
|
defer k.extMu.Unlock()
|
|
|
|
|
|
|
|
// Stop time.
|
|
|
|
k.pauseTimeLocked()
|
|
|
|
defer k.resumeTimeLocked()
|
|
|
|
|
2019-04-30 20:55:41 +00:00
|
|
|
// Evict all evictable MemoryFile allocations.
|
2019-05-10 20:36:56 +00:00
|
|
|
k.mf.StartEvictions()
|
|
|
|
k.mf.WaitForEvictions()
|
2019-04-30 20:55:41 +00:00
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Flush write operations on open files so data reaches backing storage.
|
2019-05-10 20:36:56 +00:00
|
|
|
// This must come after MemoryFile eviction since eviction may cause file
|
2019-04-30 20:55:41 +00:00
|
|
|
// writes.
|
2018-04-27 17:37:02 +00:00
|
|
|
if err := k.tasks.flushWritesToFiles(ctx); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
// Remove all epoll waiter objects from underlying wait queues.
|
|
|
|
// NOTE: for programs to resume execution in future snapshot scenarios,
|
|
|
|
// we will need to re-establish these waiter objects after saving.
|
|
|
|
k.tasks.unregisterEpollWaiters()
|
|
|
|
|
|
|
|
// Clear the dirent cache before saving because Dirents must be Loaded in a
|
|
|
|
// particular order (parents before children), and Loading dirents from a cache
|
|
|
|
// breaks that order.
|
2019-04-10 18:26:10 +00:00
|
|
|
if err := k.flushMountSourceRefs(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
// Ensure that all pending asynchronous work is complete:
|
|
|
|
// - inode and mount release
|
|
|
|
// - asynchronuous IO
|
|
|
|
fs.AsyncBarrier()
|
|
|
|
|
|
|
|
// Once all fs work has completed (flushed references have all been released),
|
|
|
|
// reset mount mappings. This allows individual mounts to save how inodes map
|
|
|
|
// to filesystem resources. Without this, fs.Inodes cannot be restored.
|
|
|
|
fs.SaveInodeMappings()
|
|
|
|
|
|
|
|
// Discard unsavable mappings, such as those for host file descriptors.
|
|
|
|
// This must be done after waiting for "asynchronous fs work", which
|
|
|
|
// includes async I/O that may touch application memory.
|
|
|
|
if err := k.invalidateUnsavableMappings(ctx); err != nil {
|
|
|
|
return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
|
|
|
|
}
|
|
|
|
|
2019-04-26 00:45:56 +00:00
|
|
|
// Save the CPUID FeatureSet before the rest of the kernel so we can
|
|
|
|
// verify its compatibility on restore before attempting to restore the
|
|
|
|
// entire kernel, which may fail on an incompatible machine.
|
|
|
|
//
|
|
|
|
// N.B. This will also be saved along with the full kernel save below.
|
|
|
|
cpuidStart := time.Now()
|
2019-11-01 01:02:04 +00:00
|
|
|
if err := state.Save(k.SupervisorContext(), w, k.FeatureSet(), nil); err != nil {
|
2019-04-26 00:45:56 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
log.Infof("CPUID save took [%s].", time.Since(cpuidStart))
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Save the kernel state.
|
|
|
|
kernelStart := time.Now()
|
|
|
|
var stats state.Stats
|
2019-11-01 01:02:04 +00:00
|
|
|
if err := state.Save(k.SupervisorContext(), w, k, &stats); err != nil {
|
2018-04-27 17:37:02 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
log.Infof("Kernel save stats: %s", &stats)
|
|
|
|
log.Infof("Kernel save took [%s].", time.Since(kernelStart))
|
|
|
|
|
2019-03-14 15:11:36 +00:00
|
|
|
// Save the memory file's state.
|
2018-04-27 17:37:02 +00:00
|
|
|
memoryStart := time.Now()
|
2019-11-01 01:02:04 +00:00
|
|
|
if err := k.mf.SaveTo(k.SupervisorContext(), w); err != nil {
|
2018-04-27 17:37:02 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
log.Infof("Memory save took [%s].", time.Since(memoryStart))
|
|
|
|
|
|
|
|
log.Infof("Overall save took [%s].", time.Since(saveStart))
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2019-04-10 18:26:10 +00:00
|
|
|
// flushMountSourceRefs flushes the MountSources for all mounted filesystems
|
|
|
|
// and open FDs.
|
|
|
|
func (k *Kernel) flushMountSourceRefs() error {
|
2019-08-02 18:21:50 +00:00
|
|
|
// Flush all mount sources for currently mounted filesystems in each task.
|
2019-06-19 16:20:10 +00:00
|
|
|
flushed := make(map[*fs.MountNamespace]struct{})
|
|
|
|
k.tasks.mu.RLock()
|
|
|
|
k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) {
|
|
|
|
if _, ok := flushed[tg.mounts]; ok {
|
|
|
|
// Already flushed.
|
|
|
|
return
|
|
|
|
}
|
|
|
|
tg.mounts.FlushMountSourceRefs()
|
|
|
|
flushed[tg.mounts] = struct{}{}
|
|
|
|
})
|
|
|
|
k.tasks.mu.RUnlock()
|
|
|
|
|
2019-04-10 18:26:10 +00:00
|
|
|
// There may be some open FDs whose filesystems have been unmounted. We
|
|
|
|
// must flush those as well.
|
2019-07-03 02:27:51 +00:00
|
|
|
return k.tasks.forEachFDPaused(func(file *fs.File) error {
|
|
|
|
file.Dirent.Inode.MountSource.FlushDirentRefs()
|
2019-04-10 18:26:10 +00:00
|
|
|
return nil
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
// forEachFDPaused applies the given function to each open file descriptor in each
|
|
|
|
// task.
|
|
|
|
//
|
|
|
|
// Precondition: Must be called with the kernel paused.
|
2019-07-03 02:27:51 +00:00
|
|
|
func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) {
|
2018-04-27 17:37:02 +00:00
|
|
|
ts.mu.RLock()
|
|
|
|
defer ts.mu.RUnlock()
|
|
|
|
for t := range ts.Root.tids {
|
2018-08-31 20:57:02 +00:00
|
|
|
// We can skip locking Task.mu here since the kernel is paused.
|
2019-07-03 02:27:51 +00:00
|
|
|
if t.fdTable == nil {
|
2019-04-10 18:26:10 +00:00
|
|
|
continue
|
|
|
|
}
|
2019-07-03 02:27:51 +00:00
|
|
|
t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
|
|
|
|
if lastErr := f(file); lastErr != nil && err == nil {
|
|
|
|
err = lastErr
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
2019-07-03 02:27:51 +00:00
|
|
|
})
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
2019-07-03 02:27:51 +00:00
|
|
|
return err
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
2019-04-10 18:26:10 +00:00
|
|
|
func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
|
2019-07-03 02:27:51 +00:00
|
|
|
return ts.forEachFDPaused(func(file *fs.File) error {
|
|
|
|
if flags := file.Flags(); !flags.Write {
|
2019-04-10 18:26:10 +00:00
|
|
|
return nil
|
|
|
|
}
|
2019-07-03 02:27:51 +00:00
|
|
|
if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
|
2019-04-10 18:26:10 +00:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
// Here we need all metadata synced.
|
2019-07-03 02:27:51 +00:00
|
|
|
syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
|
2019-04-10 18:26:10 +00:00
|
|
|
if err := fs.SaveFileFsyncError(syncErr); err != nil {
|
2019-07-03 02:27:51 +00:00
|
|
|
name, _ := file.Dirent.FullName(nil /* root */)
|
2019-04-10 18:26:10 +00:00
|
|
|
// Wrap this error in ErrSaveRejection
|
|
|
|
// so that it will trigger a save
|
|
|
|
// error, rather than a panic. This
|
|
|
|
// also allows us to distinguish Fsync
|
|
|
|
// errors from state file errors in
|
|
|
|
// state.Save.
|
|
|
|
return fs.ErrSaveRejection{
|
|
|
|
Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err),
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Preconditions: The kernel must be paused.
|
|
|
|
func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
|
|
|
|
invalidated := make(map[*mm.MemoryManager]struct{})
|
|
|
|
k.tasks.mu.RLock()
|
|
|
|
defer k.tasks.mu.RUnlock()
|
|
|
|
for t := range k.tasks.Root.tids {
|
|
|
|
// We can skip locking Task.mu here since the kernel is paused.
|
|
|
|
if mm := t.tc.MemoryManager; mm != nil {
|
|
|
|
if _, ok := invalidated[mm]; !ok {
|
|
|
|
if err := mm.InvalidateUnsavable(ctx); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
invalidated[mm] = struct{}{}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
// I really wish we just had a sync.Map of all MMs...
|
|
|
|
if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
|
|
|
|
if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (ts *TaskSet) unregisterEpollWaiters() {
|
|
|
|
ts.mu.RLock()
|
|
|
|
defer ts.mu.RUnlock()
|
|
|
|
for t := range ts.Root.tids {
|
2018-08-31 20:57:02 +00:00
|
|
|
// We can skip locking Task.mu here since the kernel is paused.
|
2019-07-03 02:27:51 +00:00
|
|
|
if t.fdTable != nil {
|
|
|
|
t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
|
|
|
|
if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
|
|
|
|
e.UnregisterEpollWaiters()
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
2019-07-03 02:27:51 +00:00
|
|
|
})
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// LoadFrom returns a new Kernel loaded from args.
|
2019-08-08 19:32:00 +00:00
|
|
|
func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
|
2018-04-27 17:37:02 +00:00
|
|
|
loadStart := time.Now()
|
|
|
|
|
|
|
|
k.networkStack = net
|
|
|
|
|
|
|
|
initAppCores := k.applicationCores
|
|
|
|
|
2019-04-26 00:45:56 +00:00
|
|
|
// Load the pre-saved CPUID FeatureSet.
|
|
|
|
//
|
|
|
|
// N.B. This was also saved along with the full kernel below, so we
|
|
|
|
// don't need to explicitly install it in the Kernel.
|
|
|
|
cpuidStart := time.Now()
|
|
|
|
var features cpuid.FeatureSet
|
2019-11-01 01:02:04 +00:00
|
|
|
if err := state.Load(k.SupervisorContext(), r, &features, nil); err != nil {
|
2019-04-26 00:45:56 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
log.Infof("CPUID load took [%s].", time.Since(cpuidStart))
|
|
|
|
|
|
|
|
// Verify that the FeatureSet is usable on this host. We do this before
|
|
|
|
// Kernel load so that the explicit CPUID mismatch error has priority
|
|
|
|
// over floating point state restore errors that may occur on load on
|
|
|
|
// an incompatible machine.
|
|
|
|
if err := features.CheckHostCompatible(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Load the kernel state.
|
|
|
|
kernelStart := time.Now()
|
|
|
|
var stats state.Stats
|
2019-11-01 01:02:04 +00:00
|
|
|
if err := state.Load(k.SupervisorContext(), r, k, &stats); err != nil {
|
2018-04-27 17:37:02 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
log.Infof("Kernel load stats: %s", &stats)
|
|
|
|
log.Infof("Kernel load took [%s].", time.Since(kernelStart))
|
|
|
|
|
2019-03-14 15:11:36 +00:00
|
|
|
// Load the memory file's state.
|
2018-04-27 17:37:02 +00:00
|
|
|
memoryStart := time.Now()
|
2019-11-01 01:02:04 +00:00
|
|
|
if err := k.mf.LoadFrom(k.SupervisorContext(), r); err != nil {
|
2018-04-27 17:37:02 +00:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
log.Infof("Memory load took [%s].", time.Since(memoryStart))
|
|
|
|
|
2019-07-03 02:27:51 +00:00
|
|
|
log.Infof("Overall load took [%s]", time.Since(loadStart))
|
|
|
|
|
2019-08-08 19:32:00 +00:00
|
|
|
k.Timekeeper().SetClocks(clocks)
|
|
|
|
if net != nil {
|
|
|
|
net.Resume()
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Ensure that all pending asynchronous work is complete:
|
|
|
|
// - namedpipe opening
|
|
|
|
// - inode file opening
|
2018-05-08 18:36:11 +00:00
|
|
|
if err := fs.AsyncErrorBarrier(); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2018-04-27 17:37:02 +00:00
|
|
|
|
2018-07-10 16:22:37 +00:00
|
|
|
tcpip.AsyncLoading.Wait()
|
|
|
|
|
2019-08-08 19:32:00 +00:00
|
|
|
log.Infof("Overall load took [%s] after async work", time.Since(loadStart))
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
// Applications may size per-cpu structures based on k.applicationCores, so
|
|
|
|
// it can't change across save/restore. When we are virtualizing CPU
|
|
|
|
// numbers, this isn't a problem. However, when we are exposing host CPU
|
|
|
|
// assignments, we can't tolerate an increase in the number of host CPUs,
|
|
|
|
// which could result in getcpu(2) returning CPUs that applications expect
|
|
|
|
// not to exist.
|
|
|
|
if k.useHostCores && initAppCores > k.applicationCores {
|
|
|
|
return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// UniqueID returns a unique identifier.
|
|
|
|
func (k *Kernel) UniqueID() uint64 {
|
|
|
|
id := atomic.AddUint64(&k.uniqueID, 1)
|
|
|
|
if id == 0 {
|
|
|
|
panic("unique identifier generator wrapped around")
|
|
|
|
}
|
|
|
|
return id
|
|
|
|
}
|
|
|
|
|
|
|
|
// CreateProcessArgs holds arguments to kernel.CreateProcess.
|
|
|
|
type CreateProcessArgs struct {
|
2019-07-30 18:19:18 +00:00
|
|
|
// Filename is the filename to load as the init binary.
|
2018-04-27 17:37:02 +00:00
|
|
|
//
|
2019-07-30 18:19:18 +00:00
|
|
|
// If this is provided as "", File will be checked, then the file will be
|
|
|
|
// guessed via Argv[0].
|
2018-04-27 17:37:02 +00:00
|
|
|
Filename string
|
|
|
|
|
2019-07-30 18:19:18 +00:00
|
|
|
// File is a passed host FD pointing to a file to load as the init binary.
|
|
|
|
//
|
|
|
|
// This is checked if and only if Filename is "".
|
|
|
|
File *fs.File
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Argvv is a list of arguments.
|
|
|
|
Argv []string
|
|
|
|
|
|
|
|
// Envv is a list of environment variables.
|
|
|
|
Envv []string
|
|
|
|
|
|
|
|
// WorkingDirectory is the initial working directory.
|
|
|
|
//
|
|
|
|
// This defaults to the root if empty.
|
|
|
|
WorkingDirectory string
|
|
|
|
|
|
|
|
// Credentials is the initial credentials.
|
|
|
|
Credentials *auth.Credentials
|
|
|
|
|
2019-07-03 02:27:51 +00:00
|
|
|
// FDTable is the initial set of file descriptors. If CreateProcess succeeds,
|
|
|
|
// it takes a reference on FDTable.
|
|
|
|
FDTable *FDTable
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
// Umask is the initial umask.
|
|
|
|
Umask uint
|
|
|
|
|
|
|
|
// Limits is the initial resource limits.
|
|
|
|
Limits *limits.LimitSet
|
|
|
|
|
|
|
|
// MaxSymlinkTraversals is the maximum number of symlinks to follow
|
|
|
|
// during resolution.
|
|
|
|
MaxSymlinkTraversals uint
|
|
|
|
|
|
|
|
// UTSNamespace is the initial UTS namespace.
|
|
|
|
UTSNamespace *UTSNamespace
|
|
|
|
|
|
|
|
// IPCNamespace is the initial IPC namespace.
|
|
|
|
IPCNamespace *IPCNamespace
|
2018-08-15 23:24:07 +00:00
|
|
|
|
2019-04-30 15:35:36 +00:00
|
|
|
// PIDNamespace is the initial PID Namespace.
|
|
|
|
PIDNamespace *PIDNamespace
|
|
|
|
|
2018-09-07 17:44:50 +00:00
|
|
|
// AbstractSocketNamespace is the initial Abstract Socket namespace.
|
|
|
|
AbstractSocketNamespace *AbstractSocketNamespace
|
|
|
|
|
2019-06-19 16:20:10 +00:00
|
|
|
// MountNamespace optionally contains the mount namespace for this
|
2019-08-02 18:21:50 +00:00
|
|
|
// process. If nil, the init process's mount namespace is used.
|
2019-06-19 16:20:10 +00:00
|
|
|
//
|
|
|
|
// Anyone setting MountNamespace must donate a reference (i.e.
|
|
|
|
// increment it).
|
|
|
|
MountNamespace *fs.MountNamespace
|
|
|
|
|
2018-09-27 22:00:03 +00:00
|
|
|
// ContainerID is the container that the process belongs to.
|
|
|
|
ContainerID string
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// NewContext returns a context.Context that represents the task that will be
|
|
|
|
// created by args.NewContext(k).
|
|
|
|
func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext {
|
|
|
|
return &createProcessContext{
|
|
|
|
Logger: log.Log(),
|
|
|
|
k: k,
|
|
|
|
args: args,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// createProcessContext is a context.Context that represents the context
|
|
|
|
// associated with a task that is being created.
|
|
|
|
type createProcessContext struct {
|
|
|
|
context.NoopSleeper
|
|
|
|
log.Logger
|
|
|
|
k *Kernel
|
|
|
|
args *CreateProcessArgs
|
|
|
|
}
|
|
|
|
|
|
|
|
// Value implements context.Context.Value.
|
|
|
|
func (ctx *createProcessContext) Value(key interface{}) interface{} {
|
|
|
|
switch key {
|
|
|
|
case CtxKernel:
|
|
|
|
return ctx.k
|
|
|
|
case CtxPIDNamespace:
|
2019-04-30 15:35:36 +00:00
|
|
|
return ctx.args.PIDNamespace
|
2018-04-27 17:37:02 +00:00
|
|
|
case CtxUTSNamespace:
|
|
|
|
return ctx.args.UTSNamespace
|
|
|
|
case CtxIPCNamespace:
|
|
|
|
return ctx.args.IPCNamespace
|
|
|
|
case auth.CtxCredentials:
|
|
|
|
return ctx.args.Credentials
|
|
|
|
case fs.CtxRoot:
|
2019-08-02 18:21:50 +00:00
|
|
|
if ctx.args.MountNamespace != nil {
|
|
|
|
// MountNamespace.Root() will take a reference on the root
|
|
|
|
// dirent for us.
|
|
|
|
return ctx.args.MountNamespace.Root()
|
2018-09-06 20:46:45 +00:00
|
|
|
}
|
|
|
|
return nil
|
2019-04-17 19:56:23 +00:00
|
|
|
case fs.CtxDirentCacheLimiter:
|
|
|
|
return ctx.k.DirentCacheLimiter
|
2018-04-27 17:37:02 +00:00
|
|
|
case ktime.CtxRealtimeClock:
|
|
|
|
return ctx.k.RealtimeClock()
|
|
|
|
case limits.CtxLimits:
|
|
|
|
return ctx.args.Limits
|
2019-03-14 15:11:36 +00:00
|
|
|
case pgalloc.CtxMemoryFile:
|
|
|
|
return ctx.k.mf
|
|
|
|
case pgalloc.CtxMemoryFileProvider:
|
|
|
|
return ctx.k
|
2018-04-27 17:37:02 +00:00
|
|
|
case platform.CtxPlatform:
|
|
|
|
return ctx.k
|
|
|
|
case uniqueid.CtxGlobalUniqueID:
|
|
|
|
return ctx.k.UniqueID()
|
2018-10-16 00:47:24 +00:00
|
|
|
case uniqueid.CtxGlobalUniqueIDProvider:
|
|
|
|
return ctx.k
|
2018-04-27 17:37:02 +00:00
|
|
|
case uniqueid.CtxInotifyCookie:
|
|
|
|
return ctx.k.GenerateInotifyCookie()
|
2018-10-20 18:12:26 +00:00
|
|
|
case unimpl.CtxEvents:
|
|
|
|
return ctx.k
|
2018-04-27 17:37:02 +00:00
|
|
|
default:
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// CreateProcess creates a new task in a new thread group with the given
|
|
|
|
// options. The new task has no parent and is in the root PID namespace.
|
|
|
|
//
|
2019-02-14 23:46:25 +00:00
|
|
|
// If k.Start() has already been called, then the created process must be
|
|
|
|
// started by calling kernel.StartProcess(tg).
|
|
|
|
//
|
|
|
|
// If k.Start() has not yet been called, then the created task will begin
|
|
|
|
// running when k.Start() is called.
|
2018-04-27 17:37:02 +00:00
|
|
|
//
|
|
|
|
// CreateProcess has no analogue in Linux; it is used to create the initial
|
|
|
|
// application task, as well as processes started by the control server.
|
2018-09-17 23:24:05 +00:00
|
|
|
func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) {
|
2018-04-27 17:37:02 +00:00
|
|
|
k.extMu.Lock()
|
|
|
|
defer k.extMu.Unlock()
|
|
|
|
log.Infof("EXEC: %v", args.Argv)
|
|
|
|
|
2019-06-19 16:20:10 +00:00
|
|
|
// Grab the mount namespace.
|
|
|
|
mounts := args.MountNamespace
|
|
|
|
if mounts == nil {
|
2019-08-02 18:21:50 +00:00
|
|
|
mounts = k.GlobalInit().Leader().MountNamespace()
|
2019-06-19 16:20:10 +00:00
|
|
|
mounts.IncRef()
|
|
|
|
}
|
|
|
|
|
2019-12-26 22:42:19 +00:00
|
|
|
tg := k.NewThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits)
|
2018-04-27 17:37:02 +00:00
|
|
|
ctx := args.NewContext(k)
|
|
|
|
|
2019-08-02 18:21:50 +00:00
|
|
|
// Get the root directory from the MountNamespace.
|
|
|
|
root := mounts.Root()
|
2019-06-19 16:20:10 +00:00
|
|
|
// The call to newFSContext below will take a reference on root, so we
|
|
|
|
// don't need to hold this one.
|
2018-04-27 17:37:02 +00:00
|
|
|
defer root.DecRef()
|
|
|
|
|
|
|
|
// Grab the working directory.
|
2018-12-04 22:31:08 +00:00
|
|
|
remainingTraversals := uint(args.MaxSymlinkTraversals)
|
2018-04-27 17:37:02 +00:00
|
|
|
wd := root // Default.
|
|
|
|
if args.WorkingDirectory != "" {
|
|
|
|
var err error
|
2019-08-02 18:21:50 +00:00
|
|
|
wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
|
2018-04-27 17:37:02 +00:00
|
|
|
if err != nil {
|
2018-09-17 23:24:05 +00:00
|
|
|
return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
defer wd.DecRef()
|
|
|
|
}
|
|
|
|
|
2019-07-30 18:19:18 +00:00
|
|
|
// Check which file to start from.
|
|
|
|
switch {
|
|
|
|
case args.Filename != "":
|
|
|
|
// If a filename is given, take that.
|
|
|
|
// Set File to nil so we resolve the path in LoadTaskImage.
|
|
|
|
args.File = nil
|
|
|
|
case args.File != nil:
|
|
|
|
// If File is set, take the File provided directly.
|
|
|
|
default:
|
|
|
|
// Otherwise look at Argv and see if the first argument is a valid path.
|
2018-04-27 17:37:02 +00:00
|
|
|
if len(args.Argv) == 0 {
|
2018-09-17 23:24:05 +00:00
|
|
|
return nil, 0, fmt.Errorf("no filename or command provided")
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
if !filepath.IsAbs(args.Argv[0]) {
|
2018-09-17 23:24:05 +00:00
|
|
|
return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
args.Filename = args.Argv[0]
|
|
|
|
}
|
|
|
|
|
|
|
|
// Create a fresh task context.
|
2018-12-04 22:31:08 +00:00
|
|
|
remainingTraversals = uint(args.MaxSymlinkTraversals)
|
2019-10-26 05:31:35 +00:00
|
|
|
loadArgs := loader.LoadArgs{
|
|
|
|
Mounts: mounts,
|
|
|
|
Root: root,
|
|
|
|
WorkingDirectory: wd,
|
|
|
|
RemainingTraversals: &remainingTraversals,
|
|
|
|
ResolveFinal: true,
|
|
|
|
Filename: args.Filename,
|
|
|
|
File: args.File,
|
2019-10-29 17:03:18 +00:00
|
|
|
CloseOnExec: false,
|
2019-10-26 05:31:35 +00:00
|
|
|
Argv: args.Argv,
|
|
|
|
Envv: args.Envv,
|
|
|
|
Features: k.featureSet,
|
|
|
|
}
|
2019-07-30 18:19:18 +00:00
|
|
|
|
2019-10-26 05:31:35 +00:00
|
|
|
tc, se := k.LoadTaskImage(ctx, loadArgs)
|
2019-01-08 20:56:59 +00:00
|
|
|
if se != nil {
|
|
|
|
return nil, 0, errors.New(se.String())
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
2018-08-31 20:57:02 +00:00
|
|
|
|
2019-07-03 02:27:51 +00:00
|
|
|
// Take a reference on the FDTable, which will be transferred to
|
2018-08-31 20:57:02 +00:00
|
|
|
// TaskSet.NewTask().
|
2019-07-03 02:27:51 +00:00
|
|
|
args.FDTable.IncRef()
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
// Create the task.
|
|
|
|
config := &TaskConfig{
|
2018-08-31 20:57:02 +00:00
|
|
|
Kernel: k,
|
|
|
|
ThreadGroup: tg,
|
|
|
|
TaskContext: tc,
|
|
|
|
FSContext: newFSContext(root, wd, args.Umask),
|
2019-07-03 02:27:51 +00:00
|
|
|
FDTable: args.FDTable,
|
2018-08-31 20:57:02 +00:00
|
|
|
Credentials: args.Credentials,
|
|
|
|
AllowedCPUMask: sched.NewFullCPUSet(k.applicationCores),
|
|
|
|
UTSNamespace: args.UTSNamespace,
|
|
|
|
IPCNamespace: args.IPCNamespace,
|
2018-09-07 17:44:50 +00:00
|
|
|
AbstractSocketNamespace: args.AbstractSocketNamespace,
|
2018-09-27 22:00:03 +00:00
|
|
|
ContainerID: args.ContainerID,
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
2019-12-07 00:58:28 +00:00
|
|
|
t, err := k.tasks.NewTask(config)
|
|
|
|
if err != nil {
|
2018-09-17 23:24:05 +00:00
|
|
|
return nil, 0, err
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
2019-12-07 00:58:28 +00:00
|
|
|
t.traceExecEvent(tc) // Simulate exec for tracing.
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
// Success.
|
2018-09-17 23:24:05 +00:00
|
|
|
tgid := k.tasks.Root.IDOfThreadGroup(tg)
|
2019-02-14 23:46:25 +00:00
|
|
|
if k.globalInit == nil {
|
2018-04-27 17:37:02 +00:00
|
|
|
k.globalInit = tg
|
|
|
|
}
|
2018-09-17 23:24:05 +00:00
|
|
|
return tg, tgid, nil
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
2019-02-14 23:46:25 +00:00
|
|
|
// StartProcess starts running a process that was created with CreateProcess.
|
|
|
|
func (k *Kernel) StartProcess(tg *ThreadGroup) {
|
|
|
|
t := tg.Leader()
|
|
|
|
tid := k.tasks.Root.IDOfTask(t)
|
|
|
|
t.Start(tid)
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Start starts execution of all tasks in k.
|
|
|
|
//
|
|
|
|
// Preconditions: Start may be called exactly once.
|
|
|
|
func (k *Kernel) Start() error {
|
|
|
|
k.extMu.Lock()
|
|
|
|
defer k.extMu.Unlock()
|
|
|
|
|
|
|
|
if k.globalInit == nil {
|
|
|
|
return fmt.Errorf("kernel contains no tasks")
|
|
|
|
}
|
|
|
|
if k.started {
|
|
|
|
return fmt.Errorf("kernel already started")
|
|
|
|
}
|
|
|
|
|
|
|
|
k.started = true
|
2018-10-17 22:48:55 +00:00
|
|
|
k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, newKernelCPUClockTicker(k))
|
2018-04-27 17:37:02 +00:00
|
|
|
k.cpuClockTicker.Swap(ktime.Setting{
|
|
|
|
Enabled: true,
|
|
|
|
Period: linux.ClockTick,
|
|
|
|
})
|
|
|
|
// If k was created by LoadKernelFrom, timers were stopped during
|
|
|
|
// Kernel.SaveTo and need to be resumed. If k was created by NewKernel,
|
|
|
|
// this is a no-op.
|
|
|
|
k.resumeTimeLocked()
|
|
|
|
// Start task goroutines.
|
|
|
|
k.tasks.mu.RLock()
|
|
|
|
defer k.tasks.mu.RUnlock()
|
|
|
|
for t, tid := range k.tasks.Root.tids {
|
|
|
|
t.Start(tid)
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// pauseTimeLocked pauses all Timers and Timekeeper updates.
|
|
|
|
//
|
|
|
|
// Preconditions: Any task goroutines running in k must be stopped. k.extMu
|
|
|
|
// must be locked.
|
|
|
|
func (k *Kernel) pauseTimeLocked() {
|
|
|
|
// k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
|
|
|
|
// Kernel.Start().
|
|
|
|
if k.cpuClockTicker != nil {
|
|
|
|
k.cpuClockTicker.Pause()
|
|
|
|
}
|
|
|
|
|
|
|
|
// By precondition, nothing else can be interacting with PIDNamespace.tids
|
2019-07-03 02:27:51 +00:00
|
|
|
// or FDTable.files, so we can iterate them without synchronization. (We
|
2018-04-27 17:37:02 +00:00
|
|
|
// can't hold the TaskSet mutex when pausing thread group timers because
|
|
|
|
// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
|
|
|
|
// mutex, while holding the Timer mutex.)
|
|
|
|
for t := range k.tasks.Root.tids {
|
|
|
|
if t == t.tg.leader {
|
2018-10-17 22:48:55 +00:00
|
|
|
t.tg.itimerRealTimer.Pause()
|
|
|
|
for _, it := range t.tg.timers {
|
|
|
|
it.PauseTimer()
|
|
|
|
}
|
2018-08-23 23:31:25 +00:00
|
|
|
}
|
2019-07-03 02:27:51 +00:00
|
|
|
// This means we'll iterate FDTables shared by multiple tasks repeatedly,
|
2018-10-17 22:48:55 +00:00
|
|
|
// but ktime.Timer.Pause is idempotent so this is harmless.
|
2019-07-03 02:27:51 +00:00
|
|
|
if t.fdTable != nil {
|
|
|
|
t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
|
|
|
|
if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
|
2018-04-27 17:37:02 +00:00
|
|
|
tfd.PauseTimer()
|
|
|
|
}
|
2019-07-03 02:27:51 +00:00
|
|
|
})
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
k.timekeeper.PauseUpdates()
|
|
|
|
}
|
|
|
|
|
|
|
|
// resumeTimeLocked resumes all Timers and Timekeeper updates. If
|
|
|
|
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
|
|
|
|
// effect.
|
|
|
|
//
|
|
|
|
// Preconditions: Any task goroutines running in k must be stopped. k.extMu
|
|
|
|
// must be locked.
|
|
|
|
func (k *Kernel) resumeTimeLocked() {
|
|
|
|
if k.cpuClockTicker != nil {
|
|
|
|
k.cpuClockTicker.Resume()
|
|
|
|
}
|
|
|
|
|
|
|
|
k.timekeeper.ResumeUpdates()
|
|
|
|
for t := range k.tasks.Root.tids {
|
|
|
|
if t == t.tg.leader {
|
2018-10-17 22:48:55 +00:00
|
|
|
t.tg.itimerRealTimer.Resume()
|
|
|
|
for _, it := range t.tg.timers {
|
|
|
|
it.ResumeTimer()
|
|
|
|
}
|
2018-08-23 23:31:25 +00:00
|
|
|
}
|
2019-07-03 02:27:51 +00:00
|
|
|
if t.fdTable != nil {
|
|
|
|
t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
|
|
|
|
if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
|
2018-04-27 17:37:02 +00:00
|
|
|
tfd.ResumeTimer()
|
|
|
|
}
|
2019-07-03 02:27:51 +00:00
|
|
|
})
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Disable cpuClockTicker when app is idle
Kernel.cpuClockTicker increments kernel.cpuClock, which tasks use as a clock to
track their CPU usage. This improves latency in the syscall path by avoiding
expensive monotonic clock calls on every syscall entry/exit.
However, this timer fires every 10ms. Thus, when all tasks are idle (i.e.,
blocked or stopped), this forces a sentry wakeup every 10ms, when we may
otherwise be able to sleep until the next app-relevant event. These wakeups
cause the sentry to utilize approximately 2% CPU when the application is
otherwise idle.
Updates to clock are not strictly necessary when the app is idle, as there are
no readers of cpuClock. This commit reduces idle CPU by disabling the timer
when tasks are completely idle, and computing its effects at the next wakeup.
Rather than disabling the timer as soon as the app goes idle, we wait until the
next tick, which provides a window for short sleeps to sleep and wakeup without
doing the (relatively) expensive work of disabling and enabling the timer.
PiperOrigin-RevId: 272265822
2019-10-01 19:13:09 +00:00
|
|
|
func (k *Kernel) incRunningTasks() {
|
|
|
|
for {
|
|
|
|
tasks := atomic.LoadInt64(&k.runningTasks)
|
|
|
|
if tasks != 0 {
|
|
|
|
// Standard case. Simply increment.
|
|
|
|
if !atomic.CompareAndSwapInt64(&k.runningTasks, tasks, tasks+1) {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// Transition from 0 -> 1. Synchronize with other transitions and timer.
|
|
|
|
k.runningTasksMu.Lock()
|
|
|
|
tasks = atomic.LoadInt64(&k.runningTasks)
|
|
|
|
if tasks != 0 {
|
|
|
|
// We're no longer the first task, no need to
|
|
|
|
// re-enable.
|
|
|
|
atomic.AddInt64(&k.runningTasks, 1)
|
|
|
|
k.runningTasksMu.Unlock()
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
if !k.cpuClockTickerDisabled {
|
|
|
|
// Timer was never disabled.
|
|
|
|
atomic.StoreInt64(&k.runningTasks, 1)
|
|
|
|
k.runningTasksMu.Unlock()
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
// We need to update cpuClock for all of the ticks missed while we
|
|
|
|
// slept, and then re-enable the timer.
|
|
|
|
//
|
|
|
|
// The Notify in Swap isn't sufficient. kernelCPUClockTicker.Notify
|
|
|
|
// always increments cpuClock by 1 regardless of the number of
|
|
|
|
// expirations as a heuristic to avoid over-accounting in cases of CPU
|
|
|
|
// throttling.
|
|
|
|
//
|
|
|
|
// We want to cover the normal case, when all time should be accounted,
|
|
|
|
// so we increment for all expirations. Throttling is less concerning
|
|
|
|
// here because the ticker is only disabled from Notify. This means
|
|
|
|
// that Notify must schedule and compensate for the throttled period
|
|
|
|
// before the timer is disabled. Throttling while the timer is disabled
|
|
|
|
// doesn't matter, as nothing is running or reading cpuClock anyways.
|
|
|
|
//
|
|
|
|
// S/R also adds complication, as there are two cases. Recall that
|
|
|
|
// monotonicClock will jump forward on restore.
|
|
|
|
//
|
|
|
|
// 1. If the ticker is enabled during save, then on Restore Notify is
|
|
|
|
// called with many expirations, covering the time jump, but cpuClock
|
|
|
|
// is only incremented by 1.
|
|
|
|
//
|
|
|
|
// 2. If the ticker is disabled during save, then after Restore the
|
|
|
|
// first wakeup will call this function and cpuClock will be
|
|
|
|
// incremented by the number of expirations across the S/R.
|
|
|
|
//
|
|
|
|
// These cause very different value of cpuClock. But again, since
|
|
|
|
// nothing was running while the ticker was disabled, those differences
|
|
|
|
// don't matter.
|
|
|
|
setting, exp := k.cpuClockTickerSetting.At(k.monotonicClock.Now())
|
|
|
|
if exp > 0 {
|
|
|
|
atomic.AddUint64(&k.cpuClock, exp)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Now that cpuClock is updated it is safe to allow other tasks to
|
|
|
|
// transition to running.
|
|
|
|
atomic.StoreInt64(&k.runningTasks, 1)
|
|
|
|
|
|
|
|
// N.B. we must unlock before calling Swap to maintain lock ordering.
|
|
|
|
//
|
|
|
|
// cpuClockTickerDisabled need not wait until after Swap to become
|
|
|
|
// true. It is sufficient that the timer *will* be enabled.
|
|
|
|
k.cpuClockTickerDisabled = false
|
|
|
|
k.runningTasksMu.Unlock()
|
|
|
|
|
|
|
|
// This won't call Notify (unless it's been ClockTick since setting.At
|
|
|
|
// above). This means we skip the thread group work in Notify. However,
|
|
|
|
// since nothing was running while we were disabled, none of the timers
|
|
|
|
// could have expired.
|
|
|
|
k.cpuClockTicker.Swap(setting)
|
|
|
|
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (k *Kernel) decRunningTasks() {
|
|
|
|
tasks := atomic.AddInt64(&k.runningTasks, -1)
|
|
|
|
if tasks < 0 {
|
|
|
|
panic(fmt.Sprintf("Invalid running count %d", tasks))
|
|
|
|
}
|
|
|
|
|
|
|
|
// Nothing to do. The next CPU clock tick will disable the timer if
|
|
|
|
// there is still nothing running. This provides approximately one tick
|
|
|
|
// of slack in which we can switch back and forth between idle and
|
|
|
|
// active without an expensive transition.
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// WaitExited blocks until all tasks in k have exited.
|
|
|
|
func (k *Kernel) WaitExited() {
|
|
|
|
k.tasks.liveGoroutines.Wait()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Kill requests that all tasks in k immediately exit as if group exiting with
|
|
|
|
// status es. Kill does not wait for tasks to exit.
|
|
|
|
func (k *Kernel) Kill(es ExitStatus) {
|
|
|
|
k.extMu.Lock()
|
|
|
|
defer k.extMu.Unlock()
|
|
|
|
k.tasks.Kill(es)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Pause requests that all tasks in k temporarily stop executing, and blocks
|
|
|
|
// until all tasks in k have stopped. Multiple calls to Pause nest and require
|
|
|
|
// an equal number of calls to Unpause to resume execution.
|
|
|
|
func (k *Kernel) Pause() {
|
|
|
|
k.extMu.Lock()
|
|
|
|
k.tasks.BeginExternalStop()
|
|
|
|
k.extMu.Unlock()
|
|
|
|
k.tasks.runningGoroutines.Wait()
|
|
|
|
}
|
|
|
|
|
|
|
|
// Unpause ends the effect of a previous call to Pause. If Unpause is called
|
|
|
|
// without a matching preceding call to Pause, Unpause may panic.
|
|
|
|
func (k *Kernel) Unpause() {
|
|
|
|
k.extMu.Lock()
|
|
|
|
defer k.extMu.Unlock()
|
|
|
|
k.tasks.EndExternalStop()
|
|
|
|
}
|
|
|
|
|
|
|
|
// SendExternalSignal injects a signal into the kernel.
|
|
|
|
//
|
|
|
|
// context is used only for debugging to describe how the signal was received.
|
|
|
|
//
|
2018-06-21 21:53:05 +00:00
|
|
|
// Preconditions: Kernel must have an init process.
|
|
|
|
func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) {
|
2018-04-27 17:37:02 +00:00
|
|
|
k.extMu.Lock()
|
|
|
|
defer k.extMu.Unlock()
|
2018-06-21 21:53:05 +00:00
|
|
|
k.sendExternalSignal(info, context)
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
2018-09-27 22:00:03 +00:00
|
|
|
// SendContainerSignal sends the given signal to all processes inside the
|
|
|
|
// namespace that match the given container ID.
|
|
|
|
func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
|
|
|
|
k.extMu.Lock()
|
|
|
|
defer k.extMu.Unlock()
|
|
|
|
k.tasks.mu.RLock()
|
|
|
|
defer k.tasks.mu.RUnlock()
|
|
|
|
|
2018-10-17 19:27:58 +00:00
|
|
|
var lastErr error
|
2019-04-03 23:21:38 +00:00
|
|
|
for tg := range k.tasks.Root.tgids {
|
|
|
|
if tg.leader.ContainerID() == cid {
|
|
|
|
tg.signalHandlers.mu.Lock()
|
2018-09-27 22:00:03 +00:00
|
|
|
infoCopy := *info
|
2019-04-03 23:21:38 +00:00
|
|
|
if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
|
2018-10-17 19:27:58 +00:00
|
|
|
lastErr = err
|
2018-09-27 22:00:03 +00:00
|
|
|
}
|
2019-04-03 23:21:38 +00:00
|
|
|
tg.signalHandlers.mu.Unlock()
|
2018-09-27 22:00:03 +00:00
|
|
|
}
|
|
|
|
}
|
2018-10-17 19:27:58 +00:00
|
|
|
return lastErr
|
|
|
|
}
|
|
|
|
|
2019-12-07 00:58:28 +00:00
|
|
|
// RebuildTraceContexts rebuilds the trace context for all tasks.
|
|
|
|
//
|
|
|
|
// Unfortunately, if these are built while tracing is not enabled, then we will
|
|
|
|
// not have meaningful trace data. Rebuilding here ensures that we can do so
|
|
|
|
// after tracing has been enabled.
|
|
|
|
func (k *Kernel) RebuildTraceContexts() {
|
|
|
|
k.extMu.Lock()
|
|
|
|
defer k.extMu.Unlock()
|
|
|
|
k.tasks.mu.RLock()
|
|
|
|
defer k.tasks.mu.RUnlock()
|
|
|
|
|
|
|
|
for t, tid := range k.tasks.Root.tids {
|
|
|
|
t.rebuildTraceContext(tid)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// FeatureSet returns the FeatureSet.
|
|
|
|
func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
|
|
|
|
return k.featureSet
|
|
|
|
}
|
|
|
|
|
|
|
|
// Timekeeper returns the Timekeeper.
|
|
|
|
func (k *Kernel) Timekeeper() *Timekeeper {
|
|
|
|
return k.timekeeper
|
|
|
|
}
|
|
|
|
|
|
|
|
// TaskSet returns the TaskSet.
|
|
|
|
func (k *Kernel) TaskSet() *TaskSet {
|
|
|
|
return k.tasks
|
|
|
|
}
|
|
|
|
|
|
|
|
// RootUserNamespace returns the root UserNamespace.
|
|
|
|
func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
|
|
|
|
return k.rootUserNamespace
|
|
|
|
}
|
|
|
|
|
|
|
|
// RootUTSNamespace returns the root UTSNamespace.
|
|
|
|
func (k *Kernel) RootUTSNamespace() *UTSNamespace {
|
|
|
|
return k.rootUTSNamespace
|
|
|
|
}
|
|
|
|
|
|
|
|
// RootIPCNamespace returns the root IPCNamespace.
|
|
|
|
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
|
|
|
|
return k.rootIPCNamespace
|
|
|
|
}
|
|
|
|
|
2019-04-30 15:35:36 +00:00
|
|
|
// RootPIDNamespace returns the root PIDNamespace.
|
|
|
|
func (k *Kernel) RootPIDNamespace() *PIDNamespace {
|
|
|
|
return k.tasks.Root
|
|
|
|
}
|
|
|
|
|
2018-09-07 17:44:50 +00:00
|
|
|
// RootAbstractSocketNamespace returns the root AbstractSocketNamespace.
|
|
|
|
func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
|
|
|
|
return k.rootAbstractSocketNamespace
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// NetworkStack returns the network stack. NetworkStack may return nil if no
|
|
|
|
// network stack is available.
|
|
|
|
func (k *Kernel) NetworkStack() inet.Stack {
|
|
|
|
return k.networkStack
|
|
|
|
}
|
|
|
|
|
|
|
|
// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
|
|
|
|
// nil if no such thread group exists. GlobalInit may return a thread group
|
|
|
|
// containing no tasks if the thread group has already exited.
|
|
|
|
func (k *Kernel) GlobalInit() *ThreadGroup {
|
|
|
|
k.extMu.Lock()
|
|
|
|
defer k.extMu.Unlock()
|
|
|
|
return k.globalInit
|
|
|
|
}
|
|
|
|
|
2019-12-26 22:42:19 +00:00
|
|
|
// TestOnly_SetGlobalInit sets the thread group with ID 1 in the root PID namespace.
|
|
|
|
func (k *Kernel) TestOnly_SetGlobalInit(tg *ThreadGroup) {
|
|
|
|
k.globalInit = tg
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// ApplicationCores returns the number of CPUs visible to sandboxed
|
|
|
|
// applications.
|
|
|
|
func (k *Kernel) ApplicationCores() uint {
|
|
|
|
return k.applicationCores
|
|
|
|
}
|
|
|
|
|
|
|
|
// RealtimeClock returns the application CLOCK_REALTIME clock.
|
|
|
|
func (k *Kernel) RealtimeClock() ktime.Clock {
|
|
|
|
return k.realtimeClock
|
|
|
|
}
|
|
|
|
|
|
|
|
// MonotonicClock returns the application CLOCK_MONOTONIC clock.
|
|
|
|
func (k *Kernel) MonotonicClock() ktime.Clock {
|
|
|
|
return k.monotonicClock
|
|
|
|
}
|
|
|
|
|
|
|
|
// CPUClockNow returns the current value of k.cpuClock.
|
|
|
|
func (k *Kernel) CPUClockNow() uint64 {
|
|
|
|
return atomic.LoadUint64(&k.cpuClock)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Syslog returns the syslog.
|
|
|
|
func (k *Kernel) Syslog() *syslog {
|
|
|
|
return &k.syslog
|
|
|
|
}
|
|
|
|
|
|
|
|
// GenerateInotifyCookie generates a unique inotify event cookie.
|
|
|
|
//
|
|
|
|
// Returned values may overlap with previously returned values if the value
|
|
|
|
// space is exhausted. 0 is not a valid cookie value, all other values
|
|
|
|
// representable in a uint32 are allowed.
|
|
|
|
func (k *Kernel) GenerateInotifyCookie() uint32 {
|
|
|
|
id := atomic.AddUint32(&k.nextInotifyCookie, 1)
|
|
|
|
// Wrap-around is explicitly allowed for inotify event cookies.
|
|
|
|
if id == 0 {
|
|
|
|
id = atomic.AddUint32(&k.nextInotifyCookie, 1)
|
|
|
|
}
|
|
|
|
return id
|
|
|
|
}
|
|
|
|
|
|
|
|
// NetlinkPorts returns the netlink port manager.
|
|
|
|
func (k *Kernel) NetlinkPorts() *port.Manager {
|
|
|
|
return k.netlinkPorts
|
|
|
|
}
|
|
|
|
|
2019-04-23 01:17:25 +00:00
|
|
|
// SaveError returns the sandbox error that caused the kernel to exit during
|
|
|
|
// save.
|
|
|
|
func (k *Kernel) SaveError() error {
|
2018-04-27 17:37:02 +00:00
|
|
|
k.extMu.Lock()
|
|
|
|
defer k.extMu.Unlock()
|
2019-04-23 01:17:25 +00:00
|
|
|
return k.saveErr
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
2019-04-23 01:17:25 +00:00
|
|
|
// SetSaveError sets the sandbox error that caused the kernel to exit during
|
|
|
|
// save, if one is not already set.
|
|
|
|
func (k *Kernel) SetSaveError(err error) {
|
2018-04-27 17:37:02 +00:00
|
|
|
k.extMu.Lock()
|
|
|
|
defer k.extMu.Unlock()
|
2019-04-23 01:17:25 +00:00
|
|
|
if k.saveErr == nil {
|
|
|
|
k.saveErr = err
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-09-14 02:11:12 +00:00
|
|
|
var _ tcpip.Clock = (*Kernel)(nil)
|
|
|
|
|
2018-05-02 05:11:07 +00:00
|
|
|
// NowNanoseconds implements tcpip.Clock.NowNanoseconds.
|
|
|
|
func (k *Kernel) NowNanoseconds() int64 {
|
|
|
|
now, err := k.timekeeper.GetTime(sentrytime.Realtime)
|
|
|
|
if err != nil {
|
|
|
|
panic("Kernel.NowNanoseconds: " + err.Error())
|
|
|
|
}
|
|
|
|
return now
|
|
|
|
}
|
|
|
|
|
2018-09-14 02:11:12 +00:00
|
|
|
// NowMonotonic implements tcpip.Clock.NowMonotonic.
|
|
|
|
func (k *Kernel) NowMonotonic() int64 {
|
|
|
|
now, err := k.timekeeper.GetTime(sentrytime.Monotonic)
|
|
|
|
if err != nil {
|
|
|
|
panic("Kernel.NowMonotonic: " + err.Error())
|
|
|
|
}
|
|
|
|
return now
|
|
|
|
}
|
|
|
|
|
2019-03-14 15:11:36 +00:00
|
|
|
// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
|
|
|
|
// LoadFrom.
|
|
|
|
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
|
|
|
|
k.mf = mf
|
|
|
|
}
|
|
|
|
|
|
|
|
// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile.
|
|
|
|
func (k *Kernel) MemoryFile() *pgalloc.MemoryFile {
|
|
|
|
return k.mf
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// SupervisorContext returns a Context with maximum privileges in k. It should
|
|
|
|
// only be used by goroutines outside the control of the emulated kernel
|
|
|
|
// defined by e.
|
|
|
|
//
|
|
|
|
// Callers are responsible for ensuring that the returned Context is not used
|
|
|
|
// concurrently with changes to the Kernel.
|
|
|
|
func (k *Kernel) SupervisorContext() context.Context {
|
|
|
|
return supervisorContext{
|
|
|
|
Logger: log.Log(),
|
|
|
|
k: k,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2019-06-10 22:16:42 +00:00
|
|
|
// SocketEntry represents a socket recorded in Kernel.sockets. It implements
|
2019-02-07 22:43:18 +00:00
|
|
|
// refs.WeakRefUser for sockets stored in the socket table.
|
|
|
|
//
|
|
|
|
// +stateify savable
|
2019-06-10 22:16:42 +00:00
|
|
|
type SocketEntry struct {
|
|
|
|
socketEntry
|
|
|
|
k *Kernel
|
|
|
|
Sock *refs.WeakRef
|
|
|
|
ID uint64 // Socket table entry number.
|
2019-02-07 22:43:18 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
|
2019-06-10 22:16:42 +00:00
|
|
|
func (s *SocketEntry) WeakRefGone() {
|
2019-02-07 22:43:18 +00:00
|
|
|
s.k.extMu.Lock()
|
2019-06-10 22:16:42 +00:00
|
|
|
s.k.sockets.Remove(s)
|
2019-02-07 22:43:18 +00:00
|
|
|
s.k.extMu.Unlock()
|
|
|
|
}
|
|
|
|
|
|
|
|
// RecordSocket adds a socket to the system-wide socket table for tracking.
|
|
|
|
//
|
|
|
|
// Precondition: Caller must hold a reference to sock.
|
2019-06-10 22:16:42 +00:00
|
|
|
func (k *Kernel) RecordSocket(sock *fs.File) {
|
2019-02-07 22:43:18 +00:00
|
|
|
k.extMu.Lock()
|
2019-06-10 22:16:42 +00:00
|
|
|
id := k.nextSocketEntry
|
|
|
|
k.nextSocketEntry++
|
|
|
|
s := &SocketEntry{k: k, ID: id}
|
|
|
|
s.Sock = refs.NewWeakRef(sock, s)
|
|
|
|
k.sockets.PushBack(s)
|
2019-02-07 22:43:18 +00:00
|
|
|
k.extMu.Unlock()
|
|
|
|
}
|
|
|
|
|
2019-06-10 22:16:42 +00:00
|
|
|
// ListSockets returns a snapshot of all sockets.
|
|
|
|
func (k *Kernel) ListSockets() []*SocketEntry {
|
2019-02-07 22:43:18 +00:00
|
|
|
k.extMu.Lock()
|
2019-06-10 22:16:42 +00:00
|
|
|
var socks []*SocketEntry
|
|
|
|
for s := k.sockets.Front(); s != nil; s = s.Next() {
|
|
|
|
socks = append(socks, s)
|
2019-02-07 22:43:18 +00:00
|
|
|
}
|
|
|
|
k.extMu.Unlock()
|
|
|
|
return socks
|
|
|
|
}
|
|
|
|
|
2019-11-01 01:02:04 +00:00
|
|
|
// supervisorContext is a privileged context.
|
2018-04-27 17:37:02 +00:00
|
|
|
type supervisorContext struct {
|
|
|
|
context.NoopSleeper
|
|
|
|
log.Logger
|
|
|
|
k *Kernel
|
|
|
|
}
|
|
|
|
|
|
|
|
// Value implements context.Context.
|
|
|
|
func (ctx supervisorContext) Value(key interface{}) interface{} {
|
|
|
|
switch key {
|
|
|
|
case CtxCanTrace:
|
|
|
|
// The supervisor context can trace anything. (None of
|
|
|
|
// supervisorContext's users are expected to invoke ptrace, but ptrace
|
|
|
|
// permissions are required for certain file accesses.)
|
|
|
|
return func(*Task, bool) bool { return true }
|
|
|
|
case CtxKernel:
|
|
|
|
return ctx.k
|
|
|
|
case CtxPIDNamespace:
|
|
|
|
return ctx.k.tasks.Root
|
|
|
|
case CtxUTSNamespace:
|
|
|
|
return ctx.k.rootUTSNamespace
|
|
|
|
case CtxIPCNamespace:
|
|
|
|
return ctx.k.rootIPCNamespace
|
|
|
|
case auth.CtxCredentials:
|
|
|
|
// The supervisor context is global root.
|
|
|
|
return auth.NewRootCredentials(ctx.k.rootUserNamespace)
|
|
|
|
case fs.CtxRoot:
|
2019-08-02 18:21:50 +00:00
|
|
|
if ctx.k.globalInit != nil {
|
|
|
|
return ctx.k.globalInit.mounts.Root()
|
|
|
|
}
|
|
|
|
return nil
|
2019-04-17 19:56:23 +00:00
|
|
|
case fs.CtxDirentCacheLimiter:
|
|
|
|
return ctx.k.DirentCacheLimiter
|
2018-04-27 17:37:02 +00:00
|
|
|
case ktime.CtxRealtimeClock:
|
|
|
|
return ctx.k.RealtimeClock()
|
|
|
|
case limits.CtxLimits:
|
|
|
|
// No limits apply.
|
|
|
|
return limits.NewLimitSet()
|
2019-03-14 15:11:36 +00:00
|
|
|
case pgalloc.CtxMemoryFile:
|
|
|
|
return ctx.k.mf
|
|
|
|
case pgalloc.CtxMemoryFileProvider:
|
|
|
|
return ctx.k
|
2018-04-27 17:37:02 +00:00
|
|
|
case platform.CtxPlatform:
|
|
|
|
return ctx.k
|
|
|
|
case uniqueid.CtxGlobalUniqueID:
|
|
|
|
return ctx.k.UniqueID()
|
2018-10-16 00:47:24 +00:00
|
|
|
case uniqueid.CtxGlobalUniqueIDProvider:
|
|
|
|
return ctx.k
|
2018-04-27 17:37:02 +00:00
|
|
|
case uniqueid.CtxInotifyCookie:
|
|
|
|
return ctx.k.GenerateInotifyCookie()
|
2018-10-20 18:12:26 +00:00
|
|
|
case unimpl.CtxEvents:
|
|
|
|
return ctx.k
|
2018-04-27 17:37:02 +00:00
|
|
|
default:
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
}
|
2019-07-30 00:11:27 +00:00
|
|
|
|
2019-07-31 18:59:21 +00:00
|
|
|
// Rate limits for the number of unimplemented syscall events.
const (
	// unimplementedSyscallsMaxRate is the maximum rate, in events per second.
	unimplementedSyscallsMaxRate = 100

	// unimplementedSyscallBurst is the burst size, in events.
	unimplementedSyscallBurst = 1000
)
|
|
|
|
|
|
|
|
// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
|
|
|
|
// channel.
|
|
|
|
func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
|
2019-07-31 18:59:21 +00:00
|
|
|
k.unimplementedSyscallEmitterOnce.Do(func() {
|
2019-07-30 00:11:27 +00:00
|
|
|
k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
|
2019-07-31 18:59:21 +00:00
|
|
|
})
|
2019-07-30 00:11:27 +00:00
|
|
|
|
|
|
|
t := TaskFromContext(ctx)
|
|
|
|
k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
|
|
|
|
Tid: int32(t.ThreadID()),
|
|
|
|
Registers: t.Arch().StateData().Proto(),
|
|
|
|
})
|
|
|
|
}
|