gvisor/pkg/sentry/kernel/kernel.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package kernel provides an emulation of the Linux kernel.
//
// See README.md for a detailed overview.
//
// Lock order (outermost locks must be taken first):
//
// Kernel.extMu
//   ThreadGroup.timerMu
//     ktime.Timer.mu (for kernelCPUClockTicker and IntervalTimer)
//       TaskSet.mu
//         SignalHandlers.mu
//           Task.mu
//       runningTasksMu
//
// Locking SignalHandlers.mu in multiple SignalHandlers requires locking
// TaskSet.mu exclusively first. Locking Task.mu in multiple Tasks at the same
// time requires locking all of their signal mutexes first.
package kernel

import (
	"errors"
	"fmt"
	"io"
	"path/filepath"
	"sync"
	"sync/atomic"
	"time"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/cpuid"
	"gvisor.dev/gvisor/pkg/eventchannel"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/refs"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/context"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/fs/timerfd"
	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
	"gvisor.dev/gvisor/pkg/sentry/inet"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/kernel/epoll"
	"gvisor.dev/gvisor/pkg/sentry/kernel/futex"
	"gvisor.dev/gvisor/pkg/sentry/kernel/sched"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/limits"
	"gvisor.dev/gvisor/pkg/sentry/loader"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/socket/netlink/port"
	sentrytime "gvisor.dev/gvisor/pkg/sentry/time"
	"gvisor.dev/gvisor/pkg/sentry/unimpl"
	uspb "gvisor.dev/gvisor/pkg/sentry/unimpl/unimplemented_syscall_go_proto"
	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
	"gvisor.dev/gvisor/pkg/state"
	"gvisor.dev/gvisor/pkg/tcpip"
)

// Kernel represents an emulated Linux kernel. It must be initialized by calling
// Init() or LoadFrom().
//
// +stateify savable
type Kernel struct {
	// extMu serializes external changes to the Kernel with calls to
	// Kernel.SaveTo. (Kernel.SaveTo requires that the state of the Kernel
	// remains frozen for the duration of the call; it requires that the Kernel
	// is paused as a precondition, which ensures that none of the tasks
	// running within the Kernel can affect its state, but extMu is required to
	// ensure that concurrent users of the Kernel *outside* the Kernel's
	// control cannot affect its state by calling e.g.
	// Kernel.SendExternalSignal.)
	extMu sync.Mutex `state:"nosave"`

	// started is true if Start has been called. Unless otherwise specified,
	// all Kernel fields become immutable once started becomes true.
	started bool `state:"nosave"`

	// All of the following fields are immutable unless otherwise specified.

	// Platform is the platform that is used to execute tasks in the created
	// Kernel. See comment on pgalloc.MemoryFileProvider for why Platform is
	// embedded anonymously (the same issue applies).
	platform.Platform `state:"nosave"`

	// mf provides application memory.
	mf *pgalloc.MemoryFile `state:"nosave"`

	// See InitKernelArgs for the meaning of these fields.
	featureSet                  *cpuid.FeatureSet
	timekeeper                  *Timekeeper
	tasks                       *TaskSet
	rootUserNamespace           *auth.UserNamespace
	networkStack                inet.Stack `state:"nosave"`
	applicationCores            uint
	useHostCores                bool
	extraAuxv                   []arch.AuxEntry
	vdso                        *loader.VDSO
	rootUTSNamespace            *UTSNamespace
	rootIPCNamespace            *IPCNamespace
	rootAbstractSocketNamespace *AbstractSocketNamespace

	// futexes is the "root" futex.Manager, from which all others are forked.
	// This is necessary to ensure that shared futexes are coherent across all
	// tasks, including those created by CreateProcess.
	futexes *futex.Manager

	// globalInit is the thread group whose leader has ID 1 in the root PID
	// namespace. globalInit is stored separately so that it is accessible even
	// after all tasks in the thread group have exited, such that ID 1 is no
	// longer mapped.
	//
	// globalInit is mutable until it is assigned by the first successful call
	// to CreateProcess, and is protected by extMu.
	globalInit *ThreadGroup

	// realtimeClock is a ktime.Clock based on timekeeper's Realtime.
	realtimeClock *timekeeperClock

	// monotonicClock is a ktime.Clock based on timekeeper's Monotonic.
	monotonicClock *timekeeperClock

	// syslog is the kernel log.
	syslog syslog

	// runningTasksMu synchronizes disable/enable of cpuClockTicker when
	// the kernel is idle (runningTasks == 0).
	//
	// runningTasksMu is used to exclude critical sections when the timer
	// disables itself and when the first active task enables the timer,
	// ensuring that tasks always see a valid cpuClock value.
	runningTasksMu sync.Mutex `state:"nosave"`

	// runningTasks is the total count of tasks currently in
	// TaskGoroutineRunningSys or TaskGoroutineRunningApp. i.e., they are
	// not blocked or stopped.
	//
	// runningTasks must be accessed atomically. Increments from 0 to 1 are
	// further protected by runningTasksMu (see incRunningTasks).
	runningTasks int64

	// cpuClock is incremented every linux.ClockTick. cpuClock is used to
	// measure task CPU usage, since sampling monotonicClock twice on every
	// syscall turns out to be unreasonably expensive. This is similar to how
	// Linux does task CPU accounting on x86 (CONFIG_IRQ_TIME_ACCOUNTING),
	// although Linux also uses scheduler timing information to improve
	// resolution (kernel/sched/cputime.c:cputime_adjust()), which we can't do
	// since "preeemptive" scheduling is managed by the Go runtime, which
	// doesn't provide this information.
	//
	// cpuClock is mutable, and is accessed using atomic memory operations.
	cpuClock uint64

	// cpuClockTicker increments cpuClock.
	cpuClockTicker *ktime.Timer `state:"nosave"`

	// cpuClockTickerDisabled indicates that cpuClockTicker has been
	// disabled because no tasks are running.
	//
	// cpuClockTickerDisabled is protected by runningTasksMu.
	cpuClockTickerDisabled bool

	// cpuClockTickerSetting is the ktime.Setting of cpuClockTicker at the
	// point it was disabled. It is cached here to avoid a lock ordering
	// violation with cpuClockTicker.mu when runningTaskMu is held.
	//
	// cpuClockTickerSetting is only valid when cpuClockTickerDisabled is
	// true.
	//
	// cpuClockTickerSetting is protected by runningTasksMu.
	cpuClockTickerSetting ktime.Setting

	// fdMapUids is an ever-increasing counter for generating FDTable uids.
	//
	// fdMapUids is mutable, and is accessed using atomic memory operations.
	fdMapUids uint64

	// uniqueID is used to generate unique identifiers.
	//
	// uniqueID is mutable, and is accessed using atomic memory operations.
	uniqueID uint64

	// nextInotifyCookie is a monotonically increasing counter used for
	// generating unique inotify event cookies.
	//
	// nextInotifyCookie is mutable, and is accessed using atomic memory
	// operations.
	nextInotifyCookie uint32

	// netlinkPorts manages allocation of netlink socket port IDs.
	netlinkPorts *port.Manager

	// saveErr is the error causing the sandbox to exit during save, if
	// any. It is protected by extMu.
	saveErr error `state:"nosave"`

	// danglingEndpoints is used to save / restore tcpip.DanglingEndpoints.
	danglingEndpoints struct{} `state:".([]tcpip.Endpoint)"`

	// sockets is the list of all network sockets the system. Protected by
	// extMu.
	sockets socketList

	// nextSocketEntry is the next entry number to use in sockets. Protected
	// by extMu.
	nextSocketEntry uint64

	// deviceRegistry is used to save/restore device.SimpleDevices.
	deviceRegistry struct{} `state:".(*device.Registry)"`

	// DirentCacheLimiter controls the number of total dirent entries can be in
	// caches. Not all caches use it, only the caches that use host resources use
	// the limiter. It may be nil if disabled.
	DirentCacheLimiter *fs.DirentCacheLimiter

	// unimplementedSyscallEmitterOnce is used in the initialization of
	// unimplementedSyscallEmitter.
	unimplementedSyscallEmitterOnce sync.Once `state:"nosave"`

	// unimplementedSyscallEmitter is used to emit unimplemented syscall
	// events. This is initialized lazily on the first unimplemented
	// syscall.
	unimplementedSyscallEmitter eventchannel.Emitter `state:"nosave"`
}

// InitKernelArgs holds arguments to Init.
type InitKernelArgs struct {
	// FeatureSet is the emulated CPU feature set.
	FeatureSet *cpuid.FeatureSet

	// Timekeeper manages time for all tasks in the system.
	Timekeeper *Timekeeper

	// RootUserNamespace is the root user namespace.
	RootUserNamespace *auth.UserNamespace

	// NetworkStack is the TCP/IP network stack. NetworkStack may be nil.
	NetworkStack inet.Stack

	// ApplicationCores is the number of logical CPUs visible to sandboxed
	// applications. The set of logical CPU IDs is [0, ApplicationCores); thus
	// ApplicationCores is analogous to Linux's nr_cpu_ids, the index of the
	// most significant bit in cpu_possible_mask + 1.
	ApplicationCores uint

	// If UseHostCores is true, Task.CPU() returns the task goroutine's CPU
	// instead of a virtualized CPU number, and Task.CopyToCPUMask() is a
	// no-op. If ApplicationCores is less than hostcpu.MaxPossibleCPU(), it
	// will be overridden.
	UseHostCores bool

	// ExtraAuxv contains additional auxiliary vector entries that are added to
	// each process by the ELF loader.
	ExtraAuxv []arch.AuxEntry

	// Vdso holds the VDSO and its parameter page.
	Vdso *loader.VDSO

	// RootUTSNamespace is the root UTS namespace.
	RootUTSNamespace *UTSNamespace

	// RootIPCNamespace is the root IPC namespace.
	RootIPCNamespace *IPCNamespace

	// RootAbstractSocketNamespace is the root Abstract Socket namespace.
	RootAbstractSocketNamespace *AbstractSocketNamespace

	// PIDNamespace is the root PID namespace.
	PIDNamespace *PIDNamespace
}

// Init initialize the Kernel with no tasks.
//
// Callers must manually set Kernel.Platform and call Kernel.SetMemoryFile
// before calling Init.
func (k *Kernel) Init(args InitKernelArgs) error {
	if args.FeatureSet == nil {
		return fmt.Errorf("FeatureSet is nil")
	}
	if args.Timekeeper == nil {
		return fmt.Errorf("Timekeeper is nil")
	}
	if args.RootUserNamespace == nil {
		return fmt.Errorf("RootUserNamespace is nil")
	}
	if args.ApplicationCores == 0 {
		return fmt.Errorf("ApplicationCores is 0")
	}

	k.featureSet = args.FeatureSet
	k.timekeeper = args.Timekeeper
	k.tasks = newTaskSet(args.PIDNamespace)
	k.rootUserNamespace = args.RootUserNamespace
	k.rootUTSNamespace = args.RootUTSNamespace
	k.rootIPCNamespace = args.RootIPCNamespace
	k.rootAbstractSocketNamespace = args.RootAbstractSocketNamespace
	k.networkStack = args.NetworkStack
	k.applicationCores = args.ApplicationCores
	if args.UseHostCores {
		k.useHostCores = true
		maxCPU, err := hostcpu.MaxPossibleCPU()
		if err != nil {
			return fmt.Errorf("Failed to get maximum CPU number: %v", err)
		}
		minAppCores := uint(maxCPU) + 1
		if k.applicationCores < minAppCores {
			log.Infof("UseHostCores enabled: increasing ApplicationCores from %d to %d", k.applicationCores, minAppCores)
			k.applicationCores = minAppCores
		}
	}
	k.extraAuxv = args.ExtraAuxv
	k.vdso = args.Vdso
	k.realtimeClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Realtime}
	k.monotonicClock = &timekeeperClock{tk: args.Timekeeper, c: sentrytime.Monotonic}
	k.futexes = futex.NewManager()
	k.netlinkPorts = port.New()
	return nil
}

// SaveTo saves the state of k to w.
//
// Preconditions: The kernel must be paused throughout the call to SaveTo.
func (k *Kernel) SaveTo(w io.Writer) error {
	saveStart := time.Now()
	ctx := k.SupervisorContext()

	// Do not allow other Kernel methods to affect it while it's being saved.
	k.extMu.Lock()
	defer k.extMu.Unlock()

	// Stop time.
	k.pauseTimeLocked()
	defer k.resumeTimeLocked()

	// Evict all evictable MemoryFile allocations.
	k.mf.StartEvictions()
	k.mf.WaitForEvictions()

	// Flush write operations on open files so data reaches backing storage.
	// This must come after MemoryFile eviction since eviction may cause file
	// writes.
	if err := k.tasks.flushWritesToFiles(ctx); err != nil {
		return err
	}

	// Remove all epoll waiter objects from underlying wait queues.
	// NOTE: for programs to resume execution in future snapshot scenarios,
	// we will need to re-establish these waiter objects after saving.
	k.tasks.unregisterEpollWaiters()

	// Clear the dirent cache before saving because Dirents must be Loaded in a
	// particular order (parents before children), and Loading dirents from a cache
	// breaks that order.
	if err := k.flushMountSourceRefs(); err != nil {
		return err
	}

	// Ensure that all pending asynchronous work is complete:
	//   - inode and mount release
	//   - asynchronuous IO
	fs.AsyncBarrier()

	// Once all fs work has completed (flushed references have all been released),
	// reset mount mappings. This allows individual mounts to save how inodes map
	// to filesystem resources. Without this, fs.Inodes cannot be restored.
	fs.SaveInodeMappings()

	// Discard unsavable mappings, such as those for host file descriptors.
	// This must be done after waiting for "asynchronous fs work", which
	// includes async I/O that may touch application memory.
	if err := k.invalidateUnsavableMappings(ctx); err != nil {
		return fmt.Errorf("failed to invalidate unsavable mappings: %v", err)
	}

	// Save the CPUID FeatureSet before the rest of the kernel so we can
	// verify its compatibility on restore before attempting to restore the
	// entire kernel, which may fail on an incompatible machine.
	//
	// N.B. This will also be saved along with the full kernel save below.
	cpuidStart := time.Now()
	if err := state.Save(w, k.FeatureSet(), nil); err != nil {
		return err
	}
	log.Infof("CPUID save took [%s].", time.Since(cpuidStart))

	// Save the kernel state.
	kernelStart := time.Now()
	var stats state.Stats
	if err := state.Save(w, k, &stats); err != nil {
		return err
	}
	log.Infof("Kernel save stats: %s", &stats)
	log.Infof("Kernel save took [%s].", time.Since(kernelStart))

	// Save the memory file's state.
	memoryStart := time.Now()
	if err := k.mf.SaveTo(w); err != nil {
		return err
	}
	log.Infof("Memory save took [%s].", time.Since(memoryStart))

	log.Infof("Overall save took [%s].", time.Since(saveStart))

	return nil
}

// flushMountSourceRefs flushes the MountSources for all mounted filesystems
// and open FDs.
func (k *Kernel) flushMountSourceRefs() error {
	// Flush all mount sources for currently mounted filesystems in each task.
	flushed := make(map[*fs.MountNamespace]struct{})
	k.tasks.mu.RLock()
	k.tasks.forEachThreadGroupLocked(func(tg *ThreadGroup) {
		if _, ok := flushed[tg.mounts]; ok {
			// Already flushed.
			return
		}
		tg.mounts.FlushMountSourceRefs()
		flushed[tg.mounts] = struct{}{}
	})
	k.tasks.mu.RUnlock()

	// There may be some open FDs whose filesystems have been unmounted. We
	// must flush those as well.
	return k.tasks.forEachFDPaused(func(file *fs.File) error {
		file.Dirent.Inode.MountSource.FlushDirentRefs()
		return nil
	})
}

// forEachFDPaused applies the given function to each open file descriptor in each
// task.
//
// Precondition: Must be called with the kernel paused.
func (ts *TaskSet) forEachFDPaused(f func(*fs.File) error) (err error) {
	ts.mu.RLock()
	defer ts.mu.RUnlock()
	for t := range ts.Root.tids {
		// We can skip locking Task.mu here since the kernel is paused.
		if t.fdTable == nil {
			continue
		}
		t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
			if lastErr := f(file); lastErr != nil && err == nil {
				err = lastErr
			}
		})
	}
	return err
}

func (ts *TaskSet) flushWritesToFiles(ctx context.Context) error {
	return ts.forEachFDPaused(func(file *fs.File) error {
		if flags := file.Flags(); !flags.Write {
			return nil
		}
		if sattr := file.Dirent.Inode.StableAttr; !fs.IsFile(sattr) && !fs.IsDir(sattr) {
			return nil
		}
		// Here we need all metadata synced.
		syncErr := file.Fsync(ctx, 0, fs.FileMaxOffset, fs.SyncAll)
		if err := fs.SaveFileFsyncError(syncErr); err != nil {
			name, _ := file.Dirent.FullName(nil /* root */)
			// Wrap this error in ErrSaveRejection
			// so that it will trigger a save
			// error, rather than a panic. This
			// also allows us to distinguish Fsync
			// errors from state file errors in
			// state.Save.
			return fs.ErrSaveRejection{
				Err: fmt.Errorf("%q was not sufficiently synced: %v", name, err),
			}
		}
		return nil
	})
}

// Preconditions: The kernel must be paused.
func (k *Kernel) invalidateUnsavableMappings(ctx context.Context) error {
	invalidated := make(map[*mm.MemoryManager]struct{})
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()
	for t := range k.tasks.Root.tids {
		// We can skip locking Task.mu here since the kernel is paused.
		if mm := t.tc.MemoryManager; mm != nil {
			if _, ok := invalidated[mm]; !ok {
				if err := mm.InvalidateUnsavable(ctx); err != nil {
					return err
				}
				invalidated[mm] = struct{}{}
			}
		}
		// I really wish we just had a sync.Map of all MMs...
		if r, ok := t.runState.(*runSyscallAfterExecStop); ok {
			if err := r.tc.MemoryManager.InvalidateUnsavable(ctx); err != nil {
				return err
			}
		}
	}
	return nil
}

func (ts *TaskSet) unregisterEpollWaiters() {
	ts.mu.RLock()
	defer ts.mu.RUnlock()
	for t := range ts.Root.tids {
		// We can skip locking Task.mu here since the kernel is paused.
		if t.fdTable != nil {
			t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
				if e, ok := file.FileOperations.(*epoll.EventPoll); ok {
					e.UnregisterEpollWaiters()
				}
			})
		}
	}
}

// LoadFrom returns a new Kernel loaded from args.
func (k *Kernel) LoadFrom(r io.Reader, net inet.Stack, clocks sentrytime.Clocks) error {
	loadStart := time.Now()

	k.networkStack = net

	initAppCores := k.applicationCores

	// Load the pre-saved CPUID FeatureSet.
	//
	// N.B. This was also saved along with the full kernel below, so we
	// don't need to explicitly install it in the Kernel.
	cpuidStart := time.Now()
	var features cpuid.FeatureSet
	if err := state.Load(r, &features, nil); err != nil {
		return err
	}
	log.Infof("CPUID load took [%s].", time.Since(cpuidStart))

	// Verify that the FeatureSet is usable on this host. We do this before
	// Kernel load so that the explicit CPUID mismatch error has priority
	// over floating point state restore errors that may occur on load on
	// an incompatible machine.
	if err := features.CheckHostCompatible(); err != nil {
		return err
	}

	// Load the kernel state.
	kernelStart := time.Now()
	var stats state.Stats
	if err := state.Load(r, k, &stats); err != nil {
		return err
	}
	log.Infof("Kernel load stats: %s", &stats)
	log.Infof("Kernel load took [%s].", time.Since(kernelStart))

	// Load the memory file's state.
	memoryStart := time.Now()
	if err := k.mf.LoadFrom(r); err != nil {
		return err
	}
	log.Infof("Memory load took [%s].", time.Since(memoryStart))

	log.Infof("Overall load took [%s]", time.Since(loadStart))

	k.Timekeeper().SetClocks(clocks)
	if net != nil {
		net.Resume()
	}

	// Ensure that all pending asynchronous work is complete:
	//   - namedpipe opening
	//   - inode file opening
	if err := fs.AsyncErrorBarrier(); err != nil {
		return err
	}

	tcpip.AsyncLoading.Wait()

	log.Infof("Overall load took [%s] after async work", time.Since(loadStart))

	// Applications may size per-cpu structures based on k.applicationCores, so
	// it can't change across save/restore. When we are virtualizing CPU
	// numbers, this isn't a problem. However, when we are exposing host CPU
	// assignments, we can't tolerate an increase in the number of host CPUs,
	// which could result in getcpu(2) returning CPUs that applications expect
	// not to exist.
	if k.useHostCores && initAppCores > k.applicationCores {
		return fmt.Errorf("UseHostCores enabled: can't increase ApplicationCores from %d to %d after restore", k.applicationCores, initAppCores)
	}

	return nil
}

// UniqueID returns a unique identifier.
func (k *Kernel) UniqueID() uint64 {
	id := atomic.AddUint64(&k.uniqueID, 1)
	if id == 0 {
		panic("unique identifier generator wrapped around")
	}
	return id
}

// CreateProcessArgs holds arguments to kernel.CreateProcess.
type CreateProcessArgs struct {
	// Filename is the filename to load as the init binary.
	//
	// If this is provided as "", File will be checked, then the file will be
	// guessed via Argv[0].
	Filename string

	// File is a passed host FD pointing to a file to load as the init binary.
	//
	// This is checked if and only if Filename is "".
	File *fs.File

	// Argvv is a list of arguments.
	Argv []string

	// Envv is a list of environment variables.
	Envv []string

	// WorkingDirectory is the initial working directory.
	//
	// This defaults to the root if empty.
	WorkingDirectory string

	// Credentials is the initial credentials.
	Credentials *auth.Credentials

	// FDTable is the initial set of file descriptors. If CreateProcess succeeds,
	// it takes a reference on FDTable.
	FDTable *FDTable

	// Umask is the initial umask.
	Umask uint

	// Limits is the initial resource limits.
	Limits *limits.LimitSet

	// MaxSymlinkTraversals is the maximum number of symlinks to follow
	// during resolution.
	MaxSymlinkTraversals uint

	// UTSNamespace is the initial UTS namespace.
	UTSNamespace *UTSNamespace

	// IPCNamespace is the initial IPC namespace.
	IPCNamespace *IPCNamespace

	// PIDNamespace is the initial PID Namespace.
	PIDNamespace *PIDNamespace

	// AbstractSocketNamespace is the initial Abstract Socket namespace.
	AbstractSocketNamespace *AbstractSocketNamespace

	// MountNamespace optionally contains the mount namespace for this
	// process. If nil, the init process's mount namespace is used.
	//
	// Anyone setting MountNamespace must donate a reference (i.e.
	// increment it).
	MountNamespace *fs.MountNamespace

	// ContainerID is the container that the process belongs to.
	ContainerID string
}

// NewContext returns a context.Context that represents the task that will be
// created by args.NewContext(k).
func (args *CreateProcessArgs) NewContext(k *Kernel) *createProcessContext {
	return &createProcessContext{
		Logger: log.Log(),
		k:      k,
		args:   args,
	}
}

// createProcessContext is a context.Context that represents the context
// associated with a task that is being created.
type createProcessContext struct {
	context.NoopSleeper
	log.Logger
	k    *Kernel
	args *CreateProcessArgs
}

// Value implements context.Context.Value.
func (ctx *createProcessContext) Value(key interface{}) interface{} {
	switch key {
	case CtxKernel:
		return ctx.k
	case CtxPIDNamespace:
		return ctx.args.PIDNamespace
	case CtxUTSNamespace:
		return ctx.args.UTSNamespace
	case CtxIPCNamespace:
		return ctx.args.IPCNamespace
	case auth.CtxCredentials:
		return ctx.args.Credentials
	case fs.CtxRoot:
		if ctx.args.MountNamespace != nil {
			// MountNamespace.Root() will take a reference on the root
			// dirent for us.
			return ctx.args.MountNamespace.Root()
		}
		return nil
	case fs.CtxDirentCacheLimiter:
		return ctx.k.DirentCacheLimiter
	case ktime.CtxRealtimeClock:
		return ctx.k.RealtimeClock()
	case limits.CtxLimits:
		return ctx.args.Limits
	case pgalloc.CtxMemoryFile:
		return ctx.k.mf
	case pgalloc.CtxMemoryFileProvider:
		return ctx.k
	case platform.CtxPlatform:
		return ctx.k
	case uniqueid.CtxGlobalUniqueID:
		return ctx.k.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.k
	case uniqueid.CtxInotifyCookie:
		return ctx.k.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.k
	default:
		return nil
	}
}

// CreateProcess creates a new task in a new thread group with the given
// options. The new task has no parent and is in the root PID namespace.
//
// If k.Start() has already been called, then the created process must be
// started by calling kernel.StartProcess(tg).
//
// If k.Start() has not yet been called, then the created task will begin
// running when k.Start() is called.
//
// CreateProcess has no analogue in Linux; it is used to create the initial
// application task, as well as processes started by the control server.
func (k *Kernel) CreateProcess(args CreateProcessArgs) (*ThreadGroup, ThreadID, error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	log.Infof("EXEC: %v", args.Argv)

	// Grab the mount namespace.
	mounts := args.MountNamespace
	if mounts == nil {
		mounts = k.GlobalInit().Leader().MountNamespace()
		mounts.IncRef()
	}

	tg := k.newThreadGroup(mounts, args.PIDNamespace, NewSignalHandlers(), linux.SIGCHLD, args.Limits, k.monotonicClock)
	ctx := args.NewContext(k)

	// Get the root directory from the MountNamespace.
	root := mounts.Root()
	// The call to newFSContext below will take a reference on root, so we
	// don't need to hold this one.
	defer root.DecRef()

	// Grab the working directory.
	remainingTraversals := uint(args.MaxSymlinkTraversals)
	wd := root // Default.
	if args.WorkingDirectory != "" {
		var err error
		wd, err = mounts.FindInode(ctx, root, nil, args.WorkingDirectory, &remainingTraversals)
		if err != nil {
			return nil, 0, fmt.Errorf("failed to find initial working directory %q: %v", args.WorkingDirectory, err)
		}
		defer wd.DecRef()
	}

	// Check which file to start from.
	switch {
	case args.Filename != "":
		// If a filename is given, take that.
		// Set File to nil so we resolve the path in LoadTaskImage.
		args.File = nil
	case args.File != nil:
		// If File is set, take the File provided directly.
	default:
		// Otherwise look at Argv and see if the first argument is a valid path.
		if len(args.Argv) == 0 {
			return nil, 0, fmt.Errorf("no filename or command provided")
		}
		if !filepath.IsAbs(args.Argv[0]) {
			return nil, 0, fmt.Errorf("'%s' is not an absolute path", args.Argv[0])
		}
		args.Filename = args.Argv[0]
	}

	// Create a fresh task context.
	remainingTraversals = uint(args.MaxSymlinkTraversals)

	tc, se := k.LoadTaskImage(ctx, mounts, root, wd, &remainingTraversals, args.Filename, args.File, args.Argv, args.Envv, k.featureSet)
	if se != nil {
		return nil, 0, errors.New(se.String())
	}

	// Take a reference on the FDTable, which will be transferred to
	// TaskSet.NewTask().
	args.FDTable.IncRef()

	// Create the task.
	config := &TaskConfig{
		Kernel:                  k,
		ThreadGroup:             tg,
		TaskContext:             tc,
		FSContext:               newFSContext(root, wd, args.Umask),
		FDTable:                 args.FDTable,
		Credentials:             args.Credentials,
		AllowedCPUMask:          sched.NewFullCPUSet(k.applicationCores),
		UTSNamespace:            args.UTSNamespace,
		IPCNamespace:            args.IPCNamespace,
		AbstractSocketNamespace: args.AbstractSocketNamespace,
		ContainerID:             args.ContainerID,
	}
	if _, err := k.tasks.NewTask(config); err != nil {
		return nil, 0, err
	}

	// Success.
	tgid := k.tasks.Root.IDOfThreadGroup(tg)
	if k.globalInit == nil {
		k.globalInit = tg
	}
	return tg, tgid, nil
}

// StartProcess starts running a process that was created with CreateProcess.
func (k *Kernel) StartProcess(tg *ThreadGroup) {
	t := tg.Leader()
	tid := k.tasks.Root.IDOfTask(t)
	t.Start(tid)
}

// Start starts execution of all tasks in k.
//
// Preconditions: Start may be called exactly once.
func (k *Kernel) Start() error {
	k.extMu.Lock()
	defer k.extMu.Unlock()

	if k.globalInit == nil {
		return fmt.Errorf("kernel contains no tasks")
	}
	if k.started {
		return fmt.Errorf("kernel already started")
	}

	k.started = true
	k.cpuClockTicker = ktime.NewTimer(k.monotonicClock, newKernelCPUClockTicker(k))
	k.cpuClockTicker.Swap(ktime.Setting{
		Enabled: true,
		Period:  linux.ClockTick,
	})
	// If k was created by LoadKernelFrom, timers were stopped during
	// Kernel.SaveTo and need to be resumed. If k was created by NewKernel,
	// this is a no-op.
	k.resumeTimeLocked()
	// Start task goroutines.
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()
	for t, tid := range k.tasks.Root.tids {
		t.Start(tid)
	}
	return nil
}

// pauseTimeLocked pauses all Timers and Timekeeper updates.
//
// Preconditions: Any task goroutines running in k must be stopped. k.extMu
// must be locked.
func (k *Kernel) pauseTimeLocked() {
	// k.cpuClockTicker may be nil since Kernel.SaveTo() may be called before
	// Kernel.Start().
	if k.cpuClockTicker != nil {
		k.cpuClockTicker.Pause()
	}

	// By precondition, nothing else can be interacting with PIDNamespace.tids
	// or FDTable.files, so we can iterate them without synchronization. (We
	// can't hold the TaskSet mutex when pausing thread group timers because
	// thread group timers call ThreadGroup.SendSignal, which takes the TaskSet
	// mutex, while holding the Timer mutex.)
	for t := range k.tasks.Root.tids {
		if t == t.tg.leader {
			t.tg.itimerRealTimer.Pause()
			for _, it := range t.tg.timers {
				it.PauseTimer()
			}
		}
		// This means we'll iterate FDTables shared by multiple tasks repeatedly,
		// but ktime.Timer.Pause is idempotent so this is harmless.
		if t.fdTable != nil {
			t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
				if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
					tfd.PauseTimer()
				}
			})
		}
	}
	k.timekeeper.PauseUpdates()
}

// resumeTimeLocked resumes all Timers and Timekeeper updates. If
// pauseTimeLocked has not been previously called, resumeTimeLocked has no
// effect.
//
// Preconditions: Any task goroutines running in k must be stopped. k.extMu
// must be locked.
func (k *Kernel) resumeTimeLocked() {
	if k.cpuClockTicker != nil {
		k.cpuClockTicker.Resume()
	}

	k.timekeeper.ResumeUpdates()
	for t := range k.tasks.Root.tids {
		if t == t.tg.leader {
			t.tg.itimerRealTimer.Resume()
			for _, it := range t.tg.timers {
				it.ResumeTimer()
			}
		}
		if t.fdTable != nil {
			t.fdTable.forEach(func(_ int32, file *fs.File, _ FDFlags) {
				if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok {
					tfd.ResumeTimer()
				}
			})
		}
	}
}

func (k *Kernel) incRunningTasks() {
	for {
		tasks := atomic.LoadInt64(&k.runningTasks)
		if tasks != 0 {
			// Standard case. Simply increment.
			if !atomic.CompareAndSwapInt64(&k.runningTasks, tasks, tasks+1) {
				continue
			}
			return
		}

		// Transition from 0 -> 1. Synchronize with other transitions and timer.
		k.runningTasksMu.Lock()
		tasks = atomic.LoadInt64(&k.runningTasks)
		if tasks != 0 {
			// We're no longer the first task, no need to
			// re-enable.
			atomic.AddInt64(&k.runningTasks, 1)
			k.runningTasksMu.Unlock()
			return
		}

		if !k.cpuClockTickerDisabled {
			// Timer was never disabled.
			atomic.StoreInt64(&k.runningTasks, 1)
			k.runningTasksMu.Unlock()
			return
		}

		// We need to update cpuClock for all of the ticks missed while we
		// slept, and then re-enable the timer.
		//
		// The Notify in Swap isn't sufficient. kernelCPUClockTicker.Notify
		// always increments cpuClock by 1 regardless of the number of
		// expirations as a heuristic to avoid over-accounting in cases of CPU
		// throttling.
		//
		// We want to cover the normal case, when all time should be accounted,
		// so we increment for all expirations. Throttling is less concerning
		// here because the ticker is only disabled from Notify. This means
		// that Notify must schedule and compensate for the throttled period
		// before the timer is disabled. Throttling while the timer is disabled
		// doesn't matter, as nothing is running or reading cpuClock anyways.
		//
		// S/R also adds complication, as there are two cases. Recall that
		// monotonicClock will jump forward on restore.
		//
		// 1. If the ticker is enabled during save, then on Restore Notify is
		// called with many expirations, covering the time jump, but cpuClock
		// is only incremented by 1.
		//
		// 2. If the ticker is disabled during save, then after Restore the
		// first wakeup will call this function and cpuClock will be
		// incremented by the number of expirations across the S/R.
		//
		// These cause very different value of cpuClock. But again, since
		// nothing was running while the ticker was disabled, those differences
		// don't matter.
		setting, exp := k.cpuClockTickerSetting.At(k.monotonicClock.Now())
		if exp > 0 {
			atomic.AddUint64(&k.cpuClock, exp)
		}

		// Now that cpuClock is updated it is safe to allow other tasks to
		// transition to running.
		atomic.StoreInt64(&k.runningTasks, 1)

		// N.B. we must unlock before calling Swap to maintain lock ordering.
		//
		// cpuClockTickerDisabled need not wait until after Swap to become
		// true. It is sufficient that the timer *will* be enabled.
		k.cpuClockTickerDisabled = false
		k.runningTasksMu.Unlock()

		// This won't call Notify (unless it's been ClockTick since setting.At
		// above). This means we skip the thread group work in Notify. However,
		// since nothing was running while we were disabled, none of the timers
		// could have expired.
		k.cpuClockTicker.Swap(setting)

		return
	}
}

func (k *Kernel) decRunningTasks() {
	tasks := atomic.AddInt64(&k.runningTasks, -1)
	if tasks < 0 {
		panic(fmt.Sprintf("Invalid running count %d", tasks))
	}

	// Nothing to do. The next CPU clock tick will disable the timer if
	// there is still nothing running. This provides approximately one tick
	// of slack in which we can switch back and forth between idle and
	// active without an expensive transition.
}

// WaitExited blocks until all tasks in k have exited.
func (k *Kernel) WaitExited() {
	k.tasks.liveGoroutines.Wait()
}

// Kill requests that all tasks in k immediately exit as if group exiting with
// status es. Kill does not wait for tasks to exit.
func (k *Kernel) Kill(es ExitStatus) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.Kill(es)
}

// Pause requests that all tasks in k temporarily stop executing, and blocks
// until all tasks in k have stopped. Multiple calls to Pause nest and require
// an equal number of calls to Unpause to resume execution.
func (k *Kernel) Pause() {
	k.extMu.Lock()
	k.tasks.BeginExternalStop()
	k.extMu.Unlock()
	k.tasks.runningGoroutines.Wait()
}

// Unpause ends the effect of a previous call to Pause. If Unpause is called
// without a matching preceding call to Pause, Unpause may panic.
func (k *Kernel) Unpause() {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.EndExternalStop()
}

// SendExternalSignal injects a signal into the kernel.
//
// context is used only for debugging to describe how the signal was received.
//
// Preconditions: Kernel must have an init process.
func (k *Kernel) SendExternalSignal(info *arch.SignalInfo, context string) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.sendExternalSignal(info, context)
}

// SendContainerSignal sends the given signal to all processes inside the
// namespace that match the given container ID.
func (k *Kernel) SendContainerSignal(cid string, info *arch.SignalInfo) error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	k.tasks.mu.RLock()
	defer k.tasks.mu.RUnlock()

	var lastErr error
	for tg := range k.tasks.Root.tgids {
		if tg.leader.ContainerID() == cid {
			tg.signalHandlers.mu.Lock()
			infoCopy := *info
			if err := tg.leader.sendSignalLocked(&infoCopy, true /*group*/); err != nil {
				lastErr = err
			}
			tg.signalHandlers.mu.Unlock()
		}
	}
	return lastErr
}

// FeatureSet returns the FeatureSet.
func (k *Kernel) FeatureSet() *cpuid.FeatureSet {
	return k.featureSet
}

// Timekeeper returns the Timekeeper.
func (k *Kernel) Timekeeper() *Timekeeper {
	return k.timekeeper
}

// TaskSet returns the TaskSet.
func (k *Kernel) TaskSet() *TaskSet {
	return k.tasks
}

// RootUserNamespace returns the root UserNamespace.
func (k *Kernel) RootUserNamespace() *auth.UserNamespace {
	return k.rootUserNamespace
}

// RootUTSNamespace returns the root UTSNamespace.
func (k *Kernel) RootUTSNamespace() *UTSNamespace {
	return k.rootUTSNamespace
}

// RootIPCNamespace returns the root IPCNamespace.
func (k *Kernel) RootIPCNamespace() *IPCNamespace {
	return k.rootIPCNamespace
}

// RootPIDNamespace returns the root PIDNamespace.
func (k *Kernel) RootPIDNamespace() *PIDNamespace {
	return k.tasks.Root
}

// RootAbstractSocketNamespace returns the root AbstractSocketNamespace.
func (k *Kernel) RootAbstractSocketNamespace() *AbstractSocketNamespace {
	return k.rootAbstractSocketNamespace
}

// NetworkStack returns the network stack. NetworkStack may return nil if no
// network stack is available.
func (k *Kernel) NetworkStack() inet.Stack {
	return k.networkStack
}

// GlobalInit returns the thread group with ID 1 in the root PID namespace, or
// nil if no such thread group exists. GlobalInit may return a thread group
// containing no tasks if the thread group has already exited.
func (k *Kernel) GlobalInit() *ThreadGroup {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return k.globalInit
}

// ApplicationCores returns the number of CPUs visible to sandboxed
// applications.
func (k *Kernel) ApplicationCores() uint {
	return k.applicationCores
}

// RealtimeClock returns the application CLOCK_REALTIME clock.
func (k *Kernel) RealtimeClock() ktime.Clock {
	return k.realtimeClock
}

// MonotonicClock returns the application CLOCK_MONOTONIC clock.
func (k *Kernel) MonotonicClock() ktime.Clock {
	return k.monotonicClock
}

// CPUClockNow returns the current value of k.cpuClock.
func (k *Kernel) CPUClockNow() uint64 {
	return atomic.LoadUint64(&k.cpuClock)
}

// Syslog returns the syslog.
func (k *Kernel) Syslog() *syslog {
	return &k.syslog
}

// GenerateInotifyCookie generates a unique inotify event cookie.
//
// Returned values may overlap with previously returned values if the value
// space is exhausted. 0 is not a valid cookie value, all other values
// representable in a uint32 are allowed.
func (k *Kernel) GenerateInotifyCookie() uint32 {
	id := atomic.AddUint32(&k.nextInotifyCookie, 1)
	// Wrap-around is explicitly allowed for inotify event cookies.
	if id == 0 {
		id = atomic.AddUint32(&k.nextInotifyCookie, 1)
	}
	return id
}

// NetlinkPorts returns the netlink port manager.
func (k *Kernel) NetlinkPorts() *port.Manager {
	return k.netlinkPorts
}

// SaveError returns the sandbox error that caused the kernel to exit during
// save.
func (k *Kernel) SaveError() error {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	return k.saveErr
}

// SetSaveError sets the sandbox error that caused the kernel to exit during
// save, if one is not already set.
func (k *Kernel) SetSaveError(err error) {
	k.extMu.Lock()
	defer k.extMu.Unlock()
	if k.saveErr == nil {
		k.saveErr = err
	}
}

var _ tcpip.Clock = (*Kernel)(nil)

// NowNanoseconds implements tcpip.Clock.NowNanoseconds.
func (k *Kernel) NowNanoseconds() int64 {
	now, err := k.timekeeper.GetTime(sentrytime.Realtime)
	if err != nil {
		panic("Kernel.NowNanoseconds: " + err.Error())
	}
	return now
}

// NowMonotonic implements tcpip.Clock.NowMonotonic.
func (k *Kernel) NowMonotonic() int64 {
	now, err := k.timekeeper.GetTime(sentrytime.Monotonic)
	if err != nil {
		panic("Kernel.NowMonotonic: " + err.Error())
	}
	return now
}

// SetMemoryFile sets Kernel.mf. SetMemoryFile must be called before Init or
// LoadFrom.
func (k *Kernel) SetMemoryFile(mf *pgalloc.MemoryFile) {
	k.mf = mf
}

// MemoryFile implements pgalloc.MemoryFileProvider.MemoryFile.
func (k *Kernel) MemoryFile() *pgalloc.MemoryFile {
	return k.mf
}

// SupervisorContext returns a Context with maximum privileges in k. It should
// only be used by goroutines outside the control of the emulated kernel
// defined by e.
//
// Callers are responsible for ensuring that the returned Context is not used
// concurrently with changes to the Kernel.
func (k *Kernel) SupervisorContext() context.Context {
	return supervisorContext{
		Logger: log.Log(),
		k:      k,
	}
}

// SocketEntry represents a socket recorded in Kernel.sockets. It implements
// refs.WeakRefUser for sockets stored in the socket table.
//
// +stateify savable
type SocketEntry struct {
	socketEntry
	k    *Kernel
	Sock *refs.WeakRef
	ID   uint64 // Socket table entry number.
}

// WeakRefGone implements refs.WeakRefUser.WeakRefGone.
func (s *SocketEntry) WeakRefGone() {
	s.k.extMu.Lock()
	s.k.sockets.Remove(s)
	s.k.extMu.Unlock()
}

// RecordSocket adds a socket to the system-wide socket table for tracking.
//
// Precondition: Caller must hold a reference to sock.
func (k *Kernel) RecordSocket(sock *fs.File) {
	k.extMu.Lock()
	id := k.nextSocketEntry
	k.nextSocketEntry++
	s := &SocketEntry{k: k, ID: id}
	s.Sock = refs.NewWeakRef(sock, s)
	k.sockets.PushBack(s)
	k.extMu.Unlock()
}

// ListSockets returns a snapshot of all sockets.
func (k *Kernel) ListSockets() []*SocketEntry {
	k.extMu.Lock()
	var socks []*SocketEntry
	for s := k.sockets.Front(); s != nil; s = s.Next() {
		socks = append(socks, s)
	}
	k.extMu.Unlock()
	return socks
}

type supervisorContext struct {
	context.NoopSleeper
	log.Logger
	k *Kernel
}

// Value implements context.Context.
func (ctx supervisorContext) Value(key interface{}) interface{} {
	switch key {
	case CtxCanTrace:
		// The supervisor context can trace anything. (None of
		// supervisorContext's users are expected to invoke ptrace, but ptrace
		// permissions are required for certain file accesses.)
		return func(*Task, bool) bool { return true }
	case CtxKernel:
		return ctx.k
	case CtxPIDNamespace:
		return ctx.k.tasks.Root
	case CtxUTSNamespace:
		return ctx.k.rootUTSNamespace
	case CtxIPCNamespace:
		return ctx.k.rootIPCNamespace
	case auth.CtxCredentials:
		// The supervisor context is global root.
		return auth.NewRootCredentials(ctx.k.rootUserNamespace)
	case fs.CtxRoot:
		if ctx.k.globalInit != nil {
			return ctx.k.globalInit.mounts.Root()
		}
		return nil
	case fs.CtxDirentCacheLimiter:
		return ctx.k.DirentCacheLimiter
	case ktime.CtxRealtimeClock:
		return ctx.k.RealtimeClock()
	case limits.CtxLimits:
		// No limits apply.
		return limits.NewLimitSet()
	case pgalloc.CtxMemoryFile:
		return ctx.k.mf
	case pgalloc.CtxMemoryFileProvider:
		return ctx.k
	case platform.CtxPlatform:
		return ctx.k
	case uniqueid.CtxGlobalUniqueID:
		return ctx.k.UniqueID()
	case uniqueid.CtxGlobalUniqueIDProvider:
		return ctx.k
	case uniqueid.CtxInotifyCookie:
		return ctx.k.GenerateInotifyCookie()
	case unimpl.CtxEvents:
		return ctx.k
	default:
		return nil
	}
}

// Rate limits for the number of unimplemented syscall events.
const (
	unimplementedSyscallsMaxRate = 100  // events per second
	unimplementedSyscallBurst    = 1000 // events
)

// EmitUnimplementedEvent emits an UnimplementedSyscall event via the event
// channel.
func (k *Kernel) EmitUnimplementedEvent(ctx context.Context) {
	k.unimplementedSyscallEmitterOnce.Do(func() {
		k.unimplementedSyscallEmitter = eventchannel.RateLimitedEmitterFrom(eventchannel.DefaultEmitter, unimplementedSyscallsMaxRate, unimplementedSyscallBurst)
	})

	t := TaskFromContext(ctx)
	k.unimplementedSyscallEmitter.Emit(&uspb.UnimplementedSyscall{
		Tid:       int32(t.ThreadID()),
		Registers: t.Arch().StateData().Proto(),
	})
}