gvisor/pkg/sentry/kernel/task_signals.go

// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package kernel
// This file defines the behavior of task signal handling.
import (
"fmt"
"sync/atomic"
"time"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/eventchannel"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
ucspb "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/uncaught_signal_go_proto"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
// SignalAction is an internal signal action.
type SignalAction int
// Available signal actions.
// Note that although we refer to the complete set internally,
// the application is only capable of using the Default and
// Ignore actions from the system call interface.
const (
SignalActionTerm SignalAction = iota
SignalActionCore
SignalActionStop
SignalActionIgnore
SignalActionHandler
)
// Default signal handler actions. Note that for all signals other than
// SIGKILL and SIGSTOP, these can be overridden by the app.
var defaultActions = map[linux.Signal]SignalAction{
// POSIX.1-1990 standard.
linux.SIGHUP: SignalActionTerm,
linux.SIGINT: SignalActionTerm,
linux.SIGQUIT: SignalActionCore,
linux.SIGILL: SignalActionCore,
linux.SIGABRT: SignalActionCore,
linux.SIGFPE: SignalActionCore,
linux.SIGKILL: SignalActionTerm, // but see ThreadGroup.applySignalSideEffectsLocked
linux.SIGSEGV: SignalActionCore,
linux.SIGPIPE: SignalActionTerm,
linux.SIGALRM: SignalActionTerm,
linux.SIGTERM: SignalActionTerm,
linux.SIGUSR1: SignalActionTerm,
linux.SIGUSR2: SignalActionTerm,
linux.SIGCHLD: SignalActionIgnore,
linux.SIGCONT: SignalActionIgnore, // but see ThreadGroup.applySignalSideEffectsLocked
linux.SIGSTOP: SignalActionStop,
linux.SIGTSTP: SignalActionStop,
linux.SIGTTIN: SignalActionStop,
linux.SIGTTOU: SignalActionStop,
// POSIX.1-2001 standard.
linux.SIGBUS: SignalActionCore,
linux.SIGPROF: SignalActionTerm,
linux.SIGSYS: SignalActionCore,
linux.SIGTRAP: SignalActionCore,
linux.SIGURG: SignalActionIgnore,
linux.SIGVTALRM: SignalActionTerm,
linux.SIGXCPU: SignalActionCore,
linux.SIGXFSZ: SignalActionCore,
// The rest on Linux.
linux.SIGSTKFLT: SignalActionTerm,
linux.SIGIO: SignalActionTerm,
linux.SIGPWR: SignalActionTerm,
linux.SIGWINCH: SignalActionIgnore,
}
// computeAction figures out what to do given a signal number
// and an arch.SignalAct. SIGSTOP always results in a SignalActionStop,
// and SIGKILL always results in a SignalActionTerm.
// Signal 0 is always ignored; it is conventionally used only to check for the
// existence of a process (e.g. kill(pid, 0)) and is never actually delivered.
//
// For any other signal, act.Handler determines what happens next.
// If act.Handler is:
// 0 (SIG_DFL), the default action is taken;
// 1 (SIG_IGN), the signal is ignored;
// anything else, the function returns SignalActionHandler.
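//
// For example (a sketch; handlerAddr stands in for any nonzero user handler
// address, and arch.SignalActDefault/arch.SignalActIgnore correspond to
// Linux's SIG_DFL/SIG_IGN):
//
//	computeAction(linux.SIGCHLD, arch.SignalAct{Handler: arch.SignalActDefault}) // SignalActionIgnore
//	computeAction(linux.SIGCHLD, arch.SignalAct{Handler: handlerAddr})           // SignalActionHandler
//	computeAction(linux.SIGKILL, arch.SignalAct{Handler: handlerAddr})           // SignalActionTerm (SIGKILL cannot be caught)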
func computeAction(sig linux.Signal, act arch.SignalAct) SignalAction {
switch sig {
case linux.SIGSTOP:
return SignalActionStop
case linux.SIGKILL:
return SignalActionTerm
case linux.Signal(0):
return SignalActionIgnore
}
switch act.Handler {
case arch.SignalActDefault:
return defaultActions[sig]
case arch.SignalActIgnore:
return SignalActionIgnore
default:
return SignalActionHandler
}
}
// UnblockableSignals contains the set of signals which cannot be blocked.
var UnblockableSignals = linux.MakeSignalSet(linux.SIGKILL, linux.SIGSTOP)
// StopSignals is the set of signals whose default action is SignalActionStop.
var StopSignals = linux.MakeSignalSet(linux.SIGSTOP, linux.SIGTSTP, linux.SIGTTIN, linux.SIGTTOU)
// dequeueSignalLocked returns a pending signal that is *not* included in mask.
// If there are no pending unmasked signals, dequeueSignalLocked returns nil.
//
// Preconditions: t.tg.signalHandlers.mu must be locked.
func (t *Task) dequeueSignalLocked(mask linux.SignalSet) *arch.SignalInfo {
if info := t.pendingSignals.dequeue(mask); info != nil {
return info
}
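// Task-private signals take precedence over signals pending for the whole
// thread group, matching Linux's dequeue order in
// kernel/signal.c:dequeue_signal().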
return t.tg.pendingSignals.dequeue(mask)
}
// discardSpecificLocked removes all instances of the given signal from all
// signal queues in tg.
//
// Preconditions: The signal mutex must be locked.
func (tg *ThreadGroup) discardSpecificLocked(sig linux.Signal) {
tg.pendingSignals.discardSpecific(sig)
for t := tg.tasks.Front(); t != nil; t = t.Next() {
t.pendingSignals.discardSpecific(sig)
}
}
// PendingSignals returns the set of pending signals.
func (t *Task) PendingSignals() linux.SignalSet {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
return t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet
}
// deliverSignal delivers the given signal and returns the following run state.
func (t *Task) deliverSignal(info *arch.SignalInfo, act arch.SignalAct) taskRunState {
sigact := computeAction(linux.Signal(info.Signo), act)
if t.haveSyscallReturn {
if sre, ok := SyscallRestartErrnoFromReturn(t.Arch().Return()); ok {
// Signals that are ignored, cause a thread group stop, or
// terminate the thread group do not interact with interrupted
// syscalls; in Linux terms, they are never returned to the signal
// handling path from get_signal => get_signal_to_deliver. The
// behavior of an interrupted syscall is determined by the first
// signal that is actually handled (by userspace).
if sigact == SignalActionHandler {
switch {
case sre == ERESTARTNOHAND:
fallthrough
case sre == ERESTART_RESTARTBLOCK:
fallthrough
case (sre == ERESTARTSYS && !act.IsRestart()):
t.Debugf("Not restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
t.Arch().SetReturn(uintptr(-t.ExtractErrno(syserror.EINTR, -1)))
default:
t.Debugf("Restarting syscall %d after errno %d: interrupted by signal %d", t.Arch().SyscallNo(), sre, info.Signo)
t.Arch().RestartSyscall()
}
}
}
}
switch sigact {
case SignalActionTerm, SignalActionCore:
// "Default action is to terminate the process." - signal(7)
t.Debugf("Signal %d: terminating thread group", info.Signo)
// Emit an event channel message related to this uncaught signal.
ucs := &ucspb.UncaughtSignal{
Tid: int32(t.Kernel().TaskSet().Root.IDOfTask(t)),
Pid: int32(t.Kernel().TaskSet().Root.IDOfThreadGroup(t.ThreadGroup())),
Registers: t.Arch().StateData().Proto(),
SignalNumber: info.Signo,
}
// Attach a fault address if appropriate.
switch linux.Signal(info.Signo) {
case linux.SIGSEGV, linux.SIGFPE, linux.SIGILL, linux.SIGTRAP, linux.SIGBUS:
ucs.FaultAddr = info.Addr()
}
eventchannel.Emit(ucs)
t.PrepareGroupExit(ExitStatus{Signo: int(info.Signo)})
return (*runExit)(nil)
case SignalActionStop:
// "Default action is to stop the process."
t.initiateGroupStop(info)
case SignalActionIgnore:
// "Default action is to ignore the signal."
t.Debugf("Signal %d: ignored", info.Signo)
case SignalActionHandler:
// Try to deliver the signal to the user-configured handler.
t.Debugf("Signal %d: delivering to handler", info.Signo)
if err := t.deliverSignalToHandler(info, act); err != nil {
// This is not a warning; it can occur during normal operation.
t.Debugf("Failed to deliver signal %+v to user handler: %v", info, err)
// Send a forced SIGSEGV. If the signal that couldn't be delivered
// was a SIGSEGV, force the handler to SIG_DFL.
t.forceSignal(linux.SIGSEGV, linux.Signal(info.Signo) == linux.SIGSEGV /* unconditional */)
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
}
default:
panic(fmt.Sprintf("Unknown signal action %+v, %d?", info, computeAction(linux.Signal(info.Signo), act)))
}
return (*runInterrupt)(nil)
}
// deliverSignalToHandler changes the task's userspace state to enter the given
// user-configured handler for the given signal.
func (t *Task) deliverSignalToHandler(info *arch.SignalInfo, act arch.SignalAct) error {
// Signal delivery to an application handler interrupts restartable
// sequences.
t.rseqInterrupt()
// Are we executing on the main stack, or on the provided alternate stack?
sp := usermem.Addr(t.Arch().Stack())
// N.B. This is a *copy* of the alternate stack that the user's signal
// handler expects to see in its ucontext (even if it's not in use).
alt := t.signalStack
if act.IsOnStack() && alt.IsEnabled() {
alt.SetOnStack()
if !alt.Contains(sp) {
sp = usermem.Addr(alt.Top())
}
}
// Set up the signal handler. If we have a saved signal mask, the signal
// handler should run with the current mask, but sigreturn should restore
// the saved one.
st := &arch.Stack{t.Arch(), t.MemoryManager(), sp}
mask := t.signalMask
if t.haveSavedSignalMask {
mask = t.savedSignalMask
}
if err := t.Arch().SignalSetup(st, &act, info, &alt, mask); err != nil {
return err
}
t.haveSavedSignalMask = false
// Block the signals in the handler's mask, plus the delivered signal
// itself unless SA_NODEFER is set.
newMask := t.signalMask | act.Mask
if !act.IsNoDefer() {
newMask |= linux.SignalSetOf(linux.Signal(info.Signo))
}
t.SetSignalMask(newMask)
return nil
}
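// ctrlResume is returned by SignalReturn so that the register state just
// restored from the signal frame is not overwritten with a syscall return
// value.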
var ctrlResume = &SyscallControl{ignoreReturn: true}
// SignalReturn implements sigreturn(2) (if rt is false) or rt_sigreturn(2) (if
// rt is true).
func (t *Task) SignalReturn(rt bool) (*SyscallControl, error) {
st := t.Stack()
sigset, alt, err := t.Arch().SignalRestore(st, rt)
if err != nil {
return nil, err
}
// Attempt to record the given signal stack. Note that we silently
// ignore failures here, as does Linux. Only an EFAULT may be
// generated, but SignalRestore has already deserialized the entire
// frame successfully.
t.SetSignalStack(alt)
// Restore our signal mask. SIGKILL and SIGSTOP should not be blocked.
t.SetSignalMask(sigset &^ UnblockableSignals)
return ctrlResume, nil
}
// Sigtimedwait implements the semantics of sigtimedwait(2).
//
// Preconditions: The caller must be running on the task goroutine. t.exitState
// < TaskExitZombie.
func (t *Task) Sigtimedwait(set linux.SignalSet, timeout time.Duration) (*arch.SignalInfo, error) {
// set is the set of signals we're interested in; invert it to get the set
// of signals to block.
mask := ^(set &^ UnblockableSignals)
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
if info := t.dequeueSignalLocked(mask); info != nil {
return info, nil
}
if timeout == 0 {
return nil, syserror.EAGAIN
}
// Unblock signals we're waiting for. Remember the original signal mask so
// that Task.sendSignalTimerLocked doesn't discard ignored signals that
// we're temporarily unblocking.
t.realSignalMask = t.signalMask
t.setSignalMaskLocked(t.signalMask & mask)
// Wait for a timeout or new signal.
t.tg.signalHandlers.mu.Unlock()
_, err := t.BlockWithTimeout(nil, true, timeout)
t.tg.signalHandlers.mu.Lock()
// Restore the original signal mask.
t.setSignalMaskLocked(t.realSignalMask)
t.realSignalMask = 0
if info := t.dequeueSignalLocked(mask); info != nil {
return info, nil
}
if err == syserror.ETIMEDOUT {
return nil, syserror.EAGAIN
}
return nil, err
}
// SendSignal sends the given signal to t.
//
// The following errors may be returned:
//
// syserror.ESRCH - The task has exited.
// syserror.EINVAL - The signal is not valid.
// syserror.EAGAIN - The signal is realtime, and cannot be queued.
//
func (t *Task) SendSignal(info *arch.SignalInfo) error {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
return t.sendSignalLocked(info, false /* group */)
}
// SendGroupSignal sends the given signal to t's thread group.
func (t *Task) SendGroupSignal(info *arch.SignalInfo) error {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
return t.sendSignalLocked(info, true /* group */)
}
// SendSignal sends the given signal to tg, using tg's leader to determine if
// the signal is blocked.
func (tg *ThreadGroup) SendSignal(info *arch.SignalInfo) error {
tg.pidns.owner.mu.RLock()
defer tg.pidns.owner.mu.RUnlock()
tg.signalHandlers.mu.Lock()
defer tg.signalHandlers.mu.Unlock()
return tg.leader.sendSignalLocked(info, true /* group */)
}
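// sendSignalLocked sends info to t (if group is false) or to t's thread group
// (if group is true).
//
// Preconditions: Same as SendSignal and SendGroupSignal: the TaskSet mutex
// must be locked for reading and the signal mutex must be locked.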
func (t *Task) sendSignalLocked(info *arch.SignalInfo, group bool) error {
return t.sendSignalTimerLocked(info, group, nil)
}
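// sendSignalTimerLocked is the common implementation of signal sending.
// timer, if non-nil, is the interval timer that generated the signal; it is
// informed (via signalRejectedLocked) if the signal is discarded rather than
// queued.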
func (t *Task) sendSignalTimerLocked(info *arch.SignalInfo, group bool, timer *IntervalTimer) error {
if t.exitState == TaskExitDead {
return syserror.ESRCH
}
sig := linux.Signal(info.Signo)
if sig == 0 {
return nil
}
if !sig.IsValid() {
return syserror.EINVAL
}
// Signal side effects apply even if the signal is ultimately discarded.
t.tg.applySignalSideEffectsLocked(sig)
// TODO: "Only signals for which the "init" process has established a
// signal handler can be sent to the "init" process by other members of the
// PID namespace. This restriction applies even to privileged processes,
// and prevents other members of the PID namespace from accidentally
// killing the "init" process." - pid_namespaces(7). We don't currently do
// this for child namespaces, though we should; we also don't do this for
// the root namespace (the same restriction applies to global init on
// Linux), where whether or not we should is much murkier. In practice,
// most sandboxed applications are not prepared to function as an init
// process.
// Unmasked, ignored signals are discarded without being queued, unless
// they will be visible to a tracer. Even for group signals, it's the
// originally-targeted task's signal mask and tracer that matter; compare
// Linux's kernel/signal.c:__send_signal() => prepare_signal() =>
// sig_ignored().
ignored := computeAction(sig, t.tg.signalHandlers.actions[sig]) == SignalActionIgnore
if sigset := linux.SignalSetOf(sig); sigset&t.signalMask == 0 && sigset&t.realSignalMask == 0 && ignored && !t.hasTracer() {
t.Debugf("Discarding ignored signal %d", sig)
if timer != nil {
timer.signalRejectedLocked()
}
return nil
}
q := &t.pendingSignals
if group {
q = &t.tg.pendingSignals
}
if !q.enqueue(info, timer) {
if sig.IsRealtime() {
return syserror.EAGAIN
}
t.Debugf("Discarding duplicate signal %d", sig)
if timer != nil {
timer.signalRejectedLocked()
}
return nil
}
// Find a receiver to notify. Note that the task we choose to notify, if
// any, may not be the task that actually dequeues and handles the signal;
// e.g. a racing signal mask change may cause the notified task to become
// ineligible, or a racing sibling task may dequeue the signal first.
if t.canReceiveSignalLocked(sig) {
t.Debugf("Notified of signal %d", sig)
t.interrupt()
return nil
}
if group {
if nt := t.tg.findSignalReceiverLocked(sig); nt != nil {
nt.Debugf("Notified of group signal %d", sig)
nt.interrupt()
return nil
}
}
t.Debugf("No task notified of signal %d", sig)
return nil
}
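// applySignalSideEffectsLocked applies the side effects of sending sig to tg:
// stop signals discard pending SIGCONTs, SIGCONT ends any group stop, and
// SIGKILL commits the thread group to exiting. These effects apply even if
// sig is ultimately discarded.
//
// Preconditions: The signal mutex must be locked.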
func (tg *ThreadGroup) applySignalSideEffectsLocked(sig linux.Signal) {
switch {
case linux.SignalSetOf(sig)&StopSignals != 0:
// Stop signals cause all prior SIGCONTs to be discarded. (This has
// little practical effect, since SIGCONT's most important side effect
// is applied when the signal is sent in the branch below, not when the
// signal is delivered.)
tg.discardSpecificLocked(linux.SIGCONT)
case sig == linux.SIGCONT:
// "The SIGCONT signal has a side effect of waking up (all threads of)
// a group-stopped process. This side effect happens before
// signal-delivery-stop. The tracer can't suppress this side effect (it
// can only suppress signal injection, which only causes the SIGCONT
// handler to not be executed in the tracee, if such a handler is
// installed)." - ptrace(2)
tg.endGroupStopLocked(true)
case sig == linux.SIGKILL:
// "SIGKILL does not generate signal-delivery-stop and therefore the
// tracer can't suppress it. SIGKILL kills even within system calls
// (syscall-exit-stop is not generated prior to death by SIGKILL)." -
// ptrace(2)
//
// Note that this differs from ThreadGroup.requestExit in that it
// ignores tg.execing.
if !tg.exiting {
tg.exiting = true
tg.exitStatus = ExitStatus{Signo: int(linux.SIGKILL)}
}
for t := tg.tasks.Front(); t != nil; t = t.Next() {
t.killLocked()
}
}
}
// canReceiveSignalLocked returns true if t should be interrupted to receive
// the given signal. canReceiveSignalLocked is analogous to Linux's
// kernel/signal.c:wants_signal(), but see below for divergences.
//
// Preconditions: The signal mutex must be locked.
func (t *Task) canReceiveSignalLocked(sig linux.Signal) bool {
// - Do not choose tasks that are blocking the signal.
if linux.SignalSetOf(sig)&t.signalMask != 0 {
return false
}
// - No need to check Task.exitState, as the exit path sets every bit in the
// signal mask when it transitions from TaskExitNone to TaskExitInitiated.
// - No special case for SIGKILL: SIGKILL already interrupted all tasks in the
// thread group via applySignalSideEffectsLocked => killLocked.
// - Do not choose stopped tasks, which cannot handle signals.
if t.stop != nil {
return false
}
// - TODO(b/38173783): No special case for when t is also the sending task,
// because the identity of the sender is unknown.
// - Do not choose tasks that have already been interrupted, as they may be
// busy handling another signal.
if len(t.interruptChan) != 0 {
return false
}
return true
}
// findSignalReceiverLocked returns a task in tg that should be interrupted to
// receive the given signal. If no such task exists, findSignalReceiverLocked
// returns nil.
//
// Linux actually records curr_target to balance the group signal targets.
//
// Preconditions: The signal mutex must be locked.
func (tg *ThreadGroup) findSignalReceiverLocked(sig linux.Signal) *Task {
for t := tg.tasks.Front(); t != nil; t = t.Next() {
if t.canReceiveSignalLocked(sig) {
return t
}
}
return nil
}
// forceSignal ensures that the task is not ignoring or blocking the given
// signal. If unconditional is true, forceSignal takes action even if the
// signal isn't being ignored or blocked.
func (t *Task) forceSignal(sig linux.Signal, unconditional bool) {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
t.forceSignalLocked(sig, unconditional)
}
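// forceSignalLocked is equivalent to forceSignal, but the caller must already
// hold the TaskSet mutex (for reading) and the signal mutex.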
func (t *Task) forceSignalLocked(sig linux.Signal, unconditional bool) {
blocked := linux.SignalSetOf(sig)&t.signalMask != 0
act := t.tg.signalHandlers.actions[sig]
ignored := act.Handler == arch.SignalActIgnore
if blocked || ignored || unconditional {
act.Handler = arch.SignalActDefault
t.tg.signalHandlers.actions[sig] = act
if blocked {
t.setSignalMaskLocked(t.signalMask &^ linux.SignalSetOf(sig))
}
}
}
// SignalMask returns a copy of t's signal mask.
func (t *Task) SignalMask() linux.SignalSet {
return linux.SignalSet(atomic.LoadUint64((*uint64)(&t.signalMask)))
}
// SetSignalMask sets t's signal mask.
//
// Preconditions: SetSignalMask can only be called by the task goroutine.
// t.exitState < TaskExitZombie.
func (t *Task) SetSignalMask(mask linux.SignalSet) {
// By precondition, t prevents t.tg from completing an execve and mutating
// t.tg.signalHandlers, so we can skip the TaskSet mutex.
t.tg.signalHandlers.mu.Lock()
t.setSignalMaskLocked(mask)
t.tg.signalHandlers.mu.Unlock()
}
// Preconditions: The signal mutex must be locked.
func (t *Task) setSignalMaskLocked(mask linux.SignalSet) {
oldMask := t.signalMask
atomic.StoreUint64((*uint64)(&t.signalMask), uint64(mask))
// If the new mask blocks any signals that were not blocked by the old
// mask, and at least one such signal is pending in tg.pendingSignals, and
// t has been woken, it could be the case that t was woken to handle that
// signal, but will no longer do so as a result of its new signal mask, so
// we have to pick a replacement.
blocked := mask &^ oldMask
blockedGroupPending := blocked & t.tg.pendingSignals.pendingSet
if blockedGroupPending != 0 && t.interrupted() {
linux.ForEachSignal(blockedGroupPending, func(sig linux.Signal) {
if nt := t.tg.findSignalReceiverLocked(sig); nt != nil {
nt.interrupt()
return
}
})
// We have to re-issue the interrupt consumed by t.interrupted() since
// it might have been for a different reason.
t.interruptSelf()
}
// Conversely, if the new mask unblocks any signals that were blocked by
// the old mask, and at least one such signal is pending, we may now need
// to handle that signal.
unblocked := oldMask &^ mask
unblockedPending := unblocked & (t.pendingSignals.pendingSet | t.tg.pendingSignals.pendingSet)
if unblockedPending != 0 {
t.interruptSelf()
}
}
// SetSavedSignalMask sets the saved signal mask (see Task.savedSignalMask's
// comment).
//
// Preconditions: SetSavedSignalMask can only be called by the task goroutine.
func (t *Task) SetSavedSignalMask(mask linux.SignalSet) {
t.savedSignalMask = mask
t.haveSavedSignalMask = true
}
// SignalStack returns the task-private signal stack.
func (t *Task) SignalStack() arch.SignalStack {
alt := t.signalStack
if t.onSignalStack(alt) {
alt.Flags |= arch.SignalStackFlagOnStack
}
return alt
}
// onSignalStack returns true if the task is executing on the given signal stack.
func (t *Task) onSignalStack(alt arch.SignalStack) bool {
sp := usermem.Addr(t.Arch().Stack())
return alt.Contains(sp)
}
// SetSignalStack sets the task-private signal stack.
//
// This value may not be changed if the task is currently executing on the
// signal stack, i.e. if t.onSignalStack returns true. In this case, this
// function will return false. Otherwise, true is returned.
func (t *Task) SetSignalStack(alt arch.SignalStack) bool {
// Check that we're not executing on the stack.
if t.onSignalStack(t.signalStack) {
return false
}
if alt.Flags&arch.SignalStackFlagDisable != 0 {
// Don't record anything beyond the flags.
t.signalStack = arch.SignalStack{
Flags: arch.SignalStackFlagDisable,
}
} else {
// Mask out irrelevant parts: only disable matters.
alt.Flags &= arch.SignalStackFlagDisable
t.signalStack = alt
}
return true
}
// SetSignalAct atomically sets the thread group's signal action for signal sig
// to *actptr (if actptr is not nil) and returns the old signal action.
func (tg *ThreadGroup) SetSignalAct(sig linux.Signal, actptr *arch.SignalAct) (arch.SignalAct, error) {
if !sig.IsValid() {
return arch.SignalAct{}, syserror.EINVAL
}
tg.pidns.owner.mu.RLock()
defer tg.pidns.owner.mu.RUnlock()
sh := tg.signalHandlers
sh.mu.Lock()
defer sh.mu.Unlock()
oldact := sh.actions[sig]
if actptr != nil {
if sig == linux.SIGKILL || sig == linux.SIGSTOP {
return oldact, syserror.EINVAL
}
act := *actptr
act.Mask &^= UnblockableSignals
sh.actions[sig] = act
// From POSIX, by way of Linux:
//
// "Setting a signal action to SIG_IGN for a signal that is pending
// shall cause the pending signal to be discarded, whether or not it is
// blocked."
//
// "Setting a signal action to SIG_DFL for a signal that is pending and
// whose default action is to ignore the signal (for example, SIGCHLD),
// shall cause the pending signal to be discarded, whether or not it is
// blocked."
if computeAction(sig, act) == SignalActionIgnore {
tg.discardSpecificLocked(sig)
}
}
return oldact, nil
}
// CopyOutSignalAct converts the given SignalAct into an architecture-specific
// type and then copies it out to task memory.
func (t *Task) CopyOutSignalAct(addr usermem.Addr, s *arch.SignalAct) error {
n := t.Arch().NewSignalAct()
n.SerializeFrom(s)
_, err := t.CopyOut(addr, n)
return err
}
// CopyInSignalAct copies an architecture-specific sigaction type from task
// memory and then converts it into a SignalAct.
func (t *Task) CopyInSignalAct(addr usermem.Addr) (arch.SignalAct, error) {
n := t.Arch().NewSignalAct()
var s arch.SignalAct
if _, err := t.CopyIn(addr, n); err != nil {
return s, err
}
n.DeserializeTo(&s)
return s, nil
}
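// A minimal sketch of how a rt_sigaction(2) implementation might combine
// CopyInSignalAct, SetSignalAct, and CopyOutSignalAct (the syscall-layer
// wiring shown here is hypothetical; only those three methods are real):
//
//	var actptr *arch.SignalAct
//	if newActAddr != 0 {
//		act, err := t.CopyInSignalAct(newActAddr)
//		if err != nil {
//			return err
//		}
//		actptr = &act
//	}
//	oldact, err := t.ThreadGroup().SetSignalAct(sig, actptr)
//	if err != nil {
//		return err
//	}
//	if oldActAddr != 0 {
//		return t.CopyOutSignalAct(oldActAddr, &oldact)
//	}
//	return nil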
// CopyOutSignalStack converts the given SignalStack into an
// architecture-specific type and then copies it out to task memory.
func (t *Task) CopyOutSignalStack(addr usermem.Addr, s *arch.SignalStack) error {
n := t.Arch().NewSignalStack()
n.SerializeFrom(s)
_, err := t.CopyOut(addr, n)
return err
}
// CopyInSignalStack copies an architecture-specific stack_t from task memory
// and then converts it into a SignalStack.
func (t *Task) CopyInSignalStack(addr usermem.Addr) (arch.SignalStack, error) {
n := t.Arch().NewSignalStack()
var s arch.SignalStack
if _, err := t.CopyIn(addr, n); err != nil {
return s, err
}
n.DeserializeTo(&s)
return s, nil
}
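// Similarly, a sigaltstack(2) implementation might be wired up as follows
// (hypothetical sketch; SignalStack, CopyInSignalStack, SetSignalStack, and
// CopyOutSignalStack are the real methods):
//
//	old := t.SignalStack()
//	if newAddr != 0 {
//		ss, err := t.CopyInSignalStack(newAddr)
//		if err != nil {
//			return err
//		}
//		if !t.SetSignalStack(ss) {
//			// Changing the stack while running on it fails, as on Linux.
//			return syserror.EPERM
//		}
//	}
//	if oldAddr != 0 {
//		return t.CopyOutSignalStack(oldAddr, &old)
//	}
//	return nil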
// groupStop is a TaskStop placed on tasks that have received a stop signal
// (SIGSTOP, SIGTSTP, SIGTTIN, SIGTTOU). (The term "group-stop" originates from
// the ptrace man page.)
//
// +stateify savable
type groupStop struct{}
// Killable implements TaskStop.Killable.
func (*groupStop) Killable() bool { return true }
// initiateGroupStop attempts to initiate a group stop based on a
// previously-dequeued stop signal.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) initiateGroupStop(info *arch.SignalInfo) {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
if t.groupStopPending {
t.Debugf("Signal %d: not stopping thread group: lost to racing stop signal", info.Signo)
return
}
if !t.tg.groupStopDequeued {
t.Debugf("Signal %d: not stopping thread group: lost to racing SIGCONT", info.Signo)
return
}
if t.tg.exiting {
t.Debugf("Signal %d: not stopping thread group: lost to racing group exit", info.Signo)
return
}
if t.tg.execing != nil {
t.Debugf("Signal %d: not stopping thread group: lost to racing execve", info.Signo)
return
}
if !t.tg.groupStopComplete {
t.tg.groupStopSignal = linux.Signal(info.Signo)
}
t.tg.groupStopPendingCount = 0
for t2 := t.tg.tasks.Front(); t2 != nil; t2 = t2.Next() {
if t2.killedLocked() || t2.exitState >= TaskExitInitiated {
t2.groupStopPending = false
continue
}
t2.groupStopPending = true
t2.groupStopAcknowledged = false
if t2.ptraceSeized {
t2.trapNotifyPending = true
if s, ok := t2.stop.(*ptraceStop); ok && s.listen {
t2.endInternalStopLocked()
}
}
t2.interrupt()
t.tg.groupStopPendingCount++
}
t.Debugf("Signal %d: stopping %d threads in thread group", info.Signo, t.tg.groupStopPendingCount)
}
// endGroupStopLocked ensures that all prior stop signals received by tg are
// not stopping tg and will not stop tg in the future. If broadcast is true,
// parent and tracer notification will be scheduled if appropriate.
//
// Preconditions: The signal mutex must be locked.
func (tg *ThreadGroup) endGroupStopLocked(broadcast bool) {
// Discard all previously-queued stop signals.
linux.ForEachSignal(StopSignals, tg.discardSpecificLocked)
if tg.groupStopPendingCount == 0 && !tg.groupStopComplete {
return
}
completeStr := "incomplete"
if tg.groupStopComplete {
completeStr = "complete"
}
tg.leader.Debugf("Ending %s group stop with %d threads pending", completeStr, tg.groupStopPendingCount)
for t := tg.tasks.Front(); t != nil; t = t.Next() {
t.groupStopPending = false
if t.ptraceSeized {
t.trapNotifyPending = true
if s, ok := t.stop.(*ptraceStop); ok && s.listen {
t.endInternalStopLocked()
}
} else {
if _, ok := t.stop.(*groupStop); ok {
t.endInternalStopLocked()
}
}
}
if broadcast {
// Instead of notifying the parent here, set groupContNotify so that
// one of the continuing tasks does so. (Linux does something similar.)
// The reason we do this is to keep locking sane. In order to send a
// signal to the parent, we need to lock its signal mutex, but we're
// already holding tg's signal mutex, and the TaskSet mutex must be
// locked for writing for us to hold two signal mutexes. Since we don't
// want to require this for endGroupStopLocked (which is called from
// signal-sending paths), nor do we want to lose atomicity by releasing
// the mutexes we're already holding, just let the continuing thread
// group deal with it.
tg.groupContNotify = true
tg.groupContInterrupted = !tg.groupStopComplete
tg.groupContWaitable = true
}
// Unsetting groupStopDequeued will cause racing calls to initiateGroupStop
// to recognize that the group stop has been cancelled.
tg.groupStopDequeued = false
tg.groupStopSignal = 0
tg.groupStopPendingCount = 0
tg.groupStopComplete = false
tg.groupStopWaitable = false
}
// participateGroupStopLocked is called to handle thread group side effects
// after t unsets t.groupStopPending. The caller must handle task side effects
// (e.g. placing the task goroutine into the group stop). It returns true if
// the caller must notify t.tg.leader's parent of a completed group stop (which
// participateGroupStopLocked cannot do due to holding the wrong locks).
//
// Preconditions: The signal mutex must be locked.
func (t *Task) participateGroupStopLocked() bool {
if t.groupStopAcknowledged {
return false
}
t.groupStopAcknowledged = true
t.tg.groupStopPendingCount--
if t.tg.groupStopPendingCount != 0 {
return false
}
if t.tg.groupStopComplete {
return false
}
t.Debugf("Completing group stop")
t.tg.groupStopComplete = true
t.tg.groupStopWaitable = true
t.tg.groupContNotify = false
t.tg.groupContWaitable = false
return true
}
// signalStop sends a SIGCHLD to t's thread group, if appropriate, notifying
// it of a new group stop, group continue, or ptrace stop of target. code and
// status are set in the SIGCHLD's siginfo, if one is sent.
//
// Preconditions: The TaskSet mutex must be locked (for reading or writing).
func (t *Task) signalStop(target *Task, code int32, status int32) {
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
act, ok := t.tg.signalHandlers.actions[linux.SIGCHLD]
if !ok || (act.Handler != arch.SignalActIgnore && act.Flags&arch.SignalFlagNoCldStop == 0) {
sigchld := &arch.SignalInfo{
Signo: int32(linux.SIGCHLD),
Code: code,
}
sigchld.SetPid(int32(t.tg.pidns.tids[target]))
sigchld.SetUid(int32(target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
sigchld.SetStatus(status)
// TODO(b/72102453): Set utime, stime.
t.sendSignalLocked(sigchld, true /* group */)
}
}
// The runInterrupt state handles conditions indicated by interrupts.
//
// +stateify savable
type runInterrupt struct{}
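// execute implements taskRunState.execute. It checks, in order, for a group
// continue to broadcast, a group or ptrace stop to enter, and pending
// unmasked signals to deliver, before returning control to the app.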
func (*runInterrupt) execute(t *Task) taskRunState {
// Interrupts are de-duplicated (if t is interrupted twice before
// t.interrupted() is called, t.interrupted() will only return true once),
// so early exits from this function must re-enter the runInterrupt state
// to check for more interrupt-signaled conditions.
t.tg.signalHandlers.mu.Lock()
// Did we just leave a group stop?
if t.tg.groupContNotify {
t.tg.groupContNotify = false
sig := t.tg.groupStopSignal
intr := t.tg.groupContInterrupted
t.tg.signalHandlers.mu.Unlock()
t.tg.pidns.owner.mu.RLock()
// For consistency with Linux, if the parent and (thread group
// leader's) tracer are in the same thread group, deduplicate
// notifications.
notifyParent := t.tg.leader.parent != nil
if tracer := t.tg.leader.Tracer(); tracer != nil {
if notifyParent && tracer.tg == t.tg.leader.parent.tg {
notifyParent = false
}
// Sending CLD_STOPPED to the tracer doesn't really make any sense;
// the thread group leader may have already entered the stop and
// notified its tracer accordingly. But it's consistent with
// Linux...
if intr {
tracer.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig))
if !notifyParent {
tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop | EventChildGroupStop)
} else {
tracer.tg.eventQueue.Notify(EventGroupContinue | EventTraceeStop)
}
} else {
tracer.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig))
tracer.tg.eventQueue.Notify(EventGroupContinue)
}
}
if notifyParent {
// If groupContInterrupted, do as Linux does and pretend the group
// stop completed just before it ended. The theoretical behavior in
// this case would be to send a SIGCHLD indicating the completed
// stop, followed by a SIGCHLD indicating the continue. However,
// SIGCHLD is a standard signal, so the latter would always be
// dropped. Hence sending only the former is equivalent.
if intr {
t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig))
t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue | EventChildGroupStop)
} else {
t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_CONTINUED, int32(sig))
t.tg.leader.parent.tg.eventQueue.Notify(EventGroupContinue)
}
}
t.tg.pidns.owner.mu.RUnlock()
return (*runInterrupt)(nil)
}
// Do we need to enter a group stop or related ptrace stop? This path is
// analogous to Linux's kernel/signal.c:get_signal() => do_signal_stop()
// (with ptrace enabled) and do_jobctl_trap().
if t.groupStopPending || t.trapStopPending || t.trapNotifyPending {
sig := t.tg.groupStopSignal
notifyParent := false
if t.groupStopPending {
t.groupStopPending = false
// We care about t.tg.groupStopSignal (for tracer notification)
// even if this doesn't complete a group stop, so keep the
// value of sig we've already read.
notifyParent = t.participateGroupStopLocked()
}
t.trapStopPending = false
t.trapNotifyPending = false
// Drop the signal mutex so we can take the TaskSet mutex.
t.tg.signalHandlers.mu.Unlock()
t.tg.pidns.owner.mu.RLock()
if t.tg.leader.parent == nil {
notifyParent = false
}
if tracer := t.Tracer(); tracer != nil {
if t.ptraceSeized {
if sig == 0 {
sig = linux.SIGTRAP
}
// "If tracee was attached using PTRACE_SEIZE, group-stop is
// indicated by PTRACE_EVENT_STOP: status>>16 ==
// PTRACE_EVENT_STOP. This allows detection of group-stops
// without requiring an extra PTRACE_GETSIGINFO call." -
// "Group-stop", ptrace(2)
t.ptraceCode = int32(sig) | linux.PTRACE_EVENT_STOP<<8
t.ptraceSiginfo = &arch.SignalInfo{
Signo: int32(sig),
Code: t.ptraceCode,
}
t.ptraceSiginfo.SetPid(int32(t.tg.pidns.tids[t]))
t.ptraceSiginfo.SetUid(int32(t.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
} else {
t.ptraceCode = int32(sig)
t.ptraceSiginfo = nil
}
if t.beginPtraceStopLocked() {
tracer.signalStop(t, arch.CLD_STOPPED, int32(sig))
// For consistency with Linux, if the parent and tracer are in the
// same thread group, deduplicate notification signals.
if notifyParent && tracer.tg == t.tg.leader.parent.tg {
notifyParent = false
tracer.tg.eventQueue.Notify(EventChildGroupStop | EventTraceeStop)
} else {
tracer.tg.eventQueue.Notify(EventTraceeStop)
}
}
} else {
t.tg.signalHandlers.mu.Lock()
if !t.killedLocked() {
t.beginInternalStopLocked((*groupStop)(nil))
}
t.tg.signalHandlers.mu.Unlock()
}
if notifyParent {
t.tg.leader.parent.signalStop(t.tg.leader, arch.CLD_STOPPED, int32(sig))
t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
}
t.tg.pidns.owner.mu.RUnlock()
return (*runInterrupt)(nil)
}
// Are there signals pending?
if info := t.dequeueSignalLocked(t.signalMask); info != nil {
if linux.SignalSetOf(linux.Signal(info.Signo))&StopSignals != 0 {
// Indicate that we've dequeued a stop signal before unlocking the
// signal mutex; initiateGroupStop will check for races with
// endGroupStopLocked after relocking it.
t.tg.groupStopDequeued = true
}
if t.ptraceSignalLocked(info) {
// Dequeueing the signal action must wait until after the
// signal-delivery-stop ends since the tracer can change or
// suppress the signal.
t.tg.signalHandlers.mu.Unlock()
return (*runInterruptAfterSignalDeliveryStop)(nil)
}
act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo))
t.tg.signalHandlers.mu.Unlock()
return t.deliverSignal(info, act)
}
t.tg.signalHandlers.mu.Unlock()
return (*runApp)(nil)
}
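// The runInterruptAfterSignalDeliveryStop state completes delivery of a
// signal whose dequeueing caused a ptrace signal-delivery-stop, after the
// stop has ended.
//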
// +stateify savable
type runInterruptAfterSignalDeliveryStop struct{}
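// execute implements taskRunState.execute. It delivers, re-queues (if now
// masked), or discards (if suppressed by the tracer) the signal recorded in
// t.ptraceCode and t.ptraceSiginfo.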
func (*runInterruptAfterSignalDeliveryStop) execute(t *Task) taskRunState {
t.tg.pidns.owner.mu.Lock()
// Can't defer unlock: deliverSignal must be called without holding TaskSet
// mutex.
sig := linux.Signal(t.ptraceCode)
defer func() {
t.ptraceSiginfo = nil
}()
if !sig.IsValid() {
t.tg.pidns.owner.mu.Unlock()
return (*runInterrupt)(nil)
}
info := t.ptraceSiginfo
if sig != linux.Signal(info.Signo) {
info.Signo = int32(sig)
info.Errno = 0
info.Code = arch.SignalInfoUser
// pid isn't a valid field for all signal numbers, but Linux
// doesn't care (kernel/signal.c:ptrace_signal()).
//
// Linux uses t->parent for the tid and uid here, which is the tracer
// if it hasn't detached or the real parent otherwise.
parent := t.parent
if tracer := t.Tracer(); tracer != nil {
parent = tracer
}
if parent == nil {
// Tracer has detached and t was created by Kernel.CreateProcess().
// Pretend the parent is in an ancestor PID + user namespace.
info.SetPid(0)
info.SetUid(int32(auth.OverflowUID))
} else {
info.SetPid(int32(t.tg.pidns.tids[parent]))
info.SetUid(int32(parent.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()))
}
}
t.tg.signalHandlers.mu.Lock()
t.tg.pidns.owner.mu.Unlock()
// If the signal is masked, re-queue it.
if linux.SignalSetOf(sig)&t.signalMask != 0 {
t.sendSignalLocked(info, false /* group */)
t.tg.signalHandlers.mu.Unlock()
return (*runInterrupt)(nil)
}
act := t.tg.signalHandlers.dequeueAction(linux.Signal(info.Signo))
t.tg.signalHandlers.mu.Unlock()
return t.deliverSignal(info, act)
}