gvisor/pkg/sentry/kernel/task_exit.go

// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package kernel
// This file implements the task exit cycle:
//
// - Tasks are asynchronously requested to exit with Task.Kill.
//
// - When able, the task goroutine enters the exit path starting from state
// runExit.
//
// - Other tasks observe completed exits with Task.Wait (which implements the
// wait*() family of syscalls).
import (
"errors"
"fmt"
"strconv"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/syserror"
"gvisor.googlesource.com/gvisor/pkg/waiter"
)
// An ExitStatus is a value communicated from an exiting task or thread group
// to the party that reaps it.
type ExitStatus struct {
// Code is the numeric value passed to the call to exit or exit_group that
// caused the exit. If the exit was not caused by such a call, Code is 0.
Code int
// Signo is the signal that caused the exit. If the exit was not caused by
// a signal, Signo is 0.
Signo int
}
// Signaled returns true if the ExitStatus indicates that the exiting task or
// thread group was killed by a signal.
func (es ExitStatus) Signaled() bool {
return es.Signo != 0
}
// Status returns the numeric representation of the ExitStatus returned by e.g.
// the wait4() system call.
func (es ExitStatus) Status() uint32 {
return ((uint32(es.Code) & 0xff) << 8) | (uint32(es.Signo) & 0xff)
}
// ShellExitCode returns the numeric exit code that Bash would return for an
// exit status of es.
func (es ExitStatus) ShellExitCode() int {
if es.Signaled() {
return 128 + es.Signo
}
return es.Code
}
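// exitStatusEncodingExample is an illustrative sketch (not used by the exit
// path) of how ExitStatus packs into the wait4()-style status word: a plain
// exit_group(3) yields 0x0300 (shell exit code 3), while death by SIGKILL
// (signal 9) yields 0x0009 (shell exit code 128 + 9 = 137).
func exitStatusEncodingExample() {
	byExit := ExitStatus{Code: 3}
	_ = byExit.Status()        // 0x0300
	_ = byExit.ShellExitCode() // 3
	bySignal := ExitStatus{Signo: 9}
	_ = bySignal.Status()        // 0x0009
	_ = bySignal.ShellExitCode() // 137
}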
// TaskExitState represents a step in the task exit path.
//
// "Exiting" and "exited" are often ambiguous; prefer to name specific states.
type TaskExitState int
const (
// TaskExitNone indicates that the task has not begun exiting.
TaskExitNone TaskExitState = iota
// TaskExitInitiated indicates that the task goroutine has entered the exit
// path, and the task is no longer eligible to participate in group stops
// or group signal handling. TaskExitInitiated is analogous to Linux's
// PF_EXITING.
TaskExitInitiated
// TaskExitZombie indicates that the task has released its resources, and
// the task no longer prevents a sibling thread from completing execve.
TaskExitZombie
// TaskExitDead indicates that the task's thread IDs have been released,
// and the task no longer prevents its thread group leader from being
// reaped. ("Reaping" refers to the transitioning of a task from
// TaskExitZombie to TaskExitDead.)
TaskExitDead
)
// String implements fmt.Stringer.
func (t TaskExitState) String() string {
switch t {
case TaskExitNone:
return "TaskExitNone"
case TaskExitInitiated:
return "TaskExitInitiated"
case TaskExitZombie:
return "TaskExitZombie"
case TaskExitDead:
return "TaskExitDead"
default:
return strconv.Itoa(int(t))
}
}
// killLocked marks t as killed by enqueueing a SIGKILL, without causing the
// thread-group-affecting side effects SIGKILL usually has.
//
// Preconditions: The signal mutex must be locked.
func (t *Task) killLocked() {
// Clear killable stops.
if t.stop != nil && t.stop.Killable() {
t.endInternalStopLocked()
}
t.groupStopRequired = false
t.pendingSignals.enqueue(&arch.SignalInfo{
Signo: int32(linux.SIGKILL),
// Linux just sets SIGKILL in the pending signal bitmask without
// enqueueing an actual siginfo, such that
// kernel/signal.c:collect_signal() initializes si_code to SI_USER.
Code: arch.SignalInfoUser,
})
t.interrupt()
}
// killed returns true if t has a SIGKILL pending. killed is analogous to
// Linux's fatal_signal_pending().
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) killed() bool {
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
return t.killedLocked()
}
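// Preconditions: The signal mutex must be locked.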
func (t *Task) killedLocked() bool {
return t.pendingSignals.pendingSet&linux.SignalSetOf(linux.SIGKILL) != 0
}
// PrepareExit indicates an exit with status es.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) PrepareExit(es ExitStatus) {
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
t.exitStatus = es
}
// PrepareGroupExit indicates a group exit with status es to t's thread group.
//
// PrepareGroupExit is analogous to Linux's do_group_exit(), except that it
// does not tail-call do_exit(), and that it *does* set Task.exitStatus.
// (Linux does not do so until within do_exit(), since it reuses exit_code for
// ptrace.)
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) PrepareGroupExit(es ExitStatus) {
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
if t.tg.exiting || t.tg.execing != nil {
// Note that if t.tg.exiting is false but t.tg.execing is not nil, i.e.
// this "group exit" is being executed by the killed sibling of an
// execing task, then Task.Execve never set t.tg.exitStatus, so it's
// still the zero value. This is consistent with Linux, both in intent
// ("all other threads ... report death as if they exited via _exit(2)
// with exit code 0" - ptrace(2), "execve under ptrace") and in
// implementation (compare fs/exec.c:de_thread() =>
// kernel/signal.c:zap_other_threads() and
// kernel/exit.c:do_group_exit() =>
// include/linux/sched.h:signal_group_exit()).
t.exitStatus = t.tg.exitStatus
return
}
t.tg.exiting = true
t.tg.exitStatus = es
t.exitStatus = es
for sibling := t.tg.tasks.Front(); sibling != nil; sibling = sibling.Next() {
if sibling != t {
sibling.killLocked()
}
}
}
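// prepareExitExample is an illustrative, non-authoritative sketch of how a
// syscall handler might use PrepareExit and PrepareGroupExit; the real
// exit(2) and exit_group(2) handlers live elsewhere and must also arrange for
// the task goroutine to enter the exit path (runExit).
func prepareExitExample(t *Task, code int, groupExit bool) {
	if groupExit {
		// exit_group(2): the entire thread group exits with this status.
		t.PrepareGroupExit(ExitStatus{Code: code})
		return
	}
	// exit(2): only the calling task exits with this status.
	t.PrepareExit(ExitStatus{Code: code})
}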
// Kill requests that all tasks in ts exit as if group exiting with status es.
// Kill does not wait for tasks to exit.
//
// Kill has no analogue in Linux; it's provided for save/restore only.
func (ts *TaskSet) Kill(es ExitStatus) {
ts.mu.Lock()
defer ts.mu.Unlock()
ts.Root.exiting = true
for t := range ts.Root.tids {
t.tg.signalHandlers.mu.Lock()
if !t.tg.exiting {
t.tg.exiting = true
t.tg.exitStatus = es
}
t.killLocked()
t.tg.signalHandlers.mu.Unlock()
}
}
// advanceExitStateLocked checks that t's current exit state is oldExit, then
// sets it to newExit. If t's current exit state is not oldExit,
// advanceExitStateLocked panics.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) advanceExitStateLocked(oldExit, newExit TaskExitState) {
if t.exitState != oldExit {
panic(fmt.Sprintf("Transitioning from exit state %v to %v: unexpected preceding state %v", oldExit, newExit, t.exitState))
}
t.Debugf("Transitioning from exit state %v to %v", oldExit, newExit)
t.exitState = newExit
}
// runExit is the entry point into the task exit path.
type runExit struct{}
func (*runExit) execute(t *Task) taskRunState {
t.ptraceExit()
return (*runExitMain)(nil)
}
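// runExitMain drives the bulk of the exit path: it leaves the thread group,
// releases the task's resources, detaches any tracees, and reparents the
// task's children.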
type runExitMain struct{}
func (*runExitMain) execute(t *Task) taskRunState {
lastExiter := t.exitThreadGroup()
// If the task has a cleartid, and the thread group wasn't killed by a
// signal, handle that before releasing the MM.
if t.cleartid != 0 {
t.tg.signalHandlers.mu.Lock()
signaled := t.tg.exiting && t.tg.exitStatus.Signaled()
t.tg.signalHandlers.mu.Unlock()
if !signaled {
if _, err := t.CopyOut(t.cleartid, ThreadID(0)); err == nil {
t.Futex().Wake(uintptr(t.cleartid), ^uint32(0), 1)
}
// If the CopyOut fails, there's nothing we can do.
}
}
// Deactivate the address space before releasing the MM.
t.Deactivate()
// Update the max resident set size before releasing t.tc.mm.
t.tg.pidns.owner.mu.Lock()
t.updateRSSLocked()
t.tg.pidns.owner.mu.Unlock()
// Release all of the task's resources.
t.mu.Lock()
t.tc.release()
t.tr.release()
t.mu.Unlock()
t.unstopVforkParent()
// If this is the last task to exit from the thread group, release the
// thread group's resources.
if lastExiter {
t.tg.release()
}
// Detach tracees.
t.exitPtrace()
// Reparent the task's children.
t.exitChildren()
// Don't tail-call runExitNotify, as exitChildren may have initiated a stop
// to wait for a PID namespace to die.
return (*runExitNotify)(nil)
}
// exitThreadGroup transitions t to TaskExitInitiated, indicating to t's thread
// group that it is no longer eligible to participate in group activities. It
// returns true if t is the last task in its thread group to call
// exitThreadGroup.
func (t *Task) exitThreadGroup() bool {
t.tg.pidns.owner.mu.Lock()
defer t.tg.pidns.owner.mu.Unlock()
t.tg.signalHandlers.mu.Lock()
// Can't defer unlock: see below.
t.advanceExitStateLocked(TaskExitNone, TaskExitInitiated)
t.tg.activeTasks--
last := t.tg.activeTasks == 0
// Ensure that someone will handle the signals we can't.
t.setSignalMaskLocked(^linux.SignalSet(0))
// Check if this task's exit interacts with an initiated group stop.
if t.tg.groupStopPhase != groupStopInitiated {
t.tg.signalHandlers.mu.Unlock()
return last
}
if t.groupStopAcknowledged {
// Un-acknowledge the group stop.
t.tg.groupStopCount--
t.groupStopAcknowledged = false
// If the group stop wasn't complete before, then there is still at
// least one other task that hasn't acknowledged the group stop, so
// it is still not complete now.
t.tg.signalHandlers.mu.Unlock()
return last
}
if t.tg.groupStopCount != t.tg.activeTasks {
t.tg.signalHandlers.mu.Unlock()
return last
}
t.Debugf("Completing group stop")
t.tg.groupStopPhase = groupStopComplete
t.tg.groupStopWaitable = true
sig := t.tg.groupStopSignal
t.tg.groupContNotify = false
t.tg.groupContWaitable = false
// signalStop must be called with t's signal mutex unlocked.
t.tg.signalHandlers.mu.Unlock()
if t.tg.leader.parent != nil {
t.tg.leader.parent.signalStop(t, arch.CLD_STOPPED, int32(sig))
t.tg.leader.parent.tg.eventQueue.Notify(EventChildGroupStop)
}
return last
}
func (t *Task) exitChildren() {
t.tg.pidns.owner.mu.Lock()
defer t.tg.pidns.owner.mu.Unlock()
newParent := t.findReparentTargetLocked()
if newParent == nil {
// "If the init process of a PID namespace terminates, the kernel
// terminates all of the processes in the namespace via a SIGKILL
// signal." - pid_namespaces(7)
t.Debugf("Init process terminating, killing namespace")
t.tg.pidns.exiting = true
for other := range t.tg.pidns.tids {
if other.tg != t.tg {
other.tg.signalHandlers.mu.Lock()
other.sendSignalLocked(&arch.SignalInfo{
Signo: int32(linux.SIGKILL),
}, false /* group */)
other.tg.signalHandlers.mu.Unlock()
}
}
// TODO: The init process waits for all processes in the
// namespace to exit before completing its own exit
// (kernel/pid_namespace.c:zap_pid_ns_processes()). Stop until all
// other tasks in the namespace are dead, except possibly for this
// thread group's leader (which can't be reaped until this task exits).
}
// This is correct even if newParent is nil (it ensures that children don't
// wait for a parent to reap them).
for c := range t.children {
if sig := c.ParentDeathSignal(); sig != 0 {
siginfo := &arch.SignalInfo{
Signo: int32(sig),
Code: arch.SignalInfoUser,
}
siginfo.SetPid(int32(c.tg.pidns.tids[t]))
siginfo.SetUid(int32(t.Credentials().RealKUID.In(c.UserNamespace()).OrOverflow()))
c.tg.signalHandlers.mu.Lock()
c.sendSignalLocked(siginfo, true /* group */)
c.tg.signalHandlers.mu.Unlock()
}
c.reparentLocked(newParent)
if newParent != nil {
newParent.children[c] = struct{}{}
}
}
}
// findReparentTargetLocked returns the task to which t's children should be
// reparented. If no such task exists, findReparentTargetLocked returns nil.
//
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) findReparentTargetLocked() *Task {
// Reparent to any sibling in the same thread group that hasn't begun
// exiting.
if t2 := t.tg.anyNonExitingTaskLocked(); t2 != nil {
return t2
}
// "A child process that is orphaned within the namespace will be
// reparented to [the init process for the namespace] ..." -
// pid_namespaces(7)
if init := t.tg.pidns.tasks[InitTID]; init != nil {
return init.tg.anyNonExitingTaskLocked()
}
return nil
}
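// anyNonExitingTaskLocked returns a task in tg that has not begun exiting, or
// nil if all tasks in tg have begun exiting.
//
// Preconditions: The TaskSet mutex must be locked.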
func (tg *ThreadGroup) anyNonExitingTaskLocked() *Task {
for t := tg.tasks.Front(); t != nil; t = t.Next() {
if t.exitState == TaskExitNone {
return t
}
}
return nil
}
// reparentLocked changes t's parent. The new parent may be nil.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) reparentLocked(parent *Task) {
oldParent := t.parent
t.parent = parent
// If a thread group leader's parent changes, reset the thread group's
// termination signal to SIGCHLD and re-check exit notification. (Compare
// kernel/exit.c:reparent_leader().)
if t != t.tg.leader {
return
}
if oldParent == nil && parent == nil {
return
}
if oldParent != nil && parent != nil && oldParent.tg == parent.tg {
return
}
t.tg.terminationSignal = linux.SIGCHLD
if t.exitParentNotified && !t.exitParentAcked {
t.exitParentNotified = false
t.exitNotifyLocked(false)
}
}
// When a task exits, other tasks in the system, notably the task's parent and
// ptracer, may want to be notified. The exit notification system ensures that
// interested tasks receive signals and/or are woken from blocking calls to
// wait*() syscalls; these notifications must be resolved before exiting tasks
// can be reaped and disappear from the system.
//
// Each task may have a parent task and/or a tracer task. If both a parent and
// a tracer exist, they may be the same task, different tasks in the same
// thread group, or tasks in different thread groups. (In the last case, Linux
// refers to the task as being ptrace-reparented due to an implementation
// detail; we avoid this terminology to avoid confusion.)
//
// A thread group is *empty* if all non-leader tasks in the thread group are
// dead, and the leader is either a zombie or dead. The exit of a thread group
// leader is never waitable - by either the parent or tracer - until the thread
// group is empty.
//
// There are a few ways for an exit notification to be resolved:
//
// - The exit notification may be acknowledged by a call to Task.Wait with
// WaitOptions.ConsumeEvent set (e.g. due to a wait4() syscall).
//
// - If the notified party is the parent, and the parent thread group is not
// also the tracer thread group, and the notification signal is SIGCHLD, the
// parent may explicitly ignore the notification (see quote in exitNotify).
// Note that it's possible for the notified party to ignore the signal in other
// cases, but the notification is only resolved under the above conditions.
// (Actually, there is one exception; see the last paragraph of the "leader,
// has tracer, tracer thread group is parent thread group" case below.)
//
// - If the notified party is the parent, and the parent does not exist, the
// notification is resolved as if ignored. (This is only possible in the
// sentry. In Linux, the only task / thread group without a parent is global
// init, and killing global init causes a kernel panic.)
//
// - If the notified party is a tracer, the tracer may detach the traced task.
// (Zombie tasks cannot be ptrace-attached, so the reverse is not possible.)
//
// In addition, if the notified party is the parent, the parent may exit and
// cause the notifying task to be reparented to another thread group. This does
// not resolve the notification; instead, the notification must be resent to
// the new parent.
//
// The series of notifications generated for a given task's exit depends on
// whether it is a thread group leader; whether the task is ptraced; and, if
// so, whether the tracer thread group is the same as the parent thread group.
//
// - Non-leader, no tracer: No notification is generated; the task is reaped
// immediately.
//
// - Non-leader, has tracer: SIGCHLD is sent to the tracer. When the tracer
// notification is resolved (by waiting or detaching), the task is reaped. (For
// non-leaders, whether the tracer and parent thread groups are the same is
// irrelevant.)
//
// - Leader, no tracer: The task remains a zombie, with no notification sent,
// until all other tasks in the thread group are dead. (In Linux terms, this
// condition is indicated by include/linux/sched.h:thread_group_empty(); tasks
// are removed from their thread_group list in kernel/exit.c:release_task() =>
// __exit_signal() => __unhash_process().) Then the thread group's termination
// signal is sent to the parent. When the parent notification is resolved (by
// waiting or ignoring), the task is reaped.
//
// - Leader, has tracer, tracer thread group is not parent thread group:
// SIGCHLD is sent to the tracer. When the tracer notification is resolved (by
// waiting or detaching), and all other tasks in the thread group are dead, the
// thread group's termination signal is sent to the parent. (Note that the
// tracer cannot resolve the exit notification by waiting until the thread
// group is empty.) When the parent notification is resolved, the task is
// reaped.
//
// - Leader, has tracer, tracer thread group is parent thread group:
//
// If all other tasks in the thread group are dead, the thread group's
// termination signal is sent to the parent. At this point, the notification
// can only be resolved by waiting. If the parent detaches from the task as a
// tracer, the notification is not resolved, but the notification can now be
// resolved by waiting or ignoring. When the parent notification is resolved,
// the task is reaped.
//
// If at least one task in the thread group is not dead, SIGCHLD is sent to the
// parent. At this point, the notification cannot be resolved at all; once the
// thread group becomes empty, it can be resolved only by waiting. If the
// parent detaches from the task as a tracer before all remaining tasks die,
// then exit notification proceeds as in the case where the leader never had a
// tracer. If the parent detaches from the task as a tracer after all remaining
// tasks die, the notification is not resolved, but the notification can now be
// resolved by waiting or ignoring. When the parent notification is resolved,
// the task is reaped.
//
// In both of the above cases, when the parent detaches from the task as a
// tracer while the thread group is empty, whether or not the parent resolves
// the notification by ignoring it is based on the parent's SIGCHLD signal
// action, whether or not the thread group's termination signal is SIGCHLD
// (Linux: kernel/ptrace.c:__ptrace_detach() => ignoring_children()).
//
// There is one final wrinkle: A leader can become a non-leader due to a
// sibling execve. In this case, the execing thread detaches the leader's
// tracer (if one exists) and reaps the leader immediately. In Linux, this is
// in fs/exec.c:de_thread(); in the sentry, this is in Task.promoteLocked().
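// runExitNotify advances the task to TaskExitZombie, generates exit
// notifications as described above, and then ends the task goroutine.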
type runExitNotify struct{}
func (*runExitNotify) execute(t *Task) taskRunState {
t.tg.pidns.owner.mu.Lock()
defer t.tg.pidns.owner.mu.Unlock()
t.advanceExitStateLocked(TaskExitInitiated, TaskExitZombie)
t.tg.liveTasks--
// Check if this completes a sibling's execve.
if t.tg.execing != nil && t.tg.liveTasks == 1 {
// execing blocks the addition of new tasks to the thread group, so
// the sole living task must be the execing one.
e := t.tg.execing
e.tg.signalHandlers.mu.Lock()
if _, ok := e.stop.(*execStop); ok {
e.endInternalStopLocked()
}
e.tg.signalHandlers.mu.Unlock()
}
t.exitNotifyLocked(false)
// The task goroutine will now exit.
return nil
}
// exitNotifyLocked is called after changes to t's state that affect exit
// notification.
//
// If fromPtraceDetach is true, the caller is ptraceDetach or exitPtrace;
// thanks to Linux's haphazard implementation of this functionality, such cases
// determine whether parent notifications are ignored based on the parent's
// handling of SIGCHLD, regardless of what the exited task's thread group's
// termination signal is.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) exitNotifyLocked(fromPtraceDetach bool) {
if t.exitState != TaskExitZombie {
return
}
if !t.exitTracerNotified {
t.exitTracerNotified = true
tracer := t.Tracer()
if tracer == nil {
t.exitTracerAcked = true
} else if t != t.tg.leader || t.parent == nil || tracer.tg != t.parent.tg {
// Don't set exitParentNotified if t is non-leader, even if the
// tracer is in the parent thread group, so that if the parent
// detaches the following call to exitNotifyLocked passes through
// the !exitParentNotified case below and causes t to be reaped
// immediately.
//
// Tracer notification doesn't care about
// SIG_IGN/SA_NOCLDWAIT.
tracer.tg.signalHandlers.mu.Lock()
tracer.sendSignalLocked(t.exitNotificationSignal(linux.SIGCHLD, tracer), true /* group */)
tracer.tg.signalHandlers.mu.Unlock()
// Wake EventTraceeStop waiters as well since this task will never
// ptrace-stop again.
tracer.tg.eventQueue.Notify(EventExit | EventTraceeStop)
} else {
// t is a leader and the tracer is in the parent thread group.
t.exitParentNotified = true
sig := linux.SIGCHLD
if t.tg.tasksCount == 1 {
sig = t.tg.terminationSignal
}
// This notification doesn't care about SIG_IGN/SA_NOCLDWAIT either
// (in Linux, the check in do_notify_parent() is gated by
// !tsk->ptrace.)
t.parent.tg.signalHandlers.mu.Lock()
t.parent.sendSignalLocked(t.exitNotificationSignal(sig, t.parent), true /* group */)
t.parent.tg.signalHandlers.mu.Unlock()
// See below for rationale for this event mask.
t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
}
}
if t.exitTracerAcked && !t.exitParentNotified {
if t != t.tg.leader {
t.exitParentNotified = true
t.exitParentAcked = true
} else if t.tg.tasksCount == 1 {
t.exitParentNotified = true
if t.parent == nil {
t.exitParentAcked = true
} else {
// "POSIX.1-2001 specifies that if the disposition of SIGCHLD is
// set to SIG_IGN or the SA_NOCLDWAIT flag is set for SIGCHLD (see
// sigaction(2)), then children that terminate do not become
// zombies and a call to wait() or waitpid() will block until all
// children have terminated, and then fail with errno set to
// ECHILD. (The original POSIX standard left the behavior of
// setting SIGCHLD to SIG_IGN unspecified. Note that even though
// the default disposition of SIGCHLD is "ignore", explicitly
// setting the disposition to SIG_IGN results in different
// treatment of zombie process children.) Linux 2.6 conforms to
// this specification." - wait(2)
//
// Some undocumented Linux-specific details:
//
// - All of the above is ignored if the termination signal isn't
// SIGCHLD.
//
// - SA_NOCLDWAIT causes the leader to be immediately reaped, but
// does not suppress the SIGCHLD.
signalParent := t.tg.terminationSignal.IsValid()
t.parent.tg.signalHandlers.mu.Lock()
if t.tg.terminationSignal == linux.SIGCHLD || fromPtraceDetach {
if act, ok := t.parent.tg.signalHandlers.actions[linux.SIGCHLD]; ok {
if act.Handler == arch.SignalActIgnore {
t.exitParentAcked = true
signalParent = false
} else if act.Flags&arch.SignalFlagNoCldWait != 0 {
t.exitParentAcked = true
}
}
}
if signalParent {
t.parent.tg.leader.sendSignalLocked(t.exitNotificationSignal(t.tg.terminationSignal, t.parent), true /* group */)
}
t.parent.tg.signalHandlers.mu.Unlock()
// If a task in the parent was waiting for a child group stop
// or continue, it needs to be notified of the exit, because
// there may be no remaining eligible tasks (so that wait
// should return ECHILD).
t.parent.tg.eventQueue.Notify(EventExit | EventChildGroupStop | EventGroupContinue)
}
}
}
if t.exitTracerAcked && t.exitParentAcked {
t.advanceExitStateLocked(TaskExitZombie, TaskExitDead)
for ns := t.tg.pidns; ns != nil; ns = ns.parent {
tid := ns.tids[t]
delete(ns.tasks, tid)
delete(ns.tids, t)
}
t.tg.exitedCPUStats.Accumulate(t.CPUStats())
t.tg.ioUsage.Accumulate(t.ioUsage)
t.tg.signalHandlers.mu.Lock()
t.tg.tasks.Remove(t)
if t.tg.lastTimerSignalTask == t {
t.tg.lastTimerSignalTask = nil
}
t.tg.tasksCount--
tc := t.tg.tasksCount
t.tg.signalHandlers.mu.Unlock()
if tc == 1 && t != t.tg.leader {
// Our fromPtraceDetach doesn't matter here (in Linux terms, this
// is via a call to release_task()).
t.tg.leader.exitNotifyLocked(false)
} else if tc == 0 {
t.tg.processGroup.decRefWithParent(t.tg.parentPG())
}
if t.parent != nil {
delete(t.parent.children, t)
t.parent = nil
}
}
}
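// exitNotificationSignal returns the SignalInfo that should be sent to
// receiver to notify it of t's exit.
//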
// Preconditions: The TaskSet mutex must be locked.
func (t *Task) exitNotificationSignal(sig linux.Signal, receiver *Task) *arch.SignalInfo {
info := &arch.SignalInfo{
Signo: int32(sig),
}
info.SetPid(int32(receiver.tg.pidns.tids[t]))
info.SetUid(int32(t.Credentials().RealKUID.In(receiver.UserNamespace()).OrOverflow()))
if t.exitStatus.Signaled() {
info.Code = arch.CLD_KILLED
info.SetStatus(int32(t.exitStatus.Signo))
} else {
info.Code = arch.CLD_EXITED
info.SetStatus(int32(t.exitStatus.Code))
}
// TODO: Set utime, stime.
return info
}
// ExitStatus returns t's exit status, which is only guaranteed to be
// meaningful if t.ExitState() != TaskExitNone.
func (t *Task) ExitStatus() ExitStatus {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
t.tg.signalHandlers.mu.Lock()
defer t.tg.signalHandlers.mu.Unlock()
return t.exitStatus
}
// ExitStatus returns the exit status that would be returned by a consuming
// wait*() on tg.
func (tg *ThreadGroup) ExitStatus() ExitStatus {
tg.pidns.owner.mu.RLock()
defer tg.pidns.owner.mu.RUnlock()
tg.signalHandlers.mu.Lock()
defer tg.signalHandlers.mu.Unlock()
if tg.exiting {
return tg.exitStatus
}
return tg.leader.exitStatus
}
// TerminationSignal returns the thread group's termination signal.
func (tg *ThreadGroup) TerminationSignal() linux.Signal {
tg.pidns.owner.mu.RLock()
defer tg.pidns.owner.mu.RUnlock()
return tg.terminationSignal
}
// Task events that can be waited for.
const (
// EventExit represents an exit notification generated for a child thread
// group leader or a tracee under the conditions specified in the comment
// above runExitNotify.
EventExit waiter.EventMask = 1 << iota
// EventChildGroupStop occurs when a child thread group completes a group
// stop (i.e. all tasks in the child thread group have entered a stopped
// state as a result of a group stop).
EventChildGroupStop
// EventTraceeStop occurs when a task that is ptraced by a task in the
// notified thread group enters a ptrace stop (see ptrace(2)).
EventTraceeStop
// EventGroupContinue occurs when a child thread group, or a thread group
// whose leader is ptraced by a task in the notified thread group, that had
// initiated or completed a group stop leaves the group stop, due to the
// child thread group or any task in the child thread group being sent
// SIGCONT.
EventGroupContinue
)
// WaitOptions controls the behavior of Task.Wait.
type WaitOptions struct {
// If SpecificTID is non-zero, only events from the task with thread ID
// SpecificTID are eligible to be waited for. SpecificTID is resolved in
// the PID namespace of the waiter (the method receiver of Task.Wait). If
// no such task exists, or that task would not otherwise be eligible to be
// waited for by the waiting task, then there are no waitable tasks and
// Wait will return ECHILD.
SpecificTID ThreadID
// If SpecificPGID is non-zero, only events from ThreadGroups with a
// matching ProcessGroupID are eligible to be waited for. (Same
// constraints as SpecificTID apply.)
SpecificPGID ProcessGroupID
// Terminology note: Per waitpid(2), "a clone child is one which delivers
// no signal, or a signal other than SIGCHLD to its parent upon
// termination." In Linux, termination signal is technically a per-task
// property rather than a per-thread-group property. However, clone()
// forces no termination signal for tasks created with CLONE_THREAD, and
// execve() resets the termination signal to SIGCHLD, so all
// non-group-leader threads have no termination signal and are therefore
// "clone tasks".
// If NonCloneTasks is true, events from non-clone tasks are eligible to be
// waited for.
NonCloneTasks bool
// If CloneTasks is true, events from clone tasks are eligible to be waited
// for.
CloneTasks bool
// Events is a bitwise combination of the events defined above that specify
// what events are of interest to the call to Wait.
Events waiter.EventMask
// If ConsumeEvent is true, the Wait should consume the event such that it
// cannot be returned by a future Wait. Note that if a task exit is
// consumed in this way, in most cases the task will be reaped.
ConsumeEvent bool
// If BlockInterruptErr is not nil, Wait will block until either an event
// is available or there are no tasks that could produce a waitable event;
// if that blocking is interrupted, Wait returns BlockInterruptErr. If
// BlockInterruptErr is nil, Wait will not block.
BlockInterruptErr error
}
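// wait4OptionsExample is an illustrative, non-authoritative sketch of how a
// wait4()-style caller might translate Linux's __WCLONE/__WALL semantics into
// WaitOptions; the real handler lives elsewhere and also handles WNOHANG,
// WUNTRACED, WCONTINUED, etc. The use of syserror.EINTR is an assumption for
// illustration (a real handler would typically use a restartable error).
func wait4OptionsExample(pid ThreadID, cloneChildren, allChildren bool) *WaitOptions {
	return &WaitOptions{
		SpecificTID:       pid, // 0 selects any eligible task
		NonCloneTasks:     !cloneChildren || allChildren,
		CloneTasks:        cloneChildren || allChildren,
		Events:            EventExit,
		ConsumeEvent:      true,
		BlockInterruptErr: syserror.EINTR,
	}
}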
// Preconditions: The TaskSet mutex must be locked (for reading or writing).
func (o *WaitOptions) matchesTask(t *Task, pidns *PIDNamespace) bool {
if o.SpecificTID != 0 && o.SpecificTID != pidns.tids[t] {
return false
}
if o.SpecificPGID != 0 && o.SpecificPGID != pidns.pgids[t.tg.processGroup] {
return false
}
if t == t.tg.leader && t.tg.terminationSignal == linux.SIGCHLD {
return o.NonCloneTasks
}
return o.CloneTasks
}
// ErrNoWaitableEvent is returned by non-blocking Task.Waits (e.g.
// waitpid(WNOHANG)) that find no waitable events, but determine that waitable
// events may exist in the future. (In contrast, if a non-blocking or blocking
// Wait determines that there are no tasks that can produce a waitable event,
// Task.Wait returns ECHILD.)
var ErrNoWaitableEvent = errors.New("non-blocking Wait found eligible threads but no waitable events")
// WaitResult contains information about a waited-for event.
type WaitResult struct {
// Task is the task that reported the event.
Task *Task
// TID is the thread ID of Task in the PID namespace of the task that
// called Wait (that is, the method receiver of the call to Task.Wait). TID
// is provided because consuming exit waits cause the thread ID to be
// deallocated.
TID ThreadID
// UID is the real UID of Task in the user namespace of the task that
// called Wait.
UID auth.UID
// Event is exactly one of the events defined above.
Event waiter.EventMask
// Status is the numeric status associated with the event.
Status uint32
}
// Wait waits for an event from a thread group that is a child of t's thread
// group, or a task in such a thread group, or a task that is ptraced by t,
// subject to the options specified in opts.
func (t *Task) Wait(opts *WaitOptions) (*WaitResult, error) {
if opts.BlockInterruptErr == nil {
return t.waitOnce(opts)
}
w, ch := waiter.NewChannelEntry(nil)
t.tg.eventQueue.EventRegister(&w, opts.Events)
defer t.tg.eventQueue.EventUnregister(&w)
for {
wr, err := t.waitOnce(opts)
if err != ErrNoWaitableEvent {
// This includes err == nil.
return wr, err
}
if err := t.Block(ch); err != nil {
return wr, syserror.ConvertIntr(err, opts.BlockInterruptErr)
}
}
}
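// waitAnyChildExample is an illustrative, non-authoritative sketch of reaping
// any exited (non-clone) child leader via Task.Wait; the option choices and
// use of syserror.EINTR are assumptions, not the real wait4 implementation.
func waitAnyChildExample(t *Task) (ThreadID, uint32, error) {
	wr, err := t.Wait(&WaitOptions{
		NonCloneTasks:     true,
		Events:            EventExit,
		ConsumeEvent:      true, // actually reap the zombie
		BlockInterruptErr: syserror.EINTR,
	})
	if err != nil {
		return 0, 0, err
	}
	return wr.TID, wr.Status, nil
}
// waitOnce makes a single non-blocking scan over the children and tracees of
// all tasks in t's thread group, returning ErrNoWaitableEvent if eligible
// tasks exist but none currently has a waitable event.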
func (t *Task) waitOnce(opts *WaitOptions) (*WaitResult, error) {
anyWaitableTasks := false
t.tg.pidns.owner.mu.Lock()
defer t.tg.pidns.owner.mu.Unlock()
// Without the (unimplemented) __WNOTHREAD flag, a task can wait on the
// children and tracees of any task in the same thread group.
for parent := t.tg.tasks.Front(); parent != nil; parent = parent.Next() {
for child := range parent.children {
if !opts.matchesTask(child, parent.tg.pidns) {
continue
}
// Non-leaders don't notify parents on exit and aren't eligible to
// be waited on.
if opts.Events&EventExit != 0 && child == child.tg.leader && !child.exitParentAcked {
anyWaitableTasks = true
if wr := t.waitCollectZombieLocked(child, opts, false); wr != nil {
return wr, nil
}
}
// Check for group stops and continues. Tasks that have passed
// TaskExitInitiated can no longer participate in group stops.
if opts.Events&(EventChildGroupStop|EventGroupContinue) == 0 {
continue
}
if child.exitState >= TaskExitInitiated {
continue
}
// If the waiter is in the same thread group as the task's
// tracer, do not report its group stops; they will be reported
// as ptrace stops instead. This also skips checking for group
// continues, but they'll be checked for when scanning tracees
// below. (Per kernel/exit.c:wait_consider_task(): "If a
// ptracer wants to distinguish the two events for its own
// children, it should create a separate process which takes
// the role of real parent.")
if tracer := child.Tracer(); tracer != nil && tracer.tg == parent.tg {
continue
}
anyWaitableTasks = true
if opts.Events&EventChildGroupStop != 0 {
if wr := t.waitCollectChildGroupStopLocked(child, opts); wr != nil {
return wr, nil
}
}
if opts.Events&EventGroupContinue != 0 {
if wr := t.waitCollectGroupContinueLocked(child, opts); wr != nil {
return wr, nil
}
}
}
for tracee := range parent.ptraceTracees {
if !opts.matchesTask(tracee, parent.tg.pidns) {
continue
}
// Non-leaders do notify tracers on exit.
if opts.Events&EventExit != 0 && !tracee.exitTracerAcked {
anyWaitableTasks = true
if wr := t.waitCollectZombieLocked(tracee, opts, true); wr != nil {
return wr, nil
}
}
if opts.Events&(EventTraceeStop|EventGroupContinue) == 0 {
continue
}
if tracee.exitState >= TaskExitInitiated {
continue
}
anyWaitableTasks = true
if opts.Events&EventTraceeStop != 0 {
if wr := t.waitCollectTraceeStopLocked(tracee, opts); wr != nil {
return wr, nil
}
}
if opts.Events&EventGroupContinue != 0 {
if wr := t.waitCollectGroupContinueLocked(tracee, opts); wr != nil {
return wr, nil
}
}
}
}
if anyWaitableTasks {
return nil, ErrNoWaitableEvent
}
return nil, syserror.ECHILD
}
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitCollectZombieLocked(target *Task, opts *WaitOptions, asPtracer bool) *WaitResult {
if asPtracer && !target.exitTracerNotified {
return nil
}
if !asPtracer && !target.exitParentNotified {
return nil
}
// Zombied thread group leaders are never waitable until their thread group
// is otherwise empty. Usually this is caught by the
// target.exitParentNotified check above, but if t is both (in the thread
// group of) target's tracer and parent, asPtracer may be true.
if target == target.tg.leader && target.tg.tasksCount != 1 {
return nil
}
pid := t.tg.pidns.tids[target]
uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
status := target.exitStatus.Status()
if !opts.ConsumeEvent {
return &WaitResult{
Task: target,
TID: pid,
UID: uid,
Event: EventExit,
Status: status,
}
}
// Surprisingly, the exit status reported by a non-consuming wait can
// differ from that reported by a consuming wait; the latter will return
// the group exit code if one is available.
if target.tg.exiting {
status = target.tg.exitStatus.Status()
}
// t may be (in the thread group of) target's parent, tracer, or both. We
// don't need to check for !exitTracerAcked because tracees are detached
// here, and we don't need to check for !exitParentAcked because zombies
// will be reaped here.
if tracer := target.Tracer(); tracer != nil && tracer.tg == t.tg && target.exitTracerNotified {
target.exitTracerAcked = true
target.ptraceTracer.Store((*Task)(nil))
delete(t.ptraceTracees, target)
}
if target.parent != nil && target.parent.tg == t.tg && target.exitParentNotified {
target.exitParentAcked = true
if target == target.tg.leader {
// target.tg.exitedCPUStats doesn't include target.CPUStats() yet,
// and won't until after target.exitNotifyLocked() (maybe). Include
// target.CPUStats() explicitly. This is consistent with Linux,
// which accounts an exited task's cputime to its thread group in
// kernel/exit.c:release_task() => __exit_signal(), and uses
// thread_group_cputime_adjusted() in wait_task_zombie().
t.tg.childCPUStats.Accumulate(target.CPUStats())
t.tg.childCPUStats.Accumulate(target.tg.exitedCPUStats)
t.tg.childCPUStats.Accumulate(target.tg.childCPUStats)
// Update t's child max resident set size. The size will be the maximum
// of this thread's size and all its children's sizes.
if t.tg.childMaxRSS < target.tg.maxRSS {
t.tg.childMaxRSS = target.tg.maxRSS
}
if t.tg.childMaxRSS < target.tg.childMaxRSS {
t.tg.childMaxRSS = target.tg.childMaxRSS
}
}
}
target.exitNotifyLocked(false)
return &WaitResult{
Task: target,
TID: pid,
UID: uid,
Event: EventExit,
Status: status,
}
}
// updateRSSLocked updates t.tg.maxRSS.
//
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) updateRSSLocked() {
if mmMaxRSS := t.MemoryManager().MaxResidentSetSize(); t.tg.maxRSS < mmMaxRSS {
t.tg.maxRSS = mmMaxRSS
}
}
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitCollectChildGroupStopLocked(target *Task, opts *WaitOptions) *WaitResult {
target.tg.signalHandlers.mu.Lock()
defer target.tg.signalHandlers.mu.Unlock()
if !target.tg.groupStopWaitable {
return nil
}
pid := t.tg.pidns.tids[target]
uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
sig := target.tg.groupStopSignal
if opts.ConsumeEvent {
target.tg.groupStopWaitable = false
}
return &WaitResult{
Task: target,
TID: pid,
UID: uid,
Event: EventChildGroupStop,
// There is no name for these status constants; this is the wait status
// encoding for a stop by signal sig (e.g. a stop by SIGSTOP (19) yields
// 0x137f).
Status: (uint32(sig)&0xff)<<8 | 0x7f,
}
}
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitCollectGroupContinueLocked(target *Task, opts *WaitOptions) *WaitResult {
target.tg.signalHandlers.mu.Lock()
defer target.tg.signalHandlers.mu.Unlock()
if !target.tg.groupContWaitable {
return nil
}
pid := t.tg.pidns.tids[target]
uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
if opts.ConsumeEvent {
target.tg.groupContWaitable = false
}
return &WaitResult{
Task: target,
TID: pid,
UID: uid,
Event: EventGroupContinue,
Status: 0xffff,
}
}
// Preconditions: The TaskSet mutex must be locked for writing.
func (t *Task) waitCollectTraceeStopLocked(target *Task, opts *WaitOptions) *WaitResult {
target.tg.signalHandlers.mu.Lock()
defer target.tg.signalHandlers.mu.Unlock()
if target.stop == nil {
return nil
}
if _, ok := target.stop.(*ptraceStop); !ok {
return nil
}
if target.ptraceCode == 0 {
return nil
}
pid := t.tg.pidns.tids[target]
uid := target.Credentials().RealKUID.In(t.UserNamespace()).OrOverflow()
code := target.ptraceCode
if opts.ConsumeEvent {
target.ptraceCode = 0
}
return &WaitResult{
Task: target,
TID: pid,
UID: uid,
Event: EventTraceeStop,
Status: uint32(code)<<8 | 0x7f,
}
}
// ExitState returns t's current progress through the exit path.
func (t *Task) ExitState() TaskExitState {
t.tg.pidns.owner.mu.RLock()
defer t.tg.pidns.owner.mu.RUnlock()
return t.exitState
}
// ParentDeathSignal returns t's parent death signal.
func (t *Task) ParentDeathSignal() linux.Signal {
t.mu.Lock()
defer t.mu.Unlock()
return t.parentDeathSignal
}
// SetParentDeathSignal sets t's parent death signal.
func (t *Task) SetParentDeathSignal(sig linux.Signal) {
t.mu.Lock()
defer t.mu.Unlock()
t.parentDeathSignal = sig
}