gvisor/pkg/sentry/kernel/task_start.go

260 lines
7.4 KiB
Go

// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package kernel
import (
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/auth"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/futex"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
// TaskConfig defines the configuration of a new Task (see below).
//
// A TaskConfig is consumed by TaskSet.NewTask, which takes ownership of the
// TaskContext and TaskResources it points to whether or not task creation
// succeeds.
type TaskConfig struct {
// Kernel is the owning Kernel. The new task's platform context is obtained
// from Kernel.Platform.
*Kernel
// Parent is the new task's parent. Parent may be nil.
Parent *Task
// If InheritParent is not nil, use InheritParent's parent as the new
// task's parent. This takes precedence over Parent.
InheritParent *Task
// ThreadGroup is the ThreadGroup the new task belongs to. If the thread
// group has no leader yet, the new task becomes its leader.
*ThreadGroup
// TaskContext is the TaskContext of the new task. The pointed-to value is
// copied into the new task.
*TaskContext
// TaskResources is the TaskResources of the new task. The pointed-to value
// is copied into the new task.
*TaskResources
// Credentials is the Credentials of the new task.
Credentials *auth.Credentials
// Niceness is the niceness of the new task.
Niceness int
// If NetworkNamespaced is true, the new task should observe a non-root
// network namespace.
NetworkNamespaced bool
// AllowedCPUMask contains the cpus that this task can run on. The new task
// stores a copy of the mask, not the mask itself.
AllowedCPUMask sched.CPUSet
// UTSNamespace is the UTSNamespace of the new task.
UTSNamespace *UTSNamespace
// IPCNamespace is the IPCNamespace of the new task.
IPCNamespace *IPCNamespace
}
// NewTask creates the task described by cfg and registers it with ts.
//
// NewTask always takes ownership of cfg.TaskContext and cfg.TaskResources:
// on success they belong to the returned task, and on failure NewTask releases
// both before returning the error, so the caller must not release them itself.
//
// NewTask does not start the returned task; the caller must call Task.Start.
func (ts *TaskSet) NewTask(cfg *TaskConfig) (*Task, error) {
	task, err := ts.newTask(cfg)
	if err == nil {
		return task, nil
	}

	// newTask only assumes ownership when it succeeds, so on failure the
	// references handed over by the caller have to be dropped here.
	cfg.TaskContext.release()
	cfg.TaskResources.release()
	return nil, err
}
// newTask is a helper for TaskSet.NewTask that only takes ownership of TaskContext
// and TaskResources of the TaskConfig if it succeeds.
//
// The Task value is built from cfg before ts.mu is taken. Then, with ts.mu and
// the thread group's signalHandlers.mu held, newTask assigns thread IDs and
// links the task into its parent and its thread group.
//
// newTask returns syserror.EINTR if the thread group is exiting or has an exec
// in progress (tg.execing != nil), or the error from assignTIDsLocked if thread
// ID allocation fails. In both cases the task has not been linked into its
// parent or its thread group.
func (ts *TaskSet) newTask(cfg *TaskConfig) (*Task, error) {
tg := cfg.ThreadGroup
tc := cfg.TaskContext
t := &Task{
taskNode: taskNode{
tg: tg,
// May be replaced below if cfg.InheritParent is set.
parent: cfg.Parent,
children: make(map[*Task]struct{}),
},
// NOTE(review): a typed nil *runApp; presumably runState is an interface,
// so this compares unequal to nil in Task.Start — confirm against the
// declaration of Task.
runState: (*runApp)(nil),
interruptChan: make(chan struct{}, 1),
signalStack: arch.SignalStack{Flags: arch.SignalStackFlagDisable},
// The task holds copies of the TaskContext and TaskResources values, not
// the pointers from cfg.
tc: *tc,
tr: *cfg.TaskResources,
p: cfg.Kernel.Platform.NewContext(),
k: cfg.Kernel,
ptraceTracees: make(map[*Task]struct{}),
allowedCPUMask: cfg.AllowedCPUMask.Copy(),
ioUsage: &usage.IO{},
creds: cfg.Credentials,
niceness: cfg.Niceness,
netns: cfg.NetworkNamespaced,
utsns: cfg.UTSNamespace,
ipcns: cfg.IPCNamespace,
// NOTE(review): -1 presumably means "no rseq CPU recorded" — confirm.
rseqCPU: -1,
futexWaiter: futex.NewWaiter(),
}
// endStopCond shares the thread group's signal handler mutex as its lock.
t.endStopCond.L = &t.tg.signalHandlers.mu
// Seed ptraceTracer with a typed nil *Task.
t.ptraceTracer.Store((*Task)(nil))
// We don't construct t.blockingTimer until Task.run(); see that function
// for justification.
// Make the new task (and possibly thread group) visible to the rest of
// the system atomically.
//
// Lock order: ts.mu, then tg.signalHandlers.mu, then (further below) t.mu.
// The deferred unlocks release them in the reverse order on every return
// path.
ts.mu.Lock()
defer ts.mu.Unlock()
tg.signalHandlers.mu.Lock()
defer tg.signalHandlers.mu.Unlock()
if tg.exiting || tg.execing != nil {
// If the caller is in the same thread group, then what we return
// doesn't matter too much since the caller will exit before it returns
// to userspace. If the caller isn't in the same thread group, then
// we're in uncharted territory and can return whatever we want.
return nil, syserror.EINTR
}
// assignTIDsLocked undoes its own partial work on failure, so nothing needs
// to be rolled back here.
if err := ts.assignTIDsLocked(t); err != nil {
return nil, err
}
// Below this point, newTask is expected not to fail (there is no rollback
// of assignTIDsLocked or any of the following).
// Logging on t's behalf will panic if t.logPrefix hasn't been initialized.
// This is the earliest point at which we can do so (since t now has thread
// IDs).
t.updateLogPrefixLocked()
// InheritParent, when set, overrides the parent taken from cfg.Parent.
if cfg.InheritParent != nil {
t.parent = cfg.InheritParent.parent
}
if t.parent != nil {
t.parent.children[t] = struct{}{}
}
if tg.leader == nil {
// New thread group.
tg.leader = t
if parentPG := tg.parentPG(); parentPG == nil {
// No parent process group to inherit from.
tg.createSession()
} else {
// Inherit the process group.
parentPG.incRefWithParent(parentPG)
tg.processGroup = parentPG
}
}
// Link the task into the thread group and account for it in all three task
// counters.
tg.tasks.PushBack(t)
tg.tasksCount++
tg.liveTasks++
tg.activeTasks++
// Propagate external TaskSet stops to the new task.
t.stopCount = ts.stopCount
// t.mu is held until newTask returns while the CPU and start time are
// recorded.
t.mu.Lock()
defer t.mu.Unlock()
// The CPU is chosen from the allowed mask using t's TID in ts.Root
// (presumably the root PID namespace).
t.cpu = assignCPU(t.allowedCPUMask, ts.Root.tids[t])
t.startTime = t.k.RealtimeClock().Now()
return t, nil
}
// assignTIDsLocked makes new task t visible in every PID namespace in which it
// should be visible: the namespace of t's thread group and each of that
// namespace's ancestors. In each of them t is registered under a freshly
// allocated ThreadID.
//
// If allocation fails in some namespace, the registrations already made in
// its descendants are undone and the allocation error is returned, so t ends
// up registered either in the whole chain or nowhere.
//
// Preconditions: ts.mu must be locked for writing.
func (ts *TaskSet) assignTIDsLocked(t *Task) error {
	leaf := t.tg.pidns
	for cur := leaf; cur != nil; cur = cur.parent {
		tid, err := cur.allocateTID()
		if err == nil {
			cur.tasks[tid] = t
			cur.tids[t] = tid
			continue
		}

		// Failure. Every namespace from leaf up to (but excluding) cur has
		// already registered t; look up the TID each one recorded and remove
		// both directions of the mapping.
		for done := leaf; done != cur; done = done.parent {
			delete(done.tasks, done.tids[t])
			delete(done.tids, t)
		}
		return err
	}
	return nil
}
// allocateTID returns an unused ThreadID from ns and records it in ns.last.
//
// Candidates are probed in increasing order starting just after ns.last.
// Once a candidate exceeds TasksLimit the scan wraps around to InitTID + 1, so
// InitTID is only ever handed out when the scan starts below it.
//
// allocateTID returns syserror.ENOMEM if ns is exiting and syserror.EAGAIN if
// every candidate is in use. On failure ns.last is left unchanged.
//
// Preconditions: ns.owner.mu must be locked for writing.
func (ns *PIDNamespace) allocateTID() (ThreadID, error) {
	if ns.exiting {
		// "In this case, a subsequent fork(2) into this PID namespace will
		// fail with the error ENOMEM; it is not possible to create a new
		// processes [sic] in a PID namespace whose init process has
		// terminated." - pid_namespaces(7)
		return 0, syserror.ENOMEM
	}

	tid := ns.last
	// There are at most TasksLimit distinct candidates, so TasksLimit probes
	// are enough to visit all of them. Bounding the loop guarantees
	// termination even when ns.last lies outside the wrap-around range
	// [InitTID + 1, TasksLimit] (e.g. it is still 0 or InitTID), in which case
	// the "back at ns.last" check below can never fire.
	for probes := ThreadID(0); probes < TasksLimit; probes++ {
		// Next.
		tid++
		if tid > TasksLimit {
			tid = InitTID + 1
		}

		// Is it available?
		if _, inUse := ns.tasks[tid]; !inUse {
			ns.last = tid
			return tid, nil
		}

		// Did we do a full cycle?
		if tid == ns.last {
			break
		}
	}

	// No tid available.
	return 0, syserror.EAGAIN
}
// Start launches the task goroutine. It must be called exactly once for every
// task returned by NewTask.
//
// 'tid' must be the task's TID in the root PID namespace. It is used for
// debugging purposes only: it is passed to Task.run as an argument so that it
// shows up in stack dumps.
func (t *Task) Start(tid ThreadID) {
	if t.runState == nil {
		// A restored task may be "started" even though it had already
		// exited; there is no goroutine to launch in that case.
		return
	}

	// Account for the goroutine before it exists: first on the task itself,
	// then on its thread group, then on the owner of its PID namespace.
	t.goroutineStopped.Add(1)
	tg := t.tg
	tg.liveGoroutines.Add(1)
	pidns := tg.pidns
	pidns.owner.liveGoroutines.Add(1)
	pidns.owner.runningGoroutines.Add(1)

	// Task is now running in system mode.
	t.accountTaskGoroutineLeave(TaskGoroutineNonexistent)

	// The root-namespace TID is passed purely so that it appears in the
	// goroutine's stack dump.
	go t.run(uintptr(tid)) // S/R-SAFE: synchronizes with saving through stops
}