330 lines
10 KiB
Go
330 lines
10 KiB
Go
// Copyright 2018 Google Inc.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package kernel
|
|
|
|
// CPU scheduling, real and fake.
|
|
|
|
import (
|
|
"fmt"
|
|
"sync/atomic"
|
|
"time"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/hostcpu"
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/sched"
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/usage"
|
|
"gvisor.googlesource.com/gvisor/pkg/syserror"
|
|
)
|
|
|
|
// TaskGoroutineState is a coarse representation of the current execution
// status of a kernel.Task goroutine.
type TaskGoroutineState int

const (
	// TaskGoroutineNonexistent indicates that the task goroutine has either
	// not yet been created by Task.Start() or has returned from Task.run().
	// This must be the zero value for TaskGoroutineState.
	TaskGoroutineNonexistent TaskGoroutineState = iota

	// TaskGoroutineRunningSys indicates that the task goroutine is executing
	// sentry code.
	TaskGoroutineRunningSys

	// TaskGoroutineRunningApp indicates that the task goroutine is executing
	// application code.
	TaskGoroutineRunningApp

	// TaskGoroutineBlockedInterruptible indicates that the task goroutine is
	// blocked in Task.block(), and hence may be woken by Task.interrupt()
	// (e.g. due to signal delivery).
	TaskGoroutineBlockedInterruptible

	// TaskGoroutineBlockedUninterruptible indicates that the task goroutine is
	// stopped outside of Task.block() and Task.doStop(), and hence cannot be
	// woken by Task.interrupt().
	TaskGoroutineBlockedUninterruptible

	// TaskGoroutineStopped indicates that the task goroutine is blocked in
	// Task.doStop(). TaskGoroutineStopped is similar to
	// TaskGoroutineBlockedUninterruptible, but is a separate state to make it
	// possible to determine when Task.stop is meaningful.
	TaskGoroutineStopped
)
|
|
|
|
// TaskGoroutineSchedInfo contains task goroutine scheduling state which must
// be read and updated atomically.
//
// Readers and writers synchronize through Task.goschedSeq (see
// accountTaskGoroutineEnter/Leave and SeqAtomicLoadTaskGoroutineSchedInfo).
type TaskGoroutineSchedInfo struct {
	// Timestamp was the value of Kernel.cpuClock when this
	// TaskGoroutineSchedInfo was last updated.
	Timestamp uint64

	// State is the current state of the task goroutine.
	State TaskGoroutineState

	// UserTicks is the amount of time the task goroutine has spent executing
	// its associated Task's application code, in units of linux.ClockTick.
	UserTicks uint64

	// SysTicks is the amount of time the task goroutine has spent executing in
	// the sentry, in units of linux.ClockTick.
	SysTicks uint64
}
|
|
|
|
// accountTaskGoroutineEnter records the task goroutine's transition from
// TaskGoroutineRunningSys into state, crediting the time elapsed since the
// last update to SysTicks.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) accountTaskGoroutineEnter(state TaskGoroutineState) {
	now := t.k.CPUClockNow()
	if t.gosched.State != TaskGoroutineRunningSys {
		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, TaskGoroutineRunningSys, state))
	}
	// Updates to t.gosched happen inside a goschedSeq write section so that
	// concurrent readers (TaskGoroutineSchedInfo) see a consistent snapshot.
	t.goschedSeq.BeginWrite()
	// This function is very hot; avoid defer.
	t.gosched.SysTicks += now - t.gosched.Timestamp
	t.gosched.Timestamp = now
	t.gosched.State = state
	t.goschedSeq.EndWrite()
}
|
|
|
|
// accountTaskGoroutineLeave records the task goroutine's transition from
// state back into TaskGoroutineRunningSys, crediting the elapsed time to
// UserTicks only if the goroutine was running application code.
//
// Preconditions: The caller must be running on the task goroutine, and leaving
// a state indicated by a previous call to
// t.accountTaskGoroutineEnter(state).
func (t *Task) accountTaskGoroutineLeave(state TaskGoroutineState) {
	now := t.k.CPUClockNow()
	if t.gosched.State != state {
		panic(fmt.Sprintf("Task goroutine switching from state %v (expected %v) to %v", t.gosched.State, state, TaskGoroutineRunningSys))
	}
	// Updates to t.gosched happen inside a goschedSeq write section so that
	// concurrent readers (TaskGoroutineSchedInfo) see a consistent snapshot.
	t.goschedSeq.BeginWrite()
	// This function is very hot; avoid defer.
	if state == TaskGoroutineRunningApp {
		// Time spent in blocked/stopped states is not charged to either
		// tick counter; only application execution counts as UserTicks.
		t.gosched.UserTicks += now - t.gosched.Timestamp
	}
	t.gosched.Timestamp = now
	t.gosched.State = TaskGoroutineRunningSys
	t.goschedSeq.EndWrite()
}
|
|
|
|
// TaskGoroutineSchedInfo returns a copy of t's task goroutine scheduling info.
// Most clients should use t.CPUStats() instead.
//
// The copy is taken with a seqcount-protected atomic load, so it is safe to
// call from any goroutine.
func (t *Task) TaskGoroutineSchedInfo() TaskGoroutineSchedInfo {
	return SeqAtomicLoadTaskGoroutineSchedInfo(&t.goschedSeq, &t.gosched)
}
|
|
|
|
// CPUStats returns the CPU usage statistics of t.
//
// The stats are extrapolated to the current CPU clock time, so time spent in
// the goroutine's current state since the last update is included.
func (t *Task) CPUStats() usage.CPUStats {
	return t.cpuStatsAt(t.k.CPUClockNow())
}
|
|
|
|
// Preconditions: now <= Kernel.CPUClockNow(). (Since Kernel.cpuClock is
|
|
// monotonic, this is satisfied if now is the result of a previous call to
|
|
// Kernel.CPUClockNow().) This requirement exists because otherwise a racing
|
|
// change to t.gosched can cause cpuStatsAt to adjust stats by too much, making
|
|
// the returned stats non-monotonic.
|
|
func (t *Task) cpuStatsAt(now uint64) usage.CPUStats {
|
|
tsched := t.TaskGoroutineSchedInfo()
|
|
if tsched.Timestamp < now {
|
|
// Update stats to reflect execution since the last update to
|
|
// t.gosched.
|
|
switch tsched.State {
|
|
case TaskGoroutineRunningSys:
|
|
tsched.SysTicks += now - tsched.Timestamp
|
|
case TaskGoroutineRunningApp:
|
|
tsched.UserTicks += now - tsched.Timestamp
|
|
}
|
|
}
|
|
return usage.CPUStats{
|
|
UserTime: time.Duration(tsched.UserTicks * uint64(linux.ClockTick)),
|
|
SysTime: time.Duration(tsched.SysTicks * uint64(linux.ClockTick)),
|
|
VoluntarySwitches: atomic.LoadUint64(&t.yieldCount),
|
|
}
|
|
}
|
|
|
|
// CPUStats returns the combined CPU usage statistics of all past and present
|
|
// threads in tg.
|
|
func (tg *ThreadGroup) CPUStats() usage.CPUStats {
|
|
tg.pidns.owner.mu.RLock()
|
|
defer tg.pidns.owner.mu.RUnlock()
|
|
// Hack to get a pointer to the Kernel.
|
|
if tg.leader == nil {
|
|
// Per comment on tg.leader, this is only possible if nothing in the
|
|
// ThreadGroup has ever executed anyway.
|
|
return usage.CPUStats{}
|
|
}
|
|
now := tg.leader.k.CPUClockNow()
|
|
stats := tg.exitedCPUStats
|
|
// Account for active tasks.
|
|
for t := tg.tasks.Front(); t != nil; t = t.Next() {
|
|
stats.Accumulate(t.cpuStatsAt(now))
|
|
}
|
|
return stats
|
|
}
|
|
|
|
// JoinedChildCPUStats implements the semantics of RUSAGE_CHILDREN: "Return
|
|
// resource usage statistics for all children of [tg] that have terminated and
|
|
// been waited for. These statistics will include the resources used by
|
|
// grandchildren, and further removed descendants, if all of the intervening
|
|
// descendants waited on their terminated children."
|
|
func (tg *ThreadGroup) JoinedChildCPUStats() usage.CPUStats {
|
|
tg.pidns.owner.mu.RLock()
|
|
defer tg.pidns.owner.mu.RUnlock()
|
|
return tg.childCPUStats
|
|
}
|
|
|
|
// StateStatus returns a string representation of the task's current state,
// appropriate for /proc/[pid]/status.
func (t *Task) StateStatus() string {
	switch s := t.TaskGoroutineSchedInfo().State; s {
	case TaskGoroutineNonexistent:
		// t.exitState is protected by the TaskSet mutex.
		t.tg.pidns.owner.mu.RLock()
		defer t.tg.pidns.owner.mu.RUnlock()
		switch t.exitState {
		case TaskExitZombie:
			return "Z (zombie)"
		case TaskExitDead:
			return "X (dead)"
		default:
			// The task goroutine can't exit before passing through
			// runExitNotify, so this indicates that the task has been created,
			// but the task goroutine hasn't yet started. The Linux equivalent
			// is struct task_struct::state == TASK_NEW
			// (kernel/fork.c:copy_process() =>
			// kernel/sched/core.c:sched_fork()), but the TASK_NEW bit is
			// masked out by TASK_REPORT for /proc/[pid]/status, leaving only
			// TASK_RUNNING.
			return "R (running)"
		}
	case TaskGoroutineRunningSys, TaskGoroutineRunningApp:
		return "R (running)"
	case TaskGoroutineBlockedInterruptible:
		return "S (sleeping)"
	case TaskGoroutineStopped:
		// t.stop is protected by the signal mutex.
		t.tg.signalHandlers.mu.Lock()
		defer t.tg.signalHandlers.mu.Unlock()
		switch t.stop.(type) {
		case *groupStop:
			return "T (stopped)"
		case *ptraceStop:
			return "t (tracing stop)"
		}
		// Any other stop type is reported the same way as an
		// uninterruptible block.
		fallthrough
	case TaskGoroutineBlockedUninterruptible:
		// This is the name Linux uses for TASK_UNINTERRUPTIBLE and
		// TASK_KILLABLE (= TASK_UNINTERRUPTIBLE | TASK_WAKEKILL):
		// fs/proc/array.c:task_state_array.
		return "D (disk sleep)"
	default:
		panic(fmt.Sprintf("Invalid TaskGoroutineState: %v", s))
	}
}
|
|
|
|
// CPUMask returns a copy of t's allowed CPU mask.
|
|
func (t *Task) CPUMask() sched.CPUSet {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
return t.allowedCPUMask.Copy()
|
|
}
|
|
|
|
// SetCPUMask sets t's allowed CPU mask based on mask. It takes ownership of
|
|
// mask.
|
|
//
|
|
// Preconditions: mask.Size() ==
|
|
// sched.CPUSetSize(t.Kernel().ApplicationCores()).
|
|
func (t *Task) SetCPUMask(mask sched.CPUSet) error {
|
|
if want := sched.CPUSetSize(t.k.applicationCores); mask.Size() != want {
|
|
panic(fmt.Sprintf("Invalid CPUSet %v (expected %d bytes)", mask, want))
|
|
}
|
|
|
|
// Remove CPUs in mask above Kernel.applicationCores.
|
|
mask.ClearAbove(t.k.applicationCores)
|
|
|
|
// Ensure that at least 1 CPU is still allowed.
|
|
if mask.NumCPUs() == 0 {
|
|
return syserror.EINVAL
|
|
}
|
|
|
|
if t.k.useHostCores {
|
|
// No-op; pretend the mask was immediately changed back.
|
|
return nil
|
|
}
|
|
|
|
t.tg.pidns.owner.mu.RLock()
|
|
rootTID := t.tg.pidns.owner.Root.tids[t]
|
|
t.tg.pidns.owner.mu.RUnlock()
|
|
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
t.allowedCPUMask = mask
|
|
atomic.StoreInt32(&t.cpu, assignCPU(mask, rootTID))
|
|
return nil
|
|
}
|
|
|
|
// CPU returns the cpu id for a given task.
|
|
func (t *Task) CPU() int32 {
|
|
if t.k.useHostCores {
|
|
return int32(hostcpu.GetCPU())
|
|
}
|
|
|
|
return atomic.LoadInt32(&t.cpu)
|
|
}
|
|
|
|
// assignCPU returns the virtualized CPU number for the task with global TID
|
|
// tid and allowedCPUMask allowed.
|
|
func assignCPU(allowed sched.CPUSet, tid ThreadID) (cpu int32) {
|
|
// To pretend that threads are evenly distributed to allowed CPUs, choose n
|
|
// to be less than the number of CPUs in allowed ...
|
|
n := int(tid) % int(allowed.NumCPUs())
|
|
// ... then pick the nth CPU in allowed.
|
|
allowed.ForEachCPU(func(c uint) {
|
|
if n--; n == 0 {
|
|
cpu = int32(c)
|
|
}
|
|
})
|
|
return cpu
|
|
}
|
|
|
|
// Niceness returns t's niceness.
|
|
func (t *Task) Niceness() int {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
return t.niceness
|
|
}
|
|
|
|
// Priority returns t's priority.
|
|
func (t *Task) Priority() int {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
return t.niceness + 20
|
|
}
|
|
|
|
// SetNiceness sets t's niceness to n.
|
|
func (t *Task) SetNiceness(n int) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
t.niceness = n
|
|
}
|
|
|
|
// NumaPolicy returns t's current numa policy.
|
|
func (t *Task) NumaPolicy() (policy int32, nodeMask uint32) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
return t.numaPolicy, t.numaNodeMask
|
|
}
|
|
|
|
// SetNumaPolicy sets t's numa policy.
|
|
func (t *Task) SetNumaPolicy(policy int32, nodeMask uint32) {
|
|
t.mu.Lock()
|
|
defer t.mu.Unlock()
|
|
t.numaPolicy = policy
|
|
t.numaNodeMask = nodeMask
|
|
}
|