gvisor/pkg/sentry/control/proc.go

521 lines
16 KiB
Go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package control
import (
"bytes"
"encoding/json"
"fmt"
"path"
"sort"
"strings"
"text/tabwriter"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/host"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/usage"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/urpc"
)
// Proc includes task-related functions.
//
// Its methods in this file launch new processes (Exec) and produce process
// listings (Ps).
type Proc struct {
	// Kernel is the kernel in which new processes are created and started,
	// and from which process listings are read.
	Kernel *kernel.Kernel
}
// ExecArgs is the set of arguments to exec.
type ExecArgs struct {
	// Filename is the filename to load.
	//
	// If this is provided as "", then the file will be guessed via Argv[0],
	// which is looked up using the PATH found in Envv.
	Filename string `json:"filename"`
	// Argv is a list of arguments.
	Argv []string `json:"argv"`
	// Envv is a list of environment variables.
	Envv []string `json:"envv"`
	// MountNamespace is the mount namespace to execute the new process in.
	// A reference on MountNamespace must be held for the lifetime of the
	// ExecArgs. If MountNamespace is nil, it will default to the init
	// process's MountNamespace.
	MountNamespace *fs.MountNamespace
	// MountNamespaceVFS2 is the VFS2 mount namespace to execute the new
	// process in. A reference on MountNamespaceVFS2 must be held for the
	// lifetime of the ExecArgs. If MountNamespaceVFS2 is nil, it will default
	// to the init process's MountNamespaceVFS2.
	MountNamespaceVFS2 *vfs.MountNamespace
	// WorkingDirectory defines the working directory for the new process.
	WorkingDirectory string `json:"wd"`
	// KUID is the UID to run with in the root user namespace. Defaults to
	// root if not set explicitly.
	KUID auth.KUID
	// KGID is the GID to run with in the root user namespace. Defaults to
	// the root group if not set explicitly.
	KGID auth.KGID
	// ExtraKGIDs is the list of additional groups to which the user
	// belongs.
	ExtraKGIDs []auth.KGID
	// Capabilities is the list of capabilities to give to the process.
	Capabilities *auth.TaskCapabilities
	// StdioIsPty indicates that FDs 0, 1, and 2 are connected to a host
	// pty FD.
	StdioIsPty bool
	// FilePayload determines the files to give to the new process. The
	// position of each file in Files is the FD number it is installed at in
	// the new process. All of the files are closed by the time the exec call
	// returns.
	urpc.FilePayload
	// ContainerID is the container for the process being executed.
	ContainerID string
	// PIDNamespace is the pid namespace for the process being executed.
	PIDNamespace *kernel.PIDNamespace
}
// String returns the command line as a single space-separated string.
//
// The result is Argv joined with spaces, with Argv[0] replaced by Filename
// when Filename is set. If Argv is empty, Filename alone is returned (which
// may be the empty string), so String is safe to call on a zero-value or
// partially populated ExecArgs.
func (args ExecArgs) String() string {
	if len(args.Argv) == 0 {
		// There is no Argv[0] to substitute; indexing it below would panic.
		return args.Filename
	}
	// Work on a copy so that the caller's Argv is left untouched.
	a := make([]string, len(args.Argv))
	copy(a, args.Argv)
	if args.Filename != "" {
		a[0] = args.Filename
	}
	return strings.Join(a, " ")
}
// Exec starts the task described by args and blocks until it has finished.
//
// On success the exit status of the new thread group is stored in
// *waitStatus. If the task cannot be started the error is returned and
// *waitStatus is left unmodified.
func (proc *Proc) Exec(args *ExecArgs, waitStatus *uint32) error {
	tg, _, _, err := proc.execAsync(args)
	if err != nil {
		return err
	}

	// Wait for completion, then report how the thread group ended.
	tg.WaitExited()
	status := tg.ExitStatus()
	*waitStatus = status.Status()
	return nil
}
// ExecAsync starts a new task without waiting for it to finish and returns
// whatever proc.execAsync returns: the new thread group, its thread ID, the
// TTY file operations (if any) and an error.
//
// It is a package-level function rather than a method on Proc so that
// execAsync is not exposed as an RPC.
func ExecAsync(proc *Proc, args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) {
	tg, tid, ttyOps, err := proc.execAsync(args)
	return tg, tid, ttyOps, err
}
// execAsync runs a new task, but doesn't wait for it to finish. It returns the
// newly created thread group and its PID. If the stdio FDs are TTYs, then a
// TTYFileOperations that wraps the TTY is also returned.
//
// All files in args.FilePayload are closed before execAsync returns, whether
// or not it succeeds. If args.Filename is empty, the executable is looked up
// from args.Argv[0] using the PATH found in args.Envv; in that case args.Argv
// must not be empty, otherwise an error is returned.
func (proc *Proc) execAsync(args *ExecArgs) (*kernel.ThreadGroup, kernel.ThreadID, *host.TTYFileOperations, error) {
	// Import file descriptors.
	fdTable := proc.Kernel.NewFDTable()
	defer fdTable.DecRef()

	// No matter what happens, we should close all files in the FilePayload
	// before returning. Any files that are imported will be duped.
	defer func() {
		for _, f := range args.FilePayload.Files {
			f.Close()
		}
	}()

	// Without a filename the executable is derived from Argv[0] below, so
	// reject an empty Argv up front (before any mount namespace reference is
	// taken) instead of panicking on the index expression.
	if args.Filename == "" && len(args.Argv) == 0 {
		return nil, 0, nil, fmt.Errorf("no filename or command provided")
	}

	creds := auth.NewUserCredentials(
		args.KUID,
		args.KGID,
		args.ExtraKGIDs,
		args.Capabilities,
		proc.Kernel.RootUserNamespace())

	initArgs := kernel.CreateProcessArgs{
		Filename:                args.Filename,
		Argv:                    args.Argv,
		Envv:                    args.Envv,
		WorkingDirectory:        args.WorkingDirectory,
		MountNamespace:          args.MountNamespace,
		MountNamespaceVFS2:      args.MountNamespaceVFS2,
		Credentials:             creds,
		FDTable:                 fdTable,
		Umask:                   0022,
		Limits:                  limits.NewLimitSet(),
		MaxSymlinkTraversals:    linux.MaxSymlinkTraversals,
		UTSNamespace:            proc.Kernel.RootUTSNamespace(),
		IPCNamespace:            proc.Kernel.RootIPCNamespace(),
		AbstractSocketNamespace: proc.Kernel.RootAbstractSocketNamespace(),
		ContainerID:             args.ContainerID,
		PIDNamespace:            args.PIDNamespace,
	}
	if initArgs.MountNamespace != nil {
		// initArgs must hold a reference on MountNamespace, which will
		// be donated to the new process in CreateProcess.
		initArgs.MountNamespace.IncRef()
	}
	if initArgs.MountNamespaceVFS2 != nil {
		// initArgs must hold a reference on MountNamespaceVFS2, which will
		// be donated to the new process in CreateProcess.
		initArgs.MountNamespaceVFS2.IncRef()
	}
	ctx := initArgs.NewContext(proc.Kernel)

	if initArgs.Filename == "" {
		if kernel.VFS2Enabled {
			// Get the full path to the filename from the PATH env variable.
			if initArgs.MountNamespaceVFS2 == nil {
				// Set initArgs so that 'ctx' returns the namespace.
				//
				// MountNamespaceVFS2 adds a reference to the namespace, which is
				// transferred to the new process.
				initArgs.MountNamespaceVFS2 = proc.Kernel.GlobalInit().Leader().MountNamespaceVFS2()
			}

			paths := fs.GetPath(initArgs.Envv)
			vfsObj := proc.Kernel.VFS()
			file, err := ResolveExecutablePath(ctx, vfsObj, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
			if err != nil {
				return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
			}
			initArgs.File = fsbridge.NewVFSFile(file)
		} else {
			// Get the full path to the filename from the PATH env variable.
			paths := fs.GetPath(initArgs.Envv)
			if initArgs.MountNamespace == nil {
				// Set initArgs so that 'ctx' returns the namespace.
				initArgs.MountNamespace = proc.Kernel.GlobalInit().Leader().MountNamespace()

				// initArgs must hold a reference on MountNamespace, which will
				// be donated to the new process in CreateProcess. The reference
				// is taken on the VFS1 namespace that was just assigned;
				// MountNamespaceVFS2 is not the object being donated on this
				// path and may be nil.
				initArgs.MountNamespace.IncRef()
			}
			f, err := initArgs.MountNamespace.ResolveExecutablePath(ctx, initArgs.WorkingDirectory, initArgs.Argv[0], paths)
			if err != nil {
				return nil, 0, nil, fmt.Errorf("error finding executable %q in PATH %v: %v", initArgs.Argv[0], paths, err)
			}
			initArgs.Filename = f
		}
	}

	mounter := fs.FileOwnerFromContext(ctx)

	// TODO(gvisor.dev/issue/1623): Use host FD when supported in VFS2.
	var ttyFile *fs.File
	for appFD, hostFile := range args.FilePayload.Files {
		var appFile *fs.File

		if args.StdioIsPty && appFD < 3 {
			// Import the file as a host TTY file.
			if ttyFile == nil {
				var err error
				appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), mounter, true /* isTTY */)
				if err != nil {
					return nil, 0, nil, err
				}
				defer appFile.DecRef()

				// Remember this in the TTY file, as we will
				// use it for the other stdio FDs.
				ttyFile = appFile
			} else {
				// Re-use the existing TTY file, as all three
				// stdio FDs must point to the same fs.File in
				// order to share TTY state, specifically the
				// foreground process group id.
				appFile = ttyFile
			}
		} else {
			// Import the file as a regular host file.
			var err error
			appFile, err = host.ImportFile(ctx, int(hostFile.Fd()), mounter, false /* isTTY */)
			if err != nil {
				return nil, 0, nil, err
			}
			defer appFile.DecRef()
		}

		// Add the file to the FD map.
		if err := fdTable.NewFDAt(ctx, int32(appFD), appFile, kernel.FDFlags{}); err != nil {
			return nil, 0, nil, err
		}
	}

	tg, tid, err := proc.Kernel.CreateProcess(initArgs)
	if err != nil {
		return nil, 0, nil, err
	}

	var ttyFileOps *host.TTYFileOperations
	if ttyFile != nil {
		// Set the foreground process group on the TTY before starting
		// the process.
		ttyFileOps = ttyFile.FileOperations.(*host.TTYFileOperations)
		ttyFileOps.InitForegroundProcessGroup(tg.ProcessGroup())
	}

	// Start the newly created process.
	proc.Kernel.StartProcess(tg)

	return tg, tid, ttyFileOps, nil
}
// PsArgs is the set of arguments to ps.
type PsArgs struct {
	// JSON will force calls to Ps to return the result as a JSON payload.
	// When it is false, Ps returns a text table instead.
	JSON bool
}
// Ps produces a listing of every process in the running kernel.
//
// The listing is stored in *out, rendered as JSON when args.JSON is set and
// as a text table otherwise. *out is only written on success.
func (proc *Proc) Ps(args *PsArgs, out *string) error {
	var procs []*Process
	if err := Processes(proc.Kernel, "", &procs); err != nil {
		return err
	}

	if args.JSON {
		encoded, err := ProcessListToJSON(procs)
		if err != nil {
			return err
		}
		*out = encoded
		return nil
	}

	*out = ProcessListToTable(procs)
	return nil
}
// Process contains information about a single process in a Sandbox.
type Process struct {
	// UID is the effective UID of the process's thread group leader.
	UID auth.KUID `json:"uid"`
	// PID is the ID of the thread group in its own PID namespace.
	PID kernel.ThreadID `json:"pid"`
	// Parent PID, resolved in the same PID namespace as PID. It is 0 if the
	// thread group leader has no parent.
	PPID kernel.ThreadID `json:"ppid"`
	// Threads holds the IDs of the thread group's members, in the same PID
	// namespace as PID.
	Threads []kernel.ThreadID `json:"threads"`
	// Processor utilization: total CPU time (user + system) as a whole
	// percentage of the time elapsed since the process started, capped at 99.
	C int32 `json:"c"`
	// TTY name of the process. Will be of the form "pts/N" if there is a
	// TTY, or "?" if there is not.
	TTY string `json:"tty"`
	// Start time, formatted as HH:MM if the process started today, as MonDD
	// if it started earlier this year, and as the year otherwise.
	STime string `json:"stime"`
	// CPU time. As filled in by Processes, this is the system CPU time of
	// the thread group.
	Time string `json:"time"`
	// Executable shortname (e.g. "sh" for /bin/sh)
	Cmd string `json:"cmd"`
}
// ProcessListToTable renders pl as a text table with a header row followed by
// one row per process. Columns are padded with spaces so that they line up:
//
//	UID PID PPID C TTY   STIME TIME     CMD
//	0   1   0    0 pts/4 14:04 505262ns tail
func ProcessListToTable(pl []*Process) string {
	const (
		header = "UID\tPID\tPPID\tC\tTTY\tSTIME\tTIME\tCMD"
		// Each row starts with the newline that terminates the previous line.
		row = "\n%d\t%d\t%d\t%d\t%s\t%s\t%s\t%s"
	)

	var out bytes.Buffer
	w := tabwriter.NewWriter(&out, 10, 1, 3, ' ', 0)
	fmt.Fprint(w, header)
	for _, p := range pl {
		fmt.Fprintf(w, row, p.UID, p.PID, p.PPID, p.C, p.TTY, p.STime, p.Time, p.Cmd)
	}
	// The tabwriter buffers its cells; nothing reaches out until the flush.
	w.Flush()
	return out.String()
}
// ProcessListToJSON encodes pl as an indented JSON array, one object per
// process. If encoding fails, the empty string and a descriptive error are
// returned.
func ProcessListToJSON(pl []*Process) (string, error) {
	encoded, marshalErr := json.MarshalIndent(pl, "", " ")
	if marshalErr != nil {
		return "", fmt.Errorf("couldn't marshal process list %v: %v", pl, marshalErr)
	}
	return string(encoded), nil
}
// PrintPIDsJSON returns a JSON array holding only the PID of each process in
// pl, in the order the processes appear in pl. This matches runc's output.
func PrintPIDsJSON(pl []*Process) (string, error) {
	// The number of PIDs is known up front, so fill the slice by index.
	pids := make([]kernel.ThreadID, len(pl))
	for i, p := range pl {
		pids[i] = p.PID
	}

	encoded, marshalErr := json.Marshal(pids)
	if marshalErr != nil {
		return "", fmt.Errorf("couldn't marshal PIDs %v: %v", pids, marshalErr)
	}
	return string(encoded), nil
}
// Processes collects information about the processes running in the sandbox
// and appends one entry per process to *out. Only processes whose leader
// belongs to containerID are included; an empty containerID selects all
// processes.
//
// Entries already present in *out are kept, and the whole slice is sorted by
// PID before returning. The returned error is always nil.
func Processes(k *kernel.Kernel, containerID string, out *[]*Process) error {
	ts := k.TaskSet()
	now := k.RealtimeClock().Now()

	for _, tg := range ts.Root.ThreadGroups() {
		ns := tg.PIDNamespace()
		pid := ns.IDOfThreadGroup(tg)
		switch {
		case pid == 0:
			// tg has already been reaped; ignore it.
			continue
		case containerID != "" && containerID != tg.Leader().ContainerID():
			// Belongs to a different container than the one requested.
			continue
		}

		// A leader without a parent is reported with a parent PID of 0.
		var ppid kernel.ThreadID
		if parent := tg.Leader().Parent(); parent != nil {
			ppid = ns.IDOfThreadGroup(parent.ThreadGroup())
		}
		members := tg.MemberIDs(ns)

		entry := &Process{
			UID:     tg.Leader().Credentials().EffectiveKUID,
			PID:     pid,
			PPID:    ppid,
			Threads: members,
			STime:   formatStartTime(now, tg.Leader().StartTime()),
			C:       percentCPU(tg.CPUStats(), tg.Leader().StartTime(), now),
			Time:    tg.CPUStats().SysTime.String(),
			Cmd:     tg.Leader().Name(),
			TTY:     ttyName(tg.TTY()),
		}
		*out = append(*out, entry)
	}

	// Sort in place; list shares its backing array with *out.
	list := *out
	sort.Slice(list, func(a, b int) bool {
		return list[a].PID < list[b].PID
	})
	return nil
}
// formatStartTime renders startTime relative to now:
//   - a different year than now: the four-digit year (e.g. 2019)
//   - the same year but a different day: MonDD (e.g. Jan02)
//   - the same day: HH:MM
func formatStartTime(now, startTime ktime.Time) string {
	nowSec, nowNsec := now.Unix()
	startSec, startNsec := startTime.Unix()
	current := time.Unix(nowSec, nowNsec)
	started := time.Unix(startSec, startNsec)

	// The year is checked first: the same day-of-year in a different year
	// must still be reported as a year.
	switch {
	case started.Year() != current.Year():
		return started.Format("2006")
	case started.YearDay() != current.YearDay():
		return started.Format("Jan02")
	default:
		return started.Format("15:04")
	}
}
// percentCPU returns the CPU time in stats (user plus system) as a whole
// percentage of the time elapsed between startTime and now. The result is
// capped at 99, and is 0 when the elapsed time is not positive.
func percentCPU(stats usage.CPUStats, startTime, now ktime.Time) int32 {
	elapsed := now.Sub(startTime)
	if elapsed <= 0 {
		return 0
	}

	// Note: In procps, there is an option to include child CPU stats. As
	// it is disabled by default, we do not include them.
	busy := stats.UserTime + stats.SysTime

	// procps caps the value at 99%, so do the same.
	if pct := busy * 100 / elapsed; pct <= 99 {
		return int32(pct)
	}
	return 99
}
// ttyName returns the display name for tty: "pts/N", where N is the TTY's
// index, or "?" when tty is nil.
func ttyName(tty *kernel.TTY) string {
	if tty != nil {
		return fmt.Sprintf("pts/%d", tty.Index)
	}
	return "?"
}
// ResolveExecutablePath resolves the given executable name given a set of
// paths that might contain it, and returns the opened file.
//
// The lookup depends on the form of name:
//   - An absolute name is opened directly.
//   - A name containing a '/' is joined to wd (or to "/" if wd is empty) and
//     opened; wd must be absolute.
//   - Any other name is searched for in each absolute entry of paths, in
//     order. Relative entries are skipped with a warning.
//
// On success the returned file is never nil. If the executable does not exist
// or is not accessible, ENOENT is returned; any other error from opening a
// candidate is returned as is.
func ResolveExecutablePath(ctx context.Context, vfsObj *vfs.VirtualFilesystem, wd, name string, paths []string) (*vfs.FileDescription, error) {
	root := vfs.RootFromContext(ctx)
	defer root.DecRef()
	creds := auth.CredentialsFromContext(ctx)

	// direct is set when name identifies a single location, in which case
	// paths is not consulted.
	var direct string
	switch {
	case path.IsAbs(name):
		// Absolute paths can be used directly.
		direct = name
	case strings.Contains(name, "/"):
		// Paths with '/' in them should be joined to the working directory, or
		// to the root if working directory is not set.
		if len(wd) == 0 {
			wd = "/"
		}
		if !path.IsAbs(wd) {
			return nil, fmt.Errorf("working directory %q must be absolute", wd)
		}
		direct = path.Join(wd, name)
	}
	if direct != "" {
		f, err := openExecutable(ctx, vfsObj, creds, root, direct)
		if err != nil {
			return nil, err
		}
		if f == nil {
			// openExecutable reports "not found" and "no access" as a nil
			// file with a nil error. Callers use the file whenever the error
			// is nil, so turn that into an explicit error, the same one the
			// PATH search below ends with.
			return nil, syserror.ENOENT
		}
		return f, nil
	}

	// Otherwise, we must lookup the name in the paths, starting from the
	// calling context's root directory.
	for _, p := range paths {
		if !path.IsAbs(p) {
			// Relative paths aren't safe, no one should be using them.
			log.Warningf("Skipping relative path %q in $PATH", p)
			continue
		}

		binPath := path.Join(p, name)
		f, err := openExecutable(ctx, vfsObj, creds, root, binPath)
		if err != nil {
			return nil, err
		}
		if f == nil {
			continue // Not found/no access.
		}
		return f, nil
	}
	return nil, syserror.ENOENT
}
// openExecutable opens path read-only, with the FileExec open option set,
// resolving it against root and following a final symlink.
//
// A file that does not exist (ENOENT) or may not be accessed (EACCES) is not
// treated as an error: in those cases both the file and the error are nil.
// Any other result of the open is returned unchanged.
func openExecutable(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, root vfs.VirtualDentry, path string) (*vfs.FileDescription, error) {
	f, err := vfsObj.OpenAt(
		ctx,
		creds,
		&vfs.PathOperation{
			Root:               root,
			Start:              root, // Callers pass an absolute path, so Start can be anything.
			Path:               fspath.Parse(path),
			FollowFinalSymlink: true,
		},
		&vfs.OpenOptions{
			Flags:    linux.O_RDONLY,
			FileExec: true,
		},
	)
	switch err {
	case syserror.ENOENT, syserror.EACCES:
		// Let the caller move on to its next candidate.
		return nil, nil
	}
	return f, err
}