2019-04-29 21:25:05 +00:00
|
|
|
// Copyright 2018 The gVisor Authors.
|
2018-04-27 17:37:02 +00:00
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
// +build linux
|
|
|
|
|
|
|
|
package ptrace
|
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"syscall"
|
|
|
|
|
2019-11-20 09:24:41 +00:00
|
|
|
"golang.org/x/sys/unix"
|
2019-06-13 23:49:09 +00:00
|
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
|
|
|
"gvisor.dev/gvisor/pkg/log"
|
|
|
|
"gvisor.dev/gvisor/pkg/procid"
|
|
|
|
"gvisor.dev/gvisor/pkg/seccomp"
|
|
|
|
"gvisor.dev/gvisor/pkg/sentry/arch"
|
2018-04-27 17:37:02 +00:00
|
|
|
)
|
|
|
|
|
2018-10-24 22:51:46 +00:00
|
|
|
// syscallEvent is the extra bit that probeSeccomp ORs with SIGTRAP to
// recognize the stop reported for a system call after PTRACE_SYSEMU.
//
// NOTE(review): 0x80 presumably matches the bit the kernel sets when
// PTRACE_O_TRACESYSGOOD is enabled -- confirm against thread.attach.
const syscallEvent = syscall.Signal(0x80)
|
2018-10-11 05:39:32 +00:00
|
|
|
|
|
|
|
// probeSeccomp returns true iff seccomp is run after ptrace notifications,
|
|
|
|
// which is generally the case for kernel version >= 4.8. This check is dynamic
|
|
|
|
// because kernels have be backported behavior.
|
|
|
|
//
|
|
|
|
// See createStub for more information.
|
|
|
|
//
|
|
|
|
// Precondition: the runtime OS thread must be locked.
|
|
|
|
func probeSeccomp() bool {
|
|
|
|
// Create a completely new, destroyable process.
|
2018-12-18 18:27:16 +00:00
|
|
|
t, err := attachedThread(0, linux.SECCOMP_RET_ERRNO)
|
2018-10-11 05:39:32 +00:00
|
|
|
if err != nil {
|
|
|
|
panic(fmt.Sprintf("seccomp probe failed: %v", err))
|
|
|
|
}
|
|
|
|
defer t.destroy()
|
|
|
|
|
|
|
|
// Set registers to the yield system call. This call is not allowed
|
|
|
|
// by the filters specified in the attachThread function.
|
|
|
|
regs := createSyscallRegs(&t.initRegs, syscall.SYS_SCHED_YIELD)
|
|
|
|
if err := t.setRegs(®s); err != nil {
|
|
|
|
panic(fmt.Sprintf("ptrace set regs failed: %v", err))
|
|
|
|
}
|
|
|
|
|
|
|
|
for {
|
|
|
|
// Attempt an emulation.
|
2019-12-23 03:01:07 +00:00
|
|
|
if _, _, errno := syscall.RawSyscall6(syscall.SYS_PTRACE, unix.PTRACE_SYSEMU, uintptr(t.tid), 0, 0, 0, 0); errno != 0 {
|
2018-10-11 05:39:32 +00:00
|
|
|
panic(fmt.Sprintf("ptrace syscall-enter failed: %v", errno))
|
|
|
|
}
|
|
|
|
|
|
|
|
sig := t.wait(stopped)
|
|
|
|
if sig == (syscallEvent | syscall.SIGTRAP) {
|
|
|
|
// Did the seccomp errno hook already run? This would
|
|
|
|
// indicate that seccomp is first in line and we're
|
|
|
|
// less than 4.8.
|
|
|
|
if err := t.getRegs(®s); err != nil {
|
|
|
|
panic(fmt.Sprintf("ptrace get-regs failed: %v", err))
|
|
|
|
}
|
|
|
|
if _, err := syscallReturnValue(®s); err == nil {
|
|
|
|
// The seccomp errno mode ran first, and reset
|
|
|
|
// the error in the registers.
|
|
|
|
return false
|
|
|
|
}
|
|
|
|
// The seccomp hook did not run yet, and therefore it
|
|
|
|
// is safe to use RET_KILL mode for dispatched calls.
|
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// createStub creates a fresh stub processes.
|
|
|
|
//
|
|
|
|
// Precondition: the runtime OS thread must be locked.
|
|
|
|
func createStub() (*thread, error) {
|
2018-10-11 05:39:32 +00:00
|
|
|
// The exact interactions of ptrace and seccomp are complex, and
|
|
|
|
// changed in recent kernel versions. Before commit 93e35efb8de45, the
|
|
|
|
// seccomp check is done before the ptrace emulation check. This means
|
|
|
|
// that any calls not matching this list will trigger the seccomp
|
|
|
|
// default action instead of notifying ptrace.
|
|
|
|
//
|
|
|
|
// After commit 93e35efb8de45, the seccomp check is done after the
|
|
|
|
// ptrace emulation check. This simplifies using SYSEMU, since seccomp
|
|
|
|
// will never run for emulation. Seccomp will only run for injected
|
|
|
|
// system calls, and thus we can use RET_KILL as our violation action.
|
2018-12-18 18:27:16 +00:00
|
|
|
var defaultAction linux.BPFAction
|
2018-10-11 05:39:32 +00:00
|
|
|
if probeSeccomp() {
|
|
|
|
log.Infof("Latest seccomp behavior found (kernel >= 4.8 likely)")
|
2018-12-18 18:27:16 +00:00
|
|
|
defaultAction = linux.SECCOMP_RET_KILL_THREAD
|
2018-10-11 05:39:32 +00:00
|
|
|
} else {
|
|
|
|
// We must rely on SYSEMU behavior; tracing with SYSEMU is broken.
|
|
|
|
log.Infof("Legacy seccomp behavior found (kernel < 4.8 likely)")
|
2018-12-18 18:27:16 +00:00
|
|
|
defaultAction = linux.SECCOMP_RET_ALLOW
|
2018-10-11 05:39:32 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// When creating the new child process, we specify SIGKILL as the
|
|
|
|
// signal to deliver when the child exits. We never expect a subprocess
|
|
|
|
// to exit; they are pooled and reused. This is done to ensure that if
|
|
|
|
// a subprocess is OOM-killed, this process (and all other stubs,
|
|
|
|
// transitively) will be killed as well. It's simply not possible to
|
|
|
|
// safely handle a single stub getting killed: the exact state of
|
|
|
|
// execution is unknown and not recoverable.
|
2019-10-22 21:55:54 +00:00
|
|
|
//
|
|
|
|
// In addition, we set the PTRACE_O_TRACEEXIT option to log more
|
|
|
|
// information about a stub process when it receives a fatal signal.
|
2018-10-11 05:39:32 +00:00
|
|
|
return attachedThread(uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, defaultAction)
|
|
|
|
}
|
|
|
|
|
|
|
|
// attachedThread returns a new attached thread.
|
|
|
|
//
|
|
|
|
// Precondition: the runtime OS thread must be locked.
|
2018-12-18 18:27:16 +00:00
|
|
|
func attachedThread(flags uintptr, defaultAction linux.BPFAction) (*thread, error) {
|
2018-10-11 05:39:32 +00:00
|
|
|
// Create a BPF program that allows only the system calls needed by the
|
|
|
|
// stub and all its children. This is used to create child stubs
|
|
|
|
// (below), so we must include the ability to fork, but otherwise lock
|
|
|
|
// down available calls only to what is needed.
|
|
|
|
rules := []seccomp.RuleSet{
|
|
|
|
// Rules for trapping vsyscall access.
|
2019-06-10 22:46:17 +00:00
|
|
|
{
|
2018-10-11 05:39:32 +00:00
|
|
|
Rules: seccomp.SyscallRules{
|
|
|
|
syscall.SYS_GETTIMEOFDAY: {},
|
|
|
|
syscall.SYS_TIME: {},
|
2019-11-20 09:24:41 +00:00
|
|
|
unix.SYS_GETCPU: {}, // SYS_GETCPU was not defined in package syscall on amd64.
|
2018-10-11 05:39:32 +00:00
|
|
|
},
|
2018-12-18 18:27:16 +00:00
|
|
|
Action: linux.SECCOMP_RET_TRAP,
|
2018-10-11 05:39:32 +00:00
|
|
|
Vsyscall: true,
|
|
|
|
},
|
|
|
|
}
|
2018-12-18 18:27:16 +00:00
|
|
|
if defaultAction != linux.SECCOMP_RET_ALLOW {
|
2018-10-11 05:39:32 +00:00
|
|
|
rules = append(rules, seccomp.RuleSet{
|
|
|
|
Rules: seccomp.SyscallRules{
|
|
|
|
syscall.SYS_CLONE: []seccomp.Rule{
|
|
|
|
// Allow creation of new subprocesses (used by the master).
|
|
|
|
{seccomp.AllowValue(syscall.CLONE_FILES | syscall.SIGKILL)},
|
|
|
|
// Allow creation of new threads within a single address space (used by addresss spaces).
|
|
|
|
{seccomp.AllowValue(
|
|
|
|
syscall.CLONE_FILES |
|
|
|
|
syscall.CLONE_FS |
|
|
|
|
syscall.CLONE_SIGHAND |
|
|
|
|
syscall.CLONE_THREAD |
|
|
|
|
syscall.CLONE_PTRACE |
|
|
|
|
syscall.CLONE_VM)},
|
|
|
|
},
|
|
|
|
|
|
|
|
// For the initial process creation.
|
|
|
|
syscall.SYS_WAIT4: {},
|
2019-11-20 09:24:41 +00:00
|
|
|
syscall.SYS_EXIT: {},
|
2018-10-11 05:39:32 +00:00
|
|
|
|
|
|
|
// For the stub prctl dance (all).
|
|
|
|
syscall.SYS_PRCTL: []seccomp.Rule{
|
|
|
|
{seccomp.AllowValue(syscall.PR_SET_PDEATHSIG), seccomp.AllowValue(syscall.SIGKILL)},
|
|
|
|
},
|
|
|
|
syscall.SYS_GETPPID: {},
|
|
|
|
|
|
|
|
// For the stub to stop itself (all).
|
|
|
|
syscall.SYS_GETPID: {},
|
|
|
|
syscall.SYS_KILL: []seccomp.Rule{
|
|
|
|
{seccomp.AllowAny{}, seccomp.AllowValue(syscall.SIGSTOP)},
|
|
|
|
},
|
|
|
|
|
|
|
|
// Injected to support the address space operations.
|
|
|
|
syscall.SYS_MMAP: {},
|
|
|
|
syscall.SYS_MUNMAP: {},
|
|
|
|
},
|
2018-12-18 18:27:16 +00:00
|
|
|
Action: linux.SECCOMP_RET_ALLOW,
|
2018-10-11 05:39:32 +00:00
|
|
|
})
|
2019-11-20 09:24:41 +00:00
|
|
|
|
|
|
|
rules = appendArchSeccompRules(rules)
|
2018-10-11 05:39:32 +00:00
|
|
|
}
|
|
|
|
instrs, err := seccomp.BuildProgram(rules, defaultAction)
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Declare all variables up front in order to ensure that there's no
|
|
|
|
// need for allocations between beforeFork & afterFork.
|
|
|
|
var (
|
|
|
|
pid uintptr
|
|
|
|
ppid uintptr
|
|
|
|
errno syscall.Errno
|
|
|
|
)
|
|
|
|
|
|
|
|
// Remember the current ppid for the pdeathsig race.
|
|
|
|
ppid, _, _ = syscall.RawSyscall(syscall.SYS_GETPID, 0, 0, 0)
|
|
|
|
|
|
|
|
// Among other things, beforeFork masks all signals.
|
|
|
|
beforeFork()
|
2018-06-26 23:53:48 +00:00
|
|
|
|
2018-10-11 05:39:32 +00:00
|
|
|
// Do the clone.
|
|
|
|
pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, flags, 0, 0, 0, 0, 0)
|
2018-04-27 17:37:02 +00:00
|
|
|
if errno != 0 {
|
|
|
|
afterFork()
|
|
|
|
return nil, errno
|
|
|
|
}
|
|
|
|
|
|
|
|
// Is this the parent?
|
|
|
|
if pid != 0 {
|
|
|
|
// Among other things, restore signal mask.
|
|
|
|
afterFork()
|
|
|
|
|
|
|
|
// Initialize the first thread.
|
|
|
|
t := &thread{
|
|
|
|
tgid: int32(pid),
|
|
|
|
tid: int32(pid),
|
|
|
|
cpu: ^uint32(0),
|
|
|
|
}
|
2018-10-11 05:39:32 +00:00
|
|
|
if sig := t.wait(stopped); sig != syscall.SIGSTOP {
|
2018-04-27 17:37:02 +00:00
|
|
|
return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
|
|
|
|
}
|
|
|
|
t.attach()
|
2019-06-27 20:23:49 +00:00
|
|
|
t.grabInitRegs()
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
return t, nil
|
|
|
|
}
|
|
|
|
|
2018-10-24 17:41:34 +00:00
|
|
|
// Move the stub to a new session (and thus a new process group). This
|
|
|
|
// prevents the stub from getting PTY job control signals intended only
|
|
|
|
// for the sentry process. We must call this before restoring signal
|
|
|
|
// mask.
|
|
|
|
if _, _, errno := syscall.RawSyscall(syscall.SYS_SETSID, 0, 0, 0); errno != 0 {
|
|
|
|
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// afterForkInChild resets all signals to their default dispositions
|
|
|
|
// and restores the signal mask to its pre-fork state.
|
|
|
|
afterForkInChild()
|
|
|
|
|
|
|
|
// Explicitly unmask all signals to ensure that the tracer can see
|
|
|
|
// them.
|
2018-10-24 17:41:34 +00:00
|
|
|
if errno := unmaskAllSignals(); errno != 0 {
|
2018-04-27 17:37:02 +00:00
|
|
|
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
|
|
|
|
}
|
|
|
|
|
2018-10-11 05:39:32 +00:00
|
|
|
// Set an aggressive BPF filter for the stub and all it's children. See
|
|
|
|
// the description of the BPF program built above.
|
|
|
|
if errno := seccomp.SetFilter(instrs); errno != 0 {
|
|
|
|
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
|
|
|
|
}
|
|
|
|
|
2019-11-20 09:24:41 +00:00
|
|
|
// Enable cpuid-faulting.
|
|
|
|
enableCpuidFault()
|
2018-07-17 05:02:03 +00:00
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Call the stub; should not return.
|
|
|
|
stubCall(stubStart, ppid)
|
|
|
|
panic("unreachable")
|
|
|
|
}
|
|
|
|
|
|
|
|
// createStub creates a stub processes as a child of an existing subprocesses.
|
|
|
|
//
|
|
|
|
// Precondition: the runtime OS thread must be locked.
|
|
|
|
func (s *subprocess) createStub() (*thread, error) {
|
|
|
|
// There's no need to lock the runtime thread here, as this can only be
|
|
|
|
// called from a context that is already locked.
|
|
|
|
currentTID := int32(procid.Current())
|
|
|
|
t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
|
|
|
|
|
|
|
|
// Pass the expected PPID to the child via R15.
|
2018-10-11 05:39:32 +00:00
|
|
|
regs := t.initRegs
|
2019-08-09 20:16:46 +00:00
|
|
|
initChildProcessPPID(®s, t.tgid)
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
// Call fork in a subprocess.
|
|
|
|
//
|
|
|
|
// The new child must set up PDEATHSIG to ensure it dies if this
|
|
|
|
// process dies. Since this process could die at any time, this cannot
|
|
|
|
// be done via instrumentation from here.
|
|
|
|
//
|
|
|
|
// Instead, we create the child untraced, which will do the PDEATHSIG
|
|
|
|
// setup and then SIGSTOP itself for our attach below.
|
2018-06-26 23:53:48 +00:00
|
|
|
//
|
|
|
|
// See above re: SIGKILL.
|
2018-04-27 17:37:02 +00:00
|
|
|
pid, err := t.syscallIgnoreInterrupt(
|
|
|
|
®s,
|
|
|
|
syscall.SYS_CLONE,
|
2018-06-26 23:53:48 +00:00
|
|
|
arch.SyscallArgument{Value: uintptr(syscall.SIGKILL | syscall.CLONE_FILES)},
|
2018-04-27 17:37:02 +00:00
|
|
|
arch.SyscallArgument{Value: 0},
|
|
|
|
arch.SyscallArgument{Value: 0},
|
|
|
|
arch.SyscallArgument{Value: 0},
|
|
|
|
arch.SyscallArgument{Value: 0},
|
|
|
|
arch.SyscallArgument{Value: 0})
|
|
|
|
if err != nil {
|
2019-06-28 20:22:28 +00:00
|
|
|
return nil, fmt.Errorf("creating stub process: %v", err)
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// Wait for child to enter group-stop, so we don't stop its
|
|
|
|
// bootstrapping work with t.attach below.
|
|
|
|
//
|
|
|
|
// We unfortunately don't have a handy part of memory to write the wait
|
|
|
|
// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
|
|
|
|
// If the child actually exited, the attach below will fail.
|
|
|
|
_, err = t.syscallIgnoreInterrupt(
|
2018-10-11 05:39:32 +00:00
|
|
|
&t.initRegs,
|
2018-04-27 17:37:02 +00:00
|
|
|
syscall.SYS_WAIT4,
|
|
|
|
arch.SyscallArgument{Value: uintptr(pid)},
|
|
|
|
arch.SyscallArgument{Value: 0},
|
2018-06-26 23:53:48 +00:00
|
|
|
arch.SyscallArgument{Value: syscall.WALL | syscall.WUNTRACED},
|
2018-04-27 17:37:02 +00:00
|
|
|
arch.SyscallArgument{Value: 0},
|
|
|
|
arch.SyscallArgument{Value: 0},
|
|
|
|
arch.SyscallArgument{Value: 0})
|
|
|
|
if err != nil {
|
2019-06-28 20:22:28 +00:00
|
|
|
return nil, fmt.Errorf("waiting on stub process: %v", err)
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
childT := &thread{
|
|
|
|
tgid: int32(pid),
|
|
|
|
tid: int32(pid),
|
|
|
|
cpu: ^uint32(0),
|
|
|
|
}
|
|
|
|
childT.attach()
|
|
|
|
|
|
|
|
return childT, nil
|
|
|
|
}
|