162 lines
4.9 KiB
Go
162 lines
4.9 KiB
Go
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
|
|
// +build linux
|
|
|
|
package ptrace
|
|
|
|
import (
|
|
"fmt"
|
|
"syscall"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/platform/procid"
|
|
)
|
|
|
|
// createStub creates a fresh stub processes.
|
|
//
|
|
// Precondition: the runtime OS thread must be locked.
|
|
func createStub() (*thread, error) {
|
|
// Declare all variables up front in order to ensure that there's no
|
|
// need for allocations between beforeFork & afterFork.
|
|
var (
|
|
pid uintptr
|
|
ppid uintptr
|
|
errno syscall.Errno
|
|
)
|
|
|
|
// Remember the current ppid for the pdeathsig race.
|
|
ppid, _, _ = syscall.RawSyscall(syscall.SYS_GETPID, 0, 0, 0)
|
|
|
|
// Among other things, beforeFork masks all signals.
|
|
beforeFork()
|
|
|
|
// When creating the new child process, we specify SIGKILL as the
|
|
// signal to deliver when the child exits. We never expect a subprocess
|
|
// to exit; they are pooled and reused. This is done to ensure that if
|
|
// a subprocess is OOM-killed, this process (and all other stubs,
|
|
// transitively) will be killed as well. It's simply not possible to
|
|
// safely handle a single stub getting killed: the exact state of
|
|
// execution is unknown and not recoverable.
|
|
pid, _, errno = syscall.RawSyscall6(syscall.SYS_CLONE, uintptr(syscall.SIGKILL)|syscall.CLONE_FILES, 0, 0, 0, 0, 0)
|
|
if errno != 0 {
|
|
afterFork()
|
|
return nil, errno
|
|
}
|
|
|
|
// Is this the parent?
|
|
if pid != 0 {
|
|
// Among other things, restore signal mask.
|
|
afterFork()
|
|
|
|
// Initialize the first thread.
|
|
t := &thread{
|
|
tgid: int32(pid),
|
|
tid: int32(pid),
|
|
cpu: ^uint32(0),
|
|
}
|
|
if sig := t.wait(); sig != syscall.SIGSTOP {
|
|
return nil, fmt.Errorf("wait failed: expected SIGSTOP, got %v", sig)
|
|
}
|
|
t.attach()
|
|
|
|
return t, nil
|
|
}
|
|
|
|
// afterForkInChild resets all signals to their default dispositions
|
|
// and restores the signal mask to its pre-fork state.
|
|
afterForkInChild()
|
|
|
|
// Explicitly unmask all signals to ensure that the tracer can see
|
|
// them.
|
|
errno = unmaskAllSignals()
|
|
if errno != 0 {
|
|
syscall.RawSyscall(syscall.SYS_EXIT, uintptr(errno), 0, 0)
|
|
}
|
|
|
|
// Enable cpuid-faulting; this may fail on older kernels or hardware,
|
|
// so we just disregard the result. Host CPUID will be enabled.
|
|
syscall.RawSyscall(syscall.SYS_ARCH_PRCTL, linux.ARCH_SET_CPUID, 0, 0)
|
|
|
|
// Call the stub; should not return.
|
|
stubCall(stubStart, ppid)
|
|
panic("unreachable")
|
|
}
|
|
|
|
// createStub creates a stub processes as a child of an existing subprocesses.
|
|
//
|
|
// Precondition: the runtime OS thread must be locked.
|
|
func (s *subprocess) createStub() (*thread, error) {
|
|
// There's no need to lock the runtime thread here, as this can only be
|
|
// called from a context that is already locked.
|
|
currentTID := int32(procid.Current())
|
|
t := s.syscallThreads.lookupOrCreate(currentTID, s.newThread)
|
|
|
|
// Pass the expected PPID to the child via R15.
|
|
regs := s.initRegs
|
|
regs.R15 = uint64(t.tgid)
|
|
|
|
// Call fork in a subprocess.
|
|
//
|
|
// The new child must set up PDEATHSIG to ensure it dies if this
|
|
// process dies. Since this process could die at any time, this cannot
|
|
// be done via instrumentation from here.
|
|
//
|
|
// Instead, we create the child untraced, which will do the PDEATHSIG
|
|
// setup and then SIGSTOP itself for our attach below.
|
|
//
|
|
// See above re: SIGKILL.
|
|
pid, err := t.syscallIgnoreInterrupt(
|
|
®s,
|
|
syscall.SYS_CLONE,
|
|
arch.SyscallArgument{Value: uintptr(syscall.SIGKILL | syscall.CLONE_FILES)},
|
|
arch.SyscallArgument{Value: 0},
|
|
arch.SyscallArgument{Value: 0},
|
|
arch.SyscallArgument{Value: 0},
|
|
arch.SyscallArgument{Value: 0},
|
|
arch.SyscallArgument{Value: 0})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// Wait for child to enter group-stop, so we don't stop its
|
|
// bootstrapping work with t.attach below.
|
|
//
|
|
// We unfortunately don't have a handy part of memory to write the wait
|
|
// status. If the wait succeeds, we'll assume that it was the SIGSTOP.
|
|
// If the child actually exited, the attach below will fail.
|
|
_, err = t.syscallIgnoreInterrupt(
|
|
&s.initRegs,
|
|
syscall.SYS_WAIT4,
|
|
arch.SyscallArgument{Value: uintptr(pid)},
|
|
arch.SyscallArgument{Value: 0},
|
|
arch.SyscallArgument{Value: syscall.WALL | syscall.WUNTRACED},
|
|
arch.SyscallArgument{Value: 0},
|
|
arch.SyscallArgument{Value: 0},
|
|
arch.SyscallArgument{Value: 0})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
childT := &thread{
|
|
tgid: int32(pid),
|
|
tid: int32(pid),
|
|
cpu: ^uint32(0),
|
|
}
|
|
childT.attach()
|
|
|
|
return childT, nil
|
|
}
|