// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package kernel
import (
"fmt"
"os"
"syscall"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/bits"
2018-11-15 23:13:52 +00:00
"gvisor.googlesource.com/gvisor/pkg/metric"
2018-04-27 17:37:02 +00:00
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
// SyscallRestartErrno represents an ERESTART* errno defined in the Linux
// kernel's include/linux/errno.h. These errnos are never returned to
// userspace directly, but are used to communicate the expected behavior of an
// interrupted syscall from the syscall to signal handling.
//
// SyscallRestartErrno satisfies the error interface through its Error method.
type SyscallRestartErrno int
// The numeric values below are significant because ptrace syscall exit
// tracing can observe them.
//
// Every errno in this group causes the syscall to be restarted when the
// syscall is not interrupted by a signal delivered to a user handler.
const (
	// ERESTARTSYS is returned by an interrupted syscall. If the interrupting
	// signal is delivered to a user handler that does not have SA_RESTART
	// set, the errno is converted to EINTR; otherwise the syscall is
	// restarted.
	ERESTARTSYS SyscallRestartErrno = 512

	// ERESTARTNOINTR is returned by an interrupted syscall that should always
	// be restarted.
	ERESTARTNOINTR SyscallRestartErrno = 513

	// ERESTARTNOHAND is returned by an interrupted syscall. If the
	// interrupting signal is delivered to a user handler, the errno is
	// converted to EINTR; otherwise the syscall is restarted.
	ERESTARTNOHAND SyscallRestartErrno = 514

	// ERESTART_RESTARTBLOCK is returned by an interrupted syscall that should
	// be restarted using a custom function. The interrupted syscall must
	// register that function by calling Task.SetRestartSyscallFn.
	ERESTART_RESTARTBLOCK SyscallRestartErrno = 516
)
2018-11-15 23:13:52 +00:00
var vsyscallCount = metric . MustCreateNewUint64Metric ( "/kernel/vsyscall_count" , false /* sync */ , "Number of times vsyscalls were invoked by the application" )
2018-04-27 17:37:02 +00:00
// Error implements error.Error. It returns a short description of the restart
// errno; the wording is borrowed from strace. Values that are not one of the
// declared ERESTART* constants yield a generic "unknown" description.
func (e SyscallRestartErrno) Error() string {
	var desc string
	switch e {
	case ERESTARTSYS:
		desc = "to be restarted if SA_RESTART is set"
	case ERESTARTNOINTR:
		desc = "to be restarted"
	case ERESTARTNOHAND:
		desc = "to be restarted if no handler"
	case ERESTART_RESTARTBLOCK:
		desc = "interrupted by signal"
	default:
		desc = "(unknown interrupt error)"
	}
	return desc
}
// SyscallRestartErrnoFromReturn returns the SyscallRestartErrno represented by
// rv, the value in a syscall return register.
//
// The return register holds the negated errno, so rv is negated before being
// compared against the known restart errnos. The boolean result is false, and
// the errno result is 0, when rv does not encode a restart errno.
func SyscallRestartErrnoFromReturn(rv uintptr) (SyscallRestartErrno, bool) {
	switch e := SyscallRestartErrno(-int(rv)); e {
	case ERESTARTSYS, ERESTARTNOINTR, ERESTARTNOHAND, ERESTART_RESTARTBLOCK:
		return e, true
	}
	return 0, false
}
// SyscallRestartBlock represents the restart block for a syscall restartable
// with a custom function. It encapsulates the state required to restart a
// syscall across a S/R.
type SyscallRestartBlock interface {
// Restart is invoked with the task t whose syscall is being restarted and
// returns a value and an error.
// NOTE(review): presumably these are the restarted syscall's return value
// and error, as for a regular syscall implementation; confirm against
// the caller, which is not in this file.
Restart ( t * Task ) ( uintptr , error )
}
// SyscallControl is returned by syscalls to control the behavior of
// Task.doSyscallInvoke.
//
// A nil *SyscallControl means "no special handling": Task.doSyscallInvoke
// then stores either the return value or the negated errno and proceeds to
// syscall exit.
type SyscallControl struct {
// next is the state that the task goroutine should switch to. If next is
// nil, the task goroutine should continue to syscall exit as usual.
next taskRunState
// If ignoreReturn is true, Task.doSyscallInvoke should not store any value
// in the task's syscall return value register.
ignoreReturn bool
}
// Predefined SyscallControl values shared by syscall implementations and the
// syscall dispatch code in this file.
var (
// CtrlDoExit is returned by the implementations of the exit and exit_group
// syscalls to enter the task exit path directly, skipping syscall exit
// tracing. The syscall return register is left unmodified.
CtrlDoExit = & SyscallControl { next : ( * runExit ) ( nil ) , ignoreReturn : true }
// ctrlStopAndReinvokeSyscall is returned by syscalls using the external
// feature before syscall execution. This causes Task.doSyscallInvoke
// to return runSyscallReinvoke, allowing Task.run to check for stops
// before immediately re-invoking the syscall (skipping the re-checking
// of seccomp filters and ptrace which would confuse userspace
// tracing). The syscall return register is left unmodified.
ctrlStopAndReinvokeSyscall = & SyscallControl { next : ( * runSyscallReinvoke ) ( nil ) , ignoreReturn : true }
// ctrlStopBeforeSyscallExit is returned by syscalls that initiate a stop at
// their end. This causes Task.doSyscallInvoke to return runSyscallExit, rather
// than tail-calling it, allowing stops to be checked before syscall exit.
// Unlike the controls above, the syscall's return value is still stored.
ctrlStopBeforeSyscallExit = & SyscallControl { next : ( * runSyscallExit ) ( nil ) }
)
// invokeExternal calls t.BeginExternalStop and then, on a new goroutine,
// invokes the External hook of t's syscall table with t's kernel. The deferred
// t.EndExternalStop runs when that hook returns. invokeExternal itself returns
// without waiting for the goroutine.
func ( t * Task ) invokeExternal ( ) {
t . BeginExternalStop ( )
go func ( ) { // S/R-SAFE: External control flow.
defer t . EndExternalStop ( )
t . SyscallTable ( ) . External ( t . Kernel ( ) )
} ( )
}
// executeSyscall runs syscall sysno with arguments args on behalf of t and
// returns the syscall's return value, an optional SyscallControl, and an
// error.
//
// Depending on the feature bits enabled for sysno in t's syscall table, it
// also calls the strace enter/exit hooks and the external before/after hooks.
// When the external "before" hook fires, the syscall itself is not run; the
// returned control is ctrlStopAndReinvokeSyscall so that the syscall is
// re-invoked after stops have been checked.
func (t *Task) executeSyscall(sysno uintptr, args arch.SyscallArguments) (rval uintptr, ctrl *SyscallControl, err error) {
	table := t.SyscallTable()
	features := table.FeatureEnable.Word(sysno)

	// The feature word is read once per invocation, so the same decision
	// governs both the strace enter and exit hooks.
	tracing := bits.IsAnyOn32(features, StraceEnableBits)
	var straceContext interface{}
	if tracing {
		straceContext = table.Stracer.SyscallEnter(t, sysno, args, features)
	}

	switch {
	case bits.IsOn32(features, ExternalBeforeEnable) && (table.ExternalFilterBefore == nil || table.ExternalFilterBefore(t, sysno, args)):
		t.invokeExternal()
		// Ensure we check for stops, then invoke the syscall again.
		ctrl = ctrlStopAndReinvokeSyscall
	default:
		if impl := table.Lookup(sysno); impl != nil {
			// Call our syscall implementation.
			rval, ctrl, err = impl(t, args)
		} else {
			// No implementation is registered; fall back to the table's
			// Missing handler.
			rval, err = t.SyscallTable().Missing(t, sysno, args)
		}
	}

	if bits.IsOn32(features, ExternalAfterEnable) && (table.ExternalFilterAfter == nil || table.ExternalFilterAfter(t, sysno, args)) {
		// Unlike the "before" hook, this does not cause the syscall to be
		// reinvoked.
		t.invokeExternal()
	}

	if tracing {
		table.Stracer.SyscallExit(straceContext, t, sysno, rval, err)
	}

	return rval, ctrl, err
}
// doSyscall is the entry point for an invocation of a system call specified by
// the current state of t's registers.
//
// The syscall path is very hot; avoid defer.
func ( t * Task ) doSyscall ( ) taskRunState {
sysno := t . Arch ( ) . SyscallNo ( )
args := t . Arch ( ) . SyscallArgs ( )
// Tracers expect to see this between when the task traps into the kernel
// to perform a syscall and when the syscall is actually invoked.
// This useless-looking temporary is needed because Go.
tmp := uintptr ( syscall . ENOSYS )
t . Arch ( ) . SetReturn ( - tmp )
// Check seccomp filters. The nil check is for performance (as seccomp use
// is rare), not needed for correctness.
2018-08-02 15:09:03 +00:00
if t . syscallFilters . Load ( ) != nil {
2018-04-27 17:37:02 +00:00
switch r := t . checkSeccompSyscall ( int32 ( sysno ) , args , usermem . Addr ( t . Arch ( ) . IP ( ) ) ) ; r {
2018-12-18 18:27:16 +00:00
case linux . SECCOMP_RET_ERRNO , linux . SECCOMP_RET_TRAP :
2018-04-27 17:37:02 +00:00
t . Debugf ( "Syscall %d: denied by seccomp" , sysno )
return ( * runSyscallExit ) ( nil )
2018-12-18 18:27:16 +00:00
case linux . SECCOMP_RET_ALLOW :
2018-04-27 17:37:02 +00:00
// ok
2018-12-18 18:27:16 +00:00
case linux . SECCOMP_RET_KILL_THREAD :
2018-04-27 17:37:02 +00:00
t . Debugf ( "Syscall %d: killed by seccomp" , sysno )
t . PrepareExit ( ExitStatus { Signo : int ( linux . SIGSYS ) } )
return ( * runExit ) ( nil )
2018-12-18 18:27:16 +00:00
case linux . SECCOMP_RET_TRACE :
2018-04-27 17:37:02 +00:00
t . Debugf ( "Syscall %d: stopping for PTRACE_EVENT_SECCOMP" , sysno )
return ( * runSyscallAfterPtraceEventSeccomp ) ( nil )
default :
panic ( fmt . Sprintf ( "Unknown seccomp result %d" , r ) )
}
}
return t . doSyscallEnter ( sysno , args )
}
// runSyscallAfterPtraceEventSeccomp is the run state that doSyscall returns
// when the seccomp result is SECCOMP_RET_TRACE. Its execute method resumes
// the syscall using the task's current registers.
type runSyscallAfterPtraceEventSeccomp struct{}

// execute continues the syscall: it returns runInterrupt if t has been
// killed, goes directly to syscall exit if the syscall number is now -1, and
// otherwise proceeds to syscall entry with the syscall number and arguments
// re-read from t's registers.
func (*runSyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
	if t.killed() {
		// "[S]yscall-exit-stop is not generated prior to death by SIGKILL." -
		// ptrace(2)
		return (*runInterrupt)(nil)
	}

	switch sysno := t.Arch().SyscallNo(); sysno {
	case ^uintptr(0):
		// "The tracer can skip the system call by changing the syscall
		// number to -1." - Documentation/prctl/seccomp_filter.txt
		return (*runSyscallExit)(nil).execute(t)
	default:
		return t.doSyscallEnter(sysno, t.Arch().SyscallArgs())
	}
}
// doSyscallEnter calls t.ptraceSyscallEnter. If that reports a next run state
// (its second result is true), that state is returned; otherwise the syscall
// is invoked immediately via doSyscallInvoke.
func (t *Task) doSyscallEnter(sysno uintptr, args arch.SyscallArguments) taskRunState {
	next, stopped := t.ptraceSyscallEnter()
	if !stopped {
		return t.doSyscallInvoke(sysno, args)
	}
	return next
}
2018-08-02 17:41:44 +00:00
// +stateify savable
2018-04-27 17:37:02 +00:00
type runSyscallAfterSyscallEnterStop struct { }
func ( * runSyscallAfterSyscallEnterStop ) execute ( t * Task ) taskRunState {
if sig := linux . Signal ( t . ptraceCode ) ; sig . IsValid ( ) {
t . tg . signalHandlers . mu . Lock ( )
t . sendSignalLocked ( sigPriv ( sig ) , false /* group */ )
t . tg . signalHandlers . mu . Unlock ( )
}
if t . killed ( ) {
return ( * runInterrupt ) ( nil )
}
sysno := t . Arch ( ) . SyscallNo ( )
if sysno == ^ uintptr ( 0 ) {
return ( * runSyscallExit ) ( nil )
}
args := t . Arch ( ) . SyscallArgs ( )
return t . doSyscallInvoke ( sysno , args )
}
2018-08-02 17:41:44 +00:00
// +stateify savable
2018-04-27 17:37:02 +00:00
type runSyscallAfterSysemuStop struct { }
func ( * runSyscallAfterSysemuStop ) execute ( t * Task ) taskRunState {
if sig := linux . Signal ( t . ptraceCode ) ; sig . IsValid ( ) {
t . tg . signalHandlers . mu . Lock ( )
t . sendSignalLocked ( sigPriv ( sig ) , false /* group */ )
t . tg . signalHandlers . mu . Unlock ( )
}
if t . killed ( ) {
return ( * runInterrupt ) ( nil )
}
return ( * runSyscallExit ) ( nil ) . execute ( t )
}
// doSyscallInvoke executes syscall sysno with args, stores the outcome in t's
// syscall return register, and returns the task's next run state.
//
// If the syscall returned a SyscallControl, the control decides whether the
// return value is stored and which state runs next. Otherwise an error is
// stored as a negated errno, and success stores the return value. Unless a
// control supplies a next state, syscall exit is executed immediately.
func (t *Task) doSyscallInvoke(sysno uintptr, args arch.SyscallArguments) taskRunState {
	rval, ctrl, err := t.executeSyscall(sysno, args)

	switch {
	case ctrl != nil:
		// A control takes precedence over err, which is not consulted.
		if !ctrl.ignoreReturn {
			t.Arch().SetReturn(rval)
		}
		if ctrl.next != nil {
			return ctrl.next
		}
	case err != nil:
		t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
		t.haveSyscallReturn = true
	default:
		t.Arch().SetReturn(rval)
	}

	return (*runSyscallExit)(nil).execute(t)
}
2018-08-02 17:41:44 +00:00
// +stateify savable
2018-04-27 17:37:02 +00:00
type runSyscallReinvoke struct { }
func ( * runSyscallReinvoke ) execute ( t * Task ) taskRunState {
if t . killed ( ) {
// It's possible that since the last execution, the task has
// been forcible killed. Invoking the system call here could
// result in an infinite loop if it is again preempted by an
// external stop and reinvoked.
return ( * runInterrupt ) ( nil )
}
sysno := t . Arch ( ) . SyscallNo ( )
args := t . Arch ( ) . SyscallArgs ( )
return t . doSyscallInvoke ( sysno , args )
}
2018-08-02 17:41:44 +00:00
// +stateify savable
2018-04-27 17:37:02 +00:00
type runSyscallExit struct { }
func ( * runSyscallExit ) execute ( t * Task ) taskRunState {
t . ptraceSyscallExit ( )
return ( * runApp ) ( nil )
}
// doVsyscall is the entry point for a vsyscall invocation of syscall sysno, as
// indicated by an execution fault at address addr. doVsyscall returns the
// task's next run state.
func ( t * Task ) doVsyscall ( addr usermem . Addr , sysno uintptr ) taskRunState {
2018-11-15 23:13:52 +00:00
vsyscallCount . Increment ( )
2018-04-27 17:37:02 +00:00
// Grab the caller up front, to make sure there's a sensible stack.
caller := t . Arch ( ) . Native ( uintptr ( 0 ) )
if _ , err := t . CopyIn ( usermem . Addr ( t . Arch ( ) . Stack ( ) ) , caller ) ; err != nil {
t . Debugf ( "vsyscall %d: error reading return address from stack: %v" , sysno , err )
t . forceSignal ( linux . SIGSEGV , false /* unconditional */ )
t . SendSignal ( sigPriv ( linux . SIGSEGV ) )
return ( * runApp ) ( nil )
}
// For _vsyscalls_, there is no need to translate System V calling convention
// to syscall ABI because they both use RDI, RSI, and RDX for the first three
// arguments and none of the vsyscalls uses more than two arguments.
args := t . Arch ( ) . SyscallArgs ( )
2018-08-02 15:09:03 +00:00
if t . syscallFilters . Load ( ) != nil {
2018-04-27 17:37:02 +00:00
switch r := t . checkSeccompSyscall ( int32 ( sysno ) , args , addr ) ; r {
2018-12-18 18:27:16 +00:00
case linux . SECCOMP_RET_ERRNO , linux . SECCOMP_RET_TRAP :
2018-04-27 17:37:02 +00:00
t . Debugf ( "vsyscall %d, caller %x: denied by seccomp" , sysno , t . Arch ( ) . Value ( caller ) )
return ( * runApp ) ( nil )
2018-12-18 18:27:16 +00:00
case linux . SECCOMP_RET_ALLOW :
2018-04-27 17:37:02 +00:00
// ok
2018-12-18 18:27:16 +00:00
case linux . SECCOMP_RET_TRACE :
2018-04-27 17:37:02 +00:00
t . Debugf ( "vsyscall %d, caller %x: stopping for PTRACE_EVENT_SECCOMP" , sysno , t . Arch ( ) . Value ( caller ) )
return & runVsyscallAfterPtraceEventSeccomp { addr , sysno , caller }
2018-12-18 18:27:16 +00:00
case linux . SECCOMP_RET_KILL_THREAD :
t . Debugf ( "vsyscall %d: killed by seccomp" , sysno )
t . PrepareExit ( ExitStatus { Signo : int ( linux . SIGSYS ) } )
return ( * runExit ) ( nil )
2018-04-27 17:37:02 +00:00
default :
panic ( fmt . Sprintf ( "Unknown seccomp result %d" , r ) )
}
}
return t . doVsyscallInvoke ( sysno , args , caller )
}
// runVsyscallAfterPtraceEventSeccomp is the run state that doVsyscall returns
// when the seccomp result is SECCOMP_RET_TRACE. It records the vsyscall as it
// was when first seen, so that tracer modifications can be detected.
type runVsyscallAfterPtraceEventSeccomp struct {
	// addr is the address passed to doVsyscall; the task's instruction
	// pointer is compared against it on resume.
	addr usermem.Addr
	// sysno is the syscall number passed to doVsyscall.
	sysno uintptr
	// caller holds the return address that doVsyscall read from the stack.
	caller interface{}
}

// execute resumes the vsyscall. It returns runInterrupt if t has been killed.
// It exits the task with SIGSYS if the syscall number was changed to anything
// other than -1, or if the instruction pointer no longer equals r.addr. It
// returns to the application without running the syscall if the syscall
// number is -1, and otherwise invokes the vsyscall.
func (r *runVsyscallAfterPtraceEventSeccomp) execute(t *Task) taskRunState {
	if t.killed() {
		return (*runInterrupt)(nil)
	}

	sysno := t.Arch().SyscallNo()
	skip := sysno == ^uintptr(0)

	// "... the syscall may not be changed to another system call using the
	// orig_rax register. It may only be changed to -1 order [sic] to skip the
	// currently emulated call. ... The tracer MUST NOT modify rip or rsp." -
	// Documentation/prctl/seccomp_filter.txt. On Linux, changing orig_ax or ip
	// causes do_exit(SIGSYS), and changing sp is ignored.
	sysnoChanged := !skip && sysno != r.sysno
	if sysnoChanged || usermem.Addr(t.Arch().IP()) != r.addr {
		t.PrepareExit(ExitStatus{Signo: int(linux.SIGSYS)})
		return (*runExit)(nil)
	}

	if skip {
		return (*runApp)(nil)
	}
	return t.doVsyscallInvoke(sysno, t.Arch().SyscallArgs(), r.caller)
}
// doVsyscallInvoke executes syscall sysno with args as a vsyscall and then
// emulates the return to caller: the return register is set, the instruction
// pointer is set to the caller's value, and the stack pointer is advanced by
// one architecture word.
//
// If the syscall fails with EFAULT, SIGSEGV is sent to the task and no return
// is emulated. The next run state is always runApp.
func (t *Task) doVsyscallInvoke(sysno uintptr, args arch.SyscallArguments, caller interface{}) taskRunState {
	rval, ctrl, err := t.executeSyscall(sysno, args)

	switch {
	case ctrl != nil:
		t.Debugf("vsyscall %d, caller %x: syscall control: %v", sysno, t.Arch().Value(caller), ctrl)
		// A syscall control yields a zero return value here.
		t.Arch().SetReturn(0)
	case err == nil:
		t.Debugf("vsyscall %d, caller %x: successfully emulated syscall", sysno, t.Arch().Value(caller))
		t.Arch().SetReturn(rval)
	default:
		t.Debugf("vsyscall %d, caller %x: emulated syscall returned error: %v", sysno, t.Arch().Value(caller), err)
		if err == syserror.EFAULT {
			t.forceSignal(linux.SIGSEGV, false /* unconditional */)
			t.SendSignal(sigPriv(linux.SIGSEGV))
			// A return is not emulated in this case.
			return (*runApp)(nil)
		}
		t.Arch().SetReturn(uintptr(-t.ExtractErrno(err, int(sysno))))
	}

	// Emulate the return: jump to the caller and step the stack pointer past
	// the return address.
	t.Arch().SetIP(t.Arch().Value(caller))
	t.Arch().SetStack(t.Arch().Stack() + uintptr(t.Arch().Width()))
	return (*runApp)(nil)
}
// ExtractErrno extracts an integer error number from the error.
// The syscall number is purely for context in the error case. Use -1 if
// syscall number is unknown.
//
// A nil error yields 0. The os.PathError, os.LinkError and os.SyscallError
// wrappers are unwrapped recursively. ExtractErrno panics if err cannot be
// translated to an errno.
func (t *Task) ExtractErrno(err error, sysno int) int {
	switch e := err.(type) {
	case nil:
		return 0
	case syscall.Errno:
		return int(e)
	case SyscallRestartErrno:
		return int(e)
	case *memmap.BusError:
		// Bus errors may generate SIGBUS, but for syscalls they still
		// return EFAULT. See case in task_run.go where the fault is
		// handled (and the SIGBUS is delivered).
		return int(syscall.EFAULT)
	case *os.PathError:
		return t.ExtractErrno(e.Err, sysno)
	case *os.LinkError:
		return t.ExtractErrno(e.Err, sysno)
	case *os.SyscallError:
		return t.ExtractErrno(e.Err, sysno)
	default:
		if errno, ok := syserror.TranslateError(err); ok {
			return int(errno)
		}
		panic(fmt.Sprintf("Unknown syscall %d error: %v", sysno, err))
	}
}