// Copyright 2018 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "sync" "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/abi" "gvisor.googlesource.com/gvisor/pkg/bits" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" ) // maxSyscallNum is the highest supported syscall number. // // The types below create fast lookup slices for all syscalls. This maximum // serves as a sanity check that we don't allocate huge slices for a very large // syscall. const maxSyscallNum = 2000 // SyscallFn is a syscall implementation. type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error) // MissingFn is a syscall to be called when an implementation is missing. type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) // Possible flags for SyscallFlagsTable.enable. const ( // syscallPresent indicates that this is not a missing syscall. // // This flag is used internally in SyscallFlagsTable. syscallPresent = 1 << iota // StraceEnableLog enables syscall log tracing. StraceEnableLog // StraceEnableEvent enables syscall event tracing. StraceEnableEvent // ExternalBeforeEnable enables the external hook before syscall execution. ExternalBeforeEnable // ExternalAfterEnable enables the external hook after syscall execution. ExternalAfterEnable ) // StraceEnableBits combines both strace log and event flags. const StraceEnableBits = StraceEnableLog | StraceEnableEvent // SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall // basis. type SyscallFlagsTable struct { // mu protects writes to the fields below. // // Atomic loads are always allowed. Atomic stores are allowed only // while mu is held. mu sync.Mutex // enable contains the enable bits for each syscall. // // missing syscalls have the same value in enable as missingEnable to // avoid an extra branch in Word. enable []uint32 // missingEnable contains the enable bits for missing syscalls. missingEnable uint32 } // Init initializes the struct, with all syscalls in table set to enable. // // max is the largest syscall number in table. func (e *SyscallFlagsTable) init(table map[uintptr]SyscallFn, max uintptr) { e.enable = make([]uint32, max+1) for num := range table { e.enable[num] = syscallPresent } } // Word returns the enable bitfield for sysno. func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { if sysno < uintptr(len(e.enable)) { return atomic.LoadUint32(&e.enable[sysno]) } return atomic.LoadUint32(&e.missingEnable) } // Enable sets enable bit bit for all syscalls based on s. // // Syscalls missing from s are disabled. // // Syscalls missing from the initial table passed to Init cannot be added as // individual syscalls. If present in s they will be ignored. // // Callers to Word may see either the old or new value while this function // is executing. func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { e.mu.Lock() defer e.mu.Unlock() missingVal := atomic.LoadUint32(&e.missingEnable) if missingEnable { missingVal |= bit } else { missingVal &^= bit } atomic.StoreUint32(&e.missingEnable, missingVal) for num := range e.enable { val := atomic.LoadUint32(&e.enable[num]) if !bits.IsOn32(val, syscallPresent) { // Missing. atomic.StoreUint32(&e.enable[num], missingVal) continue } if s[uintptr(num)] { val |= bit } else { val &^= bit } atomic.StoreUint32(&e.enable[num], val) } } // EnableAll sets enable bit bit for all syscalls, present and missing. func (e *SyscallFlagsTable) EnableAll(bit uint32) { e.mu.Lock() defer e.mu.Unlock() missingVal := atomic.LoadUint32(&e.missingEnable) missingVal |= bit atomic.StoreUint32(&e.missingEnable, missingVal) for num := range e.enable { val := atomic.LoadUint32(&e.enable[num]) if !bits.IsOn32(val, syscallPresent) { // Missing. atomic.StoreUint32(&e.enable[num], missingVal) continue } val |= bit atomic.StoreUint32(&e.enable[num], val) } } // Stracer traces syscall execution. type Stracer interface { // SyscallEnter is called on syscall entry. // // The returned private data is passed to SyscallExit. // // TODO: remove kernel imports from the strace package so // that the type can be used directly. SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} // SyscallExit is called on syscall exit. SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) } // SyscallTable is a lookup table of system calls. Critically, a SyscallTable // is *immutable*. In order to make supporting suspend and resume sane, they // must be uniquely registered and may not change during operation. type SyscallTable struct { // OS is the operating system that this syscall table implements. OS abi.OS `state:"wait"` // Arch is the architecture that this syscall table targets. Arch arch.Arch `state:"wait"` // The OS version that this syscall table implements. Version Version `state:"manual"` // AuditNumber is a numeric constant that represents the syscall table. If // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by // linux/audit.h. AuditNumber uint32 `state:"manual"` // Table is the collection of functions. Table map[uintptr]SyscallFn `state:"manual"` // lookup is a fixed-size array that holds the syscalls (indexed by // their numbers). It is used for fast look ups. lookup []SyscallFn `state:"manual"` // Emulate is a collection of instruction addresses to emulate. The // keys are addresses, and the values are system call numbers. Emulate map[usermem.Addr]uintptr `state:"manual"` // The function to call in case of a missing system call. Missing MissingFn `state:"manual"` // Stracer traces this syscall table. Stracer Stracer `state:"manual"` // External is used to handle an external callback. External func(*Kernel) `state:"manual"` // ExternalFilterBefore is called before External is called before the syscall is executed. // External is not called if it returns false. ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"` // ExternalFilterAfter is called before External is called after the syscall is executed. // External is not called if it returns false. ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool `state:"manual"` // FeatureEnable stores the strace and one-shot enable bits. FeatureEnable SyscallFlagsTable `state:"manual"` } // allSyscallTables contains all known tables. var allSyscallTables []*SyscallTable // SyscallTables returns a read-only slice of registered SyscallTables. func SyscallTables() []*SyscallTable { return allSyscallTables } // LookupSyscallTable returns the SyscallCall table for the OS/Arch combination. func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { for _, s := range allSyscallTables { if s.OS == os && s.Arch == a { return s, true } } return nil, false } // RegisterSyscallTable registers a new syscall table for use by a Kernel. func RegisterSyscallTable(s *SyscallTable) { if s.Table == nil { // Ensure non-nil lookup table. s.Table = make(map[uintptr]SyscallFn) } if s.Emulate == nil { // Ensure non-nil emulate table. s.Emulate = make(map[usermem.Addr]uintptr) } var max uintptr for num := range s.Table { if num > max { max = num } } if max > maxSyscallNum { panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) } s.lookup = make([]SyscallFn, max+1) // Initialize the fast-lookup table. for num, fn := range s.Table { s.lookup[num] = fn } s.FeatureEnable.init(s.Table, max) if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) } // Save a reference to this table. // // This is required for a Kernel to find the table and for save/restore // operations below. allSyscallTables = append(allSyscallTables, s) } // Lookup returns the syscall implementation, if one exists. func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { if sysno < uintptr(len(s.lookup)) { return s.lookup[sysno] } return nil } // LookupEmulate looks up an emulation syscall number. func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] return sysno, ok } // mapLookup is similar to Lookup, except that it only uses the syscall table, // that is, it skips the fast look array. This is available for benchmarking. func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { return s.Table[sysno] }