// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package kernel import ( "fmt" "sync/atomic" "gvisor.dev/gvisor/pkg/abi" "gvisor.dev/gvisor/pkg/bits" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/usermem" ) // maxSyscallNum is the highest supported syscall number. // // The types below create fast lookup slices for all syscalls. This maximum // serves as a sanity check that we don't allocate huge slices for a very large // syscall. This is checked during registration. const maxSyscallNum = 2000 // SyscallSupportLevel is a syscall support levels. type SyscallSupportLevel int // String returns a human readable represetation of the support level. func (l SyscallSupportLevel) String() string { switch l { case SupportUnimplemented: return "Unimplemented" case SupportPartial: return "Partial Support" case SupportFull: return "Full Support" default: return "Undocumented" } } const ( // SupportUndocumented indicates the syscall is not documented yet. SupportUndocumented = iota // SupportUnimplemented indicates the syscall is unimplemented. SupportUnimplemented // SupportPartial indicates the syscall is partially supported. SupportPartial // SupportFull indicates the syscall is fully supported. SupportFull ) // Syscall includes the syscall implementation and compatibility information. type Syscall struct { // Name is the syscall name. Name string // Fn is the implementation of the syscall. Fn SyscallFn // SupportLevel is the level of support implemented in gVisor. SupportLevel SyscallSupportLevel // Note describes the compatibility of the syscall. Note string // URLs is set of URLs to any relevant bugs or issues. URLs []string } // SyscallFn is a syscall implementation. type SyscallFn func(t *Task, args arch.SyscallArguments) (uintptr, *SyscallControl, error) // MissingFn is a syscall to be called when an implementation is missing. type MissingFn func(t *Task, sysno uintptr, args arch.SyscallArguments) (uintptr, error) // Possible flags for SyscallFlagsTable.enable. const ( // syscallPresent indicates that this is not a missing syscall. // // This flag is used internally in SyscallFlagsTable. syscallPresent = 1 << iota // StraceEnableLog enables syscall log tracing. StraceEnableLog // StraceEnableEvent enables syscall event tracing. StraceEnableEvent // ExternalBeforeEnable enables the external hook before syscall execution. ExternalBeforeEnable // ExternalAfterEnable enables the external hook after syscall execution. ExternalAfterEnable ) // StraceEnableBits combines both strace log and event flags. const StraceEnableBits = StraceEnableLog | StraceEnableEvent // SyscallFlagsTable manages a set of enable/disable bit fields on a per-syscall // basis. type SyscallFlagsTable struct { // mu protects writes to the fields below. // // Atomic loads are always allowed. Atomic stores are allowed only // while mu is held. mu sync.Mutex // enable contains the enable bits for each syscall. // // missing syscalls have the same value in enable as missingEnable to // avoid an extra branch in Word. enable []uint32 // missingEnable contains the enable bits for missing syscalls. missingEnable uint32 } // Init initializes the struct, with all syscalls in table set to enable. // // max is the largest syscall number in table. func (e *SyscallFlagsTable) init(table map[uintptr]Syscall, max uintptr) { e.enable = make([]uint32, max+1) for num := range table { e.enable[num] = syscallPresent } } // Word returns the enable bitfield for sysno. func (e *SyscallFlagsTable) Word(sysno uintptr) uint32 { if sysno < uintptr(len(e.enable)) { return atomic.LoadUint32(&e.enable[sysno]) } return atomic.LoadUint32(&e.missingEnable) } // Enable sets enable bit bit for all syscalls based on s. // // Syscalls missing from s are disabled. // // Syscalls missing from the initial table passed to Init cannot be added as // individual syscalls. If present in s they will be ignored. // // Callers to Word may see either the old or new value while this function // is executing. func (e *SyscallFlagsTable) Enable(bit uint32, s map[uintptr]bool, missingEnable bool) { e.mu.Lock() defer e.mu.Unlock() missingVal := atomic.LoadUint32(&e.missingEnable) if missingEnable { missingVal |= bit } else { missingVal &^= bit } atomic.StoreUint32(&e.missingEnable, missingVal) for num := range e.enable { val := atomic.LoadUint32(&e.enable[num]) if !bits.IsOn32(val, syscallPresent) { // Missing. atomic.StoreUint32(&e.enable[num], missingVal) continue } if s[uintptr(num)] { val |= bit } else { val &^= bit } atomic.StoreUint32(&e.enable[num], val) } } // EnableAll sets enable bit bit for all syscalls, present and missing. func (e *SyscallFlagsTable) EnableAll(bit uint32) { e.mu.Lock() defer e.mu.Unlock() missingVal := atomic.LoadUint32(&e.missingEnable) missingVal |= bit atomic.StoreUint32(&e.missingEnable, missingVal) for num := range e.enable { val := atomic.LoadUint32(&e.enable[num]) if !bits.IsOn32(val, syscallPresent) { // Missing. atomic.StoreUint32(&e.enable[num], missingVal) continue } val |= bit atomic.StoreUint32(&e.enable[num], val) } } // Stracer traces syscall execution. type Stracer interface { // SyscallEnter is called on syscall entry. // // The returned private data is passed to SyscallExit. SyscallEnter(t *Task, sysno uintptr, args arch.SyscallArguments, flags uint32) interface{} // SyscallExit is called on syscall exit. SyscallExit(context interface{}, t *Task, sysno, rval uintptr, err error) } // SyscallTable is a lookup table of system calls. // // Note that a SyscallTable is not savable directly. Instead, they are saved as // an OS/Arch pair and lookup happens again on restore. type SyscallTable struct { // OS is the operating system that this syscall table implements. OS abi.OS // Arch is the architecture that this syscall table targets. Arch arch.Arch // The OS version that this syscall table implements. Version Version // AuditNumber is a numeric constant that represents the syscall table. If // non-zero, auditNumber must be one of the AUDIT_ARCH_* values defined by // linux/audit.h. AuditNumber uint32 // Table is the collection of functions. Table map[uintptr]Syscall // lookup is a fixed-size array that holds the syscalls (indexed by // their numbers). It is used for fast look ups. lookup []SyscallFn // Emulate is a collection of instruction addresses to emulate. The // keys are addresses, and the values are system call numbers. Emulate map[usermem.Addr]uintptr // The function to call in case of a missing system call. Missing MissingFn // Stracer traces this syscall table. Stracer Stracer // External is used to handle an external callback. External func(*Kernel) // ExternalFilterBefore is called before External is called before the syscall is executed. // External is not called if it returns false. ExternalFilterBefore func(*Task, uintptr, arch.SyscallArguments) bool // ExternalFilterAfter is called before External is called after the syscall is executed. // External is not called if it returns false. ExternalFilterAfter func(*Task, uintptr, arch.SyscallArguments) bool // FeatureEnable stores the strace and one-shot enable bits. FeatureEnable SyscallFlagsTable } // MaxSysno returns the largest system call number. func (s *SyscallTable) MaxSysno() (max uintptr) { for num := range s.Table { if num > max { max = num } } return max } // allSyscallTables contains all known tables. var allSyscallTables []*SyscallTable // SyscallTables returns a read-only slice of registered SyscallTables. func SyscallTables() []*SyscallTable { return allSyscallTables } // LookupSyscallTable returns the SyscallCall table for the OS/Arch combination. func LookupSyscallTable(os abi.OS, a arch.Arch) (*SyscallTable, bool) { for _, s := range allSyscallTables { if s.OS == os && s.Arch == a { return s, true } } return nil, false } // RegisterSyscallTable registers a new syscall table for use by a Kernel. func RegisterSyscallTable(s *SyscallTable) { if max := s.MaxSysno(); max > maxSyscallNum { panic(fmt.Sprintf("SyscallTable %+v contains too large syscall number %d", s, max)) } if _, ok := LookupSyscallTable(s.OS, s.Arch); ok { panic(fmt.Sprintf("Duplicate SyscallTable registered for OS %v Arch %v", s.OS, s.Arch)) } allSyscallTables = append(allSyscallTables, s) s.Init() } // Init initializes the system call table. // // This should normally be called only during registration. func (s *SyscallTable) Init() { if s.Table == nil { // Ensure non-nil lookup table. s.Table = make(map[uintptr]Syscall) } if s.Emulate == nil { // Ensure non-nil emulate table. s.Emulate = make(map[usermem.Addr]uintptr) } max := s.MaxSysno() // Checked during RegisterSyscallTable. // Initialize the fast-lookup table. s.lookup = make([]SyscallFn, max+1) for num, sc := range s.Table { s.lookup[num] = sc.Fn } // Initialize all features. s.FeatureEnable.init(s.Table, max) } // Lookup returns the syscall implementation, if one exists. func (s *SyscallTable) Lookup(sysno uintptr) SyscallFn { if sysno < uintptr(len(s.lookup)) { return s.lookup[sysno] } return nil } // LookupName looks up a syscall name. func (s *SyscallTable) LookupName(sysno uintptr) string { if sc, ok := s.Table[sysno]; ok { return sc.Name } return fmt.Sprintf("sys_%d", sysno) // Unlikely. } // LookupEmulate looks up an emulation syscall number. func (s *SyscallTable) LookupEmulate(addr usermem.Addr) (uintptr, bool) { sysno, ok := s.Emulate[addr] return sysno, ok } // mapLookup is similar to Lookup, except that it only uses the syscall table, // that is, it skips the fast look array. This is available for benchmarking. func (s *SyscallTable) mapLookup(sysno uintptr) SyscallFn { if sc, ok := s.Table[sysno]; ok { return sc.Fn } return nil }