gvisor/pkg/sentry/kernel/fd_table.go

501 lines
12 KiB
Go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package kernel
import (
"fmt"
"math"
"strings"
"sync/atomic"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
)
// FDFlags define flags for an individual descriptor.
//
// +stateify savable
type FDFlags struct {
// CloseOnExec indicates the descriptor should be closed on exec.
CloseOnExec bool
}
// ToLinuxFileFlags converts a kernel.FDFlags object to a Linux file flags
// representation.
func (f FDFlags) ToLinuxFileFlags() (mask uint) {
if f.CloseOnExec {
mask |= linux.O_CLOEXEC
}
return
}
// ToLinuxFDFlags converts a kernel.FDFlags object to a Linux descriptor flags
// representation.
func (f FDFlags) ToLinuxFDFlags() (mask uint) {
if f.CloseOnExec {
mask |= linux.FD_CLOEXEC
}
return
}
// descriptor holds the details about a file descriptor, namely a pointer to
// the file itself and the descriptor flags.
//
// Note that this is immutable and can only be changed via operations on the
// descriptorTable.
//
// It contains both VFS1 and VFS2 file types, but only one of them can be set.
//
// +stateify savable
type descriptor struct {
// TODO(gvisor.dev/issue/1624): Remove fs.File.
file *fs.File
fileVFS2 *vfs.FileDescription
flags FDFlags
}
// FDTable is used to manage File references and flags.
//
// +stateify savable
type FDTable struct {
refs.AtomicRefCount
k *Kernel
// uid is a unique identifier.
uid uint64
// mu protects below.
mu sync.Mutex `state:"nosave"`
// next is start position to find fd.
next int32
// used contains the number of non-nil entries. It must be accessed
// atomically. It may be read atomically without holding mu (but not
// written).
used int32
// descriptorTable holds descriptors.
descriptorTable `state:".(map[int32]descriptor)"`
}
func (f *FDTable) saveDescriptorTable() map[int32]descriptor {
m := make(map[int32]descriptor)
f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
m[fd] = descriptor{
file: file,
fileVFS2: fileVFS2,
flags: flags,
}
})
return m
}
func (f *FDTable) loadDescriptorTable(m map[int32]descriptor) {
f.init() // Initialize table.
for fd, d := range m {
f.setAll(fd, d.file, d.fileVFS2, d.flags)
// Note that we do _not_ need to acquire a extra table reference here. The
// table reference will already be accounted for in the file, so we drop the
// reference taken by set above.
switch {
case d.file != nil:
d.file.DecRef()
case d.fileVFS2 != nil:
d.fileVFS2.DecRef()
}
}
}
// drop drops the table reference.
func (f *FDTable) drop(file *fs.File) {
// Release locks.
file.Dirent.Inode.LockCtx.Posix.UnlockRegion(lock.UniqueID(f.uid), lock.LockRange{0, lock.LockEOF})
// Send inotify events.
d := file.Dirent
var ev uint32
if fs.IsDir(d.Inode.StableAttr) {
ev |= linux.IN_ISDIR
}
if file.Flags().Write {
ev |= linux.IN_CLOSE_WRITE
} else {
ev |= linux.IN_CLOSE_NOWRITE
}
d.InotifyEvent(ev, 0)
// Drop the table reference.
file.DecRef()
}
// dropVFS2 drops the table reference.
func (f *FDTable) dropVFS2(file *vfs.FileDescription) {
// TODO(gvisor.dev/issue/1480): Release locks.
// TODO(gvisor.dev/issue/1479): Send inotify events.
// Drop the table reference.
file.DecRef()
}
// ID returns a unique identifier for this FDTable.
func (f *FDTable) ID() uint64 {
return f.uid
}
// NewFDTable allocates a new FDTable that may be used by tasks in k.
func (k *Kernel) NewFDTable() *FDTable {
f := &FDTable{
k: k,
uid: atomic.AddUint64(&k.fdMapUids, 1),
}
f.init()
return f
}
// destroy removes all of the file descriptors from the map.
func (f *FDTable) destroy() {
f.RemoveIf(func(*fs.File, *vfs.FileDescription, FDFlags) bool {
return true
})
}
// DecRef implements RefCounter.DecRef with destructor f.destroy.
func (f *FDTable) DecRef() {
f.DecRefWithDestructor(f.destroy)
}
// Size returns the number of file descriptor slots currently allocated.
func (f *FDTable) Size() int {
size := atomic.LoadInt32(&f.used)
return int(size)
}
// forEach iterates over all non-nil files.
//
// It is the caller's responsibility to acquire an appropriate lock.
func (f *FDTable) forEach(fn func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags)) {
fd := int32(0)
for {
file, fileVFS2, flags, ok := f.getAll(fd)
if !ok {
break
}
switch {
case file != nil:
if !file.TryIncRef() {
continue // Race caught.
}
fn(fd, file, nil, flags)
file.DecRef()
case fileVFS2 != nil:
if !fileVFS2.TryIncRef() {
continue // Race caught.
}
fn(fd, nil, fileVFS2, flags)
fileVFS2.DecRef()
}
fd++
}
}
// String is a stringer for FDTable.
func (f *FDTable) String() string {
var buf strings.Builder
f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
switch {
case file != nil:
n, _ := file.Dirent.FullName(nil /* root */)
fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, n)
case fileVFS2 != nil:
vfsObj := fileVFS2.Mount().Filesystem().VirtualFilesystem()
name, err := vfsObj.PathnameWithDeleted(context.Background(), vfs.VirtualDentry{}, fileVFS2.VirtualDentry())
if err != nil {
fmt.Fprintf(&buf, "<err: %v>\n", err)
return
}
fmt.Fprintf(&buf, "\tfd:%d => name %s\n", fd, name)
}
})
return buf.String()
}
// NewFDs allocates new FDs guaranteed to be the lowest number available
// greater than or equal to the fd parameter. All files will share the set
// flags. Success is guaranteed to be all or none.
func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags FDFlags) (fds []int32, err error) {
if fd < 0 {
// Don't accept negative FDs.
return nil, syscall.EINVAL
}
// Default limit.
end := int32(math.MaxInt32)
// Ensure we don't get past the provided limit.
if limitSet := limits.FromContext(ctx); limitSet != nil {
lim := limitSet.Get(limits.NumberOfFiles)
if lim.Cur != limits.Infinity {
end = int32(lim.Cur)
}
if fd >= end {
return nil, syscall.EMFILE
}
}
f.mu.Lock()
defer f.mu.Unlock()
// From f.next to find available fd.
if fd < f.next {
fd = f.next
}
// Install all entries.
for i := fd; i < end && len(fds) < len(files); i++ {
if d, _, _ := f.get(i); d == nil {
f.set(i, files[len(fds)], flags) // Set the descriptor.
fds = append(fds, i) // Record the file descriptor.
}
}
// Failure? Unwind existing FDs.
if len(fds) < len(files) {
for _, i := range fds {
f.set(i, nil, FDFlags{}) // Zap entry.
}
return nil, syscall.EMFILE
}
if fd == f.next {
// Update next search start position.
f.next = fds[len(fds)-1] + 1
}
return fds, nil
}
// NewFDAt sets the file reference for the given FD. If there is an active
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAt(ctx context.Context, fd int32, file *fs.File, flags FDFlags) error {
return f.newFDAt(ctx, fd, file, nil, flags)
}
// NewFDAtVFS2 sets the file reference for the given FD. If there is an active
// reference for that FD, the ref count for that existing reference is
// decremented.
func (f *FDTable) NewFDAtVFS2(ctx context.Context, fd int32, file *vfs.FileDescription, flags FDFlags) error {
return f.newFDAt(ctx, fd, nil, file, flags)
}
func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) error {
if fd < 0 {
// Don't accept negative FDs.
return syscall.EBADF
}
f.mu.Lock()
defer f.mu.Unlock()
// Check the limit for the provided file.
if limitSet := limits.FromContext(ctx); limitSet != nil {
if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
return syscall.EMFILE
}
}
// Install the entry.
f.setAll(fd, file, fileVFS2, flags)
return nil
}
// SetFlags sets the flags for the given file descriptor.
//
// True is returned iff flags were changed.
func (f *FDTable) SetFlags(fd int32, flags FDFlags) error {
if fd < 0 {
// Don't accept negative FDs.
return syscall.EBADF
}
f.mu.Lock()
defer f.mu.Unlock()
file, _, _ := f.get(fd)
if file == nil {
// No file found.
return syscall.EBADF
}
// Update the flags.
f.set(fd, file, flags)
return nil
}
// Get returns a reference to the file and the flags for the FD or nil if no
// file is defined for the given fd.
//
// N.B. Callers are required to use DecRef when they are done.
//
//go:nosplit
func (f *FDTable) Get(fd int32) (*fs.File, FDFlags) {
if fd < 0 {
return nil, FDFlags{}
}
for {
file, flags, _ := f.get(fd)
if file != nil {
if !file.TryIncRef() {
continue // Race caught.
}
// Reference acquired.
return file, flags
}
// No file available.
return nil, FDFlags{}
}
}
// GetVFS2 returns a reference to the file and the flags for the FD or nil if no
// file is defined for the given fd.
//
// N.B. Callers are required to use DecRef when they are done.
//
//go:nosplit
func (f *FDTable) GetVFS2(fd int32) (*vfs.FileDescription, FDFlags) {
if fd < 0 {
return nil, FDFlags{}
}
for {
file, flags, _ := f.getVFS2(fd)
if file != nil {
if !file.TryIncRef() {
continue // Race caught.
}
// Reference acquired.
return file, flags
}
// No file available.
return nil, FDFlags{}
}
}
// GetFDs returns a list of valid fds.
func (f *FDTable) GetFDs() []int32 {
fds := make([]int32, 0, int(atomic.LoadInt32(&f.used)))
f.forEach(func(fd int32, _ *fs.File, _ *vfs.FileDescription, _ FDFlags) {
fds = append(fds, fd)
})
return fds
}
// GetRefs returns a stable slice of references to all files and bumps the
// reference count on each. The caller must use DecRef on each reference when
// they're done using the slice.
func (f *FDTable) GetRefs() []*fs.File {
files := make([]*fs.File, 0, f.Size())
f.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) {
file.IncRef() // Acquire a reference for caller.
files = append(files, file)
})
return files
}
// GetRefsVFS2 returns a stable slice of references to all files and bumps the
// reference count on each. The caller must use DecRef on each reference when
// they're done using the slice.
func (f *FDTable) GetRefsVFS2() []*vfs.FileDescription {
files := make([]*vfs.FileDescription, 0, f.Size())
f.forEach(func(_ int32, _ *fs.File, file *vfs.FileDescription, _ FDFlags) {
file.IncRef() // Acquire a reference for caller.
files = append(files, file)
})
return files
}
// Fork returns an independent FDTable.
func (f *FDTable) Fork() *FDTable {
clone := f.k.NewFDTable()
f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
// The set function here will acquire an appropriate table
// reference for the clone. We don't need anything else.
switch {
case file != nil:
clone.set(fd, file, flags)
case fileVFS2 != nil:
clone.setVFS2(fd, fileVFS2, flags)
}
})
return clone
}
// Remove removes an FD from and returns a non-file iff successful.
//
// N.B. Callers are required to use DecRef when they are done.
func (f *FDTable) Remove(fd int32) (*fs.File, *vfs.FileDescription) {
if fd < 0 {
return nil, nil
}
f.mu.Lock()
defer f.mu.Unlock()
// Update current available position.
if fd < f.next {
f.next = fd
}
orig, orig2, _, _ := f.getAll(fd)
// Add reference for caller.
switch {
case orig != nil:
orig.IncRef()
case orig2 != nil:
orig2.IncRef()
}
f.setAll(fd, nil, nil, FDFlags{}) // Zap entry.
return orig, orig2
}
// RemoveIf removes all FDs where cond is true.
func (f *FDTable) RemoveIf(cond func(*fs.File, *vfs.FileDescription, FDFlags) bool) {
f.mu.Lock()
defer f.mu.Unlock()
f.forEach(func(fd int32, file *fs.File, fileVFS2 *vfs.FileDescription, flags FDFlags) {
if cond(file, fileVFS2, flags) {
f.set(fd, nil, FDFlags{}) // Clear from table.
// Update current available position.
if fd < f.next {
f.next = fd
}
}
})
}