Port most syscalls to VFS2.

pipe and pipe2 aren't ported, pending a slight rework of pipe FDs for VFS2.
mount and umount2 aren't ported out of temporary laziness. access and faccessat
need additional FSImpl methods to implement properly, but are stubbed to
prevent googletest from CHECK-failing. Other syscalls require additional
plumbing.

Updates #1623

PiperOrigin-RevId: 297188448
This commit is contained in:
Jamie Liu 2020-02-25 13:25:36 -08:00 committed by gVisor bot
parent 6def8ea6ac
commit 471b15b212
56 changed files with 4064 additions and 256 deletions

View File

@ -15,6 +15,8 @@
package linux
// EpollEvent is equivalent to struct epoll_event from epoll(2).
//
// +marshal
type EpollEvent struct {
Events uint32
// Linux makes struct epoll_event::data a __u64. We represent it as

View File

@ -15,6 +15,8 @@
package linux
// EpollEvent is equivalent to struct epoll_event from epoll(2).
//
// +marshal
type EpollEvent struct {
Events uint32
// Linux makes struct epoll_event::data a __u64, necessitating 4 bytes of padding

View File

@ -241,6 +241,8 @@ const (
)
// Statx represents struct statx.
//
// +marshal
type Statx struct {
Mask uint32
Blksize uint32

View File

@ -38,6 +38,8 @@ const (
)
// Statfs is struct statfs, from uapi/asm-generic/statfs.h.
//
// +marshal
type Statfs struct {
// Type is one of the filesystem magic values, defined above.
Type uint64

View File

@ -115,6 +115,8 @@ const (
)
// SignalSet is a signal mask with a bit corresponding to each signal.
//
// +marshal
type SignalSet uint64
// SignalSetSize is the size in bytes of a SignalSet.

View File

@ -157,6 +157,8 @@ func DurationToTimespec(dur time.Duration) Timespec {
const SizeOfTimeval = 16
// Timeval represents struct timeval in <time.h>.
//
// +marshal
type Timeval struct {
Sec int64
Usec int64
@ -230,6 +232,8 @@ type Tms struct {
type TimerID int32
// StatxTimestamp represents struct statx_timestamp.
//
// +marshal
type StatxTimestamp struct {
Sec int64
Nsec uint32
@ -258,6 +262,8 @@ func NsecToStatxTimestamp(nsec int64) (ts StatxTimestamp) {
}
// Utime represents struct utimbuf used by utimes(2).
//
// +marshal
type Utime struct {
Actime int64
Modtime int64

View File

@ -18,6 +18,7 @@ package linux
const (
XATTR_NAME_MAX = 255
XATTR_SIZE_MAX = 65536
XATTR_LIST_MAX = 65536
XATTR_CREATE = 1
XATTR_REPLACE = 2

View File

@ -8,9 +8,11 @@ go_library(
name = "fspath",
srcs = [
"builder.go",
"builder_unsafe.go",
"fspath.go",
],
deps = [
"//pkg/gohacks",
],
)
go_test(

View File

@ -16,6 +16,8 @@ package fspath
import (
"fmt"
"gvisor.dev/gvisor/pkg/gohacks"
)
// Builder is similar to strings.Builder, but is used to produce pathnames
@ -102,3 +104,9 @@ func (b *Builder) AppendString(str string) {
copy(b.buf[b.start:], b.buf[oldStart:])
copy(b.buf[len(b.buf)-len(str):], str)
}
// String returns the pathname accumulated so far, i.e. the bytes of the
// internal buffer from b.start onward. The returned string shares memory with
// the Builder's buffer rather than copying it, so no other methods should be
// called on b after String.
func (b *Builder) String() string {
	accumulated := b.buf[b.start:]
	return gohacks.StringFromImmutableBytes(accumulated)
}

View File

@ -67,7 +67,8 @@ func Parse(pathname string) Path {
// Path contains the information contained in a pathname string.
//
// Path is copyable by value.
// Path is copyable by value. The zero value for Path is equivalent to
// fspath.Parse(""), i.e. the empty path.
type Path struct {
// Begin is an iterator to the first path component in the relative part of
// the path.

11
pkg/gohacks/BUILD Normal file
View File

@ -0,0 +1,11 @@
load("//tools:defs.bzl", "go_library")
package(licenses = ["notice"])
go_library(
name = "gohacks",
srcs = [
"gohacks_unsafe.go",
],
visibility = ["//:sandbox"],
)

View File

@ -0,0 +1,57 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package gohacks contains utilities for subverting the Go compiler.
package gohacks
import (
"reflect"
"unsafe"
)
// Noescape hides a pointer from escape analysis. Noescape is the identity
// function but escape analysis doesn't think the output depends on the input.
// Noescape is inlined and currently compiles down to zero instructions.
// USE CAREFULLY!
//
// The caller is responsible for keeping the pointee alive for as long as the
// returned pointer is in use, since the compiler no longer sees the link
// between the two.
//
// (Noescape is copy/pasted from Go's runtime/stubs.go:noescape().)
//
//go:nosplit
func Noescape(p unsafe.Pointer) unsafe.Pointer {
	// Round-trip through uintptr; XOR with 0 leaves the address unchanged.
	x := uintptr(p)
	return unsafe.Pointer(x ^ 0)
}
// ImmutableBytesFromString returns a byte slice with the same contents as
// []byte(s), but backed by the memory of s itself instead of a heap-allocated
// copy. The slice's length and capacity both equal len(s). This is only valid
// if the returned slice is never mutated.
func ImmutableBytesFromString(s string) []byte {
	var (
		result []byte
		src    = (*reflect.StringHeader)(unsafe.Pointer(&s))
		dst    = (*reflect.SliceHeader)(unsafe.Pointer(&result))
	)
	// Point the slice header at the string's bytes.
	dst.Data, dst.Len, dst.Cap = src.Data, src.Len, src.Len
	return result
}
// StringFromImmutableBytes returns a string with the same contents as
// string(bs), but backed by the memory of bs instead of a heap-allocated copy.
// This is only valid if bs is never mutated after StringFromImmutableBytes
// returns.
func StringFromImmutableBytes(bs []byte) string {
	// Reinterpret the slice header in place as a string header. This is
	// cheaper than messing with reflect.StringHeader and reflect.SliceHeader,
	// which as of this writing produces many dead stores of zeroes. Compare
	// strings.Builder.String().
	hdr := unsafe.Pointer(&bs)
	return *(*string)(hdr)
}

View File

@ -117,15 +117,19 @@ func NewVFSLookup(mntns *vfs.MountNamespace, root, workingDir vfs.VirtualDentry)
// default anyways.
//
// TODO(gvisor.dev/issue/1623): Check mount has read and exec permission.
func (l *vfsLookup) OpenPath(ctx context.Context, path string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
func (l *vfsLookup) OpenPath(ctx context.Context, pathname string, opts vfs.OpenOptions, _ *uint, resolveFinal bool) (File, error) {
vfsObj := l.mntns.Root().Mount().Filesystem().VirtualFilesystem()
creds := auth.CredentialsFromContext(ctx)
path := fspath.Parse(pathname)
pop := &vfs.PathOperation{
Root: l.root,
Start: l.root,
Path: fspath.Parse(path),
Start: l.workingDir,
Path: path,
FollowFinalSymlink: resolveFinal,
}
if path.Absolute {
pop.Start = l.root
}
fd, err := vfsObj.OpenAt(ctx, creds, pop, &opts)
if err != nil {
return nil, err

View File

@ -73,9 +73,9 @@ func newTasksInode(inoGen InoGenerator, k *kernel.Kernel, pidns *kernel.PIDNames
"meminfo": newDentry(root, inoGen.NextIno(), 0444, &meminfoData{}),
"mounts": kernfs.NewStaticSymlink(root, inoGen.NextIno(), "self/mounts"),
"net": newNetDir(root, inoGen, k),
"stat": newDentry(root, inoGen.NextIno(), 0444, &statData{}),
"stat": newDentry(root, inoGen.NextIno(), 0444, &statData{k: k}),
"uptime": newDentry(root, inoGen.NextIno(), 0444, &uptimeData{}),
"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{}),
"version": newDentry(root, inoGen.NextIno(), 0444, &versionData{k: k}),
}
inode := &tasksInode{

View File

@ -296,6 +296,50 @@ func (f *FDTable) NewFDs(ctx context.Context, fd int32, files []*fs.File, flags
return fds, nil
}
// NewFDVFS2 allocates a file descriptor greater than or equal to minfd for
// the given file description. If it succeeds, it takes a reference on file.
//
// It returns EINVAL if minfd is negative, and EMFILE if no descriptor is
// available in [minfd, end), where end is math.MaxInt32 or the current
// limits.NumberOfFiles limit from ctx, whichever is smaller.
func (f *FDTable) NewFDVFS2(ctx context.Context, minfd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
	if minfd < 0 {
		// Don't accept negative FDs.
		return -1, syscall.EINVAL
	}

	// Default limit.
	end := int32(math.MaxInt32)

	// Ensure we don't get past the provided limit.
	if limitSet := limits.FromContext(ctx); limitSet != nil {
		lim := limitSet.Get(limits.NumberOfFiles)
		// Only lower end if the limit fits in an int32. Converting a larger
		// (but finite) limit would truncate it, possibly to zero or a
		// negative value, and make every allocation fail with EMFILE.
		if lim.Cur != limits.Infinity && lim.Cur < uint64(math.MaxInt32) {
			end = int32(lim.Cur)
		}
		if minfd >= end {
			return -1, syscall.EMFILE
		}
	}

	f.mu.Lock()
	defer f.mu.Unlock()

	// Start the search at minfd or f.next, whichever is larger.
	fd := minfd
	if fd < f.next {
		fd = f.next
	}
	for fd < end {
		// NOTE(review): slot occupancy is probed with f.get; confirm that it
		// also reports slots holding VFS2 file descriptions, otherwise an
		// occupied slot above f.next could be overwritten here.
		if d, _, _ := f.get(fd); d == nil {
			f.setVFS2(fd, file, flags)
			if fd == f.next {
				// Update next search start position.
				f.next = fd + 1
			}
			return fd, nil
		}
		fd++
	}
	return -1, syscall.EMFILE
}
// NewFDAt sets the file reference for the given FD. If there is an active
// reference for that FD, the ref count for that existing reference is
// decremented.
@ -316,9 +360,6 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
return syscall.EBADF
}
f.mu.Lock()
defer f.mu.Unlock()
// Check the limit for the provided file.
if limitSet := limits.FromContext(ctx); limitSet != nil {
if lim := limitSet.Get(limits.NumberOfFiles); lim.Cur != limits.Infinity && uint64(fd) >= lim.Cur {
@ -327,6 +368,8 @@ func (f *FDTable) newFDAt(ctx context.Context, fd int32, file *fs.File, fileVFS2
}
// Install the entry.
f.mu.Lock()
defer f.mu.Unlock()
f.setAll(fd, file, fileVFS2, flags)
return nil
}

View File

@ -244,6 +244,28 @@ func (f *FSContext) SetRootDirectory(d *fs.Dirent) {
old.DecRef()
}
// SetRootDirectoryVFS2 replaces the root directory with vd. It takes a
// reference on vd and drops the reference held on the previous root.
//
// It panics if vd is the zero value, or if the current root is no longer
// valid (i.e. the FSContext has already been destroyed).
func (f *FSContext) SetRootDirectoryVFS2(vd vfs.VirtualDentry) {
	if !vd.Ok() {
		panic("FSContext.SetRootDirectoryVFS2 called with zero-value VirtualDentry")
	}

	f.mu.Lock()
	prev := f.rootVFS2
	if !prev.Ok() {
		f.mu.Unlock()
		panic(fmt.Sprintf("FSContext.SetRootDirectoryVFS2(%v)) called after destroy", vd))
	}
	vd.IncRef()
	f.rootVFS2 = vd
	f.mu.Unlock()

	// Release the previous root only after f.mu has been dropped.
	prev.DecRef()
}
// Umask returns the current umask.
func (f *FSContext) Umask() uint {
f.mu.Lock()

View File

@ -789,6 +789,15 @@ func (t *Task) NewFDFrom(fd int32, file *fs.File, flags FDFlags) (int32, error)
return fds[0], nil
}
// NewFDFromVFS2 is a convenience wrapper for t.FDTable().NewFDVFS2, which
// allocates a file descriptor no smaller than fd for file.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.Get.
func (t *Task) NewFDFromVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) (int32, error) {
	newFD, err := t.fdTable.NewFDVFS2(t, fd, file, flags)
	return newFD, err
}
// NewFDAt is a convenience wrapper for t.FDTable().NewFDAt.
//
// This automatically passes the task as the context.
@ -798,6 +807,15 @@ func (t *Task) NewFDAt(fd int32, file *fs.File, flags FDFlags) error {
return t.fdTable.NewFDAt(t, fd, file, flags)
}
// NewFDAtVFS2 is a convenience wrapper for t.FDTable().NewFDAtVFS2.
//
// This automatically passes the task as the context.
//
// Precondition: same as FDTable.NewFDAtVFS2.
func (t *Task) NewFDAtVFS2(fd int32, file *vfs.FileDescription, flags FDFlags) error {
	err := t.fdTable.NewFDAtVFS2(t, fd, file, flags)
	return err
}
// WithMuLocked executes f with t.mu locked.
func (t *Task) WithMuLocked(f func(*Task)) {
t.mu.Lock()

View File

@ -25,6 +25,8 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
// LINT.IfChange
// EpollCreate1 implements the epoll_create1(2) linux syscall.
func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
flags := args[0].Int()
@ -164,3 +166,5 @@ func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return EpollWait(t, args)
}
// LINT.ThenChange(vfs2/epoll.go)

View File

@ -130,6 +130,8 @@ func copyInPath(t *kernel.Task, addr usermem.Addr, allowEmpty bool) (path string
return path, dirPath, nil
}
// LINT.IfChange
func openAt(t *kernel.Task, dirFD int32, addr usermem.Addr, flags uint) (fd uintptr, err error) {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
@ -575,6 +577,10 @@ func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, accessAt(t, dirFD, addr, flags&linux.AT_SYMLINK_NOFOLLOW == 0, mode)
}
// LINT.ThenChange(vfs2/filesystem.go)
// LINT.IfChange
// Ioctl implements linux syscall ioctl(2).
func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
fd := args[0].Int()
@ -650,6 +656,10 @@ func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
}
// LINT.ThenChange(vfs2/ioctl.go)
// LINT.IfChange
// Getcwd implements the linux syscall getcwd(2).
func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
addr := args[0].Pointer()
@ -760,6 +770,10 @@ func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, nil
}
// LINT.ThenChange(vfs2/fscontext.go)
// LINT.IfChange
// Close implements linux syscall close(2).
func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
fd := args[0].Int()
@ -1094,6 +1108,8 @@ func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
}
// LINT.ThenChange(vfs2/fd.go)
const (
_FADV_NORMAL = 0
_FADV_RANDOM = 1
@ -1141,6 +1157,8 @@ func Fadvise64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, nil
}
// LINT.IfChange
func mkdirAt(t *kernel.Task, dirFD int32, addr usermem.Addr, mode linux.FileMode) error {
path, _, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
@ -1421,6 +1439,10 @@ func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscal
return 0, nil, linkAt(t, oldDirFD, oldAddr, newDirFD, newAddr, resolve, allowEmpty)
}
// LINT.ThenChange(vfs2/filesystem.go)
// LINT.IfChange
func readlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr, bufAddr usermem.Addr, size uint) (copied uintptr, err error) {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
@ -1480,6 +1502,10 @@ func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sy
return n, nil, err
}
// LINT.ThenChange(vfs2/stat.go)
// LINT.IfChange
func unlinkAt(t *kernel.Task, dirFD int32, addr usermem.Addr) error {
path, dirPath, err := copyInPath(t, addr, false /* allowEmpty */)
if err != nil {
@ -1516,6 +1542,10 @@ func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
return 0, nil, unlinkAt(t, dirFD, addr)
}
// LINT.ThenChange(vfs2/filesystem.go)
// LINT.IfChange
// Truncate implements linux syscall truncate(2).
func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
addr := args[0].Pointer()
@ -1614,6 +1644,8 @@ func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, nil
}
// LINT.ThenChange(vfs2/setstat.go)
// Umask implements linux syscall umask(2).
func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
mask := args[0].ModeT()
@ -1621,6 +1653,8 @@ func Umask(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
return uintptr(mask), nil, nil
}
// LINT.IfChange
// Change ownership of a file.
//
// uid and gid may be -1, in which case they will not be changed.
@ -1987,6 +2021,10 @@ func Futimesat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sys
return 0, nil, utimes(t, dirFD, pathnameAddr, ts, true)
}
// LINT.ThenChange(vfs2/setstat.go)
// LINT.IfChange
func renameAt(t *kernel.Task, oldDirFD int32, oldAddr usermem.Addr, newDirFD int32, newAddr usermem.Addr) error {
newPath, _, err := copyInPath(t, newAddr, false /* allowEmpty */)
if err != nil {
@ -2042,6 +2080,8 @@ func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Sysc
return 0, nil, renameAt(t, oldDirFD, oldPathAddr, newDirFD, newPathAddr)
}
// LINT.ThenChange(vfs2/filesystem.go)
// Fallocate implements linux system call fallocate(2).
func Fallocate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
fd := args[0].Int()

View File

@ -27,6 +27,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
// LINT.IfChange
// Getdents implements linux syscall getdents(2) for 64bit systems.
func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
fd := args[0].Int()
@ -244,3 +246,5 @@ func (ds *direntSerializer) CopyOut(name string, attr fs.DentAttr) error {
func (ds *direntSerializer) Written() int {
return ds.written
}
// LINT.ThenChange(vfs2/getdents.go)

View File

@ -21,6 +21,8 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
// LINT.IfChange
// Lseek implements linux syscall lseek(2).
func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
fd := args[0].Int()
@ -52,3 +54,5 @@ func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.Syscall
}
return uintptr(offset), nil, err
}
// LINT.ThenChange(vfs2/read_write.go)

View File

@ -35,6 +35,8 @@ func Brk(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallCo
return uintptr(addr), nil, nil
}
// LINT.IfChange
// Mmap implements linux syscall mmap(2).
func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
prot := args[2].Int()
@ -104,6 +106,8 @@ func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallC
return uintptr(rv), nil, err
}
// LINT.ThenChange(vfs2/mmap.go)
// Munmap implements linux syscall munmap(2).
func Munmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
return 0, nil, t.MemoryManager().MUnmap(t, args[0].Pointer(), args[1].Uint64())

View File

@ -28,6 +28,8 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
// LINT.IfChange
const (
// EventMaskRead contains events that can be triggered on reads.
EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
@ -388,3 +390,5 @@ func preadv(t *kernel.Task, f *fs.File, dst usermem.IOSequence, offset int64) (i
return total, err
}
// LINT.ThenChange(vfs2/read_write.go)

View File

@ -23,6 +23,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
// LINT.IfChange
func statFromAttrs(t *kernel.Task, sattr fs.StableAttr, uattr fs.UnstableAttr) linux.Stat {
return linux.Stat{
Dev: sattr.DeviceID,
@ -297,3 +299,5 @@ func statfsImpl(t *kernel.Task, d *fs.Dirent, addr usermem.Addr) error {
_, err = t.CopyOut(addr, &statfs)
return err
}
// LINT.ThenChange(vfs2/stat.go)

View File

@ -22,6 +22,8 @@ import (
"gvisor.dev/gvisor/pkg/syserror"
)
// LINT.IfChange
// Sync implements linux system call sync(2).
func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
t.MountNamespace().SyncAll(t)
@ -135,3 +137,5 @@ func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel
return 0, nil, syserror.ConvertIntr(err, kernel.ERESTARTSYS)
}
// LINT.ThenChange(vfs2/sync.go)

View File

@ -28,6 +28,8 @@ import (
"gvisor.dev/gvisor/pkg/waiter"
)
// LINT.IfChange
const (
// EventMaskWrite contains events that can be triggered on writes.
//
@ -358,3 +360,5 @@ func pwritev(t *kernel.Task, f *fs.File, src usermem.IOSequence, offset int64) (
return total, err
}
// LINT.ThenChange(vfs2/read_write.go)

View File

@ -25,6 +25,8 @@ import (
"gvisor.dev/gvisor/pkg/usermem"
)
// LINT.IfChange
// GetXattr implements linux syscall getxattr(2).
func GetXattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
return getXattrFromPath(t, args, true)
@ -418,3 +420,5 @@ func removeXattr(t *kernel.Task, d *fs.Dirent, nameAddr usermem.Addr) error {
return d.Inode.RemoveXattr(t, d, name)
}
// LINT.ThenChange(vfs2/xattr.go)

View File

@ -5,18 +5,44 @@ package(licenses = ["notice"])
go_library(
name = "vfs2",
srcs = [
"epoll.go",
"epoll_unsafe.go",
"execve.go",
"fd.go",
"filesystem.go",
"fscontext.go",
"getdents.go",
"ioctl.go",
"linux64.go",
"linux64_override_amd64.go",
"linux64_override_arm64.go",
"sys_read.go",
"mmap.go",
"path.go",
"poll.go",
"read_write.go",
"setstat.go",
"stat.go",
"sync.go",
"xattr.go",
],
marshal = True,
visibility = ["//:sandbox"],
deps = [
"//pkg/abi/linux",
"//pkg/fspath",
"//pkg/gohacks",
"//pkg/sentry/arch",
"//pkg/sentry/fsbridge",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/kernel/time",
"//pkg/sentry/limits",
"//pkg/sentry/loader",
"//pkg/sentry/memmap",
"//pkg/sentry/syscalls",
"//pkg/sentry/syscalls/linux",
"//pkg/sentry/vfs",
"//pkg/sync",
"//pkg/syserror",
"//pkg/usermem",
"//pkg/waiter",

View File

@ -0,0 +1,225 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"math"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
// EpollCreate1 implements Linux syscall epoll_create1(2).
//
// args[0] holds the flags; any bit other than EPOLL_CLOEXEC yields EINVAL.
// On success the new epoll instance's file descriptor is returned, marked
// close-on-exec if EPOLL_CLOEXEC was given.
func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	epollFlags := args[0].Int()
	if unknown := epollFlags &^ linux.EPOLL_CLOEXEC; unknown != 0 {
		return 0, nil, syserror.EINVAL
	}

	epollFile, err := t.Kernel().VFS().NewEpollInstanceFD()
	if err != nil {
		return 0, nil, err
	}
	// Drop our reference on return, whether or not an FD was installed.
	defer epollFile.DecRef()

	fdFlags := kernel.FDFlags{
		CloseOnExec: epollFlags&linux.EPOLL_CLOEXEC != 0,
	}
	newFD, err := t.NewFDFromVFS2(0, epollFile, fdFlags)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(newFD), nil, nil
}
// EpollCreate implements Linux syscall epoll_create(2).
//
// args[0] is the legacy size hint: it is only validated, never used. On
// success the new epoll instance's file descriptor is returned with no FD
// flags set.
func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// "Since Linux 2.6.8, the size argument is ignored, but must be greater
	// than zero" - epoll_create(2)
	if sizeHint := args[0].Int(); sizeHint <= 0 {
		return 0, nil, syserror.EINVAL
	}

	epollFile, err := t.Kernel().VFS().NewEpollInstanceFD()
	if err != nil {
		return 0, nil, err
	}
	// Drop our reference on return, whether or not an FD was installed.
	defer epollFile.DecRef()

	newFD, err := t.NewFDFromVFS2(0, epollFile, kernel.FDFlags{})
	if err != nil {
		return 0, nil, err
	}
	return uintptr(newFD), nil, nil
}
// EpollCtl implements Linux syscall epoll_ctl(2).
//
// Arguments are, in order: the epoll FD, the operation (EPOLL_CTL_ADD,
// EPOLL_CTL_DEL or EPOLL_CTL_MOD), the target FD, and the user address of a
// struct epoll_event (read only for ADD and MOD).
//
// It returns EBADF if either FD is not open, and EINVAL if the epoll FD does
// not refer to an epoll instance, if both FDs refer to the same file
// description, or if the operation is unknown.
func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		epfd      = args[0].Int()
		op        = args[1].Int()
		fd        = args[2].Int()
		eventAddr = args[3].Pointer()
	)

	epfile := t.GetFileVFS2(epfd)
	if epfile == nil {
		return 0, nil, syserror.EBADF
	}
	defer epfile.DecRef()
	ep, isEpoll := epfile.Impl().(*vfs.EpollInstance)
	if !isEpoll {
		return 0, nil, syserror.EINVAL
	}

	target := t.GetFileVFS2(fd)
	if target == nil {
		return 0, nil, syserror.EBADF
	}
	defer target.DecRef()
	// An epoll instance can't be registered with itself.
	if epfile == target {
		return 0, nil, syserror.EINVAL
	}

	switch op {
	case linux.EPOLL_CTL_ADD, linux.EPOLL_CTL_MOD:
		// Both operations take an event description from the application.
		var event linux.EpollEvent
		if err := event.CopyIn(t, eventAddr); err != nil {
			return 0, nil, err
		}
		if op == linux.EPOLL_CTL_ADD {
			return 0, nil, ep.AddInterest(target, fd, event)
		}
		return 0, nil, ep.ModifyInterest(target, fd, event)
	case linux.EPOLL_CTL_DEL:
		return 0, nil, ep.DeleteInterest(target, fd)
	default:
		return 0, nil, syserror.EINVAL
	}
}
// EpollWait implements Linux syscall epoll_wait(2).
//
// Arguments are, in order: the epoll FD, the user address of the output event
// array, the maximum number of events to return, and a timeout in
// milliseconds. A timeout of 0 never blocks; a positive timeout sets a
// deadline on the monotonic clock; a negative timeout sets no deadline.
//
// It returns the number of events copied out to the application. EINVAL is
// returned if maxEvents is not in (0, _EP_MAX_EVENTS] or if the FD is not an
// epoll instance, and EBADF if the FD is not open. If copying out fails after
// at least one event has been copied, the partial count is returned without
// an error. Expiry of the timeout is reported as 0 events with a nil error.
func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	epfd := args[0].Int()
	eventsAddr := args[1].Pointer()
	maxEvents := int(args[2].Int())
	timeout := int(args[3].Int())

	const _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS
	if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS {
		return 0, nil, syserror.EINVAL
	}

	epfile := t.GetFileVFS2(epfd)
	if epfile == nil {
		return 0, nil, syserror.EBADF
	}
	defer epfile.DecRef()
	ep, ok := epfile.Impl().(*vfs.EpollInstance)
	if !ok {
		return 0, nil, syserror.EINVAL
	}

	// Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent,
	// maxEvents), so that the buffer can be allocated on the stack.
	var (
		events       [16]linux.EpollEvent
		total        int
		ch           chan struct{}
		haveDeadline bool
		deadline     ktime.Time
	)
	for {
		// Read at most one buffer's worth of events, and never more than the
		// application still has room for.
		batchEvents := len(events)
		if batchEvents > maxEvents {
			batchEvents = maxEvents
		}
		n := ep.ReadEvents(events[:batchEvents])
		// From here on maxEvents is the remaining capacity of the
		// application's buffer.
		maxEvents -= n
		if n != 0 {
			// Copy what we read out.
			copiedEvents, err := copyOutEvents(t, eventsAddr, events[:n])
			eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
			total += copiedEvents
			if err != nil {
				// Report a partial result in preference to the error.
				if total != 0 {
					return uintptr(total), nil, nil
				}
				return 0, nil, err
			}
			// If we've filled the application's event buffer, we're done.
			if maxEvents == 0 {
				return uintptr(total), nil, nil
			}
			// Loop if we read a full batch, under the expectation that there
			// may be more events to read.
			if n == batchEvents {
				continue
			}
		}
		// We get here if n != batchEvents. If we read any number of events
		// (just now, or in a previous iteration of this loop), or if timeout
		// is 0 (such that epoll_wait should be non-blocking), return the
		// events we've read so far to the application.
		if total != 0 || timeout == 0 {
			return uintptr(total), nil, nil
		}
		// In the first iteration of this loop, register with the epoll
		// instance for readability events, but then immediately continue the
		// loop since we need to retry ReadEvents() before blocking. In all
		// subsequent iterations, block until events are available, the timeout
		// expires, or an interrupt arrives.
		if ch == nil {
			var w waiter.Entry
			w, ch = waiter.NewChannelEntry(nil)
			epfile.EventRegister(&w, waiter.EventIn)
			defer epfile.EventUnregister(&w)
		} else {
			// Set up the timer if a timeout was specified.
			if timeout > 0 && !haveDeadline {
				// The deadline is computed once, so later iterations wait
				// against the same deadline instead of restarting the timeout.
				timeoutDur := time.Duration(timeout) * time.Millisecond
				deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur)
				haveDeadline = true
			}
			if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
				// An expired timeout is not an error for epoll_wait.
				if err == syserror.ETIMEDOUT {
					err = nil
				}
				// total must be 0 since otherwise we would have returned
				// above.
				return 0, nil, err
			}
		}
	}
}
// EpollPwait implements Linux syscall epoll_pwait(2).
//
// args[4] and args[5] are the user address and size of a signal mask, which
// is applied via setTempSignalSet. The wait itself is then delegated to
// EpollWait with the same arguments.
func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		sigMaskAddr = args[4].Pointer()
		sigMaskSize = uint(args[5].Uint())
	)

	err := setTempSignalSet(t, sigMaskAddr, sigMaskSize)
	if err != nil {
		return 0, nil, err
	}
	return EpollWait(t, args)
}

View File

@ -0,0 +1,44 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"reflect"
"runtime"
"unsafe"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/gohacks"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/usermem"
)
// sizeofEpollEvent is the size in bytes of linux.EpollEvent as laid out in
// memory by Go.
const sizeofEpollEvent = int(unsafe.Sizeof(linux.EpollEvent{}))

// copyOutEvents copies events to the task's memory at addr by viewing the
// slice's backing memory as raw bytes; no per-event marshalling is done.
//
// It returns the number of whole events copied (a partially copied trailing
// event is not counted) together with any error from CopyOutBytes.
func copyOutEvents(t *kernel.Task, addr usermem.Addr, events []linux.EpollEvent) (int, error) {
	// &events[0] below would panic on an empty slice.
	if len(events) == 0 {
		return 0, nil
	}
	// Cast events to a byte slice for copying.
	var eventBytes []byte
	eventBytesHdr := (*reflect.SliceHeader)(unsafe.Pointer(&eventBytes))
	// Noescape hides the pointer to events from escape analysis.
	eventBytesHdr.Data = uintptr(gohacks.Noescape(unsafe.Pointer(&events[0])))
	eventBytesHdr.Len = len(events) * sizeofEpollEvent
	eventBytesHdr.Cap = len(events) * sizeofEpollEvent
	copiedBytes, err := t.CopyOutBytes(addr, eventBytes)
	// eventBytes refers to events only through a uintptr, so explicitly keep
	// events alive until the copy has completed.
	runtime.KeepAlive(events)
	copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
	return copiedEvents, err
}

View File

@ -0,0 +1,137 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fsbridge"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/loader"
slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
// Execve implements linux syscall execve(2).
//
// It is execveat with dirfd fixed to AT_FDCWD and no flags; the pathname,
// argv and envp user addresses are taken from args[0], args[1] and args[2].
func Execve(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		pathnameAddr = args[0].Pointer()
		argvAddr     = args[1].Pointer()
		envvAddr     = args[2].Pointer()
	)
	return execveat(t, linux.AT_FDCWD, pathnameAddr, argvAddr, envvAddr, 0 /* flags */)
}
// Execveat implements linux syscall execveat(2).
//
// Arguments are, in order: dirfd, the pathname address, the argv address, the
// envp address, and flags. All of the work is done by execveat.
func Execveat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		dirfd        = args[0].Int()
		pathnameAddr = args[1].Pointer()
		argvAddr     = args[2].Pointer()
		envvAddr     = args[3].Pointer()
		flags        = args[4].Int()
	)
	return execveat(t, dirfd, pathnameAddr, argvAddr, envvAddr, flags)
}
// execveat is the common implementation of execve(2) and execveat(2).
//
// It copies in the pathname and the optional argv/envv vectors (a zero address
// leaves the vector nil), loads a new task image and asks the task to switch
// to it. Only AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW are accepted in flags;
// anything else yields EINVAL.
//
// If dirfd is not AT_FDCWD and the pathname is relative, the executable is
// opened here relative to dirfd and handed to the loader as an open file. In
// that case ENOENT is returned for an empty path without AT_EMPTY_PATH, and
// EBADF if dirfd is not open. Otherwise the loader resolves the pathname
// itself.
func execveat(t *kernel.Task, dirfd int32, pathnameAddr, argvAddr, envvAddr usermem.Addr, flags int32) (uintptr, *kernel.SyscallControl, error) {
	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
		return 0, nil, syserror.EINVAL
	}

	pathname, err := t.CopyInString(pathnameAddr, linux.PATH_MAX)
	if err != nil {
		return 0, nil, err
	}
	var argv, envv []string
	if argvAddr != 0 {
		var err error
		argv, err = t.CopyInVector(argvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
		if err != nil {
			return 0, nil, err
		}
	}
	if envvAddr != 0 {
		var err error
		envv, err = t.CopyInVector(envvAddr, slinux.ExecMaxElemSize, slinux.ExecMaxTotalSize)
		if err != nil {
			return 0, nil, err
		}
	}

	root := t.FSContext().RootDirectoryVFS2()
	defer root.DecRef()
	// executable stays nil unless the file is opened relative to dirfd below.
	var executable fsbridge.File
	closeOnExec := false
	if path := fspath.Parse(pathname); dirfd != linux.AT_FDCWD && !path.Absolute {
		// We must open the executable ourselves since dirfd is used as the
		// starting point while resolving path, but the task working directory
		// is used as the starting point while resolving interpreters (Linux:
		// fs/binfmt_script.c:load_script() => fs/exec.c:open_exec() =>
		// do_open_execat(fd=AT_FDCWD)), and the loader package is currently
		// incapable of handling this correctly.
		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
			return 0, nil, syserror.ENOENT
		}
		dirfile, dirfileFlags := t.FDTable().GetVFS2(dirfd)
		if dirfile == nil {
			return 0, nil, syserror.EBADF
		}
		// Take a reference on the starting dentry before dropping the one on
		// dirfile.
		start := dirfile.VirtualDentry()
		start.IncRef()
		dirfile.DecRef()
		// The close-on-exec flag of dirfd is passed on to the loader.
		closeOnExec = dirfileFlags.CloseOnExec
		file, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &vfs.PathOperation{
			Root:               root,
			Start:              start,
			Path:               path,
			FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
		}, &vfs.OpenOptions{
			Flags:    linux.O_RDONLY,
			FileExec: true,
		})
		// start is released before the error check so that it is dropped on
		// both the success and failure paths.
		start.DecRef()
		if err != nil {
			return 0, nil, err
		}
		defer file.DecRef()
		executable = fsbridge.NewVFSFile(file)
	}

	// Load the new TaskContext.
	mntns := t.MountNamespaceVFS2() // FIXME(jamieliu): useless refcount change
	defer mntns.DecRef()
	wd := t.FSContext().WorkingDirectoryVFS2()
	defer wd.DecRef()
	remainingTraversals := uint(linux.MaxSymlinkTraversals)
	loadArgs := loader.LoadArgs{
		Opener:              fsbridge.NewVFSLookup(mntns, root, wd),
		RemainingTraversals: &remainingTraversals,
		ResolveFinal:        flags&linux.AT_SYMLINK_NOFOLLOW == 0,
		Filename:            pathname,
		File:                executable,
		CloseOnExec:         closeOnExec,
		Argv:                argv,
		Envv:                envv,
		Features:            t.Arch().FeatureSet(),
	}

	tc, se := t.Kernel().LoadTaskImage(t, loadArgs)
	if se != nil {
		return 0, nil, se.ToError()
	}

	ctrl, err := t.Execve(tc)
	return 0, ctrl, err
}

View File

@ -0,0 +1,147 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
"gvisor.dev/gvisor/pkg/syserror"
)
// Close implements Linux syscall close(2).
//
// The descriptor named by args[0] is removed from the task's FD table. If no
// file was installed at that descriptor, EBADF is returned. Otherwise the
// file's OnClose hook runs and its error is translated by
// slinux.HandleIOErrorVFS2.
func Close(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// Remove hands back a reference on the file. We hold it across OnClose
	// and only drop it (via the deferred DecRef) when this function returns.
	_, f := t.FDTable().Remove(args[0].Int())
	if f == nil {
		return 0, nil, syserror.EBADF
	}
	defer f.DecRef()

	closeErr := f.OnClose(t)
	return 0, nil, slinux.HandleIOErrorVFS2(t, false /* partial */, closeErr, syserror.EINTR, "close", f)
}
// Dup implements Linux syscall dup(2).
//
// It looks up the file for the descriptor in args[0] and installs it at a new
// descriptor (searching from 0) with default FD flags. EBADF is returned when
// the descriptor has no file; any failure to allocate the new descriptor is
// reported as EMFILE.
func Dup(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	src := t.GetFileVFS2(args[0].Int())
	if src == nil {
		return 0, nil, syserror.EBADF
	}
	defer src.DecRef()

	dupFD, allocErr := t.NewFDFromVFS2(0, src, kernel.FDFlags{})
	if allocErr != nil {
		// The underlying error is deliberately replaced with EMFILE.
		return 0, nil, syserror.EMFILE
	}
	return uintptr(dupFD), nil, nil
}
// Dup2 implements Linux syscall dup2(2).
//
// When oldfd and newfd differ, the work is delegated to dup3 with no flags.
// When they are equal, the call only validates oldfd: EBADF if it has no file,
// otherwise newfd is returned unchanged.
func Dup2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldfd, newfd := args[0].Int(), args[1].Int()
	if oldfd != newfd {
		return dup3(t, oldfd, newfd, 0)
	}

	// Same descriptor on both sides: just check that it is valid.
	f := t.GetFileVFS2(oldfd)
	if f == nil {
		return 0, nil, syserror.EBADF
	}
	f.DecRef()
	return uintptr(newfd), nil, nil
}
// Dup3 implements Linux syscall dup3(2).
//
// Unlike Dup2, identical oldfd and newfd are rejected with EINVAL. Otherwise
// the call is forwarded to dup3 together with the flags from args[2].
func Dup3(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	oldfd, newfd := args[0].Int(), args[1].Int()
	if oldfd == newfd {
		return 0, nil, syserror.EINVAL
	}
	return dup3(t, oldfd, newfd, args[2].Uint())
}
// dup3 installs the file referenced by oldfd at newfd.
//
// O_CLOEXEC is the only flag accepted; any other bit yields EINVAL. EBADF is
// returned if oldfd has no file. On success newfd is returned.
func dup3(t *kernel.Task, oldfd, newfd int32, flags uint32) (uintptr, *kernel.SyscallControl, error) {
	if unknown := flags &^ linux.O_CLOEXEC; unknown != 0 {
		return 0, nil, syserror.EINVAL
	}

	src := t.GetFileVFS2(oldfd)
	if src == nil {
		return 0, nil, syserror.EBADF
	}
	defer src.DecRef()

	fdFlags := kernel.FDFlags{
		CloseOnExec: flags&linux.O_CLOEXEC != 0,
	}
	if err := t.NewFDAtVFS2(newfd, src, fdFlags); err != nil {
		return 0, nil, err
	}
	return uintptr(newfd), nil, nil
}
// Fcntl implements linux syscall fcntl(2).
//
// Supported commands are F_DUPFD, F_DUPFD_CLOEXEC, F_GETFD, F_SETFD, F_GETFL
// and F_SETFL. EBADF is returned if the descriptor has no file, and EINVAL for
// every other command.
func Fcntl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	cmd := args[1].Int()

	file, fdFlags := t.FDTable().GetVFS2(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	switch cmd {
	case linux.F_DUPFD, linux.F_DUPFD_CLOEXEC:
		// args[2] is the lowest descriptor number to consider.
		newFD, err := t.NewFDFromVFS2(args[2].Int(), file, kernel.FDFlags{
			CloseOnExec: cmd == linux.F_DUPFD_CLOEXEC,
		})
		if err != nil {
			return 0, nil, err
		}
		return uintptr(newFD), nil, nil

	case linux.F_GETFD:
		return uintptr(fdFlags.ToLinuxFDFlags()), nil, nil

	case linux.F_SETFD:
		requested := args[2].Uint()
		t.FDTable().SetFlags(fd, kernel.FDFlags{
			CloseOnExec: requested&linux.FD_CLOEXEC != 0,
		})
		return 0, nil, nil

	case linux.F_GETFL:
		return uintptr(file.StatusFlags()), nil, nil

	case linux.F_SETFL:
		return 0, nil, file.SetStatusFlags(t, t.Credentials(), args[2].Uint())

	default:
		// TODO(gvisor.dev/issue/1623): Everything else is not yet supported.
		return 0, nil, syserror.EINVAL
	}
}

View File

@ -0,0 +1,326 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
// Link implements Linux syscall link(2).
//
// It forwards to linkat with AT_FDCWD for both directory descriptors and no
// flags.
func Link(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := linkat(t, linux.AT_FDCWD, args[0].Pointer(), linux.AT_FDCWD, args[1].Pointer(), 0 /* flags */)
	return 0, nil, err
}
// Linkat implements Linux syscall linkat(2).
//
// Arguments are, in order: olddirfd, oldpath address, newdirfd, newpath
// address, flags. All of them are passed through to linkat.
func Linkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		olddirfd    = args[0].Int()
		oldpathAddr = args[1].Pointer()
		newdirfd    = args[2].Int()
		newpathAddr = args[3].Pointer()
		flags       = args[4].Int()
	)
	err := linkat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
	return 0, nil, err
}
// linkat is the shared implementation of link(2) and linkat(2).
//
// Only AT_EMPTY_PATH and AT_SYMLINK_FOLLOW are accepted in flags (EINVAL
// otherwise). AT_EMPTY_PATH additionally requires CAP_DAC_READ_SEARCH; without
// it ENOENT is returned. The old path honours both flags; the new path never
// allows an empty path and never follows a final symlink.
func linkat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags int32) error {
	const supported = linux.AT_EMPTY_PATH | linux.AT_SYMLINK_FOLLOW
	if flags&^supported != 0 {
		return syserror.EINVAL
	}
	emptyPathOK := flags&linux.AT_EMPTY_PATH != 0
	if emptyPathOK && !t.HasCapability(linux.CAP_DAC_READ_SEARCH) {
		return syserror.ENOENT
	}

	// Resolve the source first, then the destination.
	srcPath, err := copyInPath(t, oldpathAddr)
	if err != nil {
		return err
	}
	src, err := getTaskPathOperation(t, olddirfd, srcPath, shouldAllowEmptyPath(emptyPathOK), shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_FOLLOW != 0))
	if err != nil {
		return err
	}
	defer src.Release()

	dstPath, err := copyInPath(t, newpathAddr)
	if err != nil {
		return err
	}
	dst, err := getTaskPathOperation(t, newdirfd, dstPath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer dst.Release()

	return t.Kernel().VFS().LinkAt(t, t.Credentials(), &src.pop, &dst.pop)
}
// Mkdir implements Linux syscall mkdir(2).
//
// It forwards to mkdirat with AT_FDCWD as the directory descriptor.
func Mkdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := mkdirat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].ModeT())
	return 0, nil, err
}
// Mkdirat implements Linux syscall mkdirat(2).
//
// Arguments are, in order: dirfd, path address, mode.
func Mkdirat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := mkdirat(t, args[0].Int(), args[1].Pointer(), args[2].ModeT())
	return 0, nil, err
}
// mkdirat is the shared implementation of mkdir(2) and mkdirat(2).
//
// The path is copied in from addr and resolved relative to dirfd without
// allowing an empty path or following a final symlink. Only the permission
// bits and the sticky bit of mode are kept, and the task's umask is then
// cleared from the result.
func mkdirat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint) error {
	dirPath, err := copyInPath(t, addr)
	if err != nil {
		return err
	}
	op, err := getTaskPathOperation(t, dirfd, dirPath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer op.Release()

	allowed := mode & (0777 | linux.S_ISVTX)
	return t.Kernel().VFS().MkdirAt(t, t.Credentials(), &op.pop, &vfs.MkdirOptions{
		Mode: linux.FileMode(allowed &^ t.FSContext().Umask()),
	})
}
// Mknod implements Linux syscall mknod(2).
//
// It forwards to mknodat with AT_FDCWD as the directory descriptor.
func Mknod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := mknodat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].ModeT(), args[2].Uint())
	return 0, nil, err
}
// Mknodat implements Linux syscall mknodat(2).
//
// Arguments are, in order: dirfd, path address, mode, device number.
func Mknodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := mknodat(t, args[0].Int(), args[1].Pointer(), args[2].ModeT(), args[3].Uint())
	return 0, nil, err
}
// mknodat is the shared implementation of mknod(2) and mknodat(2).
//
// The path is copied in from addr and resolved relative to dirfd without
// allowing an empty path or following a final symlink. dev is split into
// major and minor numbers with linux.DecodeDeviceID, and the task's umask is
// cleared from mode.
func mknodat(t *kernel.Task, dirfd int32, addr usermem.Addr, mode uint, dev uint32) error {
	nodePath, err := copyInPath(t, addr)
	if err != nil {
		return err
	}
	op, err := getTaskPathOperation(t, dirfd, nodePath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer op.Release()

	devMajor, devMinor := linux.DecodeDeviceID(dev)
	return t.Kernel().VFS().MknodAt(t, t.Credentials(), &op.pop, &vfs.MknodOptions{
		Mode:     linux.FileMode(mode &^ t.FSContext().Umask()),
		DevMajor: uint32(devMajor),
		DevMinor: devMinor,
	})
}
// Open implements Linux syscall open(2).
//
// It forwards to openat with AT_FDCWD as the directory descriptor. Arguments
// are, in order: path address, flags, mode.
func Open(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return openat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].Uint(), args[2].ModeT())
}
// Openat implements Linux syscall openat(2).
//
// Arguments are, in order: dirfd, path address, flags, mode.
func Openat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return openat(t, args[0].Int(), args[1].Pointer(), args[2].Uint(), args[3].ModeT())
}
// Creat implements Linux syscall creat(2).
//
// It is openat relative to AT_FDCWD with the fixed flag set
// O_WRONLY|O_CREAT|O_TRUNC.
func Creat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	const creatFlags = linux.O_WRONLY | linux.O_CREAT | linux.O_TRUNC
	return openat(t, linux.AT_FDCWD, args[0].Pointer(), creatFlags, args[1].ModeT())
}
// openat is the shared implementation of open(2), openat(2) and creat(2).
//
// The path is copied in from pathAddr and resolved relative to dirfd; a final
// symlink is followed unless O_NOFOLLOW is set. mode is restricted to the
// permission, set-user-ID, set-group-ID and sticky bits, with the task's umask
// cleared. The opened file is installed at a new descriptor, which is marked
// close-on-exec when O_CLOEXEC is set.
func openat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, flags uint32, mode uint) (uintptr, *kernel.SyscallControl, error) {
	filePath, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}
	followLink := flags&linux.O_NOFOLLOW == 0
	op, err := getTaskPathOperation(t, dirfd, filePath, disallowEmptyPath, shouldFollowFinalSymlink(followLink))
	if err != nil {
		return 0, nil, err
	}
	defer op.Release()

	const modeMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
	opened, err := t.Kernel().VFS().OpenAt(t, t.Credentials(), &op.pop, &vfs.OpenOptions{
		Flags: flags,
		Mode:  linux.FileMode(mode & modeMask &^ t.FSContext().Umask()),
	})
	if err != nil {
		return 0, nil, err
	}
	// The FD table takes its own reference; ours is dropped on return.
	defer opened.DecRef()

	newFD, err := t.NewFDFromVFS2(0, opened, kernel.FDFlags{
		CloseOnExec: flags&linux.O_CLOEXEC != 0,
	})
	return uintptr(newFD), nil, err
}
// Rename implements Linux syscall rename(2).
//
// It forwards to renameat with AT_FDCWD for both directory descriptors and no
// flags.
func Rename(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := renameat(t, linux.AT_FDCWD, args[0].Pointer(), linux.AT_FDCWD, args[1].Pointer(), 0 /* flags */)
	return 0, nil, err
}
// Renameat implements Linux syscall renameat(2).
//
// Arguments are, in order: olddirfd, oldpath address, newdirfd, newpath
// address. No flags are passed to renameat.
func Renameat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := renameat(t, args[0].Int(), args[1].Pointer(), args[2].Int(), args[3].Pointer(), 0 /* flags */)
	return 0, nil, err
}
// Renameat2 implements Linux syscall renameat2(2).
//
// Arguments are, in order: olddirfd, oldpath address, newdirfd, newpath
// address, flags. The flags are passed to renameat unmodified.
func Renameat2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		olddirfd    = args[0].Int()
		oldpathAddr = args[1].Pointer()
		newdirfd    = args[2].Int()
		newpathAddr = args[3].Pointer()
		flags       = args[4].Uint()
	)
	err := renameat(t, olddirfd, oldpathAddr, newdirfd, newpathAddr, flags)
	return 0, nil, err
}
// renameat is the shared implementation of rename(2), renameat(2) and
// renameat2(2).
//
// Both paths are resolved without allowing an empty path and without
// following a final symlink. flags is handed to VFS.RenameAt as-is.
func renameat(t *kernel.Task, olddirfd int32, oldpathAddr usermem.Addr, newdirfd int32, newpathAddr usermem.Addr, flags uint32) error {
	srcPath, err := copyInPath(t, oldpathAddr)
	if err != nil {
		return err
	}
	// "If oldpath refers to a symbolic link, the link is renamed" - rename(2)
	src, err := getTaskPathOperation(t, olddirfd, srcPath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer src.Release()

	dstPath, err := copyInPath(t, newpathAddr)
	if err != nil {
		return err
	}
	dst, err := getTaskPathOperation(t, newdirfd, dstPath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer dst.Release()

	return t.Kernel().VFS().RenameAt(t, t.Credentials(), &src.pop, &dst.pop, &vfs.RenameOptions{Flags: flags})
}
// Rmdir implements Linux syscall rmdir(2).
//
// It forwards to rmdirat with AT_FDCWD as the directory descriptor.
func Rmdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := rmdirat(t, linux.AT_FDCWD, args[0].Pointer())
	return 0, nil, err
}
// rmdirat removes the directory named by the path at pathAddr, resolved
// relative to dirfd. It backs rmdir(2) and unlinkat(2) with AT_REMOVEDIR.
//
// The path operation is built with disallowEmptyPath and followFinalSymlink.
func rmdirat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
	dirPath, err := copyInPath(t, pathAddr)
	if err != nil {
		return err
	}
	op, err := getTaskPathOperation(t, dirfd, dirPath, disallowEmptyPath, followFinalSymlink)
	if err != nil {
		return err
	}
	defer op.Release()

	vfsObj := t.Kernel().VFS()
	return vfsObj.RmdirAt(t, t.Credentials(), &op.pop)
}
// Unlink implements Linux syscall unlink(2).
//
// It forwards to unlinkat with AT_FDCWD as the directory descriptor.
func Unlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := unlinkat(t, linux.AT_FDCWD, args[0].Pointer())
	return 0, nil, err
}
// unlinkat removes the non-directory entry named by the path at pathAddr,
// resolved relative to dirfd. It backs unlink(2) and unlinkat(2) without
// AT_REMOVEDIR.
//
// The path operation is built with disallowEmptyPath and
// nofollowFinalSymlink.
func unlinkat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr) error {
	filePath, err := copyInPath(t, pathAddr)
	if err != nil {
		return err
	}
	op, err := getTaskPathOperation(t, dirfd, filePath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer op.Release()

	vfsObj := t.Kernel().VFS()
	return vfsObj.UnlinkAt(t, t.Credentials(), &op.pop)
}
// Unlinkat implements Linux syscall unlinkat(2).
//
// AT_REMOVEDIR is the only accepted flag; any other bit yields EINVAL. With
// AT_REMOVEDIR the call behaves like rmdirat, otherwise like unlinkat.
func Unlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirfd := args[0].Int()
	pathAddr := args[1].Pointer()
	flags := args[2].Int()

	switch {
	case flags&^linux.AT_REMOVEDIR != 0:
		return 0, nil, syserror.EINVAL
	case flags&linux.AT_REMOVEDIR != 0:
		return 0, nil, rmdirat(t, dirfd, pathAddr)
	default:
		return 0, nil, unlinkat(t, dirfd, pathAddr)
	}
}
// Symlink implements Linux syscall symlink(2).
//
// It forwards to symlinkat with AT_FDCWD as the directory descriptor for the
// link path. Arguments are, in order: target address, link path address.
func Symlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := symlinkat(t, args[0].Pointer(), linux.AT_FDCWD, args[1].Pointer())
	return 0, nil, err
}
// Symlinkat implements Linux syscall symlinkat(2).
//
// Arguments are, in order: target address, newdirfd, link path address.
func Symlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := symlinkat(t, args[0].Pointer(), args[1].Int(), args[2].Pointer())
	return 0, nil, err
}
// symlinkat is the shared implementation of symlink(2) and symlinkat(2).
//
// The target string is copied in first (at most PATH_MAX bytes), then the
// link path. The link path is resolved relative to newdirfd without allowing
// an empty path or following a final symlink.
func symlinkat(t *kernel.Task, targetAddr usermem.Addr, newdirfd int32, linkpathAddr usermem.Addr) error {
	linkTarget, err := t.CopyInString(targetAddr, linux.PATH_MAX)
	if err != nil {
		return err
	}
	linkPath, err := copyInPath(t, linkpathAddr)
	if err != nil {
		return err
	}
	op, err := getTaskPathOperation(t, newdirfd, linkPath, disallowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return err
	}
	defer op.Release()

	vfsObj := t.Kernel().VFS()
	return vfsObj.SymlinkAt(t, t.Credentials(), &op.pop, linkTarget)
}

View File

@ -0,0 +1,131 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
// Getcwd implements Linux syscall getcwd(2).
//
// args[0] is the user buffer address and args[1] its size. The working
// directory's pathname is obtained from VFS.PathnameForGetcwd and copied out
// with a trailing NUL. ERANGE is returned when the buffer cannot hold the
// pathname plus the terminator. On success the number of bytes copied out,
// terminator included, is returned.
func Getcwd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	userBuf := args[0].Pointer()
	userBufLen := args[1].SizeT()

	root := t.FSContext().RootDirectoryVFS2()
	wd := t.FSContext().WorkingDirectoryVFS2()
	cwd, err := t.Kernel().VFS().PathnameForGetcwd(t, root, wd)
	// The references are only needed for the lookup above.
	root.DecRef()
	wd.DecRef()
	if err != nil {
		return 0, nil, err
	}

	// The buffer must be strictly larger than the pathname so that the NUL
	// terminator fits as well.
	if userBufLen <= uint(len(cwd)) {
		return 0, nil, syserror.ERANGE
	}

	// Assemble pathname + NUL in a scratch buffer.
	out := t.CopyScratchBuffer(len(cwd) + 1)
	copy(out, cwd)
	out[len(out)-1] = 0

	copied, err := t.CopyOutBytes(userBuf, out)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(copied), nil, nil
}
// Chdir implements Linux syscall chdir(2).
//
// The path at args[0] is resolved relative to AT_FDCWD, following a final
// symlink. The resulting dentry is looked up with CheckSearchable set and
// installed as the task's working directory.
func Chdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirPath, err := copyInPath(t, args[0].Pointer())
	if err != nil {
		return 0, nil, err
	}
	op, err := getTaskPathOperation(t, linux.AT_FDCWD, dirPath, disallowEmptyPath, followFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer op.Release()

	newWD, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &op.pop, &vfs.GetDentryOptions{
		CheckSearchable: true,
	})
	if err != nil {
		return 0, nil, err
	}
	// Drop our reference once the FS context has been updated.
	defer newWD.DecRef()

	t.FSContext().SetWorkingDirectoryVFS2(newWD)
	return 0, nil, nil
}
// Fchdir implements Linux syscall fchdir(2).
//
// The target is the descriptor in args[0] itself: the path operation is built
// from that descriptor with an empty path. The resulting dentry is looked up
// with CheckSearchable set and installed as the task's working directory.
func Fchdir(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	op, err := getTaskPathOperation(t, args[0].Int(), fspath.Path{}, allowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer op.Release()

	newWD, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &op.pop, &vfs.GetDentryOptions{
		CheckSearchable: true,
	})
	if err != nil {
		return 0, nil, err
	}
	// Drop our reference once the FS context has been updated.
	defer newWD.DecRef()

	t.FSContext().SetWorkingDirectoryVFS2(newWD)
	return 0, nil, nil
}
// Chroot implements Linux syscall chroot(2).
//
// The caller must hold CAP_SYS_CHROOT, otherwise EPERM is returned before the
// path is even read. The path at args[0] is resolved relative to AT_FDCWD,
// following a final symlink. The resulting dentry is looked up with
// CheckSearchable set and installed as the task's root directory.
func Chroot(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr := args[0].Pointer()

	if !t.HasCapability(linux.CAP_SYS_CHROOT) {
		return 0, nil, syserror.EPERM
	}

	rootPath, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}
	op, err := getTaskPathOperation(t, linux.AT_FDCWD, rootPath, disallowEmptyPath, followFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer op.Release()

	newRoot, err := t.Kernel().VFS().GetDentryAt(t, t.Credentials(), &op.pop, &vfs.GetDentryOptions{
		CheckSearchable: true,
	})
	if err != nil {
		return 0, nil, err
	}
	// Drop our reference once the FS context has been updated.
	defer newRoot.DecRef()

	t.FSContext().SetRootDirectoryVFS2(newRoot)
	return 0, nil, nil
}

View File

@ -0,0 +1,149 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"fmt"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
// Getdents implements Linux syscall getdents(2).
//
// It is getdents with the struct linux_dirent record layout selected.
func Getdents(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	const isGetdents64 = false
	return getdents(t, args, isGetdents64)
}
// Getdents64 implements Linux syscall getdents64(2).
//
// It is getdents with the struct linux_dirent64 record layout selected.
func Getdents64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	const isGetdents64 = true
	return getdents(t, args, isGetdents64)
}
// getdents is the shared implementation of getdents(2) and getdents64(2).
//
// args are, in order: fd, user buffer address, buffer size. Directory entries
// are produced by file.IterDirents and written to user memory by a pooled
// getdentsCallback. If at least one byte was written, that byte count is
// returned and any iteration error is dropped; otherwise the iteration error
// (possibly nil) is returned with a count of zero.
func getdents(t *kernel.Task, args arch.SyscallArguments, isGetdents64 bool) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd      = args[0].Int()
		bufAddr = args[1].Pointer()
		bufSize = int(args[2].Uint())
	)

	dir := t.GetFileVFS2(fd)
	if dir == nil {
		return 0, nil, syserror.EBADF
	}
	defer dir.DecRef()

	cb := getGetdentsCallback(t, bufAddr, bufSize, isGetdents64)
	iterErr := dir.IterDirents(t, cb)
	// Read the result out of the callback before returning it to the pool.
	written := bufSize - cb.remaining
	putGetdentsCallback(cb)

	if written != 0 {
		return uintptr(written), nil, nil
	}
	return 0, nil, iterErr
}
// getdentsCallback carries the state of one getdents/getdents64 call while
// directory entries are being iterated. Its Handle method serializes each
// entry into the caller's buffer.
type getdentsCallback struct {
// t is the calling task; it is used for scratch buffers and for copying
// out to user memory.
t *kernel.Task
// addr is the user address at which the next record will be written.
addr usermem.Addr
// remaining is the number of bytes still available in the user buffer.
remaining int
// isGetdents64 selects the struct linux_dirent64 layout when true and the
// struct linux_dirent layout when false.
isGetdents64 bool
}
// getdentsCallbackPool recycles getdentsCallback objects between calls.
// Objects are taken with getGetdentsCallback and returned with
// putGetdentsCallback.
var getdentsCallbackPool = sync.Pool{
New: func() interface{} {
return &getdentsCallback{}
},
}
// getGetdentsCallback takes a getdentsCallback from getdentsCallbackPool and
// initializes it for a call that writes at most size bytes starting at addr.
// Every field is overwritten, so no state from a previous use survives.
func getGetdentsCallback(t *kernel.Task, addr usermem.Addr, size int, isGetdents64 bool) *getdentsCallback {
	fresh := getdentsCallback{
		t:            t,
		addr:         addr,
		remaining:    size,
		isGetdents64: isGetdents64,
	}
	pooled := getdentsCallbackPool.Get().(*getdentsCallback)
	*pooled = fresh
	return pooled
}
// putGetdentsCallback returns cb to getdentsCallbackPool. The caller must not
// use cb afterwards.
func putGetdentsCallback(cb *getdentsCallback) {
// Clear the task pointer so the pooled object does not keep referencing it.
cb.t = nil
getdentsCallbackPool.Put(cb)
}
// Handle implements vfs.IterDirentsCallback.Handle.
//
// It serializes dirent as either a struct linux_dirent64 (when
// cb.isGetdents64 is set) or a struct linux_dirent record and copies it out
// to cb.addr. On success cb.addr is advanced and cb.remaining reduced by the
// number of bytes copied.
//
// If the record does not fit in the cb.remaining bytes left in the caller's
// buffer, EINVAL is returned and nothing is written. A copy-out failure is
// returned as-is, again without advancing cb.addr or cb.remaining.
func (cb *getdentsCallback) Handle(dirent vfs.Dirent) error {
	var buf []byte
	if cb.isGetdents64 {
		// struct linux_dirent64 {
		//     ino64_t        d_ino;    /* 64-bit inode number */
		//     off64_t        d_off;    /* 64-bit offset to next structure */
		//     unsigned short d_reclen; /* Size of this dirent */
		//     unsigned char  d_type;   /* File type */
		//     char           d_name[]; /* Filename (null-terminated) */
		// };
		size := 8 + 8 + 2 + 1 + 1 + len(dirent.Name)
		// Refuse records that would overrun the space left in the user
		// buffer.
		if size > cb.remaining {
			return syserror.EINVAL
		}
		buf = cb.t.CopyScratchBuffer(size)
		usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
		buf[18] = dirent.Type
		copy(buf[19:], dirent.Name)
		buf[size-1] = 0 // NUL terminator
	} else {
		// struct linux_dirent {
		//     unsigned long  d_ino;    /* Inode number */
		//     unsigned long  d_off;    /* Offset to next linux_dirent */
		//     unsigned short d_reclen; /* Length of this linux_dirent */
		//     char           d_name[]; /* Filename (null-terminated) */
		//                       /* length is actually (d_reclen - 2 -
		//                          offsetof(struct linux_dirent, d_name)) */
		//     /*
		//     char           pad;      // Zero padding byte
		//     char           d_type;   // File type (only since Linux
		//                              // 2.6.4); offset is (d_reclen - 1)
		//     */
		// };
		if cb.t.Arch().Width() != 8 {
			panic(fmt.Sprintf("unsupported sizeof(unsigned long): %d", cb.t.Arch().Width()))
		}
		size := 8 + 8 + 2 + 1 + 1 + 1 + len(dirent.Name)
		// Refuse records that would overrun the space left in the user
		// buffer.
		if size > cb.remaining {
			return syserror.EINVAL
		}
		buf = cb.t.CopyScratchBuffer(size)
		usermem.ByteOrder.PutUint64(buf[0:8], dirent.Ino)
		usermem.ByteOrder.PutUint64(buf[8:16], uint64(dirent.NextOff))
		usermem.ByteOrder.PutUint16(buf[16:18], uint16(size))
		copy(buf[18:], dirent.Name)
		buf[size-3] = 0 // NUL terminator
		buf[size-2] = 0 // zero padding byte
		buf[size-1] = dirent.Type
	}
	n, err := cb.t.CopyOutBytes(cb.addr, buf)
	if err != nil {
		// Don't report partially-written dirents by advancing cb.addr or
		// cb.remaining.
		return err
	}
	cb.addr += usermem.Addr(n)
	cb.remaining -= n
	return nil
}

View File

@ -1,4 +1,4 @@
// Copyright 2019 The gVisor Authors.
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -12,16 +12,24 @@
// See the License for the specific language governing permissions and
// limitations under the License.
package fspath
package vfs2
import (
"unsafe"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserror"
)
// String returns the accumulated string. No other methods should be called
// after String.
func (b *Builder) String() string {
bs := b.buf[b.start:]
// Compare strings.Builder.String().
return *(*string)(unsafe.Pointer(&bs))
// Ioctl implements Linux syscall ioctl(2).
//
// The request is handed unmodified to the Ioctl method of the file behind the
// descriptor in args[0], together with the task's memory manager. EBADF is
// returned if the descriptor has no file.
func Ioctl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	target := t.GetFileVFS2(args[0].Int())
	if target == nil {
		return 0, nil, syserror.EBADF
	}
	defer target.DecRef()

	result, err := target.Ioctl(t, t.MemoryManager(), args)
	return result, nil, err
}

View File

@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// +build amd64
package vfs2
import (
@ -22,110 +24,142 @@ import (
// Override replaces entries of the given syscall table with the
// implementations from this package and deletes entries for syscalls that are
// not handled here.
//
// Statements run in order, so where a syscall number is first deleted and
// later assigned in this body, the assignment is what remains in the table.
func Override(table map[uintptr]kernel.Syscall) {
table[0] = syscalls.Supported("read", Read)
// Remove syscalls that haven't been converted yet. It's better to get ENOSYS
// rather than a SIGSEGV deep in the stack.
delete(table, 1) // write
delete(table, 2) // open
delete(table, 3) // close
delete(table, 4) // stat
delete(table, 5) // fstat
delete(table, 6) // lstat
delete(table, 7) // poll
delete(table, 8) // lseek
delete(table, 9) // mmap
delete(table, 16) // ioctl
delete(table, 17) // pread64
delete(table, 18) // pwrite64
delete(table, 19) // readv
delete(table, 20) // writev
delete(table, 21) // access
delete(table, 22) // pipe
delete(table, 32) // dup
delete(table, 33) // dup2
delete(table, 40) // sendfile
delete(table, 59) // execve
delete(table, 72) // fcntl
delete(table, 73) // flock
delete(table, 74) // fsync
delete(table, 75) // fdatasync
delete(table, 76) // truncate
delete(table, 77) // ftruncate
delete(table, 78) // getdents
delete(table, 79) // getcwd
delete(table, 80) // chdir
delete(table, 81) // fchdir
delete(table, 82) // rename
delete(table, 83) // mkdir
delete(table, 84) // rmdir
delete(table, 85) // creat
delete(table, 86) // link
delete(table, 87) // unlink
delete(table, 88) // symlink
delete(table, 89) // readlink
delete(table, 90) // chmod
delete(table, 91) // fchmod
delete(table, 92) // chown
delete(table, 93) // fchown
delete(table, 94) // lchown
delete(table, 133) // mknod
delete(table, 137) // statfs
delete(table, 138) // fstatfs
delete(table, 161) // chroot
delete(table, 162) // sync
table[1] = syscalls.Supported("write", Write)
table[2] = syscalls.Supported("open", Open)
table[3] = syscalls.Supported("close", Close)
table[4] = syscalls.Supported("stat", Stat)
table[5] = syscalls.Supported("fstat", Fstat)
table[6] = syscalls.Supported("lstat", Lstat)
table[7] = syscalls.Supported("poll", Poll)
table[8] = syscalls.Supported("lseek", Lseek)
table[9] = syscalls.Supported("mmap", Mmap)
table[16] = syscalls.Supported("ioctl", Ioctl)
table[17] = syscalls.Supported("pread64", Pread64)
table[18] = syscalls.Supported("pwrite64", Pwrite64)
table[19] = syscalls.Supported("readv", Readv)
table[20] = syscalls.Supported("writev", Writev)
table[21] = syscalls.Supported("access", Access)
delete(table, 22) // pipe
table[23] = syscalls.Supported("select", Select)
table[32] = syscalls.Supported("dup", Dup)
table[33] = syscalls.Supported("dup2", Dup2)
delete(table, 40) // sendfile
delete(table, 41) // socket
delete(table, 42) // connect
delete(table, 43) // accept
delete(table, 44) // sendto
delete(table, 45) // recvfrom
delete(table, 46) // sendmsg
delete(table, 47) // recvmsg
delete(table, 48) // shutdown
delete(table, 49) // bind
delete(table, 50) // listen
delete(table, 51) // getsockname
delete(table, 52) // getpeername
delete(table, 53) // socketpair
delete(table, 54) // setsockopt
delete(table, 55) // getsockopt
table[59] = syscalls.Supported("execve", Execve)
table[72] = syscalls.Supported("fcntl", Fcntl)
delete(table, 73) // flock
table[74] = syscalls.Supported("fsync", Fsync)
table[75] = syscalls.Supported("fdatasync", Fdatasync)
table[76] = syscalls.Supported("truncate", Truncate)
table[77] = syscalls.Supported("ftruncate", Ftruncate)
table[78] = syscalls.Supported("getdents", Getdents)
table[79] = syscalls.Supported("getcwd", Getcwd)
table[80] = syscalls.Supported("chdir", Chdir)
table[81] = syscalls.Supported("fchdir", Fchdir)
table[82] = syscalls.Supported("rename", Rename)
table[83] = syscalls.Supported("mkdir", Mkdir)
table[84] = syscalls.Supported("rmdir", Rmdir)
table[85] = syscalls.Supported("creat", Creat)
table[86] = syscalls.Supported("link", Link)
table[87] = syscalls.Supported("unlink", Unlink)
table[88] = syscalls.Supported("symlink", Symlink)
table[89] = syscalls.Supported("readlink", Readlink)
table[90] = syscalls.Supported("chmod", Chmod)
table[91] = syscalls.Supported("fchmod", Fchmod)
table[92] = syscalls.Supported("chown", Chown)
table[93] = syscalls.Supported("fchown", Fchown)
table[94] = syscalls.Supported("lchown", Lchown)
table[132] = syscalls.Supported("utime", Utime)
table[133] = syscalls.Supported("mknod", Mknod)
table[137] = syscalls.Supported("statfs", Statfs)
table[138] = syscalls.Supported("fstatfs", Fstatfs)
table[161] = syscalls.Supported("chroot", Chroot)
table[162] = syscalls.Supported("sync", Sync)
delete(table, 165) // mount
delete(table, 166) // umount2
delete(table, 172) // iopl
delete(table, 173) // ioperm
delete(table, 187) // readahead
delete(table, 188) // setxattr
delete(table, 189) // lsetxattr
delete(table, 190) // fsetxattr
delete(table, 191) // getxattr
delete(table, 192) // lgetxattr
delete(table, 193) // fgetxattr
table[188] = syscalls.Supported("setxattr", Setxattr)
table[189] = syscalls.Supported("lsetxattr", Lsetxattr)
table[190] = syscalls.Supported("fsetxattr", Fsetxattr)
table[191] = syscalls.Supported("getxattr", Getxattr)
table[192] = syscalls.Supported("lgetxattr", Lgetxattr)
table[193] = syscalls.Supported("fgetxattr", Fgetxattr)
table[194] = syscalls.Supported("listxattr", Listxattr)
table[195] = syscalls.Supported("llistxattr", Llistxattr)
table[196] = syscalls.Supported("flistxattr", Flistxattr)
table[197] = syscalls.Supported("removexattr", Removexattr)
table[198] = syscalls.Supported("lremovexattr", Lremovexattr)
table[199] = syscalls.Supported("fremovexattr", Fremovexattr)
delete(table, 206) // io_setup
delete(table, 207) // io_destroy
delete(table, 208) // io_getevents
delete(table, 209) // io_submit
delete(table, 210) // io_cancel
delete(table, 213) // epoll_create
delete(table, 214) // epoll_ctl_old
delete(table, 215) // epoll_wait_old
delete(table, 216) // remap_file_pages
delete(table, 217) // getdents64
delete(table, 232) // epoll_wait
delete(table, 233) // epoll_ctl
table[213] = syscalls.Supported("epoll_create", EpollCreate)
table[217] = syscalls.Supported("getdents64", Getdents64)
delete(table, 221) // fadvise64
table[232] = syscalls.Supported("epoll_wait", EpollWait)
table[233] = syscalls.Supported("epoll_ctl", EpollCtl)
table[235] = syscalls.Supported("utimes", Utimes)
delete(table, 253) // inotify_init
delete(table, 254) // inotify_add_watch
delete(table, 255) // inotify_rm_watch
delete(table, 257) // openat
delete(table, 258) // mkdirat
delete(table, 259) // mknodat
delete(table, 260) // fchownat
delete(table, 261) // futimesat
delete(table, 262) // fstatat
delete(table, 263) // unlinkat
delete(table, 264) // renameat
delete(table, 265) // linkat
delete(table, 266) // symlinkat
delete(table, 267) // readlinkat
delete(table, 268) // fchmodat
delete(table, 269) // faccessat
delete(table, 270) // pselect
delete(table, 271) // ppoll
table[257] = syscalls.Supported("openat", Openat)
table[258] = syscalls.Supported("mkdirat", Mkdirat)
table[259] = syscalls.Supported("mknodat", Mknodat)
table[260] = syscalls.Supported("fchownat", Fchownat)
// NOTE(review): the delete above labels 261 as futimesat, but it is
// registered here under the name "futimens" with the Futimens handler —
// confirm which syscall is intended for this number.
table[261] = syscalls.Supported("futimens", Futimens)
table[262] = syscalls.Supported("newfstatat", Newfstatat)
table[263] = syscalls.Supported("unlinkat", Unlinkat)
table[264] = syscalls.Supported("renameat", Renameat)
table[265] = syscalls.Supported("linkat", Linkat)
table[266] = syscalls.Supported("symlinkat", Symlinkat)
table[267] = syscalls.Supported("readlinkat", Readlinkat)
table[268] = syscalls.Supported("fchmodat", Fchmodat)
table[269] = syscalls.Supported("faccessat", Faccessat)
table[270] = syscalls.Supported("pselect", Pselect)
table[271] = syscalls.Supported("ppoll", Ppoll)
delete(table, 275) // splice
delete(table, 276) // tee
table[277] = syscalls.Supported("sync_file_range", SyncFileRange)
table[280] = syscalls.Supported("utimensat", Utimensat)
table[281] = syscalls.Supported("epoll_pwait", EpollPwait)
delete(table, 282) // signalfd
delete(table, 283) // timerfd_create
delete(table, 284) // eventfd
delete(table, 285) // fallocate
delete(table, 291) // epoll_create1
delete(table, 292) // dup3
delete(table, 286) // timerfd_settime
delete(table, 287) // timerfd_gettime
delete(table, 288) // accept4
delete(table, 289) // signalfd4
delete(table, 290) // eventfd2
table[291] = syscalls.Supported("epoll_create1", EpollCreate1)
table[292] = syscalls.Supported("dup3", Dup3)
delete(table, 293) // pipe2
delete(table, 294) // inotify_init1
delete(table, 295) // preadv
delete(table, 296) // pwritev
delete(table, 306) // syncfs
delete(table, 316) // renameat2
table[295] = syscalls.Supported("preadv", Preadv)
table[296] = syscalls.Supported("pwritev", Pwritev)
delete(table, 299) // recvmmsg
table[306] = syscalls.Supported("syncfs", Syncfs)
delete(table, 307) // sendmmsg
table[316] = syscalls.Supported("renameat2", Renameat2)
delete(table, 319) // memfd_create
delete(table, 322) // execveat
delete(table, 327) // preadv2
delete(table, 328) // pwritev2
delete(table, 332) // statx
table[322] = syscalls.Supported("execveat", Execveat)
table[327] = syscalls.Supported("preadv2", Preadv2)
table[328] = syscalls.Supported("pwritev2", Pwritev2)
table[332] = syscalls.Supported("statx", Statx)
}

View File

@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// +build arm64
package vfs2
import (

View File

@ -0,0 +1,92 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
// Mmap implements Linux syscall mmap(2).
//
// The syscall arguments are, in order: the address hint, the mapping length,
// the protection bits, the mapping flags, the file descriptor and the file
// offset. Exactly one of MAP_PRIVATE and MAP_SHARED must be set, otherwise
// EINVAL is returned. For non-anonymous mappings the FD must refer to a
// readable file (EBADF / EACCES otherwise).
func Mmap(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		prot  = args[2].Int()
		flags = args[3].Int()
		fd    = args[4].Int()
	)

	isFixed := flags&linux.MAP_FIXED != 0
	isPrivate := flags&linux.MAP_PRIVATE != 0
	isShared := flags&linux.MAP_SHARED != 0

	// MAP_PRIVATE and MAP_SHARED are mutually exclusive and one is mandatory.
	if isPrivate == isShared {
		return 0, nil, syserror.EINVAL
	}

	perms := usermem.AccessType{
		Read:    prot&linux.PROT_READ != 0,
		Write:   prot&linux.PROT_WRITE != 0,
		Execute: prot&linux.PROT_EXEC != 0,
	}
	opts := memmap.MMapOpts{
		Length:    args[1].Uint64(),
		Offset:    args[5].Uint64(),
		Addr:      args[0].Pointer(),
		Fixed:     isFixed,
		Unmap:     isFixed,
		Map32Bit:  flags&linux.MAP_32BIT != 0,
		Private:   isPrivate,
		Perms:     perms,
		MaxPerms:  usermem.AnyAccess,
		GrowsDown: flags&linux.MAP_GROWSDOWN != 0,
		Precommit: flags&linux.MAP_POPULATE != 0,
	}
	if flags&linux.MAP_LOCKED != 0 {
		opts.MLockMode = memmap.MLockEager
	}

	// Drop the mapping identity reference, if one ends up being set on opts
	// by the time this function returns.
	defer func() {
		if opts.MappingIdentity != nil {
			opts.MappingIdentity.DecRef()
		}
	}()

	if flags&linux.MAP_ANONYMOUS == 0 {
		// File-backed mapping: resolve the FD to a file reference.
		file := t.GetFileVFS2(fd)
		if file == nil {
			return 0, nil, syserror.EBADF
		}
		defer file.DecRef()

		// A mapped file must always be readable.
		if !file.IsReadable() {
			return 0, nil, syserror.EACCES
		}

		// A shared mapping of a non-writable file may never become writable.
		if isShared && !file.IsWritable() {
			opts.MaxPerms.Write = false
		}

		if err := file.ConfigureMMap(t, &opts); err != nil {
			return 0, nil, err
		}
	}

	mappedAddr, err := t.MemoryManager().MMap(t, opts)
	return uintptr(mappedAddr), nil, err
}

View File

@ -0,0 +1,94 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
// copyInPath reads a NUL-terminated string of at most linux.PATH_MAX bytes
// from the task's memory at addr and parses it into an fspath.Path. If the
// copy fails, the zero Path and the copy error are returned.
func copyInPath(t *kernel.Task, addr usermem.Addr) (fspath.Path, error) {
	raw, err := t.CopyInString(addr, linux.PATH_MAX)
	if err == nil {
		return fspath.Parse(raw), nil
	}
	return fspath.Path{}, err
}
// taskPathOperation wraps a vfs.PathOperation built by getTaskPathOperation
// together with the bookkeeping needed to drop its references in Release.
type taskPathOperation struct {
// pop is the path operation; Release always drops a reference on pop.Root.
pop vfs.PathOperation
// haveStartRef is true if a separate reference is held on pop.Start, in
// which case Release drops it as well.
haveStartRef bool
}
// getTaskPathOperation builds a taskPathOperation for resolving path on
// behalf of t.
//
// Absolute paths start at the task's root directory. Relative paths start at
// the task's working directory when dirfd is AT_FDCWD, and otherwise at the
// file referred to by dirfd. A relative path without components yields ENOENT
// unless shouldAllowEmptyPath is set; an invalid dirfd yields EBADF.
//
// On success the caller must call Release on the returned value. On failure
// no references are retained.
func getTaskPathOperation(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink) (taskPathOperation, error) {
	root := t.FSContext().RootDirectoryVFS2()
	start, ownStart := root, false

	switch {
	case path.Absolute:
		// Resolution starts at root; the reference on root covers it.
	case !path.HasComponents() && !bool(shouldAllowEmptyPath):
		root.DecRef()
		return taskPathOperation{}, syserror.ENOENT
	case dirfd == linux.AT_FDCWD:
		start, ownStart = t.FSContext().WorkingDirectoryVFS2(), true
	default:
		dirfile := t.GetFileVFS2(dirfd)
		if dirfile == nil {
			root.DecRef()
			return taskPathOperation{}, syserror.EBADF
		}
		// Take our own reference on the file's location before dropping the
		// reference on the file itself.
		start, ownStart = dirfile.VirtualDentry(), true
		start.IncRef()
		dirfile.DecRef()
	}

	return taskPathOperation{
		pop: vfs.PathOperation{
			Root:               root,
			Start:              start,
			Path:               path,
			FollowFinalSymlink: bool(shouldFollowFinalSymlink),
		},
		haveStartRef: ownStart,
	}, nil
}
// Release drops the reference held on the operation's root and, if one is
// held, the reference on its start. The start reference is marked as dropped
// afterwards.
func (tpop *taskPathOperation) Release() {
	tpop.pop.Root.DecRef()
	if !tpop.haveStartRef {
		return
	}
	tpop.pop.Start.DecRef()
	tpop.haveStartRef = false
}
// shouldAllowEmptyPath tells getTaskPathOperation whether a relative path
// with no components is acceptable (true) or must fail with ENOENT (false).
type shouldAllowEmptyPath bool
const (
// disallowEmptyPath rejects relative paths that have no components.
disallowEmptyPath shouldAllowEmptyPath = false
// allowEmptyPath accepts relative paths that have no components.
allowEmptyPath shouldAllowEmptyPath = true
)
// shouldFollowFinalSymlink is the value getTaskPathOperation stores in
// vfs.PathOperation.FollowFinalSymlink.
type shouldFollowFinalSymlink bool
const (
// nofollowFinalSymlink sets FollowFinalSymlink to false.
nofollowFinalSymlink shouldFollowFinalSymlink = false
// followFinalSymlink sets FollowFinalSymlink to true.
followFinalSymlink shouldFollowFinalSymlink = true
)

View File

@ -0,0 +1,584 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"fmt"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/limits"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
// fileCap is the maximum allowable files for poll & select. This has no
// equivalent in Linux; it exists in gVisor since allocation failure in Go is
// unrecoverable.
//
// copyInPollFDs uses it as the cap on the NumberOfFiles limit when validating
// nfds, and doSelect rejects any nfds above it.
const fileCap = 1024 * 1024
// Masks for "readable", "writable", and "exceptional" events as defined by
// select(2).
//
// doSelect uses these both to build the poll event mask for each requested FD
// and to decide which bits stay set in the returned fd sets.
const (
// selectReadEvents is analogous to the Linux kernel's
// fs/select.c:POLLIN_SET.
selectReadEvents = linux.POLLIN | linux.POLLHUP | linux.POLLERR
// selectWriteEvents is analogous to the Linux kernel's
// fs/select.c:POLLOUT_SET.
selectWriteEvents = linux.POLLOUT | linux.POLLERR
// selectExceptEvents is analogous to the Linux kernel's
// fs/select.c:POLLEX_SET.
selectExceptEvents = linux.POLLPRI
)
// pollState tracks the associated file description and waiter of a PollFD.
type pollState struct {
// file is the polled file description. It is only set by initReadiness when
// the file was registered for notifications; releaseState then unregisters
// the waiter and drops the reference.
file *vfs.FileDescription
// waiter is the entry registered with file for event notifications.
waiter waiter.Entry
}
// initReadiness computes the current ready mask for the file referred to by
// pfd.FD and stores it, restricted to the requested events, in pfd.REvents.
//
// A negative FD produces an empty REvents; an FD that does not resolve to a
// file produces POLLNVAL. When ch is non-nil, the waiter entry in state is
// registered with the file for the requested events and state keeps the file
// reference; otherwise the reference is dropped before returning.
func initReadiness(t *kernel.Task, pfd *linux.PollFD, state *pollState, ch chan struct{}) {
	if pfd.FD < 0 {
		pfd.REvents = 0
		return
	}

	file := t.GetFileVFS2(pfd.FD)
	if file == nil {
		pfd.REvents = linux.POLLNVAL
		return
	}

	wanted := waiter.EventMaskFromLinux(uint32(pfd.Events))
	if ch != nil {
		// Keep the reference in state so releaseState can unregister later.
		state.file = file
		state.waiter, _ = waiter.NewChannelEntry(ch)
		file.EventRegister(&state.waiter, wanted)
	} else {
		defer file.DecRef()
	}

	ready := file.Readiness(wanted)
	pfd.REvents = int16(ready.ToLinux()) & pfd.Events
}
// releaseState undoes initReadiness for every entry in state that holds a
// file: the waiter is unregistered and the file reference is dropped.
func releaseState(state []pollState) {
	for i := range state {
		entry := &state[i]
		if entry.file == nil {
			continue
		}
		entry.file.EventUnregister(&entry.waiter)
		entry.file.DecRef()
	}
}
// pollBlock polls the PollFDs in "pfd" with a bounded time specified in "timeout"
// when "timeout" is greater than zero.
//
// A zero timeout never blocks: readiness is sampled once and returned. A
// negative timeout blocks without a deadline.
//
// pollBlock returns the remaining timeout, which is always 0 on a timeout; and 0 or
// positive if interrupted by a signal. The second result is the number of
// entries in pfd whose REvents is non-zero. ETIMEDOUT from blocking is
// reported as a nil error with a count of 0.
func pollBlock(t *kernel.Task, pfd []linux.PollFD, timeout time.Duration) (time.Duration, uintptr, error) {
// The notification channel is only needed if we may block.
var ch chan struct{}
if timeout != 0 {
ch = make(chan struct{}, 1)
}
// Register for event notification in the files involved if we may
// block (timeout not zero). Once we find a file that has a non-zero
// result, we stop registering for events but still go through all files
// to get their ready masks.
state := make([]pollState, len(pfd))
defer releaseState(state)
n := uintptr(0)
for i := range pfd {
initReadiness(t, &pfd[i], &state[i], ch)
if pfd[i].REvents != 0 {
n++
// Passing a nil channel from now on makes initReadiness skip
// registration for the remaining files.
ch = nil
}
}
if timeout == 0 {
return timeout, n, nil
}
// A negative timeout means "block without a deadline".
haveTimeout := timeout >= 0
// The loop is only entered if nothing was ready in the initial scan, in
// which case ch is still the channel every file was registered with.
for n == 0 {
var err error
// Wait for a notification.
timeout, err = t.BlockWithTimeout(ch, haveTimeout, timeout)
if err != nil {
// Expiry of the timeout is not an error for the caller.
if err == syserror.ETIMEDOUT {
err = nil
}
return timeout, 0, err
}
// We got notified, count how many files are ready. If none,
// then this was a spurious notification, and we just go back
// to sleep with the remaining timeout.
for i := range state {
if state[i].file == nil {
continue
}
r := state[i].file.Readiness(waiter.EventMaskFromLinux(uint32(pfd[i].Events)))
rl := int16(r.ToLinux()) & pfd[i].Events
if rl != 0 {
pfd[i].REvents = rl
n++
}
}
}
return timeout, n, nil
}
// copyInPollFDs copies in an array of nfds struct pollfd from addr.
//
// EINVAL is returned if nfds exceeds the task's NumberOfFiles limit (capped
// at fileCap). For nfds == 0 nothing is copied and an empty slice is
// returned.
func copyInPollFDs(t *kernel.Task, addr usermem.Addr, nfds uint) ([]linux.PollFD, error) {
	maxFDs := t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap)
	if uint64(nfds) > maxFDs {
		return nil, syserror.EINVAL
	}

	pfd := make([]linux.PollFD, nfds)
	if nfds == 0 {
		return pfd, nil
	}
	if _, err := t.CopyIn(addr, &pfd); err != nil {
		return nil, err
	}
	return pfd, nil
}
// doPoll copies in nfds pollfd entries from addr, polls them via pollBlock
// with the given timeout, and copies the entries back out on success.
//
// It returns the remaining timeout, the number of ready entries and an error;
// an interrupted wait is reported as EINTR.
func doPoll(t *kernel.Task, addr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) {
	pfd, err := copyInPollFDs(t, addr, nfds)
	if err != nil {
		return timeout, 0, err
	}

	// Compatibility warning: Linux adds POLLHUP and POLLERR just before
	// polling, in fs/select.c:do_pollfd(). Since pfd is copied out after
	// polling, changing event masks here is an application-visible difference.
	// (Linux also doesn't copy out event masks at all, only revents.)
	for idx := range pfd {
		entry := &pfd[idx]
		entry.Events |= linux.POLLHUP | linux.POLLERR
	}

	remainingTimeout, n, err := pollBlock(t, pfd, timeout)
	err = syserror.ConvertIntr(err, syserror.EINTR)
	if err != nil || nfds == 0 {
		return remainingTimeout, n, err
	}

	// The poll entries are copied out regardless of whether any are set or
	// not. This aligns with the Linux behavior.
	if _, copyErr := t.CopyOut(addr, pfd); copyErr != nil {
		return remainingTimeout, 0, copyErr
	}
	return remainingTimeout, n, nil
}
// CopyInFDSet copies an fd set from select(2)/pselect(2).
//
// The returned slice always has nBytes bytes. If addr is 0 it is all zeroes;
// otherwise it holds the bytes copied from addr, with the bits above
// nBitsInLastPartialByte in the final byte cleared when that count is
// non-zero.
func CopyInFDSet(t *kernel.Task, addr usermem.Addr, nBytes, nBitsInLastPartialByte int) ([]byte, error) {
	set := make([]byte, nBytes)
	if addr == 0 {
		return set, nil
	}
	if _, err := t.CopyIn(addr, &set); err != nil {
		return nil, err
	}
	if nBitsInLastPartialByte == 0 {
		return set, nil
	}
	// Only part of the last byte is in use; mask out the extraneous bits.
	//
	// N.B. This only works on little-endian architectures.
	set[nBytes-1] &^= byte(0xff) << nBitsInLastPartialByte
	return set, nil
}
// doSelect implements the common part of select(2) and pselect(2).
//
// It copies in the three fd sets (a zero address is treated as an empty set),
// converts every requested FD into a PollFD, polls them with pollBlock, and
// then writes the fd sets back with only the bits of FDs that became ready
// left set. The return value is the total number of bits left set across the
// three sets.
//
// EINVAL is returned if nfds is negative or exceeds fileCap, and EBADF if any
// requested FD is not open. An interrupted wait is reported as EINTR.
func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
if nfds < 0 || nfds > fileCap {
return 0, syserror.EINVAL
}
// Calculate the size of the fd sets (one bit per fd).
nBytes := (nfds + 7) / 8
nBitsInLastPartialByte := nfds % 8
// Capture all the provided input vectors.
r, err := CopyInFDSet(t, readFDs, nBytes, nBitsInLastPartialByte)
if err != nil {
return 0, err
}
w, err := CopyInFDSet(t, writeFDs, nBytes, nBitsInLastPartialByte)
if err != nil {
return 0, err
}
e, err := CopyInFDSet(t, exceptFDs, nBytes, nBitsInLastPartialByte)
if err != nil {
return 0, err
}
// Count how many FDs are actually being requested so that we can build
// a PollFD array.
fdCount := 0
for i := 0; i < nBytes; i++ {
v := r[i] | w[i] | e[i]
// Each iteration clears the lowest set bit of v.
for v != 0 {
v &= (v - 1)
fdCount++
}
}
// Build the PollFD array.
pfd := make([]linux.PollFD, 0, fdCount)
// fd is the descriptor number corresponding to bit j of byte i below.
var fd int32
for i := 0; i < nBytes; i++ {
rV, wV, eV := r[i], w[i], e[i]
v := rV | wV | eV
m := byte(1)
for j := 0; j < 8; j++ {
if (v & m) != 0 {
// Make sure the fd is valid and decrement the reference
// immediately to ensure we don't leak. Note, another thread
// might be about to close fd. This is racy, but that's
// OK. Linux is racy in the same way.
file := t.GetFileVFS2(fd)
if file == nil {
return 0, syserror.EBADF
}
file.DecRef()
// Translate set membership into poll event bits.
var mask int16
if (rV & m) != 0 {
mask |= selectReadEvents
}
if (wV & m) != 0 {
mask |= selectWriteEvents
}
if (eV & m) != 0 {
mask |= selectExceptEvents
}
pfd = append(pfd, linux.PollFD{
FD: fd,
Events: mask,
})
}
fd++
m <<= 1
}
}
// Do the syscall, then count the number of bits set.
if _, _, err = pollBlock(t, pfd, timeout); err != nil {
return 0, syserror.ConvertIntr(err, syserror.EINTR)
}
// r, w, and e are currently event mask bitsets; unset bits corresponding
// to events that *didn't* occur.
bitSetCount := uintptr(0)
for idx := range pfd {
events := pfd[idx].REvents
// i is the byte index and j the bit index of this FD within the sets.
i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8)
m := byte(1) << j
if r[i]&m != 0 {
if (events & selectReadEvents) != 0 {
bitSetCount++
} else {
r[i] &^= m
}
}
if w[i]&m != 0 {
if (events & selectWriteEvents) != 0 {
bitSetCount++
} else {
w[i] &^= m
}
}
if e[i]&m != 0 {
if (events & selectExceptEvents) != 0 {
bitSetCount++
} else {
e[i] &^= m
}
}
}
// Copy updated vectors back.
if readFDs != 0 {
if _, err := t.CopyOut(readFDs, r); err != nil {
return 0, err
}
}
if writeFDs != 0 {
if _, err := t.CopyOut(writeFDs, w); err != nil {
return 0, err
}
}
if exceptFDs != 0 {
if _, err := t.CopyOut(exceptFDs, e); err != nil {
return 0, err
}
}
return bitSetCount, nil
}
// timeoutRemaining reports how much of timeout is left given that the wait
// began at startNs, or 0 once the timeout has fully elapsed.
//
// startNs must be from CLOCK_MONOTONIC.
func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration {
	elapsed := t.Kernel().MonotonicClock().Now().Sub(startNs)
	if left := timeout - elapsed; left > 0 {
		return left
	}
	return 0
}
// copyOutTimespecRemaining writes the time left in timeout to timespecAddr as
// a Timespec. Nothing is written, and nil is returned, when timeout is zero
// or negative.
//
// startNs must be from CLOCK_MONOTONIC.
func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error {
	if timeout > 0 {
		left := timeoutRemaining(t, startNs, timeout)
		ts := linux.NsecToTimespec(left.Nanoseconds())
		return ts.CopyOut(t, timespecAddr)
	}
	return nil
}
// copyOutTimevalRemaining writes the time left in timeout to timevalAddr as a
// Timeval. Nothing is written, and nil is returned, when timeout is zero or
// negative.
//
// startNs must be from CLOCK_MONOTONIC.
func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error {
	if timeout > 0 {
		left := timeoutRemaining(t, startNs, timeout)
		tv := linux.NsecToTimeval(left.Nanoseconds())
		return tv.CopyOut(t, timevalAddr)
	}
	return nil
}
// pollRestartBlock encapsulates the state required to restart poll(2) via
// restart_syscall(2).
//
// +stateify savable
type pollRestartBlock struct {
// pfdAddr is the user address of the pollfd array.
pfdAddr usermem.Addr
// nfds is the number of entries in the pollfd array.
nfds uint
// timeout is the timeout to use when the call is restarted; poll stores
// the remaining timeout here after an interruption.
timeout time.Duration
}
// Restart implements kernel.SyscallRestartBlock.Restart.
//
// It re-issues poll with the saved pollfd address, entry count and timeout.
func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
return poll(t, p.pfdAddr, p.nfds, p.timeout)
}
// poll runs doPoll and, if it was interrupted (EINTR), installs a
// pollRestartBlock carrying the remaining timeout and returns
// ERESTART_RESTARTBLOCK so the call can be restarted. Any other outcome is
// returned unchanged.
func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) {
	remaining, n, err := doPoll(t, pfdAddr, nfds, timeout)
	if err != syserror.EINTR {
		return n, err
	}

	// On an interrupt poll(2) is restarted with the remaining timeout.
	t.SetSyscallRestartBlock(&pollRestartBlock{
		pfdAddr: pfdAddr,
		nfds:    nfds,
		timeout: remaining,
	})
	return 0, kernel.ERESTART_RESTARTBLOCK
}
// Poll implements linux syscall poll(2).
//
// The arguments are the pollfd array address, the number of entries and the
// timeout in milliseconds.
func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		pfdAddr   = args[0].Pointer()
		nfds      = uint(args[1].Uint()) // poll(2) uses unsigned long.
		timeoutMs = args[2].Int()
	)
	n, err := poll(t, pfdAddr, nfds, time.Duration(timeoutMs)*time.Millisecond)
	return n, nil, err
}
// Ppoll implements linux syscall ppoll(2).
//
// The arguments are the pollfd array address, the number of entries, the
// address of a timespec timeout (0 for none), the address of a signal mask to
// install for the duration of the call (0 for none) and the size of that
// mask.
func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		pfdAddr      = args[0].Pointer()
		nfds         = uint(args[1].Uint()) // poll(2) uses unsigned long.
		timespecAddr = args[2].Pointer()
		maskAddr     = args[3].Pointer()
		maskSize     = uint(args[4].Uint())
	)

	timeout, err := copyTimespecInToDuration(t, timespecAddr)
	if err != nil {
		return 0, nil, err
	}

	// The start time is only needed when there is a positive timeout to
	// report the remainder of.
	var startNs ktime.Time
	if timeout > 0 {
		startNs = t.Kernel().MonotonicClock().Now()
	}

	if maskErr := setTempSignalSet(t, maskAddr, maskSize); maskErr != nil {
		return 0, nil, maskErr
	}

	_, n, pollErr := doPoll(t, pfdAddr, nfds, timeout)
	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)

	// doPoll returns EINTR if interrupted, but ppoll is normally restartable
	// if interrupted by something other than a signal handled by the
	// application (i.e. returns ERESTARTNOHAND). However, if
	// copyOutTimespecRemaining failed, then the restarted ppoll would use the
	// wrong timeout, so the error should be left as EINTR.
	//
	// Note that this means that if pollErr is nil but copyErr is not, copyErr
	// is ignored. This is consistent with Linux.
	if pollErr == syserror.EINTR && copyErr == nil {
		return n, nil, kernel.ERESTARTNOHAND
	}
	return n, nil, pollErr
}
// Select implements linux syscall select(2).
//
// The arguments are nfds, the addresses of the read, write and except fd
// sets, and the address of a timeval timeout (0 for no timeout). A timeval
// with a negative field is rejected with EINVAL.
func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		nfds        = int(args[0].Int()) // select(2) uses an int.
		readFDs     = args[1].Pointer()
		writeFDs    = args[2].Pointer()
		exceptFDs   = args[3].Pointer()
		timevalAddr = args[4].Pointer()
	)

	// Use a negative Duration to indicate "no timeout".
	timeout := time.Duration(-1)
	if timevalAddr != 0 {
		var tv linux.Timeval
		if err := tv.CopyIn(t, timevalAddr); err != nil {
			return 0, nil, err
		}
		if tv.Sec < 0 || tv.Usec < 0 {
			return 0, nil, syserror.EINVAL
		}
		timeout = time.Duration(tv.ToNsecCapped())
	}

	startNs := t.Kernel().MonotonicClock().Now()
	n, selErr := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
	copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)

	// An interrupted select is restartable only if the remaining timeout was
	// written back successfully; otherwise the error stays EINTR.
	if selErr == syserror.EINTR && copyErr == nil {
		return n, nil, kernel.ERESTARTNOHAND
	}
	return n, nil, selErr
}
// Pselect implements linux syscall pselect(2).
//
// The arguments are nfds, the addresses of the read, write and except fd
// sets, the address of a timespec timeout (0 for none), and the address of a
// structure holding a signal mask pointer and its size (0 for none).
func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		nfds             = int(args[0].Int()) // select(2) uses an int.
		readFDs          = args[1].Pointer()
		writeFDs         = args[2].Pointer()
		exceptFDs        = args[3].Pointer()
		timespecAddr     = args[4].Pointer()
		maskWithSizeAddr = args[5].Pointer()
	)

	timeout, err := copyTimespecInToDuration(t, timespecAddr)
	if err != nil {
		return 0, nil, err
	}

	// The start time is only needed when there is a positive timeout to
	// report the remainder of.
	var startNs ktime.Time
	if timeout > 0 {
		startNs = t.Kernel().MonotonicClock().Now()
	}

	if maskWithSizeAddr != 0 {
		// sigSetWithSize is laid out for 8-byte pointers only.
		if width := t.Arch().Width(); width != 8 {
			panic(fmt.Sprintf("unsupported sizeof(void*): %d", t.Arch().Width()))
		}
		var sigInfo sigSetWithSize
		if copyErr := sigInfo.CopyIn(t, maskWithSizeAddr); copyErr != nil {
			return 0, nil, copyErr
		}
		if maskErr := setTempSignalSet(t, usermem.Addr(sigInfo.sigsetAddr), uint(sigInfo.sizeofSigset)); maskErr != nil {
			return 0, nil, maskErr
		}
	}

	n, selErr := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)

	// An interrupted pselect is restartable only if the remaining timeout was
	// written back successfully; otherwise the error stays EINTR.
	if selErr == syserror.EINTR && copyErr == nil {
		return n, nil, kernel.ERESTARTNOHAND
	}
	return n, nil, selErr
}
// sigSetWithSize is the structure Pselect reads from its final argument. It
// carries the signal mask pointer and size that are handed to
// setTempSignalSet.
//
// +marshal
type sigSetWithSize struct {
// sigsetAddr is the user address of the signal mask.
sigsetAddr uint64
// sizeofSigset is the size of the signal mask in bytes.
sizeofSigset uint64
}
// copyTimespecInToDuration reads a Timespec from the untrusted application
// address timespecAddr, validates it and converts it to a Duration.
//
// A Timespec too large to be represented as a Duration is converted to the
// largest value Duration allows. An invalid Timespec yields EINVAL.
//
// If timespecAddr is NULL, the returned value is negative, which callers use
// to mean "no timeout".
func copyTimespecInToDuration(t *kernel.Task, timespecAddr usermem.Addr) (time.Duration, error) {
	if timespecAddr == 0 {
		return time.Duration(-1), nil
	}

	var ts linux.Timespec
	if err := ts.CopyIn(t, timespecAddr); err != nil {
		return 0, err
	}
	if !ts.Valid() {
		return 0, syserror.EINVAL
	}
	return time.Duration(ts.ToNsecCapped()), nil
}
// setTempSignalSet installs the signal mask stored at maskAddr as the task's
// signal mask and records the previous mask as the task's saved signal mask.
//
// A zero maskAddr is a no-op. maskSize must equal linux.SignalSetSize,
// otherwise EINVAL is returned. Unblockable signals are removed from the mask
// before it is installed.
func setTempSignalSet(t *kernel.Task, maskAddr usermem.Addr, maskSize uint) error {
	switch {
	case maskAddr == 0:
		return nil
	case maskSize != linux.SignalSetSize:
		return syserror.EINVAL
	}

	var requested linux.SignalSet
	if err := requested.CopyIn(t, maskAddr); err != nil {
		return err
	}
	requested &^= kernel.UnblockableSignals

	previous := t.SignalMask()
	t.SetSignalMask(requested)
	t.SetSavedSignalMask(previous)
	return nil
}

View File

@ -0,0 +1,511 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
slinux "gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
// Event masks that the blocking I/O helpers register for while waiting on a
// file that reported "would block".
const (
// eventMaskRead is the mask read and pread wait on.
eventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
// eventMaskWrite is the mask write and pwrite wait on.
eventMaskWrite = waiter.EventOut | waiter.EventHUp | waiter.EventErr
)
// Read implements Linux syscall read(2).
//
// The arguments are the file descriptor, the destination buffer address and
// the buffer size. EBADF is returned for an FD that is not open and EINVAL
// for a size that is negative when converted to int.
func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd   = args[0].Int()
		addr = args[1].Pointer()
		size = args[2].SizeT()
	)

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Reject sizes that do not fit in a non-negative int.
	length := int(size)
	if length < 0 {
		return 0, nil, syserror.EINVAL
	}

	// Describe the destination buffer of the read.
	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
	dst, err := t.SingleIOSequence(addr, length, ioOpts)
	if err != nil {
		return 0, nil, err
	}

	nread, err := read(t, file, dst, vfs.ReadOptions{})
	t.IOUsage().AccountReadSyscall(nread)
	partial := nread != 0
	return uintptr(nread), nil, slinux.HandleIOErrorVFS2(t, partial, err, kernel.ERESTARTSYS, "read", file)
}
// Readv implements Linux syscall readv(2).
//
// The arguments are the file descriptor, the address of the iovec array and
// the number of iovecs. EBADF is returned for an FD that is not open.
func Readv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd     = args[0].Int()
		addr   = args[1].Pointer()
		iovcnt = int(args[2].Int())
	)

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Describe the destination buffers of the read.
	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
	dst, err := t.IovecsIOSequence(addr, iovcnt, ioOpts)
	if err != nil {
		return 0, nil, err
	}

	nread, err := read(t, file, dst, vfs.ReadOptions{})
	t.IOUsage().AccountReadSyscall(nread)
	partial := nread != 0
	return uintptr(nread), nil, slinux.HandleIOErrorVFS2(t, partial, err, kernel.ERESTARTSYS, "readv", file)
}
// read reads from file into dst, blocking if necessary.
//
// If the first attempt does not report "would block", or the file has
// O_NONBLOCK set, its result is returned directly. Otherwise read registers
// for eventMaskRead notifications and retries until an attempt completes with
// anything other than "would block" or the task's wait is interrupted.
//
// It returns the total number of bytes read across all attempts, together
// with the error from the last attempt or from the interrupted wait.
func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	n, err := file.Read(t, dst, opts)
	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
		return n, err
	}

	// Register for notifications.
	w, ch := waiter.NewChannelEntry(nil)
	file.EventRegister(&w, eventMaskRead)

	total := n
	for {
		// Shorten dst to reflect bytes previously read.
		dst = dst.DropFirst(int(n))

		// Issue the request and break out if it completes with anything other
		// than "would block". n and err are assigned, not redeclared, so that
		// the next DropFirst and the final return see this attempt's results.
		n, err = file.Read(t, dst, opts)
		total += n
		if err != syserror.ErrWouldBlock {
			break
		}
		// Likewise, an interrupted wait must reach the caller.
		if err = t.Block(ch); err != nil {
			break
		}
	}
	file.EventUnregister(&w)

	return total, err
}
// Pread64 implements Linux syscall pread64(2).
//
// The arguments are the file descriptor, the destination buffer address, the
// buffer size and the file offset. EBADF is returned for an FD that is not
// open; EINVAL for a negative offset or a size that is negative when
// converted to int.
func Pread64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd     = args[0].Int()
		addr   = args[1].Pointer()
		size   = args[2].SizeT()
		offset = args[3].Int64()
	)

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Negative offsets are invalid.
	if offset < 0 {
		return 0, nil, syserror.EINVAL
	}

	// Reject sizes that do not fit in a non-negative int.
	length := int(size)
	if length < 0 {
		return 0, nil, syserror.EINVAL
	}

	// Describe the destination buffer of the read.
	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
	dst, err := t.SingleIOSequence(addr, length, ioOpts)
	if err != nil {
		return 0, nil, err
	}

	nread, err := pread(t, file, dst, offset, vfs.ReadOptions{})
	t.IOUsage().AccountReadSyscall(nread)
	partial := nread != 0
	return uintptr(nread), nil, slinux.HandleIOErrorVFS2(t, partial, err, kernel.ERESTARTSYS, "pread64", file)
}
// Preadv implements Linux syscall preadv(2).
//
// The arguments are the file descriptor, the address of the iovec array, the
// number of iovecs and the file offset. EBADF is returned for an FD that is
// not open and EINVAL for a negative offset.
func Preadv(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd     = args[0].Int()
		addr   = args[1].Pointer()
		iovcnt = int(args[2].Int())
		offset = args[3].Int64()
	)

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Negative offsets are invalid.
	if offset < 0 {
		return 0, nil, syserror.EINVAL
	}

	// Describe the destination buffers of the read.
	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
	dst, err := t.IovecsIOSequence(addr, iovcnt, ioOpts)
	if err != nil {
		return 0, nil, err
	}

	nread, err := pread(t, file, dst, offset, vfs.ReadOptions{})
	t.IOUsage().AccountReadSyscall(nread)
	partial := nread != 0
	return uintptr(nread), nil, slinux.HandleIOErrorVFS2(t, partial, err, kernel.ERESTARTSYS, "preadv", file)
}
// Preadv2 implements Linux syscall preadv2(2).
//
// An offset of -1 reads at the file's current position (like readv); any
// other non-negative offset reads at that offset (like preadv). Offsets below
// -1 yield EINVAL and an FD that is not open yields EBADF. The flags argument
// is forwarded in vfs.ReadOptions.
func Preadv2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// While the glibc signature is
	// preadv2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
	// the actual syscall
	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1142)
	// splits the offset argument into a high/low value for compatibility with
	// 32-bit architectures. The flags argument is the 6th argument (index 5).
	var (
		fd     = args[0].Int()
		addr   = args[1].Pointer()
		iovcnt = int(args[2].Int())
		offset = args[3].Int64()
		flags  = args[5].Int()
	)

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// -1 is the only negative offset that is accepted.
	if offset < -1 {
		return 0, nil, syserror.EINVAL
	}

	// Describe the destination buffers of the read.
	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
	dst, err := t.IovecsIOSequence(addr, iovcnt, ioOpts)
	if err != nil {
		return 0, nil, err
	}

	readOpts := vfs.ReadOptions{Flags: uint32(flags)}
	var nread int64
	if offset != -1 {
		nread, err = pread(t, file, dst, offset, readOpts)
	} else {
		nread, err = read(t, file, dst, readOpts)
	}
	t.IOUsage().AccountReadSyscall(nread)
	partial := nread != 0
	return uintptr(nread), nil, slinux.HandleIOErrorVFS2(t, partial, err, kernel.ERESTARTSYS, "preadv2", file)
}
// pread reads from file at the given offset into dst, blocking if necessary.
//
// If the first attempt does not report "would block", or the file has
// O_NONBLOCK set, its result is returned directly. Otherwise pread registers
// for eventMaskRead notifications and retries, advancing the offset by the
// bytes read so far, until an attempt completes with anything other than
// "would block" or the task's wait is interrupted.
//
// It returns the total number of bytes read across all attempts, together
// with the error from the last attempt or from the interrupted wait.
func pread(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	n, err := file.PRead(t, dst, offset, opts)
	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
		return n, err
	}

	// Register for notifications.
	w, ch := waiter.NewChannelEntry(nil)
	file.EventRegister(&w, eventMaskRead)

	total := n
	for {
		// Shorten dst to reflect bytes previously read.
		dst = dst.DropFirst(int(n))

		// Issue the request and break out if it completes with anything other
		// than "would block". n and err are assigned, not redeclared, so that
		// the next DropFirst and the final return see this attempt's results.
		n, err = file.PRead(t, dst, offset+total, opts)
		total += n
		if err != syserror.ErrWouldBlock {
			break
		}
		// Likewise, an interrupted wait must reach the caller.
		if err = t.Block(ch); err != nil {
			break
		}
	}
	file.EventUnregister(&w)

	return total, err
}
// Write implements Linux syscall write(2).
//
// The arguments are the file descriptor, the source buffer address and the
// buffer size. EBADF is returned for an FD that is not open and EINVAL for a
// size that is negative when converted to int.
func Write(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd   = args[0].Int()
		addr = args[1].Pointer()
		size = args[2].SizeT()
	)

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Reject sizes that do not fit in a non-negative int.
	length := int(size)
	if length < 0 {
		return 0, nil, syserror.EINVAL
	}

	// Describe the source buffer of the write.
	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
	src, err := t.SingleIOSequence(addr, length, ioOpts)
	if err != nil {
		return 0, nil, err
	}

	nwritten, err := write(t, file, src, vfs.WriteOptions{})
	t.IOUsage().AccountWriteSyscall(nwritten)
	partial := nwritten != 0
	return uintptr(nwritten), nil, slinux.HandleIOErrorVFS2(t, partial, err, kernel.ERESTARTSYS, "write", file)
}
// Writev implements Linux syscall writev(2).
//
// The arguments are the file descriptor, the address of the iovec array and
// the number of iovecs. EBADF is returned for an FD that is not open.
func Writev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd     = args[0].Int()
		addr   = args[1].Pointer()
		iovcnt = int(args[2].Int())
	)

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Describe the source buffers of the write.
	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
	src, err := t.IovecsIOSequence(addr, iovcnt, ioOpts)
	if err != nil {
		return 0, nil, err
	}

	nwritten, err := write(t, file, src, vfs.WriteOptions{})
	t.IOUsage().AccountWriteSyscall(nwritten)
	partial := nwritten != 0
	return uintptr(nwritten), nil, slinux.HandleIOErrorVFS2(t, partial, err, kernel.ERESTARTSYS, "writev", file)
}
// write writes src to file, blocking if necessary.
//
// If the first attempt does not report "would block", or the file has
// O_NONBLOCK set, its result is returned directly. Otherwise write registers
// for eventMaskWrite notifications and retries until an attempt completes
// with anything other than "would block" or the task's wait is interrupted.
//
// It returns the total number of bytes written across all attempts, together
// with the error from the last attempt or from the interrupted wait.
func write(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	n, err := file.Write(t, src, opts)
	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
		return n, err
	}

	// Register for notifications.
	w, ch := waiter.NewChannelEntry(nil)
	file.EventRegister(&w, eventMaskWrite)

	total := n
	for {
		// Shorten src to reflect bytes previously written.
		src = src.DropFirst(int(n))

		// Issue the request and break out if it completes with anything other
		// than "would block". n and err are assigned, not redeclared, so that
		// the next DropFirst and the final return see this attempt's results.
		n, err = file.Write(t, src, opts)
		total += n
		if err != syserror.ErrWouldBlock {
			break
		}
		// Likewise, an interrupted wait must reach the caller.
		if err = t.Block(ch); err != nil {
			break
		}
	}
	file.EventUnregister(&w)

	return total, err
}
// Pwrite64 implements Linux syscall pwrite64(2).
//
// The arguments are the file descriptor, the source buffer address, the
// buffer size and the file offset. EBADF is returned for an FD that is not
// open; EINVAL for a negative offset or a size that is negative when
// converted to int.
func Pwrite64(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd     = args[0].Int()
		addr   = args[1].Pointer()
		size   = args[2].SizeT()
		offset = args[3].Int64()
	)

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Negative offsets are invalid.
	if offset < 0 {
		return 0, nil, syserror.EINVAL
	}

	// Reject sizes that do not fit in a non-negative int.
	length := int(size)
	if length < 0 {
		return 0, nil, syserror.EINVAL
	}

	// Describe the source buffer of the write.
	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
	src, err := t.SingleIOSequence(addr, length, ioOpts)
	if err != nil {
		return 0, nil, err
	}

	nwritten, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
	t.IOUsage().AccountWriteSyscall(nwritten)
	partial := nwritten != 0
	return uintptr(nwritten), nil, slinux.HandleIOErrorVFS2(t, partial, err, kernel.ERESTARTSYS, "pwrite64", file)
}
// Pwritev implements Linux syscall pwritev(2).
//
// The arguments are the file descriptor, the address of the iovec array, the
// number of iovecs and the file offset. EBADF is returned for an FD that is
// not open and EINVAL for a negative offset.
func Pwritev(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd := args[0].Int()
	addr := args[1].Pointer()
	iovcnt := int(args[2].Int())
	offset := args[3].Int64()

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// Check that the offset is legitimate.
	if offset < 0 {
		return 0, nil, syserror.EINVAL
	}

	// Get the source of the write.
	src, err := t.IovecsIOSequence(addr, iovcnt, usermem.IOOpts{
		AddressSpaceActive: true,
	})
	if err != nil {
		return 0, nil, err
	}

	n, err := pwrite(t, file, src, offset, vfs.WriteOptions{})
	// This is a write, so it is charged to the write-syscall counters like
	// every other write path in this file.
	t.IOUsage().AccountWriteSyscall(n)
	return uintptr(n), nil, slinux.HandleIOErrorVFS2(t, n != 0, err, kernel.ERESTARTSYS, "pwritev", file)
}
// Pwritev2 implements Linux syscall pwritev2(2).
//
// An offset of -1 writes at the file's current position (like writev); any
// other non-negative offset writes at that offset (like pwritev). Offsets
// below -1 yield EINVAL and an FD that is not open yields EBADF. The flags
// argument is forwarded in vfs.WriteOptions.
func Pwritev2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// While the glibc signature is
	// pwritev2(int fd, struct iovec* iov, int iov_cnt, off_t offset, int flags)
	// the actual syscall
	// (https://elixir.bootlin.com/linux/v5.5/source/fs/read_write.c#L1162)
	// splits the offset argument into a high/low value for compatibility with
	// 32-bit architectures. The flags argument is the 6th argument (index 5).
	var (
		fd     = args[0].Int()
		addr   = args[1].Pointer()
		iovcnt = int(args[2].Int())
		offset = args[3].Int64()
		flags  = args[5].Int()
	)

	file := t.GetFileVFS2(fd)
	if file == nil {
		return 0, nil, syserror.EBADF
	}
	defer file.DecRef()

	// -1 is the only negative offset that is accepted.
	if offset < -1 {
		return 0, nil, syserror.EINVAL
	}

	// Describe the source buffers of the write.
	ioOpts := usermem.IOOpts{AddressSpaceActive: true}
	src, err := t.IovecsIOSequence(addr, iovcnt, ioOpts)
	if err != nil {
		return 0, nil, err
	}

	writeOpts := vfs.WriteOptions{Flags: uint32(flags)}
	var nwritten int64
	if offset != -1 {
		nwritten, err = pwrite(t, file, src, offset, writeOpts)
	} else {
		nwritten, err = write(t, file, src, writeOpts)
	}
	t.IOUsage().AccountWriteSyscall(nwritten)
	partial := nwritten != 0
	return uintptr(nwritten), nil, slinux.HandleIOErrorVFS2(t, partial, err, kernel.ERESTARTSYS, "pwritev2", file)
}
// pwrite writes src to file starting at offset, retrying while the file
// reports syserror.ErrWouldBlock.
//
// If the first attempt fails with anything other than ErrWouldBlock, or the
// file has O_NONBLOCK set, the result of that attempt is returned as is.
// Otherwise the task waits on eventMaskWrite and retries until PWrite returns
// a different result or blocking fails. The returned count is the total
// number of bytes written across all attempts.
func pwrite(t *kernel.Task, file *vfs.FileDescription, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	n, err := file.PWrite(t, src, offset, opts)
	if err != syserror.ErrWouldBlock || file.StatusFlags()&linux.O_NONBLOCK != 0 {
		return n, err
	}

	// Register for notifications.
	w, ch := waiter.NewChannelEntry(nil)
	file.EventRegister(&w, eventMaskWrite)

	total := n
	for {
		// Shorten src to reflect bytes written by the previous attempt.
		src = src.DropFirst(int(n))

		// Issue the request and break out if it completes with anything other
		// than "would block". This must assign to the outer n and err: a
		// short variable declaration here would leave the outer n stale (so
		// the wrong number of bytes is dropped above) and would make the
		// function always return the initial ErrWouldBlock.
		n, err = file.PWrite(t, src, offset+total, opts)
		total += n
		if err != syserror.ErrWouldBlock {
			break
		}

		// Wait for a notification that we should retry; if blocking fails,
		// report that error to the caller instead of ErrWouldBlock.
		if err = t.Block(ch); err != nil {
			break
		}
	}
	file.EventUnregister(&w)

	return total, err
}
// Lseek implements Linux syscall lseek(2).
//
// Arguments are (fd, offset, whence); the new offset reported by the file's
// Seek method is returned. An unknown fd yields EBADF.
func Lseek(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fdesc := t.GetFileVFS2(args[0].Int())
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	defer fdesc.DecRef()

	pos, err := fdesc.Seek(t, args[1].Int64(), args[2].Int())
	return uintptr(pos), nil, err
}

View File

@ -0,0 +1,380 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
// chmodMask is the set of mode bits that fchmodat and Fchmod keep from the
// user-supplied mode: the permission bits (0777) plus the set-user-ID,
// set-group-ID and sticky bits.
const chmodMask = 0777 | linux.S_ISUID | linux.S_ISGID | linux.S_ISVTX
// Chmod implements Linux syscall chmod(2).
//
// It is fchmodat with the directory fd fixed to AT_FDCWD.
func Chmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := fchmodat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].ModeT())
	return 0, nil, err
}
// Fchmodat implements Linux syscall fchmodat(2).
//
// Arguments are (dirfd, path address, mode).
func Fchmodat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := fchmodat(t, args[0].Int(), args[1].Pointer(), args[2].ModeT())
	return 0, nil, err
}
// fchmodat copies in the path at pathAddr and sets the mode of the file it
// names (relative to dirfd) to mode masked by chmodMask. Empty paths are not
// allowed and a final symlink is followed.
func fchmodat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, mode uint) error {
	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return err
	}

	var opts vfs.SetStatOptions
	opts.Stat.Mask = linux.STATX_MODE
	opts.Stat.Mode = uint16(mode & chmodMask)
	return setstatat(t, dirfd, path, disallowEmptyPath, followFinalSymlink, &opts)
}
// Fchmod implements Linux syscall fchmod(2).
//
// Arguments are (fd, mode). The mode is masked by chmodMask before being
// applied through the file description's SetStat. An unknown fd yields EBADF.
func Fchmod(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fdesc := t.GetFileVFS2(args[0].Int())
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	defer fdesc.DecRef()

	var opts vfs.SetStatOptions
	opts.Stat.Mask = linux.STATX_MODE
	opts.Stat.Mode = uint16(args[1].ModeT() & chmodMask)
	return 0, nil, fdesc.SetStat(t, opts)
}
// Chown implements Linux syscall chown(2).
//
// It is fchownat relative to AT_FDCWD with no flags.
func Chown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	uid, gid := args[1].Int(), args[2].Int()
	err := fchownat(t, linux.AT_FDCWD, args[0].Pointer(), uid, gid, 0 /* flags */)
	return 0, nil, err
}
// Lchown implements Linux syscall lchown(2).
//
// It is fchownat relative to AT_FDCWD with AT_SYMLINK_NOFOLLOW set.
func Lchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	uid, gid := args[1].Int(), args[2].Int()
	err := fchownat(t, linux.AT_FDCWD, args[0].Pointer(), uid, gid, linux.AT_SYMLINK_NOFOLLOW)
	return 0, nil, err
}
// Fchownat implements Linux syscall fchownat(2).
//
// Arguments are (dirfd, path address, owner, group, flags).
func Fchownat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		dirfd    = args[0].Int()
		pathAddr = args[1].Pointer()
		uid      = args[2].Int()
		gid      = args[3].Int()
		flags    = args[4].Int()
	)
	err := fchownat(t, dirfd, pathAddr, uid, gid, flags)
	return 0, nil, err
}
// fchownat changes the owner and/or group of the file named by the path at
// pathAddr, resolved relative to dirfd.
//
// Only AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW are accepted in flags; any other
// bit yields EINVAL.
func fchownat(t *kernel.Task, dirfd int32, pathAddr usermem.Addr, owner, group, flags int32) error {
	const validFlags = linux.AT_EMPTY_PATH | linux.AT_SYMLINK_NOFOLLOW
	if flags&^validFlags != 0 {
		return syserror.EINVAL
	}

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return err
	}

	var opts vfs.SetStatOptions
	if err := populateSetStatOptionsForChown(t, owner, group, &opts); err != nil {
		return err
	}

	allowEmpty := shouldAllowEmptyPath(flags&linux.AT_EMPTY_PATH != 0)
	follow := shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0)
	return setstatat(t, dirfd, path, allowEmpty, follow, &opts)
}
// populateSetStatOptionsForChown fills opts.Stat with the UID and/or GID to
// set. An owner or group of -1 leaves the corresponding ID out of the mask.
// IDs are mapped through the task's user namespace; an ID whose mapping is
// not Ok yields EINVAL.
func populateSetStatOptionsForChown(t *kernel.Task, owner, group int32, opts *vfs.SetStatOptions) error {
	ns := t.UserNamespace()
	stat := &opts.Stat

	if owner != -1 {
		kuid := ns.MapToKUID(auth.UID(owner))
		if !kuid.Ok() {
			return syserror.EINVAL
		}
		stat.UID = uint32(kuid)
		stat.Mask |= linux.STATX_UID
	}

	if group != -1 {
		kgid := ns.MapToKGID(auth.GID(group))
		if !kgid.Ok() {
			return syserror.EINVAL
		}
		stat.GID = uint32(kgid)
		stat.Mask |= linux.STATX_GID
	}

	return nil
}
// Fchown implements Linux syscall fchown(2).
//
// Arguments are (fd, owner, group). An unknown fd yields EBADF.
func Fchown(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	uid, gid := args[1].Int(), args[2].Int()

	fdesc := t.GetFileVFS2(args[0].Int())
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	defer fdesc.DecRef()

	var opts vfs.SetStatOptions
	err := populateSetStatOptionsForChown(t, uid, gid, &opts)
	if err != nil {
		return 0, nil, err
	}
	return 0, nil, fdesc.SetStat(t, opts)
}
// Truncate implements Linux syscall truncate(2).
//
// Arguments are (path address, length). A negative length yields EINVAL. The
// path is resolved relative to AT_FDCWD, following a final symlink.
func Truncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr, size := args[0].Pointer(), args[1].Int64()
	if size < 0 {
		return 0, nil, syserror.EINVAL
	}

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}

	var opts vfs.SetStatOptions
	opts.Stat.Mask = linux.STATX_SIZE
	opts.Stat.Size = uint64(size)
	err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
	return 0, nil, err
}
// Ftruncate implements Linux syscall ftruncate(2).
//
// Arguments are (fd, length). A negative length yields EINVAL (checked before
// the fd is looked up); an unknown fd yields EBADF.
func Ftruncate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd, size := args[0].Int(), args[1].Int64()
	if size < 0 {
		return 0, nil, syserror.EINVAL
	}

	fdesc := t.GetFileVFS2(fd)
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	defer fdesc.DecRef()

	var opts vfs.SetStatOptions
	opts.Stat.Mask = linux.STATX_SIZE
	opts.Stat.Size = uint64(size)
	return 0, nil, fdesc.SetStat(t, opts)
}
// Utime implements Linux syscall utime(2).
//
// Arguments are (path address, address of a struct utimbuf). A null times
// address requests UTIME_NOW for both the access and modification time;
// otherwise the whole-second values are copied in from user memory.
func Utime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr, timesAddr := args[0].Pointer(), args[1].Pointer()

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}

	var opts vfs.SetStatOptions
	opts.Stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
	if timesAddr != 0 {
		var ut linux.Utime
		if err := ut.CopyIn(t, timesAddr); err != nil {
			return 0, nil, err
		}
		opts.Stat.Atime.Sec = ut.Actime
		opts.Stat.Mtime.Sec = ut.Modtime
	} else {
		opts.Stat.Atime.Nsec = linux.UTIME_NOW
		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
	}

	err = setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
	return 0, nil, err
}
// Utimes implements Linux syscall utimes(2).
//
// Arguments are (path address, address of struct timeval[2]). A null times
// address requests UTIME_NOW for both timestamps. Otherwise times[0] is the
// access time and times[1] the modification time; a microsecond field outside
// [0, 1000000) yields EINVAL.
func Utimes(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr := args[0].Pointer()
	timesAddr := args[1].Pointer()

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}

	opts := vfs.SetStatOptions{
		Stat: linux.Statx{
			Mask: linux.STATX_ATIME | linux.STATX_MTIME,
		},
	}
	if timesAddr == 0 {
		opts.Stat.Atime.Nsec = linux.UTIME_NOW
		opts.Stat.Mtime.Nsec = linux.UTIME_NOW
	} else {
		var times [2]linux.Timeval
		if _, err := t.CopyIn(timesAddr, &times); err != nil {
			return 0, nil, err
		}
		// Reject out-of-range microseconds, as Linux does. Without this check
		// Usec*1000 could overflow the uint32 Nsec field or collide with the
		// UTIME_NOW/UTIME_OMIT sentinel values.
		for _, tv := range times {
			if tv.Usec < 0 || tv.Usec >= 1000000 {
				return 0, nil, syserror.EINVAL
			}
		}
		opts.Stat.Atime = linux.StatxTimestamp{
			Sec:  times[0].Sec,
			Nsec: uint32(times[0].Usec * 1000),
		}
		opts.Stat.Mtime = linux.StatxTimestamp{
			Sec:  times[1].Sec,
			Nsec: uint32(times[1].Usec * 1000),
		}
	}

	return 0, nil, setstatat(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink, &opts)
}
// Utimensat implements Linux syscall utimensat(2).
//
// Arguments are (dirfd, path address, address of struct timespec[2], flags).
// The only accepted flag is AT_SYMLINK_NOFOLLOW; any other bit yields EINVAL.
func Utimensat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirfd := args[0].Int()
	pathAddr := args[1].Pointer()
	timesAddr := args[2].Pointer()
	flags := args[3].Int()

	if flags&^linux.AT_SYMLINK_NOFOLLOW != 0 {
		return 0, nil, syserror.EINVAL
	}

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}

	var opts vfs.SetStatOptions
	if err := populateSetStatOptionsForUtimens(t, timesAddr, &opts); err != nil {
		return 0, nil, err
	}

	// Honor AT_SYMLINK_NOFOLLOW: when it is set the final symlink itself is
	// updated rather than its target.
	follow := shouldFollowFinalSymlink(flags&linux.AT_SYMLINK_NOFOLLOW == 0)
	return 0, nil, setstatat(t, dirfd, path, disallowEmptyPath, follow, &opts)
}
// Futimens implements Linux syscall futimens(2).
//
// Arguments are (fd, address of struct timespec[2]). An unknown fd yields
// EBADF.
func Futimens(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd, timesAddr := args[0].Int(), args[1].Pointer()

	fdesc := t.GetFileVFS2(fd)
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	defer fdesc.DecRef()

	var opts vfs.SetStatOptions
	err := populateSetStatOptionsForUtimens(t, timesAddr, &opts)
	if err != nil {
		return 0, nil, err
	}
	return 0, nil, fdesc.SetStat(t, opts)
}
// populateSetStatOptionsForUtimens fills opts.Stat with the access and
// modification times described by the struct timespec[2] at timesAddr.
//
// A null timesAddr requests UTIME_NOW for both timestamps. Otherwise each
// entry whose nanosecond field is UTIME_OMIT is left out of the mask, and the
// others are copied into opts.
func populateSetStatOptionsForUtimens(t *kernel.Task, timesAddr usermem.Addr, opts *vfs.SetStatOptions) error {
	stat := &opts.Stat

	if timesAddr == 0 {
		stat.Mask = linux.STATX_ATIME | linux.STATX_MTIME
		stat.Atime.Nsec = linux.UTIME_NOW
		stat.Mtime.Nsec = linux.UTIME_NOW
		return nil
	}

	var ts [2]linux.Timespec
	if _, err := t.CopyIn(timesAddr, &ts); err != nil {
		return err
	}

	// ts[0] is the access time.
	if atime := ts[0]; atime.Nsec != linux.UTIME_OMIT {
		stat.Mask |= linux.STATX_ATIME
		stat.Atime = linux.StatxTimestamp{
			Sec:  atime.Sec,
			Nsec: uint32(atime.Nsec),
		}
	}

	// ts[1] is the modification time.
	if mtime := ts[1]; mtime.Nsec != linux.UTIME_OMIT {
		stat.Mask |= linux.STATX_MTIME
		stat.Mtime = linux.StatxTimestamp{
			Sec:  mtime.Sec,
			Nsec: uint32(mtime.Nsec),
		}
	}

	return nil
}
// setstatat applies opts to the file named by path on behalf of task t.
//
// Absolute paths are resolved from the task's root directory. Relative paths
// are resolved from the task's working directory when dirfd is AT_FDCWD, and
// from the file referred to by dirfd otherwise. An empty relative path yields
// ENOENT unless shouldAllowEmptyPath is true; an unknown dirfd yields EBADF.
// With an empty path and a real dirfd, the change is applied directly to that
// file description.
func setstatat(t *kernel.Task, dirfd int32, path fspath.Path, shouldAllowEmptyPath shouldAllowEmptyPath, shouldFollowFinalSymlink shouldFollowFinalSymlink, opts *vfs.SetStatOptions) error {
	// The reference on root is dropped when this function returns.
	root := t.FSContext().RootDirectoryVFS2()
	defer root.DecRef()
	start := root
	if !path.Absolute {
		if !path.HasComponents() && !bool(shouldAllowEmptyPath) {
			return syserror.ENOENT
		}
		if dirfd == linux.AT_FDCWD {
			start = t.FSContext().WorkingDirectoryVFS2()
			defer start.DecRef()
		} else {
			dirfile := t.GetFileVFS2(dirfd)
			if dirfile == nil {
				return syserror.EBADF
			}
			if !path.HasComponents() {
				// Use FileDescription.SetStat() instead of
				// VirtualFilesystem.SetStatAt(), since the former may be able
				// to use opened file state to expedite the SetStat.
				err := dirfile.SetStat(t, *opts)
				dirfile.DecRef()
				return err
			}
			// Take our own reference on the starting dentry before releasing
			// the reference on the file description it came from.
			start = dirfile.VirtualDentry()
			start.IncRef()
			defer start.DecRef()
			dirfile.DecRef()
		}
	}
	return t.Kernel().VFS().SetStatAt(t, t.Credentials(), &vfs.PathOperation{
		Root:               root,
		Start:              start,
		Path:               path,
		FollowFinalSymlink: bool(shouldFollowFinalSymlink),
	}, opts)
}

View File

@ -0,0 +1,346 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/gohacks"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
// Stat implements Linux syscall stat(2).
//
// It is fstatat relative to AT_FDCWD with no flags.
func Stat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := fstatat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].Pointer(), 0 /* flags */)
	return 0, nil, err
}
// Lstat implements Linux syscall lstat(2).
//
// It is fstatat relative to AT_FDCWD with AT_SYMLINK_NOFOLLOW set.
func Lstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := fstatat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].Pointer(), linux.AT_SYMLINK_NOFOLLOW)
	return 0, nil, err
}
// Newfstatat implements Linux syscall newfstatat, which backs fstatat(2).
//
// Arguments are (dirfd, path address, stat buffer address, flags).
func Newfstatat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		dirfd    = args[0].Int()
		pathAddr = args[1].Pointer()
		statAddr = args[2].Pointer()
		flags    = args[3].Int()
	)
	err := fstatat(t, dirfd, pathAddr, statAddr, flags)
	return 0, nil, err
}
// fstatat stats the file named by the path at pathAddr and copies the result
// out to statAddr as a linux.Stat.
//
// Only AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW are accepted in flags; any other
// bit yields EINVAL. Absolute paths are resolved from the task's root
// directory; relative paths from the working directory (dirfd == AT_FDCWD) or
// from the file referred to by dirfd. An empty relative path yields ENOENT
// unless AT_EMPTY_PATH is set, and an unknown dirfd yields EBADF.
func fstatat(t *kernel.Task, dirfd int32, pathAddr, statAddr usermem.Addr, flags int32) error {
	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW) != 0 {
		return syserror.EINVAL
	}
	opts := vfs.StatOptions{
		Mask: linux.STATX_BASIC_STATS,
	}
	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return err
	}
	// The reference on root is dropped when this function returns.
	root := t.FSContext().RootDirectoryVFS2()
	defer root.DecRef()
	start := root
	if !path.Absolute {
		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
			return syserror.ENOENT
		}
		if dirfd == linux.AT_FDCWD {
			start = t.FSContext().WorkingDirectoryVFS2()
			defer start.DecRef()
		} else {
			dirfile := t.GetFileVFS2(dirfd)
			if dirfile == nil {
				return syserror.EBADF
			}
			if !path.HasComponents() {
				// Use FileDescription.Stat() instead of
				// VirtualFilesystem.StatAt() for fstatat(fd, ""), since the
				// former may be able to use opened file state to expedite the
				// Stat.
				statx, err := dirfile.Stat(t, opts)
				dirfile.DecRef()
				if err != nil {
					return err
				}
				var stat linux.Stat
				convertStatxToUserStat(t, &statx, &stat)
				return stat.CopyOut(t, statAddr)
			}
			// Take our own reference on the starting dentry before releasing
			// the reference on the file description it came from.
			start = dirfile.VirtualDentry()
			start.IncRef()
			defer start.DecRef()
			dirfile.DecRef()
		}
	}
	statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
		Root:               root,
		Start:              start,
		Path:               path,
		FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
	}, &opts)
	if err != nil {
		return err
	}
	// Convert the statx result into the struct stat layout before copying out.
	var stat linux.Stat
	convertStatxToUserStat(t, &statx, &stat)
	return stat.CopyOut(t, statAddr)
}
// convertStatxToUserStat overwrites *stat with the contents of *statx,
// translating the UID and GID into the task's user namespace.
//
// Both input and output are passed by pointer to avoid copying large structs.
func convertStatxToUserStat(t *kernel.Task, statx *linux.Statx, stat *linux.Stat) {
	// Linux just copies fields from struct kstat without regard to struct
	// kstat::result_mask (fs/stat.c:cp_new_stat()), so we do too.
	ns := t.UserNamespace()
	dev := linux.MakeDeviceID(uint16(statx.DevMajor), statx.DevMinor)
	rdev := linux.MakeDeviceID(uint16(statx.RdevMajor), statx.RdevMinor)
	uid := auth.KUID(statx.UID).In(ns).OrOverflow()
	gid := auth.KGID(statx.GID).In(ns).OrOverflow()

	var out linux.Stat
	out.Dev = uint64(dev)
	out.Ino = statx.Ino
	out.Nlink = uint64(statx.Nlink)
	out.Mode = uint32(statx.Mode)
	out.UID = uint32(uid)
	out.GID = uint32(gid)
	out.Rdev = uint64(rdev)
	out.Size = int64(statx.Size)
	out.Blksize = int64(statx.Blksize)
	out.Blocks = int64(statx.Blocks)
	out.ATime = timespecFromStatxTimestamp(statx.Atime)
	out.MTime = timespecFromStatxTimestamp(statx.Mtime)
	out.CTime = timespecFromStatxTimestamp(statx.Ctime)
	*stat = out
}
// timespecFromStatxTimestamp converts a statx timestamp into a Timespec,
// widening the nanosecond field to int64.
func timespecFromStatxTimestamp(sxts linux.StatxTimestamp) linux.Timespec {
	var ts linux.Timespec
	ts.Sec = sxts.Sec
	ts.Nsec = int64(sxts.Nsec)
	return ts
}
// Fstat implements Linux syscall fstat(2).
//
// Arguments are (fd, stat buffer address). An unknown fd yields EBADF.
func Fstat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd, statAddr := args[0].Int(), args[1].Pointer()

	fdesc := t.GetFileVFS2(fd)
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	defer fdesc.DecRef()

	sx, err := fdesc.Stat(t, vfs.StatOptions{Mask: linux.STATX_BASIC_STATS})
	if err != nil {
		return 0, nil, err
	}

	// Convert to the struct stat layout before copying out to user memory.
	var userStat linux.Stat
	convertStatxToUserStat(t, &sx, &userStat)
	err = userStat.CopyOut(t, statAddr)
	return 0, nil, err
}
// Statx implements Linux syscall statx(2).
//
// Arguments are (dirfd, path address, flags, mask, statx buffer address).
// Accepted flags are AT_EMPTY_PATH, AT_SYMLINK_NOFOLLOW and the
// AT_STATX_SYNC_TYPE bits; any other bit yields EINVAL, as does setting every
// AT_STATX_SYNC_TYPE bit at once. An empty relative path yields ENOENT unless
// AT_EMPTY_PATH is set, and an unknown dirfd yields EBADF.
func Statx(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	dirfd := args[0].Int()
	pathAddr := args[1].Pointer()
	flags := args[2].Int()
	mask := args[3].Uint()
	statxAddr := args[4].Pointer()

	// The sync-type bits must be accepted here; otherwise opts.Sync below
	// could never be non-zero and AT_STATX_FORCE_SYNC / AT_STATX_DONT_SYNC
	// requests would be rejected.
	if flags&^(linux.AT_EMPTY_PATH|linux.AT_SYMLINK_NOFOLLOW|linux.AT_STATX_SYNC_TYPE) != 0 {
		return 0, nil, syserror.EINVAL
	}
	// Linux rejects requests that set all sync-type bits simultaneously.
	if flags&linux.AT_STATX_SYNC_TYPE == linux.AT_STATX_SYNC_TYPE {
		return 0, nil, syserror.EINVAL
	}

	opts := vfs.StatOptions{
		Mask: mask,
		Sync: uint32(flags & linux.AT_STATX_SYNC_TYPE),
	}

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}

	// The reference on root is dropped when this function returns.
	root := t.FSContext().RootDirectoryVFS2()
	defer root.DecRef()
	start := root
	if !path.Absolute {
		if !path.HasComponents() && flags&linux.AT_EMPTY_PATH == 0 {
			return 0, nil, syserror.ENOENT
		}
		if dirfd == linux.AT_FDCWD {
			start = t.FSContext().WorkingDirectoryVFS2()
			defer start.DecRef()
		} else {
			dirfile := t.GetFileVFS2(dirfd)
			if dirfile == nil {
				return 0, nil, syserror.EBADF
			}
			if !path.HasComponents() {
				// Use FileDescription.Stat() instead of
				// VirtualFilesystem.StatAt() for statx(fd, ""), since the
				// former may be able to use opened file state to expedite the
				// Stat.
				statx, err := dirfile.Stat(t, opts)
				dirfile.DecRef()
				if err != nil {
					return 0, nil, err
				}
				userifyStatx(t, &statx)
				return 0, nil, statx.CopyOut(t, statxAddr)
			}
			// Take our own reference on the starting dentry before releasing
			// the reference on the file description it came from.
			start = dirfile.VirtualDentry()
			start.IncRef()
			defer start.DecRef()
			dirfile.DecRef()
		}
	}

	statx, err := t.Kernel().VFS().StatAt(t, t.Credentials(), &vfs.PathOperation{
		Root:               root,
		Start:              start,
		Path:               path,
		FollowFinalSymlink: flags&linux.AT_SYMLINK_NOFOLLOW == 0,
	}, &opts)
	if err != nil {
		return 0, nil, err
	}
	// Translate the kernel IDs into the caller's user namespace before
	// copying out.
	userifyStatx(t, &statx)
	return 0, nil, statx.CopyOut(t, statxAddr)
}
// userifyStatx rewrites statx.UID and statx.GID in place, mapping them into
// the task's user namespace (with overflow IDs substituted where the mapping
// fails, per OrOverflow).
func userifyStatx(t *kernel.Task, statx *linux.Statx) {
	ns := t.UserNamespace()
	uid := auth.KUID(statx.UID).In(ns).OrOverflow()
	statx.UID = uint32(uid)
	gid := auth.KGID(statx.GID).In(ns).OrOverflow()
	statx.GID = uint32(gid)
}
// Readlink implements Linux syscall readlink(2).
//
// Arguments are (path address, buffer address, buffer size). It is readlinkat
// relative to AT_FDCWD.
func Readlink(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return readlinkat(t, linux.AT_FDCWD, args[0].Pointer(), args[1].Pointer(), args[2].SizeT())
}
// Access implements Linux syscall access(2).
//
// This is currently a stub: the arguments are ignored and success is always
// reported.
func Access(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// FIXME(jamieliu): actually implement
	return 0, nil, nil
}
// Faccessat implements Linux syscall faccessat(2).
//
// This is currently a stub: the arguments are ignored and success is always
// reported.
func Faccessat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// FIXME(jamieliu): actually implement
	return 0, nil, nil
}
// Readlinkat implements Linux syscall readlinkat(2).
//
// Arguments are (dirfd, path address, buffer address, buffer size).
func Readlinkat(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		dirfd    = args[0].Int()
		pathAddr = args[1].Pointer()
		bufAddr  = args[2].Pointer()
		bufSize  = args[3].SizeT()
	)
	return readlinkat(t, dirfd, pathAddr, bufAddr, bufSize)
}
// readlinkat reads the target of the symlink named by the path at pathAddr
// (relative to dirfd) and copies at most size bytes of it to bufAddr.
//
// A size that is zero or negative once converted to int yields EINVAL. The
// number of bytes copied out is returned; a copy-out error is reported only
// when no bytes were copied.
func readlinkat(t *kernel.Task, dirfd int32, pathAddr, bufAddr usermem.Addr, size uint) (uintptr, *kernel.SyscallControl, error) {
	if int(size) <= 0 {
		return 0, nil, syserror.EINVAL
	}

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}

	// "Since Linux 2.6.39, pathname can be an empty string, in which case the
	// call operates on the symbolic link referred to by dirfd ..." -
	// readlinkat(2)
	tpop, err := getTaskPathOperation(t, dirfd, path, allowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer tpop.Release()

	link, err := t.Kernel().VFS().ReadlinkAt(t, t.Credentials(), &tpop.pop)
	if err != nil {
		return 0, nil, err
	}

	// Truncate the target to the size of the user buffer.
	if int(size) < len(link) {
		link = link[:size]
	}

	copied, err := t.CopyOutBytes(bufAddr, gohacks.ImmutableBytesFromString(link))
	if copied != 0 {
		return uintptr(copied), nil, nil
	}
	return 0, nil, err
}
// Statfs implements Linux syscall statfs(2).
//
// Arguments are (path address, statfs buffer address). The path is resolved
// relative to AT_FDCWD, following a final symlink; empty paths are rejected.
func Statfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pathAddr, bufAddr := args[0].Pointer(), args[1].Pointer()

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}

	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, followFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer tpop.Release()

	fsInfo, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
	if err != nil {
		return 0, nil, err
	}
	err = fsInfo.CopyOut(t, bufAddr)
	return 0, nil, err
}
// Fstatfs implements Linux syscall fstatfs(2).
//
// Arguments are (fd, statfs buffer address). The lookup is expressed as an
// empty path relative to fd.
func Fstatfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fd, bufAddr := args[0].Int(), args[1].Pointer()

	var emptyPath fspath.Path
	tpop, err := getTaskPathOperation(t, fd, emptyPath, allowEmptyPath, nofollowFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer tpop.Release()

	fsInfo, err := t.Kernel().VFS().StatFSAt(t, t.Credentials(), &tpop.pop)
	if err != nil {
		return 0, nil, err
	}
	err = fsInfo.CopyOut(t, bufAddr)
	return 0, nil, err
}

View File

@ -0,0 +1,87 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/syserror"
)
// Sync implements Linux syscall sync(2).
//
// It asks the kernel's VFS to sync all filesystems; args is unused.
func Sync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := t.Kernel().VFS().SyncAllFilesystems(t)
	return 0, nil, err
}
// Syncfs implements Linux syscall syncfs(2).
//
// The single argument is the fd; an unknown fd yields EBADF.
func Syncfs(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fdesc := t.GetFileVFS2(args[0].Int())
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	defer fdesc.DecRef()

	err := fdesc.SyncFS(t)
	return 0, nil, err
}
// Fsync implements Linux syscall fsync(2).
//
// The single argument is the fd; an unknown fd yields EBADF.
func Fsync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fdesc := t.GetFileVFS2(args[0].Int())
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	defer fdesc.DecRef()

	err := fdesc.Sync(t)
	return 0, nil, err
}
// Fdatasync implements Linux syscall fdatasync(2).
//
// It currently delegates to Fsync unchanged.
func Fdatasync(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// TODO(gvisor.dev/issue/1897): Avoid writeback of unnecessary metadata.
	ret, ctrl, err := Fsync(t, args)
	return ret, ctrl, err
}
// SyncFileRange implements Linux syscall sync_file_range(2).
//
// Arguments are (fd, offset, nbytes, flags). A negative offset or nbytes, or
// any flag other than SYNC_FILE_RANGE_WAIT_BEFORE, SYNC_FILE_RANGE_WRITE and
// SYNC_FILE_RANGE_WAIT_AFTER, yields EINVAL; an unknown fd yields EBADF.
func SyncFileRange(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd     = args[0].Int()
		start  = args[1].Int64()
		length = args[2].Int64()
		flags  = args[3].Uint()
	)

	if start < 0 || length < 0 {
		return 0, nil, syserror.EINVAL
	}

	const validFlags = linux.SYNC_FILE_RANGE_WAIT_BEFORE | linux.SYNC_FILE_RANGE_WRITE | linux.SYNC_FILE_RANGE_WAIT_AFTER
	if flags&^validFlags != 0 {
		return 0, nil, syserror.EINVAL
	}

	fdesc := t.GetFileVFS2(fd)
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	defer fdesc.DecRef()

	// TODO(gvisor.dev/issue/1897): Avoid writeback of data ranges outside of
	// [offset, offset+nbytes).
	err := fdesc.Sync(t)
	return 0, nil, err
}

View File

@ -1,95 +0,0 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/syscalls/linux"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
const (
	// EventMaskRead contains events that can be triggered on reads: input
	// available, hang-up, and error. The read helper in this file registers
	// for these events while waiting to retry a read.
	EventMaskRead = waiter.EventIn | waiter.EventHUp | waiter.EventErr
)
// Read implements linux syscall read(2).
//
// The destination buffer is sized exactly as requested, because some
// applications like qemu expect that they can do large reads all at once.
// Bug for bug. The same applies to the other read calls below.
func Read(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd      = args[0].Int()
		bufAddr = args[1].Pointer()
		count   = int(args[2].SizeT())
	)

	fdesc := t.GetFileVFS2(fd)
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	defer fdesc.DecRef()

	// A byte count that is negative once narrowed to int is invalid.
	if count < 0 {
		return 0, nil, syserror.EINVAL
	}

	// Describe the user buffer that receives the data.
	ioseq, err := t.SingleIOSequence(bufAddr, count, usermem.IOOpts{
		AddressSpaceActive: true,
	})
	if err != nil {
		return 0, nil, err
	}

	got, err := read(t, fdesc, ioseq, vfs.ReadOptions{})
	t.IOUsage().AccountReadSyscall(got)
	err = linux.HandleIOErrorVFS2(t, got != 0, err, kernel.ERESTARTSYS, "read", fdesc)
	return uintptr(got), nil, err
}
// read reads from file into dst, retrying while the file reports
// syserror.ErrWouldBlock.
//
// If the first attempt fails with anything other than ErrWouldBlock its
// result is returned as is. Otherwise the task waits on EventMaskRead and
// retries until Read returns a different result or blocking fails. The
// returned count is the total number of bytes read across all attempts.
func read(t *kernel.Task, file *vfs.FileDescription, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	n, err := file.Read(t, dst, opts)
	if err != syserror.ErrWouldBlock {
		return n, err
	}

	// Register for notifications.
	w, ch := waiter.NewChannelEntry(nil)
	file.EventRegister(&w, EventMaskRead)

	total := n
	for {
		// Shorten dst to reflect bytes read by the previous attempt.
		dst = dst.DropFirst(int(n))

		// Issue the request and break out if it completes with anything other
		// than "would block". This must assign to the outer n and err: a
		// short variable declaration here would leave the outer n stale (so
		// the wrong number of bytes is dropped above) and would make the
		// function always return the initial ErrWouldBlock.
		n, err = file.Read(t, dst, opts)
		total += n
		if err != syserror.ErrWouldBlock {
			break
		}

		// Wait for a notification that we should retry; if blocking fails,
		// report that error to the caller instead of ErrWouldBlock.
		if err = t.Block(ch); err != nil {
			break
		}
	}
	file.EventUnregister(&w)

	return total, err
}

View File

@ -0,0 +1,353 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"bytes"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/gohacks"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
)
// Listxattr implements Linux syscall listxattr(2).
//
// It is listxattr with a final symlink followed.
func Listxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	ret, ctrl, err := listxattr(t, args, followFinalSymlink)
	return ret, ctrl, err
}
// Llistxattr implements Linux syscall llistxattr(2).
//
// It is listxattr with a final symlink not followed.
func Llistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	ret, ctrl, err := listxattr(t, args, nofollowFinalSymlink)
	return ret, ctrl, err
}
// listxattr is the shared implementation of listxattr(2) and llistxattr(2).
//
// Arguments are (path address, list buffer address, buffer size). The path is
// resolved relative to AT_FDCWD and empty paths are rejected. The value
// reported by copyOutXattrNameList is returned.
func listxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
	var (
		pathAddr = args[0].Pointer()
		listAddr = args[1].Pointer()
		bufSize  = args[2].SizeT()
	)

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}

	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer tpop.Release()

	attrNames, err := t.Kernel().VFS().ListxattrAt(t, t.Credentials(), &tpop.pop)
	if err != nil {
		return 0, nil, err
	}

	written, err := copyOutXattrNameList(t, listAddr, bufSize, attrNames)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(written), nil, nil
}
// Flistxattr implements Linux syscall flistxattr(2).
//
// Arguments are (fd, list buffer address, buffer size). An unknown fd yields
// EBADF.
func Flistxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd       = args[0].Int()
		listAddr = args[1].Pointer()
		bufSize  = args[2].SizeT()
	)

	fdesc := t.GetFileVFS2(fd)
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	defer fdesc.DecRef()

	attrNames, err := fdesc.Listxattr(t)
	if err != nil {
		return 0, nil, err
	}

	written, err := copyOutXattrNameList(t, listAddr, bufSize, attrNames)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(written), nil, nil
}
// Getxattr implements Linux syscall getxattr(2).
//
// It is getxattr with a final symlink followed.
func Getxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	ret, ctrl, err := getxattr(t, args, followFinalSymlink)
	return ret, ctrl, err
}
// Lgetxattr implements Linux syscall lgetxattr(2).
//
// It is getxattr with a final symlink not followed.
func Lgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	ret, ctrl, err := getxattr(t, args, nofollowFinalSymlink)
	return ret, ctrl, err
}
// getxattr is the shared implementation of getxattr(2) and lgetxattr(2).
//
// Arguments are (path address, name address, value buffer address, buffer
// size). The path is resolved relative to AT_FDCWD and empty paths are
// rejected. The value reported by copyOutXattrValue is returned.
func getxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) (uintptr, *kernel.SyscallControl, error) {
	var (
		pathAddr  = args[0].Pointer()
		nameAddr  = args[1].Pointer()
		valueAddr = args[2].Pointer()
		bufSize   = args[3].SizeT()
	)

	path, err := copyInPath(t, pathAddr)
	if err != nil {
		return 0, nil, err
	}

	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
	if err != nil {
		return 0, nil, err
	}
	defer tpop.Release()

	attrName, err := copyInXattrName(t, nameAddr)
	if err != nil {
		return 0, nil, err
	}

	attrValue, err := t.Kernel().VFS().GetxattrAt(t, t.Credentials(), &tpop.pop, attrName)
	if err != nil {
		return 0, nil, err
	}

	written, err := copyOutXattrValue(t, valueAddr, bufSize, attrValue)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(written), nil, nil
}
// Fgetxattr implements Linux syscall fgetxattr(2).
//
// Arguments are (fd, name address, value buffer address, buffer size). An
// unknown fd yields EBADF.
func Fgetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		fd        = args[0].Int()
		nameAddr  = args[1].Pointer()
		valueAddr = args[2].Pointer()
		bufSize   = args[3].SizeT()
	)

	fdesc := t.GetFileVFS2(fd)
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	defer fdesc.DecRef()

	attrName, err := copyInXattrName(t, nameAddr)
	if err != nil {
		return 0, nil, err
	}

	attrValue, err := fdesc.Getxattr(t, attrName)
	if err != nil {
		return 0, nil, err
	}

	written, err := copyOutXattrValue(t, valueAddr, bufSize, attrValue)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(written), nil, nil
}
// Setxattr implements Linux syscall setxattr(2).
//
// It is setxattr with a final symlink followed.
func Setxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := setxattr(t, args, followFinalSymlink)
	return 0, nil, err
}
// Lsetxattr implements Linux syscall lsetxattr(2).
//
// It is setxattr with a final symlink not followed.
func Lsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	err := setxattr(t, args, nofollowFinalSymlink)
	return 0, nil, err
}
// setxattr is the common implementation behind the path-based setxattr
// syscall variants.
//
// Syscall arguments: args[0] is the user address of the path, args[1] the
// user address of the attribute name, args[2] the user address of the value,
// args[3] the value size and args[4] the flags. Any flag bit other than
// XATTR_CREATE or XATTR_REPLACE yields EINVAL before anything is copied in.
// shouldFollowFinalSymlink is forwarded unchanged to getTaskPathOperation.
func setxattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
	// Validate flags up front; nothing else is touched on bad flags.
	flags := args[4].Int()
	if unknown := flags &^ (linux.XATTR_CREATE | linux.XATTR_REPLACE); unknown != 0 {
		return syserror.EINVAL
	}

	path, err := copyInPath(t, args[0].Pointer())
	if err != nil {
		return err
	}
	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
	if err != nil {
		return err
	}
	defer tpop.Release()

	attrName, err := copyInXattrName(t, args[1].Pointer())
	if err != nil {
		return err
	}
	attrValue, err := copyInXattrValue(t, args[2].Pointer(), args[3].SizeT())
	if err != nil {
		return err
	}

	opts := vfs.SetxattrOptions{
		Name:  attrName,
		Value: attrValue,
		Flags: uint32(flags),
	}
	return t.Kernel().VFS().SetxattrAt(t, t.Credentials(), &tpop.pop, &opts)
}
// Fsetxattr implements Linux syscall fsetxattr(2).
//
// Syscall arguments: args[0] is the file descriptor, args[1] the user address
// of the attribute name, args[2] the user address of the value, args[3] the
// value size and args[4] the flags. Flags are validated (EINVAL) before the
// descriptor is looked up (EBADF). The numeric syscall result is always 0.
func Fsetxattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	flags := args[4].Int()
	if unknown := flags &^ (linux.XATTR_CREATE | linux.XATTR_REPLACE); unknown != 0 {
		return 0, nil, syserror.EINVAL
	}

	fdesc := t.GetFileVFS2(args[0].Int())
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	// Drop the reference taken by GetFileVFS2 on every exit path.
	defer fdesc.DecRef()

	attrName, err := copyInXattrName(t, args[1].Pointer())
	if err != nil {
		return 0, nil, err
	}
	attrValue, err := copyInXattrValue(t, args[2].Pointer(), args[3].SizeT())
	if err != nil {
		return 0, nil, err
	}

	opts := vfs.SetxattrOptions{
		Name:  attrName,
		Value: attrValue,
		Flags: uint32(flags),
	}
	return 0, nil, fdesc.Setxattr(t, opts)
}
// Removexattr implements Linux syscall removexattr(2).
//
// It forwards to removexattr with the followFinalSymlink policy. The numeric
// syscall result is always 0; only the error from removexattr is meaningful.
func Removexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, removexattr(t, args, followFinalSymlink)
}
// Lremovexattr implements Linux syscall lremovexattr(2).
//
// It forwards to removexattr with the nofollowFinalSymlink policy. The numeric
// syscall result is always 0; only the error from removexattr is meaningful.
func Lremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	return 0, nil, removexattr(t, args, nofollowFinalSymlink)
}
// removexattr is the common implementation behind the path-based removexattr
// syscall variants.
//
// Syscall arguments: args[0] is the user address of the path and args[1] the
// user address of the attribute name. shouldFollowFinalSymlink is forwarded
// unchanged to getTaskPathOperation.
func removexattr(t *kernel.Task, args arch.SyscallArguments, shouldFollowFinalSymlink shouldFollowFinalSymlink) error {
	// Resolve the path first so that path errors take precedence over
	// attribute-name errors.
	path, err := copyInPath(t, args[0].Pointer())
	if err != nil {
		return err
	}
	tpop, err := getTaskPathOperation(t, linux.AT_FDCWD, path, disallowEmptyPath, shouldFollowFinalSymlink)
	if err != nil {
		return err
	}
	defer tpop.Release()

	attrName, err := copyInXattrName(t, args[1].Pointer())
	if err != nil {
		return err
	}
	return t.Kernel().VFS().RemovexattrAt(t, t.Credentials(), &tpop.pop, attrName)
}
// Fremovexattr implements Linux syscall fremovexattr(2).
//
// Syscall arguments: args[0] is the file descriptor and args[1] the user
// address of the attribute name. EBADF is returned when the descriptor does
// not resolve to a file. The numeric syscall result is always 0.
func Fremovexattr(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	fdesc := t.GetFileVFS2(args[0].Int())
	if fdesc == nil {
		return 0, nil, syserror.EBADF
	}
	// Drop the reference taken by GetFileVFS2 on every exit path.
	defer fdesc.DecRef()

	attrName, err := copyInXattrName(t, args[1].Pointer())
	if err != nil {
		return 0, nil, err
	}
	return 0, nil, fdesc.Removexattr(t, attrName)
}
// copyInXattrName reads a NUL-terminated extended attribute name from the
// task's memory at nameAddr, reading at most XATTR_NAME_MAX+1 bytes.
//
// A name that is empty, or for which CopyInString reports ENAMETOOLONG,
// yields ERANGE. Any other copy-in error is returned unchanged.
func copyInXattrName(t *kernel.Task, nameAddr usermem.Addr) (string, error) {
	name, err := t.CopyInString(nameAddr, linux.XATTR_NAME_MAX+1)
	switch {
	case err == syserror.ENAMETOOLONG:
		// An over-long name is reported as ERANGE rather than ENAMETOOLONG.
		return "", syserror.ERANGE
	case err != nil:
		return "", err
	case len(name) == 0:
		return "", syserror.ERANGE
	}
	return name, nil
}
// copyOutXattrNameList writes names to the task's memory at listAddr as a
// sequence of NUL-terminated strings and returns the number of bytes
// involved.
//
// size is the capacity of the user buffer and is clamped to XATTR_LIST_MAX.
// A size of 0 is a probe: nothing is copied and the encoded length of the
// list is returned. If the encoded list does not fit, the error is E2BIG when
// the (clamped) size has reached XATTR_LIST_MAX and ERANGE otherwise.
func copyOutXattrNameList(t *kernel.Task, listAddr usermem.Addr, size uint, names []string) (int, error) {
	if size > linux.XATTR_LIST_MAX {
		size = linux.XATTR_LIST_MAX
	}

	// Encode every name followed by its terminating NUL byte.
	var buf bytes.Buffer
	for i := range names {
		buf.WriteString(names[i])
		buf.WriteByte(0)
	}

	needed := buf.Len()
	switch {
	case size == 0:
		// Return the size that would be required to accommodate the list.
		return needed, nil
	case needed <= int(size):
		return t.CopyOutBytes(listAddr, buf.Bytes())
	case size >= linux.XATTR_LIST_MAX:
		return 0, syserror.E2BIG
	default:
		return 0, syserror.ERANGE
	}
}
// copyInXattrValue reads size bytes of extended attribute value from the
// task's memory at valueAddr and returns them as a string.
//
// A size above XATTR_SIZE_MAX yields E2BIG before any memory is allocated or
// read. Copy-in errors are returned unchanged.
func copyInXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint) (string, error) {
	if size > linux.XATTR_SIZE_MAX {
		return "", syserror.E2BIG
	}

	raw := make([]byte, size)
	_, err := t.CopyInBytes(valueAddr, raw)
	if err != nil {
		return "", err
	}
	// raw is not modified after this point, so it can back the string
	// directly without a copy.
	return gohacks.StringFromImmutableBytes(raw), nil
}
// copyOutXattrValue writes value to the task's memory at valueAddr and
// returns the number of bytes involved.
//
// size is the capacity of the user buffer and is clamped to XATTR_SIZE_MAX.
// A size of 0 is a probe: nothing is copied and len(value) is returned. If
// value does not fit, the error is E2BIG when the (clamped) size has reached
// XATTR_SIZE_MAX and ERANGE otherwise.
func copyOutXattrValue(t *kernel.Task, valueAddr usermem.Addr, size uint, value string) (int, error) {
	if size > linux.XATTR_SIZE_MAX {
		size = linux.XATTR_SIZE_MAX
	}

	needed := len(value)
	switch {
	case size == 0:
		// Return the size that would be required to accommodate the value.
		return needed, nil
	case needed <= int(size):
		return t.CopyOutBytes(valueAddr, gohacks.ImmutableBytesFromString(value))
	case size >= linux.XATTR_SIZE_MAX:
		return 0, syserror.E2BIG
	default:
		return 0, syserror.ERANGE
	}
}

View File

@ -43,6 +43,7 @@ go_library(
"//pkg/abi/linux",
"//pkg/context",
"//pkg/fspath",
"//pkg/gohacks",
"//pkg/log",
"//pkg/sentry/arch",
"//pkg/sentry/fs/lock",

View File

@ -202,6 +202,9 @@ func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event lin
// Add epi to file.epolls so that it is removed when the last
// FileDescription reference is dropped.
file.epollMu.Lock()
if file.epolls == nil {
file.epolls = make(map[*epollInterest]struct{})
}
file.epolls[epi] = struct{}{}
file.epollMu.Unlock()

View File

@ -26,6 +26,7 @@ import (
"sync/atomic"
"unsafe"
"gvisor.dev/gvisor/pkg/gohacks"
"gvisor.dev/gvisor/pkg/sync"
)
@ -160,7 +161,7 @@ func newMountTableSlots(cap uintptr) unsafe.Pointer {
// Lookup may be called even if there are concurrent mutators of mt.
func (mt *mountTable) Lookup(parent *Mount, point *Dentry) *Mount {
key := mountKey{parent: unsafe.Pointer(parent), point: unsafe.Pointer(point)}
hash := memhash(noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
hash := memhash(gohacks.Noescape(unsafe.Pointer(&key)), uintptr(mt.seed), mountKeyBytes)
loop:
for {
@ -361,12 +362,3 @@ func memhash(p unsafe.Pointer, seed, s uintptr) uintptr
//go:linkname rand32 runtime.fastrand
func rand32() uint32
// noescape returns p unchanged, round-tripping it through a uintptr.
//
// This is copy/pasted from runtime.noescape(), and is needed because arguments
// apparently escape from all functions defined by linkname.
//
//go:nosplit
func noescape(p unsafe.Pointer) unsafe.Pointer {
	x := uintptr(p)
	// x ^ 0 == x, so the returned pointer has the same value as p.
	// NOTE(review): presumably the integer round-trip is what keeps escape
	// analysis from tying the result to p, as in the runtime's version; do
	// not simplify this expression.
	return unsafe.Pointer(x ^ 0)
}

View File

@ -228,7 +228,7 @@ func (rp *ResolvingPath) Advance() {
rp.pit = next
} else { // at end of path segment, continue with next one
rp.curPart--
rp.pit = rp.parts[rp.curPart-1]
rp.pit = rp.parts[rp.curPart]
}
}

View File

@ -385,15 +385,11 @@ func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credential
// Only a regular file can be executed.
stat, err := fd.Stat(ctx, StatOptions{Mask: linux.STATX_TYPE})
if err != nil {
fd.DecRef()
return nil, err
}
if stat.Mask&linux.STATX_TYPE != 0 {
// This shouldn't happen, but if type can't be retrieved, file can't
// be executed.
return nil, syserror.EACCES
}
if t := linux.FileMode(stat.Mode).FileType(); t != linux.ModeRegular {
ctx.Infof("%q is not a regular file: %v", pop.Path, t)
if stat.Mask&linux.STATX_TYPE == 0 || stat.Mode&linux.S_IFMT != linux.S_IFREG {
fd.DecRef()
return nil, syserror.EACCES
}
}

View File

@ -25,7 +25,6 @@ go_library(
"bytes_io_unsafe.go",
"usermem.go",
"usermem_arm64.go",
"usermem_unsafe.go",
"usermem_x86.go",
],
visibility = ["//:sandbox"],
@ -33,6 +32,7 @@ go_library(
"//pkg/atomicbitops",
"//pkg/binary",
"//pkg/context",
"//pkg/gohacks",
"//pkg/log",
"//pkg/safemem",
"//pkg/syserror",

View File

@ -23,6 +23,7 @@ import (
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/gohacks"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/syserror"
)
@ -251,7 +252,7 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
}
end, ok := addr.AddLength(uint64(readlen))
if !ok {
return stringFromImmutableBytes(buf[:done]), syserror.EFAULT
return gohacks.StringFromImmutableBytes(buf[:done]), syserror.EFAULT
}
// Shorten the read to avoid crossing page boundaries, since faulting
// in a page unnecessarily is expensive. This also ensures that partial
@ -272,16 +273,16 @@ func CopyStringIn(ctx context.Context, uio IO, addr Addr, maxlen int, opts IOOpt
// Look for the terminating zero byte, which may have occurred before
// hitting err.
if i := bytes.IndexByte(buf[done:done+n], byte(0)); i >= 0 {
return stringFromImmutableBytes(buf[:done+i]), nil
return gohacks.StringFromImmutableBytes(buf[:done+i]), nil
}
done += n
if err != nil {
return stringFromImmutableBytes(buf[:done]), err
return gohacks.StringFromImmutableBytes(buf[:done]), err
}
addr = end
}
return stringFromImmutableBytes(buf), syserror.ENAMETOOLONG
return gohacks.StringFromImmutableBytes(buf), syserror.ENAMETOOLONG
}
// CopyOutVec copies bytes from src to the memory mapped at ars in uio. The

View File

@ -1,27 +0,0 @@
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package usermem
import (
"unsafe"
)
// stringFromImmutableBytes returns a string with the same contents as bs
// without copying them, regardless of what escape analysis can prove about
// bs. The result aliases bs's backing array, so the caller must never mutate
// bs once stringFromImmutableBytes has returned.
func stringFromImmutableBytes(bs []byte) string {
	// Reinterpret the slice header as a string header; compare
	// strings.Builder.String().
	hdr := unsafe.Pointer(&bs)
	return *(*string)(hdr)
}

View File

@ -229,7 +229,9 @@ var allowedSyscalls = seccomp.SyscallRules{
syscall.SYS_NANOSLEEP: {},
syscall.SYS_PPOLL: {},
syscall.SYS_PREAD64: {},
syscall.SYS_PREADV: {},
syscall.SYS_PWRITE64: {},
syscall.SYS_PWRITEV: {},
syscall.SYS_READ: {},
syscall.SYS_RECVMSG: []seccomp.Rule{
{