gvisor/pkg/sentry/vfs/syscalls.go

218 lines
6.5 KiB
Go
Raw Normal View History

Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs
import (
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/syserror"
)
// PathOperation specifies the path operated on by a VFS method.
//
// PathOperation is passed to VFS methods by pointer to reduce memory copying:
// it's somewhat large and should never escape. (Options structs are passed by
// pointer to VFS and FileDescription methods for the same reason.)
type PathOperation struct {
// Root is the VFS root. References on Root are borrowed from the provider
// of the PathOperation.
//
// Invariants: Root.Ok().
Root VirtualDentry
// Start is the starting point for the path traversal. References on Start
// are borrowed from the provider of the PathOperation (i.e. the caller of
// the VFS method to which the PathOperation was passed).
//
// Invariants: Start.Ok(). If Pathname.Absolute, then Start == Root.
Start VirtualDentry
// Path is the pathname traversed by this operation.
Pathname string
// If FollowFinalSymlink is true, and the Dentry traversed by the final
// path component represents a symbolic link, the symbolic link should be
// followed.
FollowFinalSymlink bool
}
// GetDentryAt returns a VirtualDentry representing the given path, at which a
// file must exist. A reference is taken on the returned VirtualDentry.
func (vfs *VirtualFilesystem) GetDentryAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *GetDentryOptions) (VirtualDentry, error) {
rp, err := vfs.getResolvingPath(creds, pop)
if err != nil {
return VirtualDentry{}, err
}
for {
d, err := rp.mount.fs.impl.GetDentryAt(ctx, rp, *opts)
if err == nil {
vd := VirtualDentry{
mount: rp.mount,
dentry: d,
}
rp.mount.incRef()
vfs.putResolvingPath(rp)
return vd, nil
}
if !rp.handleError(err) {
vfs.putResolvingPath(rp)
return VirtualDentry{}, err
}
}
}
// MkdirAt creates a directory at the given path.
func (vfs *VirtualFilesystem) MkdirAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *MkdirOptions) error {
// "Under Linux, apart from the permission bits, the S_ISVTX mode bit is
// also honored." - mkdir(2)
opts.Mode &= 01777
rp, err := vfs.getResolvingPath(creds, pop)
if err != nil {
return err
}
for {
err := rp.mount.fs.impl.MkdirAt(ctx, rp, *opts)
if err == nil {
vfs.putResolvingPath(rp)
return nil
}
if !rp.handleError(err) {
vfs.putResolvingPath(rp)
return err
}
}
}
// OpenAt returns a FileDescription providing access to the file at the given
// path. A reference is taken on the returned FileDescription.
func (vfs *VirtualFilesystem) OpenAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *OpenOptions) (*FileDescription, error) {
// Remove:
//
// - O_LARGEFILE, which we always report in FileDescription status flags
// since only 64-bit architectures are supported at this time.
//
// - O_CLOEXEC, which affects file descriptors and therefore must be
// handled outside of VFS.
//
// - Unknown flags.
opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC | linux.O_APPEND | linux.O_NONBLOCK | linux.O_DSYNC | linux.O_ASYNC | linux.O_DIRECT | linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_NOATIME | linux.O_SYNC | linux.O_PATH | linux.O_TMPFILE
// Linux's __O_SYNC (which we call linux.O_SYNC) implies O_DSYNC.
if opts.Flags&linux.O_SYNC != 0 {
opts.Flags |= linux.O_DSYNC
}
// Linux's __O_TMPFILE (which we call linux.O_TMPFILE) must be specified
// with O_DIRECTORY and a writable access mode (to ensure that it fails on
// filesystem implementations that do not support it).
if opts.Flags&linux.O_TMPFILE != 0 {
if opts.Flags&linux.O_DIRECTORY == 0 {
return nil, syserror.EINVAL
}
if opts.Flags&linux.O_CREAT != 0 {
return nil, syserror.EINVAL
}
if opts.Flags&linux.O_ACCMODE == linux.O_RDONLY {
return nil, syserror.EINVAL
}
}
// O_PATH causes most other flags to be ignored.
if opts.Flags&linux.O_PATH != 0 {
opts.Flags &= linux.O_DIRECTORY | linux.O_NOFOLLOW | linux.O_PATH
}
// "On Linux, the following bits are also honored in mode: [S_ISUID,
// S_ISGID, S_ISVTX]" - open(2)
opts.Mode &= 07777
if opts.Flags&linux.O_NOFOLLOW != 0 {
pop.FollowFinalSymlink = false
}
rp, err := vfs.getResolvingPath(creds, pop)
if err != nil {
return nil, err
}
if opts.Flags&linux.O_DIRECTORY != 0 {
rp.mustBeDir = true
rp.mustBeDirOrig = true
}
for {
fd, err := rp.mount.fs.impl.OpenAt(ctx, rp, *opts)
if err == nil {
vfs.putResolvingPath(rp)
return fd, nil
}
if !rp.handleError(err) {
vfs.putResolvingPath(rp)
return nil, err
}
}
}
// StatAt returns metadata for the file at the given path.
func (vfs *VirtualFilesystem) StatAt(ctx context.Context, creds *auth.Credentials, pop *PathOperation, opts *StatOptions) (linux.Statx, error) {
rp, err := vfs.getResolvingPath(creds, pop)
if err != nil {
return linux.Statx{}, err
}
for {
stat, err := rp.mount.fs.impl.StatAt(ctx, rp, *opts)
if err == nil {
vfs.putResolvingPath(rp)
return stat, nil
}
if !rp.handleError(err) {
vfs.putResolvingPath(rp)
return linux.Statx{}, err
}
}
}
// StatusFlags returns file description status flags.
func (fd *FileDescription) StatusFlags(ctx context.Context) (uint32, error) {
flags, err := fd.impl.StatusFlags(ctx)
flags |= linux.O_LARGEFILE
return flags, err
}
// SetStatusFlags sets file description status flags.
func (fd *FileDescription) SetStatusFlags(ctx context.Context, flags uint32) error {
return fd.impl.SetStatusFlags(ctx, flags)
}
// TODO:
//
// - VFS.SyncAllFilesystems() for sync(2)
//
// - Something for syncfs(2)
//
// - VFS.LinkAt()
//
// - VFS.MknodAt()
//
// - VFS.ReadlinkAt()
//
// - VFS.RenameAt()
//
// - VFS.RmdirAt()
//
// - VFS.SetStatAt()
//
// - VFS.StatFSAt()
//
// - VFS.SymlinkAt()
//
// - VFS.UnlinkAt()
//
// - FileDescription.(almost everything)