gvisor/pkg/sentry/vfs/file_description.go

773 lines
27 KiB
Go
Raw Normal View History

Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs
import (
"sync/atomic"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/context"
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/fs/lock"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
"gvisor.dev/gvisor/pkg/waiter"
)
// A FileDescription represents an open file description, which is the entity
// referred to by a file descriptor (POSIX.1-2017 3.258 "Open File
// Description").
//
// FileDescriptions are reference-counted. Unless otherwise specified, all
// FileDescription methods require that a reference is held.
//
// FileDescription is analogous to Linux's struct file.
type FileDescription struct {
// refs is the reference count. refs is accessed using atomic memory
// operations.
refs int64
// statusFlags contains status flags, "initialized by open(2) and possibly
// modified by fcntl()" - fcntl(2). statusFlags is accessed using atomic
// memory operations.
statusFlags uint32
// epolls is the set of epollInterests registered for this FileDescription.
// epolls is protected by epollMu.
epollMu sync.Mutex
epolls map[*epollInterest]struct{}
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
// vd is the filesystem location at which this FileDescription was opened.
// A reference is held on vd. vd is immutable.
vd VirtualDentry
// opts contains options passed to FileDescription.Init(). opts is
// immutable.
opts FileDescriptionOptions
// readable is MayReadFileWithOpenFlags(statusFlags). readable is
// immutable.
//
// readable is analogous to Linux's FMODE_READ.
readable bool
// writable is MayWriteFileWithOpenFlags(statusFlags). If writable is true,
// the FileDescription holds a write count on vd.mount. writable is
// immutable.
//
// writable is analogous to Linux's FMODE_WRITE.
writable bool
usedLockBSD uint32
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
// impl is the FileDescriptionImpl associated with this Filesystem. impl is
// immutable. This should be the last field in FileDescription.
impl FileDescriptionImpl
}
// FileDescriptionOptions contains options to FileDescription.Init().
type FileDescriptionOptions struct {
// If AllowDirectIO is true, allow O_DIRECT to be set on the file. This is
// usually only the case if O_DIRECT would actually have an effect.
AllowDirectIO bool
// If DenyPRead is true, calls to FileDescription.PRead() return ESPIPE.
DenyPRead bool
// If DenyPWrite is true, calls to FileDescription.PWrite() return
// ESPIPE.
DenyPWrite bool
// If UseDentryMetadata is true, calls to FileDescription methods that
// interact with file and filesystem metadata (Stat, SetStat, StatFS,
// Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling
// the corresponding FilesystemImpl methods instead of the corresponding
// FileDescriptionImpl methods.
//
// UseDentryMetadata is intended for file descriptions that are implemented
// outside of individual filesystems, such as pipes, sockets, and device
// special files. FileDescriptions for which UseDentryMetadata is true may
// embed DentryMetadataFileDescriptionImpl to obtain appropriate
// implementations of FileDescriptionImpl methods that should not be
// called.
UseDentryMetadata bool
}
Add //pkg/sentry/fsimpl/overlay. Major differences from existing overlay filesystems: - Linux allows lower layers in an overlay to require revalidation, but not the upper layer. VFS1 allows the upper layer in an overlay to require revalidation, but not the lower layer. VFS2 does not allow any layers to require revalidation. (Now that vfs.MkdirOptions.ForSyntheticMountpoint exists, no uses of overlay in VFS1 are believed to require upper layer revalidation; in particular, the requirement that the upper layer support the creation of "trusted." extended attributes for whiteouts effectively required the upper filesystem to be tmpfs in most cases.) - Like VFS1, but unlike Linux, VFS2 overlay does not attempt to make mutations of the upper layer atomic using a working directory and features like RENAME_WHITEOUT. (This may change in the future, since not having a working directory makes error recovery for some operations, e.g. rmdir, particularly painful.) - Like Linux, but unlike VFS1, VFS2 represents whiteouts using character devices with rdev == 0; the equivalent of the whiteout attribute on directories is xattr trusted.overlay.opaque = "y"; and there is no equivalent to the whiteout attribute on non-directories since non-directories are never merged with lower layers. - Device and inode numbers work as follows: - In Linux, modulo the xino feature and a special case for when all layers are the same filesystem: - Directories use the overlay filesystem's device number and an ephemeral inode number assigned by the overlay. - Non-directories that have been copied up use the device and inode number assigned by the upper filesystem. - Non-directories that have not been copied up use a per-(overlay, layer)-pair device number and the inode number assigned by the lower filesystem. - In VFS1, device and inode numbers always come from the lower layer unless "whited out"; this has the adverse effect of requiring interaction with the lower filesystem even for non-directory files that exist on the upper layer. - In VFS2, device and inode numbers are assigned as in Linux, except that xino and the samefs special case are not supported. - Like Linux, but unlike VFS1, VFS2 does not attempt to maintain memory mapping coherence across copy-up. (This may have to change in the future, as users may be dependent on this property.) - Like Linux, but unlike VFS1, VFS2 uses the overlayfs mounter's credentials when interacting with the overlay's layers, rather than the caller's. - Like Linux, but unlike VFS1, VFS2 permits multiple lower layers in an overlay. - Like Linux, but unlike VFS1, VFS2's overlay filesystem is application-mountable. Updates #1199 PiperOrigin-RevId: 316019067
2020-06-12 01:33:35 +00:00
// FileCreationFlags are the set of flags passed to FileDescription.Init() but
// omitted from FileDescription.StatusFlags().
const FileCreationFlags = linux.O_CREAT | linux.O_EXCL | linux.O_NOCTTY | linux.O_TRUNC
// Init must be called before first use of fd. If it succeeds, it takes
// references on mnt and d. flags is the initial file description flags, which
// is usually the full set of flags passed to open(2).
func (fd *FileDescription) Init(impl FileDescriptionImpl, flags uint32, mnt *Mount, d *Dentry, opts *FileDescriptionOptions) error {
writable := MayWriteFileWithOpenFlags(flags)
if writable {
if err := mnt.CheckBeginWrite(); err != nil {
return err
}
}
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
fd.refs = 1
// Remove "file creation flags" to mirror the behavior from file.f_flags in
Add //pkg/sentry/fsimpl/overlay. Major differences from existing overlay filesystems: - Linux allows lower layers in an overlay to require revalidation, but not the upper layer. VFS1 allows the upper layer in an overlay to require revalidation, but not the lower layer. VFS2 does not allow any layers to require revalidation. (Now that vfs.MkdirOptions.ForSyntheticMountpoint exists, no uses of overlay in VFS1 are believed to require upper layer revalidation; in particular, the requirement that the upper layer support the creation of "trusted." extended attributes for whiteouts effectively required the upper filesystem to be tmpfs in most cases.) - Like VFS1, but unlike Linux, VFS2 overlay does not attempt to make mutations of the upper layer atomic using a working directory and features like RENAME_WHITEOUT. (This may change in the future, since not having a working directory makes error recovery for some operations, e.g. rmdir, particularly painful.) - Like Linux, but unlike VFS1, VFS2 represents whiteouts using character devices with rdev == 0; the equivalent of the whiteout attribute on directories is xattr trusted.overlay.opaque = "y"; and there is no equivalent to the whiteout attribute on non-directories since non-directories are never merged with lower layers. - Device and inode numbers work as follows: - In Linux, modulo the xino feature and a special case for when all layers are the same filesystem: - Directories use the overlay filesystem's device number and an ephemeral inode number assigned by the overlay. - Non-directories that have been copied up use the device and inode number assigned by the upper filesystem. - Non-directories that have not been copied up use a per-(overlay, layer)-pair device number and the inode number assigned by the lower filesystem. - In VFS1, device and inode numbers always come from the lower layer unless "whited out"; this has the adverse effect of requiring interaction with the lower filesystem even for non-directory files that exist on the upper layer. - In VFS2, device and inode numbers are assigned as in Linux, except that xino and the samefs special case are not supported. - Like Linux, but unlike VFS1, VFS2 does not attempt to maintain memory mapping coherence across copy-up. (This may have to change in the future, as users may be dependent on this property.) - Like Linux, but unlike VFS1, VFS2 uses the overlayfs mounter's credentials when interacting with the overlay's layers, rather than the caller's. - Like Linux, but unlike VFS1, VFS2 permits multiple lower layers in an overlay. - Like Linux, but unlike VFS1, VFS2's overlay filesystem is application-mountable. Updates #1199 PiperOrigin-RevId: 316019067
2020-06-12 01:33:35 +00:00
// fs/open.c:do_dentry_open.
fd.statusFlags = flags &^ FileCreationFlags
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
fd.vd = VirtualDentry{
mount: mnt,
dentry: d,
}
Remove filesystem structure from vfs.Dentry. This change: - Drastically simplifies the synchronization model: filesystem structure is both implementation-defined and implementation-synchronized. - Allows implementations of vfs.DentryImpl to use implementation-specific dentry types, reducing casts during path traversal. - Doesn't require dentries representing non-directory files to waste space on a map of children. - Allows dentry revalidation and mount lookup to be correctly ordered (fixed FIXME in fsimpl/gofer/filesystem.go). - Removes the need to have two separate maps in gofer.dentry (dentry.vfsd.children and dentry.negativeChildren) for positive and negative lookups respectively. //pkg/sentry/fsimpl/tmpfs/benchmark_test.go: name old time/op new time/op delta VFS2TmpfsStat/1-112 172ns ± 4% 165ns ± 3% -4.08% (p=0.002 n=9+9) VFS2TmpfsStat/2-112 199ns ± 3% 195ns ±10% ~ (p=0.132 n=8+9) VFS2TmpfsStat/3-112 230ns ± 2% 216ns ± 2% -6.15% (p=0.000 n=8+8) VFS2TmpfsStat/8-112 390ns ± 2% 358ns ± 4% -8.33% (p=0.000 n=9+8) VFS2TmpfsStat/64-112 2.20µs ± 3% 2.01µs ± 3% -8.48% (p=0.000 n=10+8) VFS2TmpfsStat/100-112 3.42µs ± 9% 3.08µs ± 2% -9.82% (p=0.000 n=9+8) VFS2TmpfsMountStat/1-112 278ns ± 1% 286ns ±15% ~ (p=0.712 n=8+10) VFS2TmpfsMountStat/2-112 311ns ± 4% 298ns ± 2% -4.27% (p=0.000 n=9+8) VFS2TmpfsMountStat/3-112 339ns ± 3% 330ns ± 9% ~ (p=0.070 n=8+9) VFS2TmpfsMountStat/8-112 503ns ± 3% 466ns ± 3% -7.38% (p=0.000 n=8+8) VFS2TmpfsMountStat/64-112 2.53µs ±16% 2.17µs ± 7% -14.19% (p=0.000 n=10+9) VFS2TmpfsMountStat/100-112 3.60µs ± 4% 3.30µs ± 8% -8.33% (p=0.001 n=8+9) Updates #1035 PiperOrigin-RevId: 307655892
2020-04-21 19:16:42 +00:00
mnt.IncRef()
d.IncRef()
fd.opts = *opts
fd.readable = MayReadFileWithOpenFlags(flags)
fd.writable = writable
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
fd.impl = impl
return nil
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
}
// IncRef increments fd's reference count.
func (fd *FileDescription) IncRef() {
atomic.AddInt64(&fd.refs, 1)
}
Minor VFS2 interface changes. - Remove the Filesystem argument from DentryImpl.*Ref(); in general DentryImpls that need the Filesystem for reference counting will probably also need it for other interface methods that don't plumb Filesystem, so it's easier to just store a pointer to the filesystem in the DentryImpl. - Add a pointer to the VirtualFilesystem to Filesystem, which is needed by the gofer client to disown dentries for cache eviction triggered by dentry reference count changes. - Rename FilesystemType.NewFilesystem to GetFilesystem; in some cases (e.g. sysfs, cgroupfs) it's much cleaner for there to be only one Filesystem that is used by all mounts, and in at least one case (devtmpfs) it's visibly incorrect not to do so, so NewFilesystem doesn't always actually create and return a *new* Filesystem. - Require callers of FileDescription.Init() to increment Mount/Dentry references. This is because the gofer client may, in the OpenAt() path, take a reference on a dentry with 0 references, which is safe due to synchronization that is outside the scope of this CL, and it would be safer to still have its implementation of DentryImpl.IncRef() check for an increment for 0 references in other cases. - Add FileDescription.TryIncRef. This is used by the gofer client to take references on "special file descriptions" (FDs for files such as pipes, sockets, and devices), which use per-FD handles (fids) instead of dentry-shared handles, for sync() and syncfs(). PiperOrigin-RevId: 282473364
2019-11-26 02:09:15 +00:00
// TryIncRef increments fd's reference count and returns true. If fd's
// reference count is already zero, TryIncRef does nothing and returns false.
//
// TryIncRef does not require that a reference is held on fd.
func (fd *FileDescription) TryIncRef() bool {
for {
refs := atomic.LoadInt64(&fd.refs)
if refs <= 0 {
return false
}
if atomic.CompareAndSwapInt64(&fd.refs, refs, refs+1) {
return true
}
}
}
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
// DecRef decrements fd's reference count.
func (fd *FileDescription) DecRef() {
if refs := atomic.AddInt64(&fd.refs, -1); refs == 0 {
// Unregister fd from all epoll instances.
fd.epollMu.Lock()
epolls := fd.epolls
fd.epolls = nil
fd.epollMu.Unlock()
for epi := range epolls {
ep := epi.epoll
ep.interestMu.Lock()
// Check that epi has not been concurrently unregistered by
// EpollInstance.DeleteInterest() or EpollInstance.Release().
if _, ok := ep.interest[epi.key]; ok {
fd.EventUnregister(&epi.waiter)
ep.removeLocked(epi)
}
ep.interestMu.Unlock()
}
// If BSD locks were used, release any lock that it may have acquired.
if atomic.LoadUint32(&fd.usedLockBSD) != 0 {
fd.impl.UnlockBSD(context.Background(), fd)
}
// Release implementation resources.
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
fd.impl.Release()
if fd.writable {
fd.vd.mount.EndWrite()
}
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
fd.vd.DecRef()
} else if refs < 0 {
panic("FileDescription.DecRef() called without holding a reference")
}
}
// Refs returns the current number of references. The returned count
// is inherently racy and is unsafe to use without external synchronization.
func (fd *FileDescription) Refs() int64 {
return atomic.LoadInt64(&fd.refs)
}
// Mount returns the mount on which fd was opened. It does not take a reference
// on the returned Mount.
func (fd *FileDescription) Mount() *Mount {
return fd.vd.mount
}
// Dentry returns the dentry at which fd was opened. It does not take a
// reference on the returned Dentry.
func (fd *FileDescription) Dentry() *Dentry {
return fd.vd.dentry
}
// VirtualDentry returns the location at which fd was opened. It does not take
// a reference on the returned VirtualDentry.
func (fd *FileDescription) VirtualDentry() VirtualDentry {
return fd.vd
}
// Options returns the options passed to fd.Init().
func (fd *FileDescription) Options() FileDescriptionOptions {
return fd.opts
}
// StatusFlags returns file description status flags, as for fcntl(F_GETFL).
func (fd *FileDescription) StatusFlags() uint32 {
return atomic.LoadUint32(&fd.statusFlags)
}
// SetStatusFlags sets file description status flags, as for fcntl(F_SETFL).
func (fd *FileDescription) SetStatusFlags(ctx context.Context, creds *auth.Credentials, flags uint32) error {
// Compare Linux's fs/fcntl.c:setfl().
oldFlags := fd.StatusFlags()
// Linux documents this check as "O_APPEND cannot be cleared if the file is
// marked as append-only and the file is open for write", which would make
// sense. However, the check as actually implemented seems to be "O_APPEND
// cannot be changed if the file is marked as append-only".
if (flags^oldFlags)&linux.O_APPEND != 0 {
stat, err := fd.Stat(ctx, StatOptions{
// There is no mask bit for stx_attributes.
Mask: 0,
// Linux just reads inode::i_flags directly.
Sync: linux.AT_STATX_DONT_SYNC,
})
if err != nil {
return err
}
if (stat.AttributesMask&linux.STATX_ATTR_APPEND != 0) && (stat.Attributes&linux.STATX_ATTR_APPEND != 0) {
return syserror.EPERM
}
}
if (flags&linux.O_NOATIME != 0) && (oldFlags&linux.O_NOATIME == 0) {
stat, err := fd.Stat(ctx, StatOptions{
Mask: linux.STATX_UID,
// Linux's inode_owner_or_capable() just reads inode::i_uid
// directly.
Sync: linux.AT_STATX_DONT_SYNC,
})
if err != nil {
return err
}
if stat.Mask&linux.STATX_UID == 0 {
return syserror.EPERM
}
if !CanActAsOwner(creds, auth.KUID(stat.UID)) {
return syserror.EPERM
}
}
if flags&linux.O_DIRECT != 0 && !fd.opts.AllowDirectIO {
return syserror.EINVAL
}
// TODO(jamieliu): FileDescriptionImpl.SetOAsync()?
const settableFlags = linux.O_APPEND | linux.O_ASYNC | linux.O_DIRECT | linux.O_NOATIME | linux.O_NONBLOCK
atomic.StoreUint32(&fd.statusFlags, (oldFlags&^settableFlags)|(flags&settableFlags))
return nil
}
// IsReadable returns true if fd was opened for reading.
func (fd *FileDescription) IsReadable() bool {
return fd.readable
}
// IsWritable returns true if fd was opened for writing.
func (fd *FileDescription) IsWritable() bool {
return fd.writable
}
// Impl returns the FileDescriptionImpl associated with fd.
func (fd *FileDescription) Impl() FileDescriptionImpl {
return fd.impl
}
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
// FileDescriptionImpl contains implementation details for an FileDescription.
// Implementations of FileDescriptionImpl should contain their associated
// FileDescription by value as their first field.
//
// For all functions that return linux.Statx, Statx.Uid and Statx.Gid will
// be interpreted as IDs in the root UserNamespace (i.e. as auth.KUID and
// auth.KGID respectively).
//
// All methods may return errors not specified.
//
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
// FileDescriptionImpl is analogous to Linux's struct file_operations.
type FileDescriptionImpl interface {
// Release is called when the associated FileDescription reaches zero
// references.
Release()
// OnClose is called when a file descriptor representing the
// FileDescription is closed. Note that returning a non-nil error does not
// prevent the file descriptor from being closed.
OnClose(ctx context.Context) error
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
// Stat returns metadata for the file represented by the FileDescription.
Stat(ctx context.Context, opts StatOptions) (linux.Statx, error)
// SetStat updates metadata for the file represented by the
// FileDescription. Implementations are responsible for checking if the
// operation can be performed (see vfs.CheckSetStat() for common checks).
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
SetStat(ctx context.Context, opts SetStatOptions) error
// StatFS returns metadata for the filesystem containing the file
// represented by the FileDescription.
StatFS(ctx context.Context) (linux.Statfs, error)
// waiter.Waitable methods may be used to poll for I/O events.
waiter.Waitable
// PRead reads from the file into dst, starting at the given offset, and
// returns the number of bytes read. PRead is permitted to return partial
// reads with a nil error.
//
// Errors:
//
// - If opts.Flags specifies unsupported options, PRead returns EOPNOTSUPP.
//
// Preconditions: The FileDescription was opened for reading.
// FileDescriptionOptions.DenyPRead == false.
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error)
// Read is similar to PRead, but does not specify an offset.
//
// For files with an implicit FileDescription offset (e.g. regular files),
// Read begins at the FileDescription offset, and advances the offset by
// the number of bytes read; note that POSIX 2.9.7 "Thread Interactions
// with Regular File Operations" requires that all operations that may
// mutate the FileDescription offset are serialized.
//
// Errors:
//
// - If opts.Flags specifies unsupported options, Read returns EOPNOTSUPP.
//
// Preconditions: The FileDescription was opened for reading.
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error)
// PWrite writes src to the file, starting at the given offset, and returns
// the number of bytes written. PWrite is permitted to return partial
// writes with a nil error.
//
// As in Linux (but not POSIX), if O_APPEND is in effect for the
// FileDescription, PWrite should ignore the offset and append data to the
// end of the file.
//
// Errors:
//
// - If opts.Flags specifies unsupported options, PWrite returns
// EOPNOTSUPP.
//
// Preconditions: The FileDescription was opened for writing.
// FileDescriptionOptions.DenyPWrite == false.
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error)
// Write is similar to PWrite, but does not specify an offset, which is
// implied as for Read.
//
// Write is a FileDescriptionImpl method, instead of a wrapper around
// PWrite that uses a FileDescription offset, to make it possible for
// remote filesystems to implement O_APPEND correctly (i.e. atomically with
// respect to writers outside the scope of VFS).
//
// Errors:
//
// - If opts.Flags specifies unsupported options, Write returns EOPNOTSUPP.
//
// Preconditions: The FileDescription was opened for writing.
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error)
// IterDirents invokes cb on each entry in the directory represented by the
// FileDescription. If IterDirents has been called since the last call to
// Seek, it continues iteration from the end of the last call.
IterDirents(ctx context.Context, cb IterDirentsCallback) error
// Seek changes the FileDescription offset (assuming one exists) and
// returns its new value.
//
// For directories, if whence == SEEK_SET and offset == 0, the caller is
// rewinddir(), such that Seek "shall also cause the directory stream to
// refer to the current state of the corresponding directory" -
// POSIX.1-2017.
Seek(ctx context.Context, offset int64, whence int32) (int64, error)
// Sync requests that cached state associated with the file represented by
// the FileDescription is synchronized with persistent storage, and blocks
// until this is complete.
Sync(ctx context.Context) error
// ConfigureMMap mutates opts to implement mmap(2) for the file. Most
// implementations that support memory mapping can call
// GenericConfigureMMap with the appropriate memmap.Mappable.
ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
// Ioctl implements the ioctl(2) syscall.
Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error)
// Listxattr returns all extended attribute names for the file.
Listxattr(ctx context.Context, size uint64) ([]string, error)
// Getxattr returns the value associated with the given extended attribute
// for the file.
Getxattr(ctx context.Context, opts GetxattrOptions) (string, error)
// Setxattr changes the value associated with the given extended attribute
// for the file.
Setxattr(ctx context.Context, opts SetxattrOptions) error
// Removexattr removes the given extended attribute from the file.
Removexattr(ctx context.Context, name string) error
// LockBSD tries to acquire a BSD-style advisory file lock.
LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error
// UnlockBSD releases a BSD-style advisory file lock.
UnlockBSD(ctx context.Context, uid lock.UniqueID) error
// LockPOSIX tries to acquire a POSIX-style advisory file lock.
LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, length uint64, whence int16, block lock.Blocker) error
// UnlockPOSIX releases a POSIX-style advisory file lock.
UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, length uint64, whence int16) error
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
}
// Dirent holds the information contained in struct linux_dirent64.
type Dirent struct {
// Name is the filename.
Name string
// Type is the file type, a linux.DT_* constant.
Type uint8
// Ino is the inode number.
Ino uint64
// NextOff is the offset of the *next* Dirent in the directory; that is,
// FileDescription.Seek(NextOff, SEEK_SET) (as called by seekdir(3)) will
// cause the next call to FileDescription.IterDirents() to yield the next
// Dirent. (The offset of the first Dirent in a directory is always 0.)
NextOff int64
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
}
// IterDirentsCallback receives Dirents from FileDescriptionImpl.IterDirents.
type IterDirentsCallback interface {
// Handle handles the given iterated Dirent. If Handle returns a non-nil
// error, FileDescriptionImpl.IterDirents must stop iteration and return
// the error; the next call to FileDescriptionImpl.IterDirents should
// restart with the same Dirent.
Handle(dirent Dirent) error
Sentry virtual filesystem, v2 Major differences from the current ("v1") sentry VFS: - Path resolution is Filesystem-driven (FilesystemImpl methods call vfs.ResolvingPath methods) rather than VFS-driven (fs package owns a Dirent tree and calls fs.InodeOperations methods to populate it). This drastically improves performance, primarily by reducing overhead from inefficient synchronization and indirection. It also makes it possible to implement remote filesystem protocols that translate FS system calls into single RPCs, rather than having to make (at least) one RPC per path component, significantly reducing the latency of remote filesystems (especially during cold starts and for uncacheable shared filesystems). - Mounts are correctly represented as a separate check based on contextual state (current mount) rather than direct replacement in a fs.Dirent tree. This makes it possible to support (non-recursive) bind mounts and mount namespaces. Included in this CL is fsimpl/memfs, an incomplete in-memory filesystem that exists primarily to demonstrate intended filesystem implementation patterns and for benchmarking: BenchmarkVFS1TmpfsStat/1-6 3000000 497 ns/op BenchmarkVFS1TmpfsStat/2-6 2000000 676 ns/op BenchmarkVFS1TmpfsStat/3-6 2000000 904 ns/op BenchmarkVFS1TmpfsStat/8-6 1000000 1944 ns/op BenchmarkVFS1TmpfsStat/64-6 100000 14067 ns/op BenchmarkVFS1TmpfsStat/100-6 50000 21700 ns/op BenchmarkVFS2MemfsStat/1-6 10000000 197 ns/op BenchmarkVFS2MemfsStat/2-6 5000000 233 ns/op BenchmarkVFS2MemfsStat/3-6 5000000 268 ns/op BenchmarkVFS2MemfsStat/8-6 3000000 477 ns/op BenchmarkVFS2MemfsStat/64-6 500000 2592 ns/op BenchmarkVFS2MemfsStat/100-6 300000 4045 ns/op BenchmarkVFS1TmpfsMountStat/1-6 2000000 679 ns/op BenchmarkVFS1TmpfsMountStat/2-6 2000000 912 ns/op BenchmarkVFS1TmpfsMountStat/3-6 1000000 1113 ns/op BenchmarkVFS1TmpfsMountStat/8-6 1000000 2118 ns/op BenchmarkVFS1TmpfsMountStat/64-6 100000 14251 ns/op BenchmarkVFS1TmpfsMountStat/100-6 100000 22397 ns/op BenchmarkVFS2MemfsMountStat/1-6 5000000 317 ns/op BenchmarkVFS2MemfsMountStat/2-6 5000000 361 ns/op BenchmarkVFS2MemfsMountStat/3-6 5000000 387 ns/op BenchmarkVFS2MemfsMountStat/8-6 3000000 582 ns/op BenchmarkVFS2MemfsMountStat/64-6 500000 2699 ns/op BenchmarkVFS2MemfsMountStat/100-6 300000 4133 ns/op From this we can infer that, on this machine: - Constant cost for tmpfs stat() is ~160ns in VFS2 and ~280ns in VFS1. - Per-path-component cost is ~35ns in VFS2 and ~215ns in VFS1, a difference of about 6x. - The cost of crossing a mount boundary is about 80ns in VFS2 (MemfsMountStat/1 does approximately the same amount of work as MemfsStat/2, except that it also crosses a mount boundary). This is an inescapable cost of the separate mount lookup needed to support bind mounts and mount namespaces. PiperOrigin-RevId: 258853946
2019-07-18 22:09:14 +00:00
}
Add //pkg/sentry/fsimpl/overlay. Major differences from existing overlay filesystems: - Linux allows lower layers in an overlay to require revalidation, but not the upper layer. VFS1 allows the upper layer in an overlay to require revalidation, but not the lower layer. VFS2 does not allow any layers to require revalidation. (Now that vfs.MkdirOptions.ForSyntheticMountpoint exists, no uses of overlay in VFS1 are believed to require upper layer revalidation; in particular, the requirement that the upper layer support the creation of "trusted." extended attributes for whiteouts effectively required the upper filesystem to be tmpfs in most cases.) - Like VFS1, but unlike Linux, VFS2 overlay does not attempt to make mutations of the upper layer atomic using a working directory and features like RENAME_WHITEOUT. (This may change in the future, since not having a working directory makes error recovery for some operations, e.g. rmdir, particularly painful.) - Like Linux, but unlike VFS1, VFS2 represents whiteouts using character devices with rdev == 0; the equivalent of the whiteout attribute on directories is xattr trusted.overlay.opaque = "y"; and there is no equivalent to the whiteout attribute on non-directories since non-directories are never merged with lower layers. - Device and inode numbers work as follows: - In Linux, modulo the xino feature and a special case for when all layers are the same filesystem: - Directories use the overlay filesystem's device number and an ephemeral inode number assigned by the overlay. - Non-directories that have been copied up use the device and inode number assigned by the upper filesystem. - Non-directories that have not been copied up use a per-(overlay, layer)-pair device number and the inode number assigned by the lower filesystem. - In VFS1, device and inode numbers always come from the lower layer unless "whited out"; this has the adverse effect of requiring interaction with the lower filesystem even for non-directory files that exist on the upper layer. - In VFS2, device and inode numbers are assigned as in Linux, except that xino and the samefs special case are not supported. - Like Linux, but unlike VFS1, VFS2 does not attempt to maintain memory mapping coherence across copy-up. (This may have to change in the future, as users may be dependent on this property.) - Like Linux, but unlike VFS1, VFS2 uses the overlayfs mounter's credentials when interacting with the overlay's layers, rather than the caller's. - Like Linux, but unlike VFS1, VFS2 permits multiple lower layers in an overlay. - Like Linux, but unlike VFS1, VFS2's overlay filesystem is application-mountable. Updates #1199 PiperOrigin-RevId: 316019067
2020-06-12 01:33:35 +00:00
// IterDirentsCallbackFunc implements IterDirentsCallback for a function with
// the semantics of IterDirentsCallback.Handle.
type IterDirentsCallbackFunc func(dirent Dirent) error
// Handle implements IterDirentsCallback.Handle.
func (f IterDirentsCallbackFunc) Handle(dirent Dirent) error {
return f(dirent)
}
// OnClose is called when a file descriptor representing the FileDescription is
// closed. Returning a non-nil error should not prevent the file descriptor
// from being closed.
func (fd *FileDescription) OnClose(ctx context.Context) error {
return fd.impl.OnClose(ctx)
}
// Stat returns metadata for the file represented by fd.
func (fd *FileDescription) Stat(ctx context.Context, opts StatOptions) (linux.Statx, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
stat, err := fd.vd.mount.fs.impl.StatAt(ctx, rp, opts)
vfsObj.putResolvingPath(rp)
return stat, err
}
return fd.impl.Stat(ctx, opts)
}
// SetStat updates metadata for the file represented by fd.
func (fd *FileDescription) SetStat(ctx context.Context, opts SetStatOptions) error {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
err := fd.vd.mount.fs.impl.SetStatAt(ctx, rp, opts)
vfsObj.putResolvingPath(rp)
return err
}
return fd.impl.SetStat(ctx, opts)
}
// StatFS returns metadata for the filesystem containing the file represented
// by fd.
func (fd *FileDescription) StatFS(ctx context.Context) (linux.Statfs, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
statfs, err := fd.vd.mount.fs.impl.StatFSAt(ctx, rp)
vfsObj.putResolvingPath(rp)
return statfs, err
}
return fd.impl.StatFS(ctx)
}
// Readiness returns fd's I/O readiness.
func (fd *FileDescription) Readiness(mask waiter.EventMask) waiter.EventMask {
return fd.impl.Readiness(mask)
}
// EventRegister registers e for I/O readiness events in mask.
func (fd *FileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) {
fd.impl.EventRegister(e, mask)
}
// EventUnregister unregisters e for I/O readiness events.
func (fd *FileDescription) EventUnregister(e *waiter.Entry) {
fd.impl.EventUnregister(e)
}
// PRead reads from the file represented by fd into dst, starting at the given
// offset, and returns the number of bytes read. PRead is permitted to return
// partial reads with a nil error.
func (fd *FileDescription) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts ReadOptions) (int64, error) {
if fd.opts.DenyPRead {
return 0, syserror.ESPIPE
}
if !fd.readable {
return 0, syserror.EBADF
}
return fd.impl.PRead(ctx, dst, offset, opts)
}
// Read is similar to PRead, but does not specify an offset.
func (fd *FileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) {
if !fd.readable {
return 0, syserror.EBADF
}
return fd.impl.Read(ctx, dst, opts)
}
// PWrite writes src to the file represented by fd, starting at the given
// offset, and returns the number of bytes written. PWrite is permitted to
// return partial writes with a nil error.
func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts WriteOptions) (int64, error) {
if fd.opts.DenyPWrite {
return 0, syserror.ESPIPE
}
if !fd.writable {
return 0, syserror.EBADF
}
return fd.impl.PWrite(ctx, src, offset, opts)
}
// Write is similar to PWrite, but does not specify an offset.
func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) {
if !fd.writable {
return 0, syserror.EBADF
}
return fd.impl.Write(ctx, src, opts)
}
// IterDirents invokes cb on each entry in the directory represented by fd. If
// IterDirents has been called since the last call to Seek, it continues
// iteration from the end of the last call.
func (fd *FileDescription) IterDirents(ctx context.Context, cb IterDirentsCallback) error {
return fd.impl.IterDirents(ctx, cb)
}
// Seek changes fd's offset (assuming one exists) and returns its new value.
func (fd *FileDescription) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
return fd.impl.Seek(ctx, offset, whence)
}
// Sync has the semantics of fsync(2).
func (fd *FileDescription) Sync(ctx context.Context) error {
return fd.impl.Sync(ctx)
}
// ConfigureMMap mutates opts to implement mmap(2) for the file represented by
// fd.
func (fd *FileDescription) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
return fd.impl.ConfigureMMap(ctx, opts)
}
// Ioctl implements the ioctl(2) syscall.
func (fd *FileDescription) Ioctl(ctx context.Context, uio usermem.IO, args arch.SyscallArguments) (uintptr, error) {
return fd.impl.Ioctl(ctx, uio, args)
}
// Listxattr returns all extended attribute names for the file represented by
// fd.
//
// If the size of the list (including a NUL terminating byte after every entry)
// would exceed size, ERANGE may be returned. Note that implementations
// are free to ignore size entirely and return without error). In all cases,
// if size is 0, the list should be returned without error, regardless of size.
func (fd *FileDescription) Listxattr(ctx context.Context, size uint64) ([]string, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
names, err := fd.vd.mount.fs.impl.ListxattrAt(ctx, rp, size)
vfsObj.putResolvingPath(rp)
return names, err
}
names, err := fd.impl.Listxattr(ctx, size)
if err == syserror.ENOTSUP {
// Linux doesn't actually return ENOTSUP in this case; instead,
// fs/xattr.c:vfs_listxattr() falls back to allowing the security
// subsystem to return security extended attributes, which by default
// don't exist.
return nil, nil
}
return names, err
}
// Getxattr returns the value associated with the given extended attribute for
// the file represented by fd.
//
// If the size of the return value exceeds opts.Size, ERANGE may be returned
// (note that implementations are free to ignore opts.Size entirely and return
// without error). In all cases, if opts.Size is 0, the value should be
// returned without error, regardless of size.
func (fd *FileDescription) Getxattr(ctx context.Context, opts *GetxattrOptions) (string, error) {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
val, err := fd.vd.mount.fs.impl.GetxattrAt(ctx, rp, *opts)
vfsObj.putResolvingPath(rp)
return val, err
}
return fd.impl.Getxattr(ctx, *opts)
}
// Setxattr changes the value associated with the given extended attribute for
// the file represented by fd.
func (fd *FileDescription) Setxattr(ctx context.Context, opts *SetxattrOptions) error {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
err := fd.vd.mount.fs.impl.SetxattrAt(ctx, rp, *opts)
vfsObj.putResolvingPath(rp)
return err
}
return fd.impl.Setxattr(ctx, *opts)
}
// Removexattr removes the given extended attribute from the file represented
// by fd.
func (fd *FileDescription) Removexattr(ctx context.Context, name string) error {
if fd.opts.UseDentryMetadata {
vfsObj := fd.vd.mount.vfs
rp := vfsObj.getResolvingPath(auth.CredentialsFromContext(ctx), &PathOperation{
Root: fd.vd,
Start: fd.vd,
})
err := fd.vd.mount.fs.impl.RemovexattrAt(ctx, rp, name)
vfsObj.putResolvingPath(rp)
return err
}
return fd.impl.Removexattr(ctx, name)
}
// SyncFS instructs the filesystem containing fd to execute the semantics of
// syncfs(2).
func (fd *FileDescription) SyncFS(ctx context.Context) error {
return fd.vd.mount.fs.impl.Sync(ctx)
}
// MappedName implements memmap.MappingIdentity.MappedName.
func (fd *FileDescription) MappedName(ctx context.Context) string {
vfsroot := RootFromContext(ctx)
s, _ := fd.vd.mount.vfs.PathnameWithDeleted(ctx, vfsroot, fd.vd)
if vfsroot.Ok() {
vfsroot.DecRef()
}
return s
}
// DeviceID implements memmap.MappingIdentity.DeviceID.
func (fd *FileDescription) DeviceID() uint64 {
stat, err := fd.Stat(context.Background(), StatOptions{
// There is no STATX_DEV; we assume that Stat will return it if it's
// available regardless of mask.
Mask: 0,
// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_sb->s_dev
// directly.
Sync: linux.AT_STATX_DONT_SYNC,
})
if err != nil {
return 0
}
return uint64(linux.MakeDeviceID(uint16(stat.DevMajor), stat.DevMinor))
}
// InodeID implements memmap.MappingIdentity.InodeID.
func (fd *FileDescription) InodeID() uint64 {
stat, err := fd.Stat(context.Background(), StatOptions{
Mask: linux.STATX_INO,
// fs/proc/task_mmu.c:show_map_vma() just reads inode::i_ino directly.
Sync: linux.AT_STATX_DONT_SYNC,
})
if err != nil || stat.Mask&linux.STATX_INO == 0 {
return 0
}
return stat.Ino
}
// Msync implements memmap.MappingIdentity.Msync.
func (fd *FileDescription) Msync(ctx context.Context, mr memmap.MappableRange) error {
return fd.Sync(ctx)
}
// LockBSD tries to acquire a BSD-style advisory file lock.
func (fd *FileDescription) LockBSD(ctx context.Context, lockType lock.LockType, blocker lock.Blocker) error {
atomic.StoreUint32(&fd.usedLockBSD, 1)
return fd.impl.LockBSD(ctx, fd, lockType, blocker)
}
// UnlockBSD releases a BSD-style advisory file lock.
func (fd *FileDescription) UnlockBSD(ctx context.Context) error {
return fd.impl.UnlockBSD(ctx, fd)
}
// LockPOSIX locks a POSIX-style file range lock.
func (fd *FileDescription) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, start, end uint64, whence int16, block lock.Blocker) error {
return fd.impl.LockPOSIX(ctx, uid, t, start, end, whence, block)
}
// UnlockPOSIX unlocks a POSIX-style file range lock.
func (fd *FileDescription) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, start, end uint64, whence int16) error {
return fd.impl.UnlockPOSIX(ctx, uid, start, end, whence)
}