gvisor/pkg/sentry/fsimpl/kernfs/filesystem.go

692 lines
19 KiB
Go
Raw Normal View History

// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This file implements vfs.FilesystemImpl for kernfs.
package kernfs
import (
"fmt"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fspath"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
)
// stepExistingLocked resolves rp.Component() in parent directory vfsd.
//
// stepExistingLocked is loosely analogous to fs/namei.c:walk_component().
//
// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
//
// Postcondition: Caller must call fs.processDeferredDecRefs*.
func (fs *Filesystem) stepExistingLocked(ctx context.Context, rp *vfs.ResolvingPath, vfsd *vfs.Dentry) (*vfs.Dentry, error) {
d := vfsd.Impl().(*Dentry)
if !d.isDir() {
return nil, syserror.ENOTDIR
}
// Directory searchable?
if err := d.inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
afterSymlink:
d.dirMu.Lock()
nextVFSD, err := rp.ResolveComponent(vfsd)
d.dirMu.Unlock()
if err != nil {
return nil, err
}
if nextVFSD != nil {
// Cached dentry exists, revalidate.
next := nextVFSD.Impl().(*Dentry)
if !next.inode.Valid(ctx) {
d.dirMu.Lock()
rp.VirtualFilesystem().ForceDeleteDentry(nextVFSD)
d.dirMu.Unlock()
fs.deferDecRef(nextVFSD) // Reference from Lookup.
nextVFSD = nil
}
}
if nextVFSD == nil {
// Dentry isn't cached; it either doesn't exist or failed
// revalidation. Attempt to resolve it via Lookup.
name := rp.Component()
var err error
nextVFSD, err = d.inode.Lookup(ctx, name)
// Reference on nextVFSD dropped by a corresponding Valid.
if err != nil {
return nil, err
}
d.InsertChild(name, nextVFSD)
}
next := nextVFSD.Impl().(*Dentry)
// Resolve any symlink at current path component.
if rp.ShouldFollowSymlink() && d.isSymlink() {
// TODO: VFS2 needs something extra for /proc/[pid]/fd/ "magic symlinks".
target, err := next.inode.Readlink(ctx)
if err != nil {
return nil, err
}
if err := rp.HandleSymlink(target); err != nil {
return nil, err
}
goto afterSymlink
}
rp.Advance()
return nextVFSD, nil
}
// walkExistingLocked resolves rp to an existing file.
//
// walkExistingLocked is loosely analogous to Linux's
// fs/namei.c:path_lookupat().
//
// Preconditions: Filesystem.mu must be locked for at least reading.
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
func (fs *Filesystem) walkExistingLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
vfsd := rp.Start()
for !rp.Done() {
var err error
vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
if err != nil {
return nil, nil, err
}
}
d := vfsd.Impl().(*Dentry)
if rp.MustBeDir() && !d.isDir() {
return nil, nil, syserror.ENOTDIR
}
return vfsd, d.inode, nil
}
// walkParentDirLocked resolves all but the last path component of rp to an
// existing directory. It does not check that the returned directory is
// searchable by the provider of rp.
//
// walkParentDirLocked is loosely analogous to Linux's
// fs/namei.c:path_parentat().
//
// Preconditions: Filesystem.mu must be locked for at least reading. !rp.Done().
//
// Postconditions: Caller must call fs.processDeferredDecRefs*.
func (fs *Filesystem) walkParentDirLocked(ctx context.Context, rp *vfs.ResolvingPath) (*vfs.Dentry, Inode, error) {
vfsd := rp.Start()
for !rp.Final() {
var err error
vfsd, err = fs.stepExistingLocked(ctx, rp, vfsd)
if err != nil {
return nil, nil, err
}
}
d := vfsd.Impl().(*Dentry)
if !d.isDir() {
return nil, nil, syserror.ENOTDIR
}
return vfsd, d.inode, nil
}
// checkCreateLocked checks that a file named rp.Component() may be created in
// directory parentVFSD, then returns rp.Component().
//
// Preconditions: Filesystem.mu must be locked for at least reading. parentInode
// == parentVFSD.Impl().(*Dentry).Inode. isDir(parentInode) == true.
func checkCreateLocked(rp *vfs.ResolvingPath, parentVFSD *vfs.Dentry, parentInode Inode) (string, error) {
if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
return "", err
}
pc := rp.Component()
if pc == "." || pc == ".." {
return "", syserror.EEXIST
}
childVFSD, err := rp.ResolveChild(parentVFSD, pc)
if err != nil {
return "", err
}
if childVFSD != nil {
return "", syserror.EEXIST
}
if parentVFSD.IsDisowned() {
return "", syserror.ENOENT
}
return pc, nil
}
// checkDeleteLocked checks that the file represented by vfsd may be deleted.
//
// Preconditions: Filesystem.mu must be locked for at least reading.
func checkDeleteLocked(rp *vfs.ResolvingPath, vfsd *vfs.Dentry) error {
parentVFSD := vfsd.Parent()
if parentVFSD == nil {
return syserror.EBUSY
}
if parentVFSD.IsDisowned() {
return syserror.ENOENT
}
if err := parentVFSD.Impl().(*Dentry).inode.CheckPermissions(rp.Credentials(), vfs.MayWrite|vfs.MayExec); err != nil {
return err
}
return nil
}
// checkRenameLocked checks that a rename operation may be performed on the
// target dentry across the given set of parent directories. The target dentry
// may be nil.
//
// Precondition: isDir(dstInode) == true.
func checkRenameLocked(creds *auth.Credentials, src, dstDir *vfs.Dentry, dstInode Inode) error {
srcDir := src.Parent()
if srcDir == nil {
return syserror.EBUSY
}
if srcDir.IsDisowned() {
return syserror.ENOENT
}
if dstDir.IsDisowned() {
return syserror.ENOENT
}
// Check for creation permissions on dst dir.
if err := dstInode.CheckPermissions(creds, vfs.MayWrite|vfs.MayExec); err != nil {
return err
}
return nil
}
// Release implements vfs.FilesystemImpl.Release.
func (fs *Filesystem) Release() {
}
// Sync implements vfs.FilesystemImpl.Sync.
func (fs *Filesystem) Sync(ctx context.Context) error {
// All filesystem state is in-memory.
return nil
}
// GetDentryAt implements vfs.FilesystemImpl.GetDentryAt.
func (fs *Filesystem) GetDentryAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.GetDentryOptions) (*vfs.Dentry, error) {
fs.mu.RLock()
defer fs.processDeferredDecRefs()
defer fs.mu.RUnlock()
vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return nil, err
}
if opts.CheckSearchable {
d := vfsd.Impl().(*Dentry)
if !d.isDir() {
return nil, syserror.ENOTDIR
}
if err := inode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
}
vfsd.IncRef() // Ownership transferred to caller.
return vfsd, nil
}
// LinkAt implements vfs.FilesystemImpl.LinkAt.
func (fs *Filesystem) LinkAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry) error {
if rp.Done() {
return syserror.EEXIST
}
fs.mu.Lock()
defer fs.mu.Unlock()
parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked()
if err != nil {
return err
}
pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
if err != nil {
return err
}
if rp.Mount() != vd.Mount() {
return syserror.EXDEV
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
return err
}
defer rp.Mount().EndWrite()
d := vd.Dentry().Impl().(*Dentry)
if d.isDir() {
return syserror.EPERM
}
child, err := parentInode.NewLink(ctx, pc, d.inode)
if err != nil {
return err
}
parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
return nil
}
// MkdirAt implements vfs.FilesystemImpl.MkdirAt.
func (fs *Filesystem) MkdirAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MkdirOptions) error {
if rp.Done() {
return syserror.EEXIST
}
fs.mu.Lock()
defer fs.mu.Unlock()
parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked()
if err != nil {
return err
}
pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
if err != nil {
return err
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
return err
}
defer rp.Mount().EndWrite()
child, err := parentInode.NewDir(ctx, pc, opts)
if err != nil {
return err
}
parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
return nil
}
// MknodAt implements vfs.FilesystemImpl.MknodAt.
func (fs *Filesystem) MknodAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.MknodOptions) error {
if rp.Done() {
return syserror.EEXIST
}
fs.mu.Lock()
defer fs.mu.Unlock()
parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked()
if err != nil {
return err
}
pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
if err != nil {
return err
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
return err
}
defer rp.Mount().EndWrite()
new, err := parentInode.NewNode(ctx, pc, opts)
if err != nil {
return err
}
parentVFSD.Impl().(*Dentry).InsertChild(pc, new)
return nil
}
// OpenAt implements vfs.FilesystemImpl.OpenAt.
func (fs *Filesystem) OpenAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.OpenOptions) (*vfs.FileDescription, error) {
// Filter out flags that are not supported by kernfs. O_DIRECTORY and
// O_NOFOLLOW have no effect here (they're handled by VFS by setting
// appropriate bits in rp), but are returned by
// FileDescriptionImpl.StatusFlags().
opts.Flags &= linux.O_ACCMODE | linux.O_CREAT | linux.O_EXCL | linux.O_TRUNC | linux.O_DIRECTORY | linux.O_NOFOLLOW
ats := vfs.AccessTypesForOpenFlags(opts.Flags)
// Do not create new file.
if opts.Flags&linux.O_CREAT == 0 {
fs.mu.RLock()
defer fs.processDeferredDecRefs()
defer fs.mu.RUnlock()
vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
if err != nil {
return nil, err
}
if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil {
return nil, err
}
return inode.Open(rp, vfsd, opts.Flags)
}
// May create new file.
mustCreate := opts.Flags&linux.O_EXCL != 0
vfsd := rp.Start()
inode := vfsd.Impl().(*Dentry).inode
fs.mu.Lock()
defer fs.mu.Unlock()
if rp.Done() {
if rp.MustBeDir() {
return nil, syserror.EISDIR
}
if mustCreate {
return nil, syserror.EEXIST
}
if err := inode.CheckPermissions(rp.Credentials(), ats); err != nil {
return nil, err
}
return inode.Open(rp, vfsd, opts.Flags)
}
afterTrailingSymlink:
parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked()
if err != nil {
return nil, err
}
// Check for search permission in the parent directory.
if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayExec); err != nil {
return nil, err
}
// Reject attempts to open directories with O_CREAT.
if rp.MustBeDir() {
return nil, syserror.EISDIR
}
pc := rp.Component()
if pc == "." || pc == ".." {
return nil, syserror.EISDIR
}
// Determine whether or not we need to create a file.
childVFSD, err := rp.ResolveChild(parentVFSD, pc)
if err != nil {
return nil, err
}
if childVFSD == nil {
// Already checked for searchability above; now check for writability.
if err := parentInode.CheckPermissions(rp.Credentials(), vfs.MayWrite); err != nil {
return nil, err
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
return nil, err
}
defer rp.Mount().EndWrite()
// Create and open the child.
child, err := parentInode.NewFile(ctx, pc, opts)
if err != nil {
return nil, err
}
parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
return child.Impl().(*Dentry).inode.Open(rp, child, opts.Flags)
}
// Open existing file or follow symlink.
if mustCreate {
return nil, syserror.EEXIST
}
childDentry := childVFSD.Impl().(*Dentry)
childInode := childDentry.inode
if rp.ShouldFollowSymlink() {
if childDentry.isSymlink() {
target, err := childInode.Readlink(ctx)
if err != nil {
return nil, err
}
if err := rp.HandleSymlink(target); err != nil {
return nil, err
}
// rp.Final() may no longer be true since we now need to resolve the
// symlink target.
goto afterTrailingSymlink
}
}
if err := childInode.CheckPermissions(rp.Credentials(), ats); err != nil {
return nil, err
}
return childInode.Open(rp, childVFSD, opts.Flags)
}
// ReadlinkAt implements vfs.FilesystemImpl.ReadlinkAt.
func (fs *Filesystem) ReadlinkAt(ctx context.Context, rp *vfs.ResolvingPath) (string, error) {
fs.mu.RLock()
d, inode, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs()
if err != nil {
return "", err
}
if !d.Impl().(*Dentry).isSymlink() {
return "", syserror.EINVAL
}
return inode.Readlink(ctx)
}
// RenameAt implements vfs.FilesystemImpl.RenameAt.
func (fs *Filesystem) RenameAt(ctx context.Context, rp *vfs.ResolvingPath, vd vfs.VirtualDentry, opts vfs.RenameOptions) error {
noReplace := opts.Flags&linux.RENAME_NOREPLACE != 0
exchange := opts.Flags&linux.RENAME_EXCHANGE != 0
whiteout := opts.Flags&linux.RENAME_WHITEOUT != 0
if exchange && (noReplace || whiteout) {
// Can't specify RENAME_NOREPLACE or RENAME_WHITEOUT with RENAME_EXCHANGE.
return syserror.EINVAL
}
if exchange || whiteout {
// Exchange and Whiteout flags are not supported on kernfs.
return syserror.EINVAL
}
fs.mu.Lock()
defer fs.mu.Lock()
mnt := rp.Mount()
if mnt != vd.Mount() {
return syserror.EXDEV
}
if err := mnt.CheckBeginWrite(); err != nil {
return err
}
defer mnt.EndWrite()
dstDirVFSD, dstDirInode, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked()
if err != nil {
return err
}
srcVFSD := vd.Dentry()
srcDirVFSD := srcVFSD.Parent()
// Can we remove the src dentry?
if err := checkDeleteLocked(rp, srcVFSD); err != nil {
return err
}
// Can we create the dst dentry?
var dstVFSD *vfs.Dentry
pc, err := checkCreateLocked(rp, dstDirVFSD, dstDirInode)
switch err {
case nil:
// Ok, continue with rename as replacement.
case syserror.EEXIST:
if noReplace {
// Won't overwrite existing node since RENAME_NOREPLACE was requested.
return syserror.EEXIST
}
dstVFSD, err = rp.ResolveChild(dstDirVFSD, pc)
if err != nil {
panic(fmt.Sprintf("Child %q for parent Dentry %+v disappeared inside atomic section?", pc, dstDirVFSD))
}
default:
return err
}
mntns := vfs.MountNamespaceFromContext(ctx)
virtfs := rp.VirtualFilesystem()
srcDirDentry := srcDirVFSD.Impl().(*Dentry)
dstDirDentry := dstDirVFSD.Impl().(*Dentry)
// We can't deadlock here due to lock ordering because we're protected from
// concurrent renames by fs.mu held for writing.
srcDirDentry.dirMu.Lock()
defer srcDirDentry.dirMu.Unlock()
dstDirDentry.dirMu.Lock()
defer dstDirDentry.dirMu.Unlock()
if err := virtfs.PrepareRenameDentry(mntns, srcVFSD, dstVFSD); err != nil {
return err
}
srcDirInode := srcDirDentry.inode
replaced, err := srcDirInode.Rename(ctx, srcVFSD.Name(), pc, srcVFSD, dstDirVFSD)
if err != nil {
virtfs.AbortRenameDentry(srcVFSD, dstVFSD)
return err
}
virtfs.CommitRenameReplaceDentry(srcVFSD, dstDirVFSD, pc, replaced)
return nil
}
// RmdirAt implements vfs.FilesystemImpl.RmdirAt.
func (fs *Filesystem) RmdirAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
vfsd, inode, err := fs.walkExistingLocked(ctx, rp)
fs.processDeferredDecRefsLocked()
if err != nil {
return err
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
return err
}
defer rp.Mount().EndWrite()
if err := checkDeleteLocked(rp, vfsd); err != nil {
return err
}
if !vfsd.Impl().(*Dentry).isDir() {
return syserror.ENOTDIR
}
if inode.HasChildren() {
return syserror.ENOTEMPTY
}
virtfs := rp.VirtualFilesystem()
parentDentry := vfsd.Parent().Impl().(*Dentry)
parentDentry.dirMu.Lock()
defer parentDentry.dirMu.Unlock()
if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
return err
}
if err := parentDentry.inode.RmDir(ctx, rp.Component(), vfsd); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
virtfs.CommitDeleteDentry(vfsd)
return nil
}
// SetStatAt implements vfs.FilesystemImpl.SetStatAt.
func (fs *Filesystem) SetStatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.SetStatOptions) error {
fs.mu.RLock()
_, inode, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs()
if err != nil {
return err
}
if opts.Stat.Mask == 0 {
return nil
}
return inode.SetStat(fs.VFSFilesystem(), opts)
}
// StatAt implements vfs.FilesystemImpl.StatAt.
func (fs *Filesystem) StatAt(ctx context.Context, rp *vfs.ResolvingPath, opts vfs.StatOptions) (linux.Statx, error) {
fs.mu.RLock()
_, inode, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs()
if err != nil {
return linux.Statx{}, err
}
return inode.Stat(fs.VFSFilesystem()), nil
}
// StatFSAt implements vfs.FilesystemImpl.StatFSAt.
func (fs *Filesystem) StatFSAt(ctx context.Context, rp *vfs.ResolvingPath) (linux.Statfs, error) {
fs.mu.RLock()
_, _, err := fs.walkExistingLocked(ctx, rp)
fs.mu.RUnlock()
fs.processDeferredDecRefs()
if err != nil {
return linux.Statfs{}, err
}
// TODO: actually implement statfs
return linux.Statfs{}, syserror.ENOSYS
}
// SymlinkAt implements vfs.FilesystemImpl.SymlinkAt.
func (fs *Filesystem) SymlinkAt(ctx context.Context, rp *vfs.ResolvingPath, target string) error {
if rp.Done() {
return syserror.EEXIST
}
fs.mu.Lock()
defer fs.mu.Unlock()
parentVFSD, parentInode, err := fs.walkParentDirLocked(ctx, rp)
fs.processDeferredDecRefsLocked()
if err != nil {
return err
}
pc, err := checkCreateLocked(rp, parentVFSD, parentInode)
if err != nil {
return err
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
return err
}
defer rp.Mount().EndWrite()
child, err := parentInode.NewSymlink(ctx, pc, target)
if err != nil {
return err
}
parentVFSD.Impl().(*Dentry).InsertChild(pc, child)
return nil
}
// UnlinkAt implements vfs.FilesystemImpl.UnlinkAt.
func (fs *Filesystem) UnlinkAt(ctx context.Context, rp *vfs.ResolvingPath) error {
fs.mu.Lock()
defer fs.mu.Unlock()
vfsd, _, err := fs.walkExistingLocked(ctx, rp)
fs.processDeferredDecRefsLocked()
if err != nil {
return err
}
if err := rp.Mount().CheckBeginWrite(); err != nil {
return err
}
defer rp.Mount().EndWrite()
if err := checkDeleteLocked(rp, vfsd); err != nil {
return err
}
if vfsd.Impl().(*Dentry).isDir() {
return syserror.EISDIR
}
virtfs := rp.VirtualFilesystem()
parentDentry := vfsd.Parent().Impl().(*Dentry)
parentDentry.dirMu.Lock()
defer parentDentry.dirMu.Unlock()
if err := virtfs.PrepareDeleteDentry(vfs.MountNamespaceFromContext(ctx), vfsd); err != nil {
return err
}
if err := parentDentry.inode.Unlink(ctx, rp.Component(), vfsd); err != nil {
virtfs.AbortDeleteDentry(vfsd)
return err
}
virtfs.CommitDeleteDentry(vfsd)
return nil
}
// PrependPath implements vfs.FilesystemImpl.PrependPath.
func (fs *Filesystem) PrependPath(ctx context.Context, vfsroot, vd vfs.VirtualDentry, b *fspath.Builder) error {
fs.mu.RLock()
defer fs.mu.RUnlock()
return vfs.GenericPrependPath(vfsroot, vd, b)
}