// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package kernfs provides the tools to implement inode-based filesystems. // Kernfs has two main features: // // 1. The Inode interface, which maps VFS2's path-based filesystem operations to // specific filesystem nodes. Kernfs uses the Inode interface to provide a // blanket implementation for the vfs.FilesystemImpl. Kernfs also serves as // the synchronization mechanism for all filesystem operations by holding a // filesystem-wide lock across all operations. // // 2. Various utility types which provide generic implementations for various // parts of the Inode and vfs.FileDescription interfaces. Client filesystems // based on kernfs can embed the appropriate set of these to avoid having to // reimplement common filesystem operations. See inode_impl_util.go and // fd_impl_util.go. // // Reference Model: // // Kernfs dentries represents named pointers to inodes. Dentries and inode have // independent lifetimes and reference counts. A child dentry unconditionally // holds a reference on its parent directory's dentry. A dentry also holds a // reference on the inode it points to. Multiple dentries can point to the same // inode (for example, in the case of hardlinks). File descriptors hold a // reference to the dentry they're opened on. // // Dentries are guaranteed to exist while holding Filesystem.mu for // reading. Dropping dentries require holding Filesystem.mu for writing. To // queue dentries for destruction from a read critical section, see // Filesystem.deferDecRef. // // Lock ordering: // // kernfs.Filesystem.mu // kernfs.Dentry.dirMu // vfs.VirtualFilesystem.mountMu // vfs.Dentry.mu // kernfs.Filesystem.droppedDentriesMu // (inode implementation locks, if any) package kernfs import ( "fmt" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/refs" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sync" ) // Filesystem mostly implements vfs.FilesystemImpl for a generic in-memory // filesystem. Concrete implementations are expected to embed this in their own // Filesystem type. type Filesystem struct { vfsfs vfs.Filesystem droppedDentriesMu sync.Mutex // droppedDentries is a list of dentries waiting to be DecRef()ed. This is // used to defer dentry destruction until mu can be acquired for // writing. Protected by droppedDentriesMu. droppedDentries []*vfs.Dentry // mu synchronizes the lifetime of Dentries on this filesystem. Holding it // for reading guarantees continued existence of any resolved dentries, but // the dentry tree may be modified. // // Kernfs dentries can only be DecRef()ed while holding mu for writing. For // example: // // fs.mu.Lock() // defer fs.mu.Unlock() // ... // dentry1.DecRef() // defer dentry2.DecRef() // Ok, will run before Unlock. // // If discarding dentries in a read context, use Filesystem.deferDecRef. For // example: // // fs.mu.RLock() // fs.mu.processDeferredDecRefs() // defer fs.mu.RUnlock() // ... // fs.deferDecRef(dentry) mu sync.RWMutex // nextInoMinusOne is used to to allocate inode numbers on this // filesystem. Must be accessed by atomic operations. nextInoMinusOne uint64 } // deferDecRef defers dropping a dentry ref until the next call to // processDeferredDecRefs{,Locked}. See comment on Filesystem.mu. // // Precondition: d must not already be pending destruction. func (fs *Filesystem) deferDecRef(d *vfs.Dentry) { fs.droppedDentriesMu.Lock() fs.droppedDentries = append(fs.droppedDentries, d) fs.droppedDentriesMu.Unlock() } // processDeferredDecRefs calls vfs.Dentry.DecRef on all dentries in the // droppedDentries list. See comment on Filesystem.mu. func (fs *Filesystem) processDeferredDecRefs() { fs.mu.Lock() fs.processDeferredDecRefsLocked() fs.mu.Unlock() } // Precondition: fs.mu must be held for writing. func (fs *Filesystem) processDeferredDecRefsLocked() { fs.droppedDentriesMu.Lock() for _, d := range fs.droppedDentries { d.DecRef() } fs.droppedDentries = fs.droppedDentries[:0] // Keep slice memory for reuse. fs.droppedDentriesMu.Unlock() } // Init initializes a kernfs filesystem. This should be called from during // vfs.FilesystemType.NewFilesystem for the concrete filesystem embedding // kernfs. func (fs *Filesystem) Init(vfsObj *vfs.VirtualFilesystem, fsType vfs.FilesystemType) { fs.vfsfs.Init(vfsObj, fsType, fs) } // VFSFilesystem returns the generic vfs filesystem object. func (fs *Filesystem) VFSFilesystem() *vfs.Filesystem { return &fs.vfsfs } // NextIno allocates a new inode number on this filesystem. func (fs *Filesystem) NextIno() uint64 { return atomic.AddUint64(&fs.nextInoMinusOne, 1) } // These consts are used in the Dentry.flags field. const ( // Dentry points to a directory inode. dflagsIsDir = 1 << iota // Dentry points to a symlink inode. dflagsIsSymlink ) // Dentry implements vfs.DentryImpl. // // A kernfs dentry is similar to a dentry in a traditional filesystem: it's a // named reference to an inode. A dentry generally lives as long as it's part of // a mounted filesystem tree. Kernfs doesn't cache dentries once all references // to them are removed. Dentries hold a single reference to the inode they point // to, and child dentries hold a reference on their parent. // // Must be initialized by Init prior to first use. type Dentry struct { refs.AtomicRefCount vfsd vfs.Dentry inode Inode // flags caches useful information about the dentry from the inode. See the // dflags* consts above. Must be accessed by atomic ops. flags uint32 // dirMu protects vfsd.children for directory dentries. dirMu sync.Mutex } // Init initializes this dentry. // // Precondition: Caller must hold a reference on inode. // // Postcondition: Caller's reference on inode is transferred to the dentry. func (d *Dentry) Init(inode Inode) { d.vfsd.Init(d) d.inode = inode ftype := inode.Mode().FileType() if ftype == linux.ModeDirectory { d.flags |= dflagsIsDir } if ftype == linux.ModeSymlink { d.flags |= dflagsIsSymlink } } // VFSDentry returns the generic vfs dentry for this kernfs dentry. func (d *Dentry) VFSDentry() *vfs.Dentry { return &d.vfsd } // isDir checks whether the dentry points to a directory inode. func (d *Dentry) isDir() bool { return atomic.LoadUint32(&d.flags)&dflagsIsDir != 0 } // isSymlink checks whether the dentry points to a symlink inode. func (d *Dentry) isSymlink() bool { return atomic.LoadUint32(&d.flags)&dflagsIsSymlink != 0 } // DecRef implements vfs.DentryImpl.DecRef. func (d *Dentry) DecRef() { d.AtomicRefCount.DecRefWithDestructor(d.destroy) } // Precondition: Dentry must be removed from VFS' dentry cache. func (d *Dentry) destroy() { d.inode.DecRef() // IncRef from Init. d.inode = nil if parent := d.vfsd.Parent(); parent != nil { parent.DecRef() // IncRef from Dentry.InsertChild. } } // InsertChild inserts child into the vfs dentry cache with the given name under // this dentry. This does not update the directory inode, so calling this on // it's own isn't sufficient to insert a child into a directory. InsertChild // updates the link count on d if required. // // Precondition: d must represent a directory inode. func (d *Dentry) InsertChild(name string, child *vfs.Dentry) { d.dirMu.Lock() d.insertChildLocked(name, child) d.dirMu.Unlock() } // insertChildLocked is equivalent to InsertChild, with additional // preconditions. // // Precondition: d.dirMu must be locked. func (d *Dentry) insertChildLocked(name string, child *vfs.Dentry) { if !d.isDir() { panic(fmt.Sprintf("InsertChild called on non-directory Dentry: %+v.", d)) } vfsDentry := d.VFSDentry() vfsDentry.IncRef() // DecRef in child's Dentry.destroy. vfsDentry.InsertChild(child, name) } // The Inode interface maps filesystem-level operations that operate on paths to // equivalent operations on specific filesystem nodes. // // The interface methods are groups into logical categories as sub interfaces // below. Generally, an implementation for each sub interface can be provided by // embedding an appropriate type from inode_impl_utils.go. The sub interfaces // are purely organizational. Methods declared directly in the main interface // have no generic implementations, and should be explicitly provided by the // client filesystem. // // Generally, implementations are not responsible for tasks that are common to // all filesystems. These include: // // - Checking that dentries passed to methods are of the appropriate file type. // - Checking permissions. // - Updating link and reference counts. // // Specific responsibilities of implementations are documented below. type Inode interface { // Methods related to reference counting. A generic implementation is // provided by InodeNoopRefCount. These methods are generally called by the // equivalent Dentry methods. inodeRefs // Methods related to node metadata. A generic implementation is provided by // InodeAttrs. inodeMetadata // Method for inodes that represent symlink. InodeNotSymlink provides a // blanket implementation for all non-symlink inodes. inodeSymlink // Method for inodes that represent directories. InodeNotDirectory provides // a blanket implementation for all non-directory inodes. inodeDirectory // Method for inodes that represent dynamic directories and their // children. InodeNoDynamicLookup provides a blanket implementation for all // non-dynamic-directory inodes. inodeDynamicLookup // Open creates a file description for the filesystem object represented by // this inode. The returned file description should hold a reference on the // inode for its lifetime. // // Precondition: rp.Done(). vfsd.Impl() must be the kernfs Dentry containing // the inode on which Open() is being called. Open(rp *vfs.ResolvingPath, vfsd *vfs.Dentry, opts vfs.OpenOptions) (*vfs.FileDescription, error) } type inodeRefs interface { IncRef() DecRef() TryIncRef() bool // Destroy is called when the inode reaches zero references. Destroy release // all resources (references) on objects referenced by the inode, including // any child dentries. Destroy() } type inodeMetadata interface { // CheckPermissions checks that creds may access this inode for the // requested access type, per the the rules of // fs/namei.c:generic_permission(). CheckPermissions(ctx context.Context, creds *auth.Credentials, ats vfs.AccessTypes) error // Mode returns the (struct stat)::st_mode value for this inode. This is // separated from Stat for performance. Mode() linux.FileMode // Stat returns the metadata for this inode. This corresponds to // vfs.FilesystemImpl.StatAt. Stat(fs *vfs.Filesystem, opts vfs.StatOptions) (linux.Statx, error) // SetStat updates the metadata for this inode. This corresponds to // vfs.FilesystemImpl.SetStatAt. Implementations are responsible for checking // if the operation can be performed (see vfs.CheckSetStat() for common // checks). SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Credentials, opts vfs.SetStatOptions) error } // Precondition: All methods in this interface may only be called on directory // inodes. type inodeDirectory interface { // The New{File,Dir,Node,Symlink} methods below should return a new inode // hashed into this inode. // // These inode constructors are inode-level operations rather than // filesystem-level operations to allow client filesystems to mix different // implementations based on the new node's location in the // filesystem. // HasChildren returns true if the directory inode has any children. HasChildren() bool // NewFile creates a new regular file inode. NewFile(ctx context.Context, name string, opts vfs.OpenOptions) (*vfs.Dentry, error) // NewDir creates a new directory inode. NewDir(ctx context.Context, name string, opts vfs.MkdirOptions) (*vfs.Dentry, error) // NewLink creates a new hardlink to a specified inode in this // directory. Implementations should create a new kernfs Dentry pointing to // target, and update target's link count. NewLink(ctx context.Context, name string, target Inode) (*vfs.Dentry, error) // NewSymlink creates a new symbolic link inode. NewSymlink(ctx context.Context, name, target string) (*vfs.Dentry, error) // NewNode creates a new filesystem node for a mknod syscall. NewNode(ctx context.Context, name string, opts vfs.MknodOptions) (*vfs.Dentry, error) // Unlink removes a child dentry from this directory inode. Unlink(ctx context.Context, name string, child *vfs.Dentry) error // RmDir removes an empty child directory from this directory // inode. Implementations must update the parent directory's link count, // if required. Implementations are not responsible for checking that child // is a directory, checking for an empty directory. RmDir(ctx context.Context, name string, child *vfs.Dentry) error // Rename is called on the source directory containing an inode being // renamed. child should point to the resolved child in the source // directory. If Rename replaces a dentry in the destination directory, it // should return the replaced dentry or nil otherwise. // // Precondition: Caller must serialize concurrent calls to Rename. Rename(ctx context.Context, oldname, newname string, child, dstDir *vfs.Dentry) (replaced *vfs.Dentry, err error) } type inodeDynamicLookup interface { // Lookup should return an appropriate dentry if name should resolve to a // child of this dynamic directory inode. This gives the directory an // opportunity on every lookup to resolve additional entries that aren't // hashed into the directory. This is only called when the inode is a // directory. If the inode is not a directory, or if the directory only // contains a static set of children, the implementer can unconditionally // return an appropriate error (ENOTDIR and ENOENT respectively). // // The child returned by Lookup will be hashed into the VFS dentry tree. Its // lifetime can be controlled by the filesystem implementation with an // appropriate implementation of Valid. // // Lookup returns the child with an extra reference and the caller owns this // reference. Lookup(ctx context.Context, name string) (*vfs.Dentry, error) // Valid should return true if this inode is still valid, or needs to // be resolved again by a call to Lookup. Valid(ctx context.Context) bool // IterDirents is used to iterate over dynamically created entries. It invokes // cb on each entry in the directory represented by the FileDescription. // 'offset' is the offset for the entire IterDirents call, which may include // results from the caller. 'relOffset' is the offset inside the entries // returned by this IterDirents invocation. In other words, // 'offset+relOffset+1' is the value that should be set in vfs.Dirent.NextOff, // while 'relOffset' is the place where iteration should start from. IterDirents(ctx context.Context, callback vfs.IterDirentsCallback, offset, relOffset int64) (newOffset int64, err error) } type inodeSymlink interface { // Readlink resolves the target of a symbolic link. If an inode is not a // symlink, the implementation should return EINVAL. Readlink(ctx context.Context) (string, error) }