// Copyright 2019 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. // Package tmpfs provides a filesystem implementation that behaves like tmpfs: // the Dentry tree is the sole source of truth for the state of the filesystem. // // Lock order: // // filesystem.mu // inode.mu // regularFileFD.offMu // regularFile.mapsMu // regularFile.dataMu package tmpfs import ( "fmt" "math" "sync/atomic" "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" fslock "gvisor.dev/gvisor/pkg/sentry/fs/lock" "gvisor.dev/gvisor/pkg/sentry/kernel/auth" "gvisor.dev/gvisor/pkg/sentry/kernel/time" "gvisor.dev/gvisor/pkg/sentry/pgalloc" "gvisor.dev/gvisor/pkg/sentry/vfs" "gvisor.dev/gvisor/pkg/sentry/vfs/lock" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" ) // Name is the default filesystem name. const Name = "tmpfs" // FilesystemType implements vfs.FilesystemType. type FilesystemType struct{} // filesystem implements vfs.FilesystemImpl. type filesystem struct { vfsfs vfs.Filesystem // memFile is used to allocate pages to for regular files. memFile *pgalloc.MemoryFile // clock is a realtime clock used to set timestamps in file operations. clock time.Clock // mu serializes changes to the Dentry tree. mu sync.RWMutex nextInoMinusOne uint64 // accessed using atomic memory operations } // GetFilesystem implements vfs.FilesystemType.GetFilesystem. func (fstype FilesystemType) GetFilesystem(ctx context.Context, vfsObj *vfs.VirtualFilesystem, creds *auth.Credentials, source string, opts vfs.GetFilesystemOptions) (*vfs.Filesystem, *vfs.Dentry, error) { memFileProvider := pgalloc.MemoryFileProviderFromContext(ctx) if memFileProvider == nil { panic("MemoryFileProviderFromContext returned nil") } clock := time.RealtimeClockFromContext(ctx) fs := filesystem{ memFile: memFileProvider.MemoryFile(), clock: clock, } fs.vfsfs.Init(vfsObj, &fs) root := fs.newDentry(fs.newDirectory(creds, 01777)) return &fs.vfsfs, &root.vfsd, nil } // Release implements vfs.FilesystemImpl.Release. func (fs *filesystem) Release() { } // dentry implements vfs.DentryImpl. type dentry struct { vfsd vfs.Dentry // inode is the inode represented by this dentry. Multiple Dentries may // share a single non-directory inode (with hard links). inode is // immutable. inode *inode // tmpfs doesn't count references on dentries; because the dentry tree is // the sole source of truth, it is by definition always consistent with the // state of the filesystem. However, it does count references on inodes, // because inode resources are released when all references are dropped. // (tmpfs doesn't really have resources to release, but we implement // reference counting because tmpfs regular files will.) // dentryEntry (ugh) links dentries into their parent directory.childList. dentryEntry } func (fs *filesystem) newDentry(inode *inode) *dentry { d := &dentry{ inode: inode, } d.vfsd.Init(d) return d } // IncRef implements vfs.DentryImpl.IncRef. func (d *dentry) IncRef() { d.inode.incRef() } // TryIncRef implements vfs.DentryImpl.TryIncRef. func (d *dentry) TryIncRef() bool { return d.inode.tryIncRef() } // DecRef implements vfs.DentryImpl.DecRef. func (d *dentry) DecRef() { d.inode.decRef() } // inode represents a filesystem object. type inode struct { // clock is a realtime clock used to set timestamps in file operations. clock time.Clock // refs is a reference count. refs is accessed using atomic memory // operations. // // A reference is held on all inodes that are reachable in the filesystem // tree. For non-directories (which may have multiple hard links), this // means that a reference is dropped when nlink reaches 0. For directories, // nlink never reaches 0 due to the "." entry; instead, // filesystem.RmdirAt() drops the reference. refs int64 // Inode metadata. Writing multiple fields atomically requires holding // mu, othewise atomic operations can be used. mu sync.Mutex mode uint32 // excluding file type bits, which are based on impl nlink uint32 // protected by filesystem.mu instead of inode.mu uid uint32 // auth.KUID, but stored as raw uint32 for sync/atomic gid uint32 // auth.KGID, but ... ino uint64 // immutable // Linux's tmpfs has no concept of btime. atime int64 // nanoseconds ctime int64 // nanoseconds mtime int64 // nanoseconds // Only meaningful for device special files. rdevMajor uint32 rdevMinor uint32 // Advisory file locks, which lock at the inode level. locks lock.FileLocks impl interface{} // immutable } const maxLinks = math.MaxUint32 func (i *inode) init(impl interface{}, fs *filesystem, creds *auth.Credentials, mode linux.FileMode) { i.clock = fs.clock i.refs = 1 i.mode = uint32(mode) i.uid = uint32(creds.EffectiveKUID) i.gid = uint32(creds.EffectiveKGID) i.ino = atomic.AddUint64(&fs.nextInoMinusOne, 1) // Tmpfs creation sets atime, ctime, and mtime to current time. now := i.clock.Now().Nanoseconds() i.atime = now i.ctime = now i.mtime = now // i.nlink initialized by caller i.impl = impl } // incLinksLocked increments i's link count. // // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. // i.nlink < maxLinks. func (i *inode) incLinksLocked() { if i.nlink == 0 { panic("tmpfs.inode.incLinksLocked() called with no existing links") } if i.nlink == maxLinks { panic("memfs.inode.incLinksLocked() called with maximum link count") } atomic.AddUint32(&i.nlink, 1) } // decLinksLocked decrements i's link count. // // Preconditions: filesystem.mu must be locked for writing. i.nlink != 0. func (i *inode) decLinksLocked() { if i.nlink == 0 { panic("tmpfs.inode.decLinksLocked() called with no existing links") } atomic.AddUint32(&i.nlink, ^uint32(0)) } func (i *inode) incRef() { if atomic.AddInt64(&i.refs, 1) <= 1 { panic("tmpfs.inode.incRef() called without holding a reference") } } func (i *inode) tryIncRef() bool { for { refs := atomic.LoadInt64(&i.refs) if refs == 0 { return false } if atomic.CompareAndSwapInt64(&i.refs, refs, refs+1) { return true } } } func (i *inode) decRef() { if refs := atomic.AddInt64(&i.refs, -1); refs == 0 { if regFile, ok := i.impl.(*regularFile); ok { // Hold inode.mu and regFile.dataMu while mutating // size. i.mu.Lock() regFile.dataMu.Lock() regFile.data.DropAll(regFile.memFile) atomic.StoreUint64(®File.size, 0) regFile.dataMu.Unlock() i.mu.Unlock() } } else if refs < 0 { panic("tmpfs.inode.decRef() called without holding a reference") } } func (i *inode) checkPermissions(creds *auth.Credentials, ats vfs.AccessTypes, isDir bool) error { return vfs.GenericCheckPermissions(creds, ats, isDir, uint16(atomic.LoadUint32(&i.mode)), auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))) } // Go won't inline this function, and returning linux.Statx (which is quite // big) means spending a lot of time in runtime.duffcopy(), so instead it's an // output parameter. // // Note that Linux does not guarantee to return consistent data (in the case of // a concurrent modification), so we do not require holding inode.mu. func (i *inode) statTo(stat *linux.Statx) { stat.Mask = linux.STATX_TYPE | linux.STATX_MODE | linux.STATX_NLINK | linux.STATX_UID | linux.STATX_GID | linux.STATX_INO | linux.STATX_ATIME | linux.STATX_BTIME | linux.STATX_CTIME | linux.STATX_MTIME stat.Blksize = 1 // usermem.PageSize in tmpfs stat.Nlink = atomic.LoadUint32(&i.nlink) stat.UID = atomic.LoadUint32(&i.uid) stat.GID = atomic.LoadUint32(&i.gid) stat.Mode = uint16(atomic.LoadUint32(&i.mode)) stat.Ino = i.ino // Linux's tmpfs has no concept of btime, so zero-value is returned. stat.Atime = linux.NsecToStatxTimestamp(i.atime) stat.Ctime = linux.NsecToStatxTimestamp(i.ctime) stat.Mtime = linux.NsecToStatxTimestamp(i.mtime) // TODO(gvisor.dev/issues/1197): Device number. switch impl := i.impl.(type) { case *regularFile: stat.Mode |= linux.S_IFREG stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(atomic.LoadUint64(&impl.size)) // In tmpfs, this will be FileRangeSet.Span() / 512 (but also cached in // a uint64 accessed using atomic memory operations to avoid taking // locks). stat.Blocks = allocatedBlocksForSize(stat.Size) case *directory: stat.Mode |= linux.S_IFDIR case *symlink: stat.Mode |= linux.S_IFLNK stat.Mask |= linux.STATX_SIZE | linux.STATX_BLOCKS stat.Size = uint64(len(impl.target)) stat.Blocks = allocatedBlocksForSize(stat.Size) case *namedPipe: stat.Mode |= linux.S_IFIFO case *deviceFile: switch impl.kind { case vfs.BlockDevice: stat.Mode |= linux.S_IFBLK case vfs.CharDevice: stat.Mode |= linux.S_IFCHR } stat.RdevMajor = impl.major stat.RdevMinor = impl.minor default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } } func (i *inode) setStat(ctx context.Context, creds *auth.Credentials, stat *linux.Statx) error { if stat.Mask == 0 { return nil } if stat.Mask&^(linux.STATX_MODE|linux.STATX_UID|linux.STATX_GID|linux.STATX_ATIME|linux.STATX_MTIME|linux.STATX_CTIME|linux.STATX_SIZE) != 0 { return syserror.EPERM } if err := vfs.CheckSetStat(ctx, creds, stat, uint16(atomic.LoadUint32(&i.mode))&^linux.S_IFMT, auth.KUID(atomic.LoadUint32(&i.uid)), auth.KGID(atomic.LoadUint32(&i.gid))); err != nil { return err } i.mu.Lock() var ( needsMtimeBump bool needsCtimeBump bool ) mask := stat.Mask if mask&linux.STATX_MODE != 0 { atomic.StoreUint32(&i.mode, uint32(stat.Mode)) needsCtimeBump = true } if mask&linux.STATX_UID != 0 { atomic.StoreUint32(&i.uid, stat.UID) needsCtimeBump = true } if mask&linux.STATX_GID != 0 { atomic.StoreUint32(&i.gid, stat.GID) needsCtimeBump = true } if mask&linux.STATX_SIZE != 0 { switch impl := i.impl.(type) { case *regularFile: updated, err := impl.truncateLocked(stat.Size) if err != nil { return err } if updated { needsMtimeBump = true needsCtimeBump = true } case *directory: return syserror.EISDIR default: return syserror.EINVAL } } if mask&linux.STATX_ATIME != 0 { atomic.StoreInt64(&i.atime, stat.Atime.ToNsecCapped()) needsCtimeBump = true } if mask&linux.STATX_MTIME != 0 { atomic.StoreInt64(&i.mtime, stat.Mtime.ToNsecCapped()) needsCtimeBump = true // Ignore the mtime bump, since we just set it ourselves. needsMtimeBump = false } if mask&linux.STATX_CTIME != 0 { atomic.StoreInt64(&i.ctime, stat.Ctime.ToNsecCapped()) // Ignore the ctime bump, since we just set it ourselves. needsCtimeBump = false } now := i.clock.Now().Nanoseconds() if needsMtimeBump { atomic.StoreInt64(&i.mtime, now) } if needsCtimeBump { atomic.StoreInt64(&i.ctime, now) } i.mu.Unlock() return nil } // TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. func (i *inode) lockBSD(uid fslock.UniqueID, t fslock.LockType, block fslock.Blocker) error { switch i.impl.(type) { case *regularFile: return i.locks.LockBSD(uid, t, block) } return syserror.EBADF } // TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. func (i *inode) unlockBSD(uid fslock.UniqueID) error { switch i.impl.(type) { case *regularFile: i.locks.UnlockBSD(uid) return nil } return syserror.EBADF } // TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. func (i *inode) lockPOSIX(uid fslock.UniqueID, t fslock.LockType, rng fslock.LockRange, block fslock.Blocker) error { switch i.impl.(type) { case *regularFile: return i.locks.LockPOSIX(uid, t, rng, block) } return syserror.EBADF } // TODO(gvisor.dev/issue/1480): support file locking for file types other than regular. func (i *inode) unlockPOSIX(uid fslock.UniqueID, rng fslock.LockRange) error { switch i.impl.(type) { case *regularFile: i.locks.UnlockPOSIX(uid, rng) return nil } return syserror.EBADF } // allocatedBlocksForSize returns the number of 512B blocks needed to // accommodate the given size in bytes, as appropriate for struct // stat::st_blocks and struct statx::stx_blocks. (Note that this 512B block // size is independent of the "preferred block size for I/O", struct // stat::st_blksize and struct statx::stx_blksize.) func allocatedBlocksForSize(size uint64) uint64 { return (size + 511) / 512 } func (i *inode) direntType() uint8 { switch impl := i.impl.(type) { case *regularFile: return linux.DT_REG case *directory: return linux.DT_DIR case *symlink: return linux.DT_LNK case *deviceFile: switch impl.kind { case vfs.BlockDevice: return linux.DT_BLK case vfs.CharDevice: return linux.DT_CHR default: panic(fmt.Sprintf("unknown vfs.DeviceKind: %v", impl.kind)) } default: panic(fmt.Sprintf("unknown inode type: %T", i.impl)) } } // fileDescription is embedded by tmpfs implementations of // vfs.FileDescriptionImpl. type fileDescription struct { vfsfd vfs.FileDescription vfs.FileDescriptionDefaultImpl } func (fd *fileDescription) filesystem() *filesystem { return fd.vfsfd.Mount().Filesystem().Impl().(*filesystem) } func (fd *fileDescription) inode() *inode { return fd.vfsfd.Dentry().Impl().(*dentry).inode } // Stat implements vfs.FileDescriptionImpl.Stat. func (fd *fileDescription) Stat(ctx context.Context, opts vfs.StatOptions) (linux.Statx, error) { var stat linux.Statx fd.inode().statTo(&stat) return stat, nil } // SetStat implements vfs.FileDescriptionImpl.SetStat. func (fd *fileDescription) SetStat(ctx context.Context, opts vfs.SetStatOptions) error { creds := auth.CredentialsFromContext(ctx) return fd.inode().setStat(ctx, creds, &opts.Stat) }