gvisor/pkg/sentry/fs/dirent.go

1687 lines
48 KiB
Go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fs
import (
"fmt"
"path"
"sort"
"sync/atomic"
"syscall"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/refs"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
"gvisor.dev/gvisor/pkg/sentry/socket/unix/transport"
"gvisor.dev/gvisor/pkg/sentry/uniqueid"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/syserror"
)
// globalDirentMap is a mutex-guarded set of Dirents, keyed by pointer.
type globalDirentMap struct {
// mu serializes every access to dirents (see add and remove).
mu sync.Mutex
// dirents is the set itself; the empty-struct value carries no data.
dirents map[*Dirent]struct{}
}
// add records d in the set. Adding a Dirent that is already present is a
// no-op.
func (g *globalDirentMap) add(d *Dirent) {
	g.mu.Lock()
	defer g.mu.Unlock()

	g.dirents[d] = struct{}{}
}
// remove drops d from the set. Removing a Dirent that is not present is a
// no-op.
func (g *globalDirentMap) remove(d *Dirent) {
	g.mu.Lock()
	defer g.mu.Unlock()

	delete(g.dirents, d)
}
// allDirents is the registry of Dirents that Save/Restore must consider when
// building inode mappings.
//
// Inodes do not carry paths, yet inodes backed by an external file system
// correspond to an external path. Every user-visible Dirent is therefore
// registered here so that save can walk the set and record the
// inode ID -> restore path mappings.
var allDirents = globalDirentMap{
	dirents: make(map[*Dirent]struct{}),
}
// renameMu protects the parent field of *all* Dirents. (See explanation in
// lockForRename.)
//
// Operations in this file that only read parentage or modify a single
// directory (Walk, FullName, MountRoot, lockDirectory) take it for reading.
//
// See fs.go for lock ordering.
var renameMu sync.RWMutex
// Dirent holds an Inode in memory.
//
// A Dirent may be negative or positive:
//
// A negative Dirent contains a nil Inode and indicates that a path does not exist. This
// is a convention taken from the Linux dcache, see fs/dcache.c. A negative Dirent remains
// cached until a create operation replaces it with a positive Dirent. A negative Dirent
// always has one reference owned by its parent and takes _no_ reference on its parent. This
// ensures that its parent can be unhashed regardless of negative children.
//
// A positive Dirent contains a non-nil Inode. It remains cached for as long as there remain
// references to it. A positive Dirent always takes a reference on its parent.
//
// A Dirent may be a root Dirent (parent is nil) or be parented (non-nil parent).
//
// Dirents currently do not attempt to free entries that lack application references under
// memory pressure.
//
// +stateify savable
type Dirent struct {
// AtomicRefCount is our reference count.
refs.AtomicRefCount
// userVisible indicates whether the Dirent is visible to the user or
// not. Only user-visible Dirents should save inode mappings in
// save/restore, as only they hold the real path to the underlying
// inode.
//
// See newDirent and Dirent.afterLoad.
userVisible bool
// Inode is the underlying file object.
//
// Inode is exported currently to assist in implementing overlay Inodes (where a
// Inode.InodeOperations.Lookup may need to merge the Inode contained in a positive Dirent with
// another Inode). This is normally done before the Dirent is parented (there are
// no external references to it).
//
// Other objects in the VFS may take a reference to this Inode but only while holding
// a reference to this Dirent.
Inode *Inode
// name is the name (i.e. basename) of this entry.
//
// N.B. name is protected by parent.mu, not this node's mu!
name string
// parent is the parent directory.
//
// We hold a hard reference to the parent.
//
// parent is protected by renameMu.
parent *Dirent
// deleted may be set atomically when removed. It is accessed with
// atomic.LoadInt32 / atomic.StoreInt32; a non-zero value means deleted.
deleted int32
// frozen indicates this entry can't walk to unknown nodes.
frozen bool
// mounted is true if Dirent is a mount point, similar to include/linux/dcache.h:DCACHE_MOUNTED.
mounted bool
// direntEntry identifies this Dirent as an element in a DirentCache. DirentCaches
// and their contents are not saved.
direntEntry `state:"nosave"`
// dirMu is a read-write mutex that protects caching decisions made by directory operations.
// Lock ordering: dirMu must be taken before mu (see below). Details:
//
// dirMu does not participate in Rename; instead mu and renameMu are used, see lockForRename.
//
// Creation and Removal operations must be synchronized with Walk to prevent stale negative
// caching. Note that this requirement is not specific to a _Dirent_ doing negative caching.
// The following race exists at any level of the VFS:
//
// For an object D that represents a directory, containing a cache of non-existent paths,
// protected by D.cacheMu:
//
//     T1:                         T2:
//                                 D.lookup(name)
//                                 --> ENOENT
//     D.create(name)
//     --> success
//     D.cacheMu.Lock
//     delete(D.cache, name)
//     D.cacheMu.Unlock
//                                 D.cacheMu.Lock
//                                 D.cache[name] = true
//                                 D.cacheMu.Unlock
//
//     D.lookup(name)
//     D.cacheMu.Lock
//     if D.cache[name] {
//         --> ENOENT (wrong)
//     }
//     D.cacheMu.Unlock
//
// Correct:
//
//     T1:                         T2:
//                                 D.cacheMu.Lock
//                                 D.lookup(name)
//                                 --> ENOENT
//                                 D.cache[name] = true
//                                 D.cacheMu.Unlock
//     D.cacheMu.Lock
//     D.create(name)
//     --> success
//     delete(D.cache, name)
//     D.cacheMu.Unlock
//
//     D.cacheMu.Lock
//     D.lookup(name)
//     --> EXISTS (right)
//     D.cacheMu.Unlock
//
// Note that the above "correct" solution causes too much lock contention: all lookups are
// synchronized with each other. This is a problem because lookups are involved in any VFS
// path operation.
//
// A Dirent diverges from the single D.cacheMu and instead uses two locks: dirMu to protect
// concurrent creation/removal/lookup caching, and mu to protect the Dirent's children map
// in general.
//
// This allows for concurrent Walks to be executed in order to pipeline lookups. For instance
// for a hot directory /a/b, threads T1, T2, T3 will only block on each other when updating
// the children map of /a/b once their individual lookups complete.
//
//     T1:                 T2:                 T3:
//     stat(/a/b/c)        stat(/a/b/d)        stat(/a/b/e)
dirMu sync.RWMutex `state:"nosave"`
// mu protects the below fields. Lock ordering: mu must be taken after dirMu.
mu sync.Mutex `state:"nosave"`
// children are cached via weak references.
children map[string]*refs.WeakRef `state:".(map[string]*Dirent)"`
}
// NewDirent builds a new root Dirent around inode, consuming the caller's
// reference on inode. The result is registered in allDirents and flagged as
// user-visible. The caller holds the only reference to the returned Dirent;
// a parent may later adopt it via hashChild.
func NewDirent(ctx context.Context, inode *Inode, name string) *Dirent {
	root := newDirent(inode, name)
	allDirents.add(root)
	root.userVisible = true
	return root
}
// NewTransientDirent builds a Dirent named "transient" that is not meant to
// be visible to users: it is neither registered in allDirents nor marked
// user-visible.
//
// inode must be non-nil; a nil inode panics.
func NewTransientDirent(inode *Inode) *Dirent {
	if inode != nil {
		return newDirent(inode, "transient")
	}
	panic("an inode is required")
}
// newDirent allocates a parentless Dirent for inode under the given name,
// with an empty children map and leak checking enabled. A nil inode yields a
// negative Dirent.
func newDirent(inode *Inode, name string) *Dirent {
	// A positive Dirent holds one Dirent reference on its MountSource.
	if inode != nil {
		inode.MountSource.IncDirentRefs()
	}

	d := &Dirent{
		Inode:    inode,
		name:     name,
		children: make(map[string]*refs.WeakRef),
	}
	d.EnableLeakCheck("fs.Dirent")
	return d
}
// NewNegativeDirent builds a new root Dirent with no Inode, i.e. one that
// records that name does not exist. Unlike NewDirent, the result is not
// registered in allDirents or marked user-visible.
func NewNegativeDirent(name string) *Dirent {
	var noInode *Inode
	return newDirent(noInode, name)
}
// IsRoot reports whether d is a root Dirent, i.e. has no parent.
func (d *Dirent) IsRoot() bool {
	parent := d.parent
	return parent == nil
}
// IsNegative reports whether d holds no Inode, meaning it stands for a path
// that does not exist.
func (d *Dirent) IsNegative() bool {
	inode := d.Inode
	return inode == nil
}
// hashChild adopts child as a child of d: it sets child's parent, copies d's
// "frozen" state onto child, and inserts child into d's children map.
//
// The return value is (*WeakRef, true) when inserting child displaced an
// existing entry with the same name. The caller must validate the displaced
// weak reference. Typical situations:
//
// * Remove: hashing a negative Dirent unhashes a positive Dirent (unimplemented).
// * Create: hashing a positive Dirent unhashes a negative Dirent.
// * Lookup: hashing any Dirent should not unhash any other Dirent.
//
// Preconditions:
// * d.mu must be held.
// * child must be a root Dirent (panics otherwise).
func (d *Dirent) hashChild(child *Dirent) (*refs.WeakRef, bool) {
	if !child.IsRoot() {
		panic("hashChild must be a root Dirent")
	}

	// Establish parentage and inherit the parent's frozen state.
	child.parent = d
	child.frozen = d.frozen

	// Only positive Dirents pin their parent. Negative Dirents exist solely
	// inside the Dirent cache and must not keep the parent alive forever.
	if positive := !child.IsNegative(); positive {
		d.IncRef()
	}

	return d.hashChildParentSet(child)
}
// hashChildParentSet (re)inserts child into d's children map under
// child.name, using a fresh weak reference.
//
// child.parent must already equal d; anything else panics. The previous map
// entry for that name, if any, is handed back with ok == true so the caller
// can validate and drop it.
func (d *Dirent) hashChildParentSet(child *Dirent) (*refs.WeakRef, bool) {
	if child.parent != d {
		panic("hashChildParentSet assumes the child already belongs to the parent")
	}

	name := child.name
	// Remember whatever currently occupies the slot before overwriting it.
	replaced, hadEntry := d.children[name]
	d.children[name] = refs.NewWeakRef(child, nil)
	return replaced, hadEntry
}
// SyncAll writes back buffered modifications for d and, recursively, for
// every live child cached under d. d.mu is held for the whole traversal of
// d's children.
func (d *Dirent) SyncAll(ctx context.Context) {
	d.mu.Lock()
	defer d.mu.Unlock()

	// A negative Dirent has nothing to sync and, by definition, is a leaf.
	if d.IsNegative() {
		return
	}

	// Read-only filesystems have nothing to write out.
	if readOnly := d.Inode.MountSource.Flags.ReadOnly; !readOnly {
		// FIXME(b/34856369): This should be a mount traversal, not a
		// Dirent traversal, because some Inodes that need to be synced
		// may no longer be reachable by name (after sys_unlink).
		//
		// Write out metadata, dirty page cached pages, and sync disk/remote
		// caches.
		d.Inode.WriteOut(ctx)
	}

	// Descend into every child whose weak reference is still live.
	for _, weak := range d.children {
		ref := weak.Get()
		if ref == nil {
			continue
		}
		ref.(*Dirent).SyncAll(ctx)
		ref.DecRef()
	}
}
// BaseName returns d's name. When d has a parent, the name is read with the
// parent's mu held, since name is protected by parent.mu.
func (d *Dirent) BaseName() string {
	if parent := d.parent; parent != nil {
		parent.mu.Lock()
		defer parent.mu.Unlock()
	}
	return d.name
}
// FullName computes d's fully-qualified name relative to root, together with
// a flag telling whether d turned out to be a descendant of root. A nil root
// is taken to mean the root of the Dirent tree.
//
// renameMu is held for reading for the duration of the computation.
func (d *Dirent) FullName(root *Dirent) (string, bool) {
	renameMu.RLock()
	defer renameMu.RUnlock()

	name, reachable := d.fullName(root)
	return name, reachable
}
// fullName builds d's path by recursing up the parent chain until it meets
// root or runs out of parents. The boolean result reports whether root was
// reachable from d. Dirents marked deleted get a " (deleted)" suffix.
func (d *Dirent) fullName(root *Dirent) (string, bool) {
	switch {
	case d == root:
		return "/", true
	case d.IsRoot():
		// Top of the Dirent tree. If a specific root was requested we never
		// met it, so report it as unreachable and let the caller decide
		// what to do; with a nil root this is the expected end of the walk.
		return d.name, root == nil
	}

	// name is guarded by the parent's mu.
	parent := d.parent
	parent.mu.Lock()
	base := d.name
	parent.mu.Unlock()

	prefix, reachable := parent.fullName(root)
	full := path.Join(prefix, base)
	if atomic.LoadInt32(&d.deleted) != 0 {
		full += " (deleted)"
	}
	return full, reachable
}
// MountRoot walks up from d to the nearest ancestor (possibly d itself) that
// is either marked mounted or has no parent, and returns it with an extra
// reference taken on behalf of the caller.
func (d *Dirent) MountRoot() *Dirent {
	renameMu.RLock()
	defer renameMu.RUnlock()

	for cur := d; ; cur = cur.parent {
		if cur.mounted || cur.parent == nil {
			cur.IncRef()
			return cur
		}
	}
}
// Freeze stops this dirent from walking to nodes it does not already know
// about, and applies the same restriction recursively to all live children.
//
// If this particular Dirent represents a Virtual node, then Walks and Creates
// may proceed as before.
//
// Freeze can only be called before the application starts running, otherwise
// the root might be out of sync with the application root if modified by
// sys_chroot.
func (d *Dirent) Freeze() {
	d.mu.Lock()
	defer d.mu.Unlock()

	// Nothing to do when already frozen.
	if d.frozen {
		return
	}
	d.frozen = true

	for _, weak := range d.children {
		ref := weak.Get()
		if ref == nil {
			continue
		}
		// NOTE: The reference obtained from Get is intentionally kept
		// rather than dropped, so the frozen child stays cached.
		ref.(*Dirent).Freeze()
	}

	// Drop all expired weak references.
	d.flush()
}
// descendantOf reports whether the receiver is p itself or lies somewhere
// below p, by following parent links upward until p or a root is reached.
//
// d.mu must be held.
func (d *Dirent) descendantOf(p *Dirent) bool {
	for cur := d; ; cur = cur.parent {
		if cur == p {
			return true
		}
		if cur.IsRoot() {
			return false
		}
	}
}
// walk walks to path name starting at the dirent, and will not traverse above
// root Dirent.
//
// On success the returned Dirent carries a reference that the caller is
// responsible for dropping (see exists and Remove). A name that resolves to a
// negative Dirent, whether cached or freshly looked up, yields ENOENT.
//
// If walkMayUnlock is true then walk can unlock d.mu to execute a slow
// Inode.Lookup, otherwise walk will keep d.mu locked.
//
// Preconditions:
// - renameMu must be held for reading.
// - d.mu must be held.
// - name must not contain "/"s.
func (d *Dirent) walk(ctx context.Context, root *Dirent, name string, walkMayUnlock bool) (*Dirent, error) {
if !IsDir(d.Inode.StableAttr) {
return nil, syscall.ENOTDIR
}
if name == "" || name == "." {
d.IncRef()
return d, nil
} else if name == ".." {
// Respect the chroot. Note that in Linux there is no check to enforce
// that d is a descendant of root.
if d == root {
d.IncRef()
return d, nil
}
// Are we already at the root? Then ".." is ".".
if d.IsRoot() {
d.IncRef()
return d, nil
}
d.parent.IncRef()
return d.parent, nil
}
if w, ok := d.children[name]; ok {
// Try to resolve the weak reference to a hard reference.
if child := w.Get(); child != nil {
cd := child.(*Dirent)
// Is this a negative Dirent?
if cd.IsNegative() {
// Don't leak a reference; this doesn't matter as much for negative Dirents,
// which don't hold a hard reference on their parent (their parent holds a
// hard reference on them, and they contain virtually no state). But this is
// good house-keeping.
child.DecRef()
return nil, syscall.ENOENT
}
// Do we need to revalidate this child?
//
// We never allow the file system to revalidate mounts, that could cause them
// to unexpectedly drop out before umount.
if cd.mounted || !cd.Inode.MountSource.Revalidate(ctx, name, d.Inode, cd.Inode) {
// Good to go. This is the fast-path. The reference obtained from Get()
// is handed to the caller.
return cd, nil
}
// If we're revalidating a child, we must ensure all inotify watches release
// their pins on the child. Inotify doesn't properly support filesystems that
// revalidate dirents (since watches are lost on revalidation), but if we fail
// to unpin the watches child will never be GCed.
cd.Inode.Watches.Unpin(cd)
// This child needs to be revalidated, fallthrough to unhash it. Make sure
// to not leak a reference from Get().
//
// Note that previous lookups may still have a reference to this stale child;
// this can't be helped, but we can ensure that *new* lookups are up-to-date.
child.DecRef()
}
// Either our weak reference expired or we need to revalidate it. Unhash child first, we're
// about to replace it.
delete(d.children, name)
w.Drop()
}
// Are we allowed to do the lookup?
if d.frozen && !d.Inode.IsVirtual() {
return nil, syscall.ENOENT
}
// Slow path: load the InodeOperations into memory. Since this is a hot path and the lookup may be
// expensive, if possible release the lock and re-acquire it.
if walkMayUnlock {
d.mu.Unlock()
}
c, err := d.Inode.Lookup(ctx, name)
if walkMayUnlock {
d.mu.Lock()
}
// No dice.
if err != nil {
return nil, err
}
// Sanity check c, its name must be consistent.
if c.name != name {
panic(fmt.Sprintf("lookup from %q to %q returned unexpected name %q", d.name, name, c.name))
}
// Now that we have the lock again, check if we raced.
if w, ok := d.children[name]; ok {
// Someone else looked up or created a child at name before us.
if child := w.Get(); child != nil {
cd := child.(*Dirent)
// There are active references to the existing child, prefer it to the one we
// retrieved from Lookup. Likely the Lookup happened very close to the insertion
// of child, so considering one stale over the other is fairly arbitrary.
c.DecRef()
// The child that was installed could be negative.
if cd.IsNegative() {
// If so, don't leak a reference and short circuit.
child.DecRef()
return nil, syscall.ENOENT
}
// We make the judgement call that if c raced with cd they are close enough to have
// the same staleness, so we don't attempt to revalidate cd. In Linux revalidations
// can continue indefinitely (see fs/namei.c, retry_estale); we try to avoid this.
return cd, nil
}
// Weak reference expired. We went through a full cycle of create/destroy in the time
// we did the Inode.Lookup. Fully drop the weak reference and fallback to using the child
// we looked up.
delete(d.children, name)
w.Drop()
}
// Give the looked up child a parent. We cannot kick out entries, since we just checked above
// that there is nothing at name in d's children list.
if _, kicked := d.hashChild(c); kicked {
// Yell loudly.
panic(fmt.Sprintf("hashed child %q over existing child", c.name))
}
// Is this a negative Dirent?
if c.IsNegative() {
// Don't drop a reference on the negative Dirent, it was just installed and this is the
// only reference we'll ever get. d owns the reference.
return nil, syscall.ENOENT
}
// Return the positive Dirent.
return c, nil
}
// Walk resolves name relative to d and returns the resulting Dirent, never
// walking above root. root must not be nil; a nil root panics.
//
// It takes renameMu and d.dirMu for reading and d.mu exclusively around the
// internal walk, which is allowed to release d.mu during a slow lookup.
func (d *Dirent) Walk(ctx context.Context, root *Dirent, name string) (*Dirent, error) {
	if root == nil {
		panic("Dirent.Walk: root must not be nil")
	}

	// This is a hot path: lock and unlock explicitly instead of going
	// through lockDirectory-style helpers with defer.
	renameMu.RLock()
	d.dirMu.RLock()
	d.mu.Lock()

	next, walkErr := d.walk(ctx, root, name, true /* may unlock */)

	d.mu.Unlock()
	d.dirMu.RUnlock()
	renameMu.RUnlock()

	return next, walkErr
}
// exists reports whether walking from d to name succeeds. Any walk error,
// including ENOENT from a negative Dirent, counts as "does not exist".
//
// Preconditions:
// - renameMu must be held for reading.
// - d.mu must be held.
// - name must not contain "/"s.
func (d *Dirent) exists(ctx context.Context, root *Dirent, name string) bool {
	found, err := d.walk(ctx, root, name, false /* may unlock */)
	if err == nil {
		// Only existence matters here; release the walk's reference.
		found.DecRef()
	}
	return err == nil
}
// lockDirectory acquires the locks needed by any operation that adds or
// removes children of d: renameMu for reading, then d.dirMu and d.mu
// exclusively. The returned function releases them in reverse order.
func (d *Dirent) lockDirectory() func() {
	renameMu.RLock()
	d.dirMu.Lock()
	d.mu.Lock()

	release := func() {
		d.mu.Unlock()
		d.dirMu.Unlock()
		renameMu.RUnlock()
	}
	return release
}
// Create makes a new regular file called name inside this directory and
// returns the opened File. It fails with EEXIST when name already resolves,
// and with ENOENT when d is frozen and its Inode is not virtual.
func (d *Dirent) Create(ctx context.Context, root *Dirent, name string, flags FileFlags, perms FilePermissions) (*File, error) {
	// Lock now; the returned unlock function runs when Create returns.
	defer d.lockDirectory()()

	// Refuse to clobber an existing entry.
	if d.exists(ctx, root, name) {
		return nil, syscall.EEXIST
	}

	// A frozen, non-virtual directory cannot gain entries.
	if d.frozen && !d.Inode.IsVirtual() {
		return nil, syscall.ENOENT
	}

	// Delegate to the file system, which is trusted to report EEXIST (or
	// something that translates to it) should name already exist.
	created, err := d.Inode.Create(ctx, d, name, flags, perms)
	if err != nil {
		return nil, err
	}
	d.finishCreate(created.Dirent, name)

	// Hand back the new file along with its reference. Once the last
	// reference to the file goes away, its Dirent may no longer be cached.
	return created, nil
}
// finishCreate sanity-checks a freshly created child, hashes it into d's
// children (evicting a negative Dirent at the same name if present), and
// emits an IN_CREATE notification on d.
//
// It panics if child's name differs from name, if child is negative, or if
// hashing would displace a positive Dirent.
func (d *Dirent) finishCreate(child *Dirent, name string) {
	switch {
	case child.name != name:
		// The file system must return a child with the requested name.
		panic(fmt.Sprintf("create from %q to %q returned unexpected name %q", d.name, name, child.name))
	case child.IsNegative():
		// A create that yields a negative Dirent makes no sense.
		panic(fmt.Sprintf("create from %q to %q returned negative Dirent", d.name, name))
	}

	// The only entry a create may displace is a negative one: something that
	// did not exist is being replaced by something that now does.
	displaced, kicked := d.hashChild(child)
	if kicked {
		if prev := displaced.Get(); prev != nil {
			if !prev.(*Dirent).IsNegative() {
				panic(fmt.Sprintf("hashed child %q over a positive child", child.name))
			}
			// First drop the reference Get just handed out...
			prev.DecRef()
			// ...then the one d owned on the negative Dirent.
			prev.DecRef()
		}
		// The weak reference itself is of no further use.
		displaced.Drop()
	}

	d.Inode.Watches.Notify(name, linux.IN_CREATE, 0)

	// Give the file system a chance to take extra references on child.
	child.maybeExtendReference()
}
// genericCreate runs create under the directory locks, provided that name
// does not already exist in d. Any negative Dirent cached at name is unhashed
// before create is invoked. Returns EEXIST if name resolves, ENOENT if d is
// frozen and non-virtual, and otherwise whatever create returns.
func (d *Dirent) genericCreate(ctx context.Context, root *Dirent, name string, create func() error) error {
	// Lock now; the returned unlock function runs when genericCreate returns.
	defer d.lockDirectory()()

	// Refuse to clobber an existing entry.
	if d.exists(ctx, root, name) {
		return syscall.EEXIST
	}

	// A frozen, non-virtual directory cannot gain entries.
	if d.frozen && !d.Inode.IsVirtual() {
		return syscall.ENOENT
	}

	// d.exists has ruled out a positive entry, so whatever is still cached
	// at name can only be a negative Dirent. Evict it, as in Create.
	if stale, cached := d.children[name]; cached {
		if ref := stale.Get(); ref != nil {
			negative := ref.(*Dirent)
			if !negative.IsNegative() {
				panic(fmt.Sprintf("hashed over a positive child %q", negative.name))
			}
			// First drop the reference Get just handed out...
			ref.DecRef()
			// ...then the one d owned on the negative Dirent.
			ref.DecRef()
		}
		// name is about to exist, so the negative entry must go.
		delete(d.children, name)
		stale.Drop()
	}

	return create()
}
// CreateLink asks the directory's Inode to create a link called newname that
// refers to oldname, and emits IN_CREATE on d when that succeeds.
func (d *Dirent) CreateLink(ctx context.Context, root *Dirent, oldname, newname string) error {
	link := func() error {
		err := d.Inode.CreateLink(ctx, d, oldname, newname)
		if err == nil {
			d.Inode.Watches.Notify(newname, linux.IN_CREATE, 0)
		}
		return err
	}
	return d.genericCreate(ctx, root, newname, link)
}
// CreateHardLink adds a hard link called name in this directory pointing at
// target. It returns EXDEV when target lives on a different MountSource and
// EPERM when target is a directory.
func (d *Dirent) CreateHardLink(ctx context.Context, root *Dirent, target *Dirent, name string) error {
	switch {
	case d.Inode.MountSource != target.Inode.MountSource:
		// Hard links may not span filesystems.
		return syscall.EXDEV
	case IsDir(target.Inode.StableAttr):
		// Directories are never linkable. See fs/namei.c:vfs_link.
		return syscall.EPERM
	}

	hardLink := func() error {
		err := d.Inode.CreateHardLink(ctx, d, target, name)
		if err == nil {
			target.Inode.Watches.Notify("", linux.IN_ATTRIB, 0) // Link count change.
			d.Inode.Watches.Notify(name, linux.IN_CREATE, 0)
		}
		return err
	}
	return d.genericCreate(ctx, root, name, hardLink)
}
// CreateDirectory makes a new directory called name under this dirent and,
// on success, emits IN_ISDIR|IN_CREATE on d.
func (d *Dirent) CreateDirectory(ctx context.Context, root *Dirent, name string, perms FilePermissions) error {
	mkdir := func() error {
		err := d.Inode.CreateDirectory(ctx, d, name, perms)
		if err == nil {
			d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_CREATE, 0)
		}
		return err
	}
	return d.genericCreate(ctx, root, name, mkdir)
}
// Bind asks the directory's Inode to bind the endpoint data at name and
// hashes the resulting Dirent into d as a newly created child, returning that
// Dirent. An EEXIST from the create path is reported to the caller as
// EADDRINUSE.
func (d *Dirent) Bind(ctx context.Context, root *Dirent, name string, data transport.BoundEndpoint, perms FilePermissions) (*Dirent, error) {
	var bound *Dirent
	err := d.genericCreate(ctx, root, name, func() error {
		child, bindErr := d.Inode.Bind(ctx, d, name, data, perms)
		if bindErr != nil {
			return bindErr
		}
		bound = child
		d.finishCreate(bound, name)
		return nil
	})

	switch {
	case err == syscall.EEXIST:
		return nil, syscall.EADDRINUSE
	case err != nil:
		return nil, err
	}
	return bound, nil
}
// CreateFifo makes a new named pipe called name under this dirent and, on
// success, emits IN_CREATE on d.
func (d *Dirent) CreateFifo(ctx context.Context, root *Dirent, name string, perms FilePermissions) error {
	mkfifo := func() error {
		err := d.Inode.CreateFifo(ctx, d, name, perms)
		if err == nil {
			d.Inode.Watches.Notify(name, linux.IN_CREATE, 0)
		}
		return err
	}
	return d.genericCreate(ctx, root, name, mkfifo)
}
// GetDotAttrs produces the DentAttrs for the "." and ".." entries of d, in
// that order. ".." is the parent's attributes only when d has a parent and is
// a descendant of root; in every other case ".." equals ".".
func (d *Dirent) GetDotAttrs(root *Dirent) (DentAttr, DentAttr) {
	// "." is always d itself.
	self := d.Inode.StableAttr
	dot := DentAttr{
		Type:    self.Type,
		InodeID: self.InodeID,
	}

	// d.descendantOf requires d.mu.
	d.mu.Lock()
	defer d.mu.Unlock()

	// Either d is a root or it sits outside root: ".." collapses to ".".
	if d.IsRoot() || !d.descendantOf(root) {
		return dot, dot
	}

	// d is below root, so ".." is its real parent.
	up := d.parent.Inode.StableAttr
	dotdot := DentAttr{
		Type:    up.Type,
		InodeID: up.InodeID,
	}
	return dot, dotdot
}
// readdirFrozen serves a readdir purely from d's cached children, without
// consulting the underlying file system.
//
// The emitted listing is ".", "..", and every live positive child, sorted by
// name. offset indexes into that sorted listing; entries from offset onward
// are passed to dirCtx.DirEmit. The returned offset is the index of the first
// entry that was not emitted; if DirEmit fails, that offset is returned
// together with the error.
func (d *Dirent) readdirFrozen(root *Dirent, offset int64, dirCtx *DirCtx) (int64, error) {
	// Collect attrs for "." and "..". GetDotAttrs takes d.mu itself, so it
	// must be called before d.mu is acquired below.
	attrs := make(map[string]DentAttr)
	names := []string{".", ".."}
	attrs["."], attrs[".."] = d.GetDotAttrs(root)

	// Get info from all children.
	d.mu.Lock()
	defer d.mu.Unlock()
	for name, w := range d.children {
		child := w.Get()
		if child == nil {
			// Weak reference expired.
			continue
		}
		// Negative children are skipped.
		if cd := child.(*Dirent); !cd.IsNegative() {
			sattr := cd.Inode.StableAttr
			attrs[name] = DentAttr{
				Type:    sattr.Type,
				InodeID: sattr.InodeID,
			}
			names = append(names, name)
		}
		// Release the reference from Get as soon as the attributes have been
		// copied out. Deferring here would queue one call per child and hold
		// every reference until the function returns.
		child.DecRef()
	}

	sort.Strings(names)
	if int(offset) >= len(names) {
		return offset, nil
	}
	for _, name := range names[int(offset):] {
		if err := dirCtx.DirEmit(name, attrs[name]); err != nil {
			return offset, err
		}
		offset++
	}
	return offset, nil
}
// DirIterator is an open directory containing directory entries that can be read.
type DirIterator interface {
// IterateDir emits directory entries by calling dirCtx.DirEmit, beginning
// with the entry at offset and returning the next directory offset.
//
// Entries for "." and ".." must *not* be included; direntReaddir emits
// those itself and passes an offset that does not count them.
//
// If the offset returned is the same as the argument offset, then
// nothing has been serialized. This is equivalent to reaching EOF.
// In this case dirCtx.Serializer.Written() should return 0.
//
// The returned offset must not be less than the argument offset;
// direntReaddir panics otherwise.
//
// The order of entries to emit must be consistent between Readdir
// calls, and must start with the given offset.
//
// The caller must ensure that this operation is permitted.
IterateDir(ctx context.Context, d *Dirent, dirCtx *DirCtx, offset int) (int, error)
}
// DirentReaddir serializes the entries of directory d, "." and ".." included.
//
// Arguments:
//
// * ctx: context provided to file systems in order to select and serialize entries.
// * d: the Dirent of the directory being read; required to provide "." and "..".
// * it: the directory iterator; which represents an open directory handle.
// * root: fs root; if d is equal to the root, then '..' will refer to d.
// * dirCtx: carries the serializer that receives the entries.
// * offset: the current directory offset.
//
// The result is the offset of the *next* element which was not serialized.
// Any error is suppressed once at least one entry has been written.
func DirentReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, dirCtx *DirCtx, offset int64) (int64, error) {
	next, err := direntReaddir(ctx, d, it, root, dirCtx, offset)
	if dirCtx.Serializer.Written() > 0 {
		// Having serialized anything at all counts as success.
		err = nil
	}
	return next, err
}
// direntReaddir does the work for DirentReaddir: it validates its arguments,
// emits "." and ".." at offsets 0 and 1, and delegates the remaining entries
// to it.IterateDir using an offset that excludes the two dot entries. Frozen
// Dirents are served by readdirFrozen instead.
//
// It panics if root or dirCtx.Serializer is nil, or if IterateDir returns an
// offset lower than the one it was given.
func direntReaddir(ctx context.Context, d *Dirent, it DirIterator, root *Dirent, dirCtx *DirCtx, offset int64) (int64, error) {
	switch {
	case root == nil:
		panic("Dirent.Readdir: root must not be nil")
	case dirCtx.Serializer == nil:
		panic("Dirent.Readdir: serializer must not be nil")
	}

	// Verify d is a directory before emitting anything: once "." and ".."
	// have been written, later errors from IterateDir are hidden.
	if !IsDir(d.Inode.StableAttr) {
		return 0, syserror.ENOTDIR
	}

	// Special case for lseek(fd, 0, SEEK_END).
	// See SeekWithDirCursor for more details.
	if offset == FileMaxOffset {
		return offset, nil
	}

	if d.frozen {
		return d.readdirFrozen(root, offset, dirCtx)
	}

	// "." lives at offset 0 and ".." at offset 1; emit whichever of them the
	// current offset has not passed yet.
	dot, dotdot := d.GetDotAttrs(root)
	dotEntries := [...]struct {
		at   int64
		name string
		attr DentAttr
	}{
		{0, ".", dot},
		{1, "..", dotdot},
	}
	for _, e := range dotEntries {
		if offset != e.at {
			continue
		}
		if err := dirCtx.DirEmit(e.name, e.attr); err != nil {
			return offset, err
		}
		offset++
	}

	// IterateDir works with offsets that do not count the two dot entries;
	// they are added back to its result below.
	inner := offset - 2
	next, err := it.IterateDir(ctx, d, dirCtx, int(inner))
	if int64(next) < inner {
		panic(fmt.Sprintf("node.Readdir returned offset %v less than input offset %v", next, inner))
	}
	return int64(next + 2), err
}
// flush recursively flushes every live positive child of d, lets the file
// system drop its extended reference on each, and then unhashes any child
// whose weak reference has expired.
//
// Preconditions: d.mu must be held.
func (d *Dirent) flush() {
	type deadEntry struct {
		name string
		weak *refs.WeakRef
	}
	var dead []deadEntry

	for name, weak := range d.children {
		// Flush the subtree first, before giving up our own reference and
		// the cache's reference on the child.
		if ref := weak.Get(); ref != nil {
			if cd := ref.(*Dirent); !cd.IsNegative() {
				cd.mu.Lock()
				cd.flush()
				cd.mu.Unlock()

				// Allow the file system to drop extra references on child.
				cd.dropExtendedReference()
			}
			// Release the reference obtained from Get.
			ref.DecRef()
		}

		// Probe the weak reference a second time: the flushing and reference
		// dropping above may have been what closed the child.
		ref := weak.Get()
		if ref == nil {
			dead = append(dead, deadEntry{name: name, weak: weak})
			continue
		}
		ref.DecRef()
	}

	// Unhash everything that expired.
	for _, e := range dead {
		delete(d.children, e.name)
		e.weak.Drop()
	}
}
// isMountPoint reports whether the dirent is a mount point or a root. It
// takes d.mu around the check.
func (d *Dirent) isMountPoint() bool {
	d.mu.Lock()
	result := d.isMountPointLocked()
	d.mu.Unlock()
	return result
}
// isMountPointLocked is isMountPoint without the locking: it reports whether
// d is marked mounted or has no parent.
func (d *Dirent) isMountPointLocked() bool {
	return d.mounted || d.IsRoot()
}
// mount hashes a new mounted Dirent wrapping inode over d in d's parent and
// returns it. It fails with ENOENT if d has been deleted or d's parent is
// frozen and non-virtual, and with EINVAL if inode is a symlink.
//
// Precondition: must be called with mm.withMountLocked held on `d`.
func (d *Dirent) mount(ctx context.Context, inode *Inode) (newChild *Dirent, err error) {
	switch {
	case atomic.LoadInt32(&d.deleted) != 0:
		// Lost a race with deletion.
		return nil, syserror.ENOENT
	case IsSymlink(inode.StableAttr):
		// Symlinks cannot be mounted.
		//
		// See Linux equivalent in fs/namespace.c:do_add_mount.
		return nil, syserror.EINVAL
	case d.parent.frozen && !d.parent.Inode.IsVirtual():
		// The parent is frozen.
		return nil, syserror.ENOENT
	}

	// Build the Dirent that takes d's place.
	//
	// NewDirent hands back a single reference, which is passed on to the
	// caller as the mount reference.
	mnt := NewDirent(ctx, inode, d.name)
	mnt.mounted = true

	hidden, displaced := d.parent.hashChild(mnt)
	if !displaced {
		panic("mount must mount over an existing dirent")
	}
	hidden.Drop()

	// Although `d` is no longer reachable by name, it keeps its reference on
	// its parent.
	return mnt, nil
}
// unmount unmounts `d` and replaces it with the last Dirent that was in its
// place, supplied by the MountNamespace as `replacement`.
//
// It returns ENOENT if d has been deleted or if d's parent is frozen and not
// virtual. It panics if d's parent has no entry under replacement's name,
// since unmounting must displace the mounted Dirent.
//
// Precondition: must be called with mm.withMountLocked held on `d`.
func (d *Dirent) unmount(ctx context.Context, replacement *Dirent) error {
	// Did we race with deletion?
	if atomic.LoadInt32(&d.deleted) != 0 {
		return syserror.ENOENT
	}

	// Are we frozen?
	if d.parent.frozen && !d.parent.Inode.IsVirtual() {
		return syserror.ENOENT
	}

	// Remount our former child in its place.
	//
	// As replacement used to be our child, it must already have the right
	// parent.
	weakRef, ok := d.parent.hashChildParentSet(replacement)
	if !ok {
		// The message names unmount so that a crash report points at the
		// operation that actually failed.
		panic("unmount must replace an existing dirent")
	}
	weakRef.Drop()

	// d is not reachable anymore, and hence not mounted anymore.
	d.mounted = false

	// Drop mount reference.
	d.DecRef()
	return nil
}
// Remove unlinks the non-directory child called name from directory d. root
// is used to resolve name and must not be nil; Remove panics otherwise.
//
// It returns ENOENT if d is frozen and not virtual, EISDIR if the child is a
// directory, ENOTDIR if dirPath is set for a non-directory child, and EBUSY
// if the child is a mount point. Errors from walking to the child or from
// the file system's Remove are returned unchanged.
func (d *Dirent) Remove(ctx context.Context, root *Dirent, name string, dirPath bool) error {
	if root == nil {
		panic("Dirent.Remove: root must not be nil")
	}

	release := d.lockDirectory()
	defer release()

	// A frozen, non-virtual directory can't be modified.
	if d.frozen && !d.Inode.IsVirtual() {
		return syscall.ENOENT
	}

	// Look the victim up; a failure here means it doesn't exist.
	victim, err := d.walk(ctx, root, name, false /* may unlock */)
	if err != nil {
		return err
	}
	defer victim.DecRef()

	switch {
	case IsDir(victim.Inode.StableAttr):
		// Directories are not handled by Remove.
		return syscall.EISDIR
	case dirPath:
		return syscall.ENOTDIR
	case victim.isMountPoint():
		// Mount points can't be removed.
		return syscall.EBUSY
	}

	// Ask the file system to remove the name.
	if err := d.Inode.Remove(ctx, d, victim); err != nil {
		return err
	}

	// The link count changed; this only applies to non-directory nodes.
	victim.Inode.Watches.Notify("", linux.IN_ATTRIB, 0)

	// Flag the victim as deleted and forget the cached child entry.
	atomic.StoreInt32(&victim.deleted, 1)
	cached, present := d.children[name]
	if present {
		delete(d.children, name)
		cached.Drop()
	}

	// Let the file system release any extra references on the victim.
	victim.dropExtendedReference()

	// Last, tell inotify about the unlink and release inotify's extra refs on
	// this dirent. The watches on the underlying inode are not necessarily
	// destroyed by this, because the inode may have other links. If this was
	// the last link, the inode destructor queues the watch-removal events.
	victim.Inode.Watches.MarkUnlinked()
	victim.Inode.Watches.Unpin(victim)
	d.Inode.Watches.Notify(name, linux.IN_DELETE, 0)

	return nil
}
// RemoveDirectory removes the given directory. The root dirent is used to
// resolve name, and must not be nil; RemoveDirectory panics otherwise.
//
// It returns ENOENT if d is frozen and not virtual, EINVAL if name is ".",
// ENOTEMPTY if name is "..", ENOTDIR if the child is not a directory, and
// EBUSY if the child is a mount point. Errors from walking to the child or
// from the file system's Remove are returned unchanged.
func (d *Dirent) RemoveDirectory(ctx context.Context, root *Dirent, name string) error {
	// Check the root. The message names this method so that a crash is not
	// misattributed to Dirent.Remove.
	if root == nil {
		panic("Dirent.RemoveDirectory: root must not be nil")
	}

	unlock := d.lockDirectory()
	defer unlock()

	// Are we frozen?
	if d.frozen && !d.Inode.IsVirtual() {
		return syscall.ENOENT
	}

	// Check for dots.
	switch name {
	case ".":
		// Rejected as the last component by rmdir(2).
		return syscall.EINVAL
	case "..":
		// If d was found, then its parent is not empty.
		return syscall.ENOTEMPTY
	}

	// Try to walk to the node.
	child, err := d.walk(ctx, root, name, false /* may unlock */)
	if err != nil {
		// Child does not exist.
		return err
	}
	defer child.DecRef()

	// RemoveDirectory can only remove directories.
	if !IsDir(child.Inode.StableAttr) {
		return syscall.ENOTDIR
	}

	// RemoveDirectory cannot remove a mount point.
	if child.isMountPoint() {
		return syscall.EBUSY
	}

	// Try to remove name on the file system.
	if err := d.Inode.Remove(ctx, d, child); err != nil {
		return err
	}

	// Mark name as deleted and remove from children.
	atomic.StoreInt32(&child.deleted, 1)
	if w, ok := d.children[name]; ok {
		delete(d.children, name)
		w.Drop()
	}

	// Allow the file system to drop extra references on child.
	child.dropExtendedReference()

	// Finally, let inotify know the child is being unlinked. Drop any extra
	// refs from inotify to this child dirent.
	child.Inode.Watches.MarkUnlinked()
	child.Inode.Watches.Unpin(child)
	d.Inode.Watches.Notify(name, linux.IN_ISDIR|linux.IN_DELETE, 0)

	return nil
}
// destroy closes this node and all children.
//
// It is the destructor that DecRef hands to DecRefWithDestructor. For a
// positive Dirent it drops the weak references to all cached children,
// removes d from allDirents, releases d's reference on its Inode and, if d
// has a parent, releases d's reference on that parent. All of this happens
// with d.mu held.
func (d *Dirent) destroy() {
if d.IsNegative() {
// Nothing to tear-down and no parent references to drop, since a negative
// Dirent does not take a reference on its parent, has no Inode and no children.
return
}
d.mu.Lock()
defer d.mu.Unlock()
// Drop all weak references.
for _, w := range d.children {
if c := w.Get(); c != nil {
if c.(*Dirent).IsNegative() {
// The parent holds both weak and strong refs in the case of
// negative dirents.
c.DecRef()
}
// Drop the reference we just acquired in WeakRef.Get.
c.DecRef()
}
w.Drop()
}
d.children = nil
// d no longer needs to be tracked in the global Dirent set.
allDirents.remove(d)
// Drop our reference to the Inode.
d.Inode.DecRef()
// Allow the Dirent to be GC'ed after this point, since the Inode may still
// be referenced after the Dirent is destroyed (for instance by filesystem
// internal caches or hard links).
d.Inode = nil
// Drop the reference we have on our parent if we took one. renameMu doesn't need to be
// held because d can't be reparented without any references to it left.
if d.parent != nil {
d.parent.DecRef()
}
}
// IncRef takes a reference on the Dirent. When the Dirent has an Inode, the
// dirent reference count of that Inode's MountSource is bumped first.
//
// IncRef implements RefCounter.IncRef.
func (d *Dirent) IncRef() {
	if inode := d.Inode; inode != nil {
		inode.MountSource.IncDirentRefs()
	}
	d.AtomicRefCount.IncRef()
}
// TryIncRef attempts to take a reference on the Dirent and reports whether it
// succeeded. On success, and only when the Dirent has an Inode, the dirent
// reference count of that Inode's MountSource is incremented as well.
//
// TryIncRef implements RefCounter.TryIncRef.
func (d *Dirent) TryIncRef() bool {
	if !d.AtomicRefCount.TryIncRef() {
		return false
	}
	if d.Inode != nil {
		d.Inode.MountSource.IncDirentRefs()
	}
	return true
}
// DecRef releases a reference on the Dirent and, when the Dirent has an
// Inode, the matching dirent reference on that Inode's MountSource.
//
// DecRef implements RefCounter.DecRef with destructor d.destroy.
func (d *Dirent) DecRef() {
	if d.Inode == nil {
		// No Inode, so there is no mount reference to give back.
		d.DecRefWithDestructor(d.destroy)
		return
	}

	// Grab the mount before dropping the reference: the destructor may run
	// and clear d.Inode.
	mount := d.Inode.MountSource
	d.DecRefWithDestructor(d.destroy)
	mount.DecDirentRefs()
}
// InotifyEvent delivers events to the watches on this dirent's parent inode
// (if it has a parent) and then to the watches on this dirent's own inode.
// Whether an event reaches the user depends on the event masks. The parent
// is notified with this dirent's name as the subject, and IN_ISDIR is added
// to events when this dirent refers to a directory.
func (d *Dirent) InotifyEvent(events, cookie uint32) {
	// N.B. The unlocks are done by hand instead of with defer: InotifyEvent
	// is on the hot path of all IO operations and the defers cost too much
	// for small IO operations.
	renameMu.RLock()

	if IsDir(d.Inode.StableAttr) {
		events |= linux.IN_ISDIR
	}

	// Order matters here: Linux always notifies the parent first.
	if parent := d.parent; parent != nil {
		// The name goes stale immediately w.r.t. renames (renameMu doesn't
		// protect against renames within the same directory). Keeping
		// parent.mu held across Notify() would not help, because Notify
		// offers no synchronous way of reading the name anyway.
		parent.mu.Lock()
		childName := d.name
		parent.mu.Unlock()

		parent.Inode.Watches.Notify(childName, events, cookie)
	}

	d.Inode.Watches.Notify("", events, cookie)
	renameMu.RUnlock()
}
// maybeExtendReference adds this Dirent to its MountSource's fscache when
// the MountSource's Keep reports true for it; otherwise it does nothing.
func (d *Dirent) maybeExtendReference() {
	msrc := d.Inode.MountSource
	if !msrc.Keep(d) {
		return
	}
	msrc.fscache.Add(d)
}
// dropExtendedReference removes this Dirent from its MountSource's fscache,
// releasing any cached reference the MountSource holds on it.
func (d *Dirent) dropExtendedReference() {
	msrc := d.Inode.MountSource
	msrc.fscache.Remove(d)
}
// lockForRename acquires renameMu (for writing) together with the mu locks
// of oldParent and newParent, as needed by Rename. It returns a function
// that releases everything it acquired. That function must always be called,
// including when the returned error is non-nil.
//
// The error is EINVAL when newParent lies underneath oldParent/oldName, i.e.
// the rename would move something into a subdirectory of itself.
func lockForRename(oldParent *Dirent, oldName string, newParent *Dirent, newName string) (func(), error) {
	renameMu.Lock()

	// Same directory: a single directory lock is enough.
	if oldParent == newParent {
		oldParent.mu.Lock()
		return func() {
			oldParent.mu.Unlock()
			renameMu.Unlock()
		}, nil
	}

	// Cross-directory renames need some care:
	//
	// - Another cross-directory Rename running concurrently might lock the
	// two directories in the opposite order; renameMu is held to rule that
	// out.
	//
	// - When one directory is an ancestor of the other, a concurrent Remove
	// may lock the descendant while still holding the ancestor's lock (e.g.
	// when DecRef runs the destructor, which takes the Dirent's mu). To stay
	// compatible, locks are always taken ancestor first, descendant second.
	// (The relationship can't change while renameMu is held.)

	// Walk up from newParent looking for oldParent. below always trails one
	// step behind anc, so on a match it is oldParent's child on the path
	// down to newParent (or newParent itself).
	below := newParent
	for anc := newParent.parent; anc != nil; below, anc = anc, anc.parent {
		if anc != oldParent {
			continue
		}

		// newParent is a descendant of oldParent: ancestor first.
		oldParent.mu.Lock()
		newParent.mu.Lock()
		release := func() {
			newParent.mu.Unlock()
			oldParent.mu.Unlock()
			renameMu.Unlock()
		}
		if below.name == oldName {
			// newParent isn't merely below oldParent, it is below
			// oldParent/oldName: this is an attempt to rename something
			// into a subdirectory of itself.
			return release, syscall.EINVAL
		}
		return release, nil
	}

	// At this point oldParent is either a descendant of newParent or the two
	// are unrelated. In both cases this order is safe:
	newParent.mu.Lock()
	oldParent.mu.Lock()
	return func() {
		oldParent.mu.Unlock()
		newParent.mu.Unlock()
		renameMu.Unlock()
	}, nil
}
// checkSticky enforces the sticky-bit restriction of dir for an operation on
// victim. It returns nil when dir does not have the sticky bit set, when the
// caller's effective KUID owns dir or victim, or when the capability check
// for CAP_FOWNER on victim's Inode succeeds. In every other case, including
// a failure to read either set of attributes, it returns EPERM.
func checkSticky(ctx context.Context, dir *Dirent, victim *Dirent) error {
	dirAttr, err := dir.Inode.UnstableAttr(ctx)
	if err != nil {
		return syserror.EPERM
	}
	if !dirAttr.Perms.Sticky {
		// No sticky bit, so nothing to enforce.
		return nil
	}

	caller := auth.CredentialsFromContext(ctx).EffectiveKUID
	if dirAttr.Owner.UID == caller {
		return nil
	}

	victimAttr, err := victim.Inode.UnstableAttr(ctx)
	if err != nil {
		return syserror.EPERM
	}

	// The capability is only consulted when the caller does not own victim.
	if victimAttr.Owner.UID == caller || victim.Inode.CheckCapability(ctx, linux.CAP_FOWNER) {
		return nil
	}
	return syserror.EPERM
}
// MayDelete reports, via its error result, whether `ctx` is allowed to
// delete or rename `name`, a child of `dir`. It requires write and execute
// permission on `dir`, resolves `name` relative to `root`, and then applies
// the mayDelete checks to the resulting Dirent.
//
// Compare Linux kernel fs/namei.c:may_delete.
func MayDelete(ctx context.Context, root, dir *Dirent, name string) error {
	required := PermMask{Write: true, Execute: true}
	if err := dir.Inode.CheckPermission(ctx, required); err != nil {
		return err
	}

	target, err := dir.Walk(ctx, root, name)
	if err != nil {
		return err
	}
	defer target.DecRef()

	return mayDelete(ctx, dir, target)
}
// mayDelete reports, via its error result, whether `ctx` is allowed to
// delete or rename `victim`, a child of `dir`. A checkSticky failure is
// returned as-is; otherwise EBUSY is returned when `victim` is a root.
//
// Preconditions: `dir` is writable and executable by `ctx`.
func mayDelete(ctx context.Context, dir, victim *Dirent) error {
	err := checkSticky(ctx, dir, victim)
	switch {
	case err != nil:
		return err
	case victim.IsRoot():
		return syserror.EBUSY
	}
	return nil
}
// Rename atomically converts the child of oldParent named oldName to a
// child of newParent named newName. The root dirent is used to resolve both
// names and must not be nil; Rename panics otherwise.
//
// Renaming a name onto itself (same parent and same name) returns nil
// without taking any locks.
//
// Errors returned directly by Rename include:
//   - ENOENT if either parent is frozen and not virtual.
//   - EBUSY if the source or an existing target is a mount point.
//   - EINVAL if the source is an ancestor of newParent.
//   - ENOTEMPTY if an existing target is an ancestor of oldParent.
//   - ENOTDIR or EISDIR if exactly one of the source and an existing target
//     is a directory.
//
// Errors from lockForRename, the permission checks, mayDelete, walking to
// either name and the file system's Rename are returned unchanged.
func Rename(ctx context.Context, root *Dirent, oldParent *Dirent, oldName string, newParent *Dirent, newName string) error {
if root == nil {
panic("Rename: root must not be nil")
}
// Renaming a name onto itself is a no-op.
if oldParent == newParent && oldName == newName {
return nil
}
// Acquire global renameMu lock, and mu locks on oldParent/newParent.
// unlock is deferred before err is checked because lockForRename requires
// it to be called even when an error is returned.
unlock, err := lockForRename(oldParent, oldName, newParent, newName)
defer unlock()
if err != nil {
return err
}
// Are we frozen?
// TODO(jamieliu): Is this the right errno?
if oldParent.frozen && !oldParent.Inode.IsVirtual() {
return syscall.ENOENT
}
if newParent.frozen && !newParent.Inode.IsVirtual() {
return syscall.ENOENT
}
// Do we have general permission to remove from oldParent and
// create/replace in newParent?
if err := oldParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
return err
}
if err := newParent.Inode.CheckPermission(ctx, PermMask{Write: true, Execute: true}); err != nil {
return err
}
// renamed is the dirent that will be renamed to something else.
renamed, err := oldParent.walk(ctx, root, oldName, false /* may unlock */)
if err != nil {
return err
}
defer renamed.DecRef()
// Check that the renamed dirent is deletable.
if err := mayDelete(ctx, oldParent, renamed); err != nil {
return err
}
// Check that the renamed dirent is not a mount point.
if renamed.isMountPointLocked() {
return syscall.EBUSY
}
// Source should not be an ancestor of the target.
if newParent.descendantOf(renamed) {
return syscall.EINVAL
}
// Per rename(2): "... EACCES: ... or oldpath is a directory and does not
// allow write permission (needed to update the .. entry)."
if IsDir(renamed.Inode.StableAttr) {
if err := renamed.Inode.CheckPermission(ctx, PermMask{Write: true}); err != nil {
return err
}
}
// replaced is the dirent that is being overwritten by rename.
replaced, err := newParent.walk(ctx, root, newName, false /* may unlock */)
if err != nil {
if err != syserror.ENOENT {
return err
}
// newName doesn't exist; simply create it below.
replaced = nil
} else {
// Check constraints on the dirent being replaced.
// NOTE(b/111808347): We don't want to keep replaced alive
// across the Rename, so must call DecRef manually (no defer).
// Check that we can delete replaced.
if err := mayDelete(ctx, newParent, replaced); err != nil {
replaced.DecRef()
return err
}
// Target should not be an ancestor of source.
if oldParent.descendantOf(replaced) {
replaced.DecRef()
// Note that Linux returns EINVAL if the source is an
// ancestor of target, but ENOTEMPTY if the target is
// an ancestor of source (unless RENAME_EXCHANGE flag
// is present). See fs/namei.c:renameat2.
return syscall.ENOTEMPTY
}
// Check that replaced is not a mount point.
if replaced.isMountPointLocked() {
replaced.DecRef()
return syscall.EBUSY
}
// Require that a directory is replaced by a directory.
oldIsDir := IsDir(renamed.Inode.StableAttr)
newIsDir := IsDir(replaced.Inode.StableAttr)
if !newIsDir && oldIsDir {
replaced.DecRef()
return syscall.ENOTDIR
}
if !oldIsDir && newIsDir {
replaced.DecRef()
return syscall.EISDIR
}
// Allow the file system to drop extra references on replaced.
replaced.dropExtendedReference()
// NOTE(b/31798319,b/31867149,b/31867671): Keeping a dirent
// open across renames is currently broken for multiple
// reasons, so we flush all references on the replaced node and
// its children.
replaced.Inode.Watches.Unpin(replaced)
replaced.mu.Lock()
replaced.flush()
replaced.mu.Unlock()
// Done with replaced. From here on the pointer is only compared against
// nil to tell the file system whether a target existed.
replaced.DecRef()
}
if err := renamed.Inode.Rename(ctx, oldParent, renamed, newParent, newName, replaced != nil); err != nil {
return err
}
// The file system rename succeeded; update the in-memory tree to match.
renamed.name = newName
renamed.parent = newParent
if oldParent != newParent {
// Reparent the reference held by renamed.parent. oldParent.DecRef
// can't destroy oldParent (and try to retake its lock) because
// Rename's caller must be holding a reference.
newParent.IncRef()
oldParent.DecRef()
}
// Drop any cached child entries for the new and old names.
if w, ok := newParent.children[newName]; ok {
w.Drop()
delete(newParent.children, newName)
}
if w, ok := oldParent.children[oldName]; ok {
w.Drop()
delete(oldParent.children, oldName)
}
// Add a weak reference from the new parent. This ensures that the child
// can still be found from the new parent if a prior hard reference is
// held on renamed.
//
// This is required for file lock correctness because file locks are per-Dirent
// and without maintaining a cached child (via a weak reference) for renamed,
// multiple Dirents can correspond to the same resource (by virtue of the renamed
// Dirent being unreachable by its parent and it being looked up).
newParent.children[newName] = refs.NewWeakRef(renamed, nil)
// Queue inotify events for the rename.
var ev uint32
if IsDir(renamed.Inode.StableAttr) {
ev |= linux.IN_ISDIR
}
// The same cookie is used for the MOVED_FROM and MOVED_TO events.
cookie := uniqueid.InotifyCookie(ctx)
oldParent.Inode.Watches.Notify(oldName, ev|linux.IN_MOVED_FROM, cookie)
newParent.Inode.Watches.Notify(newName, ev|linux.IN_MOVED_TO, cookie)
// Somewhat surprisingly, self move events do not have a cookie.
renamed.Inode.Watches.Notify("", linux.IN_MOVE_SELF, 0)
// Allow the file system to drop extra references on renamed.
renamed.dropExtendedReference()
// Same as replaced.flush above.
renamed.mu.Lock()
renamed.flush()
renamed.mu.Unlock()
return nil
}