gvisor/pkg/sentry/fs/inotify.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fs

import (
	"sync"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/context"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
	"gvisor.dev/gvisor/pkg/sentry/usermem"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/waiter"
)

// Inotify represents an inotify instance created by inotify_init(2) or
// inotify_init1(2). Inotify implements the FileOperations interface.
//
// Lock ordering:
//   Inotify.mu -> Inode.Watches.mu -> Watch.mu -> Inotify.evMu
//
// +stateify savable
type Inotify struct {
	// Unique identifier for this inotify instance. We don't just reuse the
	// inotify fd because fds can be duped. These should not be exposed to the
	// user, since we may aggressively reuse an id on S/R.
	id uint64

	waiter.Queue `state:"nosave"`

	// evMu *only* protects the events list. We need a separate lock because
	// while queuing events, a watch needs to lock the event queue, and using mu
	// for that would violate lock ordering since at that point the calling
	// goroutine already holds Watch.target.Watches.mu.
	evMu sync.Mutex `state:"nosave"`

	// A list of pending events for this inotify instance. Protected by evMu.
	events eventList

	// A scratch buffer, use to serialize inotify events. Use allocate this
	// ahead of time and reuse performance. Protected by evMu.
	scratch []byte

	// mu protects the fields below.
	mu sync.Mutex `state:"nosave"`

	// The next watch descriptor number to use for this inotify instance. Note
	// that Linux starts numbering watch descriptors from 1.
	nextWatch int32

	// Map from watch descriptors to watch objects.
	watches map[int32]*Watch
}

// NewInotify constructs a new Inotify instance.
func NewInotify(ctx context.Context) *Inotify {
	return &Inotify{
		id:        uniqueid.GlobalFromContext(ctx),
		scratch:   make([]byte, inotifyEventBaseSize),
		nextWatch: 1, // Linux starts numbering watch descriptors from 1.
		watches:   make(map[int32]*Watch),
	}
}

// Release implements FileOperations.Release. Release removes all watches and
// frees all resources for an inotify instance.
func (i *Inotify) Release() {
	// We need to hold i.mu to avoid a race with concurrent calls to
	// Inotify.targetDestroyed from Watches. There's no risk of Watches
	// accessing this Inotify after the destructor ends, because we remove all
	// references to it below.
	i.mu.Lock()
	defer i.mu.Unlock()
	for _, w := range i.watches {
		// Remove references to the watch from the watch target. We don't need
		// to worry about the references from the owner instance, since we're in
		// the owner's destructor.
		w.target.Watches.Remove(w.ID())
		// Don't leak any references to the target, held by pins in the watch.
		w.destroy()
	}
}

// Readiness implements waiter.Waitable.Readiness.
//
// Readiness indicates whether there are pending events for an inotify instance.
func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask {
	ready := waiter.EventMask(0)

	i.evMu.Lock()
	defer i.evMu.Unlock()

	if !i.events.Empty() {
		ready |= waiter.EventIn
	}

	return mask & ready
}

// Seek implements FileOperations.Seek.
func (*Inotify) Seek(context.Context, *File, SeekWhence, int64) (int64, error) {
	return 0, syserror.ESPIPE
}

// Readdir implements FileOperatons.Readdir.
func (*Inotify) Readdir(context.Context, *File, DentrySerializer) (int64, error) {
	return 0, syserror.ENOTDIR
}

// Write implements FileOperations.Write.
func (*Inotify) Write(context.Context, *File, usermem.IOSequence, int64) (int64, error) {
	return 0, syserror.EBADF
}

// Read implements FileOperations.Read.
func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ int64) (int64, error) {
	if dst.NumBytes() < inotifyEventBaseSize {
		return 0, syserror.EINVAL
	}

	i.evMu.Lock()
	defer i.evMu.Unlock()

	if i.events.Empty() {
		// Nothing to read yet, tell caller to block.
		return 0, syserror.ErrWouldBlock
	}

	var writeLen int64
	for event := i.events.Front(); event != nil; event = event.Next() {
		// Does the buffer have enough remaining space to hold the event we're
		// about to write out?
		if dst.NumBytes() < int64(event.sizeOf()) {
			if writeLen > 0 {
				// Buffer wasn't big enough for all pending events, but we did
				// write some events out.
				return writeLen, nil
			}
			return 0, syserror.EINVAL
		}

		// Linux always dequeues an available event as long as there's enough
		// buffer space to copy it out, even if the copy below fails. Emulate
		// this behaviour.
		i.events.Remove(event)

		// Buffer has enough space, copy event to the read buffer.
		n, err := event.CopyTo(ctx, i.scratch, dst)
		if err != nil {
			return 0, err
		}

		writeLen += n
		dst = dst.DropFirst64(n)
	}
	return writeLen, nil
}

// WriteTo implements FileOperations.WriteTo.
func (*Inotify) WriteTo(context.Context, *File, *File, SpliceOpts) (int64, error) {
	return 0, syserror.ENOSYS
}

// Fsync implements FileOperations.Fsync.
func (*Inotify) Fsync(context.Context, *File, int64, int64, SyncType) error {
	return syserror.EINVAL
}

// ReadFrom implements FileOperations.ReadFrom.
func (*Inotify) ReadFrom(context.Context, *File, *File, SpliceOpts) (int64, error) {
	return 0, syserror.ENOSYS
}

// Flush implements FileOperations.Flush.
func (*Inotify) Flush(context.Context, *File) error {
	return nil
}

// ConfigureMMap implements FileOperations.ConfigureMMap.
func (*Inotify) ConfigureMMap(context.Context, *File, *memmap.MMapOpts) error {
	return syserror.ENODEV
}

// UnstableAttr implements FileOperations.UnstableAttr.
func (i *Inotify) UnstableAttr(ctx context.Context, file *File) (UnstableAttr, error) {
	return file.Dirent.Inode.UnstableAttr(ctx)
}

// Ioctl implements fs.FileOperations.Ioctl.
func (i *Inotify) Ioctl(ctx context.Context, _ *File, io usermem.IO, args arch.SyscallArguments) (uintptr, error) {
	switch args[1].Int() {
	case linux.FIONREAD:
		i.evMu.Lock()
		defer i.evMu.Unlock()
		var n uint32
		for e := i.events.Front(); e != nil; e = e.Next() {
			n += uint32(e.sizeOf())
		}
		var buf [4]byte
		usermem.ByteOrder.PutUint32(buf[:], n)
		_, err := io.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{})
		return 0, err

	default:
		return 0, syserror.ENOTTY
	}
}

func (i *Inotify) queueEvent(ev *Event) {
	i.evMu.Lock()

	// Check if we should coalesce the event we're about to queue with the last
	// one currently in the queue. Events are coalesced if they are identical.
	if last := i.events.Back(); last != nil {
		if ev.equals(last) {
			// "Coalesce" the two events by simply not queuing the new one. We
			// don't need to raise a waiter.EventIn notification because no new
			// data is available for reading.
			i.evMu.Unlock()
			return
		}
	}

	i.events.PushBack(ev)

	// Release mutex before notifying waiters because we don't control what they
	// can do.
	i.evMu.Unlock()

	i.Queue.Notify(waiter.EventIn)
}

// newWatchLocked creates and adds a new watch to target.
func (i *Inotify) newWatchLocked(target *Dirent, mask uint32) *Watch {
	wd := i.nextWatch
	i.nextWatch++

	watch := &Watch{
		owner:  i,
		wd:     wd,
		mask:   mask,
		target: target.Inode,
		pins:   make(map[*Dirent]bool),
	}

	i.watches[wd] = watch

	// Grab an extra reference to target to prevent it from being evicted from
	// memory. This ref is dropped during either watch removal, target
	// destruction, or inotify instance destruction. See callers of Watch.Unpin.
	watch.Pin(target)
	target.Inode.Watches.Add(watch)

	return watch
}

// targetDestroyed is called by w to notify i that w's target is gone. This
// automatically generates a watch removal event.
func (i *Inotify) targetDestroyed(w *Watch) {
	i.mu.Lock()
	_, found := i.watches[w.wd]
	delete(i.watches, w.wd)
	i.mu.Unlock()

	if found {
		i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0))
	}
}

// AddWatch constructs a new inotify watch and adds it to the target dirent. It
// returns the watch descriptor returned by inotify_add_watch(2).
func (i *Inotify) AddWatch(target *Dirent, mask uint32) int32 {
	// Note: Locking this inotify instance protects the result returned by
	// Lookup() below. With the lock held, we know for sure the lookup result
	// won't become stale because it's impossible for *this* instance to
	// add/remove watches on target.
	i.mu.Lock()
	defer i.mu.Unlock()

	// Does the target already have a watch from this inotify instance?
	if existing := target.Inode.Watches.Lookup(i.id); existing != nil {
		// This may be a watch on a different dirent pointing to the
		// same inode. Obtain an extra reference if necessary.
		existing.Pin(target)

		newmask := mask
		if mergeMask := mask&linux.IN_MASK_ADD != 0; mergeMask {
			// "Add (OR) events to watch mask for this pathname if it already
			// exists (instead of replacing mask)." -- inotify(7)
			newmask |= atomic.LoadUint32(&existing.mask)
		}
		atomic.StoreUint32(&existing.mask, newmask)
		return existing.wd
	}

	// No existing watch, create a new watch.
	watch := i.newWatchLocked(target, mask)
	return watch.wd
}

// RmWatch implements watcher.Watchable.RmWatch.
//
// RmWatch looks up an inotify watch for the given 'wd' and configures the
// target dirent to stop sending events to this inotify instance.
func (i *Inotify) RmWatch(wd int32) error {
	i.mu.Lock()

	// Find the watch we were asked to removed.
	watch, ok := i.watches[wd]
	if !ok {
		i.mu.Unlock()
		return syserror.EINVAL
	}

	// Remove the watch from this instance.
	delete(i.watches, wd)

	// Remove the watch from the watch target.
	watch.target.Watches.Remove(watch.ID())

	// The watch is now isolated and we can safely drop the instance lock. We
	// need to do so because watch.destroy() acquires Watch.mu, which cannot be
	// acquired with Inotify.mu held.
	i.mu.Unlock()

	// Generate the event for the removal.
	i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0))

	// Remove all pins.
	watch.destroy()

	return nil
}