// Copyright 2018 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package fs import ( "sync" "sync/atomic" "gvisor.googlesource.com/gvisor/pkg/abi/linux" "gvisor.googlesource.com/gvisor/pkg/sentry/arch" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/memmap" "gvisor.googlesource.com/gvisor/pkg/sentry/uniqueid" "gvisor.googlesource.com/gvisor/pkg/sentry/usermem" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/waiter" ) // Inotify represents an inotify instance created by inotify_init(2) or // inotify_init1(2). Inotify implements the FileOperations interface. // // Lock ordering: // Inotify.mu -> Inode.Watches.mu -> Watch.mu -> Inotify.evMu // // +stateify savable type Inotify struct { // Unique identifier for this inotify instance. We don't just reuse the // inotify fd because fds can be duped. These should not be exposed to the // user, since we may aggressively reuse an id on S/R. id uint64 waiter.Queue `state:"nosave"` // evMu *only* protects the events list. We need a separate lock because // while queuing events, a watch needs to lock the event queue, and using mu // for that would violate lock ordering since at that point the calling // goroutine already holds Watch.target.Watches.mu. evMu sync.Mutex `state:"nosave"` // A list of pending events for this inotify instance. Protected by evMu. events eventList // A scratch buffer, use to serialize inotify events. 
Use allocate this // ahead of time and reuse performance. Protected by evMu. scratch []byte // mu protects the fields below. mu sync.Mutex `state:"nosave"` // The next watch descriptor number to use for this inotify instance. Note // that Linux starts numbering watch descriptors from 1. nextWatch int32 // Map from watch descriptors to watch objects. watches map[int32]*Watch } // NewInotify constructs a new Inotify instance. func NewInotify(ctx context.Context) *Inotify { return &Inotify{ id: uniqueid.GlobalFromContext(ctx), scratch: make([]byte, inotifyEventBaseSize), nextWatch: 1, // Linux starts numbering watch descriptors from 1. watches: make(map[int32]*Watch), } } // Release implements FileOperations.Release. Release removes all watches and // frees all resources for an inotify instance. func (i *Inotify) Release() { // We need to hold i.mu to avoid a race with concurrent calls to // Inotify.targetDestroyed from Watches. There's no risk of Watches // accessing this Inotify after the destructor ends, because we remove all // references to it below. i.mu.Lock() defer i.mu.Unlock() for _, w := range i.watches { // Remove references to the watch from the watch target. We don't need // to worry about the references from the owner instance, since we're in // the owner's destructor. w.target.Watches.Remove(w.ID()) // Don't leak any references to the target, held by pins in the watch. w.destroy() } } // Readiness implements waiter.Waitable.Readiness. // // Readiness indicates whether there are pending events for an inotify instance. func (i *Inotify) Readiness(mask waiter.EventMask) waiter.EventMask { ready := waiter.EventMask(0) i.evMu.Lock() defer i.evMu.Unlock() if !i.events.Empty() { ready |= waiter.EventIn } return mask & ready } // Seek implements FileOperations.Seek. func (*Inotify) Seek(context.Context, *File, SeekWhence, int64) (int64, error) { return 0, syserror.ESPIPE } // Readdir implements FileOperatons.Readdir. 
func (*Inotify) Readdir(context.Context, *File, DentrySerializer) (int64, error) {
	return 0, syserror.ENOTDIR
}

// Write implements FileOperations.Write. It always fails with EBADF.
func (*Inotify) Write(context.Context, *File, usermem.IOSequence, int64) (int64, error) {
	return 0, syserror.EBADF
}

// Read implements FileOperations.Read.
//
// Read dequeues as many pending events as fit into dst and serializes them
// into it. It returns the number of bytes written. It returns:
//   - EINVAL if dst cannot hold even a bare event header, or cannot hold the
//     first pending event;
//   - ErrWouldBlock if no events are pending;
//   - the error from copying an event out, in which case the byte count is 0
//     even if earlier events were already copied.
func (i *Inotify) Read(ctx context.Context, _ *File, dst usermem.IOSequence, _ int64) (int64, error) {
	if dst.NumBytes() < inotifyEventBaseSize {
		return 0, syserror.EINVAL
	}

	i.evMu.Lock()
	defer i.evMu.Unlock()

	if i.events.Empty() {
		// Nothing to read yet, tell caller to block.
		return 0, syserror.ErrWouldBlock
	}

	var writeLen int64
	for it := i.events.Front(); it != nil; {
		// Advance the iterator *before* the current element is unlinked.
		// Calling Next() on an element after it has been removed is only
		// valid if Remove leaves the element's links untouched, which we
		// should not depend on.
		event := it
		it = it.Next()

		// Does the buffer have enough remaining space to hold the event we're
		// about to write out?
		if dst.NumBytes() < int64(event.sizeOf()) {
			if writeLen > 0 {
				// Buffer wasn't big enough for all pending events, but we did
				// write some events out.
				return writeLen, nil
			}
			return 0, syserror.EINVAL
		}

		// Linux always dequeues an available event as long as there's enough
		// buffer space to copy it out, even if the copy below fails. Emulate
		// this behaviour.
		i.events.Remove(event)

		// Buffer has enough space, copy event to the read buffer.
		n, err := event.CopyTo(ctx, i.scratch, dst)
		if err != nil {
			return 0, err
		}

		writeLen += n
		dst = dst.DropFirst64(n)
	}
	return writeLen, nil
}

// WriteTo implements FileOperations.WriteTo. It always fails with ENOSYS.
func (*Inotify) WriteTo(context.Context, *File, *File, SpliceOpts) (int64, error) {
	return 0, syserror.ENOSYS
}

// Fsync implements FileOperations.Fsync. It always fails with EINVAL.
func (*Inotify) Fsync(context.Context, *File, int64, int64, SyncType) error {
	return syserror.EINVAL
}

// ReadFrom implements FileOperations.ReadFrom. It always fails with ENOSYS.
func (*Inotify) ReadFrom(context.Context, *File, *File, SpliceOpts) (int64, error) {
	return 0, syserror.ENOSYS
}

// Flush implements FileOperations.Flush.
func (*Inotify) Flush(context.Context, *File) error { return nil } // ConfigureMMap implements FileOperations.ConfigureMMap. func (*Inotify) ConfigureMMap(context.Context, *File, *memmap.MMapOpts) error { return syserror.ENODEV } // UnstableAttr implements FileOperations.UnstableAttr. func (i *Inotify) UnstableAttr(ctx context.Context, file *File) (UnstableAttr, error) { return file.Dirent.Inode.UnstableAttr(ctx) } // Ioctl implements fs.FileOperations.Ioctl. func (i *Inotify) Ioctl(ctx context.Context, io usermem.IO, args arch.SyscallArguments) (uintptr, error) { switch args[1].Int() { case linux.FIONREAD: i.evMu.Lock() defer i.evMu.Unlock() var n uint32 for e := i.events.Front(); e != nil; e = e.Next() { n += uint32(e.sizeOf()) } var buf [4]byte usermem.ByteOrder.PutUint32(buf[:], n) _, err := io.CopyOut(ctx, args[2].Pointer(), buf[:], usermem.IOOpts{}) return 0, err default: return 0, syserror.ENOTTY } } func (i *Inotify) queueEvent(ev *Event) { i.evMu.Lock() // Check if we should coalesce the event we're about to queue with the last // one currently in the queue. Events are coalesced if they are identical. if last := i.events.Back(); last != nil { if ev.equals(last) { // "Coalesce" the two events by simply not queuing the new one. We // don't need to raise a waiter.EventIn notification because no new // data is available for reading. i.evMu.Unlock() return } } i.events.PushBack(ev) // Release mutex before notifying waiters because we don't control what they // can do. i.evMu.Unlock() i.Queue.Notify(waiter.EventIn) } // newWatchLocked creates and adds a new watch to target. func (i *Inotify) newWatchLocked(target *Dirent, mask uint32) *Watch { wd := i.nextWatch i.nextWatch++ watch := &Watch{ owner: i, wd: wd, mask: mask, target: target.Inode, pins: make(map[*Dirent]bool), } i.watches[wd] = watch // Grab an extra reference to target to prevent it from being evicted from // memory. 
This ref is dropped during either watch removal, target // destruction, or inotify instance destruction. See callers of Watch.Unpin. watch.Pin(target) target.Inode.Watches.Add(watch) return watch } // targetDestroyed is called by w to notify i that w's target is gone. This // automatically generates a watch removal event. func (i *Inotify) targetDestroyed(w *Watch) { i.mu.Lock() _, found := i.watches[w.wd] delete(i.watches, w.wd) i.mu.Unlock() if found { i.queueEvent(newEvent(w.wd, "", linux.IN_IGNORED, 0)) } } // AddWatch constructs a new inotify watch and adds it to the target dirent. It // returns the watch descriptor returned by inotify_add_watch(2). func (i *Inotify) AddWatch(target *Dirent, mask uint32) int32 { // Note: Locking this inotify instance protects the result returned by // Lookup() below. With the lock held, we know for sure the lookup result // won't become stale because it's impossible for *this* instance to // add/remove watches on target. i.mu.Lock() defer i.mu.Unlock() // Does the target already have a watch from this inotify instance? if existing := target.Inode.Watches.Lookup(i.id); existing != nil { // This may be a watch on a different dirent pointing to the // same inode. Obtain an extra reference if necessary. existing.Pin(target) newmask := mask if mergeMask := mask&linux.IN_MASK_ADD != 0; mergeMask { // "Add (OR) events to watch mask for this pathname if it already // exists (instead of replacing mask)." -- inotify(7) newmask |= atomic.LoadUint32(&existing.mask) } atomic.StoreUint32(&existing.mask, newmask) return existing.wd } // No existing watch, create a new watch. watch := i.newWatchLocked(target, mask) return watch.wd } // RmWatch implements watcher.Watchable.RmWatch. // // RmWatch looks up an inotify watch for the given 'wd' and configures the // target dirent to stop sending events to this inotify instance. func (i *Inotify) RmWatch(wd int32) error { i.mu.Lock() // Find the watch we were asked to removed. 
watch, ok := i.watches[wd] if !ok { i.mu.Unlock() return syserror.EINVAL } // Remove the watch from this instance. delete(i.watches, wd) // Remove the watch from the watch target. watch.target.Watches.Remove(watch.ID()) // The watch is now isolated and we can safely drop the instance lock. We // need to do so because watch.destroy() acquires Watch.mu, which cannot be // acquired with Inotify.mu held. i.mu.Unlock() // Generate the event for the removal. i.queueEvent(newEvent(watch.wd, "", linux.IN_IGNORED, 0)) // Remove all pins. watch.destroy() return nil }