// Copyright 2020 The gVisor Authors. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. package vfs import ( "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/context" "gvisor.dev/gvisor/pkg/sync" "gvisor.dev/gvisor/pkg/syserror" "gvisor.dev/gvisor/pkg/waiter" ) // epollCycleMu serializes attempts to register EpollInstances with other // EpollInstances in order to check for cycles. var epollCycleMu sync.Mutex // EpollInstance represents an epoll instance, as described by epoll(7). type EpollInstance struct { vfsfd FileDescription FileDescriptionDefaultImpl DentryMetadataFileDescriptionImpl // q holds waiters on this EpollInstance. q waiter.Queue // interest is the set of file descriptors that are registered with the // EpollInstance for monitoring. interest is protected by interestMu. interestMu sync.Mutex interest map[epollInterestKey]*epollInterest // mu protects fields in registered epollInterests. mu sync.Mutex // ready is the set of file descriptors that may be "ready" for I/O. Note // that this must be an ordered list, not a map: "If more than maxevents // file descriptors are ready when epoll_wait() is called, then successive // epoll_wait() calls will round robin through the set of ready file // descriptors. This behavior helps avoid starvation scenarios, where a // process fails to notice that additional file descriptors are ready // because it focuses on a set of file descriptors that are already known // to be ready." - epoll_wait(2) ready epollInterestList } type epollInterestKey struct { // file is the registered FileDescription. No reference is held on file; // instead, when the last reference is dropped, FileDescription.DecRef() // removes the FileDescription from all EpollInstances. file is immutable. file *FileDescription // num is the file descriptor number with which this entry was registered. // num is immutable. num int32 } // epollInterest represents an EpollInstance's interest in a file descriptor. type epollInterest struct { // epoll is the owning EpollInstance. epoll is immutable. epoll *EpollInstance // key is the file to which this epollInterest applies. key is immutable. key epollInterestKey // waiter is registered with key.file. entry is protected by epoll.mu. waiter waiter.Entry // mask is the event mask associated with this registration, including // flags EPOLLET and EPOLLONESHOT. mask is protected by epoll.mu. mask uint32 // ready is true if epollInterestEntry is linked into epoll.ready. ready // and epollInterestEntry are protected by epoll.mu. ready bool epollInterestEntry // userData is the struct epoll_event::data associated with this // epollInterest. userData is protected by epoll.mu. userData [2]int32 } // NewEpollInstanceFD returns a FileDescription representing a new epoll // instance. A reference is taken on the returned FileDescription. func (vfs *VirtualFilesystem) NewEpollInstanceFD() (*FileDescription, error) { vd := vfs.NewAnonVirtualDentry("[eventpoll]") defer vd.DecRef() ep := &EpollInstance{ interest: make(map[epollInterestKey]*epollInterest), } if err := ep.vfsfd.Init(ep, linux.O_RDWR, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ DenyPRead: true, DenyPWrite: true, UseDentryMetadata: true, }); err != nil { return nil, err } return &ep.vfsfd, nil } // Release implements FileDescriptionImpl.Release. func (ep *EpollInstance) Release() { // Unregister all polled fds. ep.interestMu.Lock() defer ep.interestMu.Unlock() for key, epi := range ep.interest { file := key.file file.epollMu.Lock() delete(file.epolls, epi) file.epollMu.Unlock() file.EventUnregister(&epi.waiter) } ep.interest = nil } // Readiness implements waiter.Waitable.Readiness. func (ep *EpollInstance) Readiness(mask waiter.EventMask) waiter.EventMask { if mask&waiter.EventIn == 0 { return 0 } ep.mu.Lock() for epi := ep.ready.Front(); epi != nil; epi = epi.Next() { wmask := waiter.EventMaskFromLinux(epi.mask) if epi.key.file.Readiness(wmask)&wmask != 0 { ep.mu.Unlock() return waiter.EventIn } } ep.mu.Unlock() return 0 } // EventRegister implements waiter.Waitable.EventRegister. func (ep *EpollInstance) EventRegister(e *waiter.Entry, mask waiter.EventMask) { ep.q.EventRegister(e, mask) } // EventUnregister implements waiter.Waitable.EventUnregister. func (ep *EpollInstance) EventUnregister(e *waiter.Entry) { ep.q.EventUnregister(e) } // Seek implements FileDescriptionImpl.Seek. func (ep *EpollInstance) Seek(ctx context.Context, offset int64, whence int32) (int64, error) { // Linux: fs/eventpoll.c:eventpoll_fops.llseek == noop_llseek return 0, nil } // AddInterest implements the semantics of EPOLL_CTL_ADD. // // Preconditions: A reference must be held on file. func (ep *EpollInstance) AddInterest(file *FileDescription, num int32, event linux.EpollEvent) error { // Check for cyclic polling if necessary. subep, _ := file.impl.(*EpollInstance) if subep != nil { epollCycleMu.Lock() // epollCycleMu must be locked for the rest of AddInterest to ensure // that cyclic polling is not introduced after the check. defer epollCycleMu.Unlock() if subep.mightPoll(ep) { return syserror.ELOOP } } ep.interestMu.Lock() defer ep.interestMu.Unlock() // Fail if the key is already registered. key := epollInterestKey{ file: file, num: num, } if _, ok := ep.interest[key]; ok { return syserror.EEXIST } // Register interest in file. mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP epi := &epollInterest{ epoll: ep, key: key, mask: mask, userData: event.Data, } epi.waiter.Callback = epi ep.interest[key] = epi wmask := waiter.EventMaskFromLinux(mask) file.EventRegister(&epi.waiter, wmask) // Check if the file is already ready. if file.Readiness(wmask)&wmask != 0 { epi.Callback(nil) } // Add epi to file.epolls so that it is removed when the last // FileDescription reference is dropped. file.epollMu.Lock() if file.epolls == nil { file.epolls = make(map[*epollInterest]struct{}) } file.epolls[epi] = struct{}{} file.epollMu.Unlock() return nil } func (ep *EpollInstance) mightPoll(ep2 *EpollInstance) bool { return ep.mightPollRecursive(ep2, 4) // Linux: fs/eventpoll.c:EP_MAX_NESTS } func (ep *EpollInstance) mightPollRecursive(ep2 *EpollInstance, remainingRecursion int) bool { ep.interestMu.Lock() defer ep.interestMu.Unlock() for key := range ep.interest { nextep, ok := key.file.impl.(*EpollInstance) if !ok { continue } if nextep == ep2 { return true } if remainingRecursion == 0 { return true } if nextep.mightPollRecursive(ep2, remainingRecursion-1) { return true } } return false } // ModifyInterest implements the semantics of EPOLL_CTL_MOD. // // Preconditions: A reference must be held on file. func (ep *EpollInstance) ModifyInterest(file *FileDescription, num int32, event linux.EpollEvent) error { ep.interestMu.Lock() defer ep.interestMu.Unlock() // Fail if the key is not already registered. epi, ok := ep.interest[epollInterestKey{ file: file, num: num, }] if !ok { return syserror.ENOENT } // Update epi for the next call to ep.ReadEvents(). mask := event.Events | linux.EPOLLERR | linux.EPOLLRDHUP ep.mu.Lock() epi.mask = mask epi.userData = event.Data ep.mu.Unlock() // Re-register with the new mask. file.EventUnregister(&epi.waiter) wmask := waiter.EventMaskFromLinux(mask) file.EventRegister(&epi.waiter, wmask) // Check if the file is already ready with the new mask. if file.Readiness(wmask)&wmask != 0 { epi.Callback(nil) } return nil } // DeleteInterest implements the semantics of EPOLL_CTL_DEL. // // Preconditions: A reference must be held on file. func (ep *EpollInstance) DeleteInterest(file *FileDescription, num int32) error { ep.interestMu.Lock() defer ep.interestMu.Unlock() // Fail if the key is not already registered. epi, ok := ep.interest[epollInterestKey{ file: file, num: num, }] if !ok { return syserror.ENOENT } // Unregister from the file so that epi will no longer be readied. file.EventUnregister(&epi.waiter) // Forget about epi. ep.removeLocked(epi) file.epollMu.Lock() delete(file.epolls, epi) file.epollMu.Unlock() return nil } // Callback implements waiter.EntryCallback.Callback. func (epi *epollInterest) Callback(*waiter.Entry) { newReady := false epi.epoll.mu.Lock() if !epi.ready { newReady = true epi.ready = true epi.epoll.ready.PushBack(epi) } epi.epoll.mu.Unlock() if newReady { epi.epoll.q.Notify(waiter.EventIn) } } // Preconditions: ep.interestMu must be locked. func (ep *EpollInstance) removeLocked(epi *epollInterest) { delete(ep.interest, epi.key) ep.mu.Lock() if epi.ready { epi.ready = false ep.ready.Remove(epi) } ep.mu.Unlock() } // ReadEvents reads up to len(events) ready events into events and returns the // number of events read. // // Preconditions: len(events) != 0. func (ep *EpollInstance) ReadEvents(events []linux.EpollEvent) int { i := 0 // Hot path: avoid defer. ep.mu.Lock() var next *epollInterest var requeue epollInterestList for epi := ep.ready.Front(); epi != nil; epi = next { next = epi.Next() // Regardless of what else happens, epi is initially removed from the // ready list. ep.ready.Remove(epi) wmask := waiter.EventMaskFromLinux(epi.mask) ievents := epi.key.file.Readiness(wmask) & wmask if ievents == 0 { // Leave epi off the ready list. epi.ready = false continue } // Determine what we should do with epi. switch { case epi.mask&linux.EPOLLONESHOT != 0: // Clear all events from the mask; they must be re-added by // EPOLL_CTL_MOD. epi.mask &= linux.EP_PRIVATE_BITS fallthrough case epi.mask&linux.EPOLLET != 0: // Leave epi off the ready list. epi.ready = false default: // Queue epi to be moved to the end of the ready list. requeue.PushBack(epi) } // Report ievents. events[i] = linux.EpollEvent{ Events: ievents.ToLinux(), Data: epi.userData, } i++ if i == len(events) { break } } ep.ready.PushBackList(&requeue) ep.mu.Unlock() return i }