2019-04-29 21:25:05 +00:00
|
|
|
// Copyright 2018 The gVisor Authors.
|
2018-04-27 17:37:02 +00:00
|
|
|
//
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
|
|
|
// Package eventfd provides an implementation of Linux's file-based event
|
|
|
|
// notification.
|
|
|
|
package eventfd
|
|
|
|
|
|
|
|
import (
|
|
|
|
"math"
|
|
|
|
"sync"
|
|
|
|
"syscall"
|
|
|
|
|
2018-07-16 19:19:02 +00:00
|
|
|
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
|
2018-12-15 02:03:43 +00:00
|
|
|
"gvisor.googlesource.com/gvisor/pkg/fdnotifier"
|
2018-04-27 17:37:02 +00:00
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/anon"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/syserror"
|
|
|
|
"gvisor.googlesource.com/gvisor/pkg/waiter"
|
|
|
|
)
|
|
|
|
|
|
|
|
// EventOperations represents an event with the semantics of Linux's file-based event
|
2018-07-16 19:19:02 +00:00
|
|
|
// notification (eventfd). Eventfds are usually internal to the Sentry but in certain
|
|
|
|
// situations they may be converted into a host-backed eventfd.
|
2018-08-02 17:41:44 +00:00
|
|
|
//
|
|
|
|
// +stateify savable
|
2018-04-27 17:37:02 +00:00
|
|
|
type EventOperations struct {
|
2019-04-11 07:41:42 +00:00
|
|
|
fsutil.FileNoopRelease `state:"nosave"`
|
|
|
|
fsutil.FilePipeSeek `state:"nosave"`
|
|
|
|
fsutil.FileNotDirReaddir `state:"nosave"`
|
|
|
|
fsutil.FileNoFsync `state:"nosave"`
|
|
|
|
fsutil.FileNoIoctl `state:"nosave"`
|
2019-05-21 22:17:05 +00:00
|
|
|
fsutil.FileNoMMap `state:"nosave"`
|
|
|
|
fsutil.FileNoSplice `state:"nosave"`
|
|
|
|
fsutil.FileNoopFlush `state:"nosave"`
|
2019-04-11 07:41:42 +00:00
|
|
|
fsutil.FileUseInodeUnstableAttr `state:"nosave"`
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
// Mutex that protects accesses to the fields of this event.
|
|
|
|
mu sync.Mutex `state:"nosave"`
|
|
|
|
|
|
|
|
// Queue is used to notify interested parties when the event object
|
|
|
|
// becomes readable or writable.
|
2018-08-02 17:41:44 +00:00
|
|
|
wq waiter.Queue `state:"zerovalue"`
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
// val is the current value of the event counter.
|
|
|
|
val uint64
|
|
|
|
|
|
|
|
// semMode specifies whether the event is in "semaphore" mode.
|
|
|
|
semMode bool
|
2018-07-16 19:19:02 +00:00
|
|
|
|
|
|
|
// hostfd indicates whether this eventfd is passed through to the host.
|
|
|
|
hostfd int
|
2018-04-27 17:37:02 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// New creates a new event object with the supplied initial value and mode.
|
|
|
|
func New(ctx context.Context, initVal uint64, semMode bool) *fs.File {
|
|
|
|
// name matches fs/eventfd.c:eventfd_file_create.
|
|
|
|
dirent := fs.NewDirent(anon.NewInode(ctx), "anon_inode:[eventfd]")
|
Drop one dirent reference after referenced by file
When pipe is created, a dirent of pipe will be
created and its initial reference is set as 0.
Cause all dirent will only be destroyed when
the reference decreased to -1, so there is already
a 'initial reference' of dirent after it created.
For destroying dirent after all reference released,
the correct way is to drop the 'initial reference'
once someone hold a reference to the dirent, such
as fs.NewFile, otherwise the reference of dirent
will stay 0 all the time, and will cause memory
leak of dirent.
Except pipe, timerfd/eventfd/epoll has the same
problem
Here is a simple case to create memory leak of dirent
for pipe/timerfd/eventfd/epoll in C langange, after
run the case, pprof the runsc process, you will
find lots dirents of pipe/timerfd/eventfd/epoll not
freed:
int main(int argc, char *argv[])
{
int i;
int n;
int pipefd[2];
if (argc != 3) {
printf("Usage: %s epoll|timerfd|eventfd|pipe <iterations>\n", argv[0]);
}
n = strtol(argv[2], NULL, 10);
if (strcmp(argv[1], "epoll") == 0) {
for (i = 0; i < n; ++i)
close(epoll_create(1));
} else if (strcmp(argv[1], "timerfd") == 0) {
for (i = 0; i < n; ++i)
close(timerfd_create(CLOCK_REALTIME, 0));
} else if (strcmp(argv[1], "eventfd") == 0) {
for (i = 0; i < n; ++i)
close(eventfd(0, 0));
} else if (strcmp(argv[1], "pipe") == 0) {
for (i = 0; i < n; ++i)
if (pipe(pipefd) == 0) {
close(pipefd[0]);
close(pipefd[1]);
}
}
printf("%s %s test finished\r\n",argv[1],argv[2]);
return 0;
}
Change-Id: Ia1b8a1fb9142edb00c040e44ec644d007f81f5d2
PiperOrigin-RevId: 251531096
2019-06-04 22:39:24 +00:00
|
|
|
// Release the initial dirent reference after NewFile takes a reference.
|
|
|
|
defer dirent.DecRef()
|
2018-04-27 17:37:02 +00:00
|
|
|
return fs.NewFile(ctx, dirent, fs.FileFlags{Read: true, Write: true}, &EventOperations{
|
|
|
|
val: initVal,
|
|
|
|
semMode: semMode,
|
2018-07-16 19:19:02 +00:00
|
|
|
hostfd: -1,
|
2018-04-27 17:37:02 +00:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
2018-07-16 19:19:02 +00:00
|
|
|
// HostFD returns the host eventfd associated with this event.
|
|
|
|
func (e *EventOperations) HostFD() (int, error) {
|
|
|
|
e.mu.Lock()
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
if e.hostfd >= 0 {
|
|
|
|
return e.hostfd, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
flags := linux.EFD_NONBLOCK
|
|
|
|
if e.semMode {
|
|
|
|
flags |= linux.EFD_SEMAPHORE
|
|
|
|
}
|
|
|
|
|
|
|
|
fd, _, err := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(e.val), uintptr(flags), 0)
|
|
|
|
if err != 0 {
|
|
|
|
return -1, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if err := fdnotifier.AddFD(int32(fd), &e.wq); err != nil {
|
|
|
|
syscall.Close(int(fd))
|
|
|
|
return -1, err
|
|
|
|
}
|
|
|
|
|
|
|
|
e.hostfd = int(fd)
|
|
|
|
return e.hostfd, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Release implements fs.FileOperations.Release.
|
|
|
|
func (e *EventOperations) Release() {
|
|
|
|
e.mu.Lock()
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
if e.hostfd >= 0 {
|
|
|
|
fdnotifier.RemoveFD(int32(e.hostfd))
|
|
|
|
syscall.Close(e.hostfd)
|
|
|
|
e.hostfd = -1
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// Read implements fs.FileOperations.Read.
|
|
|
|
func (e *EventOperations) Read(ctx context.Context, _ *fs.File, dst usermem.IOSequence, _ int64) (int64, error) {
|
|
|
|
if dst.NumBytes() < 8 {
|
|
|
|
return 0, syscall.EINVAL
|
|
|
|
}
|
|
|
|
if err := e.read(ctx, dst); err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
return 8, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Write implements fs.FileOperations.Write.
|
|
|
|
func (e *EventOperations) Write(ctx context.Context, _ *fs.File, src usermem.IOSequence, _ int64) (int64, error) {
|
|
|
|
if src.NumBytes() < 8 {
|
|
|
|
return 0, syscall.EINVAL
|
|
|
|
}
|
|
|
|
if err := e.write(ctx, src); err != nil {
|
|
|
|
return 0, err
|
|
|
|
}
|
|
|
|
return 8, nil
|
|
|
|
}
|
|
|
|
|
2018-07-16 19:19:02 +00:00
|
|
|
// Must be called with e.mu locked.
|
|
|
|
func (e *EventOperations) hostRead(ctx context.Context, dst usermem.IOSequence) error {
|
|
|
|
var buf [8]byte
|
|
|
|
|
|
|
|
if _, err := syscall.Read(e.hostfd, buf[:]); err != nil {
|
|
|
|
if err == syscall.EWOULDBLOCK {
|
|
|
|
return syserror.ErrWouldBlock
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
_, err := dst.CopyOut(ctx, buf[:])
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
func (e *EventOperations) read(ctx context.Context, dst usermem.IOSequence) error {
|
|
|
|
e.mu.Lock()
|
|
|
|
|
2018-07-16 19:19:02 +00:00
|
|
|
if e.hostfd >= 0 {
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
return e.hostRead(ctx, dst)
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// We can't complete the read if the value is currently zero.
|
|
|
|
if e.val == 0 {
|
|
|
|
e.mu.Unlock()
|
|
|
|
return syserror.ErrWouldBlock
|
|
|
|
}
|
|
|
|
|
|
|
|
// Update the value based on the mode the event is operating in.
|
|
|
|
var val uint64
|
|
|
|
if e.semMode {
|
|
|
|
val = 1
|
|
|
|
// Consistent with Linux, this is done even if writing to memory fails.
|
|
|
|
e.val--
|
|
|
|
} else {
|
|
|
|
val = e.val
|
|
|
|
e.val = 0
|
|
|
|
}
|
|
|
|
|
|
|
|
e.mu.Unlock()
|
|
|
|
|
|
|
|
// Notify writers. We do this even if we were already writable because
|
|
|
|
// it is possible that a writer is waiting to write the maximum value
|
|
|
|
// to the event.
|
2018-07-16 19:19:02 +00:00
|
|
|
e.wq.Notify(waiter.EventOut)
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
var buf [8]byte
|
|
|
|
usermem.ByteOrder.PutUint64(buf[:], val)
|
|
|
|
_, err := dst.CopyOut(ctx, buf[:])
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2018-07-16 19:19:02 +00:00
|
|
|
// Must be called with e.mu locked.
|
|
|
|
func (e *EventOperations) hostWrite(val uint64) error {
|
|
|
|
var buf [8]byte
|
|
|
|
usermem.ByteOrder.PutUint64(buf[:], val)
|
|
|
|
_, err := syscall.Write(e.hostfd, buf[:])
|
|
|
|
if err == syscall.EWOULDBLOCK {
|
|
|
|
return syserror.ErrWouldBlock
|
|
|
|
}
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
func (e *EventOperations) write(ctx context.Context, src usermem.IOSequence) error {
|
|
|
|
var buf [8]byte
|
|
|
|
if _, err := src.CopyIn(ctx, buf[:]); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
val := usermem.ByteOrder.Uint64(buf[:])
|
|
|
|
|
|
|
|
return e.Signal(val)
|
|
|
|
}
|
|
|
|
|
|
|
|
// Signal is an internal function to signal the event fd.
|
|
|
|
func (e *EventOperations) Signal(val uint64) error {
|
|
|
|
if val == math.MaxUint64 {
|
|
|
|
return syscall.EINVAL
|
|
|
|
}
|
|
|
|
|
|
|
|
e.mu.Lock()
|
|
|
|
|
2018-07-16 19:19:02 +00:00
|
|
|
if e.hostfd >= 0 {
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
return e.hostWrite(val)
|
|
|
|
}
|
|
|
|
|
2018-04-27 17:37:02 +00:00
|
|
|
// We only allow writes that won't cause the value to go over the max
|
|
|
|
// uint64 minus 1.
|
|
|
|
if val > math.MaxUint64-1-e.val {
|
|
|
|
e.mu.Unlock()
|
|
|
|
return syserror.ErrWouldBlock
|
|
|
|
}
|
|
|
|
|
|
|
|
e.val += val
|
|
|
|
e.mu.Unlock()
|
|
|
|
|
|
|
|
// Always trigger a notification.
|
2018-07-16 19:19:02 +00:00
|
|
|
e.wq.Notify(waiter.EventIn)
|
2018-04-27 17:37:02 +00:00
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Readiness returns the ready events for the event fd.
|
|
|
|
func (e *EventOperations) Readiness(mask waiter.EventMask) waiter.EventMask {
|
|
|
|
e.mu.Lock()
|
2018-07-16 19:19:02 +00:00
|
|
|
if e.hostfd >= 0 {
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
return fdnotifier.NonBlockingPoll(int32(e.hostfd), mask)
|
|
|
|
}
|
|
|
|
|
|
|
|
ready := waiter.EventMask(0)
|
2018-04-27 17:37:02 +00:00
|
|
|
if e.val > 0 {
|
|
|
|
ready |= waiter.EventIn
|
|
|
|
}
|
|
|
|
|
|
|
|
if e.val < math.MaxUint64-1 {
|
|
|
|
ready |= waiter.EventOut
|
|
|
|
}
|
|
|
|
e.mu.Unlock()
|
|
|
|
|
|
|
|
return mask & ready
|
|
|
|
}
|
2018-07-16 19:19:02 +00:00
|
|
|
|
|
|
|
// EventRegister implements waiter.Waitable.EventRegister.
|
|
|
|
func (e *EventOperations) EventRegister(entry *waiter.Entry, mask waiter.EventMask) {
|
|
|
|
e.wq.EventRegister(entry, mask)
|
|
|
|
|
|
|
|
e.mu.Lock()
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
if e.hostfd >= 0 {
|
|
|
|
fdnotifier.UpdateFD(int32(e.hostfd))
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// EventUnregister implements waiter.Waitable.EventUnregister.
|
|
|
|
func (e *EventOperations) EventUnregister(entry *waiter.Entry) {
|
|
|
|
e.wq.EventUnregister(entry)
|
|
|
|
|
|
|
|
e.mu.Lock()
|
|
|
|
defer e.mu.Unlock()
|
|
|
|
if e.hostfd >= 0 {
|
|
|
|
fdnotifier.UpdateFD(int32(e.hostfd))
|
|
|
|
}
|
|
|
|
}
|