diff --git a/pkg/sentry/syscalls/linux/sys_eventfd.go b/pkg/sentry/syscalls/linux/sys_eventfd.go index 8a34c4e99..ed3413ca6 100644 --- a/pkg/sentry/syscalls/linux/sys_eventfd.go +++ b/pkg/sentry/syscalls/linux/sys_eventfd.go @@ -15,6 +15,7 @@ package linux import ( + "gvisor.dev/gvisor/pkg/abi/linux" "gvisor.dev/gvisor/pkg/sentry/arch" "gvisor.dev/gvisor/pkg/sentry/fs" "gvisor.dev/gvisor/pkg/sentry/kernel" @@ -22,32 +23,24 @@ import ( "gvisor.dev/gvisor/pkg/syserror" ) -const ( - // EFD_SEMAPHORE is a flag used in syscall eventfd(2) and eventfd2(2). Please - // see its man page for more information. - EFD_SEMAPHORE = 1 - EFD_NONBLOCK = 0x800 - EFD_CLOEXEC = 0x80000 -) - // Eventfd2 implements linux syscall eventfd2(2). func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { initVal := args[0].Int() flags := uint(args[1].Uint()) - allOps := uint(EFD_SEMAPHORE | EFD_NONBLOCK | EFD_CLOEXEC) + allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC) if flags & ^allOps != 0 { return 0, nil, syserror.EINVAL } - event := eventfd.New(t, uint64(initVal), flags&EFD_SEMAPHORE != 0) + event := eventfd.New(t, uint64(initVal), flags&linux.EFD_SEMAPHORE != 0) event.SetFlags(fs.SettableFileFlags{ - NonBlocking: flags&EFD_NONBLOCK != 0, + NonBlocking: flags&linux.EFD_NONBLOCK != 0, }) defer event.DecRef() fd, err := t.NewFDFrom(0, event, kernel.FDFlags{ - CloseOnExec: flags&EFD_CLOEXEC != 0, + CloseOnExec: flags&linux.EFD_CLOEXEC != 0, }) if err != nil { return 0, nil, err diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index f6fb0f219..ffca627d4 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -6,6 +6,7 @@ go_library( name = "vfs2", srcs = [ "epoll.go", + "eventfd.go", "execve.go", "fd.go", "filesystem.go", @@ -26,7 +27,7 @@ go_library( "stat_amd64.go", "stat_arm64.go", "sync.go", - "sys_timerfd.go", + "timerfd.go", "xattr.go", ], 
marshal = True, diff --git a/pkg/sentry/syscalls/linux/vfs2/eventfd.go b/pkg/sentry/syscalls/linux/vfs2/eventfd.go new file mode 100644 index 000000000..bd2194972 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/eventfd.go @@ -0,0 +1,59 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + "gvisor.dev/gvisor/pkg/syserror" +) + +// Eventfd2 implements linux syscall eventfd2(2). +// +// args[0] is the initial counter value and args[1] holds the EFD_* flags. Any +// flag bit other than EFD_SEMAPHORE, EFD_NONBLOCK or EFD_CLOEXEC is rejected +// with EINVAL. On success the number of the newly installed file descriptor +// is returned. +func Eventfd2(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + initVal := uint64(args[0].Uint()) + flags := uint(args[1].Uint()) + allOps := uint(linux.EFD_SEMAPHORE | linux.EFD_NONBLOCK | linux.EFD_CLOEXEC) + + // Reject any flag bit that is not in the supported set. + if flags & ^allOps != 0 { + return 0, nil, syserror.EINVAL + } + + // The file is always opened O_RDWR; EFD_NONBLOCK maps to O_NONBLOCK. + fileFlags := uint32(linux.O_RDWR) + if flags&linux.EFD_NONBLOCK != 0 { + fileFlags |= linux.O_NONBLOCK + } + semMode := flags&linux.EFD_SEMAPHORE != 0 + eventfd, err := t.Kernel().VFS().NewEventFD(initVal, semMode, fileFlags) + if err != nil { + return 0, nil, err + } + // Drop the reference returned by NewEventFD when this function returns. + // NOTE(review): presumably NewFDFromVFS2 takes its own reference for the + // FD table - confirm. + defer eventfd.DecRef() + + // EFD_CLOEXEC is recorded in the descriptor's FDFlags, not in fileFlags. + fd, err := t.NewFDFromVFS2(0, eventfd, kernel.FDFlags{ + CloseOnExec: flags&linux.EFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + + return uintptr(fd), nil, nil +} + +// Eventfd implements linux syscall eventfd(2). +// +// The flags slot (args[1]) is zeroed before delegating to Eventfd2, so only +// the initial value in args[0] is honored. 
+func Eventfd(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + args[1].Value = 0 + return Eventfd2(t, args) +} diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go index 74920f785..074f58e5d 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -141,14 +141,14 @@ func Override(table map[uintptr]kernel.Syscall) { table[281] = syscalls.Supported("epoll_pwait", EpollPwait) delete(table, 282) // signalfd table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) - delete(table, 284) // eventfd + table[284] = syscalls.Supported("eventfd", Eventfd) delete(table, 285) // fallocate table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) // TODO(gvisor.dev/issue/1485): Port all socket variants to VFS2. table[288] = syscalls.PartiallySupported("accept4", Accept4, "In process of porting socket syscalls to VFS2.", nil) delete(table, 289) // signalfd4 - delete(table, 290) // eventfd2 + table[290] = syscalls.Supported("eventfd2", Eventfd2) table[291] = syscalls.Supported("epoll_create1", EpollCreate1) table[292] = syscalls.Supported("dup3", Dup3) table[293] = syscalls.Supported("pipe2", Pipe2) diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go b/pkg/sentry/syscalls/linux/vfs2/timerfd.go similarity index 100% rename from pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go rename to pkg/sentry/syscalls/linux/vfs2/timerfd.go diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index 9aeb83fb0..c62505fe2 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -25,6 +25,7 @@ go_library( "device.go", "epoll.go", "epoll_interest_list.go", + "eventfd.go", "file_description.go", "file_description_impl_util.go", "filesystem.go", @@ -44,6 +45,7 @@ go_library( "//pkg/abi/linux", "//pkg/context", 
"//pkg/fd", + "//pkg/fdnotifier", "//pkg/fspath", "//pkg/gohacks", "//pkg/log", @@ -68,6 +70,7 @@ go_test( name = "vfs_test", size = "small", srcs = [ + "eventfd_test.go", "file_description_impl_util_test.go", "mount_test.go", ], @@ -79,5 +82,6 @@ go_test( "//pkg/sync", "//pkg/syserror", "//pkg/usermem", + "//pkg/waiter", ], ) diff --git a/pkg/sentry/vfs/eventfd.go b/pkg/sentry/vfs/eventfd.go new file mode 100644 index 000000000..f39dacacf --- /dev/null +++ b/pkg/sentry/vfs/eventfd.go @@ -0,0 +1,282 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "math" + "sync" + "syscall" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/context" + "gvisor.dev/gvisor/pkg/fdnotifier" + "gvisor.dev/gvisor/pkg/log" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// EventFileDescription implements FileDescriptionImpl for file-based event +// notification (eventfd). Eventfds are usually internal to the Sentry but in +// certain situations they may be converted into a host-backed eventfd. +type EventFileDescription struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + DentryMetadataFileDescriptionImpl + + // queue is used to notify interested parties when the event object + // becomes readable or writable. + queue waiter.Queue `state:"zerovalue"` + + // mu protects the fields below. 
+ mu sync.Mutex `state:"nosave"` + + // val is the current value of the event counter. + val uint64 + + // semMode specifies whether the event is in "semaphore" mode. + semMode bool + + // hostfd is the file descriptor of the host eventfd backing this event, + // or -1 while the event is not passed through to the host. + hostfd int +} + +var _ FileDescriptionImpl = (*EventFileDescription)(nil) + +// NewEventFD creates a new event fd. +// +// The returned FileDescription is backed by an anonymous "[eventfd]" dentry, +// starts with its counter set to initVal, operates in semaphore mode if +// semMode is true, and is initialized with the given file flags. It is not +// host-backed initially (hostfd is -1). +func (vfs *VirtualFilesystem) NewEventFD(initVal uint64, semMode bool, flags uint32) (*FileDescription, error) { + vd := vfs.NewAnonVirtualDentry("[eventfd]") + defer vd.DecRef() + efd := &EventFileDescription{ + val: initVal, + semMode: semMode, + hostfd: -1, + } + // NOTE(review): DenyPRead/DenyPWrite presumably make pread/pwrite fail; the + // syscall tests added in this change expect ESPIPE - confirm. + if err := efd.vfsfd.Init(efd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + }); err != nil { + return nil, err + } + return &efd.vfsfd, nil +} + +// HostFD returns the host eventfd associated with this event. +// +// If no host eventfd exists yet, one is created (non-blocking, in semaphore +// mode if semMode is set, seeded with the current counter value), registered +// with fdnotifier against efd.queue, and remembered in efd.hostfd. From then +// on read, Signal and Readiness are served by the host fd. +// +// NOTE(review): eventfd2(2) declares its initval parameter as unsigned int; +// confirm that a counter above math.MaxUint32 is not truncated when passed as +// uintptr(efd.val) below. +func (efd *EventFileDescription) HostFD() (int, error) { + efd.mu.Lock() + defer efd.mu.Unlock() + // Already host-backed: reuse the existing descriptor. + if efd.hostfd >= 0 { + return efd.hostfd, nil + } + + flags := linux.EFD_NONBLOCK + if efd.semMode { + flags |= linux.EFD_SEMAPHORE + } + + fd, _, errno := syscall.Syscall(syscall.SYS_EVENTFD2, uintptr(efd.val), uintptr(flags), 0) + if errno != 0 { + return -1, errno + } + + // If registration fails, close the new host fd so it is not leaked; a + // failure to close is only logged. + if err := fdnotifier.AddFD(int32(fd), &efd.queue); err != nil { + if closeErr := syscall.Close(int(fd)); closeErr != nil { + log.Warningf("close(%d) eventfd failed: %v", fd, closeErr) + } + return -1, err + } + + efd.hostfd = int(fd) + return efd.hostfd, nil +} + +// Release implements FileDescriptionImpl.Release. +// +// If a host eventfd was created by HostFD, it is removed from fdnotifier and +// closed (a close failure is only logged), and hostfd is reset to -1. +func (efd *EventFileDescription) Release() { + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.RemoveFD(int32(efd.hostfd)) + if closeErr := syscall.Close(int(efd.hostfd)); closeErr != nil { + log.Warningf("close(%d) eventfd failed: %v", efd.hostfd, closeErr) + } + efd.hostfd = -1 + } +} + +// Read implements FileDescriptionImpl.Read. +// +// dst must have room for at least 8 bytes, otherwise EINVAL is returned. On +// success exactly 8 bytes are reported as read. 
+func (efd *EventFileDescription) Read(ctx context.Context, dst usermem.IOSequence, _ ReadOptions) (int64, error) { + if dst.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := efd.read(ctx, dst); err != nil { + return 0, err + } + return 8, nil +} + +// Write implements FileDescriptionImpl.Write. +// +// src must supply at least 8 bytes, otherwise EINVAL is returned. On success +// exactly 8 bytes are reported as written. +func (efd *EventFileDescription) Write(ctx context.Context, src usermem.IOSequence, _ WriteOptions) (int64, error) { + if src.NumBytes() < 8 { + return 0, syscall.EINVAL + } + if err := efd.write(ctx, src); err != nil { + return 0, err + } + return 8, nil +} + +// hostReadLocked reads 8 bytes from the host eventfd and copies them out to +// dst. EWOULDBLOCK from the host is translated to syserror.ErrWouldBlock. +// +// Preconditions: Must be called with efd.mu locked. +func (efd *EventFileDescription) hostReadLocked(ctx context.Context, dst usermem.IOSequence) error { + var buf [8]byte + if _, err := syscall.Read(efd.hostfd, buf[:]); err != nil { + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err + } + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +// read consumes the counter and copies the consumed value to dst as 8 bytes. +// +// When the eventfd is host-backed the read is delegated to the host fd. +// Otherwise a zero counter yields syserror.ErrWouldBlock; in semaphore mode the +// counter is decremented and 1 is returned; in normal mode the whole counter +// is returned and reset to zero. +func (efd *EventFileDescription) read(ctx context.Context, dst usermem.IOSequence) error { + efd.mu.Lock() + if efd.hostfd >= 0 { + defer efd.mu.Unlock() + return efd.hostReadLocked(ctx, dst) + } + + // We can't complete the read if the value is currently zero. + if efd.val == 0 { + efd.mu.Unlock() + return syserror.ErrWouldBlock + } + + // Update the value based on the mode the event is operating in. + var val uint64 + if efd.semMode { + val = 1 + // Consistent with Linux, this is done even if writing to memory fails. + efd.val-- + } else { + val = efd.val + efd.val = 0 + } + + // The mutex is released before notifying and before copying out to dst. + efd.mu.Unlock() + + // Notify writers. We do this even if we were already writable because + // it is possible that a writer is waiting to write the maximum value + // to the event. + efd.queue.Notify(waiter.EventOut) + + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := dst.CopyOut(ctx, buf[:]) + return err +} + +// hostWriteLocked writes val to the host eventfd as an 8-byte value. +// EWOULDBLOCK from the host is translated to syserror.ErrWouldBlock. +// +// Preconditions: Must be called with efd.mu locked. 
+func (efd *EventFileDescription) hostWriteLocked(val uint64) error { + var buf [8]byte + usermem.ByteOrder.PutUint64(buf[:], val) + _, err := syscall.Write(efd.hostfd, buf[:]) + if err == syscall.EWOULDBLOCK { + return syserror.ErrWouldBlock + } + return err +} + +func (efd *EventFileDescription) write(ctx context.Context, src usermem.IOSequence) error { + var buf [8]byte + if _, err := src.CopyIn(ctx, buf[:]); err != nil { + return err + } + val := usermem.ByteOrder.Uint64(buf[:]) + + return efd.Signal(val) +} + +// Signal is an internal function to signal the event fd. +// +// It adds val to the counter and notifies EventIn waiters. It is also the +// implementation behind write, which decodes the 8 bytes copied in from the +// caller with usermem.ByteOrder and passes the result here. +// +// A val of math.MaxUint64 is rejected with EINVAL. If the eventfd is +// host-backed, val is written to the host fd instead. Otherwise, if adding val +// would push the counter above math.MaxUint64-1, syserror.ErrWouldBlock is +// returned and the counter is left unchanged. +func (efd *EventFileDescription) Signal(val uint64) error { + if val == math.MaxUint64 { + return syscall.EINVAL + } + + efd.mu.Lock() + + if efd.hostfd >= 0 { + defer efd.mu.Unlock() + return efd.hostWriteLocked(val) + } + + // We only allow writes that won't cause the value to go over the max + // uint64 minus 1. + if val > math.MaxUint64-1-efd.val { + efd.mu.Unlock() + return syserror.ErrWouldBlock + } + + efd.val += val + efd.mu.Unlock() + + // Always trigger a notification. + efd.queue.Notify(waiter.EventIn) + + return nil +} + +// Readiness implements waiter.Waitable.Readiness. +// +// A host-backed eventfd is polled through fdnotifier. Otherwise the eventfd is +// readable (EventIn) while the counter is non-zero and writable (EventOut) +// while the counter is below math.MaxUint64-1; the result is restricted to +// mask. +func (efd *EventFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + efd.mu.Lock() + defer efd.mu.Unlock() + + if efd.hostfd >= 0 { + return fdnotifier.NonBlockingPoll(int32(efd.hostfd), mask) + } + + ready := waiter.EventMask(0) + if efd.val > 0 { + ready |= waiter.EventIn + } + + if efd.val < math.MaxUint64-1 { + ready |= waiter.EventOut + } + + return mask & ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +// +// The entry is added to efd.queue; if the eventfd is host-backed, +// fdnotifier.UpdateFD is then called for the host fd. +// NOTE(review): presumably UpdateFD refreshes the events fdnotifier watches on +// the host fd to match the queue - confirm. +func (efd *EventFileDescription) EventRegister(entry *waiter.Entry, mask waiter.EventMask) { + efd.queue.EventRegister(entry, mask) + + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.UpdateFD(int32(efd.hostfd)) + } +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +// +// The entry is removed from efd.queue; if the eventfd is host-backed, +// fdnotifier.UpdateFD is then called for the host fd. 
+func (efd *EventFileDescription) EventUnregister(entry *waiter.Entry) { + efd.queue.EventUnregister(entry) + + efd.mu.Lock() + defer efd.mu.Unlock() + if efd.hostfd >= 0 { + fdnotifier.UpdateFD(int32(efd.hostfd)) + } +} diff --git a/pkg/sentry/vfs/eventfd_test.go b/pkg/sentry/vfs/eventfd_test.go new file mode 100644 index 000000000..2dff2d10b --- /dev/null +++ b/pkg/sentry/vfs/eventfd_test.go @@ -0,0 +1,96 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "testing" + + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/contexttest" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// TestEventFD checks that a successful write to an eventfd notifies a waiter +// registered for EventIn, both when the counter starts at zero and when it +// starts non-zero. +func TestEventFD(t *testing.T) { + initVals := []uint64{ + 0, + // Using a non-zero initial value verifies that writing to an + // eventfd signals when the eventfd's counter was already + // non-zero. + 343, + } + + for _, initVal := range initVals { + ctx := contexttest.Context(t) + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + + // Make a new eventfd that is writable. + eventfd, err := vfsObj.NewEventFD(initVal, false, linux.O_RDWR) + if err != nil { + t.Fatalf("NewEventFD failed: %v", err) + } + // NOTE(review): this defer is inside the loop, so it runs when the test + // function returns rather than at the end of each iteration. + defer eventfd.DecRef() + + // Register for EventIn, the event a successful write is expected to raise. 
+ w, ch := waiter.NewChannelEntry(nil) + eventfd.EventRegister(&w, waiter.EventIn) + defer eventfd.EventUnregister(&w) + + data := []byte("00000124") + // Create and submit a write request. The 8 bytes of data are consumed as a + // raw 64-bit counter increment, not parsed as a decimal string. + n, err := eventfd.Write(ctx, usermem.BytesIOSequence(data), WriteOptions{}) + if err != nil { + t.Fatal(err) + } + if n != 8 { + t.Errorf("eventfd.write wrote %d bytes, not full int64", n) + } + + // Check if the callback fired due to the write event. The select is + // non-blocking, so the notification must already be pending. + select { + case <-ch: + default: + t.Errorf("Didn't get notified of EventIn after write") + } + } +} + +// TestEventFDStat checks that Stat on a new eventfd succeeds and reports a +// size of zero. +func TestEventFDStat(t *testing.T) { + ctx := contexttest.Context(t) + vfsObj := &VirtualFilesystem{} + if err := vfsObj.Init(); err != nil { + t.Fatalf("VFS init: %v", err) + } + + // Make a new eventfd that is writable. + eventfd, err := vfsObj.NewEventFD(0, false, linux.O_RDWR) + if err != nil { + t.Fatalf("NewEventFD failed: %v", err) + } + defer eventfd.DecRef() + + statx, err := eventfd.Stat(ctx, StatOptions{ + Mask: linux.STATX_BASIC_STATS, + }) + if err != nil { + t.Fatalf("eventfd.Stat failed: %v", err) + } + if statx.Size != 0 { + t.Errorf("eventfd size should be 0") + } +} diff --git a/test/syscalls/linux/eventfd.cc b/test/syscalls/linux/eventfd.cc index 927001eee..548b05a64 100644 --- a/test/syscalls/linux/eventfd.cc +++ b/test/syscalls/linux/eventfd.cc @@ -100,6 +100,22 @@ TEST(EventfdTest, SmallRead) { ASSERT_THAT(read(efd.get(), &l, 4), SyscallFailsWithErrno(EINVAL)); } +TEST(EventfdTest, PreadIllegalSeek) { + FileDescriptor efd = + ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE)); + + uint64_t l = 0; + ASSERT_THAT(pread(efd.get(), &l, 4, 0), SyscallFailsWithErrno(ESPIPE)); +} + +TEST(EventfdTest, PwriteIllegalSeek) { + FileDescriptor efd = + ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE)); + + uint64_t l = 0; + ASSERT_THAT(pwrite(efd.get(), &l, 4, 0), SyscallFailsWithErrno(ESPIPE)); +} + TEST(EventfdTest, BigWrite) { FileDescriptor efd = 
ASSERT_NO_ERRNO_AND_VALUE(NewEventFD(0, EFD_NONBLOCK | EFD_SEMAPHORE));