gvisor/pkg/sentry/syscalls/linux/vfs2/epoll.go

229 lines
6.5 KiB
Go

// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package vfs2
import (
"math"
"time"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/sentry/arch"
"gvisor.dev/gvisor/pkg/sentry/kernel"
ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
"gvisor.dev/gvisor/pkg/sentry/vfs"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/usermem"
"gvisor.dev/gvisor/pkg/waiter"
)
// sizeofEpollEvent is the value reported by linux.EpollEvent's SizeBytes
// method. The method is invoked on a nil *linux.EpollEvent, so no event has to
// be allocated just to learn the size. It is used in this file to bound
// maxEvents and to convert copied byte counts into whole event counts.
// NOTE(review): presumably this is the marshalled size in bytes of one
// struct epoll_event - confirm against linux.EpollEvent.
var sizeofEpollEvent = (*linux.EpollEvent)(nil).SizeBytes()
// EpollCreate1 implements Linux syscall epoll_create1(2).
//
// args[0] carries the flags. EPOLL_CLOEXEC is the only accepted flag; any
// other bit results in EINVAL. On success the number of the newly installed
// file descriptor is returned.
func EpollCreate1(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	flags := args[0].Int()
	cloexec := flags&linux.EPOLL_CLOEXEC != 0
	if unknown := flags &^ linux.EPOLL_CLOEXEC; unknown != 0 {
		return 0, nil, syserror.EINVAL
	}

	epollFile, err := t.Kernel().VFS().NewEpollInstanceFD()
	if err != nil {
		return 0, nil, err
	}
	// Release the reference obtained above on every return path, whether or
	// not installing the descriptor succeeds.
	defer epollFile.DecRef()

	newFD, err := t.NewFDFromVFS2(0, epollFile, kernel.FDFlags{CloseOnExec: cloexec})
	if err != nil {
		return 0, nil, err
	}
	return uintptr(newFD), nil, nil
}
// EpollCreate implements Linux syscall epoll_create(2).
//
// args[0] is the legacy size hint, which is only validated. The descriptor is
// installed with zero-valued FD flags. On success the number of the newly
// installed file descriptor is returned.
func EpollCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	// "Since Linux 2.6.8, the size argument is ignored, but must be greater
	// than zero" - epoll_create(2)
	if sizeHint := args[0].Int(); sizeHint < 1 {
		return 0, nil, syserror.EINVAL
	}

	epollFile, err := t.Kernel().VFS().NewEpollInstanceFD()
	if err != nil {
		return 0, nil, err
	}
	// Release the reference obtained above on every return path.
	defer epollFile.DecRef()

	var noFlags kernel.FDFlags
	newFD, err := t.NewFDFromVFS2(0, epollFile, noFlags)
	if err != nil {
		return 0, nil, err
	}
	return uintptr(newFD), nil, nil
}
// EpollCtl implements Linux syscall epoll_ctl(2).
//
// Arguments, in order: the epoll file descriptor, the operation
// (EPOLL_CTL_ADD, EPOLL_CTL_DEL or EPOLL_CTL_MOD), the target file descriptor,
// and the user address of an epoll event. The event is only copied in for ADD
// and MOD.
//
// Errors produced here: EBADF if either descriptor does not resolve to a
// file; EINVAL if the first descriptor is not an epoll instance, if both
// descriptors resolve to the same file, or if the operation is unknown.
// Errors from copying in the event or from the epoll instance itself are
// returned unchanged.
func EpollCtl(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		epfd      = args[0].Int()
		op        = args[1].Int()
		fd        = args[2].Int()
		eventAddr = args[3].Pointer()
	)

	// Resolve the epoll descriptor and make sure it really is an epoll
	// instance.
	epollFile := t.GetFileVFS2(epfd)
	if epollFile == nil {
		return 0, nil, syserror.EBADF
	}
	defer epollFile.DecRef()
	instance, isEpoll := epollFile.Impl().(*vfs.EpollInstance)
	if !isEpoll {
		return 0, nil, syserror.EINVAL
	}

	// Resolve the file whose interest is being changed.
	target := t.GetFileVFS2(fd)
	if target == nil {
		return 0, nil, syserror.EBADF
	}
	defer target.DecRef()

	// An epoll instance may not be asked to watch itself.
	if target == epollFile {
		return 0, nil, syserror.EINVAL
	}

	switch op {
	case linux.EPOLL_CTL_ADD, linux.EPOLL_CTL_MOD:
		// Both operations take an event from user memory.
		var event linux.EpollEvent
		if _, err := event.CopyIn(t, eventAddr); err != nil {
			return 0, nil, err
		}
		if op == linux.EPOLL_CTL_ADD {
			return 0, nil, instance.AddInterest(target, fd, event)
		}
		return 0, nil, instance.ModifyInterest(target, fd, event)
	case linux.EPOLL_CTL_DEL:
		// The event address is not read for deletion.
		return 0, nil, instance.DeleteInterest(target, fd)
	default:
		return 0, nil, syserror.EINVAL
	}
}
// EpollWait implements Linux syscall epoll_wait(2).
//
// Arguments, in order: the epoll file descriptor, the user address of the
// output event array, the capacity of that array in events, and a timeout in
// milliseconds.
//
// The return value is the number of events copied out to the application.
// Errors produced here: EINVAL if maxEvents is not in (0, _EP_MAX_EVENTS] or
// if the descriptor is not an epoll instance; EBADF if the descriptor does not
// resolve to a file. A copy-out failure is only reported if no event at all
// was copied; otherwise the count copied so far is returned. Expiry of the
// timeout (ETIMEDOUT from BlockWithDeadline) is reported as success with zero
// events.
//
// A timeout of 0 never blocks. A positive timeout sets a deadline once, the
// first time the task is about to block. A negative timeout never sets a
// deadline, so BlockWithDeadline is called with haveDeadline == false
// (presumably blocking until an event or interrupt arrives - confirm against
// kernel.Task.BlockWithDeadline).
func EpollWait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
epfd := args[0].Int()
eventsAddr := args[1].Pointer()
maxEvents := int(args[2].Int())
timeout := int(args[3].Int())
// Bound maxEvents so that maxEvents * sizeofEpollEvent cannot exceed
// math.MaxInt32.
var _EP_MAX_EVENTS = math.MaxInt32 / sizeofEpollEvent // Linux: fs/eventpoll.c:EP_MAX_EVENTS
if maxEvents <= 0 || maxEvents > _EP_MAX_EVENTS {
return 0, nil, syserror.EINVAL
}
epfile := t.GetFileVFS2(epfd)
if epfile == nil {
return 0, nil, syserror.EBADF
}
// Drop the reference taken by GetFileVFS2 on every return path.
defer epfile.DecRef()
ep, ok := epfile.Impl().(*vfs.EpollInstance)
if !ok {
return 0, nil, syserror.EINVAL
}
// Use a fixed-size buffer in a loop, instead of make([]linux.EpollEvent,
// maxEvents), so that the buffer can be allocated on the stack.
//
// total counts the events already copied out to the application. ch stays
// nil until the waiter has been registered with the epoll file. haveDeadline
// and deadline are filled in at most once, right before the first block.
var (
events [16]linux.EpollEvent
total int
ch chan struct{}
haveDeadline bool
deadline ktime.Time
)
for {
// Never read more events than the application's buffer can still hold.
batchEvents := len(events)
if batchEvents > maxEvents {
batchEvents = maxEvents
}
n := ep.ReadEvents(events[:batchEvents])
// From here on maxEvents is the remaining capacity of the application's
// buffer.
maxEvents -= n
if n != 0 {
// Copy what we read out.
copiedBytes, err := linux.CopyEpollEventSliceOut(t, eventsAddr, events[:n])
// Only whole events count; a partially copied event is not reported and
// eventsAddr is advanced past complete events only.
copiedEvents := copiedBytes / sizeofEpollEvent // rounded down
eventsAddr += usermem.Addr(copiedEvents * sizeofEpollEvent)
total += copiedEvents
if err != nil {
// Prefer reporting the events that were delivered over the copy-out
// error; the error is surfaced only when nothing was delivered.
if total != 0 {
return uintptr(total), nil, nil
}
return 0, nil, err
}
// If we've filled the application's event buffer, we're done.
if maxEvents == 0 {
return uintptr(total), nil, nil
}
// Loop if we read a full batch, under the expectation that there
// may be more events to read.
if n == batchEvents {
continue
}
}
// We get here if n != batchEvents. If we read any number of events
// (just now, or in a previous iteration of this loop), or if timeout
// is 0 (such that epoll_wait should be non-blocking), return the
// events we've read so far to the application.
if total != 0 || timeout == 0 {
return uintptr(total), nil, nil
}
// In the first iteration of this loop, register with the epoll
// instance for readability events, but then immediately continue the
// loop since we need to retry ReadEvents() before blocking. In all
// subsequent iterations, block until events are available, the timeout
// expires, or an interrupt arrives.
if ch == nil {
var w waiter.Entry
w, ch = waiter.NewChannelEntry(nil)
epfile.EventRegister(&w, waiter.EventIn)
// This branch runs at most once because ch is non-nil afterwards, so
// exactly one unregister is deferred until the function returns.
defer epfile.EventUnregister(&w)
} else {
// Set up the timer if a timeout was specified.
if timeout > 0 && !haveDeadline {
// The deadline is computed once, so later iterations of the loop do not
// extend the overall wait.
timeoutDur := time.Duration(timeout) * time.Millisecond
deadline = t.Kernel().MonotonicClock().Now().Add(timeoutDur)
haveDeadline = true
}
if err := t.BlockWithDeadline(ch, haveDeadline, deadline); err != nil {
// Running out of time is not an error for epoll_wait: it is reported as
// zero events. Any other error is passed through unchanged.
if err == syserror.ETIMEDOUT {
err = nil
}
// total must be 0 since otherwise we would have returned
// above.
return 0, nil, err
}
}
}
}
// EpollPwait implements Linux syscall epoll_pwait(2).
//
// args[4] is the user address of the signal mask and args[5] its size; both
// are handed to setTempSignalSet. If that fails, its error is returned and no
// wait is performed. Otherwise the call is forwarded to EpollWait with the
// same argument list, and EpollWait's result is returned as is.
func EpollPwait(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		sigmaskAddr = args[4].Pointer()
		sigmaskSize = uint(args[5].Uint())
	)
	if err := setTempSignalSet(t, sigmaskAddr, sigmaskSize); err != nil {
		return 0, nil, err
	}
	return EpollWait(t, args)
}