// File: gvisor/pkg/sentry/syscalls/linux/sys_poll.go
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package linux
import (
"time"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/sentry/arch"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel"
"gvisor.googlesource.com/gvisor/pkg/sentry/kernel/kdefs"
ktime "gvisor.googlesource.com/gvisor/pkg/sentry/kernel/time"
"gvisor.googlesource.com/gvisor/pkg/sentry/limits"
"gvisor.googlesource.com/gvisor/pkg/sentry/syscalls"
"gvisor.googlesource.com/gvisor/pkg/sentry/usermem"
"gvisor.googlesource.com/gvisor/pkg/syserror"
"gvisor.googlesource.com/gvisor/pkg/waiter"
)
// fileCap is the maximum allowable files for poll & select.
const fileCap = 1024 * 1024

// Masks for "readable", "writable", and "exceptional" events as defined by
// select(2).
const (
	// selectReadEvents is analogous to the Linux kernel's
	// fs/select.c:POLLIN_SET.
	selectReadEvents = waiter.EventIn | waiter.EventHUp | waiter.EventErr

	// selectWriteEvents is analogous to the Linux kernel's
	// fs/select.c:POLLOUT_SET.
	selectWriteEvents = waiter.EventOut | waiter.EventErr

	// selectExceptEvents is analogous to the Linux kernel's
	// fs/select.c:POLLEX_SET.
	selectExceptEvents = waiter.EventPri
)
// doPoll performs the shared work of poll(2) and ppoll(2): copy in the
// pollfd array, poll the referenced files, and copy the results back out.
//
// It returns the timeout remaining, the number of ready entries, and any
// error. The entries are written back whenever polling itself succeeded,
// even if nothing became ready.
func doPoll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (time.Duration, uintptr, error) {
	if uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) {
		return timeout, 0, syserror.EINVAL
	}

	entries := make([]syscalls.PollFD, nfds)
	if nfds > 0 {
		if _, err := t.CopyIn(pfdAddr, &entries); err != nil {
			return timeout, 0, err
		}
	}

	// Compatibility warning: Linux adds POLLHUP and POLLERR just before
	// polling, in fs/select.c:do_pollfd(). Since the entries are copied out
	// after polling, changing event masks here is an application-visible
	// difference. (Linux also doesn't copy out event masks at all, only
	// revents.)
	for i := range entries {
		entries[i].Events |= waiter.EventHUp | waiter.EventErr
	}

	remaining, ready, err := syscalls.Poll(t, entries, timeout)
	err = syserror.ConvertIntr(err, syserror.EINTR)

	// The poll entries are copied out regardless of whether any are set or
	// not. This aligns with the Linux behavior.
	if err == nil && nfds > 0 {
		if _, err := t.CopyOut(pfdAddr, entries); err != nil {
			return remaining, 0, err
		}
	}
	return remaining, ready, err
}
// doSelect performs the shared work of select(2) and pselect(2): it copies
// in the read/write/except FD bitsets (each nfds bits long), converts the
// set bits into a PollFD array, polls, and then rewrites the bitsets in
// place so that only bits whose events actually fired remain set.
//
// It returns the total number of bits set across the three output sets,
// which is what select(2) reports. A negative timeout blocks indefinitely.
func doSelect(t *kernel.Task, nfds int, readFDs, writeFDs, exceptFDs usermem.Addr, timeout time.Duration) (uintptr, error) {
	if nfds < 0 || uint64(nfds) > t.ThreadGroup().Limits().GetCapped(limits.NumberOfFiles, fileCap) {
		return 0, syserror.EINVAL
	}

	// Capture all the provided input vectors.
	//
	// N.B. This only works on little-endian architectures.
	byteCount := (nfds + 7) / 8

	// bitsInLastPartialByte is the number of significant bits in the final
	// byte of each bitset; 0 means the final byte is fully used.
	bitsInLastPartialByte := uint(nfds % 8)
	r := make([]byte, byteCount)
	w := make([]byte, byteCount)
	e := make([]byte, byteCount)
	if readFDs != 0 {
		if _, err := t.CopyIn(readFDs, &r); err != nil {
			return 0, err
		}
		// Mask out bits above nfds.
		if bitsInLastPartialByte != 0 {
			r[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
		}
	}
	if writeFDs != 0 {
		if _, err := t.CopyIn(writeFDs, &w); err != nil {
			return 0, err
		}
		// Mask out bits above nfds.
		if bitsInLastPartialByte != 0 {
			w[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
		}
	}
	if exceptFDs != 0 {
		if _, err := t.CopyIn(exceptFDs, &e); err != nil {
			return 0, err
		}
		// Mask out bits above nfds.
		if bitsInLastPartialByte != 0 {
			e[byteCount-1] &^= byte(0xff) << bitsInLastPartialByte
		}
	}

	// Count how many FDs are actually being requested so that we can build
	// a PollFD array.
	fdCount := 0
	for i := 0; i < byteCount; i++ {
		v := r[i] | w[i] | e[i]
		for v != 0 {
			// Clear the lowest set bit each pass (Kernighan popcount).
			v &= (v - 1)
			fdCount++
		}
	}

	// Build the PollFD array.
	pfd := make([]syscalls.PollFD, 0, fdCount)
	fd := kdefs.FD(0)
	for i := 0; i < byteCount; i++ {
		rV, wV, eV := r[i], w[i], e[i]
		v := rV | wV | eV
		m := byte(1)
		for j := 0; j < 8; j++ {
			if (v & m) != 0 {
				// Make sure the fd is valid and decrement the reference
				// immediately to ensure we don't leak. Note, another thread
				// might be about to close fd. This is racy, but that's
				// OK. Linux is racy in the same way.
				file := t.FDMap().GetFile(fd)
				if file == nil {
					return 0, syserror.EBADF
				}
				file.DecRef()

				// Translate the select sets this fd appears in into a poll
				// event mask (cf. POLLIN_SET etc. in Linux fs/select.c).
				mask := waiter.EventMask(0)
				if (rV & m) != 0 {
					mask |= selectReadEvents
				}
				if (wV & m) != 0 {
					mask |= selectWriteEvents
				}
				if (eV & m) != 0 {
					mask |= selectExceptEvents
				}
				pfd = append(pfd, syscalls.PollFD{
					FD:     fd,
					Events: mask,
				})
			}
			fd++
			m <<= 1
		}
	}

	// Do the syscall, then count the number of bits set.
	_, _, err := syscalls.Poll(t, pfd, timeout)
	if err != nil {
		return 0, syserror.ConvertIntr(err, syserror.EINTR)
	}

	// r, w, and e are currently event mask bitsets; unset bits corresponding
	// to events that *didn't* occur.
	bitSetCount := uintptr(0)
	for idx := range pfd {
		events := pfd[idx].REvents
		// Locate this fd's byte index and bit mask within the bitsets.
		i, j := pfd[idx].FD/8, uint(pfd[idx].FD%8)
		m := byte(1) << j
		if r[i]&m != 0 {
			if (events & selectReadEvents) != 0 {
				bitSetCount++
			} else {
				r[i] &^= m
			}
		}
		if w[i]&m != 0 {
			if (events & selectWriteEvents) != 0 {
				bitSetCount++
			} else {
				w[i] &^= m
			}
		}
		if e[i]&m != 0 {
			if (events & selectExceptEvents) != 0 {
				bitSetCount++
			} else {
				e[i] &^= m
			}
		}
	}

	// Copy updated vectors back.
	if readFDs != 0 {
		if _, err := t.CopyOut(readFDs, r); err != nil {
			return 0, err
		}
	}
	if writeFDs != 0 {
		if _, err := t.CopyOut(writeFDs, w); err != nil {
			return 0, err
		}
	}
	if exceptFDs != 0 {
		if _, err := t.CopyOut(exceptFDs, e); err != nil {
			return 0, err
		}
	}

	return bitSetCount, nil
}
// timeoutRemaining returns how much of timeout is left, measured against
// the task's monotonic clock; it never returns a negative duration.
//
// startNs must be from CLOCK_MONOTONIC.
func timeoutRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration) time.Duration {
	elapsed := t.Kernel().MonotonicClock().Now().Sub(startNs)
	if left := timeout - elapsed; left > 0 {
		return left
	}
	return 0
}
// copyOutTimespecRemaining copies the time remaining in timeout to
// timespecAddr.
//
// startNs must be from CLOCK_MONOTONIC.
func copyOutTimespecRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timespecAddr usermem.Addr) error {
	// Nothing to write back for infinite or zero timeouts.
	if timeout <= 0 {
		return nil
	}
	ts := linux.NsecToTimespec(timeoutRemaining(t, startNs, timeout).Nanoseconds())
	return copyTimespecOut(t, timespecAddr, &ts)
}
// copyOutTimevalRemaining copies the time remaining in timeout to
// timevalAddr.
//
// startNs must be from CLOCK_MONOTONIC.
func copyOutTimevalRemaining(t *kernel.Task, startNs ktime.Time, timeout time.Duration, timevalAddr usermem.Addr) error {
	// Nothing to write back for infinite or zero timeouts.
	if timeout <= 0 {
		return nil
	}
	tv := linux.NsecToTimeval(timeoutRemaining(t, startNs, timeout).Nanoseconds())
	return copyTimevalOut(t, timevalAddr, &tv)
}
// pollRestartBlock encapsulates the state required to restart poll(2) via
// restart_syscall(2).
type pollRestartBlock struct {
	// pfdAddr is the user address of the pollfd array.
	pfdAddr usermem.Addr
	// nfds is the number of entries in that array.
	nfds uint
	// timeout is the timeout that remained when the original poll was
	// interrupted.
	timeout time.Duration
}
// Restart implements kernel.SyscallRestartBlock.Restart.
//
// It re-issues poll with the saved arguments and the remaining timeout.
func (p *pollRestartBlock) Restart(t *kernel.Task) (uintptr, error) {
	return poll(t, p.pfdAddr, p.nfds, p.timeout)
}
// poll runs doPoll and, when interrupted by a signal, arranges for the call
// to be restarted via restart_syscall(2) with the remaining timeout.
func poll(t *kernel.Task, pfdAddr usermem.Addr, nfds uint, timeout time.Duration) (uintptr, error) {
	remaining, n, err := doPoll(t, pfdAddr, nfds, timeout)
	if err != syserror.EINTR {
		return n, err
	}
	// On an interrupt poll(2) is restarted with the remaining timeout.
	t.SetSyscallRestartBlock(&pollRestartBlock{
		pfdAddr: pfdAddr,
		nfds:    nfds,
		timeout: remaining,
	})
	return 0, kernel.ERESTART_RESTARTBLOCK
}
// Poll implements linux syscall poll(2).
func Poll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	var (
		pfdAddr = args[0].Pointer()
		nfds    = uint(args[1].Uint()) // poll(2) uses unsigned long.
		timeout = time.Duration(args[2].Int()) * time.Millisecond
	)
	n, err := poll(t, pfdAddr, nfds, timeout)
	return n, nil, err
}
// Ppoll implements linux syscall ppoll(2).
func Ppoll(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	pfdAddr := args[0].Pointer()
	nfds := uint(args[1].Uint()) // poll(2) uses unsigned long.
	timespecAddr := args[2].Pointer()
	maskAddr := args[3].Pointer()
	maskSize := uint(args[4].Uint())

	timeout, err := copyTimespecInToDuration(t, timespecAddr)
	if err != nil {
		return 0, nil, err
	}

	// Record the start time only for finite timeouts;
	// copyOutTimespecRemaining is a no-op otherwise, so startNs can stay
	// zero.
	var startNs ktime.Time
	if timeout > 0 {
		startNs = t.Kernel().MonotonicClock().Now()
	}

	// Temporarily install the caller-provided signal mask, saving the old
	// one via SetSavedSignalMask so the sentry can restore it after the
	// syscall.
	if maskAddr != 0 {
		mask, err := copyInSigSet(t, maskAddr, maskSize)
		if err != nil {
			return 0, nil, err
		}

		oldmask := t.SignalMask()
		t.SetSignalMask(mask)
		t.SetSavedSignalMask(oldmask)
	}

	_, n, err := doPoll(t, pfdAddr, nfds, timeout)
	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
	// doPoll returns EINTR if interrupted, but ppoll is normally restartable
	// if interrupted by something other than a signal handled by the
	// application (i.e. returns ERESTARTNOHAND). However, if
	// copyOutTimespecRemaining failed, then the restarted ppoll would use the
	// wrong timeout, so the error should be left as EINTR.
	//
	// Note that this means that if err is nil but copyErr is not, copyErr is
	// ignored. This is consistent with Linux.
	if err == syserror.EINTR && copyErr == nil {
		err = kernel.ERESTARTNOHAND
	}
	return n, nil, err
}
// Select implements linux syscall select(2).
func Select(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	nfds := int(args[0].Int()) // select(2) uses an int.
	readFDs := args[1].Pointer()
	writeFDs := args[2].Pointer()
	exceptFDs := args[3].Pointer()
	timevalAddr := args[4].Pointer()

	// A negative duration means "no timeout", i.e. block indefinitely.
	timeout := time.Duration(-1)
	if timevalAddr != 0 {
		tv, err := copyTimevalIn(t, timevalAddr)
		if err != nil {
			return 0, nil, err
		}
		if tv.Sec < 0 || tv.Usec < 0 {
			return 0, nil, syserror.EINVAL
		}
		timeout = time.Duration(tv.ToNsecCapped())
	}
	startNs := t.Kernel().MonotonicClock().Now()

	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
	copyErr := copyOutTimevalRemaining(t, startNs, timeout, timevalAddr)
	// See comment in Ppoll.
	if err == syserror.EINTR && copyErr == nil {
		err = kernel.ERESTARTNOHAND
	}
	return n, nil, err
}
// Pselect implements linux syscall pselect(2).
func Pselect(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) {
	nfds := int(args[0].Int()) // select(2) uses an int.
	readFDs := args[1].Pointer()
	writeFDs := args[2].Pointer()
	exceptFDs := args[3].Pointer()
	timespecAddr := args[4].Pointer()
	// Unlike ppoll(2), the final argument points to a {sigset address,
	// sigset size} pair rather than to the sigset itself.
	maskWithSizeAddr := args[5].Pointer()

	timeout, err := copyTimespecInToDuration(t, timespecAddr)
	if err != nil {
		return 0, nil, err
	}

	// Record the start time only for finite timeouts;
	// copyOutTimespecRemaining is a no-op otherwise, so startNs can stay
	// zero.
	var startNs ktime.Time
	if timeout > 0 {
		startNs = t.Kernel().MonotonicClock().Now()
	}

	// Temporarily install the caller-provided signal mask, saving the old
	// one via SetSavedSignalMask so the sentry can restore it after the
	// syscall.
	if maskWithSizeAddr != 0 {
		maskAddr, size, err := copyInSigSetWithSize(t, maskWithSizeAddr)
		if err != nil {
			return 0, nil, err
		}

		// A null sigset pointer inside the pair means "leave the mask
		// unchanged".
		if maskAddr != 0 {
			mask, err := copyInSigSet(t, maskAddr, size)
			if err != nil {
				return 0, nil, err
			}

			oldmask := t.SignalMask()
			t.SetSignalMask(mask)
			t.SetSavedSignalMask(oldmask)
		}
	}

	n, err := doSelect(t, nfds, readFDs, writeFDs, exceptFDs, timeout)
	copyErr := copyOutTimespecRemaining(t, startNs, timeout, timespecAddr)
	// See comment in Ppoll.
	if err == syserror.EINTR && copyErr == nil {
		err = kernel.ERESTARTNOHAND
	}
	return n, nil, err
}