From f332a864e8cc7799332838deffab37244ff8ffc7 Mon Sep 17 00:00:00 2001 From: Nicolas Lacasse Date: Mon, 6 Apr 2020 10:51:54 -0700 Subject: [PATCH] Port timerfd to VFS2. PiperOrigin-RevId: 305067208 --- pkg/sentry/kernel/kernel.go | 28 ++-- pkg/sentry/syscalls/linux/vfs2/BUILD | 1 + .../linux/vfs2/linux64_override_amd64.go | 6 +- pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go | 123 +++++++++++++++ pkg/sentry/vfs/BUILD | 2 + pkg/sentry/vfs/file_description.go | 7 + pkg/sentry/vfs/timerfd.go | 142 ++++++++++++++++++ 7 files changed, 295 insertions(+), 14 deletions(-) create mode 100644 pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go create mode 100644 pkg/sentry/vfs/timerfd.go diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index ba8935a82..de8a95854 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -1044,14 +1044,17 @@ func (k *Kernel) pauseTimeLocked() { // This means we'll iterate FDTables shared by multiple tasks repeatedly, // but ktime.Timer.Pause is idempotent so this is harmless. if t.fdTable != nil { - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - if !VFS2Enabled { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + if VFS2Enabled { + if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok { + tfd.PauseTimer() + } + } else { if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.PauseTimer() } - }) - } + } + }) } } k.timekeeper.PauseUpdates() @@ -1076,15 +1079,18 @@ func (k *Kernel) resumeTimeLocked() { it.ResumeTimer() } } - // TODO(gvisor.dev/issue/1663): Add save support for VFS2. - if !VFS2Enabled { - if t.fdTable != nil { - t.fdTable.forEach(func(_ int32, file *fs.File, _ *vfs.FileDescription, _ FDFlags) { + if t.fdTable != nil { + t.fdTable.forEach(func(_ int32, file *fs.File, fd *vfs.FileDescription, _ FDFlags) { + if VFS2Enabled { + if tfd, ok := fd.Impl().(*vfs.TimerFileDescription); ok { + tfd.ResumeTimer() + } + } else { if tfd, ok := file.FileOperations.(*timerfd.TimerOperations); ok { tfd.ResumeTimer() } - }) - } + } + }) } } } diff --git a/pkg/sentry/syscalls/linux/vfs2/BUILD b/pkg/sentry/syscalls/linux/vfs2/BUILD index 2eb210014..0004e60d9 100644 --- a/pkg/sentry/syscalls/linux/vfs2/BUILD +++ b/pkg/sentry/syscalls/linux/vfs2/BUILD @@ -25,6 +25,7 @@ go_library( "stat_amd64.go", "stat_arm64.go", "sync.go", + "sys_timerfd.go", "xattr.go", ], marshal = True, diff --git a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go index 7d220bc20..63febc2f7 100644 --- a/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go +++ b/pkg/sentry/syscalls/linux/vfs2/linux64_override_amd64.go @@ -139,11 +139,11 @@ func Override(table map[uintptr]kernel.Syscall) { table[280] = syscalls.Supported("utimensat", Utimensat) table[281] = syscalls.Supported("epoll_pwait", EpollPwait) delete(table, 282) // signalfd - delete(table, 283) // timerfd_create + table[283] = syscalls.Supported("timerfd_create", TimerfdCreate) delete(table, 284) // eventfd delete(table, 285) // fallocate - delete(table, 286) // timerfd_settime - delete(table, 287) // timerfd_gettime + table[286] = syscalls.Supported("timerfd_settime", TimerfdSettime) + table[287] = syscalls.Supported("timerfd_gettime", TimerfdGettime) delete(table, 288) // accept4 delete(table, 289) // signalfd4 delete(table, 290) // eventfd2 diff --git a/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go b/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go new file mode 100644 index 000000000..7938a5249 --- /dev/null +++ b/pkg/sentry/syscalls/linux/vfs2/sys_timerfd.go @@ -0,0 +1,123 @@ +// Copyright 2018 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs2 + +import ( + "gvisor.dev/gvisor/pkg/abi/linux" + "gvisor.dev/gvisor/pkg/sentry/arch" + "gvisor.dev/gvisor/pkg/sentry/kernel" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/sentry/vfs" + "gvisor.dev/gvisor/pkg/syserror" +) + +// TimerfdCreate implements Linux syscall timerfd_create(2). +func TimerfdCreate(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + clockID := args[0].Int() + flags := args[1].Int() + + if flags&^(linux.TFD_CLOEXEC|linux.TFD_NONBLOCK) != 0 { + return 0, nil, syserror.EINVAL + } + + var fileFlags uint32 + if flags&linux.TFD_NONBLOCK != 0 { + fileFlags = linux.O_NONBLOCK + } + + var clock ktime.Clock + switch clockID { + case linux.CLOCK_REALTIME: + clock = t.Kernel().RealtimeClock() + case linux.CLOCK_MONOTONIC, linux.CLOCK_BOOTTIME: + clock = t.Kernel().MonotonicClock() + default: + return 0, nil, syserror.EINVAL + } + file, err := t.Kernel().VFS().NewTimerFD(clock, fileFlags) + if err != nil { + return 0, nil, err + } + defer file.DecRef() + fd, err := t.NewFDFromVFS2(0, file, kernel.FDFlags{ + CloseOnExec: flags&linux.TFD_CLOEXEC != 0, + }) + if err != nil { + return 0, nil, err + } + return uintptr(fd), nil, nil +} + +// TimerfdSettime implements Linux syscall timerfd_settime(2). +func TimerfdSettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + flags := args[1].Int() + newValAddr := args[2].Pointer() + oldValAddr := args[3].Pointer() + + if flags&^(linux.TFD_TIMER_ABSTIME) != 0 { + return 0, nil, syserror.EINVAL + } + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + tfd, ok := file.Impl().(*vfs.TimerFileDescription) + if !ok { + return 0, nil, syserror.EINVAL + } + + var newVal linux.Itimerspec + if _, err := t.CopyIn(newValAddr, &newVal); err != nil { + return 0, nil, err + } + newS, err := ktime.SettingFromItimerspec(newVal, flags&linux.TFD_TIMER_ABSTIME != 0, tfd.Clock()) + if err != nil { + return 0, nil, err + } + tm, oldS := tfd.SetTime(newS) + if oldValAddr != 0 { + oldVal := ktime.ItimerspecFromSetting(tm, oldS) + if _, err := t.CopyOut(oldValAddr, &oldVal); err != nil { + return 0, nil, err + } + } + return 0, nil, nil +} + +// TimerfdGettime implements Linux syscall timerfd_gettime(2). +func TimerfdGettime(t *kernel.Task, args arch.SyscallArguments) (uintptr, *kernel.SyscallControl, error) { + fd := args[0].Int() + curValAddr := args[1].Pointer() + + file := t.GetFileVFS2(fd) + if file == nil { + return 0, nil, syserror.EBADF + } + defer file.DecRef() + + tfd, ok := file.Impl().(*vfs.TimerFileDescription) + if !ok { + return 0, nil, syserror.EINVAL + } + + tm, s := tfd.GetTime() + curVal := ktime.ItimerspecFromSetting(tm, s) + _, err := t.CopyOut(curValAddr, &curVal) + return 0, nil, err +} diff --git a/pkg/sentry/vfs/BUILD b/pkg/sentry/vfs/BUILD index bf4d27c7d..9aeb83fb0 100644 --- a/pkg/sentry/vfs/BUILD +++ b/pkg/sentry/vfs/BUILD @@ -36,6 +36,7 @@ go_library( "pathname.go", "permissions.go", "resolving_path.go", + "timerfd.go", "vfs.go", ], visibility = ["//pkg/sentry:internal"], @@ -51,6 +52,7 @@ go_library( "//pkg/sentry/fs", "//pkg/sentry/fs/lock", "//pkg/sentry/kernel/auth", + "//pkg/sentry/kernel/time", "//pkg/sentry/limits", "//pkg/sentry/memmap", "//pkg/sentry/socket/unix/transport", diff --git a/pkg/sentry/vfs/file_description.go b/pkg/sentry/vfs/file_description.go index 28e93a441..20c545fca 100644 --- a/pkg/sentry/vfs/file_description.go +++ b/pkg/sentry/vfs/file_description.go @@ -91,6 +91,10 @@ type FileDescriptionOptions struct { // ESPIPE. DenyPWrite bool + // if InvalidWrite is true, calls to FileDescription.Write() return + // EINVAL. + InvalidWrite bool + // If UseDentryMetadata is true, calls to FileDescription methods that // interact with file and filesystem metadata (Stat, SetStat, StatFS, // Listxattr, Getxattr, Setxattr, Removexattr) are implemented by calling @@ -562,6 +566,9 @@ func (fd *FileDescription) PWrite(ctx context.Context, src usermem.IOSequence, o // Write is similar to PWrite, but does not specify an offset. func (fd *FileDescription) Write(ctx context.Context, src usermem.IOSequence, opts WriteOptions) (int64, error) { + if fd.opts.InvalidWrite { + return 0, syserror.EINVAL + } if !fd.writable { return 0, syserror.EBADF } diff --git a/pkg/sentry/vfs/timerfd.go b/pkg/sentry/vfs/timerfd.go new file mode 100644 index 000000000..42b880656 --- /dev/null +++ b/pkg/sentry/vfs/timerfd.go @@ -0,0 +1,142 @@ +// Copyright 2020 The gVisor Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package vfs + +import ( + "sync/atomic" + + "gvisor.dev/gvisor/pkg/context" + ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time" + "gvisor.dev/gvisor/pkg/syserror" + "gvisor.dev/gvisor/pkg/usermem" + "gvisor.dev/gvisor/pkg/waiter" +) + +// TimerFileDescription implements FileDescriptionImpl for timer fds. It also +// implements ktime.TimerListener. +type TimerFileDescription struct { + vfsfd FileDescription + FileDescriptionDefaultImpl + DentryMetadataFileDescriptionImpl + + events waiter.Queue + timer *ktime.Timer + + // val is the number of timer expirations since the last successful + // call to PRead, or SetTime. val must be accessed using atomic memory + // operations. + val uint64 +} + +var _ FileDescriptionImpl = (*TimerFileDescription)(nil) +var _ ktime.TimerListener = (*TimerFileDescription)(nil) + +// NewTimerFD returns a new timer fd. +func (vfs *VirtualFilesystem) NewTimerFD(clock ktime.Clock, flags uint32) (*FileDescription, error) { + vd := vfs.NewAnonVirtualDentry("[timerfd]") + defer vd.DecRef() + tfd := &TimerFileDescription{} + tfd.timer = ktime.NewTimer(clock, tfd) + if err := tfd.vfsfd.Init(tfd, flags, vd.Mount(), vd.Dentry(), &FileDescriptionOptions{ + UseDentryMetadata: true, + DenyPRead: true, + DenyPWrite: true, + InvalidWrite: true, + }); err != nil { + return nil, err + } + return &tfd.vfsfd, nil +} + +// Read implements FileDescriptionImpl.Read. +func (tfd *TimerFileDescription) Read(ctx context.Context, dst usermem.IOSequence, opts ReadOptions) (int64, error) { + const sizeofUint64 = 8 + if dst.NumBytes() < sizeofUint64 { + return 0, syserror.EINVAL + } + if val := atomic.SwapUint64(&tfd.val, 0); val != 0 { + var buf [sizeofUint64]byte + usermem.ByteOrder.PutUint64(buf[:], val) + if _, err := dst.CopyOut(ctx, buf[:]); err != nil { + // Linux does not undo consuming the number of + // expirations even if writing to userspace fails. + return 0, err + } + return sizeofUint64, nil + } + return 0, syserror.ErrWouldBlock +} + +// Clock returns the timer fd's Clock. +func (tfd *TimerFileDescription) Clock() ktime.Clock { + return tfd.timer.Clock() +} + +// GetTime returns the associated Timer's setting and the time at which it was +// observed. +func (tfd *TimerFileDescription) GetTime() (ktime.Time, ktime.Setting) { + return tfd.timer.Get() +} + +// SetTime atomically changes the associated Timer's setting, resets the number +// of expirations to 0, and returns the previous setting and the time at which +// it was observed. +func (tfd *TimerFileDescription) SetTime(s ktime.Setting) (ktime.Time, ktime.Setting) { + return tfd.timer.SwapAnd(s, func() { atomic.StoreUint64(&tfd.val, 0) }) +} + +// Readiness implements waiter.Waitable.Readiness. +func (tfd *TimerFileDescription) Readiness(mask waiter.EventMask) waiter.EventMask { + var ready waiter.EventMask + if atomic.LoadUint64(&tfd.val) != 0 { + ready |= waiter.EventIn + } + return ready +} + +// EventRegister implements waiter.Waitable.EventRegister. +func (tfd *TimerFileDescription) EventRegister(e *waiter.Entry, mask waiter.EventMask) { + tfd.events.EventRegister(e, mask) +} + +// EventUnregister implements waiter.Waitable.EventUnregister. +func (tfd *TimerFileDescription) EventUnregister(e *waiter.Entry) { + tfd.events.EventUnregister(e) +} + +// PauseTimer pauses the associated Timer. +func (tfd *TimerFileDescription) PauseTimer() { + tfd.timer.Pause() +} + +// ResumeTimer resumes the associated Timer. +func (tfd *TimerFileDescription) ResumeTimer() { + tfd.timer.Resume() +} + +// Release implements FileDescriptionImpl.Release() +func (tfd *TimerFileDescription) Release() { + tfd.timer.Destroy() +} + +// Notify implements ktime.TimerListener.Notify. +func (tfd *TimerFileDescription) Notify(exp uint64, setting ktime.Setting) (ktime.Setting, bool) { + atomic.AddUint64(&tfd.val, exp) + tfd.events.Notify(waiter.EventIn) + return ktime.Setting{}, false +} + +// Destroy implements ktime.TimerListener.Destroy. +func (tfd *TimerFileDescription) Destroy() {}