394 lines
12 KiB
Go
394 lines
12 KiB
Go
// Copyright 2018 The gVisor Authors.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package kernel
|
|
|
|
import (
|
|
"fmt"
|
|
|
|
"gvisor.dev/gvisor/pkg/abi/linux"
|
|
"gvisor.dev/gvisor/pkg/sentry/hostcpu"
|
|
"gvisor.dev/gvisor/pkg/syserror"
|
|
"gvisor.dev/gvisor/pkg/usermem"
|
|
)
|
|
|
|
// Restartable sequences.
|
|
//
|
|
// We support two different APIs for restartable sequences.
|
|
//
|
|
// 1. The upstream interface added in v4.18.
|
|
// 2. The interface described in https://lwn.net/Articles/650333/.
|
|
//
|
|
// Throughout this file and other parts of the kernel, the latter is referred
|
|
// to as "old rseq". This interface was never merged upstream, but is supported
|
|
// for a limited set of applications that use it regardless.
|
|
|
|
// OldRSeqCriticalRegion describes an old rseq critical region.
|
|
//
|
|
// +stateify savable
|
|
type OldRSeqCriticalRegion struct {
|
|
// When a task in this thread group has its CPU preempted (as defined by
|
|
// platform.ErrContextCPUPreempted) or has a signal delivered to an
|
|
// application handler while its instruction pointer is in CriticalSection,
|
|
// set the instruction pointer to Restart and application register r10 (on
|
|
// amd64) to the former instruction pointer.
|
|
CriticalSection usermem.AddrRange
|
|
Restart usermem.Addr
|
|
}
|
|
|
|
// RSeqAvailable returns true if t supports (old and new) restartable sequences.
|
|
func (t *Task) RSeqAvailable() bool {
|
|
return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption()
|
|
}
|
|
|
|
// SetRSeq registers addr as this thread's rseq structure.
|
|
//
|
|
// Preconditions: The caller must be running on the task goroutine.
|
|
func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error {
|
|
if t.rseqAddr != 0 {
|
|
if t.rseqAddr != addr {
|
|
return syserror.EINVAL
|
|
}
|
|
if t.rseqSignature != signature {
|
|
return syserror.EINVAL
|
|
}
|
|
return syserror.EBUSY
|
|
}
|
|
|
|
// rseq must be aligned and correctly sized.
|
|
if addr&(linux.AlignOfRSeq-1) != 0 {
|
|
return syserror.EINVAL
|
|
}
|
|
if length != linux.SizeOfRSeq {
|
|
return syserror.EINVAL
|
|
}
|
|
if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok {
|
|
return syserror.EFAULT
|
|
}
|
|
|
|
t.rseqAddr = addr
|
|
t.rseqSignature = signature
|
|
|
|
// Initialize the CPUID.
|
|
//
|
|
// Linux implicitly does this on return from userspace, where failure
|
|
// would cause SIGSEGV.
|
|
if err := t.rseqUpdateCPU(); err != nil {
|
|
t.rseqAddr = 0
|
|
t.rseqSignature = 0
|
|
|
|
t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
|
|
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
|
|
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
|
|
return syserror.EFAULT
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// ClearRSeq unregisters addr as this thread's rseq structure.
|
|
//
|
|
// Preconditions: The caller must be running on the task goroutine.
|
|
func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error {
|
|
if t.rseqAddr == 0 {
|
|
return syserror.EINVAL
|
|
}
|
|
if t.rseqAddr != addr {
|
|
return syserror.EINVAL
|
|
}
|
|
if length != linux.SizeOfRSeq {
|
|
return syserror.EINVAL
|
|
}
|
|
if t.rseqSignature != signature {
|
|
return syserror.EPERM
|
|
}
|
|
|
|
if err := t.rseqClearCPU(); err != nil {
|
|
return err
|
|
}
|
|
|
|
t.rseqAddr = 0
|
|
t.rseqSignature = 0
|
|
|
|
if t.oldRSeqCPUAddr == 0 {
|
|
// rseqCPU no longer needed.
|
|
t.rseqCPU = -1
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// OldRSeqCriticalRegion returns a copy of t's thread group's current
|
|
// old restartable sequence.
|
|
func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion {
|
|
return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
|
|
}
|
|
|
|
// SetOldRSeqCriticalRegion replaces t's thread group's old restartable
|
|
// sequence.
|
|
//
|
|
// Preconditions: t.RSeqAvailable() == true.
|
|
func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error {
|
|
// These checks are somewhat more lenient than in Linux, which (bizarrely)
|
|
// requires r.CriticalSection to be non-empty and r.Restart to be
|
|
// outside of r.CriticalSection, even if r.CriticalSection.Start == 0
|
|
// (which disables the critical region).
|
|
if r.CriticalSection.Start == 0 {
|
|
r.CriticalSection.End = 0
|
|
r.Restart = 0
|
|
t.tg.oldRSeqCritical.Store(&r)
|
|
return nil
|
|
}
|
|
if r.CriticalSection.Start >= r.CriticalSection.End {
|
|
return syserror.EINVAL
|
|
}
|
|
if r.CriticalSection.Contains(r.Restart) {
|
|
return syserror.EINVAL
|
|
}
|
|
// TODO(jamieliu): check that r.CriticalSection and r.Restart are in
|
|
// the application address range, for consistency with Linux.
|
|
t.tg.oldRSeqCritical.Store(&r)
|
|
return nil
|
|
}
|
|
|
|
// OldRSeqCPUAddr returns the address that old rseq will keep updated with t's
|
|
// CPU number.
|
|
//
|
|
// Preconditions: The caller must be running on the task goroutine.
|
|
func (t *Task) OldRSeqCPUAddr() usermem.Addr {
|
|
return t.oldRSeqCPUAddr
|
|
}
|
|
|
|
// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
|
|
// t's CPU number.
|
|
//
|
|
// Preconditions: t.RSeqAvailable() == true. The caller must be running on the
|
|
// task goroutine. t's AddressSpace must be active.
|
|
func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
|
|
t.oldRSeqCPUAddr = addr
|
|
|
|
// Check that addr is writable.
|
|
//
|
|
// N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's
|
|
// unfortunate, but unlikely in a correct program.
|
|
if err := t.rseqUpdateCPU(); err != nil {
|
|
t.oldRSeqCPUAddr = 0
|
|
return syserror.EINVAL // yes, EINVAL, not err or EFAULT
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Preconditions: The caller must be running on the task goroutine. t's
|
|
// AddressSpace must be active.
|
|
func (t *Task) rseqUpdateCPU() error {
|
|
if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
|
|
t.rseqCPU = -1
|
|
return nil
|
|
}
|
|
|
|
t.rseqCPU = int32(hostcpu.GetCPU())
|
|
|
|
// Update both CPUs, even if one fails.
|
|
rerr := t.rseqCopyOutCPU()
|
|
oerr := t.oldRSeqCopyOutCPU()
|
|
|
|
if rerr != nil {
|
|
return rerr
|
|
}
|
|
return oerr
|
|
}
|
|
|
|
// Preconditions: The caller must be running on the task goroutine. t's
|
|
// AddressSpace must be active.
|
|
func (t *Task) oldRSeqCopyOutCPU() error {
|
|
if t.oldRSeqCPUAddr == 0 {
|
|
return nil
|
|
}
|
|
|
|
buf := t.CopyScratchBuffer(4)
|
|
usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))
|
|
_, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf)
|
|
return err
|
|
}
|
|
|
|
// Preconditions: The caller must be running on the task goroutine. t's
|
|
// AddressSpace must be active.
|
|
func (t *Task) rseqCopyOutCPU() error {
|
|
if t.rseqAddr == 0 {
|
|
return nil
|
|
}
|
|
|
|
buf := t.CopyScratchBuffer(8)
|
|
// CPUIDStart and CPUID are the first two fields in linux.RSeq.
|
|
usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU)) // CPUIDStart
|
|
usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID
|
|
// N.B. This write is not atomic, but since this occurs on the task
|
|
// goroutine then as long as userspace uses a single-instruction read
|
|
// it can't see an invalid value.
|
|
_, err := t.CopyOutBytes(t.rseqAddr, buf)
|
|
return err
|
|
}
|
|
|
|
// Preconditions: The caller must be running on the task goroutine. t's
|
|
// AddressSpace must be active.
|
|
func (t *Task) rseqClearCPU() error {
|
|
buf := t.CopyScratchBuffer(8)
|
|
// CPUIDStart and CPUID are the first two fields in linux.RSeq.
|
|
usermem.ByteOrder.PutUint32(buf, 0) // CPUIDStart
|
|
usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID
|
|
// N.B. This write is not atomic, but since this occurs on the task
|
|
// goroutine then as long as userspace uses a single-instruction read
|
|
// it can't see an invalid value.
|
|
_, err := t.CopyOutBytes(t.rseqAddr, buf)
|
|
return err
|
|
}
|
|
|
|
// rseqAddrInterrupt checks if IP is in a critical section, and aborts if so.
|
|
//
|
|
// This is a bit complex since both the RSeq and RSeqCriticalSection structs
|
|
// are stored in userspace. So we must:
|
|
//
|
|
// 1. Copy in the address of RSeqCriticalSection from RSeq.
|
|
// 2. Copy in RSeqCriticalSection itself.
|
|
// 3. Validate critical section struct version, address range, abort address.
|
|
// 4. Validate the abort signature (4 bytes preceding abort IP match expected
|
|
// signature).
|
|
// 5. Clear address of RSeqCriticalSection from RSeq.
|
|
// 6. Finally, conditionally abort.
|
|
//
|
|
// See kernel/rseq.c:rseq_ip_fixup for reference.
|
|
//
|
|
// Preconditions: The caller must be running on the task goroutine. t's
|
|
// AddressSpace must be active.
|
|
func (t *Task) rseqAddrInterrupt() {
|
|
if t.rseqAddr == 0 {
|
|
return
|
|
}
|
|
|
|
critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection)
|
|
if !ok {
|
|
// SetRSeq should validate this.
|
|
panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr))
|
|
}
|
|
|
|
if t.Arch().Width() != 8 {
|
|
// We only handle 64-bit for now.
|
|
t.Debugf("Only 64-bit rseq supported.")
|
|
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
|
|
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
|
|
return
|
|
}
|
|
|
|
buf := t.CopyScratchBuffer(8)
|
|
if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil {
|
|
t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err)
|
|
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
|
|
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
|
|
return
|
|
}
|
|
|
|
critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf))
|
|
if critAddr == 0 {
|
|
return
|
|
}
|
|
|
|
var cs linux.RSeqCriticalSection
|
|
if _, err := cs.CopyIn(t, critAddr); err != nil {
|
|
t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
|
|
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
|
|
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
|
|
return
|
|
}
|
|
|
|
if cs.Version != 0 {
|
|
t.Debugf("Unknown version in %+v", cs)
|
|
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
|
|
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
|
|
return
|
|
}
|
|
|
|
start := usermem.Addr(cs.Start)
|
|
critRange, ok := start.ToRange(cs.PostCommitOffset)
|
|
if !ok {
|
|
t.Debugf("Invalid start and offset in %+v", cs)
|
|
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
|
|
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
|
|
return
|
|
}
|
|
|
|
abort := usermem.Addr(cs.Abort)
|
|
if critRange.Contains(abort) {
|
|
t.Debugf("Abort in critical section in %+v", cs)
|
|
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
|
|
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
|
|
return
|
|
}
|
|
|
|
// Verify signature.
|
|
sigAddr := abort - linux.SizeOfRSeqSignature
|
|
|
|
buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature)
|
|
if _, err := t.CopyInBytes(sigAddr, buf); err != nil {
|
|
t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err)
|
|
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
|
|
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
|
|
return
|
|
}
|
|
|
|
sig := usermem.ByteOrder.Uint32(buf)
|
|
if sig != t.rseqSignature {
|
|
t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature)
|
|
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
|
|
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
|
|
return
|
|
}
|
|
|
|
// Clear the critical section address.
|
|
//
|
|
// NOTE(b/143949567): We don't support any rseq flags, so we always
|
|
// restart if we are in the critical section, and thus *always* clear
|
|
// critAddrAddr.
|
|
if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{
|
|
AddressSpaceActive: true,
|
|
}); err != nil {
|
|
t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err)
|
|
t.forceSignal(linux.SIGSEGV, false /* unconditional */)
|
|
t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
|
|
return
|
|
}
|
|
|
|
// Finally we can actually decide whether or not to restart.
|
|
if !critRange.Contains(usermem.Addr(t.Arch().IP())) {
|
|
return
|
|
}
|
|
|
|
t.Arch().SetIP(uintptr(cs.Abort))
|
|
}
|
|
|
|
// Preconditions: The caller must be running on the task goroutine.
|
|
func (t *Task) oldRSeqInterrupt() {
|
|
r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
|
|
if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) {
|
|
t.Debugf("Interrupted rseq critical section at %#x; restarting at %#x", ip, r.Restart)
|
|
t.Arch().SetIP(uintptr(r.Restart))
|
|
t.Arch().SetOldRSeqInterruptedIP(ip)
|
|
}
|
|
}
|
|
|
|
// Preconditions: The caller must be running on the task goroutine.
|
|
func (t *Task) rseqInterrupt() {
|
|
t.rseqAddrInterrupt()
|
|
t.oldRSeqInterrupt()
|
|
}
|