// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package kernel

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/sentry/hostcpu"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)

// Restartable sequences.
//
// We support two different APIs for restartable sequences.
//
// 1. The upstream interface added in v4.18.
// 2. The interface described in https://lwn.net/Articles/650333/.
//
// Throughout this file and other parts of the kernel, the latter is referred
// to as "old rseq". This interface was never merged upstream, but is
// supported for a limited set of applications that use it regardless.

// OldRSeqCriticalRegion describes an old rseq critical region.
//
// +stateify savable
type OldRSeqCriticalRegion struct {
	// When a task in this thread group has its CPU preempted (as defined by
	// platform.ErrContextCPUPreempted) or has a signal delivered to an
	// application handler while its instruction pointer is in
	// CriticalSection, set the instruction pointer to Restart and
	// application register r10 (on amd64) to the former instruction pointer.
	CriticalSection usermem.AddrRange
	Restart         usermem.Addr
}

// RSeqAvailable returns true if t supports (old and new) restartable
// sequences.
func (t *Task) RSeqAvailable() bool {
	return t.k.useHostCores && t.k.Platform.DetectsCPUPreemption()
}

// SetRSeq registers addr as this thread's rseq structure.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) SetRSeq(addr usermem.Addr, length, signature uint32) error {
	if t.rseqAddr != 0 {
		if t.rseqAddr != addr {
			return syserror.EINVAL
		}
		if t.rseqSignature != signature {
			return syserror.EINVAL
		}
		return syserror.EBUSY
	}

	// rseq must be aligned and correctly sized.
	if addr&(linux.AlignOfRSeq-1) != 0 {
		return syserror.EINVAL
	}
	if length != linux.SizeOfRSeq {
		return syserror.EINVAL
	}
	if _, ok := t.MemoryManager().CheckIORange(addr, linux.SizeOfRSeq); !ok {
		return syserror.EFAULT
	}

	t.rseqAddr = addr
	t.rseqSignature = signature

	// Initialize the CPUID.
	//
	// Linux implicitly does this on return from userspace, where failure
	// would cause SIGSEGV.
	if err := t.rseqUpdateCPU(); err != nil {
		// Log before clearing t.rseqAddr so that the failing address
		// is preserved in the message.
		t.Debugf("Failed to copy CPU to %#x for rseq: %v", t.rseqAddr, err)
		t.rseqAddr = 0
		t.rseqSignature = 0
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return syserror.EFAULT
	}

	return nil
}

// ClearRSeq unregisters addr as this thread's rseq structure.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) ClearRSeq(addr usermem.Addr, length, signature uint32) error {
	if t.rseqAddr == 0 {
		return syserror.EINVAL
	}
	if t.rseqAddr != addr {
		return syserror.EINVAL
	}
	if length != linux.SizeOfRSeq {
		return syserror.EINVAL
	}
	if t.rseqSignature != signature {
		return syserror.EPERM
	}

	if err := t.rseqClearCPU(); err != nil {
		return err
	}

	t.rseqAddr = 0
	t.rseqSignature = 0

	if t.oldRSeqCPUAddr == 0 {
		// rseqCPU no longer needed.
		t.rseqCPU = -1
	}

	return nil
}

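// For reference, userspace drives the registration pair above through the
// rseq(2) syscall. Roughly (a sketch of the Linux ABI; RSEQ_SIG stands for
// an application-chosen signature constant, not a symbol defined in this
// package):
//
//	rseq(&area, sizeof(area), 0, RSEQ_SIG);                    // SetRSeq
//	rseq(&area, sizeof(area), RSEQ_FLAG_UNREGISTER, RSEQ_SIG); // ClearRSeq
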
// OldRSeqCriticalRegion returns a copy of t's thread group's current
// old restartable sequence.
func (t *Task) OldRSeqCriticalRegion() OldRSeqCriticalRegion {
	return *t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
}

// SetOldRSeqCriticalRegion replaces t's thread group's old restartable
// sequence.
//
// Preconditions: t.RSeqAvailable() == true.
func (t *Task) SetOldRSeqCriticalRegion(r OldRSeqCriticalRegion) error {
	// These checks are somewhat more lenient than in Linux, which (bizarrely)
	// requires r.CriticalSection to be non-empty and r.Restart to be
	// outside of r.CriticalSection, even if r.CriticalSection.Start == 0
	// (which disables the critical region).
	if r.CriticalSection.Start == 0 {
		r.CriticalSection.End = 0
		r.Restart = 0
		t.tg.oldRSeqCritical.Store(&r)
		return nil
	}
	if r.CriticalSection.Start >= r.CriticalSection.End {
		return syserror.EINVAL
	}
	if r.CriticalSection.Contains(r.Restart) {
		return syserror.EINVAL
	}
	// TODO(jamieliu): check that r.CriticalSection and r.Restart are in
	// the application address range, for consistency with Linux.
	t.tg.oldRSeqCritical.Store(&r)
	return nil
}

// OldRSeqCPUAddr returns the address that old rseq will keep updated with t's
// CPU number.
//
// Preconditions: The caller must be running on the task goroutine.
func (t *Task) OldRSeqCPUAddr() usermem.Addr {
	return t.oldRSeqCPUAddr
}

// SetOldRSeqCPUAddr replaces the address that old rseq will keep updated with
// t's CPU number.
//
// Preconditions: t.RSeqAvailable() == true. The caller must be running on the
// task goroutine. t's AddressSpace must be active.
func (t *Task) SetOldRSeqCPUAddr(addr usermem.Addr) error {
	t.oldRSeqCPUAddr = addr

	// Check that addr is writable.
	//
	// N.B. rseqUpdateCPU may fail on a bad t.rseqAddr as well. That's
	// unfortunate, but unlikely in a correct program.
	if err := t.rseqUpdateCPU(); err != nil {
		t.oldRSeqCPUAddr = 0
		return syserror.EINVAL // yes, EINVAL, not err or EFAULT
	}
	return nil
}

// Preconditions: The caller must be running on the task goroutine. t's
// AddressSpace must be active.
func (t *Task) rseqUpdateCPU() error {
	if t.rseqAddr == 0 && t.oldRSeqCPUAddr == 0 {
		t.rseqCPU = -1
		return nil
	}

	t.rseqCPU = int32(hostcpu.GetCPU())

	// Update both CPUs, even if one fails.
	rerr := t.rseqCopyOutCPU()
	oerr := t.oldRSeqCopyOutCPU()

	if rerr != nil {
		return rerr
	}
	return oerr
}

// Preconditions: The caller must be running on the task goroutine. t's
// AddressSpace must be active.
func (t *Task) oldRSeqCopyOutCPU() error {
	if t.oldRSeqCPUAddr == 0 {
		return nil
	}

	buf := t.CopyScratchBuffer(4)
	usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))
	_, err := t.CopyOutBytes(t.oldRSeqCPUAddr, buf)
	return err
}

// Preconditions: The caller must be running on the task goroutine. t's
// AddressSpace must be active.
func (t *Task) rseqCopyOutCPU() error {
	if t.rseqAddr == 0 {
		return nil
	}

	buf := t.CopyScratchBuffer(8)
	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
	usermem.ByteOrder.PutUint32(buf, uint32(t.rseqCPU))     // CPUIDStart
	usermem.ByteOrder.PutUint32(buf[4:], uint32(t.rseqCPU)) // CPUID

	// N.B. This write is not atomic, but since this occurs on the task
	// goroutine then as long as userspace uses a single-instruction read
	// it can't see an invalid value.
	_, err := t.CopyOutBytes(t.rseqAddr, buf)
	return err
}

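// rseqClearCPU resets the CPU fields of t's rseq struct to their
// unregistered values. Note the asymmetry: Linux defines CPUIDStart as
// always holding a possible CPU number (0 when unregistered), while CPUID
// is set to RSEQ_CPU_ID_UNINITIALIZED to tell userspace that rseq is no
// longer registered.
//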
// Preconditions: The caller must be running on the task goroutine. t's
// AddressSpace must be active.
func (t *Task) rseqClearCPU() error {
	buf := t.CopyScratchBuffer(8)
	// CPUIDStart and CPUID are the first two fields in linux.RSeq.
	usermem.ByteOrder.PutUint32(buf, 0)                                   // CPUIDStart
	usermem.ByteOrder.PutUint32(buf[4:], linux.RSEQ_CPU_ID_UNINITIALIZED) // CPUID

	// N.B. This write is not atomic, but since this occurs on the task
	// goroutine then as long as userspace uses a single-instruction read
	// it can't see an invalid value.
	_, err := t.CopyOutBytes(t.rseqAddr, buf)
	return err
}

// rseqAddrInterrupt checks if IP is in a critical section, and aborts if so.
//
// This is a bit complex since both the RSeq and RSeqCriticalSection structs
// are stored in userspace. So we must:
//
// 1. Copy in the address of RSeqCriticalSection from RSeq.
// 2. Copy in RSeqCriticalSection itself.
// 3. Validate critical section struct version, address range, abort address.
// 4. Validate the abort signature (4 bytes preceding abort IP match expected
//    signature).
// 5. Clear address of RSeqCriticalSection from RSeq.
// 6. Finally, conditionally abort.
//
// See kernel/rseq.c:rseq_ip_fixup for reference.
//
// Preconditions: The caller must be running on the task goroutine. t's
// AddressSpace must be active.
func (t *Task) rseqAddrInterrupt() {
	if t.rseqAddr == 0 {
		return
	}

	critAddrAddr, ok := t.rseqAddr.AddLength(linux.OffsetOfRSeqCriticalSection)
	if !ok {
		// SetRSeq should validate this.
		panic(fmt.Sprintf("t.rseqAddr (%#x) not large enough", t.rseqAddr))
	}

	if t.Arch().Width() != 8 {
		// We only handle 64-bit for now.
		t.Debugf("Only 64-bit rseq supported.")
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return
	}

	buf := t.CopyScratchBuffer(8)
	if _, err := t.CopyInBytes(critAddrAddr, buf); err != nil {
		t.Debugf("Failed to copy critical section address from %#x for rseq: %v", critAddrAddr, err)
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return
	}

	critAddr := usermem.Addr(usermem.ByteOrder.Uint64(buf))
	if critAddr == 0 {
		return
	}

	buf = t.CopyScratchBuffer(linux.SizeOfRSeqCriticalSection)
	if _, err := t.CopyInBytes(critAddr, buf); err != nil {
		t.Debugf("Failed to copy critical section from %#x for rseq: %v", critAddr, err)
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return
	}

	// Manually marshal RSeqCriticalSection as this is in the hot path when
	// rseq is enabled. It must be as fast as possible.
	//
	// TODO(b/130243041): Replace with go_marshal.
	cs := linux.RSeqCriticalSection{
		Version:          usermem.ByteOrder.Uint32(buf[0:4]),
		Flags:            usermem.ByteOrder.Uint32(buf[4:8]),
		Start:            usermem.ByteOrder.Uint64(buf[8:16]),
		PostCommitOffset: usermem.ByteOrder.Uint64(buf[16:24]),
		Abort:            usermem.ByteOrder.Uint64(buf[24:32]),
	}

	if cs.Version != 0 {
		t.Debugf("Unknown version in %+v", cs)
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return
	}

	start := usermem.Addr(cs.Start)
	critRange, ok := start.ToRange(cs.PostCommitOffset)
	if !ok {
		t.Debugf("Invalid start and offset in %+v", cs)
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return
	}

	abort := usermem.Addr(cs.Abort)
	if critRange.Contains(abort) {
		t.Debugf("Abort in critical section in %+v", cs)
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return
	}

	// Verify signature.
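	//
	// The 4 bytes immediately preceding the abort IP must match the
	// signature registered in SetRSeq; this prevents a stray or malicious
	// rseq_cs from redirecting execution to an arbitrary abort handler.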
	sigAddr := abort - linux.SizeOfRSeqSignature

	buf = t.CopyScratchBuffer(linux.SizeOfRSeqSignature)
	if _, err := t.CopyInBytes(sigAddr, buf); err != nil {
		t.Debugf("Failed to copy critical section signature from %#x for rseq: %v", sigAddr, err)
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return
	}

	sig := usermem.ByteOrder.Uint32(buf)
	if sig != t.rseqSignature {
		t.Debugf("Mismatched rseq signature %d != %d", sig, t.rseqSignature)
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return
	}

	// Clear the critical section address.
	//
	// NOTE(b/143949567): We don't support any rseq flags, so we always
	// restart if we are in the critical section, and thus *always* clear
	// critAddrAddr.
	if _, err := t.MemoryManager().ZeroOut(t, critAddrAddr, int64(t.Arch().Width()), usermem.IOOpts{
		AddressSpaceActive: true,
	}); err != nil {
		t.Debugf("Failed to clear critical section address from %#x for rseq: %v", critAddrAddr, err)
		t.forceSignal(linux.SIGSEGV, false /* unconditional */)
		t.SendSignal(SignalInfoPriv(linux.SIGSEGV))
		return
	}

	// Finally, we can actually decide whether or not to restart.
	if !critRange.Contains(usermem.Addr(t.Arch().IP())) {
		return
	}

	t.Arch().SetIP(uintptr(cs.Abort))
}

// Preconditions: The caller must be running on the task goroutine.
func (t *Task) oldRSeqInterrupt() {
	r := t.tg.oldRSeqCritical.Load().(*OldRSeqCriticalRegion)
	if ip := t.Arch().IP(); r.CriticalSection.Contains(usermem.Addr(ip)) {
		t.Debugf("Interrupted old rseq critical section at %#x; restarting at %#x", ip, r.Restart)
		t.Arch().SetIP(uintptr(r.Restart))
		t.Arch().SetOldRSeqInterruptedIP(ip)
	}
}

// Preconditions: The caller must be running on the task goroutine.
func (t *Task) rseqInterrupt() {
	t.rseqAddrInterrupt()
	t.oldRSeqInterrupt()
}