gvisor/pkg/sentry/loader/vdso.go

// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package loader

import (
	"debug/elf"
	"fmt"
	"io"

	"gvisor.dev/gvisor/pkg/abi"
	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/arch"
	"gvisor.dev/gvisor/pkg/sentry/context"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/fs/anon"
	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/mm"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/safemem"
	"gvisor.dev/gvisor/pkg/sentry/uniqueid"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/sentry/usermem"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/waiter"
)

type fileContext struct {
	context.Context
}

func (f *fileContext) Value(key interface{}) interface{} {
	switch key {
	case uniqueid.CtxGlobalUniqueID:
		return uint64(0)
	default:
		return f.Context.Value(key)
	}
}

// byteReader implements fs.FileOperations for reading from a []byte source.
type byteReader struct {
	fsutil.FileNoFsync              `state:"nosave"`
	fsutil.FileNoIoctl              `state:"nosave"`
	fsutil.FileNoMMap               `state:"nosave"`
	fsutil.FileNoSplice             `state:"nosave"`
	fsutil.FileNoopFlush            `state:"nosave"`
	fsutil.FileNoopRelease          `state:"nosave"`
	fsutil.FileNotDirReaddir        `state:"nosave"`
	fsutil.FilePipeSeek             `state:"nosave"`
	fsutil.FileUseInodeUnstableAttr `state:"nosave"`
	waiter.AlwaysReady              `state:"nosave"`

	data []byte
}

var _ fs.FileOperations = (*byteReader)(nil)

// newByteReaderFile creates a fake file to read data from.
func newByteReaderFile(ctx context.Context, data []byte) *fs.File {
	// Create a fake inode.
	inode := fs.NewInode(
		ctx,
		&fsutil.SimpleFileInode{},
		fs.NewPseudoMountSource(ctx),
		fs.StableAttr{
			Type:      fs.Anonymous,
			DeviceID:  anon.PseudoDevice.DeviceID(),
			InodeID:   anon.PseudoDevice.NextIno(),
			BlockSize: usermem.PageSize,
		})

	// Use the fake inode to create a fake dirent.
	dirent := fs.NewTransientDirent(inode)
	defer dirent.DecRef()

	// Use the fake dirent to make a fake file.
	flags := fs.FileFlags{Read: true, Pread: true}
	return fs.NewFile(&fileContext{Context: context.Background()}, dirent, flags, &byteReader{
		data: data,
	})
}

func (b *byteReader) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
	if offset < 0 {
		return 0, syserror.EINVAL
	}
	if offset >= int64(len(b.data)) {
		return 0, io.EOF
	}
	n, err := dst.CopyOut(ctx, b.data[offset:])
	return int64(n), err
}

func (b *byteReader) Write(ctx context.Context, file *fs.File, src usermem.IOSequence, offset int64) (int64, error) {
	panic("Write not supported")
}

// validateVDSO checks that the VDSO can be loaded by loadVDSO.
//
// VDSOs are special (see below). Since we are going to map the VDSO directly
// rather than using a normal loading process, we require that the PT_LOAD
// segments have the same layout in the ELF as they expect to have in memory.
//
// Namely, this means that we must verify:
// * PT_LOAD file offsets are equivalent to the memory offset from the first
//   segment.
// * No extra zeroed space (memsz) is required.
// * PT_LOAD segments are in order.
// * No two PT_LOAD segments occupy parts of the same page.
// * PT_LOAD segments don't extend beyond the end of the file.
//
// ctx may be nil if f does not need it.
func validateVDSO(ctx context.Context, f *fs.File, size uint64) (elfInfo, error) {
	info, err := parseHeader(ctx, f)
	if err != nil {
		log.Infof("Unable to parse VDSO header: %v", err)
		return elfInfo{}, err
	}

	var first *elf.ProgHeader
	var prev *elf.ProgHeader
	var prevEnd usermem.Addr
	for i, phdr := range info.phdrs {
		if phdr.Type != elf.PT_LOAD {
			continue
		}

		if first == nil {
			first = &info.phdrs[i]
			if phdr.Off != 0 {
				log.Warningf("First PT_LOAD segment has non-zero file offset")
				return elfInfo{}, syserror.ENOEXEC
			}
		}

		memoryOffset := phdr.Vaddr - first.Vaddr
		if memoryOffset != phdr.Off {
			log.Warningf("PT_LOAD segment memory offset %#x != file offset %#x", memoryOffset, phdr.Off)
			return elfInfo{}, syserror.ENOEXEC
		}

		// memsz larger than filesz means that extra zeroed space should be
		// provided at the end of the segment. Since we are mapping the ELF
		// directly, we don't want to just overwrite part of the ELF with
		// zeroes.
		if phdr.Memsz != phdr.Filesz {
			log.Warningf("PT_LOAD segment memsz %#x != filesz %#x", phdr.Memsz, phdr.Filesz)
			return elfInfo{}, syserror.ENOEXEC
		}

		start := usermem.Addr(memoryOffset)
		end, ok := start.AddLength(phdr.Memsz)
		if !ok {
			log.Warningf("PT_LOAD segment size overflows: %#x + %#x", start, end)
			return elfInfo{}, syserror.ENOEXEC
		}
		if uint64(end) > size {
			log.Warningf("PT_LOAD segment end %#x extends beyond end of file %#x", end, size)
			return elfInfo{}, syserror.ENOEXEC
		}

		if prev != nil {
			if start < prevEnd {
				log.Warningf("PT_LOAD segments out of order")
				return elfInfo{}, syserror.ENOEXEC
			}

			// We mprotect entire pages, so each segment must be in
			// its own page.
			prevEndPage := prevEnd.RoundDown()
			startPage := start.RoundDown()
			if prevEndPage >= startPage {
				log.Warningf("PT_LOAD segments share a page: %#x", prevEndPage)
				return elfInfo{}, syserror.ENOEXEC
			}
		}
		prev = &info.phdrs[i]
		prevEnd = end
	}

	return info, nil
}

// VDSO describes a VDSO.
//
// NOTE(mpratt): to support multiple architectures or operating systems, this
// would need to contain a VDSO for each.
//
// +stateify savable
type VDSO struct {
	// ParamPage is the VDSO parameter page. This page should be updated to
	// inform the VDSO for timekeeping data.
	ParamPage *mm.SpecialMappable

	// vdso is the VDSO ELF itself.
	vdso *mm.SpecialMappable

	// os is the operating system targeted by the VDSO.
	os abi.OS

	// arch is the architecture targeted by the VDSO.
	arch arch.Arch

	// phdrs are the VDSO ELF phdrs.
	phdrs []elf.ProgHeader `state:".([]elfProgHeader)"`
}

// PrepareVDSO validates the system VDSO and returns a VDSO, containing the
// param page for updating by the kernel.
func PrepareVDSO(ctx context.Context, mfp pgalloc.MemoryFileProvider) (*VDSO, error) {
	vdsoFile := newByteReaderFile(ctx, vdsoBin)

	// First make sure the VDSO is valid. vdsoFile does not use ctx, so a
	// nil context can be passed.
	info, err := validateVDSO(nil, vdsoFile, uint64(len(vdsoBin)))
	vdsoFile.DecRef()
	if err != nil {
		return nil, err
	}

	// Then copy it into a VDSO mapping.
	size, ok := usermem.Addr(len(vdsoBin)).RoundUp()
	if !ok {
		return nil, fmt.Errorf("VDSO size overflows? %#x", len(vdsoBin))
	}

	mf := mfp.MemoryFile()
	vdso, err := mf.Allocate(uint64(size), usage.System)
	if err != nil {
		return nil, fmt.Errorf("unable to allocate VDSO memory: %v", err)
	}

	ims, err := mf.MapInternal(vdso, usermem.ReadWrite)
	if err != nil {
		mf.DecRef(vdso)
		return nil, fmt.Errorf("unable to map VDSO memory: %v", err)
	}

	_, err = safemem.CopySeq(ims, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(vdsoBin)))
	if err != nil {
		mf.DecRef(vdso)
		return nil, fmt.Errorf("unable to copy VDSO into memory: %v", err)
	}

	// Finally, allocate a param page for this VDSO.
	paramPage, err := mf.Allocate(usermem.PageSize, usage.System)
	if err != nil {
		mf.DecRef(vdso)
		return nil, fmt.Errorf("unable to allocate VDSO param page: %v", err)
	}

	return &VDSO{
		ParamPage: mm.NewSpecialMappable("[vvar]", mfp, paramPage),
		// TODO(gvisor.dev/issue/157): Don't advertise the VDSO, as
		// some applications may not be able to handle multiple [vdso]
		// hints.
		vdso:  mm.NewSpecialMappable("", mfp, vdso),
		os:    info.os,
		arch:  info.arch,
		phdrs: info.phdrs,
	}, nil
}

// loadVDSO loads the VDSO into m.
//
// VDSOs are special.
//
// VDSOs are fully position independent. However, instead of loading a VDSO
// like a normal ELF binary, mapping only the PT_LOAD segments, the Linux
// kernel simply directly maps the entire file into process memory, with very
// little real ELF parsing.
//
// NOTE(b/25323870): This means that userspace can, and unfortunately does,
// depend on parts of the ELF that would normally not be mapped.  To maintain
// compatibility with such binaries, we load the VDSO much like Linux.
//
// loadVDSO takes a reference on the VDSO and parameter page FrameRegions.
func loadVDSO(ctx context.Context, m *mm.MemoryManager, v *VDSO, bin loadedELF) (usermem.Addr, error) {
	if v.os != bin.os {
		ctx.Warningf("Binary ELF OS %v and VDSO ELF OS %v differ", bin.os, v.os)
		return 0, syserror.ENOEXEC
	}
	if v.arch != bin.arch {
		ctx.Warningf("Binary ELF arch %v and VDSO ELF arch %v differ", bin.arch, v.arch)
		return 0, syserror.ENOEXEC
	}

	// Reserve address space for the VDSO and its parameter page, which is
	// mapped just before the VDSO.
	mapSize := v.vdso.Length() + v.ParamPage.Length()
	addr, err := m.MMap(ctx, memmap.MMapOpts{
		Length:  mapSize,
		Private: true,
	})
	if err != nil {
		ctx.Infof("Unable to reserve VDSO address space: %v", err)
		return 0, err
	}

	// Now map the param page.
	_, err = m.MMap(ctx, memmap.MMapOpts{
		Length:          v.ParamPage.Length(),
		MappingIdentity: v.ParamPage,
		Mappable:        v.ParamPage,
		Addr:            addr,
		Fixed:           true,
		Unmap:           true,
		Private:         true,
		Perms:           usermem.Read,
		MaxPerms:        usermem.Read,
	})
	if err != nil {
		ctx.Infof("Unable to map VDSO param page: %v", err)
		return 0, err
	}

	// Now map the VDSO itself.
	vdsoAddr, ok := addr.AddLength(v.ParamPage.Length())
	if !ok {
		panic(fmt.Sprintf("Part of mapped range overflows? %#x + %#x", addr, v.ParamPage.Length()))
	}
	_, err = m.MMap(ctx, memmap.MMapOpts{
		Length:          v.vdso.Length(),
		MappingIdentity: v.vdso,
		Mappable:        v.vdso,
		Addr:            vdsoAddr,
		Fixed:           true,
		Unmap:           true,
		Private:         true,
		Perms:           usermem.Read,
		MaxPerms:        usermem.AnyAccess,
	})
	if err != nil {
		ctx.Infof("Unable to map VDSO: %v", err)
		return 0, err
	}

	vdsoEnd, ok := vdsoAddr.AddLength(v.vdso.Length())
	if !ok {
		panic(fmt.Sprintf("VDSO mapping overflows? %#x + %#x", vdsoAddr, v.vdso.Length()))
	}

	// Set additional protections for the individual segments.
	var first *elf.ProgHeader
	for i, phdr := range v.phdrs {
		if phdr.Type != elf.PT_LOAD {
			continue
		}

		if first == nil {
			first = &v.phdrs[i]
		}

		memoryOffset := phdr.Vaddr - first.Vaddr
		segAddr, ok := vdsoAddr.AddLength(memoryOffset)
		if !ok {
			ctx.Warningf("PT_LOAD segment address overflows: %#x + %#x", segAddr, memoryOffset)
			return 0, syserror.ENOEXEC
		}
		segPage := segAddr.RoundDown()
		segSize := usermem.Addr(phdr.Memsz)
		segSize, ok = segSize.AddLength(segAddr.PageOffset())
		if !ok {
			ctx.Warningf("PT_LOAD segment memsize %#x + offset %#x overflows", phdr.Memsz, segAddr.PageOffset())
			return 0, syserror.ENOEXEC
		}
		segSize, ok = segSize.RoundUp()
		if !ok {
			ctx.Warningf("PT_LOAD segment size overflows: %#x", phdr.Memsz+segAddr.PageOffset())
			return 0, syserror.ENOEXEC
		}
		segEnd, ok := segPage.AddLength(uint64(segSize))
		if !ok {
			ctx.Warningf("PT_LOAD segment range overflows: %#x + %#x", segAddr, segSize)
			return 0, syserror.ENOEXEC
		}
		if segEnd > vdsoEnd {
			ctx.Warningf("PT_LOAD segment ends beyond VDSO: %#x > %#x", segEnd, vdsoEnd)
			return 0, syserror.ENOEXEC
		}

		perms := progFlagsAsPerms(phdr.Flags)
		if perms != usermem.Read {
			if err := m.MProtect(segPage, uint64(segSize), perms, false); err != nil {
				ctx.Warningf("Unable to set PT_LOAD segment protections %+v at [%#x, %#x): %v", perms, segAddr, segEnd, err)
				return 0, syserror.ENOEXEC
			}
		}
	}

	return vdsoAddr, nil
}