Implement mmap for host fs in vfs2.

In VFS1, both fs/host and fs/gofer used the same utils for host file mappings.
Refactor parts of fsimpl/gofer to create similar utils to share with
fsimpl/host (memory accounting code moved to fsutil, page rounding arithmetic
moved to usermem).

Updates #1476.

PiperOrigin-RevId: 312345090
This commit is contained in:
Dean Deng 2020-05-19 13:45:23 -07:00 committed by gVisor bot
parent 064347afdf
commit 05c89af6ed
9 changed files with 255 additions and 93 deletions

View File

@ -18,6 +18,7 @@ import (
"math"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sentry/usage"
)
// FrameRefSetFunctions implements segment.Functions for FrameRefSet.
@ -49,3 +50,42 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.
// Split implements segment.Functions.Split. Both halves of a split segment
// keep the original value (here, a reference count).
func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
	return val, val
}
// IncRefAndAccount adds a reference on the range fr. All newly inserted segments
// are accounted as host page cache memory mappings.
func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) {
	seg, gap := refs.Find(fr.Start)
	for {
		switch {
		case seg.Ok() && seg.Start() < fr.End:
			// An existing segment overlaps fr: bump the refcount on exactly
			// the overlapping portion.
			seg = refs.Isolate(seg, fr)
			seg.SetValue(seg.Value() + 1)
			seg, gap = seg.NextNonEmpty()
		case gap.Ok() && gap.Start() < fr.End:
			// A gap overlaps fr: these frames were previously unreferenced,
			// so account them as newly mapped and insert a segment with a
			// refcount of 1.
			newRange := gap.Range().Intersect(fr)
			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
			seg, gap = refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
		default:
			// We've advanced past fr.End: restore the set's merge invariant
			// around fr and stop.
			refs.MergeAdjacent(fr)
			return
		}
	}
}
// DecRefAndAccount removes a reference on the range fr and untracks segments
// that are removed from memory accounting.
func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) {
	seg := refs.FindSegment(fr.Start)
	for seg.Ok() && seg.Start() < fr.End {
		// Restrict the decrement to the portion of the segment inside fr.
		seg = refs.Isolate(seg, fr)
		if old := seg.Value(); old == 1 {
			// Dropping the last reference: unaccount these frames and remove
			// the segment entirely.
			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
			seg = refs.Remove(seg).NextSegment()
		} else {
			seg.SetValue(old - 1)
			seg = seg.NextSegment()
		}
	}
	// Re-merge neighbors whose values may now be equal after decrementing.
	refs.MergeAdjacent(fr)
}

View File

@ -36,7 +36,6 @@ go_library(
"gofer.go",
"handle.go",
"p9file.go",
"pagemath.go",
"regular_file.go",
"socket.go",
"special_file.go",

View File

@ -928,8 +928,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
// so we can't race with Write or another truncate.)
d.dataMu.Unlock()
if d.size < oldSize {
oldpgend := pageRoundUp(oldSize)
newpgend := pageRoundUp(d.size)
oldpgend, _ := usermem.PageRoundUp(oldSize)
newpgend, _ := usermem.PageRoundUp(d.size)
if oldpgend != newpgend {
d.mapsMu.Lock()
d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{

View File

@ -1,31 +0,0 @@
// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package gofer
import (
"gvisor.dev/gvisor/pkg/usermem"
)
// These are equivalent to usermem.Addr.RoundDown/Up, but without the
// potentially truncating conversion to usermem.Addr. This is necessary because
// there is no way to define generic "PageRoundDown/Up" functions in Go.

// pageRoundDown returns x rounded down to the nearest page boundary.
func pageRoundDown(x uint64) uint64 {
	return x &^ (usermem.PageSize - 1)
}
func pageRoundUp(x uint64) uint64 {
return pageRoundDown(x + usermem.PageSize - 1)
}

View File

@ -148,9 +148,9 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
return 0, err
}
// Remove touched pages from the cache.
pgstart := pageRoundDown(uint64(offset))
pgend := pageRoundUp(uint64(offset + src.NumBytes()))
if pgend < pgstart {
pgstart := usermem.PageRoundDown(uint64(offset))
pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes()))
if !ok {
return 0, syserror.EINVAL
}
mr := memmap.MappableRange{pgstart, pgend}
@ -306,9 +306,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
if fillCache {
// Read into the cache, then re-enter the loop to read from the
// cache.
gapEnd, _ := usermem.PageRoundUp(gapMR.End)
reqMR := memmap.MappableRange{
Start: pageRoundDown(gapMR.Start),
End: pageRoundUp(gapMR.End),
Start: usermem.PageRoundDown(gapMR.Start),
End: gapEnd,
}
optMR := gap.Range()
err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt)
@ -671,7 +672,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab
// Constrain translations to d.size (rounded up) to prevent translation to
// pages that may be concurrently truncated.
pgend := pageRoundUp(d.size)
pgend, _ := usermem.PageRoundUp(d.size)
var beyondEOF bool
if required.End > pgend {
if required.Start >= pgend {
@ -818,43 +819,15 @@ type dentryPlatformFile struct {
// IncRef implements platform.File.IncRef.
func (d *dentryPlatformFile) IncRef(fr platform.FileRange) {
	d.dataMu.Lock()
	// NOTE(review): this diff view renders both the pre-refactor inline
	// reference-counting loop (below) and its replacement call; in the
	// refactored code only the IncRefAndAccount call remains.
	seg, gap := d.fdRefs.Find(fr.Start)
	for {
		switch {
		case seg.Ok() && seg.Start() < fr.End:
			seg = d.fdRefs.Isolate(seg, fr)
			seg.SetValue(seg.Value() + 1)
			seg, gap = seg.NextNonEmpty()
		case gap.Ok() && gap.Start() < fr.End:
			newRange := gap.Range().Intersect(fr)
			usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
			seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
		default:
			d.fdRefs.MergeAdjacent(fr)
			d.dataMu.Unlock()
			return
		}
	}
	// Refactored path: delegate refcounting and accounting to the shared
	// FrameRefSet helper.
	d.fdRefs.IncRefAndAccount(fr)
	d.dataMu.Unlock()
}
// DecRef implements platform.File.DecRef.
func (d *dentryPlatformFile) DecRef(fr platform.FileRange) {
	d.dataMu.Lock()
	// NOTE(review): this diff view renders both the pre-refactor inline
	// reference-dropping loop (below) and its replacement call; in the
	// refactored code only the DecRefAndAccount call remains.
	seg := d.fdRefs.FindSegment(fr.Start)
	for seg.Ok() && seg.Start() < fr.End {
		seg = d.fdRefs.Isolate(seg, fr)
		if old := seg.Value(); old == 1 {
			usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
			seg = d.fdRefs.Remove(seg).NextSegment()
		} else {
			seg.SetValue(old - 1)
			seg = seg.NextSegment()
		}
	}
	d.fdRefs.MergeAdjacent(fr)
	// Refactored path: delegate refcounting and accounting to the shared
	// FrameRefSet helper.
	d.fdRefs.DecRefAndAccount(fr)
	d.dataMu.Unlock()
}
// MapInternal implements platform.File.MapInternal.

View File

@ -8,6 +8,7 @@ go_library(
"control.go",
"host.go",
"ioctl_unsafe.go",
"mmap.go",
"socket.go",
"socket_iovec.go",
"socket_unsafe.go",
@ -23,12 +24,15 @@ go_library(
"//pkg/fspath",
"//pkg/log",
"//pkg/refs",
"//pkg/safemem",
"//pkg/sentry/arch",
"//pkg/sentry/fs/fsutil",
"//pkg/sentry/fsimpl/kernfs",
"//pkg/sentry/hostfd",
"//pkg/sentry/kernel",
"//pkg/sentry/kernel/auth",
"//pkg/sentry/memmap",
"//pkg/sentry/platform",
"//pkg/sentry/socket/control",
"//pkg/sentry/socket/unix",
"//pkg/sentry/socket/unix/transport",

View File

@ -86,15 +86,16 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
i := &inode{
hostFD: hostFD,
seekable: seekable,
isTTY: opts.IsTTY,
canMap: canMap(uint32(fileType)),
wouldBlock: wouldBlock(uint32(fileType)),
ino: fs.NextIno(),
isTTY: opts.IsTTY,
wouldBlock: wouldBlock(uint32(fileType)),
seekable: seekable,
// For simplicity, set offset to 0. Technically, we should use the existing
// offset on the host if the file is seekable.
offset: 0,
canMap: canMap(uint32(fileType)),
}
i.pf.inode = i
// Non-seekable files can't be memory mapped, assert this.
if !i.seekable && i.canMap {
@ -189,11 +190,15 @@ type inode struct {
// This field is initialized at creation time and is immutable.
hostFD int
// wouldBlock is true if the host FD would return EWOULDBLOCK for
// operations that would block.
// ino is an inode number unique within this filesystem.
//
// This field is initialized at creation time and is immutable.
wouldBlock bool
ino uint64
// isTTY is true if this file represents a TTY.
//
// This field is initialized at creation time and is immutable.
isTTY bool
// seekable is false if the host fd points to a file representing a stream,
// e.g. a socket or a pipe. Such files are not seekable and can return
@ -202,29 +207,36 @@ type inode struct {
// This field is initialized at creation time and is immutable.
seekable bool
// isTTY is true if this file represents a TTY.
// offsetMu protects offset.
offsetMu sync.Mutex
// offset specifies the current file offset. It is only meaningful when
// seekable is true.
offset int64
// wouldBlock is true if the host FD would return EWOULDBLOCK for
// operations that would block.
//
// This field is initialized at creation time and is immutable.
isTTY bool
wouldBlock bool
// Event queue for blocking operations.
queue waiter.Queue
// canMap specifies whether we allow the file to be memory mapped.
//
// This field is initialized at creation time and is immutable.
canMap bool
// ino is an inode number unique within this filesystem.
//
// This field is initialized at creation time and is immutable.
ino uint64
// mapsMu protects mappings.
mapsMu sync.Mutex
// offsetMu protects offset.
offsetMu sync.Mutex
// If canMap is true, mappings tracks mappings of hostFD into
// memmap.MappingSpaces.
mappings memmap.MappingSet
// offset specifies the current file offset.
offset int64
// Event queue for blocking operations.
queue waiter.Queue
// pf implements platform.File for mappings of hostFD.
pf inodePlatformFile
}
// CheckPermissions implements kernfs.Inode.
@ -388,6 +400,21 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
return err
}
oldSize := uint64(hostStat.Size)
if s.Size < oldSize {
oldpgend, _ := usermem.PageRoundUp(oldSize)
newpgend, _ := usermem.PageRoundUp(s.Size)
if oldpgend != newpgend {
i.mapsMu.Lock()
i.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
// Compare Linux's mm/truncate.c:truncate_setsize() =>
// truncate_pagecache() =>
// mm/memory.c:unmap_mapping_range(evencows=1).
InvalidatePrivate: true,
})
i.mapsMu.Unlock()
}
}
}
if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
ts := [2]syscall.Timespec{
@ -666,8 +693,9 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts
if !f.inode.canMap {
return syserror.ENODEV
}
// TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface.
return syserror.ENODEV
i := f.inode
i.pf.fileMapperInitOnce.Do(i.pf.fileMapper.Init)
return vfs.GenericConfigureMMap(&f.vfsfd, i, opts)
}
// EventRegister implements waiter.Waitable.EventRegister.

View File

@ -0,0 +1,132 @@
// Copyright 2020 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package host
import (
"gvisor.dev/gvisor/pkg/context"
"gvisor.dev/gvisor/pkg/safemem"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/platform"
"gvisor.dev/gvisor/pkg/sync"
"gvisor.dev/gvisor/pkg/usermem"
)
// inodePlatformFile implements platform.File. It exists solely because inode
// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef.
//
// inodePlatformFile should only be used if inode.canMap is true.
type inodePlatformFile struct {
	// The inode is embedded so that inodePlatformFile shares the inode's
	// hostFD and other state.
	*inode

	// fdRefsMu protects fdRefs.
	fdRefsMu sync.Mutex

	// fdRefs counts references on platform.File offsets. It is used solely for
	// memory accounting.
	fdRefs fsutil.FrameRefSet

	// fileMapper caches mappings of the host file represented by this inode.
	fileMapper fsutil.HostFileMapper

	// fileMapperInitOnce is used to lazily initialize fileMapper.
	fileMapperInitOnce sync.Once
}
// IncRef implements platform.File.IncRef.
//
// Precondition: i.inode.canMap must be true.
func (i *inodePlatformFile) IncRef(fr platform.FileRange) {
	i.fdRefsMu.Lock()
	defer i.fdRefsMu.Unlock()
	i.fdRefs.IncRefAndAccount(fr)
}
// DecRef implements platform.File.DecRef.
//
// Precondition: i.inode.canMap must be true.
func (i *inodePlatformFile) DecRef(fr platform.FileRange) {
	i.fdRefsMu.Lock()
	defer i.fdRefsMu.Unlock()
	i.fdRefs.DecRefAndAccount(fr)
}
// MapInternal implements platform.File.MapInternal.
//
// Precondition: i.inode.canMap must be true.
func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
	// The internal mapping is made writable iff the requested access
	// includes write permission.
	writable := at.Write
	return i.fileMapper.MapInternal(fr, i.hostFD, writable)
}
// FD implements platform.File.FD.
func (i *inodePlatformFile) FD() int {
	// The platform file is backed directly by the embedded inode's host fd.
	return i.inode.hostFD
}
// AddMapping implements memmap.Mappable.AddMapping.
//
// Precondition: i.inode.canMap must be true.
func (i *inode) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
	i.mapsMu.Lock()
	defer i.mapsMu.Unlock()
	// Take a fileMapper reference on each mappable range that became newly
	// mapped as a result of this mapping.
	for _, mappedRange := range i.mappings.AddMapping(ms, ar, offset, writable) {
		i.pf.fileMapper.IncRefOn(mappedRange)
	}
	return nil
}
// RemoveMapping implements memmap.Mappable.RemoveMapping.
//
// Precondition: i.inode.canMap must be true.
func (i *inode) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
	i.mapsMu.Lock()
	defer i.mapsMu.Unlock()
	// Drop the fileMapper reference on each mappable range that is no longer
	// mapped by anyone after this removal.
	for _, unmappedRange := range i.mappings.RemoveMapping(ms, ar, offset, writable) {
		i.pf.fileMapper.DecRefOn(unmappedRange)
	}
}
// CopyMapping implements memmap.Mappable.CopyMapping.
//
// Precondition: i.inode.canMap must be true.
func (i *inode) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
	// Copying a mapping is handled identically to adding a fresh mapping at
	// the destination range; srcAR is not needed.
	err := i.AddMapping(ctx, ms, dstAR, offset, writable)
	return err
}
// Translate implements memmap.Mappable.Translate.
//
// Precondition: i.inode.canMap must be true.
func (i *inode) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
	// Serve the entire optional range with a single translation: mappable
	// offsets correspond 1:1 to offsets in the platform file backed by the
	// host fd.
	t := memmap.Translation{
		Source: optional,
		File:   &i.pf,
		Offset: optional.Start,
		Perms:  usermem.AnyAccess,
	}
	return []memmap.Translation{t}, nil
}
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
//
// Precondition: i.inode.canMap must be true.
func (i *inode) InvalidateUnsavable(ctx context.Context) error {
	// Nothing to invalidate: the same host fd is expected to be present
	// across save/restore, so existing translations remain valid.
	return nil
}

View File

@ -106,3 +106,20 @@ func (ar AddrRange) IsPageAligned() bool {
// String implements fmt.Stringer.String, formatting the range as a half-open
// hexadecimal interval, e.g. "[0x1000, 0x2000)".
func (ar AddrRange) String() string {
	formatted := fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End)
	return formatted
}
// PageRoundDown/Up mirror Addr.RoundDown/Up but operate on uint64 directly,
// avoiding the potentially truncating conversion to Addr; Go offers no way to
// define these generically over both types.

// PageRoundDown returns x rounded down to the nearest page boundary.
func PageRoundDown(x uint64) uint64 {
	// Clear the page-offset bits.
	mask := uint64(PageSize) - 1
	return x &^ mask
}
// PageRoundUp returns x rounded up to the nearest page boundary. The second
// result is false iff the addition wrapped around the top of the uint64
// range, in which case the first result is not meaningful.
func PageRoundUp(x uint64) (uint64, bool) {
	rounded := PageRoundDown(x + PageSize - 1)
	// Wraparound occurred iff the rounded value ended up below the input.
	return rounded, rounded >= x
}