Implement mmap for host fs in vfs2.
In VFS1, both fs/host and fs/gofer used the same utils for host file mappings. Refactor parts of fsimpl/gofer to create similar utils to share with fsimpl/host (memory accounting code moved to fsutil, page rounding arithmetic moved to usermem). Updates #1476. PiperOrigin-RevId: 312345090
This commit is contained in:
parent
064347afdf
commit
05c89af6ed
|
@ -18,6 +18,7 @@ import (
|
|||
"math"
|
||||
|
||||
"gvisor.dev/gvisor/pkg/sentry/platform"
|
||||
"gvisor.dev/gvisor/pkg/sentry/usage"
|
||||
)
|
||||
|
||||
// FrameRefSetFunctions implements segment.Functions for FrameRefSet.
|
||||
|
@ -49,3 +50,42 @@ func (FrameRefSetFunctions) Merge(_ platform.FileRange, val1 uint64, _ platform.
|
|||
func (FrameRefSetFunctions) Split(_ platform.FileRange, val uint64, _ uint64) (uint64, uint64) {
|
||||
return val, val
|
||||
}
|
||||
|
||||
// IncRefAndAccount adds a reference on the range fr. All newly inserted segments
|
||||
// are accounted as host page cache memory mappings.
|
||||
func (refs *FrameRefSet) IncRefAndAccount(fr platform.FileRange) {
|
||||
seg, gap := refs.Find(fr.Start)
|
||||
for {
|
||||
switch {
|
||||
case seg.Ok() && seg.Start() < fr.End:
|
||||
seg = refs.Isolate(seg, fr)
|
||||
seg.SetValue(seg.Value() + 1)
|
||||
seg, gap = seg.NextNonEmpty()
|
||||
case gap.Ok() && gap.Start() < fr.End:
|
||||
newRange := gap.Range().Intersect(fr)
|
||||
usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
|
||||
seg, gap = refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
|
||||
default:
|
||||
refs.MergeAdjacent(fr)
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// DecRefAndAccount removes a reference on the range fr and untracks segments
|
||||
// that are removed from memory accounting.
|
||||
func (refs *FrameRefSet) DecRefAndAccount(fr platform.FileRange) {
|
||||
seg := refs.FindSegment(fr.Start)
|
||||
|
||||
for seg.Ok() && seg.Start() < fr.End {
|
||||
seg = refs.Isolate(seg, fr)
|
||||
if old := seg.Value(); old == 1 {
|
||||
usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
|
||||
seg = refs.Remove(seg).NextSegment()
|
||||
} else {
|
||||
seg.SetValue(old - 1)
|
||||
seg = seg.NextSegment()
|
||||
}
|
||||
}
|
||||
refs.MergeAdjacent(fr)
|
||||
}
|
||||
|
|
|
@ -36,7 +36,6 @@ go_library(
|
|||
"gofer.go",
|
||||
"handle.go",
|
||||
"p9file.go",
|
||||
"pagemath.go",
|
||||
"regular_file.go",
|
||||
"socket.go",
|
||||
"special_file.go",
|
||||
|
|
|
@ -928,8 +928,8 @@ func (d *dentry) setStat(ctx context.Context, creds *auth.Credentials, stat *lin
|
|||
// so we can't race with Write or another truncate.)
|
||||
d.dataMu.Unlock()
|
||||
if d.size < oldSize {
|
||||
oldpgend := pageRoundUp(oldSize)
|
||||
newpgend := pageRoundUp(d.size)
|
||||
oldpgend, _ := usermem.PageRoundUp(oldSize)
|
||||
newpgend, _ := usermem.PageRoundUp(d.size)
|
||||
if oldpgend != newpgend {
|
||||
d.mapsMu.Lock()
|
||||
d.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
|
||||
|
|
|
@ -1,31 +0,0 @@
|
|||
// Copyright 2019 The gVisor Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package gofer
|
||||
|
||||
import (
|
||||
"gvisor.dev/gvisor/pkg/usermem"
|
||||
)
|
||||
|
||||
// These are equivalent to usermem.Addr.RoundDown/Up, but without the
|
||||
// potentially truncating conversion to usermem.Addr. This is necessary because
|
||||
// there is no way to define generic "PageRoundDown/Up" functions in Go.
|
||||
|
||||
func pageRoundDown(x uint64) uint64 {
|
||||
return x &^ (usermem.PageSize - 1)
|
||||
}
|
||||
|
||||
func pageRoundUp(x uint64) uint64 {
|
||||
return pageRoundDown(x + usermem.PageSize - 1)
|
||||
}
|
|
@ -148,9 +148,9 @@ func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, off
|
|||
return 0, err
|
||||
}
|
||||
// Remove touched pages from the cache.
|
||||
pgstart := pageRoundDown(uint64(offset))
|
||||
pgend := pageRoundUp(uint64(offset + src.NumBytes()))
|
||||
if pgend < pgstart {
|
||||
pgstart := usermem.PageRoundDown(uint64(offset))
|
||||
pgend, ok := usermem.PageRoundUp(uint64(offset + src.NumBytes()))
|
||||
if !ok {
|
||||
return 0, syserror.EINVAL
|
||||
}
|
||||
mr := memmap.MappableRange{pgstart, pgend}
|
||||
|
@ -306,9 +306,10 @@ func (rw *dentryReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error)
|
|||
if fillCache {
|
||||
// Read into the cache, then re-enter the loop to read from the
|
||||
// cache.
|
||||
gapEnd, _ := usermem.PageRoundUp(gapMR.End)
|
||||
reqMR := memmap.MappableRange{
|
||||
Start: pageRoundDown(gapMR.Start),
|
||||
End: pageRoundUp(gapMR.End),
|
||||
Start: usermem.PageRoundDown(gapMR.Start),
|
||||
End: gapEnd,
|
||||
}
|
||||
optMR := gap.Range()
|
||||
err := rw.d.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mf, usage.PageCache, rw.d.handle.readToBlocksAt)
|
||||
|
@ -671,7 +672,7 @@ func (d *dentry) Translate(ctx context.Context, required, optional memmap.Mappab
|
|||
|
||||
// Constrain translations to d.size (rounded up) to prevent translation to
|
||||
// pages that may be concurrently truncated.
|
||||
pgend := pageRoundUp(d.size)
|
||||
pgend, _ := usermem.PageRoundUp(d.size)
|
||||
var beyondEOF bool
|
||||
if required.End > pgend {
|
||||
if required.Start >= pgend {
|
||||
|
@ -818,43 +819,15 @@ type dentryPlatformFile struct {
|
|||
// IncRef implements platform.File.IncRef.
|
||||
func (d *dentryPlatformFile) IncRef(fr platform.FileRange) {
|
||||
d.dataMu.Lock()
|
||||
seg, gap := d.fdRefs.Find(fr.Start)
|
||||
for {
|
||||
switch {
|
||||
case seg.Ok() && seg.Start() < fr.End:
|
||||
seg = d.fdRefs.Isolate(seg, fr)
|
||||
seg.SetValue(seg.Value() + 1)
|
||||
seg, gap = seg.NextNonEmpty()
|
||||
case gap.Ok() && gap.Start() < fr.End:
|
||||
newRange := gap.Range().Intersect(fr)
|
||||
usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
|
||||
seg, gap = d.fdRefs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
|
||||
default:
|
||||
d.fdRefs.MergeAdjacent(fr)
|
||||
d.dataMu.Unlock()
|
||||
return
|
||||
}
|
||||
}
|
||||
d.fdRefs.IncRefAndAccount(fr)
|
||||
d.dataMu.Unlock()
|
||||
}
|
||||
|
||||
// DecRef implements platform.File.DecRef.
|
||||
func (d *dentryPlatformFile) DecRef(fr platform.FileRange) {
|
||||
d.dataMu.Lock()
|
||||
seg := d.fdRefs.FindSegment(fr.Start)
|
||||
|
||||
for seg.Ok() && seg.Start() < fr.End {
|
||||
seg = d.fdRefs.Isolate(seg, fr)
|
||||
if old := seg.Value(); old == 1 {
|
||||
usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
|
||||
seg = d.fdRefs.Remove(seg).NextSegment()
|
||||
} else {
|
||||
seg.SetValue(old - 1)
|
||||
seg = seg.NextSegment()
|
||||
}
|
||||
}
|
||||
d.fdRefs.MergeAdjacent(fr)
|
||||
d.fdRefs.DecRefAndAccount(fr)
|
||||
d.dataMu.Unlock()
|
||||
|
||||
}
|
||||
|
||||
// MapInternal implements platform.File.MapInternal.
|
||||
|
|
|
@ -8,6 +8,7 @@ go_library(
|
|||
"control.go",
|
||||
"host.go",
|
||||
"ioctl_unsafe.go",
|
||||
"mmap.go",
|
||||
"socket.go",
|
||||
"socket_iovec.go",
|
||||
"socket_unsafe.go",
|
||||
|
@ -23,12 +24,15 @@ go_library(
|
|||
"//pkg/fspath",
|
||||
"//pkg/log",
|
||||
"//pkg/refs",
|
||||
"//pkg/safemem",
|
||||
"//pkg/sentry/arch",
|
||||
"//pkg/sentry/fs/fsutil",
|
||||
"//pkg/sentry/fsimpl/kernfs",
|
||||
"//pkg/sentry/hostfd",
|
||||
"//pkg/sentry/kernel",
|
||||
"//pkg/sentry/kernel/auth",
|
||||
"//pkg/sentry/memmap",
|
||||
"//pkg/sentry/platform",
|
||||
"//pkg/sentry/socket/control",
|
||||
"//pkg/sentry/socket/unix",
|
||||
"//pkg/sentry/socket/unix/transport",
|
||||
|
|
|
@ -86,15 +86,16 @@ func NewFD(ctx context.Context, mnt *vfs.Mount, hostFD int, opts *NewFDOptions)
|
|||
|
||||
i := &inode{
|
||||
hostFD: hostFD,
|
||||
seekable: seekable,
|
||||
isTTY: opts.IsTTY,
|
||||
canMap: canMap(uint32(fileType)),
|
||||
wouldBlock: wouldBlock(uint32(fileType)),
|
||||
ino: fs.NextIno(),
|
||||
isTTY: opts.IsTTY,
|
||||
wouldBlock: wouldBlock(uint32(fileType)),
|
||||
seekable: seekable,
|
||||
// For simplicity, set offset to 0. Technically, we should use the existing
|
||||
// offset on the host if the file is seekable.
|
||||
offset: 0,
|
||||
canMap: canMap(uint32(fileType)),
|
||||
}
|
||||
i.pf.inode = i
|
||||
|
||||
// Non-seekable files can't be memory mapped, assert this.
|
||||
if !i.seekable && i.canMap {
|
||||
|
@ -189,11 +190,15 @@ type inode struct {
|
|||
// This field is initialized at creation time and is immutable.
|
||||
hostFD int
|
||||
|
||||
// wouldBlock is true if the host FD would return EWOULDBLOCK for
|
||||
// operations that would block.
|
||||
// ino is an inode number unique within this filesystem.
|
||||
//
|
||||
// This field is initialized at creation time and is immutable.
|
||||
wouldBlock bool
|
||||
ino uint64
|
||||
|
||||
// isTTY is true if this file represents a TTY.
|
||||
//
|
||||
// This field is initialized at creation time and is immutable.
|
||||
isTTY bool
|
||||
|
||||
// seekable is false if the host fd points to a file representing a stream,
|
||||
// e.g. a socket or a pipe. Such files are not seekable and can return
|
||||
|
@ -202,29 +207,36 @@ type inode struct {
|
|||
// This field is initialized at creation time and is immutable.
|
||||
seekable bool
|
||||
|
||||
// isTTY is true if this file represents a TTY.
|
||||
// offsetMu protects offset.
|
||||
offsetMu sync.Mutex
|
||||
|
||||
// offset specifies the current file offset. It is only meaningful when
|
||||
// seekable is true.
|
||||
offset int64
|
||||
|
||||
// wouldBlock is true if the host FD would return EWOULDBLOCK for
|
||||
// operations that would block.
|
||||
//
|
||||
// This field is initialized at creation time and is immutable.
|
||||
isTTY bool
|
||||
wouldBlock bool
|
||||
|
||||
// Event queue for blocking operations.
|
||||
queue waiter.Queue
|
||||
|
||||
// canMap specifies whether we allow the file to be memory mapped.
|
||||
//
|
||||
// This field is initialized at creation time and is immutable.
|
||||
canMap bool
|
||||
|
||||
// ino is an inode number unique within this filesystem.
|
||||
//
|
||||
// This field is initialized at creation time and is immutable.
|
||||
ino uint64
|
||||
// mapsMu protects mappings.
|
||||
mapsMu sync.Mutex
|
||||
|
||||
// offsetMu protects offset.
|
||||
offsetMu sync.Mutex
|
||||
// If canMap is true, mappings tracks mappings of hostFD into
|
||||
// memmap.MappingSpaces.
|
||||
mappings memmap.MappingSet
|
||||
|
||||
// offset specifies the current file offset.
|
||||
offset int64
|
||||
|
||||
// Event queue for blocking operations.
|
||||
queue waiter.Queue
|
||||
// pf implements platform.File for mappings of hostFD.
|
||||
pf inodePlatformFile
|
||||
}
|
||||
|
||||
// CheckPermissions implements kernfs.Inode.
|
||||
|
@ -388,6 +400,21 @@ func (i *inode) SetStat(ctx context.Context, fs *vfs.Filesystem, creds *auth.Cre
|
|||
if err := syscall.Ftruncate(i.hostFD, int64(s.Size)); err != nil {
|
||||
return err
|
||||
}
|
||||
oldSize := uint64(hostStat.Size)
|
||||
if s.Size < oldSize {
|
||||
oldpgend, _ := usermem.PageRoundUp(oldSize)
|
||||
newpgend, _ := usermem.PageRoundUp(s.Size)
|
||||
if oldpgend != newpgend {
|
||||
i.mapsMu.Lock()
|
||||
i.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
|
||||
// Compare Linux's mm/truncate.c:truncate_setsize() =>
|
||||
// truncate_pagecache() =>
|
||||
// mm/memory.c:unmap_mapping_range(evencows=1).
|
||||
InvalidatePrivate: true,
|
||||
})
|
||||
i.mapsMu.Unlock()
|
||||
}
|
||||
}
|
||||
}
|
||||
if m&(linux.STATX_ATIME|linux.STATX_MTIME) != 0 {
|
||||
ts := [2]syscall.Timespec{
|
||||
|
@ -666,8 +693,9 @@ func (f *fileDescription) ConfigureMMap(_ context.Context, opts *memmap.MMapOpts
|
|||
if !f.inode.canMap {
|
||||
return syserror.ENODEV
|
||||
}
|
||||
// TODO(gvisor.dev/issue/1672): Implement ConfigureMMap and Mappable interface.
|
||||
return syserror.ENODEV
|
||||
i := f.inode
|
||||
i.pf.fileMapperInitOnce.Do(i.pf.fileMapper.Init)
|
||||
return vfs.GenericConfigureMMap(&f.vfsfd, i, opts)
|
||||
}
|
||||
|
||||
// EventRegister implements waiter.Waitable.EventRegister.
|
||||
|
|
|
@ -0,0 +1,132 @@
|
|||
// Copyright 2020 The gVisor Authors.
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package host
|
||||
|
||||
import (
|
||||
"gvisor.dev/gvisor/pkg/context"
|
||||
"gvisor.dev/gvisor/pkg/safemem"
|
||||
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
|
||||
"gvisor.dev/gvisor/pkg/sentry/memmap"
|
||||
"gvisor.dev/gvisor/pkg/sentry/platform"
|
||||
"gvisor.dev/gvisor/pkg/sync"
|
||||
"gvisor.dev/gvisor/pkg/usermem"
|
||||
)
|
||||
|
||||
// inodePlatformFile implements platform.File. It exists solely because inode
|
||||
// cannot implement both kernfs.Inode.IncRef and platform.File.IncRef.
|
||||
//
|
||||
// inodePlatformFile should only be used if inode.canMap is true.
|
||||
type inodePlatformFile struct {
|
||||
*inode
|
||||
|
||||
// fdRefsMu protects fdRefs.
|
||||
fdRefsMu sync.Mutex
|
||||
|
||||
// fdRefs counts references on platform.File offsets. It is used solely for
|
||||
// memory accounting.
|
||||
fdRefs fsutil.FrameRefSet
|
||||
|
||||
// fileMapper caches mappings of the host file represented by this inode.
|
||||
fileMapper fsutil.HostFileMapper
|
||||
|
||||
// fileMapperInitOnce is used to lazily initialize fileMapper.
|
||||
fileMapperInitOnce sync.Once
|
||||
}
|
||||
|
||||
// IncRef implements platform.File.IncRef.
|
||||
//
|
||||
// Precondition: i.inode.canMap must be true.
|
||||
func (i *inodePlatformFile) IncRef(fr platform.FileRange) {
|
||||
i.fdRefsMu.Lock()
|
||||
i.fdRefs.IncRefAndAccount(fr)
|
||||
i.fdRefsMu.Unlock()
|
||||
}
|
||||
|
||||
// DecRef implements platform.File.DecRef.
|
||||
//
|
||||
// Precondition: i.inode.canMap must be true.
|
||||
func (i *inodePlatformFile) DecRef(fr platform.FileRange) {
|
||||
i.fdRefsMu.Lock()
|
||||
i.fdRefs.DecRefAndAccount(fr)
|
||||
i.fdRefsMu.Unlock()
|
||||
}
|
||||
|
||||
// MapInternal implements platform.File.MapInternal.
|
||||
//
|
||||
// Precondition: i.inode.canMap must be true.
|
||||
func (i *inodePlatformFile) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
|
||||
return i.fileMapper.MapInternal(fr, i.hostFD, at.Write)
|
||||
}
|
||||
|
||||
// FD implements platform.File.FD.
|
||||
func (i *inodePlatformFile) FD() int {
|
||||
return i.hostFD
|
||||
}
|
||||
|
||||
// AddMapping implements memmap.Mappable.AddMapping.
|
||||
//
|
||||
// Precondition: i.inode.canMap must be true.
|
||||
func (i *inode) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
|
||||
i.mapsMu.Lock()
|
||||
mapped := i.mappings.AddMapping(ms, ar, offset, writable)
|
||||
for _, r := range mapped {
|
||||
i.pf.fileMapper.IncRefOn(r)
|
||||
}
|
||||
i.mapsMu.Unlock()
|
||||
return nil
|
||||
}
|
||||
|
||||
// RemoveMapping implements memmap.Mappable.RemoveMapping.
|
||||
//
|
||||
// Precondition: i.inode.canMap must be true.
|
||||
func (i *inode) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
|
||||
i.mapsMu.Lock()
|
||||
unmapped := i.mappings.RemoveMapping(ms, ar, offset, writable)
|
||||
for _, r := range unmapped {
|
||||
i.pf.fileMapper.DecRefOn(r)
|
||||
}
|
||||
i.mapsMu.Unlock()
|
||||
}
|
||||
|
||||
// CopyMapping implements memmap.Mappable.CopyMapping.
|
||||
//
|
||||
// Precondition: i.inode.canMap must be true.
|
||||
func (i *inode) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
|
||||
return i.AddMapping(ctx, ms, dstAR, offset, writable)
|
||||
}
|
||||
|
||||
// Translate implements memmap.Mappable.Translate.
|
||||
//
|
||||
// Precondition: i.inode.canMap must be true.
|
||||
func (i *inode) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
|
||||
mr := optional
|
||||
return []memmap.Translation{
|
||||
{
|
||||
Source: mr,
|
||||
File: &i.pf,
|
||||
Offset: mr.Start,
|
||||
Perms: usermem.AnyAccess,
|
||||
},
|
||||
}, nil
|
||||
}
|
||||
|
||||
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
|
||||
//
|
||||
// Precondition: i.inode.canMap must be true.
|
||||
func (i *inode) InvalidateUnsavable(ctx context.Context) error {
|
||||
// We expect the same host fd across save/restore, so all translations
|
||||
// should be valid.
|
||||
return nil
|
||||
}
|
|
@ -106,3 +106,20 @@ func (ar AddrRange) IsPageAligned() bool {
|
|||
func (ar AddrRange) String() string {
|
||||
return fmt.Sprintf("[%#x, %#x)", ar.Start, ar.End)
|
||||
}
|
||||
|
||||
// PageRoundDown/Up are equivalent to Addr.RoundDown/Up, but without the
|
||||
// potentially truncating conversion from uint64 to Addr. This is necessary
|
||||
// because there is no way to define generic "PageRoundDown/Up" functions in Go.
|
||||
|
||||
// PageRoundDown returns x rounded down to the nearest page boundary.
|
||||
func PageRoundDown(x uint64) uint64 {
|
||||
return x &^ (PageSize - 1)
|
||||
}
|
||||
|
||||
// PageRoundUp returns x rounded up to the nearest page boundary.
|
||||
// ok is true iff rounding up did not wrap around.
|
||||
func PageRoundUp(x uint64) (addr uint64, ok bool) {
|
||||
addr = PageRoundDown(x + PageSize - 1)
|
||||
ok = addr >= x
|
||||
return
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue