// Copyright 2018 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package fsutil

import (
	"fmt"
	"io"
	"sync"

	"gvisor.dev/gvisor/pkg/log"
	"gvisor.dev/gvisor/pkg/sentry/context"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/kernel/time"
	ktime "gvisor.dev/gvisor/pkg/sentry/kernel/time"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/platform"
	"gvisor.dev/gvisor/pkg/sentry/safemem"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/sentry/usermem"
)

// Lock order (compare the lock order model in mm/mm.go):
//
// CachingInodeOperations.attrMu ("fs locks")
//   CachingInodeOperations.mapsMu ("memmap.Mappable locks not taken by Translate")
//     CachingInodeOperations.dataMu ("memmap.Mappable locks taken by Translate")
//       CachedFileObject locks
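//
// For example, Truncate below acquires attrMu and then dataMu, and must drop
// dataMu before taking mapsMu to invalidate mappings, since mapsMu precedes
// dataMu in this order.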

// CachingInodeOperations caches the metadata and content of a CachedFileObject.
// It implements a subset of InodeOperations. As a utility it can be used to
// implement the full set of InodeOperations. Generally it should not be
// embedded to avoid unexpected inherited behavior.
//
// CachingInodeOperations implements Mappable for the CachedFileObject:
//
// - If CachedFileObject.FD returns a value >= 0 then the file descriptor
//   will be memory mapped on the host.
//
// - Otherwise, the contents of CachedFileObject are buffered into memory
//   managed by the CachingInodeOperations.
//
// Implementations of FileOperations for a CachedFileObject must read and
// write through CachingInodeOperations using Read and Write respectively.
//
// Implementations of InodeOperations.WriteOut must call Sync to write out
// in-memory modifications of data and metadata to the CachedFileObject.
//
// +stateify savable
type CachingInodeOperations struct {
	// backingFile is a handle to a cached file object.
	backingFile CachedFileObject

	// mfp is used to allocate memory that caches backingFile's contents.
	mfp pgalloc.MemoryFileProvider

	// opts contains options. opts is immutable.
	opts CachingInodeOperationsOptions

	attrMu sync.Mutex `state:"nosave"`

	// attr is unstable cached metadata.
	//
	// attr is protected by attrMu. attr.Size is protected by both attrMu and
	// dataMu; reading it requires locking either mutex, while mutating it
	// requires locking both.
	attr fs.UnstableAttr

	// dirtyAttr is metadata that was updated in-place but hasn't yet
	// been successfully written out.
	//
	// dirtyAttr is protected by attrMu.
	dirtyAttr fs.AttrMask

	mapsMu sync.Mutex `state:"nosave"`

	// mappings tracks mappings of the cached file object into
	// memmap.MappingSpaces.
	//
	// mappings is protected by mapsMu.
	mappings memmap.MappingSet

	dataMu sync.RWMutex `state:"nosave"`

	// cache maps offsets into the cached file to offsets into
	// mfp.MemoryFile() that store the file's data.
	//
	// cache is protected by dataMu.
	cache FileRangeSet

	// dirty tracks dirty segments in cache.
	//
	// dirty is protected by dataMu.
	dirty DirtySet

	// hostFileMapper caches internal mappings of backingFile.FD().
	hostFileMapper *HostFileMapper

	// refs tracks active references to data in the cache.
	//
	// refs is protected by dataMu.
	refs frameRefSet
}

// CachingInodeOperationsOptions configures a CachingInodeOperations.
//
// +stateify savable
type CachingInodeOperationsOptions struct {
	// If ForcePageCache is true, use the sentry page cache even if a host file
	// descriptor is available.
	ForcePageCache bool

	// If LimitHostFDTranslation is true, apply maxFillRange() constraints to
	// host file descriptor mappings returned by
	// CachingInodeOperations.Translate().
	LimitHostFDTranslation bool
}

// CachedFileObject is a file that may require caching.
type CachedFileObject interface {
	// ReadToBlocksAt reads up to dsts.NumBytes() bytes from the file to dsts,
	// starting at offset, and returns the number of bytes read. ReadToBlocksAt
	// may return a partial read without an error.
	ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error)

	// WriteFromBlocksAt writes up to srcs.NumBytes() bytes from srcs to the
	// file, starting at offset, and returns the number of bytes written.
	// WriteFromBlocksAt may return a partial write without an error.
	WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error)

	// SetMaskedAttributes sets the attributes in attr that are true in mask
	// on the backing file.
	//
	// SetMaskedAttributes may be called at any point, regardless of whether
	// the file was opened.
	SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error

	// Allocate allows the caller to reserve disk space for the inode.
	// It's equivalent to fallocate(2) with 'mode=0'.
	Allocate(ctx context.Context, offset int64, length int64) error

	// Sync instructs the remote filesystem to sync the file to stable storage.
	Sync(ctx context.Context) error

	// FD returns a host file descriptor. If it is possible for
	// CachingInodeOperations.AddMapping to have ever been called with writable
	// = true, the FD must have been opened O_RDWR; otherwise, it may have been
	// opened O_RDONLY or O_RDWR. (mmap unconditionally requires that mapped
	// files are readable.) If no host file descriptor is available, FD returns
	// a negative number.
	//
	// For any given CachedFileObject, if FD() ever succeeds (returns a
	// non-negative number), it must always succeed.
	//
	// FD is called iff the file has been memory mapped. This implies that
	// the file was opened (see fs.InodeOperations.GetFile).
	FD() int
}
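
// exampleFileObject is an illustrative sketch, not part of gVisor: a minimal
// in-memory CachedFileObject with no host file descriptor, showing the
// contract that CachingInodeOperations expects. A real implementation would
// perform remote or host I/O in ReadToBlocksAt/WriteFromBlocksAt and
// propagate attribute changes in SetMaskedAttributes.
type exampleFileObject struct {
	data []byte
}

func (e *exampleFileObject) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
	if offset >= uint64(len(e.data)) {
		return 0, io.EOF
	}
	// CopySeq may copy fewer bytes than requested; a partial read without an
	// error is permitted by the interface.
	return safemem.CopySeq(dsts, safemem.BlockSeqOf(safemem.BlockFromSafeSlice(e.data[offset:])))
}

func (e *exampleFileObject) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
	if end := offset + srcs.NumBytes(); end > uint64(len(e.data)) {
		e.data = append(e.data, make([]byte, end-uint64(len(e.data)))...)
	}
	return safemem.CopySeq(safemem.BlockSeqOf(safemem.BlockFromSafeSlice(e.data[offset:])), srcs)
}

func (e *exampleFileObject) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error {
	return nil // an in-memory object has no backing attributes to update
}

func (e *exampleFileObject) Allocate(ctx context.Context, offset int64, length int64) error {
	if end := offset + length; end > int64(len(e.data)) {
		e.data = append(e.data, make([]byte, end-int64(len(e.data)))...)
	}
	return nil
}

func (e *exampleFileObject) Sync(ctx context.Context) error {
	return nil // nothing to flush
}

// FD returns a negative number: there is no host file descriptor, so
// CachingInodeOperations will buffer this file's contents in sentry memory.
func (e *exampleFileObject) FD() int {
	return -1
}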

// NewCachingInodeOperations returns a new CachingInodeOperations backed by
// a CachedFileObject and its initial unstable attributes.
func NewCachingInodeOperations(ctx context.Context, backingFile CachedFileObject, uattr fs.UnstableAttr, opts CachingInodeOperationsOptions) *CachingInodeOperations {
	mfp := pgalloc.MemoryFileProviderFromContext(ctx)
	if mfp == nil {
		panic(fmt.Sprintf("context.Context %T lacks non-nil value for key %T", ctx, pgalloc.CtxMemoryFileProvider))
	}
	return &CachingInodeOperations{
		backingFile:    backingFile,
		mfp:            mfp,
		opts:           opts,
		attr:           uattr,
		hostFileMapper: NewHostFileMapper(),
	}
}
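
// Continuing the sketch above: a filesystem would typically construct one
// CachingInodeOperations per inode and route all reads, writes, and memory
// mappings through it, e.g.
//
//	iops := NewCachingInodeOperations(ctx, &exampleFileObject{}, uattr,
//		CachingInodeOperationsOptions{})
//
// (exampleFileObject is the hypothetical type sketched earlier, not a real
// gVisor type.)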

// Release implements fs.InodeOperations.Release.
func (c *CachingInodeOperations) Release() {
	c.mapsMu.Lock()
	defer c.mapsMu.Unlock()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()

	// Something has gone terribly wrong if we're releasing an inode that is
	// still memory-mapped.
	if !c.mappings.IsEmpty() {
		panic(fmt.Sprintf("Releasing CachingInodeOperations with mappings:\n%s", &c.mappings))
	}

	// Drop any cached pages that are still awaiting MemoryFile eviction. (This
	// means that MemoryFile no longer needs to evict them.)
	mf := c.mfp.MemoryFile()
	mf.MarkAllUnevictable(c)
	if err := SyncDirtyAll(context.Background(), &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
		panic(fmt.Sprintf("Failed to writeback cached data: %v", err))
	}
	c.cache.DropAll(mf)
	c.dirty.RemoveAll()
}

// UnstableAttr implements fs.InodeOperations.UnstableAttr.
func (c *CachingInodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
	c.attrMu.Lock()
	attr := c.attr
	c.attrMu.Unlock()
	return attr, nil
}

// SetPermissions implements fs.InodeOperations.SetPermissions.
func (c *CachingInodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, perms fs.FilePermissions) bool {
	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	now := ktime.NowFromContext(ctx)
	masked := fs.AttrMask{Perms: true}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Perms: perms}); err != nil {
		return false
	}
	c.attr.Perms = perms
	c.touchStatusChangeTimeLocked(now)
	return true
}

// SetOwner implements fs.InodeOperations.SetOwner.
func (c *CachingInodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error {
	if !owner.UID.Ok() && !owner.GID.Ok() {
		return nil
	}

	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	now := ktime.NowFromContext(ctx)
	masked := fs.AttrMask{
		UID: owner.UID.Ok(),
		GID: owner.GID.Ok(),
	}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{Owner: owner}); err != nil {
		return err
	}
	if owner.UID.Ok() {
		c.attr.Owner.UID = owner.UID
	}
	if owner.GID.Ok() {
		c.attr.Owner.GID = owner.GID
	}
	c.touchStatusChangeTimeLocked(now)
	return nil
}

// SetTimestamps implements fs.InodeOperations.SetTimestamps.
func (c *CachingInodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
	if ts.ATimeOmit && ts.MTimeOmit {
		return nil
	}

	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	// Replace requests to use the "system time" with the current time to
	// ensure that cached timestamps remain consistent with the remote
	// filesystem.
	now := ktime.NowFromContext(ctx)
	if ts.ATimeSetSystemTime {
		ts.ATime = now
	}
	if ts.MTimeSetSystemTime {
		ts.MTime = now
	}
	masked := fs.AttrMask{
		AccessTime:       !ts.ATimeOmit,
		ModificationTime: !ts.MTimeOmit,
	}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, fs.UnstableAttr{AccessTime: ts.ATime, ModificationTime: ts.MTime}); err != nil {
		return err
	}
	if !ts.ATimeOmit {
		c.attr.AccessTime = ts.ATime
	}
	if !ts.MTimeOmit {
		c.attr.ModificationTime = ts.MTime
	}
	c.touchStatusChangeTimeLocked(now)
	return nil
}

// Truncate implements fs.InodeOperations.Truncate.
func (c *CachingInodeOperations) Truncate(ctx context.Context, inode *fs.Inode, size int64) error {
	c.attrMu.Lock()
	defer c.attrMu.Unlock()

	// c.attr.Size is protected by both c.attrMu and c.dataMu.
	c.dataMu.Lock()
	now := ktime.NowFromContext(ctx)
	masked := fs.AttrMask{Size: true}
	attr := fs.UnstableAttr{Size: size}
	if err := c.backingFile.SetMaskedAttributes(ctx, masked, attr); err != nil {
		c.dataMu.Unlock()
		return err
	}
	oldSize := c.attr.Size
	c.attr.Size = size
	c.touchModificationAndStatusChangeTimeLocked(now)

	// We drop c.dataMu here so that we can lock c.mapsMu and invalidate
	// mappings below. This allows concurrent calls to Read/Translate/etc.
	// These functions synchronize with an in-progress Truncate by refusing to
	// use cache contents beyond the new c.attr.Size. (We are still holding
	// c.attrMu, so we can't race with Truncate/Write.)
	c.dataMu.Unlock()

	// Nothing left to do unless shrinking the file.
	if size >= oldSize {
		return nil
	}

	oldpgend := fs.OffsetPageEnd(oldSize)
	newpgend := fs.OffsetPageEnd(size)

	// Invalidate past translations of truncated pages.
	if newpgend != oldpgend {
		c.mapsMu.Lock()
		c.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
			// Compare Linux's mm/truncate.c:truncate_setsize() =>
			// truncate_pagecache() =>
			// mm/memory.c:unmap_mapping_range(evencows=1).
			InvalidatePrivate: true,
		})
		c.mapsMu.Unlock()
	}

	// We are now guaranteed that there are no translations of truncated pages,
	// and can remove them from the cache. Since truncated pages have been
	// removed from the backing file, they should be dropped without being
	// written back.
	c.dataMu.Lock()
	defer c.dataMu.Unlock()
	c.cache.Truncate(uint64(size), c.mfp.MemoryFile())
	c.dirty.KeepClean(memmap.MappableRange{uint64(size), oldpgend})

	return nil
}

// Allocate implements fs.InodeOperations.Allocate.
func (c *CachingInodeOperations) Allocate(ctx context.Context, offset, length int64) error {
	newSize := offset + length

	// c.attr.Size is protected by both c.attrMu and c.dataMu.
	c.attrMu.Lock()
	defer c.attrMu.Unlock()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()

	if newSize <= c.attr.Size {
		return nil
	}

	now := ktime.NowFromContext(ctx)
	if err := c.backingFile.Allocate(ctx, offset, length); err != nil {
		return err
	}

	c.attr.Size = newSize
	c.touchModificationAndStatusChangeTimeLocked(now)
	return nil
}

// WriteOut implements fs.InodeOperations.WriteOut.
func (c *CachingInodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
	c.attrMu.Lock()

	// Write dirty pages back.
	c.dataMu.Lock()
	err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), c.mfp.MemoryFile(), c.backingFile.WriteFromBlocksAt)
	c.dataMu.Unlock()
	if err != nil {
		c.attrMu.Unlock()
		return err
	}

	// SyncDirtyAll above would have grown the backing file if needed. On
	// shrinks, the backing file is called directly, so the size never needs
	// to be updated.
	c.dirtyAttr.Size = false

	// Write out cached attributes.
	if err := c.backingFile.SetMaskedAttributes(ctx, c.dirtyAttr, c.attr); err != nil {
		c.attrMu.Unlock()
		return err
	}
	c.dirtyAttr = fs.AttrMask{}

	c.attrMu.Unlock()

	// Fsync the remote file.
	return c.backingFile.Sync(ctx)
}

// IncLinks increases the link count and updates cached modification time.
func (c *CachingInodeOperations) IncLinks(ctx context.Context) {
	c.attrMu.Lock()
	c.attr.Links++
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// DecLinks decreases the link count and updates cached modification time.
func (c *CachingInodeOperations) DecLinks(ctx context.Context) {
	c.attrMu.Lock()
	c.attr.Links--
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// TouchAccessTime updates the cached access time in-place to the
// current time. It does not update status change time in-place. See
// mm/filemap.c:do_generic_file_read() => include/linux/fs.h:file_accessed().
func (c *CachingInodeOperations) TouchAccessTime(ctx context.Context, inode *fs.Inode) {
	if inode.MountSource.Flags.NoAtime {
		return
	}

	c.attrMu.Lock()
	c.touchAccessTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// touchAccessTimeLocked updates the cached access time in-place to the
// current time.
//
// Preconditions: c.attrMu is locked for writing.
func (c *CachingInodeOperations) touchAccessTimeLocked(now time.Time) {
	c.attr.AccessTime = now
	c.dirtyAttr.AccessTime = true
}

// TouchModificationAndStatusChangeTime updates the cached modification and
// status change times in-place to the current time.
func (c *CachingInodeOperations) TouchModificationAndStatusChangeTime(ctx context.Context) {
	c.attrMu.Lock()
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// touchModificationAndStatusChangeTimeLocked updates the cached modification
// and status change times in-place to the current time.
//
// Preconditions: c.attrMu is locked for writing.
func (c *CachingInodeOperations) touchModificationAndStatusChangeTimeLocked(now time.Time) {
	c.attr.ModificationTime = now
	c.dirtyAttr.ModificationTime = true
	c.attr.StatusChangeTime = now
	c.dirtyAttr.StatusChangeTime = true
}

// TouchStatusChangeTime updates the cached status change time in-place to the
// current time.
func (c *CachingInodeOperations) TouchStatusChangeTime(ctx context.Context) {
	c.attrMu.Lock()
	c.touchStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	c.attrMu.Unlock()
}

// touchStatusChangeTimeLocked updates the cached status change time
// in-place to the current time.
//
// Preconditions: c.attrMu is locked for writing.
func (c *CachingInodeOperations) touchStatusChangeTimeLocked(now time.Time) {
	c.attr.StatusChangeTime = now
	c.dirtyAttr.StatusChangeTime = true
}

// UpdateUnstable updates the cached unstable attributes. Only non-dirty
// attributes are updated.
func (c *CachingInodeOperations) UpdateUnstable(attr fs.UnstableAttr) {
	// All attributes are protected by attrMu.
	c.attrMu.Lock()

	if !c.dirtyAttr.Usage {
		c.attr.Usage = attr.Usage
	}
	if !c.dirtyAttr.Perms {
		c.attr.Perms = attr.Perms
	}
	if !c.dirtyAttr.UID {
		c.attr.Owner.UID = attr.Owner.UID
	}
	if !c.dirtyAttr.GID {
		c.attr.Owner.GID = attr.Owner.GID
	}
	if !c.dirtyAttr.AccessTime {
		c.attr.AccessTime = attr.AccessTime
	}
	if !c.dirtyAttr.ModificationTime {
		c.attr.ModificationTime = attr.ModificationTime
	}
	if !c.dirtyAttr.StatusChangeTime {
		c.attr.StatusChangeTime = attr.StatusChangeTime
	}
	if !c.dirtyAttr.Links {
		c.attr.Links = attr.Links
	}

	// Size requires holding attrMu and dataMu.
	c.dataMu.Lock()
	if !c.dirtyAttr.Size {
		c.attr.Size = attr.Size
	}
	c.dataMu.Unlock()

	c.attrMu.Unlock()
}

// Read reads from frames and otherwise directly from the backing file
// into dst starting at offset until dst is full, EOF is reached, or an
// error is encountered.
//
// Read may partially fill dst and return a nil error.
func (c *CachingInodeOperations) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
	if dst.NumBytes() == 0 {
		return 0, nil
	}

	// Have we reached EOF? We check for this again in
	// inodeReadWriter.ReadToBlocks to avoid holding c.attrMu (which would
	// serialize reads) or c.dataMu (which would violate lock ordering), but
	// check here first (before calling into MM) since reading at EOF is
	// common: getting a return value of 0 from a read syscall is the only way
	// to detect EOF.
	//
	// TODO(jamieliu): Separate out c.attr.Size and use atomics instead of
	// c.dataMu.
	c.dataMu.RLock()
	size := c.attr.Size
	c.dataMu.RUnlock()
	if offset >= size {
		return 0, io.EOF
	}

	n, err := dst.CopyOutFrom(ctx, &inodeReadWriter{ctx, c, offset})
	// Compare Linux's mm/filemap.c:do_generic_file_read() => file_accessed().
	c.TouchAccessTime(ctx, file.Dirent.Inode)
	return n, err
}

// Write writes to frames and otherwise directly to the backing file
// from src starting at offset and until src is empty or an error is
// encountered.
//
// If Write partially fills src, a non-nil error is returned.
func (c *CachingInodeOperations) Write(ctx context.Context, src usermem.IOSequence, offset int64) (int64, error) {
	// Hot path. Avoid defers.
	if src.NumBytes() == 0 {
		return 0, nil
	}

	c.attrMu.Lock()
	// Compare Linux's mm/filemap.c:__generic_file_write_iter() => file_update_time().
	c.touchModificationAndStatusChangeTimeLocked(ktime.NowFromContext(ctx))
	n, err := src.CopyInTo(ctx, &inodeReadWriter{ctx, c, offset})
	c.attrMu.Unlock()
	return n, err
}
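
// As an illustrative sketch (hypothetical type, not part of gVisor): a
// FileOperations implementation backed by a CachedFileObject would forward
// its I/O through the methods above so that the cache and cached timestamps
// stay coherent, e.g.
//
//	func (f *exampleFileOps) Read(ctx context.Context, file *fs.File, dst usermem.IOSequence, offset int64) (int64, error) {
//		return f.iops.Read(ctx, file, dst, offset) // f.iops is a *CachingInodeOperations
//	}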

type inodeReadWriter struct {
	ctx    context.Context
	c      *CachingInodeOperations
	offset int64
}

// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (rw *inodeReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
	mem := rw.c.mfp.MemoryFile()
	fillCache := !rw.c.useHostPageCache() && mem.ShouldCacheEvictable()

	// Hot path. Avoid defers.
	var unlock func()
	if fillCache {
		rw.c.dataMu.Lock()
		unlock = rw.c.dataMu.Unlock
	} else {
		rw.c.dataMu.RLock()
		unlock = rw.c.dataMu.RUnlock
	}

	// Compute the range to read.
	if rw.offset >= rw.c.attr.Size {
		unlock()
		return 0, io.EOF
	}
	end := fs.ReadEndOffset(rw.offset, int64(dsts.NumBytes()), rw.c.attr.Size)
	if end == rw.offset { // dsts.NumBytes() == 0?
		unlock()
		return 0, nil
	}

	var done uint64
	seg, gap := rw.c.cache.Find(uint64(rw.offset))
loop:
	for rw.offset < end {
		mr := memmap.MappableRange{uint64(rw.offset), uint64(end)}
		switch {
		case seg.Ok():
			// Get internal mappings from the cache.
			ims, err := mem.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
			if err != nil {
				unlock()
				return done, err
			}

			// Copy from internal mappings.
			n, err := safemem.CopySeq(dsts, ims)
			done += n
			rw.offset += int64(n)
			dsts = dsts.DropFirst64(n)
			if err != nil {
				unlock()
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			gapMR := gap.Range().Intersect(mr)
			if fillCache {
				// Read into the cache, then re-enter the loop to read from the
				// cache.
				reqMR := memmap.MappableRange{
					Start: uint64(usermem.Addr(gapMR.Start).RoundDown()),
					End:   fs.OffsetPageEnd(int64(gapMR.End)),
				}
				optMR := gap.Range()
				err := rw.c.cache.Fill(rw.ctx, reqMR, maxFillRange(reqMR, optMR), mem, usage.PageCache, rw.c.backingFile.ReadToBlocksAt)
				mem.MarkEvictable(rw.c, pgalloc.EvictableRange{optMR.Start, optMR.End})
				seg, gap = rw.c.cache.Find(uint64(rw.offset))
				if !seg.Ok() {
					unlock()
					return done, err
				}
				// err might have occurred in part of gap.Range() outside
				// gapMR. Forget about it for now; if the error matters and
				// persists, we'll run into it again in a later iteration of
				// this loop.
			} else {
				// Read directly from the backing file.
				dst := dsts.TakeFirst64(gapMR.Length())
				n, err := rw.c.backingFile.ReadToBlocksAt(rw.ctx, dst, gapMR.Start)
				done += n
				rw.offset += int64(n)
				dsts = dsts.DropFirst64(n)
				// Partial reads are fine. But we must stop reading.
				if n != dst.NumBytes() || err != nil {
					unlock()
					return done, err
				}

				// Continue.
				seg, gap = gap.NextSegment(), FileRangeGapIterator{}
			}

		default:
			// Neither seg nor gap is ok; stop. A plain break here would only
			// exit the switch, not the loop.
			break loop
		}
	}
	unlock()
	return done, nil
}

// maybeGrowFile grows the file's size if data has been written past the old
// size.
//
// Preconditions: rw.c.attrMu and rw.c.dataMu must be locked.
func (rw *inodeReadWriter) maybeGrowFile() {
	// If the write ends beyond the file's previous size, it causes the
	// file to grow.
	if rw.offset > rw.c.attr.Size {
		rw.c.attr.Size = rw.offset
		rw.c.dirtyAttr.Size = true
	}
	if rw.offset > rw.c.attr.Usage {
		// This is incorrect if CachingInodeOperations is caching a sparse
		// file. (In Linux, keeping inode::i_blocks up to date is the
		// filesystem's responsibility.)
		rw.c.attr.Usage = rw.offset
		rw.c.dirtyAttr.Usage = true
	}
}

// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
//
// Preconditions: rw.c.attrMu must be locked.
func (rw *inodeReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
	// Hot path. Avoid defers.
	rw.c.dataMu.Lock()

	// Compute the range to write.
	end := fs.WriteEndOffset(rw.offset, int64(srcs.NumBytes()))
	if end == rw.offset { // srcs.NumBytes() == 0?
		rw.c.dataMu.Unlock()
		return 0, nil
	}

	mf := rw.c.mfp.MemoryFile()
	var done uint64
	seg, gap := rw.c.cache.Find(uint64(rw.offset))
loop:
	for rw.offset < end {
		mr := memmap.MappableRange{uint64(rw.offset), uint64(end)}
		switch {
		case seg.Ok() && seg.Start() < mr.End:
			// Get internal mappings from the cache.
			segMR := seg.Range().Intersect(mr)
			ims, err := mf.MapInternal(seg.FileRangeOf(segMR), usermem.Write)
			if err != nil {
				rw.maybeGrowFile()
				rw.c.dataMu.Unlock()
				return done, err
			}

			// Copy to internal mappings.
			n, err := safemem.CopySeq(ims, srcs)
			done += n
			rw.offset += int64(n)
			srcs = srcs.DropFirst64(n)
			rw.c.dirty.MarkDirty(segMR)
			if err != nil {
				rw.maybeGrowFile()
				rw.c.dataMu.Unlock()
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok() && gap.Start() < mr.End:
			// Write directly to the backing file. At present, we never fill
			// the cache when writing, since doing so can convert small writes
			// into inefficient read-modify-write cycles, and we have no
			// mechanism for detecting or avoiding this.
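			// For example, a one-byte write into an uncached page would
			// otherwise first have to read the rest of that page from the
			// backing file.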
			gapmr := gap.Range().Intersect(mr)
			src := srcs.TakeFirst64(gapmr.Length())
			n, err := rw.c.backingFile.WriteFromBlocksAt(rw.ctx, src, gapmr.Start)
			done += n
			rw.offset += int64(n)
			srcs = srcs.DropFirst64(n)
			// Partial writes are fine. But we must stop writing.
			if n != src.NumBytes() || err != nil {
				rw.maybeGrowFile()
				rw.c.dataMu.Unlock()
				return done, err
			}

			// Continue.
			seg, gap = gap.NextSegment(), FileRangeGapIterator{}

		default:
			// Neither seg nor gap is ok; stop. A plain break here would only
			// exit the switch, not the loop.
			break loop
		}
	}
	rw.maybeGrowFile()
	rw.c.dataMu.Unlock()
	return done, nil
}

// useHostPageCache returns true if c uses c.backingFile.FD() for all file I/O
// and memory mappings, and false if c.cache may contain data cached from
// c.backingFile.
func (c *CachingInodeOperations) useHostPageCache() bool {
	return !c.opts.ForcePageCache && c.backingFile.FD() >= 0
}

// AddMapping implements memmap.Mappable.AddMapping.
func (c *CachingInodeOperations) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
	// Hot path. Avoid defers.
	c.mapsMu.Lock()
	mapped := c.mappings.AddMapping(ms, ar, offset, writable)
	// Do this unconditionally since whether we have c.backingFile.FD() >= 0
	// can change across save/restore.
	for _, r := range mapped {
		c.hostFileMapper.IncRefOn(r)
	}
	if !c.useHostPageCache() {
		// c.Evict() will refuse to evict memory-mapped pages, so tell the
		// MemoryFile to not bother trying.
		mf := c.mfp.MemoryFile()
		for _, r := range mapped {
			mf.MarkUnevictable(c, pgalloc.EvictableRange{r.Start, r.End})
		}
	}
	if c.useHostPageCache() && !usage.IncrementalMappedAccounting {
		for _, r := range mapped {
			usage.MemoryAccounting.Inc(r.Length(), usage.Mapped)
		}
	}
	c.mapsMu.Unlock()
	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (c *CachingInodeOperations) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
	// Hot path. Avoid defers.
	c.mapsMu.Lock()
	unmapped := c.mappings.RemoveMapping(ms, ar, offset, writable)
	for _, r := range unmapped {
		c.hostFileMapper.DecRefOn(r)
	}
	if c.useHostPageCache() {
		if !usage.IncrementalMappedAccounting {
			for _, r := range unmapped {
				usage.MemoryAccounting.Dec(r.Length(), usage.Mapped)
			}
		}
		c.mapsMu.Unlock()
		return
	}

	// Pages that are no longer referenced by any application memory mappings
	// are now considered unused; allow MemoryFile to evict them when
	// necessary.
	mf := c.mfp.MemoryFile()
	c.dataMu.Lock()
	for _, r := range unmapped {
		// Since these pages are no longer mapped, they are no longer
		// concurrently dirtyable by a writable memory mapping.
		c.dirty.AllowClean(r)
		mf.MarkEvictable(c, pgalloc.EvictableRange{r.Start, r.End})
	}
	c.dataMu.Unlock()
	c.mapsMu.Unlock()
}

// CopyMapping implements memmap.Mappable.CopyMapping.
func (c *CachingInodeOperations) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
	return c.AddMapping(ctx, ms, dstAR, offset, writable)
}

// Translate implements memmap.Mappable.Translate.
func (c *CachingInodeOperations) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
	// Hot path. Avoid defer.
	if c.useHostPageCache() {
		mr := optional
		if c.opts.LimitHostFDTranslation {
			mr = maxFillRange(required, optional)
		}
		return []memmap.Translation{
			{
				Source: mr,
				File:   c,
				Offset: mr.Start,
				Perms:  usermem.AnyAccess,
			},
		}, nil
	}

	c.dataMu.Lock()

	// Constrain translations to c.attr.Size (rounded up) to prevent
	// translation to pages that may be concurrently truncated.
	pgend := fs.OffsetPageEnd(c.attr.Size)
	var beyondEOF bool
	if required.End > pgend {
		if required.Start >= pgend {
			c.dataMu.Unlock()
			return nil, &memmap.BusError{io.EOF}
		}
		beyondEOF = true
		required.End = pgend
	}
	if optional.End > pgend {
		optional.End = pgend
	}

	mf := c.mfp.MemoryFile()
	cerr := c.cache.Fill(ctx, required, maxFillRange(required, optional), mf, usage.PageCache, c.backingFile.ReadToBlocksAt)

	var ts []memmap.Translation
	var translatedEnd uint64
	for seg := c.cache.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
		segMR := seg.Range().Intersect(optional)
		// TODO(jamieliu): Make Translations writable even if writability is
		// not required if already kept-dirty by another writable translation.
		perms := usermem.AccessType{
			Read:    true,
			Execute: true,
		}
		if at.Write {
			// From this point forward, this memory can be dirtied through the
			// mapping at any time.
			c.dirty.KeepDirty(segMR)
			perms.Write = true
		}
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   mf,
			Offset: seg.FileRangeOf(segMR).Start,
			Perms:  perms,
		})
		translatedEnd = segMR.End
	}

	c.dataMu.Unlock()

	// Don't return the error returned by c.cache.Fill if it occurred outside
	// of required.
	if translatedEnd < required.End && cerr != nil {
		return ts, &memmap.BusError{cerr}
	}
	if beyondEOF {
		return ts, &memmap.BusError{io.EOF}
	}
	return ts, nil
}
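
// maxFillRange returns the range that should actually be filled: required
// grown toward optional, but clamped to a fixed readahead budget. For
// example, with maxReadahead = 64 KiB, required = [4096, 8192) and
// optional = [0, 1 MiB) yield [4096, 69632): the fill is anchored at
// required.Start and limited to 64 KiB, so a small access never forces the
// whole optional range to be read in.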
func maxFillRange(required, optional memmap.MappableRange) memmap.MappableRange {
	const maxReadahead = 64 << 10 // 64 KB, chosen arbitrarily
	if required.Length() >= maxReadahead {
		return required
	}
	if optional.Length() <= maxReadahead {
		return optional
	}
	optional.Start = required.Start
	if optional.Length() <= maxReadahead {
		return optional
	}
	optional.End = optional.Start + maxReadahead
	return optional
}

// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (c *CachingInodeOperations) InvalidateUnsavable(ctx context.Context) error {
	// Whether we have a host fd (and consequently what platform.File is
	// mapped) can change across save/restore, so invalidate all translations
	// unconditionally.
	c.mapsMu.Lock()
	defer c.mapsMu.Unlock()
	c.mappings.InvalidateAll(memmap.InvalidateOpts{})

	// Sync the cache's contents so that if we have a host fd after restore,
	// the remote file's contents are coherent.
	mf := c.mfp.MemoryFile()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()
	if err := SyncDirtyAll(ctx, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
		return err
	}

	// Discard the cache so that it's not stored in saved state. This is safe
	// because per InvalidateUnsavable invariants, no new translations can have
	// been returned after we invalidated all existing translations above.
	c.cache.DropAll(mf)
	c.dirty.RemoveAll()

	return nil
}

// Evict implements pgalloc.EvictableMemoryUser.Evict.
func (c *CachingInodeOperations) Evict(ctx context.Context, er pgalloc.EvictableRange) {
	c.mapsMu.Lock()
	defer c.mapsMu.Unlock()
	c.dataMu.Lock()
	defer c.dataMu.Unlock()

	mr := memmap.MappableRange{er.Start, er.End}
	mf := c.mfp.MemoryFile()
	// Only allow pages that are no longer memory-mapped to be evicted.
	for mgap := c.mappings.LowerBoundGap(mr.Start); mgap.Ok() && mgap.Start() < mr.End; mgap = mgap.NextGap() {
		mgapMR := mgap.Range().Intersect(mr)
		if mgapMR.Length() == 0 {
			continue
		}
		if err := SyncDirty(ctx, mgapMR, &c.cache, &c.dirty, uint64(c.attr.Size), mf, c.backingFile.WriteFromBlocksAt); err != nil {
			log.Warningf("Failed to writeback cached data %v: %v", mgapMR, err)
		}
		c.cache.Drop(mgapMR, mf)
		c.dirty.KeepClean(mgapMR)
	}
}

// IncRef implements platform.File.IncRef. This is used when we directly map an
// underlying host fd and CachingInodeOperations is used as the platform.File
// during translation.
func (c *CachingInodeOperations) IncRef(fr platform.FileRange) {
	// Hot path. Avoid defers.
	c.dataMu.Lock()
	seg, gap := c.refs.Find(fr.Start)
	for {
		switch {
		case seg.Ok() && seg.Start() < fr.End:
			seg = c.refs.Isolate(seg, fr)
			seg.SetValue(seg.Value() + 1)
			seg, gap = seg.NextNonEmpty()
		case gap.Ok() && gap.Start() < fr.End:
			newRange := gap.Range().Intersect(fr)
			if usage.IncrementalMappedAccounting {
				usage.MemoryAccounting.Inc(newRange.Length(), usage.Mapped)
			}
			seg, gap = c.refs.InsertWithoutMerging(gap, newRange, 1).NextNonEmpty()
		default:
			c.refs.MergeAdjacent(fr)
			c.dataMu.Unlock()
			return
		}
	}
}
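
// Together, IncRef and DecRef maintain per-range reference counts in c.refs,
// splitting and merging segments as counts diverge and converge. For example,
// IncRef over [0, 2) and then over [1, 3) leaves segments [0, 1) and [2, 3)
// with count 1 and [1, 2) with count 2; a subsequent DecRef over [0, 3)
// removes the count-1 segments and leaves [1, 2) with count 1.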

// DecRef implements platform.File.DecRef. This is used when we directly map an
// underlying host fd and CachingInodeOperations is used as the platform.File
// during translation.
func (c *CachingInodeOperations) DecRef(fr platform.FileRange) {
	// Hot path. Avoid defers.
	c.dataMu.Lock()
	seg := c.refs.FindSegment(fr.Start)

	for seg.Ok() && seg.Start() < fr.End {
		seg = c.refs.Isolate(seg, fr)
		if old := seg.Value(); old == 1 {
			if usage.IncrementalMappedAccounting {
				usage.MemoryAccounting.Dec(seg.Range().Length(), usage.Mapped)
			}
			seg = c.refs.Remove(seg).NextSegment()
		} else {
			seg.SetValue(old - 1)
			seg = seg.NextSegment()
		}
	}
	c.refs.MergeAdjacent(fr)
	c.dataMu.Unlock()
}

// MapInternal implements platform.File.MapInternal. This is used when we
// directly map an underlying host fd and CachingInodeOperations is used as the
// platform.File during translation.
func (c *CachingInodeOperations) MapInternal(fr platform.FileRange, at usermem.AccessType) (safemem.BlockSeq, error) {
	return c.hostFileMapper.MapInternal(fr, c.backingFile.FD(), at.Write)
}

// FD implements platform.File.FD. This is used when we directly map an
// underlying host fd and CachingInodeOperations is used as the platform.File
// during translation.
func (c *CachingInodeOperations) FD() int {
	return c.backingFile.FD()
}