gvisor/pkg/sentry/fs/gofer/inode.go

609 lines
19 KiB
Go

// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package gofer
import (
"errors"
"sync"
"syscall"
"gvisor.googlesource.com/gvisor/pkg/abi/linux"
"gvisor.googlesource.com/gvisor/pkg/fd"
"gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/pkg/p9"
"gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/device"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fdpipe"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/fsutil"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
"gvisor.googlesource.com/gvisor/pkg/sentry/memmap"
"gvisor.googlesource.com/gvisor/pkg/sentry/safemem"
"gvisor.googlesource.com/gvisor/pkg/syserror"
)
// inodeOperations implements fs.InodeOperations.
//
// It groups the per-file gofer state (fileState), the caching layer
// (cachingInodeOps) and a lazily initialized cache of readdir results.
//
// +stateify savable
type inodeOperations struct {
fsutil.InodeNotVirtual `state:"nosave"`
fsutil.InodeNoExtendedAttributes `state:"nosave"`
// fileState implements fs.CachedFileObject. It exists
// to break a circular load dependency between inodeOperations
// and cachingInodeOps (below).
fileState *inodeFileState `state:"wait"`
// cachingInodeOps implements memmap.Mappable for inodeOperations.
cachingInodeOps *fsutil.CachingInodeOperations
// readdirMu protects readdirCache and serializes concurrent Readdirs.
readdirMu sync.Mutex `state:"nosave"`
// readdirCache is a cache of readdir results in the form of
// a fs.SortedDentryMap.
//
// It starts out as nil and is initialized lazily under readdirMu;
// invalidating the cache means setting it back to nil.
readdirCache *fs.SortedDentryMap `state:"nosave"`
}
// inodeFileState implements fsutil.CachedFileObject and otherwise fully
// encapsulates state that needs to be manually loaded on restore for
// this file object.
//
// This unfortunate structure exists because fsutil.CachingInodeOperations
// defines afterLoad and therefore cannot be lazily loaded (to break a
// circular load dependency between it and inodeOperations). Even with
// lazy loading, this approach defines the dependencies between objects
// and the expected load behavior more concretely.
//
// +stateify savable
type inodeFileState struct {
// s is common file system state for Gofers.
s *session `state:"wait"`
// MultiDeviceKey consists of:
//
// * Device: file system device from a specific gofer.
// * SecondaryDevice: unique identifier of the attach point.
// * Inode: the inode of this resource, unique per Device.
//
// These fields combined enable consistent hashing of virtual inodes
// on goferDevice.
key device.MultiDeviceKey `state:"nosave"`
// file is the p9 file that contains a single unopened fid.
file contextFile `state:"nosave"`
// sattr caches the stable attributes.
sattr fs.StableAttr `state:"wait"`
// handlesMu protects the below fields.
handlesMu sync.RWMutex `state:"nosave"`
// readonly does minimal open handle caching: it is only populated for
// read-only opens on read-only filesystems.
readonly *handles `state:"nosave"`
// readthrough holds the handles used for populating page caches.
readthrough *handles `state:"nosave"`
// writeback holds the handles used for syncing from page caches.
writeback *handles `state:"nosave"`
// writebackRW indicates whether writeback is opened read-write. If
// it is not and a read-write handle could replace writeback (above),
// then writeback is replaced with the read-write handle. This
// ensures that files that were first opened write-only and then
// later are opened read-write to be mapped can in fact be mapped.
writebackRW bool
// loading is acquired when the inodeFileState begins an asynchronous
// load. It is released when the load is complete. Callers that require
// all state to be available should call waitForLoad() to ensure that.
loading sync.Mutex `state:".(struct{})"`
// savedUAttr is only allocated during S/R. It points to the save-time
// unstable attributes and is used to validate restore-time ones.
//
// Note that these unstable attributes are only used to detect cross-S/R
// external file system metadata changes. They may differ from the
// cached unstable attributes in cachingInodeOps, as that might differ
// from the external file system attributes if there had been WriteOut
// failures. S/R is transparent to Sentry and the latter will continue
// using its cached values after restore.
savedUAttr *fs.UnstableAttr
// hostMappable is created when using 'cacheRemoteRevalidating' to map pages
// directly from host.
hostMappable *fsutil.HostMappable
}
// Release closes i.file and drops one reference on each of the cached handle
// sets (readonly, readthrough, writeback) that is currently set.
func (i *inodeFileState) Release(ctx context.Context) {
	i.file.close(ctx)

	// Same order as the fields are declared; unset entries are skipped.
	for _, h := range []*handles{i.readonly, i.readthrough, i.writeback} {
		if h != nil {
			h.DecRef()
		}
	}
}
// setHandlesForCachedIO installs h as the handles used for reading and/or
// writing through fsutil.CachingInodeOperations, depending on flags.
//
// A reference is taken on h for every slot it is stored in. handlesMu is held
// for the duration of the call.
func (i *inodeFileState) setHandlesForCachedIO(flags fs.FileFlags, h *handles) {
	i.handlesMu.Lock()
	defer i.handlesMu.Unlock()

	// The first readable handle becomes the readthrough handle.
	if flags.Read && i.readthrough == nil {
		h.IncRef()
		i.readthrough = h
	}

	if flags.Write {
		switch {
		case i.writeback == nil:
			h.IncRef()
			i.writeback = h
		case flags.Read && !i.writebackRW:
			// Upgrade a write-only writeback handle to a read-write one.
			i.writeback.DecRef()
			h.IncRef()
			i.writeback = h
		}
		if flags.Read {
			i.writebackRW = true
		}
	}

	if hm := i.hostMappable; hm != nil {
		hm.UpdateFD(i.fdLocked())
	}
}
// getCachedHandles returns any cached handles which would accelerate
// performance generally. These handles should only be used if the mount
// supports caching. This is distinct from fsutil.CachingInodeOperations,
// which is used for a limited set of file types (those that can be mapped).
//
// Handles are only cached for read-only opens on read-only mounts. On success
// the returned handles carry an extra reference for the caller and the second
// result is true; otherwise (nil, false) is returned, including when opening
// fresh handles fails.
func (i *inodeFileState) getCachedHandles(ctx context.Context, flags fs.FileFlags, msrc *fs.MountSource) (*handles, bool) {
	i.handlesMu.Lock()
	defer i.handlesMu.Unlock()

	if !flags.Read || flags.Write || !msrc.Flags.ReadOnly {
		return nil, false
	}

	// Populate the cache on first use.
	if i.readonly == nil {
		fresh, err := newHandles(ctx, i.file, flags)
		if err != nil {
			return nil, false
		}
		i.readonly = fresh
	}

	i.readonly.IncRef()
	return i.readonly, true
}
// ReadToBlocksAt implements fsutil.CachedFileObject.ReadToBlocksAt.
//
// The read is served by the readthrough handles at the given offset while
// handlesMu is held for reading.
func (i *inodeFileState) ReadToBlocksAt(ctx context.Context, dsts safemem.BlockSeq, offset uint64) (uint64, error) {
	i.handlesMu.RLock()
	defer i.handlesMu.RUnlock()

	rw := i.readthrough.readWriterAt(ctx, int64(offset))
	return rw.ReadToBlocks(dsts)
}
// WriteFromBlocksAt implements fsutil.CachedFileObject.WriteFromBlocksAt.
//
// The write goes through the writeback handles at the given offset while
// handlesMu is held for reading.
func (i *inodeFileState) WriteFromBlocksAt(ctx context.Context, srcs safemem.BlockSeq, offset uint64) (uint64, error) {
	i.handlesMu.RLock()
	defer i.handlesMu.RUnlock()

	rw := i.writeback.readWriterAt(ctx, int64(offset))
	return rw.WriteFromBlocks(srcs)
}
// SetMaskedAttributes implements fsutil.CachedFileObject.SetMaskedAttributes.
//
// The request is dropped (nil is returned) when skipSetAttr reports that the
// update can be skipped. Otherwise the masked attributes are sent through
// i.file.setAttr and its error is returned.
func (i *inodeFileState) SetMaskedAttributes(ctx context.Context, mask fs.AttrMask, attr fs.UnstableAttr) error {
	if i.skipSetAttr(mask) {
		return nil
	}

	atimeSec, atimeNsec := attr.AccessTime.Unix()
	mtimeSec, mtimeNsec := attr.ModificationTime.Unix()

	// An update of status change time is implied by mask.AccessTime
	// or mask.ModificationTime. Updating status change time to a
	// time earlier than the system time is not possible.
	valid := p9.SetAttrMask{
		Permissions:        mask.Perms,
		Size:               mask.Size,
		UID:                mask.UID,
		GID:                mask.GID,
		ATime:              mask.AccessTime,
		ATimeNotSystemTime: true,
		MTime:              mask.ModificationTime,
		MTimeNotSystemTime: true,
	}
	values := p9.SetAttr{
		Permissions:      p9.FileMode(attr.Perms.LinuxMode()),
		UID:              p9.UID(attr.Owner.UID),
		GID:              p9.GID(attr.Owner.GID),
		Size:             uint64(attr.Size),
		ATimeSeconds:     uint64(atimeSec),
		ATimeNanoSeconds: uint64(atimeNsec),
		MTimeSeconds:     uint64(mtimeSec),
		MTimeNanoSeconds: uint64(mtimeNsec),
	}
	return i.file.setAttr(ctx, valid, values)
}
// skipSetAttr reports whether an attribute change can be skipped. It can be
// skipped when:
//   - the mask is empty, or
//   - the mask contains only atime and/or mtime, and one of the cached
//     handle sets has a host FD.
//
// Updates to atime and mtime can be skipped because the cached value will be
// "close enough" to the host value, given that the operation went directly to
// the host FD. Skipping atime updates is particularly important to reduce the
// number of operations sent to the Gofer for readonly files.
func (i *inodeFileState) skipSetAttr(mask fs.AttrMask) bool {
	if mask.Empty() {
		return true
	}

	// Anything left after clearing the timestamps must go to the gofer.
	rest := mask
	rest.AccessTime, rest.ModificationTime = false, false
	if !rest.Empty() {
		return false
	}

	i.handlesMu.RLock()
	defer i.handlesMu.RUnlock()
	for _, h := range []*handles{i.readonly, i.readthrough, i.writeback} {
		if h != nil && h.Host != nil {
			return true
		}
	}
	return false
}
// Sync implements fsutil.CachedFileObject.Sync.
//
// It fsyncs the writeback handle's file; when no writeback handle has been
// installed there is nothing to do and nil is returned.
func (i *inodeFileState) Sync(ctx context.Context) error {
	i.handlesMu.RLock()
	defer i.handlesMu.RUnlock()

	if wb := i.writeback; wb != nil {
		return wb.File.fsync(ctx)
	}
	return nil
}
// FD implements fsutil.CachedFileObject.FD.
//
// FD meets the requirements of fsutil.CachedFileObject.FD because p9.File.Open
// returns a host file descriptor to back _both_ readthrough and writeback or
// not at all (e.g. both are nil).
//
// It is the locking wrapper around fdLocked: handlesMu is held for reading
// while the descriptor is looked up.
func (i *inodeFileState) FD() int {
	i.handlesMu.RLock()
	defer i.handlesMu.RUnlock()

	hostFD := i.fdLocked()
	return hostFD
}
// fdLocked returns the host file descriptor backing this file, or -1 if the
// chosen handles have no host file. It panics if neither writeback nor
// readthrough handles have been installed.
//
// Callers in this file invoke it with handlesMu held.
func (i *inodeFileState) fdLocked() int {
	// If this file is mapped, then it must have been opened read-write and
	// i.writeback was upgraded to a read-write handle, so prefer that.
	// Otherwise the file may only have been opened readable so far.
	h := i.writeback
	if h == nil {
		h = i.readthrough
	}

	// Assert that the file was actually opened.
	if h == nil {
		panic("cannot get host FD for a file that was never opened")
	}

	if h.Host == nil {
		return -1
	}
	return int(h.Host.FD())
}
// waitForLoad makes sure any restore-issued loading is done.
//
// It blocks until the loading mutex can be acquired and then releases it
// immediately.
func (i *inodeFileState) waitForLoad() {
// This is not a no-op. The loading mutex is held upon restore until
// all loading actions are done.
i.loading.Lock()
i.loading.Unlock()
}
// unstableAttr fetches the attributes of i.file via getattr and converts them
// into an fs.UnstableAttr using the session's mounter and client. On failure
// it returns the zero fs.UnstableAttr together with the getattr error.
func (i *inodeFileState) unstableAttr(ctx context.Context) (fs.UnstableAttr, error) {
	_, mask, attrs, err := getattr(ctx, i.file)
	if err != nil {
		return fs.UnstableAttr{}, err
	}

	uattr := unstable(ctx, mask, attrs, i.s.mounter, i.s.client)
	return uattr, nil
}
// session returns the gofer session stored in this inode's file state
// (fileState.s).
func (i *inodeOperations) session() *session {
return i.fileState.s
}
// Release implements fs.InodeOperations.Release.
//
// The caching layer is released synchronously; the file state is released
// via fs.Async.
func (i *inodeOperations) Release(ctx context.Context) {
	i.cachingInodeOps.Release()

	// Releasing the fileState may make RPCs to the gofer. There is no need
	// to wait for those to return, so hand the work off asynchronously.
	fs.Async(func() { i.fileState.Release(ctx) })
}
// Mappable implements fs.InodeOperations.Mappable.
//
// It returns cachingInodeOps when the cache policy selects it for inode,
// otherwise the host mappable if one exists, otherwise nil.
func (i *inodeOperations) Mappable(inode *fs.Inode) memmap.Mappable {
	if i.session().cachePolicy.useCachingInodeOps(inode) {
		return i.cachingInodeOps
	}

	// The explicit nil check matters: returning a nil *fsutil.HostMappable
	// directly would yield a non-nil interface value.
	if hm := i.fileState.hostMappable; hm != nil {
		return hm
	}
	return nil
}
// UnstableAttr implements fs.InodeOperations.UnstableAttr.
//
// Attributes come from the caching layer when the cache policy caches
// unstable attributes for inode; otherwise they are fetched through the
// file state.
func (i *inodeOperations) UnstableAttr(ctx context.Context, inode *fs.Inode) (fs.UnstableAttr, error) {
	if !i.session().cachePolicy.cacheUAttrs(inode) {
		return i.fileState.unstableAttr(ctx)
	}
	return i.cachingInodeOps.UnstableAttr(ctx, inode)
}
// Check implements fs.InodeOperations.Check.
//
// It delegates entirely to fs.ContextCanAccessFile with the given context,
// inode and permission mask.
func (i *inodeOperations) Check(ctx context.Context, inode *fs.Inode, p fs.PermMask) bool {
return fs.ContextCanAccessFile(ctx, inode, p)
}
// GetFile implements fs.InodeOperations.GetFile.
//
// Sockets and pipes are opened by their dedicated helpers; every other file
// type goes through getFileDefault.
func (i *inodeOperations) GetFile(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	switch kind := d.Inode.StableAttr.Type; kind {
	case fs.Socket:
		return i.getFileSocket(ctx, d, flags)
	case fs.Pipe:
		return i.getFilePipe(ctx, d, flags)
	}
	return i.getFileDefault(ctx, d, flags)
}
// getFileSocket connects to the socket behind this inode and wraps the
// resulting connection in a host socket file for d.
//
// Any connect failure is reported as syscall.EIO. If the socket file cannot
// be created, the connection is closed and that error is returned.
func (i *inodeOperations) getFileSocket(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	conn, err := i.fileState.file.connect(ctx, p9.AnonymousSocket)
	if err != nil {
		return nil, syscall.EIO
	}

	sockFile, err := host.NewSocketWithDirent(ctx, d, conn, flags)
	if err != nil {
		conn.Close()
		return nil, err
	}
	return sockFile, nil
}
// getFilePipe opens the named pipe behind this inode.
//
// It first tries to open the pipe as a host pipe through fdpipe.Open. If that
// reports errNotHostFile, the pipe is instead backed by freshly opened gofer
// handles. Any other error is returned as-is with a nil file.
func (i *inodeOperations) getFilePipe(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	// Try to open as a host pipe.
	pipeOps, err := fdpipe.Open(ctx, i, flags)
	switch {
	case err == nil:
		return fs.NewFile(ctx, d, flags, pipeOps), nil
	case err != errNotHostFile:
		// A genuine failure: do not build an fs.File around the result of
		// a failed open, callers get (nil, err).
		return nil, err
	}

	// The error is due to the fact that this was never a host pipe, so back
	// this file with its dirent.
	h, err := newHandles(ctx, i.fileState.file, flags)
	if err != nil {
		return nil, err
	}
	return NewFile(ctx, d, d.BaseName(), flags, i, h), nil
}
// errNotHostFile indicates that the file is not a host file. NonBlockingOpen
// returns it when an open succeeds without yielding a host file, and
// getFilePipe compares against it to decide whether to fall back to gofer
// handles.
var errNotHostFile = errors.New("not a host file")
// NonBlockingOpen implements fdpipe.NonBlockingOpener for opening host named pipes.
//
// It waits for any restore-time loading, clones the inode's fid, and opens
// the clone with flags derived from p. The clone is closed before returning.
// If the open succeeds but yields no host file, errNotHostFile is returned so
// that the caller can treat the file as a remote one.
func (i *inodeOperations) NonBlockingOpen(ctx context.Context, p fs.PermMask) (*fd.FD, error) {
	i.fileState.waitForLoad()

	// Get a cloned fid which we will open.
	_, cloned, err := i.fileState.file.walk(ctx, nil)
	if err != nil {
		log.Warningf("Open Walk failed: %v", err)
		return nil, err
	}
	defer cloned.close(ctx)

	mode, err := openFlagsFromPerms(p)
	if err != nil {
		log.Warningf("Open flags %s parsing failed: %v", p, err)
		return nil, err
	}

	hostFile, _, _, err := cloned.open(ctx, mode)
	switch {
	case err != nil:
		return hostFile, err
	case hostFile == nil:
		// No host file and no error: this was never a host file to begin
		// with, and should be treated like a remote file.
		return nil, errNotHostFile
	default:
		return hostFile, nil
	}
}
// getFileDefault opens the file behind this inode with gofer handles.
//
// When the cache policy caches handles for the inode, cached handles are
// preferred and whichever handles end up being used are also installed for
// cached I/O. Otherwise fresh handles are always opened and nothing is
// cached.
func (i *inodeOperations) getFileDefault(ctx context.Context, d *fs.Dirent, flags fs.FileFlags) (*fs.File, error) {
	caching := i.session().cachePolicy.cacheHandles(d.Inode)

	var (
		h      *handles
		cached bool
	)
	if caching {
		h, cached = i.fileState.getCachedHandles(ctx, flags, d.Inode.MountSource)
	}
	if !cached {
		var err error
		if h, err = newHandles(ctx, i.fileState.file, flags); err != nil {
			return nil, err
		}
	}
	if caching {
		i.fileState.setHandlesForCachedIO(flags, h)
	}
	return NewFile(ctx, d, d.BaseName(), flags, i, h), nil
}
// SetPermissions implements fs.InodeOperations.SetPermissions.
//
// With unstable attribute caching enabled the change goes through the
// caching layer; otherwise a chmod is issued directly on the file and the
// result reports whether it succeeded.
func (i *inodeOperations) SetPermissions(ctx context.Context, inode *fs.Inode, p fs.FilePermissions) bool {
	if i.session().cachePolicy.cacheUAttrs(inode) {
		return i.cachingInodeOps.SetPermissions(ctx, inode, p)
	}

	// Execute the chmod.
	err := i.fileState.file.setAttr(
		ctx,
		p9.SetAttrMask{Permissions: true},
		p9.SetAttr{Permissions: p9.FileMode(p.LinuxMode())},
	)
	return err == nil
}
// SetOwner implements fs.InodeOperations.SetOwner.
//
// Only the IDs for which Ok() reports true are changed. If neither is, the
// call returns nil without contacting the gofer.
func (i *inodeOperations) SetOwner(ctx context.Context, inode *fs.Inode, owner fs.FileOwner) error {
	setUID, setGID := owner.UID.Ok(), owner.GID.Ok()

	// Save the roundtrip.
	if !setUID && !setGID {
		return nil
	}

	if i.session().cachePolicy.cacheUAttrs(inode) {
		return i.cachingInodeOps.SetOwner(ctx, inode, owner)
	}

	mask := p9.SetAttrMask{UID: setUID, GID: setGID}
	var attr p9.SetAttr
	if setUID {
		attr.UID = p9.UID(owner.UID)
	}
	if setGID {
		attr.GID = p9.GID(owner.GID)
	}
	return i.fileState.file.setAttr(ctx, mask, attr)
}
// SetTimestamps implements fs.InodeOperations.SetTimestamps.
//
// Without unstable attribute caching the timestamps are applied directly to
// the file via utimes; otherwise the caching layer handles the update.
func (i *inodeOperations) SetTimestamps(ctx context.Context, inode *fs.Inode, ts fs.TimeSpec) error {
	if !i.session().cachePolicy.cacheUAttrs(inode) {
		return utimes(ctx, i.fileState.file, ts)
	}
	return i.cachingInodeOps.SetTimestamps(ctx, inode, ts)
}
// Truncate implements fs.InodeOperations.Truncate.
//
// The caching layer performs the truncation when the cache policy selects it
// for inode; otherwise the new size is set directly on the file.
func (i *inodeOperations) Truncate(ctx context.Context, inode *fs.Inode, length int64) error {
	// This can only be called for files anyway.
	if i.session().cachePolicy.useCachingInodeOps(inode) {
		return i.cachingInodeOps.Truncate(ctx, inode, length)
	}

	sizeOnly := p9.SetAttrMask{Size: true}
	newSize := p9.SetAttr{Size: uint64(length)}
	return i.fileState.file.setAttr(ctx, sizeOnly, newSize)
}
// WriteOut implements fs.InodeOperations.WriteOut.
//
// It is a no-op returning nil unless the cache policy caches unstable
// attributes for inode, in which case the caching layer writes them out.
func (i *inodeOperations) WriteOut(ctx context.Context, inode *fs.Inode) error {
	if i.session().cachePolicy.cacheUAttrs(inode) {
		return i.cachingInodeOps.WriteOut(ctx, inode)
	}
	return nil
}
// Readlink implements fs.InodeOperations.Readlink.
//
// It returns the link target read from the file, or syscall.ENOLINK when the
// inode's stable attributes do not describe a symlink.
func (i *inodeOperations) Readlink(ctx context.Context, inode *fs.Inode) (string, error) {
	if fs.IsSymlink(inode.StableAttr) {
		return i.fileState.file.readlink(ctx)
	}
	return "", syscall.ENOLINK
}
// Getlink implements fs.InodeOperations.Getlink.
//
// It never returns a Dirent: symlinks yield fs.ErrResolveViaReadlink and
// everything else yields syserror.ENOLINK. The check uses the stable
// attributes cached in the file state.
func (i *inodeOperations) Getlink(context.Context, *fs.Inode) (*fs.Dirent, error) {
	if fs.IsSymlink(i.fileState.sattr) {
		return nil, fs.ErrResolveViaReadlink
	}
	return nil, syserror.ENOLINK
}
// StatFS makes a StatFS request on the file and translates the reply into an
// fs.Info. On failure the zero fs.Info and the request error are returned.
func (i *inodeOperations) StatFS(ctx context.Context) (fs.Info, error) {
	stat, err := i.fileState.file.statFS(ctx)
	if err != nil {
		return fs.Info{}, err
	}

	result := fs.Info{
		// This is primarily for distinguishing a gofer file system in
		// tests. Testing is important, so instead of defining
		// something completely random, use a standard value.
		Type:        linux.V9FS_MAGIC,
		TotalBlocks: stat.Blocks,
		FreeBlocks:  stat.BlocksFree,
		TotalFiles:  stat.Files,
		FreeFiles:   stat.FilesFree,
	}

	// Keep BlocksFree only when blocks available is zero; a non-zero
	// blocks available value is preferred.
	if stat.BlocksAvailable == 0 {
		return result, nil
	}
	result.FreeBlocks = stat.BlocksAvailable
	return result, nil
}
// configureMMap configures opts for mapping file, using the same mappable
// selection as Mappable: cachingInodeOps when the cache policy selects it,
// else the host mappable if present. With neither available it returns
// syserror.ENODEV.
func (i *inodeOperations) configureMMap(file *fs.File, opts *memmap.MMapOpts) error {
	switch {
	case i.session().cachePolicy.useCachingInodeOps(file.Dirent.Inode):
		return fsutil.GenericConfigureMMap(file, i.cachingInodeOps, opts)
	case i.fileState.hostMappable != nil:
		return fsutil.GenericConfigureMMap(file, i.fileState.hostMappable, opts)
	default:
		return syserror.ENODEV
	}
}
// init registers an error unwrapper with syserror that maps p9.ErrSocket
// values to syscall.EIO. All other errors are left unhandled by it.
func init() {
	syserror.AddErrorUnwrapper(func(err error) (syscall.Errno, bool) {
		switch err.(type) {
		case p9.ErrSocket:
			// Treat as an I/O error.
			return syscall.EIO, true
		default:
			return 0, false
		}
	})
}
// AddLink implements InodeOperations.AddLink, but is currently a noop: the
// method body is empty.
// FIXME: Remove this from InodeOperations altogether.
func (*inodeOperations) AddLink() {}
// DropLink implements InodeOperations.DropLink, but is currently a noop: the
// method body is empty.
// FIXME: Remove this from InodeOperations altogether.
func (*inodeOperations) DropLink() {}
// NotifyStatusChange implements fs.InodeOperations.NotifyStatusChange. It is
// currently a noop: the method body is empty and ctx is unused.
// FIXME: Remove this from InodeOperations altogether.
func (i *inodeOperations) NotifyStatusChange(ctx context.Context) {}