2019-04-29 21:25:05 +00:00
// Copyright 2018 The gVisor Authors.
2018-04-27 17:37:02 +00:00
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package gofer
import (
2018-08-07 18:42:29 +00:00
"fmt"
2018-04-27 17:37:02 +00:00
"syscall"
2019-02-21 21:07:25 +00:00
"time"
2018-04-27 17:37:02 +00:00
2019-06-13 23:49:09 +00:00
"gvisor.dev/gvisor/pkg/log"
"gvisor.dev/gvisor/pkg/metric"
"gvisor.dev/gvisor/pkg/p9"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/device"
"gvisor.dev/gvisor/pkg/sentry/fs"
"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
"gvisor.dev/gvisor/pkg/sentry/memmap"
"gvisor.dev/gvisor/pkg/sentry/usermem"
"gvisor.dev/gvisor/pkg/syserror"
"gvisor.dev/gvisor/pkg/waiter"
2018-04-27 17:37:02 +00:00
)
2019-02-13 20:06:20 +00:00
var (
2019-02-21 21:07:25 +00:00
opensWX = metric . MustCreateNewUint64Metric ( "/gofer/opened_write_execute_file" , true /* sync */ , "Number of times a writable+executable file was opened from a gofer." )
opens9P = metric . MustCreateNewUint64Metric ( "/gofer/opens_9p" , false /* sync */ , "Number of times a 9P file was opened from a gofer." )
opensHost = metric . MustCreateNewUint64Metric ( "/gofer/opens_host" , false /* sync */ , "Number of times a host file was opened from a gofer." )
reads9P = metric . MustCreateNewUint64Metric ( "/gofer/reads_9p" , false /* sync */ , "Number of 9P file reads from a gofer." )
readWait9P = metric . MustCreateNewUint64Metric ( "/gofer/read_wait_9p" , false /* sync */ , "Time waiting on 9P file reads from a gofer, in nanoseconds." )
readsHost = metric . MustCreateNewUint64Metric ( "/gofer/reads_host" , false /* sync */ , "Number of host file reads from a gofer." )
readWaitHost = metric . MustCreateNewUint64Metric ( "/gofer/read_wait_host" , false /* sync */ , "Time waiting on host file reads from a gofer, in nanoseconds." )
2019-02-13 20:06:20 +00:00
)
2018-04-27 17:37:02 +00:00
// fileOperations implements fs.FileOperations for a remote file system.
2018-08-02 17:41:44 +00:00
//
// +stateify savable
2018-04-27 17:37:02 +00:00
type fileOperations struct {
2019-05-21 22:17:05 +00:00
fsutil . FileNoIoctl ` state:"nosave" `
fsutil . FileNoSplice ` state:"nosplice" `
waiter . AlwaysReady ` state:"nosave" `
2018-04-27 17:37:02 +00:00
// inodeOperations is the inodeOperations backing the file. It is protected
// by a reference held by File.Dirent.Inode which is stable until
// FileOperations.Release is called.
inodeOperations * inodeOperations ` state:"wait" `
// dirCursor is the directory cursor.
dirCursor string
// handles are the opened remote file system handles, which may
// be shared with other files.
handles * handles ` state:"nosave" `
// flags are the flags used to open handles.
flags fs . FileFlags ` state:"wait" `
}
// fileOperations implements fs.FileOperations.
var _ fs . FileOperations = ( * fileOperations ) ( nil )
// NewFile returns a file. NewFile is not appropriate with host pipes and sockets.
2018-07-18 18:48:56 +00:00
//
// The `name` argument is only used to log a warning if we are returning a
// writeable+executable file. (A metric counter is incremented in this case as
// well.) Note that we cannot call d.BaseName() directly in this function,
// because that would lead to a lock order violation, since this is called in
// d.Create which holds d.mu, while d.BaseName() takes d.parent.mu, and the two
// locks must be taken in the opposite order.
func NewFile ( ctx context . Context , dirent * fs . Dirent , name string , flags fs . FileFlags , i * inodeOperations , handles * handles ) * fs . File {
2018-04-27 17:37:02 +00:00
// Remote file systems enforce readability/writability at an offset,
// see fs/9p/vfs_inode.c:v9fs_vfs_atomic_open -> fs/open.c:finish_open.
flags . Pread = true
flags . Pwrite = true
2018-08-07 18:42:29 +00:00
if fs . IsFile ( dirent . Inode . StableAttr ) {
// If cache policy is "remote revalidating", then we must
// ensure that we have a host FD. Otherwise, the
// sentry-internal page cache will be used, and we can end up
// in an inconsistent state if the remote file changes.
cp := dirent . Inode . InodeOperations . ( * inodeOperations ) . session ( ) . cachePolicy
if cp == cacheRemoteRevalidating && handles . Host == nil {
panic ( fmt . Sprintf ( "remote-revalidating cache policy requires gofer to donate host FD, but file %q did not have host FD" , name ) )
}
}
2018-04-27 17:37:02 +00:00
f := & fileOperations {
inodeOperations : i ,
handles : handles ,
flags : flags ,
}
if flags . Write {
if err := dirent . Inode . CheckPermission ( ctx , fs . PermMask { Execute : true } ) ; err == nil {
2019-02-13 20:06:20 +00:00
opensWX . Increment ( )
2018-04-27 17:37:02 +00:00
log . Warningf ( "Opened a writable executable: %q" , name )
}
}
2019-02-13 20:06:20 +00:00
if handles . Host != nil {
opensHost . Increment ( )
} else {
opens9P . Increment ( )
}
2018-04-27 17:37:02 +00:00
return fs . NewFile ( ctx , dirent , flags , f )
}
// Release implements fs.FileOpeations.Release.
func ( f * fileOperations ) Release ( ) {
f . handles . DecRef ( )
}
// Readdir implements fs.FileOperations.Readdir.
func ( f * fileOperations ) Readdir ( ctx context . Context , file * fs . File , serializer fs . DentrySerializer ) ( int64 , error ) {
root := fs . RootFromContext ( ctx )
2019-04-10 23:35:22 +00:00
if root != nil {
defer root . DecRef ( )
}
2018-04-27 17:37:02 +00:00
dirCtx := & fs . DirCtx {
Serializer : serializer ,
DirCursor : & f . dirCursor ,
}
n , err := fs . DirentReaddir ( ctx , file . Dirent , f , root , dirCtx , file . Offset ( ) )
2018-06-19 18:09:20 +00:00
if f . inodeOperations . session ( ) . cachePolicy . cacheUAttrs ( file . Dirent . Inode ) {
2018-04-27 17:37:02 +00:00
f . inodeOperations . cachingInodeOps . TouchAccessTime ( ctx , file . Dirent . Inode )
}
return n , err
}
// IterateDir implements fs.DirIterator.IterateDir.
2019-06-27 21:22:40 +00:00
func ( f * fileOperations ) IterateDir ( ctx context . Context , d * fs . Dirent , dirCtx * fs . DirCtx , offset int ) ( int , error ) {
2018-04-27 17:37:02 +00:00
f . inodeOperations . readdirMu . Lock ( )
defer f . inodeOperations . readdirMu . Unlock ( )
// Fetch directory entries if needed.
2018-06-19 18:09:20 +00:00
if ! f . inodeOperations . session ( ) . cachePolicy . cacheReaddir ( ) || f . inodeOperations . readdirCache == nil {
2018-04-27 17:37:02 +00:00
entries , err := f . readdirAll ( ctx )
if err != nil {
return offset , err
}
// Cache the readdir result.
f . inodeOperations . readdirCache = fs . NewSortedDentryMap ( entries )
}
// Serialize the entries.
n , err := fs . GenericReaddir ( dirCtx , f . inodeOperations . readdirCache )
return offset + n , err
}
// readdirAll fetches fs.DentAttrs for f, using the attributes of g.
func ( f * fileOperations ) readdirAll ( ctx context . Context ) ( map [ string ] fs . DentAttr , error ) {
entries := make ( map [ string ] fs . DentAttr )
var readOffset uint64
for {
// We choose some arbitrary high number of directory entries (64k) and call
// Readdir until we've exhausted them all.
dirents , err := f . handles . File . readdir ( ctx , readOffset , 64 * 1024 )
if err != nil {
return nil , err
}
if len ( dirents ) == 0 {
// We're done, we reached EOF.
break
}
// The last dirent contains the offset into the next set of dirents. The gofer
// returns the offset as an index into directories, not as a byte offset, because
// converting a byte offset to an index into directories entries is a huge pain.
// But everything is fine if we're consistent.
readOffset = dirents [ len ( dirents ) - 1 ] . Offset
for _ , dirent := range dirents {
if dirent . Name == "." || dirent . Name == ".." {
// These must not be included in Readdir results.
continue
}
// Find a best approximation of the type.
var nt fs . InodeType
switch dirent . Type {
case p9 . TypeDir :
nt = fs . Directory
case p9 . TypeSymlink :
nt = fs . Symlink
default :
nt = fs . RegularFile
}
// Install the DentAttr.
entries [ dirent . Name ] = fs . DentAttr {
Type : nt ,
// Construct the key to find the virtual inode.
// Directory entries reside on the same Device
// and SecondaryDevice as their parent.
InodeID : goferDevice . Map ( device . MultiDeviceKey {
Device : f . inodeOperations . fileState . key . Device ,
SecondaryDevice : f . inodeOperations . fileState . key . SecondaryDevice ,
Inode : dirent . QID . Path ,
} ) ,
}
}
}
return entries , nil
}
2019-10-16 21:29:20 +00:00
// maybeSync will call FSync on the file if either the cache policy or file
// flags require it.
func ( f * fileOperations ) maybeSync ( ctx context . Context , file * fs . File , offset , n int64 ) error {
if n == 0 {
// Nothing to sync.
return nil
}
if f . inodeOperations . session ( ) . cachePolicy . writeThrough ( file . Dirent . Inode ) {
// Call WriteOut directly, as some "writethrough" filesystems
// do not support sync.
return f . inodeOperations . cachingInodeOps . WriteOut ( ctx , file . Dirent . Inode )
}
flags := file . Flags ( )
var syncType fs . SyncType
switch {
case flags . Direct || flags . Sync :
syncType = fs . SyncAll
case flags . DSync :
syncType = fs . SyncData
default :
// No need to sync.
return nil
}
return f . Fsync ( ctx , file , offset , offset + n , syncType )
}
2018-04-27 17:37:02 +00:00
// Write implements fs.FileOperations.Write.
func ( f * fileOperations ) Write ( ctx context . Context , file * fs . File , src usermem . IOSequence , offset int64 ) ( int64 , error ) {
if fs . IsDir ( file . Dirent . Inode . StableAttr ) {
// Not all remote file systems enforce this so this client does.
return 0 , syserror . EISDIR
}
2019-10-16 21:29:20 +00:00
var (
n int64
err error
)
// The write is handled in different ways depending on the cache policy
// and availability of a host-mappable FD.
if f . inodeOperations . session ( ) . cachePolicy . useCachingInodeOps ( file . Dirent . Inode ) {
n , err = f . inodeOperations . cachingInodeOps . Write ( ctx , src , offset )
} else if f . inodeOperations . fileState . hostMappable != nil {
n , err = f . inodeOperations . fileState . hostMappable . Write ( ctx , src , offset )
} else {
n , err = src . CopyInTo ( ctx , f . handles . readWriterAt ( ctx , offset ) )
2018-04-27 17:37:02 +00:00
}
2019-10-16 21:29:20 +00:00
// We may need to sync the written bytes.
if syncErr := f . maybeSync ( ctx , file , offset , n ) ; syncErr != nil {
// Sync failed. Report 0 bytes written, since none of them are
// guaranteed to have been synced.
return 0 , syncErr
2019-01-31 20:53:00 +00:00
}
2019-10-16 21:29:20 +00:00
return n , err
2018-04-27 17:37:02 +00:00
}
2019-02-21 21:07:25 +00:00
// incrementReadCounters increments the read counters for the read starting at the given time. We
// use this function rather than using a defer in Read() to avoid the performance hit of defer.
func ( f * fileOperations ) incrementReadCounters ( start time . Time ) {
if f . handles . Host != nil {
readsHost . Increment ( )
fs . IncrementWait ( readWaitHost , start )
} else {
reads9P . Increment ( )
fs . IncrementWait ( readWait9P , start )
}
}
2018-04-27 17:37:02 +00:00
// Read implements fs.FileOperations.Read.
func ( f * fileOperations ) Read ( ctx context . Context , file * fs . File , dst usermem . IOSequence , offset int64 ) ( int64 , error ) {
2019-02-21 21:07:25 +00:00
var start time . Time
if fs . RecordWaitTime {
start = time . Now ( )
}
2018-04-27 17:37:02 +00:00
if fs . IsDir ( file . Dirent . Inode . StableAttr ) {
// Not all remote file systems enforce this so this client does.
2019-02-21 21:07:25 +00:00
f . incrementReadCounters ( start )
2018-04-27 17:37:02 +00:00
return 0 , syserror . EISDIR
}
2019-01-26 01:22:04 +00:00
if f . inodeOperations . session ( ) . cachePolicy . useCachingInodeOps ( file . Dirent . Inode ) {
2019-02-21 21:07:25 +00:00
n , err := f . inodeOperations . cachingInodeOps . Read ( ctx , file , dst , offset )
f . incrementReadCounters ( start )
return n , err
2018-04-27 17:37:02 +00:00
}
2019-02-21 21:07:25 +00:00
n , err := dst . CopyOutFrom ( ctx , f . handles . readWriterAt ( ctx , offset ) )
f . incrementReadCounters ( start )
return n , err
2018-04-27 17:37:02 +00:00
}
// Fsync implements fs.FileOperations.Fsync.
2019-10-16 21:29:20 +00:00
func ( f * fileOperations ) Fsync ( ctx context . Context , file * fs . File , start , end int64 , syncType fs . SyncType ) error {
2018-04-27 17:37:02 +00:00
switch syncType {
case fs . SyncAll , fs . SyncData :
if err := file . Dirent . Inode . WriteOut ( ctx ) ; err != nil {
return err
}
fallthrough
case fs . SyncBackingStorage :
// Sync remote caches.
if f . handles . Host != nil {
// Sync the host fd directly.
return syscall . Fsync ( f . handles . Host . FD ( ) )
}
// Otherwise sync on the p9.File handle.
return f . handles . File . fsync ( ctx )
}
panic ( "invalid sync type" )
}
// Flush implements fs.FileOperations.Flush.
func ( f * fileOperations ) Flush ( ctx context . Context , file * fs . File ) error {
// If this file is not opened writable then there is nothing to flush.
// We do this because some p9 server implementations of Flush are
// over-zealous.
//
2019-04-29 21:03:04 +00:00
// FIXME(edahlgren): weaken these implementations and remove this check.
2018-04-27 17:37:02 +00:00
if ! file . Flags ( ) . Write {
return nil
}
// Execute the flush.
return f . handles . File . flush ( ctx )
}
// ConfigureMMap implements fs.FileOperations.ConfigureMMap.
func ( f * fileOperations ) ConfigureMMap ( ctx context . Context , file * fs . File , opts * memmap . MMapOpts ) error {
2019-01-26 01:22:04 +00:00
return f . inodeOperations . configureMMap ( file , opts )
2018-04-27 17:37:02 +00:00
}
2019-04-11 07:41:42 +00:00
// UnstableAttr implements fs.FileOperations.UnstableAttr.
func ( f * fileOperations ) UnstableAttr ( ctx context . Context , file * fs . File ) ( fs . UnstableAttr , error ) {
s := f . inodeOperations . session ( )
if s . cachePolicy . cacheUAttrs ( file . Dirent . Inode ) {
return f . inodeOperations . cachingInodeOps . UnstableAttr ( ctx , file . Dirent . Inode )
}
// Use f.handles.File, which represents 9P fids that have been opened,
// instead of inodeFileState.file, which represents 9P fids that have not.
// This may be significantly more efficient in some implementations.
_ , valid , pattr , err := getattr ( ctx , f . handles . File )
if err != nil {
return fs . UnstableAttr { } , err
}
return unstable ( ctx , valid , pattr , s . mounter , s . client ) , nil
}
2018-04-27 17:37:02 +00:00
// Seek implements fs.FileOperations.Seek.
func ( f * fileOperations ) Seek ( ctx context . Context , file * fs . File , whence fs . SeekWhence , offset int64 ) ( int64 , error ) {
return fsutil . SeekWithDirCursor ( ctx , file , whence , offset , & f . dirCursor )
}