// Copyright 2019 The gVisor Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package tmpfs

import (
	"fmt"
	"io"
	"math"
	"sync/atomic"

	"gvisor.dev/gvisor/pkg/abi/linux"
	"gvisor.dev/gvisor/pkg/context"
	"gvisor.dev/gvisor/pkg/safemem"
	"gvisor.dev/gvisor/pkg/sentry/fs"
	"gvisor.dev/gvisor/pkg/sentry/fs/fsutil"
	"gvisor.dev/gvisor/pkg/sentry/fs/lock"
	"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
	"gvisor.dev/gvisor/pkg/sentry/memmap"
	"gvisor.dev/gvisor/pkg/sentry/pgalloc"
	"gvisor.dev/gvisor/pkg/sentry/usage"
	"gvisor.dev/gvisor/pkg/sentry/vfs"
	"gvisor.dev/gvisor/pkg/sync"
	"gvisor.dev/gvisor/pkg/syserror"
	"gvisor.dev/gvisor/pkg/usermem"
)
// regularFile is a regular (=S_IFREG) tmpfs file.
type regularFile struct {
	inode inode

	// memFile is a platform.File used to allocate pages to this regularFile.
	memFile *pgalloc.MemoryFile

	// mapsMu protects mappings.
	mapsMu sync.Mutex `state:"nosave"`

	// mappings tracks mappings of the file into memmap.MappingSpaces.
	//
	// Protected by mapsMu.
	mappings memmap.MappingSet

	// writableMappingPages tracks how many pages of virtual memory are mapped
	// as potentially writable from this file. If a page has multiple mappings,
	// each mapping is counted separately.
	//
	// This counter is susceptible to overflow as we can potentially count
	// mappings from many VMAs. We count pages rather than bytes to slightly
	// mitigate this.
	//
	// Protected by mapsMu.
	writableMappingPages uint64

	// dataMu protects the fields below.
	dataMu sync.RWMutex

	// data maps offsets into the file to offsets into memFile that store
	// the file's data.
	//
	// Protected by dataMu.
	data fsutil.FileRangeSet

	// seals represents file seals on this inode.
	//
	// Protected by dataMu.
	seals uint32

	// size is the size of data.
	//
	// Protected by both dataMu and inode.mu; reading it requires holding
	// either mutex, while writing requires holding both AND using atomics.
	// Readers that do not require consistency (like Stat) may read the
	// value atomically without holding either lock.
	size uint64
}
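// newRegularFile creates a new regular file with the given mode and returns
// its inode.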
func (fs *filesystem) newRegularFile(creds *auth.Credentials, mode linux.FileMode) *inode {
	file := &regularFile{
		memFile: fs.memFile,
	}
	file.inode.init(file, fs, creds, linux.S_IFREG|mode)
	file.inode.nlink = 1 // from parent directory
	return &file.inode
}
// truncate grows or shrinks the file to the given size. It returns true if
// the file size was updated.
func (rf *regularFile) truncate(newSize uint64) (bool, error) {
	rf.inode.mu.Lock()
	defer rf.inode.mu.Unlock()
	return rf.truncateLocked(newSize)
}

// Preconditions: rf.inode.mu must be held.
func (rf *regularFile) truncateLocked(newSize uint64) (bool, error) {
	oldSize := rf.size
	if newSize == oldSize {
		// Nothing to do.
		return false, nil
	}

	// Need to hold inode.mu and dataMu while modifying size.
	rf.dataMu.Lock()
	if newSize > oldSize {
		// Can we grow the file?
		if rf.seals&linux.F_SEAL_GROW != 0 {
			rf.dataMu.Unlock()
			return false, syserror.EPERM
		}
		// We only need to update the file size.
		atomic.StoreUint64(&rf.size, newSize)
		rf.dataMu.Unlock()
		return true, nil
	}

	// We are shrinking the file. First check if this is allowed.
	if rf.seals&linux.F_SEAL_SHRINK != 0 {
		rf.dataMu.Unlock()
		return false, syserror.EPERM
	}

	// Update the file size.
	atomic.StoreUint64(&rf.size, newSize)
	rf.dataMu.Unlock()

	// Invalidate past translations of truncated pages.
	oldpgend := fs.OffsetPageEnd(int64(oldSize))
	newpgend := fs.OffsetPageEnd(int64(newSize))
	if newpgend < oldpgend {
		rf.mapsMu.Lock()
		rf.mappings.Invalidate(memmap.MappableRange{newpgend, oldpgend}, memmap.InvalidateOpts{
			// Compare Linux's mm/shmem.c:shmem_setattr() =>
			// mm/memory.c:unmap_mapping_range(evencows=1).
			InvalidatePrivate: true,
		})
		rf.mapsMu.Unlock()
	}

	// We are now guaranteed that there are no translations of truncated
	// pages, and can remove them.
	rf.dataMu.Lock()
	rf.data.Truncate(newSize, rf.memFile)
	rf.dataMu.Unlock()
	return true, nil
}
// AddMapping implements memmap.Mappable.AddMapping.
func (rf *regularFile) AddMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) error {
	rf.mapsMu.Lock()
	defer rf.mapsMu.Unlock()
	rf.dataMu.RLock()
	defer rf.dataMu.RUnlock()

	// Reject writable mapping if F_SEAL_WRITE is set.
	if rf.seals&linux.F_SEAL_WRITE != 0 && writable {
		return syserror.EPERM
	}

	rf.mappings.AddMapping(ms, ar, offset, writable)
	if writable {
		pagesBefore := rf.writableMappingPages

		// ar is guaranteed to be page aligned per memmap.Mappable.
		rf.writableMappingPages += uint64(ar.Length() / usermem.PageSize)

		if rf.writableMappingPages < pagesBefore {
			panic(fmt.Sprintf("Overflow while mapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
		}
	}

	return nil
}

// RemoveMapping implements memmap.Mappable.RemoveMapping.
func (rf *regularFile) RemoveMapping(ctx context.Context, ms memmap.MappingSpace, ar usermem.AddrRange, offset uint64, writable bool) {
	rf.mapsMu.Lock()
	defer rf.mapsMu.Unlock()

	rf.mappings.RemoveMapping(ms, ar, offset, writable)

	if writable {
		pagesBefore := rf.writableMappingPages

		// ar is guaranteed to be page aligned per memmap.Mappable.
		rf.writableMappingPages -= uint64(ar.Length() / usermem.PageSize)

		if rf.writableMappingPages > pagesBefore {
			panic(fmt.Sprintf("Underflow while unmapping potentially writable pages pointing to a tmpfs file. Before %v, after %v", pagesBefore, rf.writableMappingPages))
		}
	}
}

// CopyMapping implements memmap.Mappable.CopyMapping.
func (rf *regularFile) CopyMapping(ctx context.Context, ms memmap.MappingSpace, srcAR, dstAR usermem.AddrRange, offset uint64, writable bool) error {
	return rf.AddMapping(ctx, ms, dstAR, offset, writable)
}

// Translate implements memmap.Mappable.Translate.
func (rf *regularFile) Translate(ctx context.Context, required, optional memmap.MappableRange, at usermem.AccessType) ([]memmap.Translation, error) {
	rf.dataMu.Lock()
	defer rf.dataMu.Unlock()

	// Constrain translations to rf.size (rounded up) to prevent translation
	// to pages that may be concurrently truncated.
	pgend := fs.OffsetPageEnd(int64(rf.size))
	var beyondEOF bool
	if required.End > pgend {
		if required.Start >= pgend {
			return nil, &memmap.BusError{io.EOF}
		}
		beyondEOF = true
		required.End = pgend
	}
	if optional.End > pgend {
		optional.End = pgend
	}

	cerr := rf.data.Fill(ctx, required, optional, rf.memFile, usage.Tmpfs, func(_ context.Context, dsts safemem.BlockSeq, _ uint64) (uint64, error) {
		// Newly-allocated pages are zeroed, so we don't need to do anything.
		return dsts.NumBytes(), nil
	})

	var ts []memmap.Translation
	var translatedEnd uint64
	for seg := rf.data.FindSegment(required.Start); seg.Ok() && seg.Start() < required.End; seg, _ = seg.NextNonEmpty() {
		segMR := seg.Range().Intersect(optional)
		ts = append(ts, memmap.Translation{
			Source: segMR,
			File:   rf.memFile,
			Offset: seg.FileRangeOf(segMR).Start,
			Perms:  usermem.AnyAccess,
		})
		translatedEnd = segMR.End
	}

	// Don't return the error returned by rf.data.Fill if it occurred outside
	// of required.
	if translatedEnd < required.End && cerr != nil {
		return ts, &memmap.BusError{cerr}
	}
	if beyondEOF {
		return ts, &memmap.BusError{io.EOF}
	}
	return ts, nil
}
// InvalidateUnsavable implements memmap.Mappable.InvalidateUnsavable.
func (*regularFile) InvalidateUnsavable(context.Context) error {
	return nil
}
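// regularFileFD implements vfs.FileDescriptionImpl for regular tmpfs files.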
type regularFileFD struct {
	fileDescription

	// off is the file offset. off is accessed using atomic memory operations.
	// offMu serializes operations that may mutate off.
	off   int64
	offMu sync.Mutex
}

// Release implements vfs.FileDescriptionImpl.Release.
func (fd *regularFileFD) Release() {
	// noop
}
// PRead implements vfs.FileDescriptionImpl.PRead.
func (fd *regularFileFD) PRead(ctx context.Context, dst usermem.IOSequence, offset int64, opts vfs.ReadOptions) (int64, error) {
	if offset < 0 {
		return 0, syserror.EINVAL
	}
	if dst.NumBytes() == 0 {
		return 0, nil
	}
	f := fd.inode().impl.(*regularFile)
	rw := getRegularFileReadWriter(f, offset)
	n, err := dst.CopyOutFrom(ctx, rw)
	putRegularFileReadWriter(rw)
	fd.inode().touchAtime(fd.vfsfd.Mount())
	return n, err
}

// Read implements vfs.FileDescriptionImpl.Read.
func (fd *regularFileFD) Read(ctx context.Context, dst usermem.IOSequence, opts vfs.ReadOptions) (int64, error) {
	fd.offMu.Lock()
	n, err := fd.PRead(ctx, dst, fd.off, opts)
	fd.off += n
	fd.offMu.Unlock()
	return n, err
}

// PWrite implements vfs.FileDescriptionImpl.PWrite.
func (fd *regularFileFD) PWrite(ctx context.Context, src usermem.IOSequence, offset int64, opts vfs.WriteOptions) (int64, error) {
	if offset < 0 {
		return 0, syserror.EINVAL
	}
	srclen := src.NumBytes()
	if srclen == 0 {
		return 0, nil
	}
	f := fd.inode().impl.(*regularFile)
	if end := offset + srclen; end < offset {
		// Overflow.
		return 0, syserror.EFBIG
	}

	var err error
	srclen, err = vfs.CheckLimit(ctx, offset, srclen)
	if err != nil {
		return 0, err
	}
	src = src.TakeFirst64(srclen)
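	// inode.mu must be held across the write: WriteFromBlocks (reached via
	// CopyInTo below) and touchCMtimeLocked both require it.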
	f.inode.mu.Lock()
	rw := getRegularFileReadWriter(f, offset)
	n, err := src.CopyInTo(ctx, rw)
	fd.inode().touchCMtimeLocked()
	f.inode.mu.Unlock()
	putRegularFileReadWriter(rw)
	return n, err
}

// Write implements vfs.FileDescriptionImpl.Write.
func (fd *regularFileFD) Write(ctx context.Context, src usermem.IOSequence, opts vfs.WriteOptions) (int64, error) {
	fd.offMu.Lock()
	n, err := fd.PWrite(ctx, src, fd.off, opts)
	fd.off += n
	fd.offMu.Unlock()
	return n, err
}
// Seek implements vfs.FileDescriptionImpl.Seek.
func (fd *regularFileFD) Seek(ctx context.Context, offset int64, whence int32) (int64, error) {
	fd.offMu.Lock()
	defer fd.offMu.Unlock()
	switch whence {
	case linux.SEEK_SET:
		// use offset as specified
	case linux.SEEK_CUR:
		offset += fd.off
	case linux.SEEK_END:
		offset += int64(atomic.LoadUint64(&fd.inode().impl.(*regularFile).size))
	default:
		return 0, syserror.EINVAL
	}
	if offset < 0 {
		return 0, syserror.EINVAL
	}
	fd.off = offset
	return offset, nil
}

// Sync implements vfs.FileDescriptionImpl.Sync.
func (fd *regularFileFD) Sync(ctx context.Context) error {
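	// tmpfs files are memory-backed; there is no backing store to sync to.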
	return nil
}
// LockBSD implements vfs.FileDescriptionImpl.LockBSD.
func (fd *regularFileFD) LockBSD(ctx context.Context, uid lock.UniqueID, t lock.LockType, block lock.Blocker) error {
	return fd.inode().lockBSD(uid, t, block)
}

// UnlockBSD implements vfs.FileDescriptionImpl.UnlockBSD.
func (fd *regularFileFD) UnlockBSD(ctx context.Context, uid lock.UniqueID) error {
	fd.inode().unlockBSD(uid)
	return nil
}

// LockPOSIX implements vfs.FileDescriptionImpl.LockPOSIX.
func (fd *regularFileFD) LockPOSIX(ctx context.Context, uid lock.UniqueID, t lock.LockType, rng lock.LockRange, block lock.Blocker) error {
	return fd.inode().lockPOSIX(uid, t, rng, block)
}

// UnlockPOSIX implements vfs.FileDescriptionImpl.UnlockPOSIX.
func (fd *regularFileFD) UnlockPOSIX(ctx context.Context, uid lock.UniqueID, rng lock.LockRange) error {
	fd.inode().unlockPOSIX(uid, rng)
	return nil
}

// ConfigureMMap implements vfs.FileDescriptionImpl.ConfigureMMap.
func (fd *regularFileFD) ConfigureMMap(ctx context.Context, opts *memmap.MMapOpts) error {
	file := fd.inode().impl.(*regularFile)
	return vfs.GenericConfigureMMap(&fd.vfsfd, file, opts)
}
// regularFileReadWriter implements safemem.Reader and safemem.Writer.
type regularFileReadWriter struct {
	file *regularFile

	// Offset into the file to read/write at. Note that this may be
	// different from the FD offset if PRead/PWrite is used.
	off uint64
}
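// regularFileReadWriterPool caches regularFileReadWriters so that one does
// not have to be allocated for every read or write call.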
var regularFileReadWriterPool = sync.Pool{
	New: func() interface{} {
		return &regularFileReadWriter{}
	},
}
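// getRegularFileReadWriter returns a pooled regularFileReadWriter initialized
// to access file starting at offset.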
func getRegularFileReadWriter(file *regularFile, offset int64) *regularFileReadWriter {
	rw := regularFileReadWriterPool.Get().(*regularFileReadWriter)
	rw.file = file
	rw.off = uint64(offset)
	return rw
}

func putRegularFileReadWriter(rw *regularFileReadWriter) {
	rw.file = nil
	regularFileReadWriterPool.Put(rw)
}
// ReadToBlocks implements safemem.Reader.ReadToBlocks.
func (rw *regularFileReadWriter) ReadToBlocks(dsts safemem.BlockSeq) (uint64, error) {
	rw.file.dataMu.RLock()
	defer rw.file.dataMu.RUnlock()
	size := rw.file.size

	// Compute the range to read (limited by file size and overflow-checked).
	if rw.off >= size {
		return 0, io.EOF
	}
	end := size
	if rend := rw.off + dsts.NumBytes(); rend > rw.off && rend < end {
		end = rend
	}

	var done uint64
	seg, gap := rw.file.data.Find(uint64(rw.off))
	for rw.off < end {
		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
		switch {
		case seg.Ok():
			// Get internal mappings.
			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Read)
			if err != nil {
				return done, err
			}

			// Copy from internal mappings.
			n, err := safemem.CopySeq(dsts, ims)
			done += n
			rw.off += uint64(n)
			dsts = dsts.DropFirst64(n)
			if err != nil {
				return done, err
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Tmpfs holes are zero-filled.
			gapmr := gap.Range().Intersect(mr)
			dst := dsts.TakeFirst64(gapmr.Length())
			n, err := safemem.ZeroSeq(dst)
			done += n
			rw.off += uint64(n)
			dsts = dsts.DropFirst64(n)
			if err != nil {
				return done, err
			}

			// Continue.
			seg, gap = gap.NextSegment(), fsutil.FileRangeGapIterator{}
		}
	}
	return done, nil
}
// WriteFromBlocks implements safemem.Writer.WriteFromBlocks.
//
// Preconditions: inode.mu must be held.
func (rw *regularFileReadWriter) WriteFromBlocks(srcs safemem.BlockSeq) (uint64, error) {
	// Hold dataMu so we can modify size.
	rw.file.dataMu.Lock()
	defer rw.file.dataMu.Unlock()

	// Compute the range to write (overflow-checked).
	end := rw.off + srcs.NumBytes()
	if end <= rw.off {
		end = math.MaxInt64
	}

	// Check if seals prevent either file growth or all writes.
	switch {
	case rw.file.seals&linux.F_SEAL_WRITE != 0: // Write sealed
		return 0, syserror.EPERM
	case end > rw.file.size && rw.file.seals&linux.F_SEAL_GROW != 0: // Grow sealed
		// When growth is sealed, Linux effectively allows writes which would
		// normally grow the file to partially succeed up to the current EOF,
		// rounded down to the page boundary before the EOF.
		//
		// This happens because writes (and thus the growth check) for tmpfs
		// files proceed page-by-page on Linux, and the final write to the
		// page containing EOF fails, resulting in a partial write up to the
		// start of that page.
		//
		// To emulate this behaviour, artificially truncate the write to the
		// start of the page containing the current EOF.
		//
		// See Linux, mm/filemap.c:generic_perform_write() and
		// mm/shmem.c:shmem_write_begin().
		if pgstart := uint64(usermem.Addr(rw.file.size).RoundDown()); end > pgstart {
			end = pgstart
		}
		if end <= rw.off {
			// Truncation would result in no data being written.
			return 0, syserror.EPERM
		}
	}

	// Page-aligned mr for when we need to allocate memory. RoundUp can't
	// overflow since end is an int64.
	pgstartaddr := usermem.Addr(rw.off).RoundDown()
	pgendaddr, _ := usermem.Addr(end).RoundUp()
	pgMR := memmap.MappableRange{uint64(pgstartaddr), uint64(pgendaddr)}

	var (
		done   uint64
		retErr error
	)
	seg, gap := rw.file.data.Find(uint64(rw.off))
	for rw.off < end {
		mr := memmap.MappableRange{uint64(rw.off), uint64(end)}
		switch {
		case seg.Ok():
			// Get internal mappings.
			ims, err := rw.file.memFile.MapInternal(seg.FileRangeOf(seg.Range().Intersect(mr)), usermem.Write)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Copy to internal mappings.
			n, err := safemem.CopySeq(ims, srcs)
			done += n
			rw.off += uint64(n)
			srcs = srcs.DropFirst64(n)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Continue.
			seg, gap = seg.NextNonEmpty()

		case gap.Ok():
			// Allocate memory for the write.
			gapMR := gap.Range().Intersect(pgMR)
			fr, err := rw.file.memFile.Allocate(gapMR.Length(), usage.Tmpfs)
			if err != nil {
				retErr = err
				goto exitLoop
			}

			// Write to that memory as usual.
			seg, gap = rw.file.data.Insert(gap, gapMR, fr.Start), fsutil.FileRangeGapIterator{}
		}
	}
exitLoop:
	// If the write ends beyond the file's previous size, it causes the file
	// to grow. Per regularFile.size's locking comment, the update must be
	// atomic since readers like Stat load it without holding dataMu.
	if rw.off > rw.file.size {
		atomic.StoreUint64(&rw.file.size, rw.off)
	}

	return done, retErr
}