Migrate from using io.ReadSeeker to io.ReaderAt.

This provides the following benefits:
- We can now use the pkg/fd package, which does not take ownership
  of the file descriptor and hence does not close the fd when garbage
  collected. This reduces the scope for errors caused by unexpected
  garbage collection of an os.File.
- io.ReaderAt requires an explicit offset parameter in every read call.
  Reads neither modify nor depend on the fd offset, which removes the
  risk of reading from stale offsets (see the sketch below).
- We no longer need to serialize usage of a global file descriptor,
  so the mutual exclusion requirement is dropped, reducing complexity
  and lock contention.
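
For illustration, a minimal sketch of the two read patterns; the helper
names below are hypothetical and not part of this change:

import "io"

// readSeekerStyle shows the old pattern: Seek mutates the shared fd
// offset, so concurrent callers must be serialized externally and a
// missed Seek silently reads from a stale position.
func readSeekerStyle(dev io.ReadSeeker, off int64, buf []byte) (int, error) {
	if _, err := dev.Seek(off, io.SeekStart); err != nil {
		return 0, err
	}
	return io.ReadFull(dev, buf)
}

// readerAtStyle shows the new pattern: the offset is explicit in every
// call and no shared cursor exists, so calls are independent.
func readerAtStyle(dev io.ReaderAt, off int64, buf []byte) (int, error) {
	return dev.ReadAt(buf, off)
}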

PiperOrigin-RevId: 260635174
Ayush Ranjan 2019-07-29 20:11:24 -07:00 committed by gVisor bot
parent ddf25e3331
commit 8da9f8a12c
12 changed files with 55 additions and 96 deletions

View File

@@ -37,6 +37,7 @@ go_library(
deps = [
"//pkg/abi/linux",
"//pkg/binary",
"//pkg/fd",
"//pkg/sentry/context",
"//pkg/sentry/fs",
"//pkg/sentry/fs/ext/disklayout",

View File

@ -16,6 +16,7 @@ package ext
import (
"io"
"sync"
"gvisor.dev/gvisor/pkg/binary"
)
@@ -25,10 +26,13 @@ import (
type blockMapFile struct {
regFile regularFile
// mu serializes changes to fileToPhysBlks.
mu sync.RWMutex
// fileToPhysBlks maps the file block numbers to the physical block numbers.
// The physical block number for the (i)th file block is stored in the (i)th
// index. This is initialized (at max) with the first 12 entries. The rest
// have to be read in from disk when required.
// have to be read in from disk when required. Protected by mu.
fileToPhysBlks []uint32
}
@@ -36,7 +40,7 @@ type blockMapFile struct {
var _ fileReader = (*blockMapFile)(nil)
// getFileReader implements fileReader.getFileReader.
func (f *blockMapFile) getFileReader(dev io.ReadSeeker, blkSize uint64, offset uint64) io.Reader {
func (f *blockMapFile) getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader {
panic("unimplemented")
}

View File

@@ -19,9 +19,9 @@ import (
"errors"
"fmt"
"io"
"os"
"gvisor.dev/gvisor/pkg/abi/linux"
"gvisor.dev/gvisor/pkg/fd"
"gvisor.dev/gvisor/pkg/sentry/context"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/sentry/kernel/auth"
@@ -35,11 +35,11 @@ type filesystemType struct{}
// Compiles only if filesystemType implements vfs.FilesystemType.
var _ vfs.FilesystemType = (*filesystemType)(nil)
// getDeviceFd returns the read seeker to the underlying device.
// getDeviceFd returns an io.ReaderAt to the underlying device.
// Currently there are two ways of mounting an ext(2/3/4) fs:
// 1. Specify a mount with our internal special MountType in the OCI spec.
// 2. Expose the device to the container and mount it from application layer.
func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, error) {
func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReaderAt, error) {
if opts.InternalData == nil {
// User mount call.
// TODO(b/134676337): Open the device specified by `source` and return that.
@@ -47,20 +47,19 @@ func getDeviceFd(source string, opts vfs.NewFilesystemOptions) (io.ReadSeeker, e
}
// NewFilesystem call originated from within the sentry.
fd, ok := opts.InternalData.(uintptr)
devFd, ok := opts.InternalData.(int)
if !ok {
return nil, errors.New("internal data for ext fs must be a uintptr containing the file descriptor to device")
return nil, errors.New("internal data for ext fs must be an int containing the file descriptor to device")
}
// We do not close this file because that would close the underlying device
// file descriptor (which is required for reading the fs from disk).
// TODO(b/134676337): Use pkg/fd instead.
deviceFile := os.NewFile(fd, source)
if deviceFile == nil {
return nil, fmt.Errorf("ext4 device file descriptor is not valid: %d", fd)
if devFd < 0 {
return nil, fmt.Errorf("ext device file descriptor is not valid: %d", devFd)
}
return deviceFile, nil
// The fd.ReadWriter returned from fd.NewReadWriter() does not take ownership
// of the file descriptor and hence will not close it when it is garbage
// collected.
return fd.NewReadWriter(devFd), nil
}
// NewFilesystem implements vfs.FilesystemType.NewFilesystem.
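
A minimal usage sketch of the non-owning wrapper returned above, assuming
pkg/fd's documented semantics; the helper name and the inline superblock
read are illustrative only (the real code goes through readFromDisk):

import (
	"fmt"

	"gvisor.dev/gvisor/pkg/fd"
)

// readSuperBlockBytes wraps devFd without taking ownership, so garbage
// collection of the wrapper never closes the device descriptor; whoever
// handed us devFd stays responsible for closing it.
func readSuperBlockBytes(devFd int) ([]byte, error) {
	dev := fd.NewReadWriter(devFd) // non-owning wrapper around devFd
	buf := make([]byte, 1024)
	// 1024 is the standard ext superblock offset (disklayout.SbOffset).
	if _, err := dev.ReadAt(buf, 1024); err != nil {
		return nil, fmt.Errorf("reading ext superblock: %v", err)
	}
	return buf, nil
}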

View File

@@ -69,7 +69,7 @@ func setUp(t *testing.T, imagePath string) (context.Context, *vfs.Filesystem, *v
// Mount the ext4 fs and retrieve the inode structure for the file.
mockCtx := contexttest.Context(t)
fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: f.Fd()})
fs, d, err := filesystemType{}.NewFilesystem(mockCtx, nil, localImagePath, vfs.NewFilesystemOptions{InternalData: int(f.Fd())})
if err != nil {
f.Close()
return nil, nil, nil, nil, err

View File

@@ -36,7 +36,7 @@ type extentFile struct {
var _ fileReader = (*extentFile)(nil)
// getFileReader implements fileReader.getFileReader.
func (f *extentFile) getFileReader(dev io.ReadSeeker, blkSize uint64, offset uint64) io.Reader {
func (f *extentFile) getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader {
return &extentReader{
dev: dev,
file: f,
@@ -47,10 +47,8 @@ func (f *extentFile) getFileReader(dev io.ReadSeeker, blkSize uint64, offset uin
// newExtentFile is the extent file constructor. It reads the entire extent
// tree into memory.
//
// Preconditions: Must hold the mutex of the filesystem containing dev.
// TODO(b/134676337): Build extent tree on demand to reduce memory usage.
func newExtentFile(dev io.ReadSeeker, blkSize uint64, regFile regularFile) (*extentFile, error) {
func newExtentFile(dev io.ReaderAt, blkSize uint64, regFile regularFile) (*extentFile, error) {
file := &extentFile{regFile: regFile}
file.regFile.impl = file
err := file.buildExtTree(dev, blkSize)
@@ -65,10 +63,8 @@ func newExtentFile(dev io.ReadSeeker, blkSize uint64, regFile regularFile) (*ext
// memory. Then it recursively builds the rest of the tree by reading it off
// disk.
//
// Preconditions:
// - Must hold the mutex of the filesystem containing dev.
// - Inode flag InExtents must be set.
func (f *extentFile) buildExtTree(dev io.ReadSeeker, blkSize uint64) error {
// Precondition: inode flag InExtents must be set.
func (f *extentFile) buildExtTree(dev io.ReaderAt, blkSize uint64) error {
rootNodeData := f.regFile.inode.diskInode.Data()
binary.Unmarshal(rootNodeData[:disklayout.ExtentStructsSize], binary.LittleEndian, &f.root.Header)
@@ -110,9 +106,7 @@ func (f *extentFile) buildExtTree(dev io.ReadSeeker, blkSize uint64) error {
// buildExtTreeFromDisk reads the extent tree nodes from disk and recursively
// builds the tree. Performs a simple DFS. It returns the ExtentNode pointed to
// by the ExtentEntry.
//
// Preconditions: Must hold the mutex of the filesystem containing dev.
func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) {
func buildExtTreeFromDisk(dev io.ReaderAt, entry disklayout.ExtentEntry, blkSize uint64) (*disklayout.ExtentNode, error) {
var header disklayout.ExtentHeader
off := entry.PhysicalBlock() * blkSize
err := readFromDisk(dev, int64(off), &header)
@@ -155,7 +149,7 @@ func buildExtTreeFromDisk(dev io.ReadSeeker, entry disklayout.ExtentEntry, blkSi
// extentReader implements io.Reader which can traverse the extent tree and
// read file data. This is not thread safe.
type extentReader struct {
dev io.ReadSeeker
dev io.ReaderAt
file *extentFile
fileOff uint64 // Represents the current file offset being read from.
blkSize uint64
@@ -247,9 +241,12 @@ func (r *extentReader) readFromExtent(ex *disklayout.Extent, dst []byte) (int, e
toRead = len(dst)
}
n, err := readFull(r.dev, int64(readStart), dst[:toRead])
n, _ := r.dev.ReadAt(dst[:toRead], int64(readStart))
r.fileOff += uint64(n)
return n, err
if n < toRead {
return n, syserror.EIO
}
return n, nil
}
// fileBlock returns the file block number we are currently reading.
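
The ReadAt call in readFromExtent above discards the error deliberately:
io.ReaderAt returns a non-nil error whenever it reads fewer than len(p)
bytes, so a short count by itself signals failure. A standalone sketch of
the same pattern, with a hypothetical helper name:

import (
	"io"

	"gvisor.dev/gvisor/pkg/syserror"
)

// readFullAt reads exactly len(dst) bytes at offset off, mapping any
// short read to EIO. Per io.ReaderAt's contract, n < len(dst) implies
// ReadAt returned a non-nil error, so the count check alone suffices.
func readFullAt(dev io.ReaderAt, off int64, dst []byte) (int, error) {
	n, _ := dev.ReadAt(dst, off)
	if n < len(dst) {
		return n, syserror.EIO
	}
	return n, nil
}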

View File

@@ -201,7 +201,7 @@ func TestBuildExtentTree(t *testing.T) {
// extentTreeSetUp writes the passed extent tree to a mock disk as an extent
// tree. It also constructs a mock extent file with the same tree built in
// it, writes random file data, and returns it.
func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (io.ReadSeeker, *extentFile, []byte) {
func extentTreeSetUp(t *testing.T, root *disklayout.ExtentNode) (io.ReaderAt, *extentFile, []byte) {
t.Helper()
mockDisk := make([]byte, mockExtentBlkSize*10)

View File

@@ -31,22 +31,16 @@ type filesystem struct {
vfsfs vfs.Filesystem
// mu serializes changes to the Dentry tree and the usage of the read seeker.
mu sync.Mutex
// mu serializes changes to the Dentry tree.
mu sync.RWMutex
// dev is the ReadSeeker for the underlying fs device. It is protected by mu.
//
// The ext filesystems aim to maximize locality, i.e. place all the data
// blocks of a file close together. On a spinning disk, locality reduces the
// amount of movement of the head hence speeding up IO operations. On an SSD
// there are no moving parts but locality increases the size of each transfer
// request. Hence, having mutual exclusion on the read seeker while reading a
// file *should* help in achieving the intended performance gains.
//
// Note: This synchronization was not coupled with the ReadSeeker itself
// because we want to synchronize across read/seek operations for the
// performance gains mentioned above. Helps enforcing one-file-at-a-time IO.
dev io.ReadSeeker
// dev is the io.ReaderAt for the underlying fs device. It does not require
// protection because io.ReaderAt permits concurrent read calls to it. It
// translates to the pread syscall which passes on the read request directly
// to the device driver. Device drivers are intelligent in serving multiple
// concurrent read requests in the optimal order (taking locality into
// consideration).
dev io.ReaderAt
// inodeCache maps absolute inode numbers to the corresponding Inode struct.
// Inodes should be removed from this once their reference count hits 0.
@@ -69,7 +63,7 @@ var _ vfs.FilesystemImpl = (*filesystem)(nil)
// getOrCreateInode gets the inode corresponding to the inode number passed in.
// It creates a new one with the given inode number if one does not exist.
//
// Preconditions: must be holding fs.mu.
// Precondition: must be holding fs.mu.
func (fs *filesystem) getOrCreateInode(ctx context.Context, inodeNum uint32) (*inode, error) {
if in, ok := fs.inodeCache[inodeNum]; ok {
return in, nil
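
A toy sketch of the concurrency claim in the dev comment above, assuming
ReadAt is backed by pread(2) as fd.ReadWriter's is; the helper is
illustrative only:

import (
	"io"
	"sync"
)

// readBlocksConcurrently issues independent ReadAt calls from several
// goroutines. Each call carries its own offset, so there is no shared
// file cursor to guard with fs.mu.
func readBlocksConcurrently(dev io.ReaderAt, blkSize uint64, blks []uint64) {
	var wg sync.WaitGroup
	for _, b := range blks {
		wg.Add(1)
		go func(blk uint64) {
			defer wg.Done()
			buf := make([]byte, blkSize)
			dev.ReadAt(buf, int64(blk*blkSize)) // pread-style, lock-free
		}(b)
	}
	wg.Wait()
}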

View File

@@ -28,7 +28,7 @@ type inlineFile struct {
var _ fileReader = (*inlineFile)(nil)
// getFileReader implements fileReader.getFileReader.
func (f *inlineFile) getFileReader(_ io.ReadSeeker, _ uint64, offset uint64) io.Reader {
func (f *inlineFile) getFileReader(_ io.ReaderAt, _ uint64, offset uint64) io.Reader {
diskInode := f.regFile.inode.diskInode
return &inlineReader{offset: offset, data: diskInode.Data()[:diskInode.Size()]}
}

View File

@@ -75,7 +75,7 @@ func (in *inode) tryIncRef() bool {
// decRef decrements the inode ref count and releases the inode resources if
// the ref count hits 0.
//
// Preconditions: Must have locked fs.mu.
// Precondition: Must have locked fs.mu.
func (in *inode) decRef(fs *filesystem) {
if refs := atomic.AddInt64(&in.refs, -1); refs == 0 {
delete(fs.inodeCache, in.inodeNum)
@@ -86,9 +86,7 @@ func (in *inode) decRef(fs *filesystem) {
// newInode is the inode constructor. Reads the inode off disk. Identifies
// inodes based on the absolute inode number on disk.
//
// Preconditions: Must hold the mutex of the filesystem containing dev.
func newInode(ctx context.Context, dev io.ReadSeeker, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) {
func newInode(ctx context.Context, dev io.ReaderAt, sb disklayout.SuperBlock, bgs []disklayout.BlockGroup, inodeNum uint32) (*inode, error) {
if inodeNum == 0 {
panic("inode number 0 on ext filesystems is not possible")
}

View File

@@ -29,10 +29,7 @@ type fileReader interface {
//
// This reader is not meant to be retained across Read operations as it needs
// to be reinitialized with the correct offset for every Read.
//
// Precondition: Must hold the mutex of the filesystem containing dev while
// using the Reader.
getFileReader(dev io.ReadSeeker, blkSize uint64, offset uint64) io.Reader
getFileReader(dev io.ReaderAt, blkSize uint64, offset uint64) io.Reader
}
// regularFile represents a regular file's inode. This too follows the
@@ -48,9 +45,7 @@ type regularFile struct {
// newRegularFile is the regularFile constructor. It figures out what kind of
// file this is and initializes the fileReader.
//
// Preconditions: Must hold the mutex of the filesystem containing dev.
func newRegularFile(dev io.ReadSeeker, blkSize uint64, inode inode) (*regularFile, error) {
func newRegularFile(dev io.ReaderAt, blkSize uint64, inode inode) (*regularFile, error) {
regFile := regularFile{
inode: inode,
}
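
A hypothetical caller sketch for the fileReader contract above; readAt is
not part of this change and only illustrates the one-reader-per-Read usage
the interface comment prescribes:

import "io"

// readAt demonstrates the intended usage: getFileReader returns a fresh
// io.Reader already positioned at offset, which is discarded after the
// read instead of being retained across calls.
func readAt(f fileReader, dev io.ReaderAt, blkSize, offset uint64, dst []byte) (int, error) {
	r := f.getFileReader(dev, blkSize, offset)
	return io.ReadFull(r, dst)
}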

View File

@@ -28,9 +28,7 @@ type symlink struct {
// newSymlink is the symlink constructor. It reads out the symlink target from
// the inode (however it might have been stored).
//
// Preconditions: Must hold the mutex of the filesystem containing dev.
func newSymlink(dev io.ReadSeeker, blkSize uint64, inode inode) (*symlink, error) {
func newSymlink(dev io.ReaderAt, blkSize uint64, inode inode) (*symlink, error) {
var file *symlink
var link []byte

View File

@@ -15,55 +15,30 @@
package ext
import (
"encoding/binary"
"io"
"gvisor.dev/gvisor/pkg/binary"
"gvisor.dev/gvisor/pkg/sentry/fs/ext/disklayout"
"gvisor.dev/gvisor/pkg/syserror"
)
// readFromDisk performs a binary read from disk into the given struct from
// the absolute offset provided.
//
// All disk reads should use this helper so we avoid reading from stale
// previously used offsets. This function forces the offset parameter.
//
// Precondition: Must hold the mutex of the filesystem containing dev.
func readFromDisk(dev io.ReadSeeker, abOff int64, v interface{}) error {
if _, err := dev.Seek(abOff, io.SeekStart); err != nil {
return syserror.EIO
}
if err := binary.Read(dev, binary.LittleEndian, v); err != nil {
func readFromDisk(dev io.ReaderAt, abOff int64, v interface{}) error {
n := binary.Size(v)
buf := make([]byte, n)
if read, _ := dev.ReadAt(buf, abOff); read < int(n) {
return syserror.EIO
}
binary.Unmarshal(buf, binary.LittleEndian, v)
return nil
}
// readFull is a wrapper around io.ReadFull which enforces the absolute offset
// parameter so that we can ensure that we do not perform incorrect reads from
// stale previously used offsets.
//
// Precondition: Must hold the mutex of the filesystem containing dev.
func readFull(dev io.ReadSeeker, abOff int64, dst []byte) (int, error) {
if _, err := dev.Seek(abOff, io.SeekStart); err != nil {
return 0, syserror.EIO
}
n, err := io.ReadFull(dev, dst)
if err != nil {
err = syserror.EIO
}
return n, err
}
// readSuperBlock reads the SuperBlock from block group 0 in the underlying
// device. There are three versions of the superblock. This function identifies
// and returns the correct version.
//
// Precondition: Must hold the mutex of the filesystem containing dev.
func readSuperBlock(dev io.ReadSeeker) (disklayout.SuperBlock, error) {
func readSuperBlock(dev io.ReaderAt) (disklayout.SuperBlock, error) {
var sb disklayout.SuperBlock = &disklayout.SuperBlockOld{}
if err := readFromDisk(dev, disklayout.SbOffset, sb); err != nil {
return nil, err
@@ -98,9 +73,7 @@ func blockGroupsCount(sb disklayout.SuperBlock) uint64 {
// readBlockGroups reads the block group descriptor table from block group 0 in
// the underlying device.
//
// Precondition: Must hold the mutex of the filesystem containing dev.
func readBlockGroups(dev io.ReadSeeker, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
func readBlockGroups(dev io.ReaderAt, sb disklayout.SuperBlock) ([]disklayout.BlockGroup, error) {
bgCount := blockGroupsCount(sb)
bgdSize := uint64(sb.BgDescSize())
is64Bit := sb.IncompatibleFeatures().Is64Bit