Use FD limit and file size limit from host

The FD limit and file size limit are read from the host instead
of using hard-coded defaults, given that they affect the sandbox
process. Also limit the dirent cache to use no more than half
of the available FDs.

PiperOrigin-RevId: 244050323
Change-Id: I787ad0fdf07c49d589e51aebfeae477324fe26e6
Fabricio Voznika 2019-04-17 12:56:23 -07:00 committed by Shentubot
parent 08d99c5fbe
commit c8cee7108f
14 changed files with 347 additions and 14 deletions
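
For reference, a standalone sketch of the halving rule (illustrative only; the real wiring is adjustDirentCache in runsc/boot, shown below):

package main

import (
	"fmt"
	"syscall"
)

func main() {
	var rl syscall.Rlimit
	if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &rl); err != nil {
		panic(err)
	}
	// An infinite soft limit needs no cap; otherwise the dirent cache may
	// hold at most half of the FDs available to the process.
	if int64(rl.Cur) == syscall.RLIM_INFINITY {
		fmt.Println("no cap needed")
		return
	}
	fmt.Println("dirent cache cap:", rl.Cur/2)
}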

View File

@ -12,6 +12,7 @@ go_library(
"dentry.go", "dentry.go",
"dirent.go", "dirent.go",
"dirent_cache.go", "dirent_cache.go",
"dirent_cache_limiter.go",
"dirent_list.go", "dirent_list.go",
"dirent_state.go", "dirent_state.go",
"event_list.go", "event_list.go",

View File

@ -26,6 +26,9 @@ type contextID int
const (
// CtxRoot is a Context.Value key for a Dirent.
CtxRoot contextID = iota
// CtxDirentCacheLimiter is a Context.Value key for DirentCacheLimiter.
CtxDirentCacheLimiter
)
// ContextCanAccessFile determines whether `file` can be accessed in the requested way
@ -100,3 +103,12 @@ func RootFromContext(ctx context.Context) *Dirent {
}
return nil
}
// DirentCacheLimiterFromContext returns the DirentCacheLimiter used by ctx, or
// nil if ctx does not have a dirent cache limiter.
func DirentCacheLimiterFromContext(ctx context.Context) *DirentCacheLimiter {
if v := ctx.Value(CtxDirentCacheLimiter); v != nil {
return v.(*DirentCacheLimiter)
}
return nil
}
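A usage sketch for the new accessor, assuming mount-setup code that holds a kernel context (attachLimiter is an illustrative helper, not part of this change; SetDirentCacheLimiter is added to MountSource further down, and the gofer's Root does effectively this):

// attachLimiter wires a mount's dirent cache to the kernel-wide limiter.
// The limiter may be nil, in which case no global cap applies to this cache.
func attachLimiter(ctx context.Context, msrc *fs.MountSource) {
	msrc.SetDirentCacheLimiter(fs.DirentCacheLimiterFromContext(ctx))
}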

View File

@ -32,6 +32,10 @@ type DirentCache struct {
// when cache is nil.
maxSize uint64
// limit restricts the number of entries in the cache among multiple caches.
// It may be nil if there is no global limit for this cache.
limit *DirentCacheLimiter
// mu protects currentSize and direntList.
mu sync.Mutex `state:"nosave"`
@ -45,8 +49,7 @@ type DirentCache struct {
list direntList `state:"zerovalue"`
}
-// NewDirentCache returns a new DirentCache with the given maxSize. If maxSize
-// is 0, nil is returned.
+// NewDirentCache returns a new DirentCache with the given maxSize.
func NewDirentCache(maxSize uint64) *DirentCache {
return &DirentCache{
maxSize: maxSize,
@ -71,15 +74,24 @@ func (c *DirentCache) Add(d *Dirent) {
return
}
// First check against the global limit.
for c.limit != nil && !c.limit.tryInc() {
if c.currentSize == 0 {
// If the global limit is reached, but there is nothing more to drop from
// this cache, there is not much else to do.
c.mu.Unlock()
return
}
c.remove(c.list.Back())
}
// d is not in cache. Add it and take a reference.
c.list.PushFront(d)
d.IncRef()
c.currentSize++
-// Remove the oldest until we are under the size limit.
-for c.maxSize > 0 && c.currentSize > c.maxSize {
-	c.remove(c.list.Back())
-}
+c.maybeShrink()
c.mu.Unlock()
}
@ -92,6 +104,9 @@ func (c *DirentCache) remove(d *Dirent) {
d.SetNext(nil)
d.DecRef()
c.currentSize--
if c.limit != nil {
c.limit.dec()
}
}
// Remove removes the element from the cache and decrements its refCount. It
@ -142,3 +157,19 @@ func (c *DirentCache) Invalidate() {
}
c.mu.Unlock()
}
// setMaxSize sets the cache's max size. If the current size is larger than the
// new max, the cache shrinks to accommodate it.
func (c *DirentCache) setMaxSize(max uint64) {
c.mu.Lock()
c.maxSize = max
c.maybeShrink()
c.mu.Unlock()
}
// maybeShrink removes the oldest elements until the list is under the size limit.
func (c *DirentCache) maybeShrink() {
for c.maxSize > 0 && c.currentSize > c.maxSize {
c.remove(c.list.Back())
}
}

View File

@ -0,0 +1,55 @@
// Copyright 2018 Google LLC
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package fs
import (
"fmt"
"sync"
)
// DirentCacheLimiter acts as a global limit for all dirent caches in the
// process.
//
// +stateify savable
type DirentCacheLimiter struct {
mu sync.Mutex `state:"nosave"`
max uint64
count uint64 `state:"zerovalue"`
}
// NewDirentCacheLimiter creates a new DirentCacheLimiter.
func NewDirentCacheLimiter(max uint64) *DirentCacheLimiter {
return &DirentCacheLimiter{max: max}
}
func (d *DirentCacheLimiter) tryInc() bool {
d.mu.Lock()
if d.count >= d.max {
d.mu.Unlock()
return false
}
d.count++
d.mu.Unlock()
return true
}
func (d *DirentCacheLimiter) dec() {
d.mu.Lock()
if d.count == 0 {
panic(fmt.Sprintf("underflowing DirentCacheLimiter count: %+v", d))
}
d.count--
d.mu.Unlock()
}
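The limiter behaves like a try-acquire counting semaphore capped at max; dec below zero panics by design, since it signals an accounting bug. A small in-package sketch of the contract (ExampleDirentCacheLimiter is illustrative, assuming "fmt" is imported):

func ExampleDirentCacheLimiter() {
	l := NewDirentCacheLimiter(2)
	fmt.Println(l.tryInc()) // true: one slot taken
	fmt.Println(l.tryInc()) // true: now at max
	fmt.Println(l.tryInc()) // false: callers must evict and dec first
	l.dec()                 // returns a slot; count is 1 again
	// Output:
	// true
	// true
	// false
}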

View File

@ -120,6 +120,96 @@ func TestDirentCache(t *testing.T) {
}
}
func TestDirentCacheLimiter(t *testing.T) {
const (
globalMaxSize = 5
maxSize = 3
)
limit := NewDirentCacheLimiter(globalMaxSize)
c1 := NewDirentCache(maxSize)
c1.limit = limit
c2 := NewDirentCache(maxSize)
c2.limit = limit
// Create a Dirent d.
d := NewNegativeDirent("")
// Add d to the cache.
c1.Add(d)
if got, want := c1.Size(), uint64(1); got != want {
t.Errorf("c1.Size() got %v, want %v", got, want)
}
// Add maxSize-1 more elements. d should be the oldest element.
for i := 0; i < maxSize-1; i++ {
c1.Add(NewNegativeDirent(""))
}
if got, want := c1.Size(), uint64(maxSize); got != want {
t.Errorf("c1.Size() got %v, want %v", got, want)
}
// Check that d is still there.
if got, want := c1.contains(d), true; got != want {
t.Errorf("c1.contains(d) got %v want %v", got, want)
}
// Fill up the other cache; it will start dropping old entries once the
// global limit is reached.
for i := 0; i < maxSize; i++ {
c2.Add(NewNegativeDirent(""))
}
// Check that c2's size is what remains of the global max.
if got, want := c2.Size(), globalMaxSize-maxSize; int(got) != want {
t.Errorf("c2.Size() got %v, want %v", got, want)
}
// Check that d was not dropped.
if got, want := c1.contains(d), true; got != want {
t.Errorf("c1.contains(d) got %v want %v", got, want)
}
// Add an entry that will eventually be dropped; the check is done later.
drop := NewNegativeDirent("")
c1.Add(drop)
// Check that d is bumped to the front even when the global limit is reached.
c1.Add(d)
if got, want := c1.contains(d), true; got != want {
t.Errorf("c1.contains(d) got %v want %v", got, want)
}
// Add 2 more elements and check that:
// - d is still in the list: to verify that d was bumped
// - d2/d3 are in the list: newer entries are kept even when the global limit
//   is reached
// - drop is not in the list: the oldest entries are dropped instead
d2 := NewNegativeDirent("")
c1.Add(d2)
d3 := NewNegativeDirent("")
c1.Add(d3)
if got, want := c1.contains(d), true; got != want {
t.Errorf("c1.contains(d) got %v want %v", got, want)
}
if got, want := c1.contains(d2), true; got != want {
t.Errorf("c1.contains(d2) got %v want %v", got, want)
}
if got, want := c1.contains(d3), true; got != want {
t.Errorf("c1.contains(d3) got %v want %v", got, want)
}
if got, want := c1.contains(drop), false; got != want {
t.Errorf("c1.contains(drop) got %v want %v", got, want)
}
// Drop all entries from one cache. The other will be allowed to grow.
c1.Invalidate()
c2.Add(NewNegativeDirent(""))
if got, want := c2.Size(), uint64(maxSize); got != want {
t.Errorf("c2.Size() got %v, want %v", got, want)
}
}
// TestNilDirentCache tests that a nil cache supports all cache operations, but
// treats them as noop.
func TestNilDirentCache(t *testing.T) {

View File

@ -28,6 +28,10 @@ import (
"gvisor.googlesource.com/gvisor/pkg/unet" "gvisor.googlesource.com/gvisor/pkg/unet"
) )
// DefaultDirentCacheSize is the default dirent cache size for 9P mounts. It
// can be adjusted independently from the other dirent caches.
var DefaultDirentCacheSize uint64 = fs.DefaultDirentCacheSize
// +stateify savable
type endpointMaps struct {
// mu protects the direntMap, the keyMap, and the pathMap below.
@ -249,6 +253,11 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF
// Construct the MountSource with the session and superBlockFlags.
m := fs.NewMountSource(s, filesystem, superBlockFlags)
// Given that gofer files can consume host FDs, restrict the number
// of files that can be held by the cache.
m.SetDirentCacheMaxSize(DefaultDirentCacheSize)
m.SetDirentCacheLimiter(fs.DirentCacheLimiterFromContext(ctx))
// Send the Tversion request.
s.client, err = p9.NewClient(conn, s.msize, s.version)
if err != nil {

View File

@ -151,9 +151,9 @@ type MountSource struct {
children map[*MountSource]struct{}
}
-// defaultDirentCacheSize is the number of Dirents that the VFS can hold an extra
-// reference on.
-const defaultDirentCacheSize uint64 = 1000
+// DefaultDirentCacheSize is the number of Dirents that the VFS can hold an
+// extra reference on.
+const DefaultDirentCacheSize uint64 = 1000
// NewMountSource returns a new MountSource. Filesystem may be nil if there is no
// filesystem backing the mount.
@ -162,7 +162,7 @@ func NewMountSource(mops MountSourceOperations, filesystem Filesystem, flags Mou
MountSourceOperations: mops,
Flags: flags,
Filesystem: filesystem,
-fscache: NewDirentCache(defaultDirentCacheSize),
+fscache: NewDirentCache(DefaultDirentCacheSize),
children: make(map[*MountSource]struct{}),
}
}
@ -246,6 +246,18 @@ func (msrc *MountSource) FlushDirentRefs() {
msrc.fscache.Invalidate()
}
// SetDirentCacheMaxSize sets the max size on the dirent cache associated with
// this mount source.
func (msrc *MountSource) SetDirentCacheMaxSize(max uint64) {
msrc.fscache.setMaxSize(max)
}
// SetDirentCacheLimiter sets the limiter object on the dirent cache associated
// with this mount source.
func (msrc *MountSource) SetDirentCacheLimiter(l *DirentCacheLimiter) {
msrc.fscache.limit = l
}
// NewCachingMountSource returns a generic mount that will cache dirents
// aggressively.
func NewCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource {

View File

@ -31,10 +31,19 @@ type overlayMountSourceOperations struct {
func newOverlayMountSource(upper, lower *MountSource, flags MountSourceFlags) *MountSource {
upper.IncRef()
lower.IncRef()
-return NewMountSource(&overlayMountSourceOperations{
+msrc := NewMountSource(&overlayMountSourceOperations{
upper: upper,
lower: lower,
}, &overlayFilesystem{}, flags)
// Use the minimum number to keep resource usage under limits.
size := lower.fscache.maxSize
if size > upper.fscache.maxSize {
size = upper.fscache.maxSize
}
msrc.fscache.setMaxSize(size)
return msrc
}
// Revalidate implements MountSourceOperations.Revalidate for an overlay by
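For instance, if the upper layer keeps the 1000-entry default while a gofer-backed lower layer was capped at 512 by the host FD limit, the overlay's own cache is capped at 512 as well, so the more restrictive layer is never undercut.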

View File

@ -188,6 +188,11 @@ type Kernel struct {
// deviceRegistry is used to save/restore device.SimpleDevices.
deviceRegistry struct{} `state:".(*device.Registry)"`
// DirentCacheLimiter controls the total number of dirent entries that can be
// held across caches. Not all caches use it; only the caches that consume host
// resources use the limiter. It may be nil if disabled.
DirentCacheLimiter *fs.DirentCacheLimiter
}
// InitKernelArgs holds arguments to Init.
@ -626,6 +631,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} {
return ctx.k.mounts.Root()
}
return nil
case fs.CtxDirentCacheLimiter:
return ctx.k.DirentCacheLimiter
case ktime.CtxRealtimeClock:
return ctx.k.RealtimeClock()
case limits.CtxLimits:
@ -1170,6 +1177,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} {
return auth.NewRootCredentials(ctx.k.rootUserNamespace)
case fs.CtxRoot:
return ctx.k.mounts.Root()
case fs.CtxDirentCacheLimiter:
return ctx.k.DirentCacheLimiter
case ktime.CtxRealtimeClock:
return ctx.k.RealtimeClock()
case limits.CtxLimits:

View File

@ -601,6 +601,8 @@ func (t *Task) Value(key interface{}) interface{} {
return int32(t.ThreadGroup().ID())
case fs.CtxRoot:
return t.fsc.RootDirectory()
case fs.CtxDirentCacheLimiter:
return t.k.DirentCacheLimiter
case inet.CtxStack:
return t.NetworkContext()
case ktime.CtxRealtimeClock:

View File

@ -20,10 +20,10 @@ import (
"path/filepath" "path/filepath"
"strconv" "strconv"
"strings" "strings"
"syscall"
// Include filesystem types that OCI spec might mount.
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev"
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host"
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc"
_ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys"
@ -38,6 +38,7 @@ import (
"gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/context"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer"
"gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs"
"gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/pkg/syserror"
"gvisor.googlesource.com/gvisor/runsc/specutils" "gvisor.googlesource.com/gvisor/runsc/specutils"
@ -81,6 +82,22 @@ func (f *fdDispenser) empty() bool {
return len(f.fds) == 0
}
func adjustDirentCache(k *kernel.Kernel) error {
var hl syscall.Rlimit
if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil {
return fmt.Errorf("getting RLIMIT_NOFILE: %v", err)
}
if int64(hl.Cur) != syscall.RLIM_INFINITY {
newSize := hl.Cur / 2
if newSize < gofer.DefaultDirentCacheSize {
log.Infof("Setting gofer dirent cache size to %d", newSize)
gofer.DefaultDirentCacheSize = newSize
k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize)
}
}
return nil
}
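A worked example of the rule above: with a host soft RLIMIT_NOFILE of 1024, newSize is 512, which is below the 1000-entry DefaultDirentCacheSize, so both the gofer default and the kernel-wide limiter drop to 512; with the common 1048576, newSize is 524288, the condition is false, and the defaults stand.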
// setupRootContainerFS creates a mount namespace containing the root filesystem
// and all mounts. 'rootCtx' is used to walk directories to find mount points.
// 'setMountNS' is called after namespace is created. It must set the mount NS

View File

@ -16,8 +16,11 @@ package boot
import (
"fmt"
"sync"
"syscall"
specs "github.com/opencontainers/runtime-spec/specs-go" specs "github.com/opencontainers/runtime-spec/specs-go"
"gvisor.googlesource.com/gvisor/pkg/log"
"gvisor.googlesource.com/gvisor/pkg/sentry/limits" "gvisor.googlesource.com/gvisor/pkg/sentry/limits"
) )
@ -41,10 +44,43 @@ var fromLinuxResource = map[string]limits.LimitType{
"RLIMIT_STACK": limits.Stack, "RLIMIT_STACK": limits.Stack,
} }
func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { func findName(lt limits.LimitType) string {
for k, v := range fromLinuxResource {
if v == lt {
return k
}
}
return "unknown"
}
var defaults defs
type defs struct {
mu sync.Mutex
set *limits.LimitSet
err error
}
func (d *defs) get() (*limits.LimitSet, error) {
d.mu.Lock()
defer d.mu.Unlock()
if d.err != nil {
return nil, d.err
}
if d.set == nil {
if err := d.initDefaults(); err != nil {
d.err = err
return nil, err
}
}
return d.set, nil
}
func (d *defs) initDefaults() error {
ls, err := limits.NewLinuxLimitSet()
if err != nil {
-return nil, err
+return err
}
// Set default limits based on what containers get by default, ex:
@ -66,6 +102,43 @@ func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) {
ls.SetUnchecked(limits.SignalsPending, limits.Limit{Cur: 0, Max: 0})
ls.SetUnchecked(limits.Stack, limits.Limit{Cur: 8388608, Max: limits.Infinity})
// Read host limits that directly affect the sandbox and adjust the defaults
// based on them.
for _, res := range []int{syscall.RLIMIT_FSIZE, syscall.RLIMIT_NOFILE} {
var hl syscall.Rlimit
if err := syscall.Getrlimit(res, &hl); err != nil {
return err
}
lt, ok := limits.FromLinuxResource[res]
if !ok {
return fmt.Errorf("unknown rlimit type %v", res)
}
hostLimit := limits.Limit{
Cur: limits.FromLinux(hl.Cur),
Max: limits.FromLinux(hl.Max),
}
defaultLimit := ls.Get(lt)
if hostLimit.Cur != limits.Infinity && hostLimit.Cur < defaultLimit.Cur {
log.Warningf("Host limit is lower than recommended, resource: %q, host: %d, recommended: %d", findName(lt), hostLimit.Cur, defaultLimit.Cur)
}
if hostLimit.Cur != defaultLimit.Cur || hostLimit.Max != defaultLimit.Max {
log.Infof("Setting limit from host, resource: %q {soft: %d, hard: %d}", findName(lt), hostLimit.Cur, hostLimit.Max)
ls.SetUnchecked(lt, hostLimit)
}
}
d.set = ls
return nil
}
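Note the memoization: defs.get computes the host-adjusted defaults once under the mutex and caches either the LimitSet or the error, so the Getrlimit probing and the log lines run a single time per sandbox process.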
func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) {
ls, err := defaults.get()
if err != nil {
return nil, err
}
// Then apply overwrites on top of defaults.
for _, rl := range spec.Process.Rlimits {
lt, ok := fromLinuxResource[rl.Type]

View File

@ -274,6 +274,10 @@ func New(args Args) (*Loader, error) {
return nil, fmt.Errorf("initializing kernel: %v", err) return nil, fmt.Errorf("initializing kernel: %v", err)
} }
if err := adjustDirentCache(k); err != nil {
return nil, err
}
// Turn on packet logging if enabled.
if args.Conf.LogPackets {
log.Infof("Packet logging enabled")

View File

@ -255,7 +255,16 @@ TEST_F(PollTest, Nfds) {
// Stash value of RLIMIT_NOFILE.
struct rlimit rlim;
TEST_PCHECK(getrlimit(RLIMIT_NOFILE, &rlim) == 0);
// gVisor caps the number of FDs that epoll can use beyond RLIMIT_NOFILE.
constexpr rlim_t gVisorMax = 1048576;
if (rlim.rlim_cur > gVisorMax) {
rlim.rlim_cur = gVisorMax;
TEST_PCHECK(setrlimit(RLIMIT_NOFILE, &rlim) == 0);
}
rlim_t max_fds = rlim.rlim_cur; rlim_t max_fds = rlim.rlim_cur;
LOG(INFO) << "Using limit: " << max_fds;
// Create an eventfd. Since its value is initially zero, it is writable.
FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD());