diff --git a/pkg/sentry/fs/BUILD b/pkg/sentry/fs/BUILD index 1742d3a65..1fd9e30f6 100644 --- a/pkg/sentry/fs/BUILD +++ b/pkg/sentry/fs/BUILD @@ -12,6 +12,7 @@ go_library( "dentry.go", "dirent.go", "dirent_cache.go", + "dirent_cache_limiter.go", "dirent_list.go", "dirent_state.go", "event_list.go", diff --git a/pkg/sentry/fs/context.go b/pkg/sentry/fs/context.go index c0e6075e4..4869428a8 100644 --- a/pkg/sentry/fs/context.go +++ b/pkg/sentry/fs/context.go @@ -26,6 +26,9 @@ type contextID int const ( // CtxRoot is a Context.Value key for a Dirent. CtxRoot contextID = iota + + // CtxDirentCacheLimiter is a Context.Value key for DirentCacheLimiter. + CtxDirentCacheLimiter ) // ContextCanAccessFile determines whether `file` can be accessed in the requested way @@ -100,3 +103,12 @@ func RootFromContext(ctx context.Context) *Dirent { } return nil } + +// DirentCacheLimiterFromContext returns the DirentCacheLimiter used by ctx, or +// nil if ctx does not have a dirent cache limiter. +func DirentCacheLimiterFromContext(ctx context.Context) *DirentCacheLimiter { + if v := ctx.Value(CtxDirentCacheLimiter); v != nil { + return v.(*DirentCacheLimiter) + } + return nil +} diff --git a/pkg/sentry/fs/dirent_cache.go b/pkg/sentry/fs/dirent_cache.go index 502b0a09b..d26a06971 100644 --- a/pkg/sentry/fs/dirent_cache.go +++ b/pkg/sentry/fs/dirent_cache.go @@ -32,6 +32,10 @@ type DirentCache struct { // when cache is nil. maxSize uint64 + // limit restricts the number of entries in the cache amoung multiple caches. + // It may be nil if there are no global limit for this cache. + limit *DirentCacheLimiter + // mu protects currentSize and direntList. mu sync.Mutex `state:"nosave"` @@ -45,8 +49,7 @@ type DirentCache struct { list direntList `state:"zerovalue"` } -// NewDirentCache returns a new DirentCache with the given maxSize. If maxSize -// is 0, nil is returned. +// NewDirentCache returns a new DirentCache with the given maxSize. func NewDirentCache(maxSize uint64) *DirentCache { return &DirentCache{ maxSize: maxSize, @@ -71,15 +74,24 @@ func (c *DirentCache) Add(d *Dirent) { return } + // First check against the global limit. + for c.limit != nil && !c.limit.tryInc() { + if c.currentSize == 0 { + // If the global limit is reached, but there is nothing more to drop from + // this cache, there is not much else to do. + c.mu.Unlock() + return + } + c.remove(c.list.Back()) + } + // d is not in cache. Add it and take a reference. c.list.PushFront(d) d.IncRef() c.currentSize++ - // Remove the oldest until we are under the size limit. - for c.maxSize > 0 && c.currentSize > c.maxSize { - c.remove(c.list.Back()) - } + c.maybeShrink() + c.mu.Unlock() } @@ -92,6 +104,9 @@ func (c *DirentCache) remove(d *Dirent) { d.SetNext(nil) d.DecRef() c.currentSize-- + if c.limit != nil { + c.limit.dec() + } } // Remove removes the element from the cache and decrements its refCount. It @@ -142,3 +157,19 @@ func (c *DirentCache) Invalidate() { } c.mu.Unlock() } + +// setMaxSize sets cache max size. If current size is larger than max size, the +// cache shrinks to acommodate the new max. +func (c *DirentCache) setMaxSize(max uint64) { + c.mu.Lock() + c.maxSize = max + c.maybeShrink() + c.mu.Unlock() +} + +// shrink removes the oldest element until the list is under the size limit. +func (c *DirentCache) maybeShrink() { + for c.maxSize > 0 && c.currentSize > c.maxSize { + c.remove(c.list.Back()) + } +} diff --git a/pkg/sentry/fs/dirent_cache_limiter.go b/pkg/sentry/fs/dirent_cache_limiter.go new file mode 100644 index 000000000..024c7b2d5 --- /dev/null +++ b/pkg/sentry/fs/dirent_cache_limiter.go @@ -0,0 +1,55 @@ +// Copyright 2018 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fs + +import ( + "fmt" + "sync" +) + +// DirentCacheLimiter acts as a global limit for all dirent caches in the +// process. +// +// +stateify savable +type DirentCacheLimiter struct { + mu sync.Mutex `state:"nosave"` + max uint64 + count uint64 `state:"zerovalue"` +} + +// NewDirentCacheLimiter creates a new DirentCacheLimiter. +func NewDirentCacheLimiter(max uint64) *DirentCacheLimiter { + return &DirentCacheLimiter{max: max} +} + +func (d *DirentCacheLimiter) tryInc() bool { + d.mu.Lock() + if d.count >= d.max { + d.mu.Unlock() + return false + } + d.count++ + d.mu.Unlock() + return true +} + +func (d *DirentCacheLimiter) dec() { + d.mu.Lock() + if d.count == 0 { + panic(fmt.Sprintf("underflowing DirentCacheLimiter count: %+v", d)) + } + d.count-- + d.mu.Unlock() +} diff --git a/pkg/sentry/fs/dirent_cache_test.go b/pkg/sentry/fs/dirent_cache_test.go index 5d0e9d91c..93e8d415f 100644 --- a/pkg/sentry/fs/dirent_cache_test.go +++ b/pkg/sentry/fs/dirent_cache_test.go @@ -120,6 +120,96 @@ func TestDirentCache(t *testing.T) { } } +func TestDirentCacheLimiter(t *testing.T) { + const ( + globalMaxSize = 5 + maxSize = 3 + ) + + limit := NewDirentCacheLimiter(globalMaxSize) + c1 := NewDirentCache(maxSize) + c1.limit = limit + c2 := NewDirentCache(maxSize) + c2.limit = limit + + // Create a Dirent d. + d := NewNegativeDirent("") + + // Add d to the cache. + c1.Add(d) + if got, want := c1.Size(), uint64(1); got != want { + t.Errorf("c1.Size() got %v, want %v", got, want) + } + + // Add maxSize-1 more elements. d should be oldest element. + for i := 0; i < maxSize-1; i++ { + c1.Add(NewNegativeDirent("")) + } + if got, want := c1.Size(), uint64(maxSize); got != want { + t.Errorf("c1.Size() got %v, want %v", got, want) + } + + // Check that d is still there. + if got, want := c1.contains(d), true; got != want { + t.Errorf("c1.contains(d) got %v want %v", got, want) + } + + // Fill up the other cache, it will start dropping old entries from the cache + // when the global limit is reached. + for i := 0; i < maxSize; i++ { + c2.Add(NewNegativeDirent("")) + } + + // Check is what's remaining from global max. + if got, want := c2.Size(), globalMaxSize-maxSize; int(got) != want { + t.Errorf("c2.Size() got %v, want %v", got, want) + } + + // Check that d was not dropped. + if got, want := c1.contains(d), true; got != want { + t.Errorf("c1.contains(d) got %v want %v", got, want) + } + + // Add an entry that will eventually be dropped. Check is done later... + drop := NewNegativeDirent("") + c1.Add(drop) + + // Check that d is bumped to front even when global limit is reached. + c1.Add(d) + if got, want := c1.contains(d), true; got != want { + t.Errorf("c1.contains(d) got %v want %v", got, want) + } + + // Add 2 more element and check that: + // - d is still in the list: to verify that d was bumped + // - d2/d3 are in the list: older entries are dropped when global limit is + // reached. + // - drop is not in the list: indeed older elements are dropped. + d2 := NewNegativeDirent("") + c1.Add(d2) + d3 := NewNegativeDirent("") + c1.Add(d3) + if got, want := c1.contains(d), true; got != want { + t.Errorf("c1.contains(d) got %v want %v", got, want) + } + if got, want := c1.contains(d2), true; got != want { + t.Errorf("c1.contains(d2) got %v want %v", got, want) + } + if got, want := c1.contains(d3), true; got != want { + t.Errorf("c1.contains(d3) got %v want %v", got, want) + } + if got, want := c1.contains(drop), false; got != want { + t.Errorf("c1.contains(drop) got %v want %v", got, want) + } + + // Drop all entries from one cache. The other will be allowed to grow. + c1.Invalidate() + c2.Add(NewNegativeDirent("")) + if got, want := c2.Size(), uint64(maxSize); got != want { + t.Errorf("c2.Size() got %v, want %v", got, want) + } +} + // TestNilDirentCache tests that a nil cache supports all cache operations, but // treats them as noop. func TestNilDirentCache(t *testing.T) { diff --git a/pkg/sentry/fs/gofer/session.go b/pkg/sentry/fs/gofer/session.go index d626b86f5..ed5147c65 100644 --- a/pkg/sentry/fs/gofer/session.go +++ b/pkg/sentry/fs/gofer/session.go @@ -28,6 +28,10 @@ import ( "gvisor.googlesource.com/gvisor/pkg/unet" ) +// DefaultDirentCacheSize is the default dirent cache size for 9P mounts. It can +// be adjusted independentely from the other dirent caches. +var DefaultDirentCacheSize uint64 = fs.DefaultDirentCacheSize + // +stateify savable type endpointMaps struct { // mu protexts the direntMap, the keyMap, and the pathMap below. @@ -249,6 +253,11 @@ func Root(ctx context.Context, dev string, filesystem fs.Filesystem, superBlockF // Construct the MountSource with the session and superBlockFlags. m := fs.NewMountSource(s, filesystem, superBlockFlags) + // Given that gofer files can consume host FDs, restrict the number + // of files that can be held by the cache. + m.SetDirentCacheMaxSize(DefaultDirentCacheSize) + m.SetDirentCacheLimiter(fs.DirentCacheLimiterFromContext(ctx)) + // Send the Tversion request. s.client, err = p9.NewClient(conn, s.msize, s.version) if err != nil { diff --git a/pkg/sentry/fs/mount.go b/pkg/sentry/fs/mount.go index 5cc777bef..1e245ae5f 100644 --- a/pkg/sentry/fs/mount.go +++ b/pkg/sentry/fs/mount.go @@ -151,9 +151,9 @@ type MountSource struct { children map[*MountSource]struct{} } -// defaultDirentCacheSize is the number of Dirents that the VFS can hold an extra -// reference on. -const defaultDirentCacheSize uint64 = 1000 +// DefaultDirentCacheSize is the number of Dirents that the VFS can hold an +// extra reference on. +const DefaultDirentCacheSize uint64 = 1000 // NewMountSource returns a new MountSource. Filesystem may be nil if there is no // filesystem backing the mount. @@ -162,7 +162,7 @@ func NewMountSource(mops MountSourceOperations, filesystem Filesystem, flags Mou MountSourceOperations: mops, Flags: flags, Filesystem: filesystem, - fscache: NewDirentCache(defaultDirentCacheSize), + fscache: NewDirentCache(DefaultDirentCacheSize), children: make(map[*MountSource]struct{}), } } @@ -246,6 +246,18 @@ func (msrc *MountSource) FlushDirentRefs() { msrc.fscache.Invalidate() } +// SetDirentCacheMaxSize sets the max size to the dirent cache associated with +// this mount source. +func (msrc *MountSource) SetDirentCacheMaxSize(max uint64) { + msrc.fscache.setMaxSize(max) +} + +// SetDirentCacheLimiter sets the limiter objcet to the dirent cache associated +// with this mount source. +func (msrc *MountSource) SetDirentCacheLimiter(l *DirentCacheLimiter) { + msrc.fscache.limit = l +} + // NewCachingMountSource returns a generic mount that will cache dirents // aggressively. func NewCachingMountSource(filesystem Filesystem, flags MountSourceFlags) *MountSource { diff --git a/pkg/sentry/fs/mount_overlay.go b/pkg/sentry/fs/mount_overlay.go index 4c89673b5..fb60a1aec 100644 --- a/pkg/sentry/fs/mount_overlay.go +++ b/pkg/sentry/fs/mount_overlay.go @@ -31,10 +31,19 @@ type overlayMountSourceOperations struct { func newOverlayMountSource(upper, lower *MountSource, flags MountSourceFlags) *MountSource { upper.IncRef() lower.IncRef() - return NewMountSource(&overlayMountSourceOperations{ + msrc := NewMountSource(&overlayMountSourceOperations{ upper: upper, lower: lower, }, &overlayFilesystem{}, flags) + + // Use the minimum number to keep resource usage under limits. + size := lower.fscache.maxSize + if size > upper.fscache.maxSize { + size = upper.fscache.maxSize + } + msrc.fscache.setMaxSize(size) + + return msrc } // Revalidate implements MountSourceOperations.Revalidate for an overlay by diff --git a/pkg/sentry/kernel/kernel.go b/pkg/sentry/kernel/kernel.go index b8953657c..290c4a53c 100644 --- a/pkg/sentry/kernel/kernel.go +++ b/pkg/sentry/kernel/kernel.go @@ -188,6 +188,11 @@ type Kernel struct { // deviceRegistry is used to save/restore device.SimpleDevices. deviceRegistry struct{} `state:".(*device.Registry)"` + + // DirentCacheLimiter controls the number of total dirent entries can be in + // caches. Not all caches use it, only the caches that use host resources use + // the limiter. It may be nil if disabled. + DirentCacheLimiter *fs.DirentCacheLimiter } // InitKernelArgs holds arguments to Init. @@ -626,6 +631,8 @@ func (ctx *createProcessContext) Value(key interface{}) interface{} { return ctx.k.mounts.Root() } return nil + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: @@ -1170,6 +1177,8 @@ func (ctx supervisorContext) Value(key interface{}) interface{} { return auth.NewRootCredentials(ctx.k.rootUserNamespace) case fs.CtxRoot: return ctx.k.mounts.Root() + case fs.CtxDirentCacheLimiter: + return ctx.k.DirentCacheLimiter case ktime.CtxRealtimeClock: return ctx.k.RealtimeClock() case limits.CtxLimits: diff --git a/pkg/sentry/kernel/task.go b/pkg/sentry/kernel/task.go index 9c365e781..ed2175c37 100644 --- a/pkg/sentry/kernel/task.go +++ b/pkg/sentry/kernel/task.go @@ -601,6 +601,8 @@ func (t *Task) Value(key interface{}) interface{} { return int32(t.ThreadGroup().ID()) case fs.CtxRoot: return t.fsc.RootDirectory() + case fs.CtxDirentCacheLimiter: + return t.k.DirentCacheLimiter case inet.CtxStack: return t.NetworkContext() case ktime.CtxRealtimeClock: diff --git a/runsc/boot/fs.go b/runsc/boot/fs.go index 8dfb6dce6..761142d98 100644 --- a/runsc/boot/fs.go +++ b/runsc/boot/fs.go @@ -20,10 +20,10 @@ import ( "path/filepath" "strconv" "strings" + "syscall" // Include filesystem types that OCI spec might mount. _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/dev" - _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/host" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/proc" _ "gvisor.googlesource.com/gvisor/pkg/sentry/fs/sys" @@ -38,6 +38,7 @@ import ( "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/context" "gvisor.googlesource.com/gvisor/pkg/sentry/fs" + "gvisor.googlesource.com/gvisor/pkg/sentry/fs/gofer" "gvisor.googlesource.com/gvisor/pkg/sentry/fs/ramfs" "gvisor.googlesource.com/gvisor/pkg/syserror" "gvisor.googlesource.com/gvisor/runsc/specutils" @@ -81,6 +82,22 @@ func (f *fdDispenser) empty() bool { return len(f.fds) == 0 } +func adjustDirentCache(k *kernel.Kernel) error { + var hl syscall.Rlimit + if err := syscall.Getrlimit(syscall.RLIMIT_NOFILE, &hl); err != nil { + return fmt.Errorf("getting RLIMIT_NOFILE: %v", err) + } + if int64(hl.Cur) != syscall.RLIM_INFINITY { + newSize := hl.Cur / 2 + if newSize < gofer.DefaultDirentCacheSize { + log.Infof("Setting gofer dirent cache size to %d", newSize) + gofer.DefaultDirentCacheSize = newSize + k.DirentCacheLimiter = fs.NewDirentCacheLimiter(newSize) + } + } + return nil +} + // setupRootContainerFS creates a mount namespace containing the root filesystem // and all mounts. 'rootCtx' is used to walk directories to find mount points. // 'setMountNS' is called after namespace is created. It must set the mount NS diff --git a/runsc/boot/limits.go b/runsc/boot/limits.go index e3e716bf9..32e62cdf7 100644 --- a/runsc/boot/limits.go +++ b/runsc/boot/limits.go @@ -16,8 +16,11 @@ package boot import ( "fmt" + "sync" + "syscall" specs "github.com/opencontainers/runtime-spec/specs-go" + "gvisor.googlesource.com/gvisor/pkg/log" "gvisor.googlesource.com/gvisor/pkg/sentry/limits" ) @@ -41,10 +44,43 @@ var fromLinuxResource = map[string]limits.LimitType{ "RLIMIT_STACK": limits.Stack, } -func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { +func findName(lt limits.LimitType) string { + for k, v := range fromLinuxResource { + if v == lt { + return k + } + } + return "unknown" +} + +var defaults defs + +type defs struct { + mu sync.Mutex + set *limits.LimitSet + err error +} + +func (d *defs) get() (*limits.LimitSet, error) { + d.mu.Lock() + defer d.mu.Unlock() + + if d.err != nil { + return nil, d.err + } + if d.set == nil { + if err := d.initDefaults(); err != nil { + d.err = err + return nil, err + } + } + return d.set, nil +} + +func (d *defs) initDefaults() error { ls, err := limits.NewLinuxLimitSet() if err != nil { - return nil, err + return err } // Set default limits based on what containers get by default, ex: @@ -66,6 +102,43 @@ func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { ls.SetUnchecked(limits.SignalsPending, limits.Limit{Cur: 0, Max: 0}) ls.SetUnchecked(limits.Stack, limits.Limit{Cur: 8388608, Max: limits.Infinity}) + // Read host limits that directly affect the sandbox and adjust the defaults + // based on them. + for _, res := range []int{syscall.RLIMIT_FSIZE, syscall.RLIMIT_NOFILE} { + var hl syscall.Rlimit + if err := syscall.Getrlimit(res, &hl); err != nil { + return err + } + + lt, ok := limits.FromLinuxResource[res] + if !ok { + return fmt.Errorf("unknown rlimit type %v", res) + } + hostLimit := limits.Limit{ + Cur: limits.FromLinux(hl.Cur), + Max: limits.FromLinux(hl.Max), + } + + defaultLimit := ls.Get(lt) + if hostLimit.Cur != limits.Infinity && hostLimit.Cur < defaultLimit.Cur { + log.Warningf("Host limit is lower than recommended, resource: %q, host: %d, recommended: %d", findName(lt), hostLimit.Cur, defaultLimit.Cur) + } + if hostLimit.Cur != defaultLimit.Cur || hostLimit.Max != defaultLimit.Max { + log.Infof("Setting limit from host, resource: %q {soft: %d, hard: %d}", findName(lt), hostLimit.Cur, hostLimit.Max) + ls.SetUnchecked(lt, hostLimit) + } + } + + d.set = ls + return nil +} + +func createLimitSet(spec *specs.Spec) (*limits.LimitSet, error) { + ls, err := defaults.get() + if err != nil { + return nil, err + } + // Then apply overwrites on top of defaults. for _, rl := range spec.Process.Rlimits { lt, ok := fromLinuxResource[rl.Type] diff --git a/runsc/boot/loader.go b/runsc/boot/loader.go index 56cb137f0..88a834aa5 100644 --- a/runsc/boot/loader.go +++ b/runsc/boot/loader.go @@ -274,6 +274,10 @@ func New(args Args) (*Loader, error) { return nil, fmt.Errorf("initializing kernel: %v", err) } + if err := adjustDirentCache(k); err != nil { + return nil, err + } + // Turn on packet logging if enabled. if args.Conf.LogPackets { log.Infof("Packet logging enabled") diff --git a/test/syscalls/linux/poll.cc b/test/syscalls/linux/poll.cc index 7a6a39444..67a86cc22 100644 --- a/test/syscalls/linux/poll.cc +++ b/test/syscalls/linux/poll.cc @@ -255,7 +255,16 @@ TEST_F(PollTest, Nfds) { // Stash value of RLIMIT_NOFILES. struct rlimit rlim; TEST_PCHECK(getrlimit(RLIMIT_NOFILE, &rlim) == 0); + + // gVisor caps the number of FDs that epoll can use beyond RLIMIT_NOFILE. + constexpr rlim_t gVisorMax = 1048576; + if (rlim.rlim_cur > gVisorMax) { + rlim.rlim_cur = gVisorMax; + TEST_PCHECK(setrlimit(RLIMIT_NOFILE, &rlim) == 0); + } + rlim_t max_fds = rlim.rlim_cur; + LOG(INFO) << "Using limit: " << max_fds; // Create an eventfd. Since its value is initially zero, it is writable. FileDescriptor efd = ASSERT_NO_ERRNO_AND_VALUE(NewEventFD());